From 5b19bda85c2ce01e4a1c7f324b7ef14bffed3315 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 5 Nov 2023 12:35:46 -0500 Subject: [PATCH 001/582] Add validation loss --- library/train_util.py | 4 ++ train_network.py | 117 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index cc9ac4555..e26f39799 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4736,6 +4736,10 @@ def __call__(self, examples): else: dataset = self.dataset + # If we split a dataset we will get a Subset + if type(dataset) is torch.utils.data.Subset: + dataset = dataset.dataset + # set epoch and step dataset.set_current_epoch(self.current_epoch.value) dataset.set_current_step(self.current_step.value) diff --git a/train_network.py b/train_network.py index d50916b74..58767b6f7 100644 --- a/train_network.py +++ b/train_network.py @@ -345,8 +345,21 @@ def train(self, args): # DataLoaderのプロセス数:0はメインプロセスになる n_workers = min(args.max_data_loader_n_workers, os.cpu_count() - 1) # cpu_count-1 ただし最大で指定された数まで + if args.validation_ratio > 0.0: + train_ratio = 1 - args.validation_ratio + validation_ratio = args.validation_ratio + train, val = torch.utils.data.random_split( + train_dataset_group, + [train_ratio, validation_ratio] + ) + print(f"split dataset by ratio: train {train_ratio}, validation {validation_ratio}") + print(f"train images: {len(train)}, validation images: {len(val)}") + else: + train = train_dataset_group + val = [] + train_dataloader = torch.utils.data.DataLoader( - train_dataset_group, + train, batch_size=1, shuffle=True, collate_fn=collator, @@ -354,6 +367,15 @@ def train(self, args): persistent_workers=args.persistent_data_loader_workers, ) + val_dataloader = torch.utils.data.DataLoader( + val, + shuffle=False, + batch_size=1, + collate_fn=collator, + num_workers=n_workers, + persistent_workers=args.persistent_data_loader_workers, + ) + # 学習ステップ数を計算する if args.max_train_epochs is not None: args.max_train_steps = args.max_train_epochs * math.ceil( @@ -711,6 +733,8 @@ def train(self, args): ) loss_recorder = train_util.LossRecorder() + val_loss_recorder = train_util.LossRecorder() + del train_dataset_group # callback for step start @@ -752,6 +776,8 @@ def remove_model(old_ckpt_name): network.on_epoch_start(text_encoder, unet) + # TRAINING + for step, batch in enumerate(train_dataloader): current_step.value = global_step with accelerator.accumulate(network): @@ -877,6 +903,87 @@ def remove_model(old_ckpt_name): if global_step >= args.max_train_steps: break + # VALIDATION + + if len(val_dataloader) > 0: + print("Validating バリデーション処理...") + + with torch.no_grad(): + for val_step, batch in enumerate(val_dataloader): + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device) + else: + # latentに変換 + latents = vae.encode(batch["images"].to(device=accelerator.device, dtype=vae_dtype)).latent_dist.sample() + + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(latents)): + accelerator.print("NaN found in latents, replacing with zeros") + latents = torch.where(torch.isnan(latents), torch.zeros_like(latents), latents) + latents = latents * self.vae_scale_factor + b_size = latents.shape[0] + + # Get the text embedding for conditioning + if args.weighted_captions: + text_encoder_conds = get_weighted_text_embeddings( + tokenizer, + text_encoder, + batch["captions"], + accelerator.device, + args.max_token_length // 75 if args.max_token_length else 1, + 
clip_skip=args.clip_skip, + ) + else: + text_encoder_conds = self.get_text_cond( + args, accelerator, batch, tokenizers, text_encoders, weight_dtype + ) + + # Sample noise, sample a random timestep for each image, and add noise to the latents, + # with noise offset and/or multires noise if specified + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents + ) + + # Predict the noise residual + with accelerator.autocast(): + noise_pred = self.call_unet( + args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype + ) + + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise + + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + loss = loss.mean([1, 2, 3]) + + loss_weights = batch["loss_weights"].to(accelerator.device) # 各sampleごとのweight + + loss = loss * loss_weights + + if args.min_snr_gamma: + loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma) + if args.scale_v_pred_loss_like_noise_pred: + loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) + if args.v_pred_like_loss: + loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss) + + loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + + current_loss = loss.detach().item() + + val_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) + + if len(val_dataloader) > 0: + avr_loss: float = val_loss_recorder.moving_average + + if args.logging_dir is not None: + logs = {"loss/validation": avr_loss} + accelerator.log(logs, step=epoch + 1) + + if args.logging_dir is not None: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) @@ -999,6 +1106,14 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) + + parser.add_argument( + "--validation_ratio", + type=float, + default=0.0, + help="Ratio for validation images out of the training dataset" + ) + return parser From 33c311ed19821c9be7094ba89371777d7478b028 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 5 Nov 2023 12:37:37 -0500 Subject: [PATCH 002/582] new ratio code --- train_network.py | 48 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/train_network.py b/train_network.py index 58767b6f7..967c95fb4 100644 --- a/train_network.py +++ b/train_network.py @@ -345,10 +345,48 @@ def train(self, args): # DataLoaderのプロセス数:0はメインプロセスになる n_workers = min(args.max_data_loader_n_workers, os.cpu_count() - 1) # cpu_count-1 ただし最大で指定された数まで + def get_indices_without_reg(dataset: torch.utils.data.Dataset): + return [id for id, (key, item) in enumerate(dataset.image_data.items()) if item.is_reg is False] + + from typing import Sequence, Union + from torch._utils import _accumulate + import warnings + from torch.utils.data.dataset import Subset + + def random_split(dataset: torch.utils.data.Dataset, lengths: Sequence[Union[int, float]]): + indices = get_indices_without_reg(dataset) + random.shuffle(indices) + + subset_lengths = [] + + for i, frac in enumerate(lengths): + if frac < 0 or frac > 1: + raise ValueError(f"Fraction at index {i} is not between 0 and 1") + n_items_in_split = int(math.floor(len(indices) * frac)) + 
subset_lengths.append(n_items_in_split) + + remainder = len(indices) - sum(subset_lengths) + + for i in range(remainder): + idx_to_add_at = i % len(subset_lengths) + subset_lengths[idx_to_add_at] += 1 + + lengths = subset_lengths + for i, length in enumerate(lengths): + if length == 0: + warnings.warn(f"Length of split at index {i} is 0. " + f"This might result in an empty dataset.") + + if sum(lengths) != len(indices): + raise ValueError("Sum of input lengths does not equal the length of the input dataset!") + + return [Subset(dataset, indices[offset - length: offset]) for offset, length in zip(_accumulate(lengths), lengths)] + + if args.validation_ratio > 0.0: train_ratio = 1 - args.validation_ratio validation_ratio = args.validation_ratio - train, val = torch.utils.data.random_split( + train, val = random_split( train_dataset_group, [train_ratio, validation_ratio] ) @@ -358,6 +396,8 @@ def train(self, args): train = train_dataset_group val = [] + + train_dataloader = torch.utils.data.DataLoader( train, batch_size=1, @@ -898,7 +938,7 @@ def remove_model(old_ckpt_name): if args.logging_dir is not None: logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) - accelerator.log(logs, step=global_step) + accelerator.log(logs) if global_step >= args.max_train_steps: break @@ -973,13 +1013,11 @@ def remove_model(old_ckpt_name): loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし current_loss = loss.detach().item() - val_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) if len(val_dataloader) > 0: - avr_loss: float = val_loss_recorder.moving_average - if args.logging_dir is not None: + avr_loss: float = val_loss_recorder.moving_average logs = {"loss/validation": avr_loss} accelerator.log(logs, step=epoch + 1) From 3de9e6c443037abf99832d1be60f4fc9c0d67b8c Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 5 Nov 2023 01:45:23 -0500 Subject: [PATCH 003/582] Add validation split of datasets --- library/config_util.py | 145 ++++++++++++++++++++++++++--------------- library/train_util.py | 26 ++++++++ train_network.py | 67 ++++--------------- 3 files changed, 128 insertions(+), 110 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index e8e0fda7c..1bf7ed955 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -85,6 +85,8 @@ class BaseDatasetParams: max_token_length: int = None resolution: Optional[Tuple[int, int]] = None debug_dataset: bool = False + validation_seed: Optional[int] = None + validation_split: float = 0.0 @dataclass class DreamBoothDatasetParams(BaseDatasetParams): @@ -200,6 +202,8 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "enable_bucket": bool, "max_bucket_reso": int, "min_bucket_reso": int, + "validation_seed": int, + "validation_split": float, "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int), } @@ -427,64 +431,89 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset_klass = FineTuningDataset subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] - dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params)) + dataset = dataset_klass(subsets=subsets, is_train=True, **asdict(dataset_blueprint.params)) datasets.append(dataset) - # print info - info = "" - for i, dataset in enumerate(datasets): - is_dreambooth = isinstance(dataset, DreamBoothDataset) - is_controlnet = isinstance(dataset, 
ControlNetDataset) - info += dedent(f"""\ - [Dataset {i}] - batch_size: {dataset.batch_size} - resolution: {(dataset.width, dataset.height)} - enable_bucket: {dataset.enable_bucket} - """) - - if dataset.enable_bucket: - info += indent(dedent(f"""\ - min_bucket_reso: {dataset.min_bucket_reso} - max_bucket_reso: {dataset.max_bucket_reso} - bucket_reso_steps: {dataset.bucket_reso_steps} - bucket_no_upscale: {dataset.bucket_no_upscale} - \n"""), " ") + val_datasets:List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] + for dataset_blueprint in dataset_group_blueprint.datasets: + if dataset_blueprint.params.validation_split <= 0.0: + continue + if dataset_blueprint.is_controlnet: + subset_klass = ControlNetSubset + dataset_klass = ControlNetDataset + elif dataset_blueprint.is_dreambooth: + subset_klass = DreamBoothSubset + dataset_klass = DreamBoothDataset else: - info += "\n" - - for j, subset in enumerate(dataset.subsets): - info += indent(dedent(f"""\ - [Subset {j} of Dataset {i}] - image_dir: "{subset.image_dir}" - image_count: {subset.img_count} - num_repeats: {subset.num_repeats} - shuffle_caption: {subset.shuffle_caption} - keep_tokens: {subset.keep_tokens} - caption_dropout_rate: {subset.caption_dropout_rate} - caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} - caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} - caption_prefix: {subset.caption_prefix} - caption_suffix: {subset.caption_suffix} - color_aug: {subset.color_aug} - flip_aug: {subset.flip_aug} - face_crop_aug_range: {subset.face_crop_aug_range} - random_crop: {subset.random_crop} - token_warmup_min: {subset.token_warmup_min}, - token_warmup_step: {subset.token_warmup_step}, - """), " ") - - if is_dreambooth: + subset_klass = FineTuningSubset + dataset_klass = FineTuningDataset + + subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] + dataset = dataset_klass(subsets=subsets, is_train=False, **asdict(dataset_blueprint.params)) + val_datasets.append(dataset) + + # print info + def print_info(_datasets): + info = "" + for i, dataset in enumerate(_datasets): + is_dreambooth = isinstance(dataset, DreamBoothDataset) + is_controlnet = isinstance(dataset, ControlNetDataset) + info += dedent(f"""\ + [Dataset {i}] + batch_size: {dataset.batch_size} + resolution: {(dataset.width, dataset.height)} + enable_bucket: {dataset.enable_bucket} + """) + + if dataset.enable_bucket: info += indent(dedent(f"""\ - is_reg: {subset.is_reg} - class_tokens: {subset.class_tokens} - caption_extension: {subset.caption_extension} - \n"""), " ") - elif not is_controlnet: + min_bucket_reso: {dataset.min_bucket_reso} + max_bucket_reso: {dataset.max_bucket_reso} + bucket_reso_steps: {dataset.bucket_reso_steps} + bucket_no_upscale: {dataset.bucket_no_upscale} + \n"""), " ") + else: + info += "\n" + + for j, subset in enumerate(dataset.subsets): info += indent(dedent(f"""\ - metadata_file: {subset.metadata_file} - \n"""), " ") - - print(info) + [Subset {j} of Dataset {i}] + image_dir: "{subset.image_dir}" + image_count: {subset.img_count} + num_repeats: {subset.num_repeats} + shuffle_caption: {subset.shuffle_caption} + keep_tokens: {subset.keep_tokens} + caption_dropout_rate: {subset.caption_dropout_rate} + caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} + caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} + caption_prefix: {subset.caption_prefix} + caption_suffix: {subset.caption_suffix} + color_aug: 
{subset.color_aug} + flip_aug: {subset.flip_aug} + face_crop_aug_range: {subset.face_crop_aug_range} + random_crop: {subset.random_crop} + token_warmup_min: {subset.token_warmup_min}, + token_warmup_step: {subset.token_warmup_step}, + """), " ") + + if is_dreambooth: + info += indent(dedent(f"""\ + is_reg: {subset.is_reg} + class_tokens: {subset.class_tokens} + caption_extension: {subset.caption_extension} + \n"""), " ") + elif not is_controlnet: + info += indent(dedent(f"""\ + metadata_file: {subset.metadata_file} + \n"""), " ") + + print(info) + + print_info(datasets) + + if len(val_datasets) > 0: + print("Validation dataset") + print_info(val_datasets) # make buckets first because it determines the length of dataset # and set the same seed for all datasets @@ -494,7 +523,15 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset.make_buckets() dataset.set_seed(seed) - return DatasetGroup(datasets) + for i, dataset in enumerate(val_datasets): + print(f"[Validation Dataset {i}]") + dataset.make_buckets() + dataset.set_seed(seed) + + return ( + DatasetGroup(datasets), + DatasetGroup(val_datasets) if val_datasets else None + ) def generate_dreambooth_subsets_config_by_subdirs(train_data_dir: Optional[str] = None, reg_data_dir: Optional[str] = None): diff --git a/library/train_util.py b/library/train_util.py index e26f39799..ba37ec13d 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -123,6 +123,22 @@ TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX = "_te_outputs.npz" +def split_train_val(paths, is_train, validation_split, validation_seed): + if validation_seed is not None: + print(f"Using validation seed: {validation_seed}") + prevstate = random.getstate() + random.seed(validation_seed) + random.shuffle(paths) + random.setstate(prevstate) + else: + random.shuffle(paths) + + if is_train: + return paths[0:math.ceil(len(paths) * (1 - validation_split))] + else: + return paths[len(paths) - round(len(paths) * validation_split):] + + class ImageInfo: def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, absolute_path: str) -> None: self.image_key: str = image_key @@ -1314,6 +1330,7 @@ class DreamBoothDataset(BaseDataset): def __init__( self, subsets: Sequence[DreamBoothSubset], + is_train: bool, batch_size: int, tokenizer, max_token_length, @@ -1324,12 +1341,18 @@ def __init__( bucket_reso_steps: int, bucket_no_upscale: bool, prior_loss_weight: float, + validation_split: float, + validation_seed: Optional[int], debug_dataset, ) -> None: super().__init__(tokenizer, max_token_length, resolution, debug_dataset) assert resolution is not None, f"resolution is required / resolution(解像度)指定は必須です" + self.is_train = is_train + self.validation_split = validation_split + self.validation_seed = validation_seed + self.batch_size = batch_size self.size = min(self.width, self.height) # 短いほう self.prior_loss_weight = prior_loss_weight @@ -1382,6 +1405,9 @@ def load_dreambooth_dir(subset: DreamBoothSubset): return [], [] img_paths = glob_images(subset.image_dir, "*") + + if self.validation_split > 0.0: + img_paths = split_train_val(img_paths, self.is_train, self.validation_split, self.validation_seed) print(f"found directory {subset.image_dir} contains {len(img_paths)} image files") # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う diff --git a/train_network.py b/train_network.py index 967c95fb4..97ecfe7be 100644 --- a/train_network.py +++ b/train_network.py @@ -189,10 +189,11 @@ def train(self, args): } blueprint = blueprint_generator.generate(user_config, args, 
tokenizer=tokenizer) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: # use arbitrary dataset class train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer) + val_dataset_group = None # placeholder until validation dataset supported for arbitrary current_epoch = Value("i", 0) current_step = Value("i", 0) @@ -212,6 +213,10 @@ def train(self, args): assert ( train_dataset_group.is_latent_cacheable() ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + if val_dataset_group is not None: + assert ( + val_dataset_group.is_latent_cacheable() + ), "when caching validation latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" self.assert_extra_args(args, train_dataset_group) @@ -264,6 +269,9 @@ def train(self, args): vae.eval() with torch.no_grad(): train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + if val_dataset_group is not None: + print("Cache validation latents...") + val_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) vae.to("cpu") if torch.cuda.is_available(): torch.cuda.empty_cache() @@ -345,61 +353,8 @@ def train(self, args): # DataLoaderのプロセス数:0はメインプロセスになる n_workers = min(args.max_data_loader_n_workers, os.cpu_count() - 1) # cpu_count-1 ただし最大で指定された数まで - def get_indices_without_reg(dataset: torch.utils.data.Dataset): - return [id for id, (key, item) in enumerate(dataset.image_data.items()) if item.is_reg is False] - - from typing import Sequence, Union - from torch._utils import _accumulate - import warnings - from torch.utils.data.dataset import Subset - - def random_split(dataset: torch.utils.data.Dataset, lengths: Sequence[Union[int, float]]): - indices = get_indices_without_reg(dataset) - random.shuffle(indices) - - subset_lengths = [] - - for i, frac in enumerate(lengths): - if frac < 0 or frac > 1: - raise ValueError(f"Fraction at index {i} is not between 0 and 1") - n_items_in_split = int(math.floor(len(indices) * frac)) - subset_lengths.append(n_items_in_split) - - remainder = len(indices) - sum(subset_lengths) - - for i in range(remainder): - idx_to_add_at = i % len(subset_lengths) - subset_lengths[idx_to_add_at] += 1 - - lengths = subset_lengths - for i, length in enumerate(lengths): - if length == 0: - warnings.warn(f"Length of split at index {i} is 0. 
" - f"This might result in an empty dataset.") - - if sum(lengths) != len(indices): - raise ValueError("Sum of input lengths does not equal the length of the input dataset!") - - return [Subset(dataset, indices[offset - length: offset]) for offset, length in zip(_accumulate(lengths), lengths)] - - - if args.validation_ratio > 0.0: - train_ratio = 1 - args.validation_ratio - validation_ratio = args.validation_ratio - train, val = random_split( - train_dataset_group, - [train_ratio, validation_ratio] - ) - print(f"split dataset by ratio: train {train_ratio}, validation {validation_ratio}") - print(f"train images: {len(train)}, validation images: {len(val)}") - else: - train = train_dataset_group - val = [] - - - train_dataloader = torch.utils.data.DataLoader( - train, + train_dataset_group, batch_size=1, shuffle=True, collate_fn=collator, @@ -408,7 +363,7 @@ def random_split(dataset: torch.utils.data.Dataset, lengths: Sequence[Union[int, ) val_dataloader = torch.utils.data.DataLoader( - val, + val_dataset_group if val_dataset_group is not None else [], shuffle=False, batch_size=1, collate_fn=collator, From a93c524b3a0e5c80a58c1317211dec93b6c137a7 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 5 Nov 2023 02:07:39 -0500 Subject: [PATCH 004/582] Update args to validation_seed and validation_split --- train_network.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/train_network.py b/train_network.py index 97ecfe7be..f9e5debdb 100644 --- a/train_network.py +++ b/train_network.py @@ -1099,12 +1099,17 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) - parser.add_argument( - "--validation_ratio", + "--validation_seed", + type=int, + default=None, + help="Validation seed" + ) + parser.add_argument( + "--validation_split", type=float, default=0.0, - help="Ratio for validation images out of the training dataset" + help="Split for validation images out of the training dataset" ) return parser From c89252101e8e8bd74cb3ab09ae33b548fd828e15 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 5 Nov 2023 16:27:36 -0500 Subject: [PATCH 005/582] Add process_batch for train_network --- train_network.py | 211 ++++++++++++++++++----------------------------- 1 file changed, 82 insertions(+), 129 deletions(-) diff --git a/train_network.py b/train_network.py index f9e5debdb..387b94b1c 100644 --- a/train_network.py +++ b/train_network.py @@ -130,6 +130,75 @@ def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_cond def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet): train_util.sample_images(accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet) + def process_batch(self, batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, train_text_encoder=True): + with torch.no_grad(): + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device) + else: + # latentに変換 + latents = vae.encode(batch["images"].to(accelerator.device, dtype=vae_dtype)).latent_dist.sample() + + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(latents)): + accelerator.print("NaN found in latents, replacing with zeros") + latents = torch.where(torch.isnan(latents), torch.zeros_like(latents), latents) + latents = latents * self.vae_scale_factor + b_size = 
latents.shape[0] + + with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): + # Get the text embedding for conditioning + if args.weighted_captions: + text_encoder_conds = get_weighted_text_embeddings( + tokenizers[0], + text_encoders[0], + batch["captions"], + accelerator.device, + args.max_token_length // 75 if args.max_token_length else 1, + clip_skip=args.clip_skip, + ) + else: + text_encoder_conds = self.get_text_cond( + args, accelerator, batch, tokenizers, text_encoders, weight_dtype + ) + + # Sample noise, sample a random timestep for each image, and add noise to the latents, + # with noise offset and/or multires noise if specified + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents + ) + + # Predict the noise residual + with torch.set_grad_enabled(is_train), accelerator.autocast(): + noise_pred = self.call_unet( + args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype + ) + + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise + + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + loss = loss.mean([1, 2, 3]) + + loss_weights = batch["loss_weights"].to(accelerator.device) # 各sampleごとのweight + loss = loss * loss_weights + + if args.min_snr_gamma: + loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma) + if args.scale_v_pred_loss_like_noise_pred: + loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) + if args.v_pred_like_loss: + loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss) + if args.debiased_estimation_loss: + loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) + + loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + + return loss + + def train(self, args): session_id = random.randint(0, 2**32) training_started_at = time.time() @@ -777,71 +846,8 @@ def remove_model(old_ckpt_name): current_step.value = global_step with accelerator.accumulate(network): on_step_start(text_encoder, unet) - - with torch.no_grad(): - if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) - else: - # latentに変換 - latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample() - - # NaNが含まれていれば警告を表示し0に置き換える - if torch.any(torch.isnan(latents)): - accelerator.print("NaN found in latents, replacing with zeros") - latents = torch.where(torch.isnan(latents), torch.zeros_like(latents), latents) - latents = latents * self.vae_scale_factor - b_size = latents.shape[0] - - with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): - # Get the text embedding for conditioning - if args.weighted_captions: - text_encoder_conds = get_weighted_text_embeddings( - tokenizer, - text_encoder, - batch["captions"], - accelerator.device, - args.max_token_length // 75 if args.max_token_length else 1, - clip_skip=args.clip_skip, - ) - else: - text_encoder_conds = self.get_text_cond( - args, accelerator, batch, tokenizers, text_encoders, weight_dtype - ) - - # Sample noise, sample a random timestep for each image, and add noise to the latents, - # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents - ) - - # Predict the noise residual - with accelerator.autocast(): - 
noise_pred = self.call_unet( - args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype - ) - - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - target = noise - - loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") - loss = loss.mean([1, 2, 3]) - - loss_weights = batch["loss_weights"] # 各sampleごとのweight - loss = loss * loss_weights - - if args.min_snr_gamma: - loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma) - if args.scale_v_pred_loss_like_noise_pred: - loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) - if args.v_pred_like_loss: - loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss) - if args.debiased_estimation_loss: - loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) - - loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + is_train = True + loss = self.process_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, train_text_encoder=train_text_encoder) accelerator.backward(loss) if accelerator.sync_gradients and args.max_grad_norm != 0.0: @@ -893,7 +899,7 @@ def remove_model(old_ckpt_name): if args.logging_dir is not None: logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) - accelerator.log(logs) + accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break @@ -905,80 +911,27 @@ def remove_model(old_ckpt_name): with torch.no_grad(): for val_step, batch in enumerate(val_dataloader): - if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) - else: - # latentに変換 - latents = vae.encode(batch["images"].to(device=accelerator.device, dtype=vae_dtype)).latent_dist.sample() - - # NaNが含まれていれば警告を表示し0に置き換える - if torch.any(torch.isnan(latents)): - accelerator.print("NaN found in latents, replacing with zeros") - latents = torch.where(torch.isnan(latents), torch.zeros_like(latents), latents) - latents = latents * self.vae_scale_factor - b_size = latents.shape[0] - - # Get the text embedding for conditioning - if args.weighted_captions: - text_encoder_conds = get_weighted_text_embeddings( - tokenizer, - text_encoder, - batch["captions"], - accelerator.device, - args.max_token_length // 75 if args.max_token_length else 1, - clip_skip=args.clip_skip, - ) - else: - text_encoder_conds = self.get_text_cond( - args, accelerator, batch, tokenizers, text_encoders, weight_dtype - ) - - # Sample noise, sample a random timestep for each image, and add noise to the latents, - # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents - ) - - # Predict the noise residual - with accelerator.autocast(): - noise_pred = self.call_unet( - args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype - ) - - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - target = noise - - loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") - loss = loss.mean([1, 2, 3]) - - loss_weights = batch["loss_weights"].to(accelerator.device) # 各sampleごとのweight - - loss = loss * loss_weights - - if 
args.min_snr_gamma: - loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma) - if args.scale_v_pred_loss_like_noise_pred: - loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) - if args.v_pred_like_loss: - loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss) - - loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + is_train = False + loss = self.process_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) current_loss = loss.detach().item() val_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) + if args.logging_dir is not None: + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/validation_current": current_loss} + accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) + if len(val_dataloader) > 0: if args.logging_dir is not None: avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/validation": avr_loss} + logs = {"loss/validation_average": avr_loss} accelerator.log(logs, step=epoch + 1) if args.logging_dir is not None: - logs = {"loss/epoch": loss_recorder.moving_average} + # logs = {"loss/epoch": loss_recorder.moving_average} + logs = {"loss/epoch_average": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) accelerator.wait_for_everyone() From e545fdfd9affabff83f8bd2e7680369bb34dd301 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 5 Nov 2023 16:56:36 -0500 Subject: [PATCH 006/582] Removed/cleanup a line --- train_network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/train_network.py b/train_network.py index 387b94b1c..a4125e9f2 100644 --- a/train_network.py +++ b/train_network.py @@ -930,7 +930,6 @@ def remove_model(old_ckpt_name): if args.logging_dir is not None: - # logs = {"loss/epoch": loss_recorder.moving_average} logs = {"loss/epoch_average": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) From 9c591bdb12ce663b3fe9e91c0963d2cf71461bad Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 5 Nov 2023 16:58:20 -0500 Subject: [PATCH 007/582] Remove unnecessary subset line from collate --- library/train_util.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index ba37ec13d..1979207b0 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4762,10 +4762,6 @@ def __call__(self, examples): else: dataset = self.dataset - # If we split a dataset we will get a Subset - if type(dataset) is torch.utils.data.Subset: - dataset = dataset.dataset - # set epoch and step dataset.set_current_epoch(self.current_epoch.value) dataset.set_current_step(self.current_step.value) From 569ca72fc4cda2f4ce30e43b1c62989e79e3c3b3 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 7 Nov 2023 11:59:30 -0500 Subject: [PATCH 008/582] Set grad enabled if is_train and train_text_encoder We only want to be enabling grad if we are training. 
--- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index a4125e9f2..edd3ff944 100644 --- a/train_network.py +++ b/train_network.py @@ -145,7 +145,7 @@ def process_batch(self, batch, is_train, tokenizers, text_encoders, unet, vae, n latents = latents * self.vae_scale_factor b_size = latents.shape[0] - with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): + with torch.set_grad_enabled(is_train and train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: text_encoder_conds = get_weighted_text_embeddings( From b558a5b73d07a7e15ad90d9d15c2b55c5d2b3d61 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 10 Mar 2024 04:37:16 +0800 Subject: [PATCH 009/582] val --- library/config_util.py | 176 ++++++++++++++++++++++------------------- library/train_util.py | 22 ++++++ train_network.py | 135 ++++++++++++++++++++++++++++--- 3 files changed, 241 insertions(+), 92 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index fc4b36175..17fc17818 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -98,7 +98,8 @@ class BaseDatasetParams: resolution: Optional[Tuple[int, int]] = None network_multiplier: float = 1.0 debug_dataset: bool = False - + validation_seed: Optional[int] = None + validation_split: float = 0.0 @dataclass class DreamBoothDatasetParams(BaseDatasetParams): @@ -109,8 +110,7 @@ class DreamBoothDatasetParams(BaseDatasetParams): bucket_reso_steps: int = 64 bucket_no_upscale: bool = False prior_loss_weight: float = 1.0 - - + @dataclass class FineTuningDatasetParams(BaseDatasetParams): batch_size: int = 1 @@ -222,8 +222,11 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "enable_bucket": bool, "max_bucket_reso": int, "min_bucket_reso": int, + "validation_seed": int, + "validation_split": float, "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int), "network_multiplier": float, + } # options handled by argparse but not handled by user config @@ -460,100 +463,107 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset_klass = FineTuningDataset subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] - dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params)) + dataset = dataset_klass(subsets=subsets, is_train=True, **asdict(dataset_blueprint.params)) datasets.append(dataset) - # print info - info = "" - for i, dataset in enumerate(datasets): - is_dreambooth = isinstance(dataset, DreamBoothDataset) - is_controlnet = isinstance(dataset, ControlNetDataset) - info += dedent( - f"""\ - [Dataset {i}] - batch_size: {dataset.batch_size} - resolution: {(dataset.width, dataset.height)} - enable_bucket: {dataset.enable_bucket} - network_multiplier: {dataset.network_multiplier} - """ - ) + val_datasets:List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] + + for dataset_blueprint in dataset_group_blueprint.datasets: + if dataset_blueprint.params.validation_split <= 0.0: + continue + if dataset_blueprint.is_controlnet: + subset_klass = ControlNetSubset + dataset_klass = ControlNetDataset + elif dataset_blueprint.is_dreambooth: + subset_klass = DreamBoothSubset + dataset_klass = DreamBoothDataset + else: + subset_klass = FineTuningSubset + dataset_klass = FineTuningDataset + + subsets = 
[subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] + dataset = dataset_klass(subsets=subsets, is_train=False, **asdict(dataset_blueprint.params)) + val_datasets.append(dataset) + + def print_info(_datasets): + info = "" + for i, dataset in enumerate(_datasets): + is_dreambooth = isinstance(dataset, DreamBoothDataset) + is_controlnet = isinstance(dataset, ControlNetDataset) + info += dedent(f"""\ + [Dataset {i}] + batch_size: {dataset.batch_size} + resolution: {(dataset.width, dataset.height)} + enable_bucket: {dataset.enable_bucket} + """) if dataset.enable_bucket: - info += indent( - dedent( - f"""\ - min_bucket_reso: {dataset.min_bucket_reso} - max_bucket_reso: {dataset.max_bucket_reso} - bucket_reso_steps: {dataset.bucket_reso_steps} - bucket_no_upscale: {dataset.bucket_no_upscale} - \n""" - ), - " ", - ) + info += indent(dedent(f"""\ + min_bucket_reso: {dataset.min_bucket_reso} + max_bucket_reso: {dataset.max_bucket_reso} + bucket_reso_steps: {dataset.bucket_reso_steps} + bucket_no_upscale: {dataset.bucket_no_upscale} + \n"""), " ") else: info += "\n" - for j, subset in enumerate(dataset.subsets): - info += indent( - dedent( - f"""\ - [Subset {j} of Dataset {i}] - image_dir: "{subset.image_dir}" - image_count: {subset.img_count} - num_repeats: {subset.num_repeats} - shuffle_caption: {subset.shuffle_caption} - keep_tokens: {subset.keep_tokens} - keep_tokens_separator: {subset.keep_tokens_separator} - caption_dropout_rate: {subset.caption_dropout_rate} - caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} - caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} - caption_prefix: {subset.caption_prefix} - caption_suffix: {subset.caption_suffix} - color_aug: {subset.color_aug} - flip_aug: {subset.flip_aug} - face_crop_aug_range: {subset.face_crop_aug_range} - random_crop: {subset.random_crop} - token_warmup_min: {subset.token_warmup_min}, - token_warmup_step: {subset.token_warmup_step}, - """ - ), - " ", - ) - - if is_dreambooth: - info += indent( - dedent( - f"""\ - is_reg: {subset.is_reg} - class_tokens: {subset.class_tokens} - caption_extension: {subset.caption_extension} - \n""" - ), - " ", - ) - elif not is_controlnet: - info += indent( - dedent( - f"""\ - metadata_file: {subset.metadata_file} - \n""" - ), - " ", - ) - - logger.info(f'{info}') - + info += indent(dedent(f"""\ + [Subset {j} of Dataset {i}] + image_dir: "{subset.image_dir}" + image_count: {subset.img_count} + num_repeats: {subset.num_repeats} + shuffle_caption: {subset.shuffle_caption} + keep_tokens: {subset.keep_tokens} + caption_dropout_rate: {subset.caption_dropout_rate} + caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} + caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} + caption_prefix: {subset.caption_prefix} + caption_suffix: {subset.caption_suffix} + color_aug: {subset.color_aug} + flip_aug: {subset.flip_aug} + face_crop_aug_range: {subset.face_crop_aug_range} + random_crop: {subset.random_crop} + token_warmup_min: {subset.token_warmup_min}, + token_warmup_step: {subset.token_warmup_step}, + """), " ") + + if is_dreambooth: + info += indent(dedent(f"""\ + is_reg: {subset.is_reg} + class_tokens: {subset.class_tokens} + caption_extension: {subset.caption_extension} + \n"""), " ") + elif not is_controlnet: + info += indent(dedent(f"""\ + metadata_file: {subset.metadata_file} + \n"""), " ") + + print(info) + + print_info(datasets) + + if len(val_datasets) > 0: + print("Validation dataset") + 
print_info(val_datasets) + # make buckets first because it determines the length of dataset # and set the same seed for all datasets seed = random.randint(0, 2**31) # actual seed is seed + epoch_no for i, dataset in enumerate(datasets): - logger.info(f"[Dataset {i}]") + print(f"[Dataset {i}]") + dataset.make_buckets() + dataset.set_seed(seed) + + for i, dataset in enumerate(val_datasets): + print(f"[Validation Dataset {i}]") dataset.make_buckets() dataset.set_seed(seed) - return DatasetGroup(datasets) - - + return ( + DatasetGroup(datasets), + DatasetGroup(val_datasets) if val_datasets else None + ) + def generate_dreambooth_subsets_config_by_subdirs(train_data_dir: Optional[str] = None, reg_data_dir: Optional[str] = None): def extract_dreambooth_params(name: str) -> Tuple[int, str]: tokens = name.split("_") diff --git a/library/train_util.py b/library/train_util.py index d2b69edb5..753539e04 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -134,6 +134,20 @@ TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX = "_te_outputs.npz" +def split_train_val(paths, is_train, validation_split, validation_seed): + if validation_seed is not None: + print(f"Using validation seed: {validation_seed}") + prevstate = random.getstate() + random.seed(validation_seed) + random.shuffle(paths) + random.setstate(prevstate) + else: + random.shuffle(paths) + + if is_train: + return paths[0:math.ceil(len(paths) * (1 - validation_split))] + else: + return paths[len(paths) - round(len(paths) * validation_split):] class ImageInfo: def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, absolute_path: str) -> None: @@ -1360,6 +1374,7 @@ class DreamBoothDataset(BaseDataset): def __init__( self, subsets: Sequence[DreamBoothSubset], + is_train: bool, batch_size: int, tokenizer, max_token_length, @@ -1371,12 +1386,17 @@ def __init__( bucket_reso_steps: int, bucket_no_upscale: bool, prior_loss_weight: float, + validation_split: float, + validation_seed: Optional[int], debug_dataset: bool, ) -> None: super().__init__(tokenizer, max_token_length, resolution, network_multiplier, debug_dataset) assert resolution is not None, f"resolution is required / resolution(解像度)指定は必須です" + self.is_train = is_train + self.validation_split = validation_split + self.validation_seed = validation_seed self.batch_size = batch_size self.size = min(self.width, self.height) # 短いほう self.prior_loss_weight = prior_loss_weight @@ -1429,6 +1449,8 @@ def load_dreambooth_dir(subset: DreamBoothSubset): return [], [] img_paths = glob_images(subset.image_dir, "*") + if self.validation_split > 0.0: + img_paths = split_train_val(img_paths, self.is_train, self.validation_split, self.validation_seed) logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う diff --git a/train_network.py b/train_network.py index e0fa69458..db7000e82 100644 --- a/train_network.py +++ b/train_network.py @@ -136,6 +136,67 @@ def all_reduce_network(self, accelerator, network): def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet): train_util.sample_images(accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet) + def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, train_text_encoder=True): + + total_loss = 0.0 + timesteps_list = [10, 350, 500, 650, 990] + + with torch.no_grad(): + if "latents" in batch and batch["latents"] is not 
None: + latents = batch["latents"].to(accelerator.device) + else: + # latentに変換 + latents = vae.encode(batch["images"].to(accelerator.device, dtype=vae_dtype)).latent_dist.sample() + + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(latents)): + accelerator.print("NaN found in latents, replacing with zeros") + latents = torch.where(torch.isnan(latents), torch.zeros_like(latents), latents) + latents = latents * self.vae_scale_factor + b_size = latents.shape[0] + + with torch.set_grad_enabled(is_train and train_text_encoder), accelerator.autocast(): + # Get the text embedding for conditioning + if args.weighted_captions: + text_encoder_conds = get_weighted_text_embeddings( + tokenizers[0], + text_encoders[0], + batch["captions"], + accelerator.device, + args.max_token_length // 75 if args.max_token_length else 1, + clip_skip=args.clip_skip, + ) + else: + text_encoder_conds = self.get_text_cond( + args, accelerator, batch, tokenizers, text_encoders, weight_dtype + ) + + # Sample noise, sample a random timestep for each image, and add noise to the latents, + # with noise offset and/or multires noise if specified + noise, noisy_latents, _ = train_util.get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents + ) + for timesteps in timesteps_list: + # Predict the noise residual + with torch.set_grad_enabled(is_train), accelerator.autocast(): + noise_pred = self.call_unet( + args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype + ) + + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise + + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + loss = loss.mean([1, 2, 3]) + loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + total_loss += loss + + average_loss = total_loss / len(timesteps_list) + return average_loss + def train(self, args): session_id = random.randint(0, 2**32) training_started_at = time.time() @@ -196,11 +257,12 @@ def train(self, args): } blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: # use arbitrary dataset class train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer) - + val_dataset_group = None # placeholder until validation dataset supported for arbitrary + current_epoch = Value("i", 0) current_step = Value("i", 0) ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None @@ -219,7 +281,11 @@ def train(self, args): assert ( train_dataset_group.is_latent_cacheable() ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" - + if val_dataset_group is not None: + assert ( + val_dataset_group.is_latent_cacheable() + ), "when caching validation latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + self.assert_extra_args(args, train_dataset_group) # acceleratorを準備する @@ -271,6 +337,9 @@ def train(self, args): vae.eval() with torch.no_grad(): train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + if val_dataset_group is not None: + print("Cache validation latents...") + val_dataset_group.cache_latents(vae, args.vae_batch_size, 
args.cache_latents_to_disk, accelerator.is_main_process) vae.to("cpu") clean_memory_on_device(accelerator.device) @@ -360,6 +429,15 @@ def train(self, args): num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) + + val_dataloader = torch.utils.data.DataLoader( + val_dataset_group if val_dataset_group is not None else [], + shuffle=False, + batch_size=1, + collate_fn=collator, + num_workers=n_workers, + persistent_workers=args.persistent_data_loader_workers, + ) # 学習ステップ数を計算する if args.max_train_epochs is not None: @@ -707,6 +785,8 @@ def train(self, args): ) loss_recorder = train_util.LossRecorder() + val_loss_recorder = train_util.LossRecorder() + del train_dataset_group # callback for step start @@ -755,7 +835,8 @@ def remove_model(old_ckpt_name): current_step.value = global_step with accelerator.accumulate(network): on_step_start(text_encoder, unet) - + + is_train = True with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device) @@ -780,7 +861,7 @@ def remove_model(old_ckpt_name): # print(f"set multiplier: {multipliers}") accelerator.unwrap_model(network).set_multiplier(multipliers) - with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): + with torch.set_grad_enabled(is_train and train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: text_encoder_conds = get_weighted_text_embeddings( @@ -810,7 +891,7 @@ def remove_model(old_ckpt_name): t.requires_grad_(True) # Predict the noise residual - with accelerator.autocast(): + with torch.set_grad_enabled(is_train), accelerator.autocast(): noise_pred = self.call_unet( args, accelerator, @@ -844,7 +925,7 @@ def remove_model(old_ckpt_name): loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし - + accelerator.backward(loss) if accelerator.sync_gradients: self.all_reduce_network(accelerator, network) # sync DDP grad manually @@ -898,14 +979,38 @@ def remove_model(old_ckpt_name): if args.logging_dir is not None: logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) accelerator.log(logs, step=global_step) - + + if global_step % 25 == 0: + if len(val_dataloader) > 0: + print("Validating バリデーション処理...") + + with torch.no_grad(): + val_dataloader_iter = iter(val_dataloader) + batch = next(val_dataloader_iter) + is_train = False + loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + + current_loss = loss.detach().item() + val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/validation_current": current_loss} + accelerator.log(logs, step=global_step) + if global_step >= args.max_train_steps: break if args.logging_dir is not None: - logs = {"loss/epoch": loss_recorder.moving_average} + logs = {"loss/epoch_average": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) + if len(val_dataloader) > 0: + if args.logging_dir is not None: + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/validation_epoch_average": avr_loss} + accelerator.log(logs, step=epoch + 1) + accelerator.wait_for_everyone() # 指定エポックごとにモデルを保存 @@ -1045,6 +1150,18 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 
VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) + parser.add_argument( + "--validation_seed", + type=int, + default=None, + help="Validation seed" + ) + parser.add_argument( + "--validation_split", + type=float, + default=0.0, + help="Split for validation images out of the training dataset" + ) return parser From 78cfb01922ff97bbc62ff12a4d69eaaa2d89d7c1 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 10 Mar 2024 18:55:48 +0800 Subject: [PATCH 010/582] improve --- library/config_util.py | 260 +++++++++++++++++++++++++++++------------ train_network.py | 67 +++++++---- 2 files changed, 234 insertions(+), 93 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 17fc17818..d198cee35 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -41,12 +41,17 @@ DatasetGroup, ) from .utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) + def add_config_arguments(parser: argparse.ArgumentParser): - parser.add_argument("--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル") + parser.add_argument( + "--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル" + ) # TODO: inherit Params class in Subset, Dataset @@ -60,6 +65,8 @@ class BaseSubsetParams: caption_separator: str = (",",) keep_tokens: int = 0 keep_tokens_separator: str = (None,) + secondary_separator: Optional[str] = None + enable_wildcard: bool = False color_aug: bool = False flip_aug: bool = False face_crop_aug_range: Optional[Tuple[float, float]] = None @@ -181,6 +188,8 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "shuffle_caption": bool, "keep_tokens": int, "keep_tokens_separator": str, + "secondary_separator": str, + "enable_wildcard": bool, "token_warmup_min": int, "token_warmup_step": Any(float, int), "caption_prefix": str, @@ -247,9 +256,10 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] } def __init__(self, support_dreambooth: bool, support_finetuning: bool, support_controlnet: bool, support_dropout: bool) -> None: - assert ( - support_dreambooth or support_finetuning or support_controlnet - ), "Neither DreamBooth mode nor fine tuning mode specified. Please specify one mode or more. / DreamBooth モードか fine tuning モードのどちらも指定されていません。1つ以上指定してください。" + assert support_dreambooth or support_finetuning or support_controlnet, ( + "Neither DreamBooth mode nor fine tuning mode nor controlnet mode specified. Please specify one mode or more." + + " / DreamBooth モードか fine tuning モードか controlnet モードのどれも指定されていません。1つ以上指定してください。" + ) self.db_subset_schema = self.__merge_dict( self.SUBSET_ASCENDABLE_SCHEMA, @@ -361,7 +371,9 @@ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> return self.argparse_config_validator(argparse_namespace) except MultipleInvalid: # XXX: this should be a bug - logger.error("Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。") + logger.error( + "Invalid cmdline parsed arguments. This should be a bug. 
/ コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。" + ) raise # NOTE: value would be overwritten by latter dict if there is already the same key @@ -447,7 +459,6 @@ def search_value(key: str, fallbacks: Sequence[dict], default_value=None): return default_value - def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint): datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] @@ -467,7 +478,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu datasets.append(dataset) val_datasets:List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] - + for dataset_blueprint in dataset_group_blueprint.datasets: if dataset_blueprint.params.validation_split <= 0.0: continue @@ -485,75 +496,174 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset = dataset_klass(subsets=subsets, is_train=False, **asdict(dataset_blueprint.params)) val_datasets.append(dataset) - def print_info(_datasets): - info = "" - for i, dataset in enumerate(_datasets): - is_dreambooth = isinstance(dataset, DreamBoothDataset) - is_controlnet = isinstance(dataset, ControlNetDataset) - info += dedent(f"""\ - [Dataset {i}] - batch_size: {dataset.batch_size} - resolution: {(dataset.width, dataset.height)} - enable_bucket: {dataset.enable_bucket} - """) + # print info + info = "" + for i, dataset in enumerate(datasets): + is_dreambooth = isinstance(dataset, DreamBoothDataset) + is_controlnet = isinstance(dataset, ControlNetDataset) + info += dedent( + f"""\ + [Dataset {i}] + batch_size: {dataset.batch_size} + resolution: {(dataset.width, dataset.height)} + enable_bucket: {dataset.enable_bucket} + network_multiplier: {dataset.network_multiplier} + """ + ) if dataset.enable_bucket: - info += indent(dedent(f"""\ - min_bucket_reso: {dataset.min_bucket_reso} - max_bucket_reso: {dataset.max_bucket_reso} - bucket_reso_steps: {dataset.bucket_reso_steps} - bucket_no_upscale: {dataset.bucket_no_upscale} - \n"""), " ") + info += indent( + dedent( + f"""\ + min_bucket_reso: {dataset.min_bucket_reso} + max_bucket_reso: {dataset.max_bucket_reso} + bucket_reso_steps: {dataset.bucket_reso_steps} + bucket_no_upscale: {dataset.bucket_no_upscale} + \n""" + ), + " ", + ) else: info += "\n" + for j, subset in enumerate(dataset.subsets): - info += indent(dedent(f"""\ - [Subset {j} of Dataset {i}] - image_dir: "{subset.image_dir}" - image_count: {subset.img_count} - num_repeats: {subset.num_repeats} - shuffle_caption: {subset.shuffle_caption} - keep_tokens: {subset.keep_tokens} - caption_dropout_rate: {subset.caption_dropout_rate} - caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} - caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} - caption_prefix: {subset.caption_prefix} - caption_suffix: {subset.caption_suffix} - color_aug: {subset.color_aug} - flip_aug: {subset.flip_aug} - face_crop_aug_range: {subset.face_crop_aug_range} - random_crop: {subset.random_crop} - token_warmup_min: {subset.token_warmup_min}, - token_warmup_step: {subset.token_warmup_step}, - """), " ") - - if is_dreambooth: - info += indent(dedent(f"""\ - is_reg: {subset.is_reg} - class_tokens: {subset.class_tokens} - caption_extension: {subset.caption_extension} - \n"""), " ") - elif not is_controlnet: - info += indent(dedent(f"""\ - metadata_file: {subset.metadata_file} - \n"""), " ") - - print(info) - - print_info(datasets) - - if len(val_datasets) > 0: - print("Validation dataset") - print_info(val_datasets) - + info += 
indent( + dedent( + f"""\ + [Subset {j} of Dataset {i}] + image_dir: "{subset.image_dir}" + image_count: {subset.img_count} + num_repeats: {subset.num_repeats} + shuffle_caption: {subset.shuffle_caption} + keep_tokens: {subset.keep_tokens} + keep_tokens_separator: {subset.keep_tokens_separator} + caption_dropout_rate: {subset.caption_dropout_rate} + caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} + caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} + caption_prefix: {subset.caption_prefix} + caption_suffix: {subset.caption_suffix} + color_aug: {subset.color_aug} + flip_aug: {subset.flip_aug} + face_crop_aug_range: {subset.face_crop_aug_range} + random_crop: {subset.random_crop} + token_warmup_min: {subset.token_warmup_min}, + token_warmup_step: {subset.token_warmup_step}, + """ + ), + " ", + ) + + if is_dreambooth: + info += indent( + dedent( + f"""\ + is_reg: {subset.is_reg} + class_tokens: {subset.class_tokens} + caption_extension: {subset.caption_extension} + \n""" + ), + " ", + ) + elif not is_controlnet: + info += indent( + dedent( + f"""\ + metadata_file: {subset.metadata_file} + \n""" + ), + " ", + ) + + logger.info(f'{info}') + + # print validation info + info = "" + for i, dataset in enumerate(val_datasets): + is_dreambooth = isinstance(dataset, DreamBoothDataset) + is_controlnet = isinstance(dataset, ControlNetDataset) + info += dedent( + f"""\ + [Validation Dataset {i}] + batch_size: {dataset.batch_size} + resolution: {(dataset.width, dataset.height)} + enable_bucket: {dataset.enable_bucket} + network_multiplier: {dataset.network_multiplier} + """ + ) + + if dataset.enable_bucket: + info += indent( + dedent( + f"""\ + min_bucket_reso: {dataset.min_bucket_reso} + max_bucket_reso: {dataset.max_bucket_reso} + bucket_reso_steps: {dataset.bucket_reso_steps} + bucket_no_upscale: {dataset.bucket_no_upscale} + \n""" + ), + " ", + ) + else: + info += "\n" + + for j, subset in enumerate(dataset.subsets): + info += indent( + dedent( + f"""\ + [Subset {j} of Dataset {i}] + image_dir: "{subset.image_dir}" + image_count: {subset.img_count} + num_repeats: {subset.num_repeats} + shuffle_caption: {subset.shuffle_caption} + keep_tokens: {subset.keep_tokens} + keep_tokens_separator: {subset.keep_tokens_separator} + caption_dropout_rate: {subset.caption_dropout_rate} + caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} + caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} + caption_prefix: {subset.caption_prefix} + caption_suffix: {subset.caption_suffix} + color_aug: {subset.color_aug} + flip_aug: {subset.flip_aug} + face_crop_aug_range: {subset.face_crop_aug_range} + random_crop: {subset.random_crop} + token_warmup_min: {subset.token_warmup_min}, + token_warmup_step: {subset.token_warmup_step}, + """ + ), + " ", + ) + + if is_dreambooth: + info += indent( + dedent( + f"""\ + is_reg: {subset.is_reg} + class_tokens: {subset.class_tokens} + caption_extension: {subset.caption_extension} + \n""" + ), + " ", + ) + elif not is_controlnet: + info += indent( + dedent( + f"""\ + metadata_file: {subset.metadata_file} + \n""" + ), + " ", + ) + + logger.info(f'{info}') + # make buckets first because it determines the length of dataset # and set the same seed for all datasets seed = random.randint(0, 2**31) # actual seed is seed + epoch_no for i, dataset in enumerate(datasets): - print(f"[Dataset {i}]") + logger.info(f"[Dataset {i}]") dataset.make_buckets() dataset.set_seed(seed) - + for i, dataset in enumerate(val_datasets): print(f"[Validation 
Dataset {i}]") dataset.make_buckets() @@ -562,8 +672,8 @@ def print_info(_datasets): return ( DatasetGroup(datasets), DatasetGroup(val_datasets) if val_datasets else None - ) - + ) + def generate_dreambooth_subsets_config_by_subdirs(train_data_dir: Optional[str] = None, reg_data_dir: Optional[str] = None): def extract_dreambooth_params(name: str) -> Tuple[int, str]: tokens = name.split("_") @@ -642,13 +752,17 @@ def load_user_config(file: str) -> dict: with open(file, "r") as f: config = json.load(f) except Exception: - logger.error(f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}") + logger.error( + f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}" + ) raise elif file.name.lower().endswith(".toml"): try: config = toml.load(file) except Exception: - logger.error(f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}") + logger.error( + f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}" + ) raise else: raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}") @@ -675,13 +789,13 @@ def load_user_config(file: str) -> dict: train_util.prepare_dataset_args(argparse_namespace, config_args.support_finetuning) logger.info("[argparse_namespace]") - logger.info(f'{vars(argparse_namespace)}') + logger.info(f"{vars(argparse_namespace)}") user_config = load_user_config(config_args.dataset_config) logger.info("") logger.info("[user_config]") - logger.info(f'{user_config}') + logger.info(f"{user_config}") sanitizer = ConfigSanitizer( config_args.support_dreambooth, config_args.support_finetuning, config_args.support_controlnet, config_args.support_dropout @@ -690,10 +804,10 @@ def load_user_config(file: str) -> dict: logger.info("") logger.info("[sanitized_user_config]") - logger.info(f'{sanitized_user_config}') + logger.info(f"{sanitized_user_config}") blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace) logger.info("") logger.info("[blueprint]") - logger.info(f'{blueprint}') + logger.info(f"{blueprint}") diff --git a/train_network.py b/train_network.py index db7000e82..d3e34eb7e 100644 --- a/train_network.py +++ b/train_network.py @@ -44,6 +44,7 @@ setup_logging() import logging +import itertools logger = logging.getLogger(__name__) @@ -438,6 +439,7 @@ def train(self, args): num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) + cyclic_val_dataloader = itertools.cycle(val_dataloader) # 学習ステップ数を計算する if args.max_train_epochs is not None: @@ -979,23 +981,24 @@ def remove_model(old_ckpt_name): if args.logging_dir is not None: logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) accelerator.log(logs, step=global_step) - - if global_step % 25 == 0: - if len(val_dataloader) > 0: - print("Validating バリデーション処理...") - - with torch.no_grad(): - val_dataloader_iter = iter(val_dataloader) - batch = next(val_dataloader_iter) - is_train = False - loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - - current_loss = loss.detach().item() - val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_loss) + + if args.validation_every_n_step is not None: + if global_step % (args.validation_every_n_step) == 0: + if 
len(val_dataloader) > 0: + print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + for val_step in min(len(val_dataloader), args.validation_batches): + is_train = False + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / args.validation_batches + val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_loss) if args.logging_dir is not None: avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/validation_current": current_loss} + logs = {"loss/avr_val_loss": avr_loss} accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: @@ -1005,12 +1008,24 @@ def remove_model(old_ckpt_name): logs = {"loss/epoch_average": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) - if len(val_dataloader) > 0: - if args.logging_dir is not None: - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/validation_epoch_average": avr_loss} - accelerator.log(logs, step=epoch + 1) - + if args.validation_every_n_step is None: + if len(val_dataloader) > 0: + print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + for val_step in min(len(val_dataloader), args.validation_batches): + is_train = False + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / args.validation_batches + val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/val_epoch_average": avr_loss} + accelerator.log(logs, step=epoch + 1) + accelerator.wait_for_everyone() # 指定エポックごとにモデルを保存 @@ -1162,6 +1177,18 @@ def setup_parser() -> argparse.ArgumentParser: default=0.0, help="Split for validation images out of the training dataset" ) + parser.add_argument( + "--validation_every_n_step", + type=int, + default=None, + help="Number of steps for counting validation loss. By default, validation per epoch is performed" + ) + parser.add_argument( + "--validation_batches", + type=int, + default=1, + help="Number of val steps for counting validation loss. 
By default, validation one batch is performed" + ) return parser From 923b761ce3622a3132bf0db7768e6b97df21c607 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 10 Mar 2024 20:01:40 +0800 Subject: [PATCH 011/582] Update train_network.py --- train_network.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/train_network.py b/train_network.py index d3e34eb7e..821100666 100644 --- a/train_network.py +++ b/train_network.py @@ -988,6 +988,7 @@ def remove_model(old_ckpt_name): print("Validating バリデーション処理...") total_loss = 0.0 with torch.no_grad(): + validation_steps = args.validation_batches if args.validation_batches is not None else len(val_dataloader) for val_step in min(len(val_dataloader), args.validation_batches): is_train = False batch = next(cyclic_val_dataloader) @@ -1013,6 +1014,7 @@ def remove_model(old_ckpt_name): print("Validating バリデーション処理...") total_loss = 0.0 with torch.no_grad(): + validation_steps = args.validation_batches if args.validation_batches is not None else len(val_dataloader) for val_step in min(len(val_dataloader), args.validation_batches): is_train = False batch = next(cyclic_val_dataloader) @@ -1186,8 +1188,8 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--validation_batches", type=int, - default=1, - help="Number of val steps for counting validation loss. By default, validation one batch is performed" + default=None, + help="Number of val steps for counting validation loss. By default, validation for all val_dataset is performed" ) return parser From 47359b8fac9602415f56b1f7e3f25a00255a1d78 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 10 Mar 2024 20:17:40 +0800 Subject: [PATCH 012/582] Update train_network.py --- train_network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_network.py b/train_network.py index 821100666..d549378cc 100644 --- a/train_network.py +++ b/train_network.py @@ -989,7 +989,7 @@ def remove_model(old_ckpt_name): total_loss = 0.0 with torch.no_grad(): validation_steps = args.validation_batches if args.validation_batches is not None else len(val_dataloader) - for val_step in min(len(val_dataloader), args.validation_batches): + for val_step in range(validation_steps): is_train = False batch = next(cyclic_val_dataloader) loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) @@ -1015,7 +1015,7 @@ def remove_model(old_ckpt_name): total_loss = 0.0 with torch.no_grad(): validation_steps = args.validation_batches if args.validation_batches is not None else len(val_dataloader) - for val_step in min(len(val_dataloader), args.validation_batches): + for val_step in range(validation_steps): is_train = False batch = next(cyclic_val_dataloader) loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) From a51723cc2a3dd50b45e60945f97bc5adfe753d1f Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Mon, 11 Mar 2024 09:42:58 +0800 Subject: [PATCH 013/582] fix timesteps --- train_network.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/train_network.py b/train_network.py index d549378cc..f0f27ea74 100644 --- a/train_network.py +++ b/train_network.py @@ -141,7 +141,6 @@ def process_val_batch(self, batch, is_train, 
tokenizers, text_encoders, unet, va total_loss = 0.0 timesteps_list = [10, 350, 500, 650, 990] - with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device) @@ -174,16 +173,17 @@ def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, va # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, _ = train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents - ) - for timesteps in timesteps_list: - # Predict the noise residual + + for fixed_timesteps in timesteps_list: with torch.set_grad_enabled(is_train), accelerator.autocast(): + noise = torch.randn_like(latents, device=latents.device) + b_size = latents.shape[0] + timesteps = torch.randint(fixed_timesteps, fixed_timesteps, (b_size,), device=latents.device) + timesteps = timesteps.long() + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) noise_pred = self.call_unet( args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype ) - if args.v_parameterization: # v-parameterization training target = noise_scheduler.get_velocity(latents, noise, timesteps) @@ -988,7 +988,7 @@ def remove_model(old_ckpt_name): print("Validating バリデーション処理...") total_loss = 0.0 with torch.no_grad(): - validation_steps = args.validation_batches if args.validation_batches is not None else len(val_dataloader) + validation_steps = min(args.validation_batches, len(val_dataloader)) if args.validation_batches is not None else len(val_dataloader) for val_step in range(validation_steps): is_train = False batch = next(cyclic_val_dataloader) @@ -999,7 +999,7 @@ def remove_model(old_ckpt_name): if args.logging_dir is not None: avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/avr_val_loss": avr_loss} + logs = {"loss/average_val_loss": avr_loss} accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: @@ -1014,7 +1014,7 @@ def remove_model(old_ckpt_name): print("Validating バリデーション処理...") total_loss = 0.0 with torch.no_grad(): - validation_steps = args.validation_batches if args.validation_batches is not None else len(val_dataloader) + validation_steps = min(args.validation_batches, len(val_dataloader)) if args.validation_batches is not None else len(val_dataloader) for val_step in range(validation_steps): is_train = False batch = next(cyclic_val_dataloader) From 7d84ac2177a603e9aa6834fd1c0ee19a463eb5a0 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Mon, 11 Mar 2024 14:41:51 +0800 Subject: [PATCH 014/582] only use train subset to val --- library/config_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/config_util.py b/library/config_util.py index d198cee35..1a6cef971 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -492,7 +492,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu subset_klass = FineTuningSubset dataset_klass = FineTuningDataset - subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] + subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets if subset_blueprint.params.is_reg is False] dataset = dataset_klass(subsets=subsets, is_train=False, **asdict(dataset_blueprint.params)) val_datasets.append(dataset) From 
befbec5335ed1f8018d22b65993b376571ea2989 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Mon, 11 Mar 2024 18:47:04 +0800 Subject: [PATCH 015/582] Update train_network.py --- train_network.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/train_network.py b/train_network.py index f0f27ea74..cbc107b6b 100644 --- a/train_network.py +++ b/train_network.py @@ -174,7 +174,7 @@ def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, va # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - for fixed_timesteps in timesteps_list: + for fixed_timesteps in tqdm(timesteps_list, desc='Training Progress'): with torch.set_grad_enabled(is_train), accelerator.autocast(): noise = torch.randn_like(latents, device=latents.device) b_size = latents.shape[0] @@ -184,16 +184,16 @@ def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, va noise_pred = self.call_unet( args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype ) - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - target = noise + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise - loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") - loss = loss.mean([1, 2, 3]) - loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし - total_loss += loss + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + loss = loss.mean([1, 2, 3]) + loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + total_loss += loss average_loss = total_loss / len(timesteps_list) return average_loss @@ -985,7 +985,7 @@ def remove_model(old_ckpt_name): if args.validation_every_n_step is not None: if global_step % (args.validation_every_n_step) == 0: if len(val_dataloader) > 0: - print("Validating バリデーション処理...") + print(f"\nValidating バリデーション処理...") total_loss = 0.0 with torch.no_grad(): validation_steps = min(args.validation_batches, len(val_dataloader)) if args.validation_batches is not None else len(val_dataloader) @@ -994,10 +994,12 @@ def remove_model(old_ckpt_name): batch = next(cyclic_val_dataloader) loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) total_loss += loss.detach().item() - current_loss = total_loss / args.validation_batches - val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_loss) + current_loss = total_loss / args.validation_batches + val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_loss) if args.logging_dir is not None: + logs = {"loss/current_val_loss": current_loss} + accelerator.log(logs, step=global_step) avr_loss: float = val_loss_recorder.moving_average logs = {"loss/average_val_loss": avr_loss} accelerator.log(logs, step=global_step) @@ -1011,7 +1013,7 @@ def remove_model(old_ckpt_name): if args.validation_every_n_step is None: if len(val_dataloader) > 0: - print("Validating バリデーション処理...") + print(f"\nValidating バリデーション処理...") total_loss = 0.0 with torch.no_grad(): validation_steps = min(args.validation_batches, len(val_dataloader)) if args.validation_batches is not None else len(val_dataloader) @@ -1025,7 +1027,7 @@ def 
remove_model(old_ckpt_name): if args.logging_dir is not None: avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/val_epoch_average": avr_loss} + logs = {"loss/epoch_val_average": avr_loss} accelerator.log(logs, step=epoch + 1) accelerator.wait_for_everyone() From 63e58f78e3df7608045071cdc247bb26bd19a333 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Mon, 11 Mar 2024 19:15:55 +0800 Subject: [PATCH 016/582] Update train_network.py --- train_network.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/train_network.py b/train_network.py index cbc107b6b..82d72df24 100644 --- a/train_network.py +++ b/train_network.py @@ -178,8 +178,7 @@ def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, va with torch.set_grad_enabled(is_train), accelerator.autocast(): noise = torch.randn_like(latents, device=latents.device) b_size = latents.shape[0] - timesteps = torch.randint(fixed_timesteps, fixed_timesteps, (b_size,), device=latents.device) - timesteps = timesteps.long() + timesteps = torch.full((b_size,), fixed_timesteps, dtype=torch.long, device=latents.device) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) noise_pred = self.call_unet( args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype From a6c41c6bea0465112c7bd472dff68b7e8ecea46e Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Mon, 11 Mar 2024 19:23:48 +0800 Subject: [PATCH 017/582] Update train_network.py --- train_network.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/train_network.py b/train_network.py index 82d72df24..6eefdb2be 100644 --- a/train_network.py +++ b/train_network.py @@ -174,7 +174,7 @@ def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, va # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - for fixed_timesteps in tqdm(timesteps_list, desc='Training Progress'): + for fixed_timesteps in timesteps_list: with torch.set_grad_enabled(is_train), accelerator.autocast(): noise = torch.randn_like(latents, device=latents.device) b_size = latents.shape[0] @@ -988,7 +988,7 @@ def remove_model(old_ckpt_name): total_loss = 0.0 with torch.no_grad(): validation_steps = min(args.validation_batches, len(val_dataloader)) if args.validation_batches is not None else len(val_dataloader) - for val_step in range(validation_steps): + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): is_train = False batch = next(cyclic_val_dataloader) loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) @@ -1016,7 +1016,7 @@ def remove_model(old_ckpt_name): total_loss = 0.0 with torch.no_grad(): validation_steps = min(args.validation_batches, len(val_dataloader)) if args.validation_batches is not None else len(val_dataloader) - for val_step in range(validation_steps): + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): is_train = False batch = next(cyclic_val_dataloader) loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) From bd7e2295b7c4d1444a9e844309e1685cb29c6961 Mon Sep 17 00:00:00 2001 From: gesen2egee Date: Wed, 13 Mar 2024 17:54:21 +0800 Subject: [PATCH 018/582] fix --- train_network.py | 
38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/train_network.py b/train_network.py index 6eefdb2be..128690fba 100644 --- a/train_network.py +++ b/train_network.py @@ -981,20 +981,19 @@ def remove_model(old_ckpt_name): logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) accelerator.log(logs, step=global_step) - if args.validation_every_n_step is not None: - if global_step % (args.validation_every_n_step) == 0: - if len(val_dataloader) > 0: + if len(val_dataloader) > 0: + if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or step == len(train_dataloader) - 1 or global_step >= args.max_train_steps: print(f"\nValidating バリデーション処理...") total_loss = 0.0 with torch.no_grad(): - validation_steps = min(args.validation_batches, len(val_dataloader)) if args.validation_batches is not None else len(val_dataloader) + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) for val_step in tqdm(range(validation_steps), desc='Validation Steps'): is_train = False batch = next(cyclic_val_dataloader) loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) total_loss += loss.detach().item() - current_loss = total_loss / args.validation_batches - val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_loss) + current_loss = total_loss / validation_steps + val_loss_recorder.add(epoch=epoch, step=step, loss=current_loss) if args.logging_dir is not None: logs = {"loss/current_val_loss": current_loss} @@ -1009,25 +1008,6 @@ def remove_model(old_ckpt_name): if args.logging_dir is not None: logs = {"loss/epoch_average": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) - - if args.validation_every_n_step is None: - if len(val_dataloader) > 0: - print(f"\nValidating バリデーション処理...") - total_loss = 0.0 - with torch.no_grad(): - validation_steps = min(args.validation_batches, len(val_dataloader)) if args.validation_batches is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - is_train = False - batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - total_loss += loss.detach().item() - current_loss = total_loss / args.validation_batches - val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_loss) - - if args.logging_dir is not None: - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/epoch_val_average": avr_loss} - accelerator.log(logs, step=epoch + 1) accelerator.wait_for_everyone() @@ -1184,14 +1164,14 @@ def setup_parser() -> argparse.ArgumentParser: "--validation_every_n_step", type=int, default=None, - help="Number of steps for counting validation loss. By default, validation per epoch is performed" + help="Number of train steps for counting validation loss. By default, validation per train epoch is performed" ) parser.add_argument( - "--validation_batches", + "--max_validation_steps", type=int, default=None, - help="Number of val steps for counting validation loss. By default, validation for all val_dataset is performed" - ) + help="Number of max validation steps for counting validation loss. 
By default, validation will run entire validation dataset" + ) return parser From d05965dbadf430dab6a05f171292f6d2077ec946 Mon Sep 17 00:00:00 2001 From: gesen2egee Date: Wed, 13 Mar 2024 18:33:51 +0800 Subject: [PATCH 019/582] Update train_network.py --- train_network.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/train_network.py b/train_network.py index 864bfd708..cc9fcbbed 100644 --- a/train_network.py +++ b/train_network.py @@ -987,8 +987,8 @@ def remove_model(old_ckpt_name): accelerator.log(logs, step=global_step) if len(val_dataloader) > 0: - if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or step == len(train_dataloader) - 1 or global_step >= args.max_train_steps: - print(f"\nValidating バリデーション処理...") + if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: + accelerator.print("Validating バリデーション処理...") total_loss = 0.0 with torch.no_grad(): validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) @@ -998,7 +998,7 @@ def remove_model(old_ckpt_name): loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) total_loss += loss.detach().item() current_loss = total_loss / validation_steps - val_loss_recorder.add(epoch=epoch, step=step, loss=current_loss) + val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) if args.logging_dir is not None: logs = {"loss/current_val_loss": current_loss} From b5e8045df40ed4a437492ed2b6ea6d5be7282080 Mon Sep 17 00:00:00 2001 From: gesen2egee Date: Sat, 16 Mar 2024 11:51:11 +0800 Subject: [PATCH 020/582] fix control net --- library/config_util.py | 6 ++++-- library/train_util.py | 15 ++++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index ec6ef4b2b..0da0b1437 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -491,8 +491,10 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu else: subset_klass = FineTuningSubset dataset_klass = FineTuningDataset - - subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets if subset_blueprint.params.is_reg is False] + if subset_klass == DreamBoothSubset: + subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets if subset_blueprint.params.is_reg is False] + else: + subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] dataset = dataset_klass(subsets=subsets, is_train=False, **asdict(dataset_blueprint.params)) val_datasets.append(dataset) diff --git a/library/train_util.py b/library/train_util.py index 892979628..ae7968d73 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1816,6 +1816,7 @@ class ControlNetDataset(BaseDataset): def __init__( self, subsets: Sequence[ControlNetSubset], + is_train: bool, batch_size: int, tokenizer, max_token_length, @@ -1826,6 +1827,8 @@ def __init__( max_bucket_reso: int, bucket_reso_steps: int, bucket_no_upscale: bool, + validation_split: float, + validation_seed: Optional[int], debug_dataset: float, ) -> None: super().__init__(tokenizer, max_token_length, resolution, 
network_multiplier, debug_dataset) @@ -1860,6 +1863,7 @@ def __init__( self.dreambooth_dataset_delegate = DreamBoothDataset( db_subsets, + is_train, batch_size, tokenizer, max_token_length, @@ -1871,6 +1875,8 @@ def __init__( bucket_reso_steps, bucket_no_upscale, 1.0, + validation_split, + validation_seed, debug_dataset, ) @@ -1878,7 +1884,10 @@ def __init__( self.image_data = self.dreambooth_dataset_delegate.image_data self.batch_size = batch_size self.num_train_images = self.dreambooth_dataset_delegate.num_train_images - self.num_reg_images = self.dreambooth_dataset_delegate.num_reg_images + self.num_reg_images = self.dreambooth_dataset_delegate.num_reg_images + self.is_train = is_train + self.validation_split = validation_split + self.validation_seed = validation_seed # assert all conditioning data exists missing_imgs = [] @@ -1911,8 +1920,8 @@ def __init__( [cond_img_path for cond_img_path in conditioning_img_paths if cond_img_path not in cond_imgs_with_img] ) - assert len(missing_imgs) == 0, f"missing conditioning data for {len(missing_imgs)} images: {missing_imgs}" - assert len(extra_imgs) == 0, f"extra conditioning data for {len(extra_imgs)} images: {extra_imgs}" + #assert len(missing_imgs) == 0, f"missing conditioning data for {len(missing_imgs)} images: {missing_imgs}" + #assert len(extra_imgs) == 0, f"extra conditioning data for {len(extra_imgs)} images: {extra_imgs}" self.conditioning_image_transforms = IMAGE_TRANSFORMS From 36d4023431d10718b00673d5ba34f426690c62de Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Thu, 11 Apr 2024 01:39:17 +0800 Subject: [PATCH 021/582] Update config_util.py --- library/config_util.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index a7e0024e3..c6667690e 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -498,10 +498,21 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu else: subset_klass = FineTuningSubset dataset_klass = FineTuningDataset - if subset_klass == DreamBoothSubset: - subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets if subset_blueprint.params.is_reg is False] - else: - subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] + + subsets = [] + for subset_blueprint in dataset_blueprint.subsets: + subset_blueprint.params.num_repeats = 1 + subset_blueprint.params.color_aug = False + subset_blueprint.params.flip_aug = False + subset_blueprint.params.random_crop = False + subset_blueprint.params.random_crop = None + subset_blueprint.params.caption_dropout_rate = 0.0 + subset_blueprint.params.caption_dropout_every_n_epochs = 0 + subset_blueprint.params.caption_tag_dropout_rate = 0.0 + subset_blueprint.params.token_warmup_step = 0 + if subset_klass != DreamBoothSubset or not subset_blueprint.params.is_reg: + subsets.append(subset_klass(**asdict(subset_blueprint.params))) + dataset = dataset_klass(subsets=subsets, is_train=False, **asdict(dataset_blueprint.params)) val_datasets.append(dataset) From 229c5a38ef4e93e2023d748b4fa1588d490340ad Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Thu, 11 Apr 2024 01:45:49 +0800 Subject: [PATCH 022/582] Update train_util.py --- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py 
index 832be75d5..b143e85a8 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3123,7 +3123,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: ) parser.add_argument("--seed", type=int, default=None, help="random seed for training / 学習時の乱数のseed") parser.add_argument( - "--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / grandient checkpointingを有効にする" + "--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / gradient checkpointingを有効にする" ) parser.add_argument( "--gradient_accumulation_steps", From 3b251b758dae6e4f11e0bbc7e544dc9542c836ff Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Thu, 11 Apr 2024 01:50:32 +0800 Subject: [PATCH 023/582] Update config_util.py --- library/config_util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index c6667690e..8f01e1f60 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -510,8 +510,10 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu subset_blueprint.params.caption_dropout_every_n_epochs = 0 subset_blueprint.params.caption_tag_dropout_rate = 0.0 subset_blueprint.params.token_warmup_step = 0 - if subset_klass != DreamBoothSubset or not subset_blueprint.params.is_reg: - subsets.append(subset_klass(**asdict(subset_blueprint.params))) + + if subset_klass != DreamBoothSubset or (subset_klass == DreamBoothSubset and not subset_blueprint.params.is_reg): + subset = subset_klass(**asdict(subset_blueprint.params)) + subsets.append(subset) dataset = dataset_klass(subsets=subsets, is_train=False, **asdict(dataset_blueprint.params)) val_datasets.append(dataset) From 459b12539b0ae1a92da98e38568ea0a61db1e89f Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Thu, 11 Apr 2024 01:52:14 +0800 Subject: [PATCH 024/582] Update config_util.py --- library/config_util.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 8f01e1f60..6f243aac3 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -512,8 +512,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu subset_blueprint.params.token_warmup_step = 0 if subset_klass != DreamBoothSubset or (subset_klass == DreamBoothSubset and not subset_blueprint.params.is_reg): - subset = subset_klass(**asdict(subset_blueprint.params)) - subsets.append(subset) + subsets.append(subset_klass(**asdict(subset_blueprint.params))) dataset = dataset_klass(subsets=subsets, is_train=False, **asdict(dataset_blueprint.params)) val_datasets.append(dataset) From 89ad69b6a0d35791627cb58630a711befc6bb3b5 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Thu, 11 Apr 2024 08:42:31 +0800 Subject: [PATCH 025/582] Update train_util.py --- library/train_util.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index b143e85a8..8bf6823bb 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1511,17 +1511,6 @@ def load_dreambooth_dir(subset: DreamBoothSubset): logger.warning(f"not directory: {subset.image_dir}") return [], [] - img_paths = glob_images(subset.image_dir, "*") - if self.validation_split > 0.0: - img_paths = split_train_val(img_paths, self.is_train, self.validation_split, 
self.validation_seed) - logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") - - # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う - captions = [] - missing_captions = [] - for img_path in img_paths: - cap_for_img = read_caption(img_path, subset.caption_extension) - if cap_for_img is None and subset.class_tokens is None: info_cache_file = os.path.join(subset.image_dir, self.IMAGE_INFO_CACHE_FILE) use_cached_info_for_subset = subset.cache_info if use_cached_info_for_subset: @@ -1545,6 +1534,8 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # we may need to check image size and existence of image files, but it takes time, so user should check it before training else: img_paths = glob_images(subset.image_dir, "*") + if self.validation_split > 0.0: + img_paths = split_train_val(img_paths, self.is_train, self.validation_split, self.validation_seed) sizes = [None] * len(img_paths) logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") From fde8026c2d92fe4991927eed6fa1ff373e8d38d2 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Thu, 11 Apr 2024 11:29:26 +0800 Subject: [PATCH 026/582] Update config_util.py --- library/config_util.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 6f243aac3..a1b02bd1e 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -636,19 +636,11 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu [Subset {j} of Dataset {i}] image_dir: "{subset.image_dir}" image_count: {subset.img_count} - num_repeats: {subset.num_repeats} shuffle_caption: {subset.shuffle_caption} keep_tokens: {subset.keep_tokens} keep_tokens_separator: {subset.keep_tokens_separator} - caption_dropout_rate: {subset.caption_dropout_rate} - caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} - caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} caption_prefix: {subset.caption_prefix} caption_suffix: {subset.caption_suffix} - color_aug: {subset.color_aug} - flip_aug: {subset.flip_aug} - face_crop_aug_range: {subset.face_crop_aug_range} - random_crop: {subset.random_crop} token_warmup_min: {subset.token_warmup_min}, token_warmup_step: {subset.token_warmup_step}, """ @@ -688,7 +680,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset.set_seed(seed) for i, dataset in enumerate(val_datasets): - print(f"[Validation Dataset {i}]") + logger.info(f"[Validation Dataset {i}]") dataset.make_buckets() dataset.set_seed(seed) From e5268286bf90ddcc53ad1deb31aba857cfa967d5 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 15 Jun 2024 22:20:24 +0900 Subject: [PATCH 027/582] add sd3 models and inference script --- library/sd3_models.py | 1796 ++++++++++++++++++++++++++++++++++++++ library/sd3_utils.py | 113 +++ sd3_minimal_inference.py | 347 ++++++++ 3 files changed, 2256 insertions(+) create mode 100644 library/sd3_models.py create mode 100644 library/sd3_utils.py create mode 100644 sd3_minimal_inference.py diff --git a/library/sd3_models.py b/library/sd3_models.py new file mode 100644 index 000000000..294a69b06 --- /dev/null +++ b/library/sd3_models.py @@ -0,0 +1,1796 @@ +# some modules/classes are copied and modified from https://github.com/mcmonkey4eva/sd3-ref +# the original code is licensed under the MIT License + +# and some module/classes are contributed from KohakuBlueleaf. Thanks for the contribution! 
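+# This module collects the SD3 building blocks defined below: SD3Tokenizer
+# (CLIP-L/G plus an optional T5-XXL tokenizer), 2D sin-cos positional
+# embeddings, patch embedding / un-patching layers, attention with
+# torch / xformers / math backends, and the MMDiT joint transformer blocks.
+#
+# A minimal usage sketch for the tokenizer wrappers (assumes the pretrained
+# tokenizer files can be fetched from Hugging Face):
+#   tokenizer = SD3Tokenizer(t5xxl=True)
+#   l_tokens, g_tokens, t5_tokens = tokenizer.tokenize_with_weights("a photo of a cat")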
+ +from functools import partial +import math +from typing import Dict, Optional +import einops +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.checkpoint import checkpoint +from transformers import CLIPTokenizer, T5TokenizerFast + + +memory_efficient_attention = None +try: + import xformers +except: + pass + +try: + from xformers.ops import memory_efficient_attention +except: + memory_efficient_attention = None + + +# region tokenizer +class SDTokenizer: + def __init__( + self, max_length=77, pad_with_end=True, tokenizer=None, has_start_token=True, pad_to_max_length=True, min_length=None + ): + """ + サブクラスで各種の設定を行ってる。このクラスはその設定に基づき重み付きのトークン化を行うようだ。 + Some settings are done in subclasses. This class seems to perform tokenization with weights based on those settings. + """ + self.tokenizer = tokenizer + self.max_length = max_length + self.min_length = min_length + empty = self.tokenizer("")["input_ids"] + if has_start_token: + self.tokens_start = 1 + self.start_token = empty[0] + self.end_token = empty[1] + else: + self.tokens_start = 0 + self.start_token = None + self.end_token = empty[0] + self.pad_with_end = pad_with_end + self.pad_to_max_length = pad_to_max_length + vocab = self.tokenizer.get_vocab() + self.inv_vocab = {v: k for k, v in vocab.items()} + self.max_word_length = 8 + + def tokenize_with_weights(self, text: str): + """Tokenize the text, with weight values - presume 1.0 for all and ignore other features here. + The details aren't relevant for a reference impl, and weights themselves has weak effect on SD3.""" + """ + ja: テキストをトークン化し、重み値を持ちます - すべての値に1.0を仮定し、他の機能を無視します。 + 詳細は参考実装には関係なく、重み自体はSD3に対して弱い影響しかありません。へぇ~ + """ + if self.pad_with_end: + pad_token = self.end_token + else: + pad_token = 0 + batch = [] + if self.start_token is not None: + batch.append((self.start_token, 1.0)) + to_tokenize = text.replace("\n", " ").split(" ") + to_tokenize = [x for x in to_tokenize if x != ""] + for word in to_tokenize: + batch.extend([(t, 1) for t in self.tokenizer(word)["input_ids"][self.tokens_start : -1]]) + batch.append((self.end_token, 1.0)) + if self.pad_to_max_length: + batch.extend([(pad_token, 1.0)] * (self.max_length - len(batch))) + if self.min_length is not None and len(batch) < self.min_length: + batch.extend([(pad_token, 1.0)] * (self.min_length - len(batch))) + return [batch] + + +class T5XXLTokenizer(SDTokenizer): + """Wraps the T5 Tokenizer from HF into the SDTokenizer interface""" + + def __init__(self): + super().__init__( + pad_with_end=False, + tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"), + has_start_token=False, + pad_to_max_length=False, + max_length=99999999, + min_length=77, + ) + + +class SDXLClipGTokenizer(SDTokenizer): + def __init__(self, tokenizer): + super().__init__(pad_with_end=False, tokenizer=tokenizer) + + +class SD3Tokenizer: + def __init__(self, t5xxl=True): + # TODO cache tokenizer settings locally or hold them in the repo like ComfyUI + clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + self.clip_l = SDTokenizer(tokenizer=clip_tokenizer) + self.clip_g = SDXLClipGTokenizer(clip_tokenizer) + self.t5xxl = T5XXLTokenizer() if t5xxl else None + + def tokenize_with_weights(self, text: str): + return ( + self.clip_l.tokenize_with_weights(text), + self.clip_g.tokenize_with_weights(text), + self.t5xxl.tokenize_with_weights(text) if self.t5xxl is not None else None, + ) + + +# endregion + +# region mmdit + + +def get_2d_sincos_pos_embed( + 
embed_dim, + grid_size, + scaling_factor=None, + offset=None, +): + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + if scaling_factor is not None: + grid = grid / scaling_factor + if offset is not None: + grid = grid - offset + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid_torch( + embed_dim, + pos, + device=None, + dtype=torch.float32, +): + omega = torch.arange(embed_dim // 2, device=device, dtype=dtype) + omega *= 2.0 / embed_dim + omega = 1.0 / 10000**omega + out = torch.outer(pos.reshape(-1), omega) + emb = torch.cat([out.sin(), out.cos()], dim=1) + return emb + + +def get_2d_sincos_pos_embed_torch( + embed_dim, + w, + h, + val_center=7.5, + val_magnitude=7.5, + device=None, + dtype=torch.float32, +): + small = min(h, w) + val_h = (h / small) * val_magnitude + val_w = (w / small) * val_magnitude + grid_h, grid_w = torch.meshgrid( + torch.linspace(-val_h + val_center, val_h + val_center, h, device=device, dtype=dtype), + torch.linspace(-val_w + val_center, val_w + val_center, w, device=device, dtype=dtype), + indexing="ij", + ) + emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_h, device=device, dtype=dtype) + emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_w, device=device, dtype=dtype) + emb = torch.cat([emb_w, emb_h], dim=1) # (H*W, D) + return emb + + +def modulate(x, shift, scale): + if shift is None: + shift = torch.zeros_like(scale) + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + + +def default(x, default_value): + if x is None: + return default_value + return x + + +def timestep_embedding(t, dim, max_period=10000): + half = dim // 2 + # freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to( + # device=t.device, dtype=t.dtype + # ) + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(device=t.device) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + if torch.is_floating_point(t): + embedding = embedding.to(dtype=t.dtype) + return embedding + + +def rmsnorm(x, eps=1e-6): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) + + +class 
PatchEmbed(nn.Module): + def __init__( + self, + img_size=256, + patch_size=4, + in_channels=3, + embed_dim=512, + norm_layer=None, + flatten=True, + bias=True, + strict_img_size=True, + dynamic_img_pad=True, + ): + super().__init__() + self.patch_size = patch_size + self.flatten = flatten + self.strict_img_size = strict_img_size + self.dynamic_img_pad = dynamic_img_pad + if img_size is not None: + self.img_size = img_size + self.grid_size = img_size // patch_size + self.num_patches = self.grid_size**2 + else: + self.img_size = None + self.grid_size = None + self.num_patches = None + + self.proj = nn.Conv2d(in_channels, embed_dim, patch_size, patch_size, bias=bias) + self.norm = nn.Identity() if norm_layer is None else norm_layer(embed_dim) + + def forward(self, x): + B, C, H, W = x.shape + + if self.dynamic_img_pad: + # Pad input so we won't have partial patch + pad_h = (self.patch_size - H % self.patch_size) % self.patch_size + pad_w = (self.patch_size - W % self.patch_size) % self.patch_size + x = nn.functional.pad(x, (0, pad_w, 0, pad_h), mode="reflect") + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + return x + + +# FinalLayer in mmdit.py +class UnPatch(nn.Module): + def __init__(self, hidden_size=512, patch_size=4, out_channels=3): + super().__init__() + self.patch_size = patch_size + self.c = out_channels + + # eps is default in mmdit.py + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size**2 * out_channels) + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + nn.Linear(hidden_size, 2 * hidden_size), + ) + + def forward(self, x: torch.Tensor, cmod, H=None, W=None): + b, n, _ = x.shape + p = self.patch_size + c = self.c + if H is None and W is None: + w = h = int(n**0.5) + assert h * w == n + else: + h = H // p if H else n // (W // p) + w = W // p if W else n // h + assert h * w == n + + shift, scale = self.adaLN_modulation(cmod).chunk(2, dim=-1) + x = modulate(self.norm_final(x), shift, scale) + x = self.linear(x) + + x = x.view(b, h, w, p, p, c) + x = x.permute(0, 5, 1, 3, 2, 4).contiguous() + x = x.view(b, c, h * p, w * p) + return x + + +class MLP(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=lambda: nn.GELU(), + norm_layer=None, + bias=True, + use_conv=False, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.use_conv = use_conv + + layer = partial(nn.Conv1d, kernel_size=1) if use_conv else nn.Linear + + self.fc1 = layer(in_features, hidden_features, bias=bias) + self.fc2 = layer(hidden_features, out_features, bias=bias) + self.act = act_layer() + self.norm = norm_layer(hidden_features) if norm_layer else nn.Identity() + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.norm(x) + x = self.fc2(x) + return x + + +class TimestepEmbedding(nn.Module): + def __init__(self, hidden_size, freq_embed_size=256): + super().__init__() + self.mlp = nn.Sequential( + nn.Linear(freq_embed_size, hidden_size), + nn.SiLU(), + nn.Linear(hidden_size, hidden_size), + ) + self.freq_embed_size = freq_embed_size + + def forward(self, t, dtype=None, **kwargs): + t_freq = timestep_embedding(t, self.freq_embed_size).to(dtype) + t_emb = self.mlp(t_freq) + return t_emb + + +class Embedder(nn.Module): + def __init__(self, input_dim, hidden_size): + super().__init__() + self.mlp = nn.Sequential( + nn.Linear(input_dim, 
hidden_size), + nn.SiLU(), + nn.Linear(hidden_size, hidden_size), + ) + + def forward(self, x): + return self.mlp(x) + + +class RMSNorm(torch.nn.Module): + def __init__( + self, + dim: int, + elementwise_affine: bool = False, + eps: float = 1e-6, + device=None, + dtype=None, + ): + """ + Initialize the RMSNorm normalization layer. + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + """ + super().__init__() + self.eps = eps + self.learnable_scale = elementwise_affine + if self.learnable_scale: + self.weight = nn.Parameter(torch.empty(dim, device=device, dtype=dtype)) + else: + self.register_parameter("weight", None) + + def forward(self, x): + """ + Forward pass through the RMSNorm layer. + Args: + x (torch.Tensor): The input tensor. + Returns: + torch.Tensor: The output tensor after applying RMSNorm. + """ + x = rmsnorm(x, eps=self.eps) + if self.learnable_scale: + return x * self.weight.to(device=x.device, dtype=x.dtype) + else: + return x + + +class SwiGLUFeedForward(nn.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ffn_dim_multiplier: float = None, + ): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(nn.functional.silu(self.w1(x)) * self.w3(x)) + + +# Linears for SelfAttention in mmdit.py +class AttentionLinears(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + pre_only: bool = False, + qk_norm: str = None, + ): + super().__init__() + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + if not pre_only: + self.proj = nn.Linear(dim, dim) + self.pre_only = pre_only + + if qk_norm == "rms": + self.ln_q = RMSNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6) + self.ln_k = RMSNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6) + elif qk_norm == "ln": + self.ln_q = nn.LayerNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6) + self.ln_k = nn.LayerNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6) + elif qk_norm is None: + self.ln_q = nn.Identity() + self.ln_k = nn.Identity() + else: + raise ValueError(qk_norm) + + def pre_attention(self, x: torch.Tensor) -> torch.Tensor: + """ + output: + q, k, v: [B, L, D] + """ + B, L, C = x.shape + qkv: torch.Tensor = self.qkv(x) + q, k, v = qkv.reshape(B, L, -1, self.head_dim).chunk(3, dim=2) + q = self.ln_q(q).reshape(q.shape[0], q.shape[1], -1) + k = self.ln_k(k).reshape(q.shape[0], q.shape[1], -1) + return (q, k, v) + + def post_attention(self, x: torch.Tensor) -> torch.Tensor: + assert not self.pre_only + x = self.proj(x) + return x + + +MEMORY_LAYOUTS = { + "torch": ( + lambda x, head_dim: x.reshape(x.shape[0], x.shape[1], -1, head_dim).transpose(1, 2), + lambda x: x.transpose(1, 2).reshape(x.shape[0], x.shape[2], -1), + lambda x: (1, x, 1, 1), + ), + "xformers": ( + lambda x, head_dim: x.reshape(x.shape[0], x.shape[1], -1, 
head_dim), + lambda x: x.reshape(x.shape[0], x.shape[1], -1), + lambda x: (1, 1, x, 1), + ), + "math": ( + lambda x, head_dim: x.reshape(x.shape[0], x.shape[1], -1, head_dim).transpose(1, 2), + lambda x: x.transpose(1, 2).reshape(x.shape[0], x.shape[2], -1), + lambda x: (1, x, 1, 1), + ), +} +# ATTN_FUNCTION = { +# "torch": F.scaled_dot_product_attention, +# "xformers": memory_efficient_attention, +# } + + +def vanilla_attention(q, k, v, mask, scale=None): + if scale is None: + scale = math.sqrt(q.size(-1)) + scores = torch.bmm(q, k.transpose(-1, -2)) / scale + if mask is not None: + mask = einops.rearrange(mask, "b ... -> b (...)") + max_neg_value = -torch.finfo(scores.dtype).max + mask = einops.repeat(mask, "b j -> (b h) j", h=q.size(-3)) + scores = scores.masked_fill(~mask, max_neg_value) + p_attn = F.softmax(scores, dim=-1) + return torch.bmm(p_attn, v) + + +def attention(q, k, v, head_dim, mask=None, scale=None, mode="xformers"): + """ + q, k, v: [B, L, D] + """ + pre_attn_layout = MEMORY_LAYOUTS[mode][0] + post_attn_layout = MEMORY_LAYOUTS[mode][1] + q = pre_attn_layout(q, head_dim) + k = pre_attn_layout(k, head_dim) + v = pre_attn_layout(v, head_dim) + + # scores = ATTN_FUNCTION[mode](q, k.to(q), v.to(q), mask, scale=scale) + if mode == "torch": + assert scale is None + scores = F.scaled_dot_product_attention(q, k.to(q), v.to(q), mask) # , scale=scale) + elif mode == "xformers": + scores = memory_efficient_attention(q, k.to(q), v.to(q), mask, scale=scale) + else: + scores = vanilla_attention(q, k.to(q), v.to(q), mask, scale=scale) + + scores = post_attn_layout(scores) + return scores + + +class SelfAttention(AttentionLinears): + def __init__(self, dim, num_heads=8, mode="xformers"): + super().__init__(dim, num_heads, qkv_bias=True, pre_only=False) + assert mode in MEMORY_LAYOUTS + self.head_dim = dim // num_heads + self.attn_mode = mode + + def set_attn_mode(self, mode): + self.attn_mode = mode + + def forward(self, x): + q, k, v = self.pre_attention(x) + attn_score = attention(q, k, v, self.head_dim, mode=self.attn_mode) + return self.post_attention(attn_score) + + +class TransformerBlock(nn.Module): + def __init__(self, context_size, mode="xformers"): + super().__init__() + self.context_size = context_size + self.norm1 = nn.LayerNorm(context_size, elementwise_affine=False, eps=1e-6) + self.attn = SelfAttention(context_size, mode=mode) + self.norm2 = nn.LayerNorm(context_size, elementwise_affine=False, eps=1e-6) + self.mlp = MLP( + in_features=context_size, + hidden_features=context_size * 4, + act_layer=lambda: nn.GELU(approximate="tanh"), + ) + + def forward(self, x): + x = x + self.attn(self.norm1(x)) + x = x + self.mlp(self.norm2(x)) + return x + + +class Transformer(nn.Module): + def __init__(self, context_size, num_layers, mode="xformers"): + super().__init__() + self.layers = nn.ModuleList([TransformerBlock(context_size, mode) for _ in range(num_layers)]) + self.norm = nn.LayerNorm(context_size, elementwise_affine=False, eps=1e-6) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return self.norm(x) + + +# DismantledBlock in mmdit.py +class SingleDiTBlock(nn.Module): + """ + A DiT block with gated adaptive layer norm (adaLN) conditioning. 
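+    The conditioning vector c is projected (SiLU + Linear) to per-sample
+    shift / scale / gate values; each sub-layer is then applied as
+    x = x + gate * sublayer(modulate(norm(x), shift, scale)),
+    where modulate(x, shift, scale) = x * (1 + scale) + shift.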
+ """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + attn_mode: str = "xformers", + qkv_bias: bool = False, + pre_only: bool = False, + rmsnorm: bool = False, + scale_mod_only: bool = False, + swiglu: bool = False, + qk_norm: Optional[str] = None, + **block_kwargs, + ): + super().__init__() + assert attn_mode in MEMORY_LAYOUTS + self.attn_mode = attn_mode + if not rmsnorm: + self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + else: + self.norm1 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.attn = AttentionLinears( + dim=hidden_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + pre_only=pre_only, + qk_norm=qk_norm, + ) + if not pre_only: + if not rmsnorm: + self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + else: + self.norm2 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6) + mlp_hidden_dim = int(hidden_size * mlp_ratio) + if not pre_only: + if not swiglu: + self.mlp = MLP( + in_features=hidden_size, + hidden_features=mlp_hidden_dim, + act_layer=lambda: nn.GELU(approximate="tanh"), + ) + else: + self.mlp = SwiGLUFeedForward( + dim=hidden_size, + hidden_dim=mlp_hidden_dim, + multiple_of=256, + ) + self.scale_mod_only = scale_mod_only + if not scale_mod_only: + n_mods = 6 if not pre_only else 2 + else: + n_mods = 4 if not pre_only else 1 + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, n_mods * hidden_size)) + self.pre_only = pre_only + + def pre_attention(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor: + if not self.pre_only: + if not self.scale_mod_only: + ( + shift_msa, + scale_msa, + gate_msa, + shift_mlp, + scale_mlp, + gate_mlp, + ) = self.adaLN_modulation( + c + ).chunk(6, dim=-1) + else: + shift_msa = None + shift_mlp = None + ( + scale_msa, + gate_msa, + scale_mlp, + gate_mlp, + ) = self.adaLN_modulation( + c + ).chunk(4, dim=-1) + qkv = self.attn.pre_attention(modulate(self.norm1(x), shift_msa, scale_msa)) + return qkv, ( + x, + gate_msa, + shift_mlp, + scale_mlp, + gate_mlp, + ) + else: + if not self.scale_mod_only: + ( + shift_msa, + scale_msa, + ) = self.adaLN_modulation( + c + ).chunk(2, dim=-1) + else: + shift_msa = None + scale_msa = self.adaLN_modulation(c) + qkv = self.attn.pre_attention(modulate(self.norm1(x), shift_msa, scale_msa)) + return qkv, None + + def post_attention(self, attn, x, gate_msa, shift_mlp, scale_mlp, gate_mlp): + assert not self.pre_only + x = x + gate_msa.unsqueeze(1) * self.attn.post_attention(attn) + x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp)) + return x + + +# JointBlock + block_mixing in mmdit.py +class MMDiTBlock(nn.Module): + def __init__(self, *args, **kwargs): + super().__init__() + pre_only = kwargs.pop("pre_only") + self.context_block = SingleDiTBlock(*args, pre_only=pre_only, **kwargs) + self.x_block = SingleDiTBlock(*args, pre_only=False, **kwargs) + self.head_dim = self.x_block.attn.head_dim + self.mode = self.x_block.attn_mode + self.gradient_checkpointing = False + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + + def _forward(self, context, x, c): + ctx_qkv, ctx_intermediate = self.context_block.pre_attention(context, c) + x_qkv, x_intermediate = self.x_block.pre_attention(x, c) + + ctx_len = ctx_qkv[0].size(1) + + q = torch.concat((ctx_qkv[0], x_qkv[0]), dim=1) + k = torch.concat((ctx_qkv[1], x_qkv[1]), dim=1) + v = torch.concat((ctx_qkv[2], x_qkv[2]), dim=1) + + attn = attention(q, k, v, 
head_dim=self.head_dim, mode=self.mode) + ctx_attn_out = attn[:, :ctx_len] + x_attn_out = attn[:, ctx_len:] + + x = self.x_block.post_attention(x_attn_out, *x_intermediate) + if not self.context_block.pre_only: + context = self.context_block.post_attention(ctx_attn_out, *ctx_intermediate) + else: + context = None + return context, x + + def forward(self, *args, **kwargs): + if self.training and self.gradient_checkpointing: + return checkpoint(self._forward, *args, use_reentrant=False, **kwargs) + else: + return self._forward(*args, **kwargs) + + +class MMDiT(nn.Module): + """ + Diffusion model with a Transformer backbone. + """ + + def __init__( + self, + input_size: int = 32, + patch_size: int = 2, + in_channels: int = 4, + depth: int = 28, + # hidden_size: Optional[int] = None, + # num_heads: Optional[int] = None, + mlp_ratio: float = 4.0, + learn_sigma: bool = False, + adm_in_channels: Optional[int] = None, + context_embedder_config: Optional[Dict] = None, + use_checkpoint: bool = False, + register_length: int = 0, + attn_mode: str = "torch", + rmsnorm: bool = False, + scale_mod_only: bool = False, + swiglu: bool = False, + out_channels: Optional[int] = None, + pos_embed_scaling_factor: Optional[float] = None, + pos_embed_offset: Optional[float] = None, + pos_embed_max_size: Optional[int] = None, + num_patches=None, + qk_norm: Optional[str] = None, + qkv_bias: bool = True, + context_processor_layers=None, + context_size=4096, + ): + super().__init__() + self.learn_sigma = learn_sigma + self.in_channels = in_channels + default_out_channels = in_channels * 2 if learn_sigma else in_channels + self.out_channels = default(out_channels, default_out_channels) + self.patch_size = patch_size + self.pos_embed_scaling_factor = pos_embed_scaling_factor + self.pos_embed_offset = pos_embed_offset + self.pos_embed_max_size = pos_embed_max_size + self.gradient_checkpointing = use_checkpoint + + # hidden_size = default(hidden_size, 64 * depth) + # num_heads = default(num_heads, hidden_size // 64) + + # apply magic --> this defines a head_size of 64 + self.hidden_size = 64 * depth + num_heads = depth + + self.num_heads = num_heads + + self.x_embedder = PatchEmbed( + input_size, + patch_size, + in_channels, + self.hidden_size, + bias=True, + strict_img_size=self.pos_embed_max_size is None, + ) + self.t_embedder = TimestepEmbedding(self.hidden_size) + + self.y_embedder = None + if adm_in_channels is not None: + assert isinstance(adm_in_channels, int) + self.y_embedder = Embedder(adm_in_channels, self.hidden_size) + + if context_processor_layers is not None: + self.context_processor = Transformer(context_size, context_processor_layers, attn_mode) + else: + self.context_processor = None + + self.context_embedder = nn.Linear(context_size, self.hidden_size) + self.register_length = register_length + if self.register_length > 0: + self.register = nn.Parameter(torch.randn(1, register_length, self.hidden_size)) + + # num_patches = self.x_embedder.num_patches + # Will use fixed sin-cos embedding: + # just use a buffer already + if num_patches is not None: + self.register_buffer( + "pos_embed", + torch.empty(1, num_patches, self.hidden_size), + ) + else: + self.pos_embed = None + + self.use_checkpoint = use_checkpoint + self.joint_blocks = nn.ModuleList( + [ + MMDiTBlock( + self.hidden_size, + num_heads, + mlp_ratio=mlp_ratio, + attn_mode=attn_mode, + qkv_bias=qkv_bias, + pre_only=i == depth - 1, + rmsnorm=rmsnorm, + scale_mod_only=scale_mod_only, + swiglu=swiglu, + qk_norm=qk_norm, + ) + for i in range(depth) + ] 
+ ) + for block in self.joint_blocks: + block.gradient_checkpointing = use_checkpoint + + self.final_layer = UnPatch(self.hidden_size, patch_size, self.out_channels) + # self.initialize_weights() + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + for block in self.joint_blocks: + block.enable_gradient_checkpointing() + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + for block in self.joint_blocks: + block.disable_gradient_checkpointing() + + def initialize_weights(self): + # TODO: Init context_embedder? + # Initialize transformer layers: + def _basic_init(module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + self.apply(_basic_init) + + # Initialize (and freeze) pos_embed by sin-cos embedding + if self.pos_embed is not None: + pos_embed = get_2d_sincos_pos_embed( + self.pos_embed.shape[-1], + int(self.pos_embed.shape[-2] ** 0.5), + scaling_factor=self.pos_embed_scaling_factor, + ) + self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) + + # Initialize patch_embed like nn.Linear (instead of nn.Conv2d) + w = self.x_embedder.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + nn.init.constant_(self.x_embedder.proj.bias, 0) + + if getattr(self, "y_embedder", None) is not None: + nn.init.normal_(self.y_embedder.mlp[0].weight, std=0.02) + nn.init.normal_(self.y_embedder.mlp[2].weight, std=0.02) + + # Initialize timestep embedding MLP: + nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) + nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) + + # Zero-out adaLN modulation layers in DiT blocks: + for block in self.joint_blocks: + nn.init.constant_(block.x_block.adaLN_modulation[-1].weight, 0) + nn.init.constant_(block.x_block.adaLN_modulation[-1].bias, 0) + nn.init.constant_(block.context_block.adaLN_modulation[-1].weight, 0) + nn.init.constant_(block.context_block.adaLN_modulation[-1].bias, 0) + + # Zero-out output layers: + nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0) + nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0) + nn.init.constant_(self.final_layer.linear.weight, 0) + nn.init.constant_(self.final_layer.linear.bias, 0) + + def cropped_pos_embed(self, h, w, device=None): + p = self.x_embedder.patch_size + # patched size + h = (h + 1) // p + w = (w + 1) // p + if self.pos_embed is None: + return get_2d_sincos_pos_embed_torch(self.hidden_size, w, h, device=device) + assert self.pos_embed_max_size is not None + assert h <= self.pos_embed_max_size, (h, self.pos_embed_max_size) + assert w <= self.pos_embed_max_size, (w, self.pos_embed_max_size) + top = (self.pos_embed_max_size - h) // 2 + left = (self.pos_embed_max_size - w) // 2 + spatial_pos_embed = self.pos_embed.reshape( + 1, + self.pos_embed_max_size, + self.pos_embed_max_size, + self.pos_embed.shape[-1], + ) + spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :] + spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1]) + return spatial_pos_embed + + def forward( + self, + x: torch.Tensor, + t: torch.Tensor, + y: Optional[torch.Tensor] = None, + context: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Forward pass of DiT. 
+ x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) + t: (N,) tensor of diffusion timesteps + y: (N, D) tensor of class labels + """ + + if self.context_processor is not None: + context = self.context_processor(context) + + B, C, H, W = x.shape + x = self.x_embedder(x) + self.cropped_pos_embed(H, W, device=x.device).to(dtype=x.dtype) + c = self.t_embedder(t, dtype=x.dtype) # (N, D) + if y is not None and self.y_embedder is not None: + y = self.y_embedder(y) # (N, D) + c = c + y # (N, D) + + if context is not None: + context = self.context_embedder(context) + + if self.register_length > 0: + context = torch.cat( + ( + einops.repeat(self.register, "1 ... -> b ...", b=x.shape[0]), + default(context, torch.Tensor([]).type_as(x)), + ), + 1, + ) + + for block in self.joint_blocks: + context, x = block(context, x, c) + x = self.final_layer(x, c, H, W) # Our final layer combined UnPatchify + return x[:, :, :H, :W] + + +def create_mmdit_sd3_medium_configs(attn_mode: str): + # {'patch_size': 2, 'depth': 24, 'num_patches': 36864, + # 'pos_embed_max_size': 192, 'adm_in_channels': 2048, 'context_embedder': + # {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}} + mmdit = MMDiT( + input_size=None, + pos_embed_max_size=192, + patch_size=2, + in_channels=16, + adm_in_channels=2048, + depth=24, + mlp_ratio=4, + qk_norm=None, + num_patches=36864, + context_size=4096, + attn_mode=attn_mode, + ) + return mmdit + + +# endregion + +# region VAE + + +def Normalize(in_channels, num_groups=32, dtype=torch.float32, device=None): + return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device) + + +class ResnetBlock(torch.nn.Module): + def __init__(self, *, in_channels, out_channels=None, dtype=torch.float32, device=None): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + + self.norm1 = Normalize(in_channels, dtype=dtype, device=device) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device) + self.norm2 = Normalize(out_channels, dtype=dtype, device=device) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device) + if self.in_channels != self.out_channels: + self.nin_shortcut = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device + ) + else: + self.nin_shortcut = None + self.swish = torch.nn.SiLU(inplace=True) + + def forward(self, x): + hidden = x + hidden = self.norm1(hidden) + hidden = self.swish(hidden) + hidden = self.conv1(hidden) + hidden = self.norm2(hidden) + hidden = self.swish(hidden) + hidden = self.conv2(hidden) + if self.in_channels != self.out_channels: + x = self.nin_shortcut(x) + return x + hidden + + +class AttnBlock(torch.nn.Module): + def __init__(self, in_channels, dtype=torch.float32, device=None): + super().__init__() + self.norm = Normalize(in_channels, dtype=dtype, device=device) + self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device) + self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device) + self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device) + self.proj_out = 
torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device) + + def forward(self, x): + hidden = self.norm(x) + q = self.q(hidden) + k = self.k(hidden) + v = self.v(hidden) + b, c, h, w = q.shape + q, k, v = map(lambda x: einops.rearrange(x, "b c h w -> b 1 (h w) c").contiguous(), (q, k, v)) + hidden = torch.nn.functional.scaled_dot_product_attention(q, k, v) # scale is dim ** -0.5 per default + hidden = einops.rearrange(hidden, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + hidden = self.proj_out(hidden) + return x + hidden + + +class Downsample(torch.nn.Module): + def __init__(self, in_channels, dtype=torch.float32, device=None): + super().__init__() + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0, dtype=dtype, device=device) + + def forward(self, x): + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + return x + + +class Upsample(torch.nn.Module): + def __init__(self, in_channels, dtype=torch.float32, device=None): + super().__init__() + self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device) + + def forward(self, x): + x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + x = self.conv(x) + return x + + +class VAEEncoder(torch.nn.Module): + def __init__( + self, ch=128, ch_mult=(1, 2, 4, 4), num_res_blocks=2, in_channels=3, z_channels=16, dtype=torch.float32, device=None + ): + super().__init__() + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + # downsampling + self.conv_in = torch.nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device) + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = torch.nn.ModuleList() + for i_level in range(self.num_resolutions): + block = torch.nn.ModuleList() + attn = torch.nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(num_res_blocks): + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dtype=dtype, device=device)) + block_in = block_out + down = torch.nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, dtype=dtype, device=device) + self.down.append(down) + # middle + self.mid = torch.nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device) + self.mid.attn_1 = AttnBlock(block_in, dtype=dtype, device=device) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device) + # end + self.norm_out = Normalize(block_in, dtype=dtype, device=device) + self.conv_out = torch.nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device) + self.swish = torch.nn.SiLU(inplace=True) + + def forward(self, x): + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1]) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + # middle + h = hs[-1] + h = self.mid.block_1(h) + h = self.mid.attn_1(h) + h = self.mid.block_2(h) + # end + h = self.norm_out(h) + h = self.swish(h) + h = self.conv_out(h) + return h + + +class VAEDecoder(torch.nn.Module): + def __init__( + self, + ch=128, + 
out_ch=3, + ch_mult=(1, 2, 4, 4), + num_res_blocks=2, + resolution=256, + z_channels=16, + dtype=torch.float32, + device=None, + ): + super().__init__() + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + # z to block_in + self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device) + # middle + self.mid = torch.nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device) + self.mid.attn_1 = AttnBlock(block_in, dtype=dtype, device=device) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device) + # upsampling + self.up = torch.nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = torch.nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dtype=dtype, device=device)) + block_in = block_out + up = torch.nn.Module() + up.block = block + if i_level != 0: + up.upsample = Upsample(block_in, dtype=dtype, device=device) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + # end + self.norm_out = Normalize(block_in, dtype=dtype, device=device) + self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device) + self.swish = torch.nn.SiLU(inplace=True) + + def forward(self, z): + # z to block_in + hidden = self.conv_in(z) + # middle + hidden = self.mid.block_1(hidden) + hidden = self.mid.attn_1(hidden) + hidden = self.mid.block_2(hidden) + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + hidden = self.up[i_level].block[i_block](hidden) + if i_level != 0: + hidden = self.up[i_level].upsample(hidden) + # end + hidden = self.norm_out(hidden) + hidden = self.swish(hidden) + hidden = self.conv_out(hidden) + return hidden + + +class SDVAE(torch.nn.Module): + def __init__(self, dtype=torch.float32, device=None): + super().__init__() + self.encoder = VAEEncoder(dtype=dtype, device=device) + self.decoder = VAEDecoder(dtype=dtype, device=device) + + @torch.autocast("cuda", dtype=torch.float16) + def decode(self, latent): + return self.decoder(latent) + + @torch.autocast("cuda", dtype=torch.float16) + def encode(self, image): + hidden = self.encoder(image) + mean, logvar = torch.chunk(hidden, 2, dim=1) + logvar = torch.clamp(logvar, -30.0, 20.0) + std = torch.exp(0.5 * logvar) + return mean + std * torch.randn_like(mean) + + +# endregion + + +# region Text Encoder +class CLIPAttention(torch.nn.Module): + def __init__(self, embed_dim, heads, dtype, device, mode="xformers"): + super().__init__() + self.heads = heads + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) + self.attn_mode = mode + + def set_attn_mode(self, mode): + self.attn_mode = mode + + def forward(self, x, mask=None): + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + out = attention(q, k, v, self.heads, mask, mode=self.attn_mode) + return 
self.out_proj(out) + + +ACTIVATIONS = { + "quick_gelu": lambda: (lambda a: a * torch.sigmoid(1.702 * a)), + # "gelu": torch.nn.functional.gelu, + "gelu": lambda: nn.GELU(), +} + + +class CLIPLayer(torch.nn.Module): + def __init__(self, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device): + super().__init__() + self.layer_norm1 = nn.LayerNorm(embed_dim, dtype=dtype, device=device) + self.self_attn = CLIPAttention(embed_dim, heads, dtype, device) + self.layer_norm2 = nn.LayerNorm(embed_dim, dtype=dtype, device=device) + # # self.mlp = CLIPMLP(embed_dim, intermediate_size, intermediate_activation, dtype, device) + # self.mlp = Mlp( + # embed_dim, intermediate_size, embed_dim, act_layer=ACTIVATIONS[intermediate_activation], dtype=dtype, device=device + # ) + self.mlp = MLP(embed_dim, intermediate_size, embed_dim, act_layer=ACTIVATIONS[intermediate_activation]) + self.mlp.to(device=device, dtype=dtype) + + def forward(self, x, mask=None): + x += self.self_attn(self.layer_norm1(x), mask) + x += self.mlp(self.layer_norm2(x)) + return x + + +class CLIPEncoder(torch.nn.Module): + def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device): + super().__init__() + self.layers = torch.nn.ModuleList( + [CLIPLayer(embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) for i in range(num_layers)] + ) + + def forward(self, x, mask=None, intermediate_output=None): + if intermediate_output is not None: + if intermediate_output < 0: + intermediate_output = len(self.layers) + intermediate_output + intermediate = None + for i, l in enumerate(self.layers): + x = l(x, mask) + if i == intermediate_output: + intermediate = x.clone() + return x, intermediate + + +class CLIPEmbeddings(torch.nn.Module): + def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None): + super().__init__() + self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device) + self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device) + + def forward(self, input_tokens): + return self.token_embedding(input_tokens) + self.position_embedding.weight + + +class CLIPTextModel_(torch.nn.Module): + def __init__(self, config_dict, dtype, device): + num_layers = config_dict["num_hidden_layers"] + embed_dim = config_dict["hidden_size"] + heads = config_dict["num_attention_heads"] + intermediate_size = config_dict["intermediate_size"] + intermediate_activation = config_dict["hidden_act"] + super().__init__() + self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device) + self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) + self.final_layer_norm = nn.LayerNorm(embed_dim, dtype=dtype, device=device) + + def forward(self, input_tokens, intermediate_output=None, final_layer_norm_intermediate=True): + x = self.embeddings(input_tokens) + + if x.dtype == torch.bfloat16: + causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=torch.float32, device=x.device).fill_(float("-inf")).triu_(1) + causal_mask = causal_mask.to(dtype=x.dtype) + else: + causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1) + + x, i = self.encoder(x, mask=causal_mask, intermediate_output=intermediate_output) + x = self.final_layer_norm(x) + if i is not None and final_layer_norm_intermediate: + i = self.final_layer_norm(i) + pooled_output = x[ + 
torch.arange(x.shape[0], device=x.device), + input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1), + ] + return x, i, pooled_output + + +class CLIPTextModel(torch.nn.Module): + def __init__(self, config_dict, dtype, device): + super().__init__() + self.num_layers = config_dict["num_hidden_layers"] + self.text_model = CLIPTextModel_(config_dict, dtype, device) + embed_dim = config_dict["hidden_size"] + self.text_projection = nn.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device) + self.text_projection.weight.copy_(torch.eye(embed_dim)) + self.dtype = dtype + + def get_input_embeddings(self): + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, embeddings): + self.text_model.embeddings.token_embedding = embeddings + + def forward(self, *args, **kwargs): + x = self.text_model(*args, **kwargs) + out = self.text_projection(x[2]) + return (x[0], x[1], out, x[2]) + + +class ClipTokenWeightEncoder: + def encode_token_weights(self, token_weight_pairs): + tokens = list(map(lambda a: a[0], token_weight_pairs[0])) + out, pooled = self([tokens]) + if pooled is not None: + first_pooled = pooled[0:1].cpu() + else: + first_pooled = pooled + output = [out[0:1]] + return torch.cat(output, dim=-2).cpu(), first_pooled + + +class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): + """Uses the CLIP transformer encoder for text (from huggingface)""" + + LAYERS = ["last", "pooled", "hidden"] + + def __init__( + self, + device="cpu", + max_length=77, + layer="last", + layer_idx=None, + textmodel_json_config=None, + dtype=None, + model_class=CLIPTextModel, + special_tokens={"start": 49406, "end": 49407, "pad": 49407}, + layer_norm_hidden_state=True, + return_projected_pooled=True, + ): + super().__init__() + assert layer in self.LAYERS + self.transformer = model_class(textmodel_json_config, dtype, device) + self.num_layers = self.transformer.num_layers + self.max_length = max_length + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + self.layer = layer + self.layer_idx = None + self.special_tokens = special_tokens + self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055)) + self.layer_norm_hidden_state = layer_norm_hidden_state + self.return_projected_pooled = return_projected_pooled + if layer == "hidden": + assert layer_idx is not None + assert abs(layer_idx) < self.num_layers + self.set_clip_options({"layer": layer_idx}) + self.options_default = (self.layer, self.layer_idx, self.return_projected_pooled) + + def set_attn_mode(self, mode): + raise NotImplementedError("This model does not support setting the attention mode") + + def set_clip_options(self, options): + layer_idx = options.get("layer", self.layer_idx) + self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled) + if layer_idx is None or abs(layer_idx) > self.num_layers: + self.layer = "last" + else: + self.layer = "hidden" + self.layer_idx = layer_idx + + def forward(self, tokens): + backup_embeds = self.transformer.get_input_embeddings() + device = backup_embeds.weight.device + tokens = torch.LongTensor(tokens).to(device) + outputs = self.transformer( + tokens, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state + ) + self.transformer.set_input_embeddings(backup_embeds) + if self.layer == "last": + z = outputs[0] + else: + z = outputs[1] + pooled_output = None + if len(outputs) >= 3: + if not self.return_projected_pooled and len(outputs) >= 4 and 
outputs[3] is not None: + pooled_output = outputs[3].float() + elif outputs[2] is not None: + pooled_output = outputs[2].float() + return z.float(), pooled_output + + def set_attn_mode(self, mode): + clip_text_model = self.transformer.text_model + for layer in clip_text_model.encoder.layers: + layer.self_attn.set_attn_mode(mode) + + +class SDXLClipG(SDClipModel): + """Wraps the CLIP-G model into the SD-CLIP-Model interface""" + + def __init__(self, config, device="cpu", layer="penultimate", layer_idx=None, dtype=None): + if layer == "penultimate": + layer = "hidden" + layer_idx = -2 + super().__init__( + device=device, + layer=layer, + layer_idx=layer_idx, + textmodel_json_config=config, + dtype=dtype, + special_tokens={"start": 49406, "end": 49407, "pad": 0}, + layer_norm_hidden_state=False, + ) + + def set_attn_mode(self, mode): + clip_text_model = self.transformer.text_model + for layer in clip_text_model.encoder.layers: + layer.self_attn.set_attn_mode(mode) + + +class T5XXLModel(SDClipModel): + """Wraps the T5-XXL model into the SD-CLIP-Model interface for convenience""" + + def __init__(self, config, device="cpu", layer="last", layer_idx=None, dtype=None): + super().__init__( + device=device, + layer=layer, + layer_idx=layer_idx, + textmodel_json_config=config, + dtype=dtype, + special_tokens={"end": 1, "pad": 0}, + model_class=T5, + ) + + def set_attn_mode(self, mode): + t5: T5 = self.transformer + for t5block in t5.encoder.block: + t5block: T5Block + t5layer: T5LayerSelfAttention = t5block.layer[0] + t5SaSa: T5Attention = t5layer.SelfAttention + t5SaSa.set_attn_mode(mode) + + +################################################################################################# +### T5 implementation, for the T5-XXL text encoder portion, largely pulled from upstream impl +################################################################################################# + + +class T5XXLTokenizer(SDTokenizer): + """Wraps the T5 Tokenizer from HF into the SDTokenizer interface""" + + def __init__(self): + super().__init__( + pad_with_end=False, + tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"), + has_start_token=False, + pad_to_max_length=False, + max_length=99999999, + min_length=77, + ) + + +class T5LayerNorm(torch.nn.Module): + def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None): + super().__init__() + self.weight = torch.nn.Parameter(torch.ones(hidden_size, dtype=dtype, device=device)) + self.variance_epsilon = eps + + def forward(self, x): + variance = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + return self.weight.to(device=x.device, dtype=x.dtype) * x + + +class T5DenseGatedActDense(torch.nn.Module): + def __init__(self, model_dim, ff_dim, dtype, device): + super().__init__() + self.wi_0 = nn.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device) + self.wi_1 = nn.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device) + self.wo = nn.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device) + + def forward(self, x): + hidden_gelu = torch.nn.functional.gelu(self.wi_0(x), approximate="tanh") + hidden_linear = self.wi_1(x) + x = hidden_gelu * hidden_linear + x = self.wo(x) + return x + + +class T5LayerFF(torch.nn.Module): + def __init__(self, model_dim, ff_dim, dtype, device): + super().__init__() + self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, dtype, device) + self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) + + def forward(self, x): + 
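        # Pre-norm residual FFN: T5LayerNorm (RMS-style, no mean subtraction) ->
        # gated-GELU feed-forward (T5DenseGatedActDense, despite the DenseReluDense
        # attribute name) -> add the result back onto x.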
forwarded_states = self.layer_norm(x) + forwarded_states = self.DenseReluDense(forwarded_states) + x += forwarded_states + return x + + +class T5Attention(torch.nn.Module): + def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device): + super().__init__() + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) + self.k = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) + self.v = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) + self.o = nn.Linear(inner_dim, model_dim, bias=False, dtype=dtype, device=device) + self.num_heads = num_heads + self.relative_attention_bias = None + if relative_attention_bias: + self.relative_attention_num_buckets = 32 + self.relative_attention_max_distance = 128 + self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device) + + self.attn_mode = "xformers" # TODO 何とかする + + def set_attn_mode(self, mode): + self.attn_mode = mode + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
+ This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) + ) + relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length, device): + """Compute binned relative position bias""" + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=True, + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + + def forward(self, x, past_bias=None): + q = self.q(x) + k = self.k(x) + v = self.v(x) + if self.relative_attention_bias is not None: + past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device) + if past_bias is not None: + mask = past_bias + out = attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask, mode=self.attn_mode) + return self.o(out), past_bias + + +class T5LayerSelfAttention(torch.nn.Module): + def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device): + super().__init__() + self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device) + self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) + + def forward(self, x, past_bias=None): + output, past_bias = self.SelfAttention(self.layer_norm(x), past_bias=past_bias) + x += output + return x, past_bias + + +class T5Block(torch.nn.Module): + def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device): + super().__init__() + self.layer = torch.nn.ModuleList() + self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device)) + self.layer.append(T5LayerFF(model_dim, ff_dim, dtype, device)) 
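        # Only the first T5Block in T5Stack is built with relative_attention_bias=True;
        # its self-attention computes the position bias once (past_bias) and forward()
        # returns it so every later block can reuse the same bias tensor.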
+ + def forward(self, x, past_bias=None): + x, past_bias = self.layer[0](x, past_bias) + x = self.layer[-1](x) + return x, past_bias + + +class T5Stack(torch.nn.Module): + def __init__(self, num_layers, model_dim, inner_dim, ff_dim, num_heads, vocab_size, dtype, device): + super().__init__() + self.embed_tokens = torch.nn.Embedding(vocab_size, model_dim, device=device) + self.block = torch.nn.ModuleList( + [ + T5Block(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias=(i == 0), dtype=dtype, device=device) + for i in range(num_layers) + ] + ) + self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) + + def forward(self, input_ids, intermediate_output=None, final_layer_norm_intermediate=True): + intermediate = None + x = self.embed_tokens(input_ids) + past_bias = None + for i, l in enumerate(self.block): + # print(i, x.mean(), x.std()) + x, past_bias = l(x, past_bias) + if i == intermediate_output: + intermediate = x.clone() + # print(x.mean(), x.std()) + x = self.final_layer_norm(x) + if intermediate is not None and final_layer_norm_intermediate: + intermediate = self.final_layer_norm(intermediate) + # print(x.mean(), x.std()) + return x, intermediate + + +class T5(torch.nn.Module): + def __init__(self, config_dict, dtype, device): + super().__init__() + self.num_layers = config_dict["num_layers"] + self.encoder = T5Stack( + self.num_layers, + config_dict["d_model"], + config_dict["d_model"], + config_dict["d_ff"], + config_dict["num_heads"], + config_dict["vocab_size"], + dtype, + device, + ) + self.dtype = dtype + + def get_input_embeddings(self): + return self.encoder.embed_tokens + + def set_input_embeddings(self, embeddings): + self.encoder.embed_tokens = embeddings + + def forward(self, *args, **kwargs): + return self.encoder(*args, **kwargs) + + +def create_clip_l(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[str, torch.Tensor]] = None): + r""" + state_dict is not loaded, but updated with missing keys + """ + CLIPL_CONFIG = { + "hidden_act": "quick_gelu", + "hidden_size": 768, + "intermediate_size": 3072, + "num_attention_heads": 12, + "num_hidden_layers": 12, + } + with torch.no_grad(): + clip_l = SDClipModel( + layer="hidden", + layer_idx=-2, + device=device, + dtype=dtype, + layer_norm_hidden_state=False, + return_projected_pooled=False, + textmodel_json_config=CLIPL_CONFIG, + ) + if state_dict is not None: + # update state_dict if provided to include logit_scale and text_projection.weight avoid errors + if "logit_scale" not in state_dict: + state_dict["logit_scale"] = clip_l.logit_scale + if "transformer.text_projection.weight" not in state_dict: + state_dict["transformer.text_projection.weight"] = clip_l.transformer.text_projection.weight + return clip_l + + +def create_clip_g(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[str, torch.Tensor]] = None): + r""" + state_dict is not loaded, but updated with missing keys + """ + CLIPG_CONFIG = { + "hidden_act": "gelu", + "hidden_size": 1280, + "intermediate_size": 5120, + "num_attention_heads": 20, + "num_hidden_layers": 32, + } + with torch.no_grad(): + clip_g = SDXLClipG(CLIPG_CONFIG, device=device, dtype=dtype) + if state_dict is not None: + if "logit_scale" not in state_dict: + state_dict["logit_scale"] = clip_g.logit_scale + return clip_g + + +def create_t5xxl(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[str, torch.Tensor]] = None) -> T5XXLModel: + T5_CONFIG = {"d_ff": 10240, "d_model": 4096, "num_heads": 64, "num_layers": 24, "vocab_size": 32128} + 
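    # Like create_clip_l/create_clip_g above, this only patches the caller's state_dict so a
    # later load_state_dict call succeeds: a logit_scale entry is added if missing, and
    # transformer.shared.weight is dropped, presumably because this encoder-only T5 keeps its
    # token embedding in encoder.embed_tokens and has no shared embedding module.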
with torch.no_grad(): + t5 = T5XXLModel(T5_CONFIG, dtype=dtype, device=device) + if state_dict is not None: + if "logit_scale" not in state_dict: + state_dict["logit_scale"] = t5.logit_scale + if "transformer.shared.weight" in state_dict: + state_dict.pop("transformer.shared.weight") + return t5 + + +# endregion diff --git a/library/sd3_utils.py b/library/sd3_utils.py new file mode 100644 index 000000000..6f8c361fd --- /dev/null +++ b/library/sd3_utils.py @@ -0,0 +1,113 @@ +import math +from typing import Dict +import torch + +from library import sd3_models + + +def get_cond( + prompt: str, + tokenizer: sd3_models.SD3Tokenizer, + clip_l: sd3_models.SDClipModel, + clip_g: sd3_models.SDXLClipG, + t5xxl: sd3_models.T5XXLModel, +): + l_tokens, g_tokens, t5_tokens = tokenizer.tokenize_with_weights(prompt) + l_out, l_pooled = clip_l.encode_token_weights(l_tokens) + g_out, g_pooled = clip_g.encode_token_weights(g_tokens) + lg_out = torch.cat([l_out, g_out], dim=-1) + lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1])) + + if t5_tokens is None: + t5_out = torch.zeros((lg_out.shape[0], 77, 4096), device=lg_out.device) + else: + t5_out, t5_pooled = t5xxl.encode_token_weights(t5_tokens) # t5_out is [1, 77, 4096], t5_pooled is None + t5_out = t5_out.to(lg_out.dtype) + + return torch.cat([lg_out, t5_out], dim=-2), torch.cat((l_pooled, g_pooled), dim=-1) + + +# used if other sd3 models is available +r""" +def get_sd3_configs(state_dict: Dict): + # Important configuration values can be quickly determined by checking shapes in the source file + # Some of these will vary between models (eg 2B vs 8B primarily differ in their depth, but also other details change) + # prefix = "model.diffusion_model." + prefix = "" + + patch_size = state_dict[prefix + "x_embedder.proj.weight"].shape[2] + depth = state_dict[prefix + "x_embedder.proj.weight"].shape[0] // 64 + num_patches = state_dict[prefix + "pos_embed"].shape[1] + pos_embed_max_size = round(math.sqrt(num_patches)) + adm_in_channels = state_dict[prefix + "y_embedder.mlp.0.weight"].shape[1] + context_shape = state_dict[prefix + "context_embedder.weight"].shape + context_embedder_config = { + "target": "torch.nn.Linear", + "params": {"in_features": context_shape[1], "out_features": context_shape[0]}, + } + return { + "patch_size": patch_size, + "depth": depth, + "num_patches": num_patches, + "pos_embed_max_size": pos_embed_max_size, + "adm_in_channels": adm_in_channels, + "context_embedder": context_embedder_config, + } + + +def create_mmdit_from_sd3_checkpoint(state_dict: Dict, attn_mode: str = "xformers"): + "" + Doesn't load state dict. 
+ "" + sd3_configs = get_sd3_configs(state_dict) + + mmdit = sd3_models.MMDiT( + input_size=None, + pos_embed_max_size=sd3_configs["pos_embed_max_size"], + patch_size=sd3_configs["patch_size"], + in_channels=16, + adm_in_channels=sd3_configs["adm_in_channels"], + depth=sd3_configs["depth"], + mlp_ratio=4, + qk_norm=None, + num_patches=sd3_configs["num_patches"], + context_size=4096, + attn_mode=attn_mode, + ) + return mmdit +""" + + +class ModelSamplingDiscreteFlow: + """Helper for sampler scheduling (ie timestep/sigma calculations) for Discrete Flow models""" + + def __init__(self, shift=1.0): + self.shift = shift + timesteps = 1000 + self.sigmas = self.sigma(torch.arange(1, timesteps + 1, 1)) + + @property + def sigma_min(self): + return self.sigmas[0] + + @property + def sigma_max(self): + return self.sigmas[-1] + + def timestep(self, sigma): + return sigma * 1000 + + def sigma(self, timestep: torch.Tensor): + timestep = timestep / 1000.0 + if self.shift == 1.0: + return timestep + return self.shift * timestep / (1 + (self.shift - 1) * timestep) + + def calculate_denoised(self, sigma, model_output, model_input): + sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + return model_input - model_output * sigma + + def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): + # assert max_denoise is False, "max_denoise not implemented" + # max_denoise is always True, I'm not sure why it's there + return sigma * noise + (1.0 - sigma) * latent_image diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py new file mode 100644 index 000000000..e14f784d4 --- /dev/null +++ b/sd3_minimal_inference.py @@ -0,0 +1,347 @@ +# Minimum Inference Code for SD3 + +import argparse +import datetime +import math +import os +import random +from typing import Optional, Tuple +import numpy as np + +import torch +from safetensors.torch import safe_open, load_file +from tqdm import tqdm +from PIL import Image + +from library.device_utils import init_ipex, get_preferred_device + +init_ipex() + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +from library import sd3_models, sd3_utils + + +def get_noise(seed, latent): + generator = torch.manual_seed(seed) + return torch.randn(latent.size(), dtype=torch.float32, layout=latent.layout, generator=generator, device="cpu").to(latent.dtype) + + +def get_sigmas(sampling: sd3_utils.ModelSamplingDiscreteFlow, steps): + start = sampling.timestep(sampling.sigma_max) + end = sampling.timestep(sampling.sigma_min) + timesteps = torch.linspace(start, end, steps) + sigs = [] + for x in range(len(timesteps)): + ts = timesteps[x] + sigs.append(sampling.sigma(ts)) + sigs += [0.0] + return torch.FloatTensor(sigs) + + +def max_denoise(model_sampling, sigmas): + max_sigma = float(model_sampling.sigma_max) + sigma = float(sigmas[0]) + return math.isclose(max_sigma, sigma, rel_tol=1e-05) or sigma > max_sigma + + +def do_sample( + height: int, + width: int, + initial_latent: Optional[torch.Tensor], + seed: int, + cond: Tuple[torch.Tensor, torch.Tensor], + neg_cond: Tuple[torch.Tensor, torch.Tensor], + mmdit: sd3_models.MMDiT, + steps: int, + guidance_scale: float, + dtype: torch.dtype, + device: str, +): + if initial_latent is None: + latent = torch.ones(1, 16, height // 8, width // 8, device=device) * 0.0609 + else: + latent = initial_latent + + latent = latent.to(dtype).to(device) + + noise = get_noise(seed, latent).to(device) + + model_sampling = sd3_utils.ModelSamplingDiscreteFlow() 
+ + sigmas = get_sigmas(model_sampling, steps).to(device) + # sigmas = sigmas[int(steps * (1 - denoise)) :] # do not support i2i + + # conditioning = fix_cond(conditioning) + # neg_cond = fix_cond(neg_cond) + # extra_args = {"cond": cond, "uncond": neg_cond, "cond_scale": guidance_scale} + + noise_scaled = model_sampling.noise_scaling(sigmas[0], noise, latent, max_denoise(model_sampling, sigmas)) + + c_crossattn = torch.cat([cond[0], neg_cond[0]]).to(device).to(dtype) + y = torch.cat([cond[1], neg_cond[1]]).to(device).to(dtype) + + x = noise_scaled.to(device).to(dtype) + # print(x.shape) + + with torch.no_grad(): + for i in tqdm(range(len(sigmas) - 1)): + sigma_hat = sigmas[i] + + timestep = model_sampling.timestep(sigma_hat).float() + timestep = torch.FloatTensor([timestep, timestep]).to(device) + + x_c_nc = torch.cat([x, x], dim=0) + # print(x_c_nc.shape, timestep.shape, c_crossattn.shape, y.shape) + + model_output = mmdit(x_c_nc, timestep, context=c_crossattn, y=y) + model_output = model_output.float() + batched = model_sampling.calculate_denoised(sigma_hat, model_output, x) + + pos_out, neg_out = batched.chunk(2) + denoised = neg_out + (pos_out - neg_out) * guidance_scale + # print(denoised.shape) + + # d = to_d(x, sigma_hat, denoised) + dims_to_append = x.ndim - sigma_hat.ndim + sigma_hat_dims = sigma_hat[(...,) + (None,) * dims_to_append] + # print(dims_to_append, x.shape, sigma_hat.shape, denoised.shape, sigma_hat_dims.shape) + """Converts a denoiser output to a Karras ODE derivative.""" + d = (x - denoised) / sigma_hat_dims + + dt = sigmas[i + 1] - sigma_hat + + # Euler method + x = x + d * dt + x = x.to(dtype) + + latent = x + scale_factor = 1.5305 + shift_factor = 0.0609 + # def process_out(self, latent): + # return (latent / self.scale_factor) + self.shift_factor + latent = (latent / scale_factor) + shift_factor + return latent + + +if __name__ == "__main__": + target_height = 1024 + target_width = 1024 + + # steps = 50 # 28 # 50 + guidance_scale = 5 + # seed = 1 # None # 1 + + device = get_preferred_device() + + parser = argparse.ArgumentParser() + parser.add_argument("--ckpt_path", type=str, required=True) + parser.add_argument("--clip_g", type=str, required=False) + parser.add_argument("--clip_l", type=str, required=False) + parser.add_argument("--t5xxl", type=str, required=False) + parser.add_argument("--prompt", type=str, default="A photo of a cat") + # parser.add_argument("--prompt2", type=str, default=None) # do not support different prompts for text encoders + parser.add_argument("--negative_prompt", type=str, default="") + parser.add_argument("--output_dir", type=str, default=".") + parser.add_argument("--do_not_use_t5xxl", action="store_true") + parser.add_argument("--attn_mode", type=str, default="torch", help="torch (SDPA) or xformers. 
default: torch") + parser.add_argument("--fp16", action="store_true") + parser.add_argument("--bf16", action="store_true") + parser.add_argument("--seed", type=int, default=1) + parser.add_argument("--steps", type=int, default=50) + # parser.add_argument( + # "--lora_weights", + # type=str, + # nargs="*", + # default=[], + # help="LoRA weights, only supports networks.lora, each argument is a `path;multiplier` (semi-colon separated)", + # ) + # parser.add_argument("--interactive", action="store_true") + args = parser.parse_args() + + seed = args.seed + steps = args.steps + + sd3_dtype = torch.float32 + if args.fp16: + sd3_dtype = torch.float16 + elif args.bf16: + sd3_dtype = torch.bfloat16 + + # TODO test with separated safetenors files for each model + + # load state dict + logger.info(f"Loading SD3 models from {args.ckpt_path}...") + state_dict = load_file(args.ckpt_path) + + if "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight" in state_dict: + # found clip_g: remove prefix "text_encoders.clip_g." + logger.info("clip_g is included in the checkpoint") + clip_g_sd = {} + prefix = "text_encoders.clip_g." + for k, v in list(state_dict.items()): + if k.startswith(prefix): + clip_g_sd[k[len(prefix) :]] = state_dict.pop(k) + else: + logger.info(f"Lodaing clip_g from {args.clip_g}...") + clip_g_sd = load_file(args.clip_g) + for key in list(clip_g_sd.keys()): + clip_g_sd["transformer." + key] = clip_g_sd.pop(key) + + if "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight" in state_dict: + # found clip_l: remove prefix "text_encoders.clip_l." + logger.info("clip_l is included in the checkpoint") + clip_l_sd = {} + prefix = "text_encoders.clip_l." + for k, v in list(state_dict.items()): + if k.startswith(prefix): + clip_l_sd[k[len(prefix) :]] = state_dict.pop(k) + else: + logger.info(f"Lodaing clip_l from {args.clip_l}...") + clip_l_sd = load_file(args.clip_l) + for key in list(clip_l_sd.keys()): + clip_l_sd["transformer." + key] = clip_l_sd.pop(key) + + if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight" in state_dict: + # found t5xxl: remove prefix "text_encoders.t5xxl." + logger.info("t5xxl is included in the checkpoint") + if not args.do_not_use_t5xxl: + t5xxl_sd = {} + prefix = "text_encoders.t5xxl." + for k, v in list(state_dict.items()): + if k.startswith(prefix): + t5xxl_sd[k[len(prefix) :]] = state_dict.pop(k) + else: + logger.info("but not used") + for key in list(state_dict.keys()): + if key.startswith("text_encoders.t5xxl."): + state_dict.pop(key) + t5xxl_sd = None + elif args.t5xxl: + assert not args.do_not_use_t5xxl, "t5xxl is not used but specified" + logger.info(f"Lodaing t5xxl from {args.t5xxl}...") + t5xxl_sd = load_file(args.t5xxl) + for key in list(t5xxl_sd.keys()): + t5xxl_sd["transformer." + key] = t5xxl_sd.pop(key) + else: + logger.info("t5xxl is not used") + t5xxl_sd = None + + use_t5xxl = t5xxl_sd is not None + + # MMDiT and VAE + vae_sd = {} + vae_prefix = "first_stage_model." + mmdit_prefix = "model.diffusion_model." 
+ for k, v in list(state_dict.items()): + if k.startswith(vae_prefix): + vae_sd[k[len(vae_prefix) :]] = state_dict.pop(k) + elif k.startswith(mmdit_prefix): + state_dict[k[len(mmdit_prefix) :]] = state_dict.pop(k) + + # load tokenizers + logger.info("Loading tokenizers...") + tokenizer = sd3_models.SD3Tokenizer(use_t5xxl) # combined tokenizer + + # load models + # logger.info("Create MMDiT from SD3 checkpoint...") + # mmdit = sd3_utils.create_mmdit_from_sd3_checkpoint(state_dict) + logger.info("Create MMDiT") + mmdit = sd3_models.create_mmdit_sd3_medium_configs(args.attn_mode) + + logger.info("Loading state dict...") + info = mmdit.load_state_dict(state_dict) + logger.info(f"Loaded MMDiT: {info}") + + logger.info(f"Move MMDiT to {device} and {sd3_dtype}...") + mmdit.to(device, dtype=sd3_dtype) + mmdit.eval() + + # load VAE + logger.info("Create VAE") + vae = sd3_models.SDVAE() + logger.info("Loading state dict...") + info = vae.load_state_dict(vae_sd) + logger.info(f"Loaded VAE: {info}") + + logger.info(f"Move VAE to {device} and {sd3_dtype}...") + vae.to(device, dtype=sd3_dtype) + vae.eval() + + # load text encoders + logger.info("Create clip_l") + clip_l = sd3_models.create_clip_l(device, sd3_dtype, clip_l_sd) + + logger.info("Loading state dict...") + info = clip_l.load_state_dict(clip_l_sd) + logger.info(f"Loaded clip_l: {info}") + + logger.info(f"Move clip_l to {device} and {sd3_dtype}...") + clip_l.to(device, dtype=sd3_dtype) + clip_l.eval() + logger.info(f"Set attn_mode to {args.attn_mode}...") + clip_l.set_attn_mode(args.attn_mode) + + logger.info("Create clip_g") + clip_g = sd3_models.create_clip_g(device, sd3_dtype, clip_g_sd) + + logger.info("Loading state dict...") + info = clip_g.load_state_dict(clip_g_sd) + logger.info(f"Loaded clip_g: {info}") + + logger.info(f"Move clip_g to {device} and {sd3_dtype}...") + clip_g.to(device, dtype=sd3_dtype) + clip_g.eval() + logger.info(f"Set attn_mode to {args.attn_mode}...") + clip_g.set_attn_mode(args.attn_mode) + + if use_t5xxl: + logger.info("Create t5xxl") + t5xxl = sd3_models.create_t5xxl(device, sd3_dtype, t5xxl_sd) + + logger.info("Loading state dict...") + info = t5xxl.load_state_dict(t5xxl_sd) + logger.info(f"Loaded t5xxl: {info}") + + logger.info(f"Move t5xxl to {device} and {sd3_dtype}...") + t5xxl.to(device, dtype=sd3_dtype) + # t5xxl.to("cpu", dtype=torch.float32) # run on CPU + t5xxl.eval() + logger.info(f"Set attn_mode to {args.attn_mode}...") + t5xxl.set_attn_mode(args.attn_mode) + else: + t5xxl = None + + # prepare embeddings + logger.info("Encoding prompts...") + # embeds, pooled_embed + cond = sd3_utils.get_cond(args.prompt, tokenizer, clip_l, clip_g, t5xxl) + neg_cond = sd3_utils.get_cond(args.negative_prompt, tokenizer, clip_l, clip_g, t5xxl) + + # generate image + logger.info("Generating image...") + latent_sampled = do_sample( + target_height, target_width, None, seed, cond, neg_cond, mmdit, steps, guidance_scale, sd3_dtype, device + ) + + # latent to image + with torch.no_grad(): + image = vae.decode(latent_sampled) + image = image.float() + image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0] + decoded_np = 255.0 * np.moveaxis(image.cpu().numpy(), 0, 2) + decoded_np = decoded_np.astype(np.uint8) + out_image = Image.fromarray(decoded_np) + + # save image + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, f"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.png") + out_image.save(output_path) + + logger.info(f"Saved image to {output_path}") From 
d53ea22b2a8366e6bc9f14aaeec057cd817f60d3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 23 Jun 2024 23:38:20 +0900 Subject: [PATCH 028/582] sd3 training --- README.md | 25 + library/sai_model_spec.py | 20 +- library/sd3_models.py | 102 ++++- library/sd3_train_utils.py | 544 ++++++++++++++++++++++ library/sd3_utils.py | 211 ++++++++- library/train_util.py | 137 +++++- sd3_minimal_inference.py | 7 +- sd3_train.py | 907 +++++++++++++++++++++++++++++++++++++ 8 files changed, 1909 insertions(+), 44 deletions(-) create mode 100644 library/sd3_train_utils.py create mode 100644 sd3_train.py diff --git a/README.md b/README.md index 946df58f3..34aa2bb2f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,30 @@ This repository contains training, generation and utility scripts for Stable Diffusion. +## SD3 training + +SD3 training is done with `sd3_train.py`. + +`optimizer_type = "adafactor"` is recommended for 24GB VRAM GPUs. `cache_text_encoder_outputs_to_disk` and `cache_latents_to_disk` are necessary currently. + +`clip_l`, `clip_g` and `t5xxl` can be specified if the checkpoint does not include them. + +t5xxl doesn't seem to work with `fp16`, so use`bf16` or `fp32`. + +There are `t5xxl_device` and `t5xxl_dtype` options for `t5xxl` device and dtype. + +```toml +learning_rate = 1e-5 # seems to be too high +optimizer_type = "adafactor" +optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ] +cache_text_encoder_outputs = true +cache_text_encoder_outputs_to_disk = true +vae_batch_size = 1 +cache_latents = true +cache_latents_to_disk = true +``` + +--- + [__Change History__](#change-history) is moved to the bottom of the page. 更新履歴は[ページ末尾](#change-history)に移しました。 diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index a63bd82ec..f7bf644d7 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -6,8 +6,10 @@ from typing import List, Optional, Tuple, Union import safetensors from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) r""" @@ -55,11 +57,14 @@ ARCH_SD_V2_512 = "stable-diffusion-v2-512" ARCH_SD_V2_768_V = "stable-diffusion-v2-768-v" ARCH_SD_XL_V1_BASE = "stable-diffusion-xl-v1-base" +ARCH_SD3_M = "stable-diffusion-3-medium" +ARCH_SD3_UNKNOWN = "stable-diffusion-3" ADAPTER_LORA = "lora" ADAPTER_TEXTUAL_INVERSION = "textual-inversion" IMPL_STABILITY_AI = "https://github.com/Stability-AI/generative-models" +IMPL_COMFY_UI = "https://github.com/comfyanonymous/ComfyUI" IMPL_DIFFUSERS = "diffusers" PRED_TYPE_EPSILON = "epsilon" @@ -113,7 +118,11 @@ def build_metadata( merged_from: Optional[str] = None, timesteps: Optional[Tuple[int, int]] = None, clip_skip: Optional[int] = None, + sd3: str = None, ): + """ + sd3: only supports "m" + """ # if state_dict is None, hash is not calculated metadata = {} @@ -126,6 +135,11 @@ def build_metadata( if sdxl: arch = ARCH_SD_XL_V1_BASE + elif sd3 is not None: + if sd3 == "m": + arch = ARCH_SD3_M + else: + arch = ARCH_SD3_UNKNOWN elif v2: if v_parameterization: arch = ARCH_SD_V2_768_V @@ -142,7 +156,7 @@ def build_metadata( metadata["modelspec.architecture"] = arch if not lora and not textual_inversion and is_stable_diffusion_ckpt is None: - is_stable_diffusion_ckpt = True # default is stable diffusion ckpt if not lora and not textual_inversion + is_stable_diffusion_ckpt = True # default is stable diffusion ckpt if not lora and not textual_inversion if (lora and sdxl) or textual_inversion or is_stable_diffusion_ckpt: # Stable Diffusion ckpt, 
TI, SDXL LoRA @@ -236,7 +250,7 @@ def build_metadata( # assert all([v is not None for v in metadata.values()]), metadata if not all([v is not None for v in metadata.values()]): logger.error(f"Internal error: some metadata values are None: {metadata}") - + return metadata @@ -250,7 +264,7 @@ def get_title(metadata: dict) -> Optional[str]: def load_metadata_from_safetensors(model: str) -> dict: if not model.endswith(".safetensors"): return {} - + with safetensors.safe_open(model, framework="pt") as f: metadata = f.metadata() if metadata is None: diff --git a/library/sd3_models.py b/library/sd3_models.py index 294a69b06..a4fe400e3 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -1,11 +1,13 @@ -# some modules/classes are copied and modified from https://github.com/mcmonkey4eva/sd3-ref +# some modules/classes are copied and modified from https://github.com/mcmonkey4eva/sd3-ref # the original code is licensed under the MIT License # and some module/classes are contributed from KohakuBlueleaf. Thanks for the contribution! +from ast import Tuple from functools import partial import math -from typing import Dict, Optional +from types import SimpleNamespace +from typing import Dict, List, Optional, Union import einops import numpy as np import torch @@ -106,6 +108,8 @@ def __init__(self, t5xxl=True): self.clip_l = SDTokenizer(tokenizer=clip_tokenizer) self.clip_g = SDXLClipGTokenizer(clip_tokenizer) self.t5xxl = T5XXLTokenizer() if t5xxl else None + # t5xxl has 99999999 max length, clip has 77 + self.model_max_length = self.clip_l.max_length # 77 def tokenize_with_weights(self, text: str): return ( @@ -870,6 +874,10 @@ def __init__( self.final_layer = UnPatch(self.hidden_size, patch_size, self.out_channels) # self.initialize_weights() + @property + def model_type(self): + return "m" # only support medium + def enable_gradient_checkpointing(self): self.gradient_checkpointing = True for block in self.joint_blocks: @@ -1013,6 +1021,10 @@ def create_mmdit_sd3_medium_configs(attn_mode: str): # endregion # region VAE +# TODO support xformers + +VAE_SCALE_FACTOR = 1.5305 +VAE_SHIFT_FACTOR = 0.0609 def Normalize(in_channels, num_groups=32, dtype=torch.float32, device=None): @@ -1222,6 +1234,14 @@ def __init__(self, dtype=torch.float32, device=None): self.encoder = VAEEncoder(dtype=dtype, device=device) self.decoder = VAEDecoder(dtype=dtype, device=device) + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + @torch.autocast("cuda", dtype=torch.float16) def decode(self, latent): return self.decoder(latent) @@ -1234,6 +1254,43 @@ def encode(self, image): std = torch.exp(0.5 * logvar) return mean + std * torch.randn_like(mean) + @staticmethod + def process_in(latent): + return (latent - VAE_SHIFT_FACTOR) * VAE_SCALE_FACTOR + + @staticmethod + def process_out(latent): + return (latent / VAE_SCALE_FACTOR) + VAE_SHIFT_FACTOR + + +class VAEOutput: + def __init__(self, latent): + self.latent = latent + + @property + def latent_dist(self): + return self + + def sample(self): + return self.latent + + +class VAEWrapper: + def __init__(self, vae): + self.vae = vae + + @property + def device(self): + return self.vae.device + + @property + def dtype(self): + return self.vae.dtype + + # latents = vae.encode(img_tensors).latent_dist.sample().to("cpu") + def encode(self, image): + return VAEOutput(self.vae.encode(image)) + # endregion @@ -1370,15 +1427,39 @@ def forward(self, *args, **kwargs): class 
ClipTokenWeightEncoder: - def encode_token_weights(self, token_weight_pairs): - tokens = list(map(lambda a: a[0], token_weight_pairs[0])) - out, pooled = self([tokens]) - if pooled is not None: - first_pooled = pooled[0:1].cpu() + # def encode_token_weights(self, token_weight_pairs): + # tokens = list(map(lambda a: a[0], token_weight_pairs[0])) + # out, pooled = self([tokens]) + # if pooled is not None: + # first_pooled = pooled[0:1] + # else: + # first_pooled = pooled + # output = [out[0:1]] + # return torch.cat(output, dim=-2), first_pooled + + # fix to support batched inputs + # : Union[List[Tuple[torch.Tensor, torch.Tensor]], List[List[Tuple[torch.Tensor, torch.Tensor]]]] + def encode_token_weights(self, list_of_token_weight_pairs): + has_batch = isinstance(list_of_token_weight_pairs[0][0], list) + + if has_batch: + list_of_tokens = [] + for pairs in list_of_token_weight_pairs: + tokens = [a[0] for a in pairs[0]] # I'm not sure why this is [0] + list_of_tokens.append(tokens) else: - first_pooled = pooled - output = [out[0:1]] - return torch.cat(output, dim=-2).cpu(), first_pooled + list_of_tokens = [[a[0] for a in list_of_token_weight_pairs[0]]] + + out, pooled = self(list_of_tokens) + if has_batch: + return out, pooled + else: + if pooled is not None: + first_pooled = pooled[0:1] + else: + first_pooled = pooled + output = [out[0:1]] + return torch.cat(output, dim=-2), first_pooled class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): @@ -1694,6 +1775,7 @@ def forward(self, input_ids, intermediate_output=None, final_layer_norm_intermed x = self.embed_tokens(input_ids) past_bias = None for i, l in enumerate(self.block): + # uncomment to debug layerwise output: fp16 may cause issues # print(i, x.mean(), x.std()) x, past_bias = l(x, past_bias) if i == intermediate_output: diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py new file mode 100644 index 000000000..4e45871f4 --- /dev/null +++ b/library/sd3_train_utils.py @@ -0,0 +1,544 @@ +import argparse +import math +import os +from typing import Optional, Tuple + +import torch +from safetensors.torch import save_file + +from library import sd3_models, sd3_utils, train_util +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() + +from accelerate import init_empty_weights +from tqdm import tqdm + +# from transformers import CLIPTokenizer +# from library import model_util +# , sdxl_model_util, train_util, sdxl_original_unet +# from library.sdxl_lpw_stable_diffusion import SdxlStableDiffusionLongPromptWeightingPipeline +from .utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +from .sdxl_train_util import match_mixed_precision + + +def load_target_model(args, accelerator, attn_mode, weight_dtype, t5xxl_device, t5xxl_dtype) -> Tuple[ + sd3_models.MMDiT, + Optional[sd3_models.SDClipModel], + Optional[sd3_models.SDXLClipG], + Optional[sd3_models.T5XXLModel], + sd3_models.SDVAE, +]: + model_dtype = match_mixed_precision(args, weight_dtype) # prepare fp16/bf16 + + for pi in range(accelerator.state.num_processes): + if pi == accelerator.state.local_process_index: + logger.info(f"loading model for process {accelerator.state.local_process_index}/{accelerator.state.num_processes}") + + mmdit, clip_l, clip_g, t5xxl, vae = sd3_utils.load_models( + args.pretrained_model_name_or_path, + args.clip_l, + args.clip_g, + args.t5xxl, + args.vae, + attn_mode, + accelerator.device if args.lowram else "cpu", + weight_dtype, + args.disable_mmap_load_safetensors, + 
t5xxl_device, + t5xxl_dtype, + ) + + # work on low-ram device + if args.lowram: + if clip_l is not None: + clip_l.to(accelerator.device) + if clip_g is not None: + clip_g.to(accelerator.device) + if t5xxl is not None: + t5xxl.to(accelerator.device) + vae.to(accelerator.device) + mmdit.to(accelerator.device) + + clean_memory_on_device(accelerator.device) + accelerator.wait_for_everyone() + + return mmdit, clip_l, clip_g, t5xxl, vae + + +def save_models( + ckpt_path: str, + mmdit: sd3_models.MMDiT, + vae: sd3_models.SDVAE, + clip_l: sd3_models.SDClipModel, + clip_g: sd3_models.SDXLClipG, + t5xxl: Optional[sd3_models.T5XXLModel], + sai_metadata: Optional[dict], + save_dtype: Optional[torch.dtype] = None, +): + r""" + Save models to checkpoint file. Only supports unified checkpoint format. + """ + + state_dict = {} + + def update_sd(prefix, sd): + for k, v in sd.items(): + key = prefix + k + if save_dtype is not None: + v = v.detach().clone().to("cpu").to(save_dtype) + state_dict[key] = v + + update_sd("model.diffusion_model.", mmdit.state_dict()) + update_sd("first_stage_model.", vae.state_dict()) + + if clip_l is not None: + update_sd("text_encoders.clip_l.", clip_l.state_dict()) + if clip_g is not None: + update_sd("text_encoders.clip_g.", clip_g.state_dict()) + if t5xxl is not None: + update_sd("text_encoders.t5xxl.", t5xxl.state_dict()) + + save_file(state_dict, ckpt_path, metadata=sai_metadata) + + +def save_sd3_model_on_train_end( + args: argparse.Namespace, + save_dtype: torch.dtype, + epoch: int, + global_step: int, + clip_l: sd3_models.SDClipModel, + clip_g: sd3_models.SDXLClipG, + t5xxl: Optional[sd3_models.T5XXLModel], + mmdit: sd3_models.MMDiT, + vae: sd3_models.SDVAE, +): + def sd_saver(ckpt_file, epoch_no, global_step): + sai_metadata = train_util.get_sai_model_spec( + None, args, False, False, False, is_stable_diffusion_ckpt=True, sd3=mmdit.model_type + ) + save_models(ckpt_file, mmdit, vae, clip_l, clip_g, t5xxl, sai_metadata, save_dtype) + + train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None) + + +# epochとstepの保存、メタデータにepoch/stepが含まれ引数が同じになるため、統合している +# on_epoch_end: Trueならepoch終了時、Falseならstep経過時 +def save_sd3_model_on_epoch_end_or_stepwise( + args: argparse.Namespace, + on_epoch_end: bool, + accelerator, + save_dtype: torch.dtype, + epoch: int, + num_train_epochs: int, + global_step: int, + clip_l: sd3_models.SDClipModel, + clip_g: sd3_models.SDXLClipG, + t5xxl: Optional[sd3_models.T5XXLModel], + mmdit: sd3_models.MMDiT, + vae: sd3_models.SDVAE, +): + def sd_saver(ckpt_file, epoch_no, global_step): + sai_metadata = train_util.get_sai_model_spec( + None, args, False, False, False, is_stable_diffusion_ckpt=True, sd3=mmdit.model_type + ) + save_models(ckpt_file, mmdit, vae, clip_l, clip_g, t5xxl, sai_metadata, save_dtype) + + train_util.save_sd_model_on_epoch_end_or_stepwise_common( + args, + on_epoch_end, + accelerator, + True, + True, + epoch, + num_train_epochs, + global_step, + sd_saver, + None, + ) + + +def add_sd3_training_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする" + ) + parser.add_argument( + "--cache_text_encoder_outputs_to_disk", + action="store_true", + help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", + ) + parser.add_argument( + "--disable_mmap_load_safetensors", + action="store_true", + help="disable mmap load for safetensors. 
Speed up model loading in WSL environment / safetensorsのmmapロードを無効にする。WSL環境等でモデル読み込みを高速化できる", + ) + + parser.add_argument( + "--clip_l", + type=str, + required=False, + help="CLIP-L model path. if not specified, use ckpt's state_dict / CLIP-Lモデルのパス。指定しない場合はckptのstate_dictを使用", + ) + parser.add_argument( + "--clip_g", + type=str, + required=False, + help="CLIP-G model path. if not specified, use ckpt's state_dict / CLIP-Gモデルのパス。指定しない場合はckptのstate_dictを使用", + ) + parser.add_argument( + "--t5xxl", + type=str, + required=False, + help="T5-XXL model path. if not specified, use ckpt's state_dict / T5-XXLモデルのパス。指定しない場合はckptのstate_dictを使用", + ) + parser.add_argument( + "--save_clip", action="store_true", help="save CLIP models to checkpoint / CLIPモデルをチェックポイントに保存する" + ) + parser.add_argument( + "--save_t5xxl", action="store_true", help="save T5-XXL model to checkpoint / T5-XXLモデルをチェックポイントに保存する" + ) + + parser.add_argument( + "--t5xxl_device", + type=str, + default=None, + help="T5-XXL device. if not specified, use accelerator's device / T5-XXLデバイス。指定しない場合はacceleratorのデバイスを使用", + ) + parser.add_argument( + "--t5xxl_dtype", + type=str, + default=None, + help="T5-XXL dtype. if not specified, use default dtype (from mixed precision) / T5-XXL dtype。指定しない場合はデフォルトのdtype(mixed precisionから)を使用", + ) + + # copy from Diffusers + parser.add_argument( + "--weighting_scheme", + type=str, + default="logit_normal", + choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"], + ) + parser.add_argument( + "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme." + ) + parser.add_argument("--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme.") + parser.add_argument( + "--mode_scale", + type=float, + default=1.29, + help="Scale of mode weighting scheme. 
Only effective when using the `'mode'` as the `weighting_scheme`.", + ) + + +def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCaching: bool = True): + assert not args.v2, "v2 cannot be enabled in SDXL training / SDXL学習ではv2を有効にすることはできません" + if args.v_parameterization: + logger.warning("v_parameterization will be unexpected / SDXL学習ではv_parameterizationは想定外の動作になります") + + if args.clip_skip is not None: + logger.warning("clip_skip will be unexpected / SDXL学習ではclip_skipは動作しません") + + # if args.multires_noise_iterations: + # logger.info( + # f"Warning: SDXL has been trained with noise_offset={DEFAULT_NOISE_OFFSET}, but noise_offset is disabled due to multires_noise_iterations / SDXLはnoise_offset={DEFAULT_NOISE_OFFSET}で学習されていますが、multires_noise_iterationsが有効になっているためnoise_offsetは無効になります" + # ) + # else: + # if args.noise_offset is None: + # args.noise_offset = DEFAULT_NOISE_OFFSET + # elif args.noise_offset != DEFAULT_NOISE_OFFSET: + # logger.info( + # f"Warning: SDXL has been trained with noise_offset={DEFAULT_NOISE_OFFSET} / SDXLはnoise_offset={DEFAULT_NOISE_OFFSET}で学習されています" + # ) + # logger.info(f"noise_offset is set to {args.noise_offset} / noise_offsetが{args.noise_offset}に設定されました") + + assert ( + not hasattr(args, "weighted_captions") or not args.weighted_captions + ), "weighted_captions cannot be enabled in SDXL training currently / SDXL学習では今のところweighted_captionsを有効にすることはできません" + + if supportTextEncoderCaching: + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: + args.cache_text_encoder_outputs = True + logger.warning( + "cache_text_encoder_outputs is enabled because cache_text_encoder_outputs_to_disk is enabled / " + + "cache_text_encoder_outputs_to_diskが有効になっているためcache_text_encoder_outputsが有効になりました" + ) + + +def sample_images(*args, **kwargs): + return train_util.sample_images_common(SdxlStableDiffusionLongPromptWeightingPipeline, *args, **kwargs) + + +# region Diffusers + + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import SchedulerMixin +from diffusers.utils.torch_utils import randn_tensor +from diffusers.utils import BaseOutput + + +@dataclass +class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Euler scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. 
+ """ + + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + sample = sigma * noise + (1.0 - sigma) * sample + + return sample + + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
+ """ + self.num_inference_steps = num_inference_steps + + timesteps = np.linspace(self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps) + + sigmas = timesteps / self.config.num_train_timesteps + sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + + timesteps = sigmas * self.config.num_train_timesteps + self.timesteps = timesteps.to(device=device) + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or + tuple. + + Returns: + [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + """ + + if isinstance(timestep, int) or isinstance(timestep, torch.IntTensor) or isinstance(timestep, torch.LongTensor): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." 
+ ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + + gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 + + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator) + + eps = noise * s_noise + sigma_hat = sigma * (gamma + 1) + + if gamma > 0: + sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 + + # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + # NOTE: "original_sample" should not be an expected prediction_type but is left in for + # backwards compatibility + + # if self.config.prediction_type == "vector_field": + + denoised = sample - model_output * sigma + # 2. Convert to an ODE derivative + derivative = (sample - denoised) / sigma_hat + + dt = self.sigmas[self.step_index + 1] - sigma_hat + + prev_sample = sample + derivative * dt + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) + + def __len__(self): + return self.config.num_train_timesteps + + +# endregion diff --git a/library/sd3_utils.py b/library/sd3_utils.py index 6f8c361fd..c2c914123 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -1,30 +1,226 @@ import math -from typing import Dict +from typing import Dict, Optional, Union import torch +import safetensors +from safetensors.torch import load_file +from accelerate import init_empty_weights + +from .utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) from library import sd3_models +# TODO move some of functions to model_util.py +from library import sdxl_model_util + +# region models + + +def load_models( + ckpt_path: str, + clip_l_path: str, + clip_g_path: str, + t5xxl_path: str, + vae_path: str, + attn_mode: str, + device: Union[str, torch.device], + weight_dtype: torch.dtype, + disable_mmap: bool = False, + t5xxl_device: Optional[str] = None, + t5xxl_dtype: Optional[str] = None, +): + def load_state_dict(path: str, dvc: Union[str, torch.device] = device): + if disable_mmap: + return safetensors.torch.load(open(path, "rb").read()) + else: + try: + return load_file(path, device=dvc) + except: + return load_file(path) # prevent device invalid Error + + t5xxl_device = t5xxl_device or device + + logger.info(f"Loading SD3 models from {ckpt_path}...") + state_dict = load_state_dict(ckpt_path) + + # load clip_l + clip_l_sd = None + if clip_l_path: + logger.info(f"Loading clip_l from {clip_l_path}...") + clip_l_sd = load_state_dict(clip_l_path) + for key in list(clip_l_sd.keys()): + clip_l_sd["transformer." + key] = clip_l_sd.pop(key) + else: + if "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight" in state_dict: + # found clip_l: remove prefix "text_encoders.clip_l." + logger.info("clip_l is included in the checkpoint") + clip_l_sd = {} + prefix = "text_encoders.clip_l." 
+ for k in list(state_dict.keys()): + if k.startswith(prefix): + clip_l_sd[k[len(prefix) :]] = state_dict.pop(k) + + # load clip_g + clip_g_sd = None + if clip_g_path: + logger.info(f"Loading clip_g from {clip_g_path}...") + clip_g_sd = load_state_dict(clip_g_path) + for key in list(clip_g_sd.keys()): + clip_g_sd["transformer." + key] = clip_g_sd.pop(key) + else: + if "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight" in state_dict: + # found clip_g: remove prefix "text_encoders.clip_g." + logger.info("clip_g is included in the checkpoint") + clip_g_sd = {} + prefix = "text_encoders.clip_g." + for k in list(state_dict.keys()): + if k.startswith(prefix): + clip_g_sd[k[len(prefix) :]] = state_dict.pop(k) + + # load t5xxl + t5xxl_sd = None + if t5xxl_path: + logger.info(f"Loading t5xxl from {t5xxl_path}...") + t5xxl_sd = load_state_dict(t5xxl_path, t5xxl_device) + for key in list(t5xxl_sd.keys()): + t5xxl_sd["transformer." + key] = t5xxl_sd.pop(key) + else: + if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight" in state_dict: + # found t5xxl: remove prefix "text_encoders.t5xxl." + logger.info("t5xxl is included in the checkpoint") + t5xxl_sd = {} + prefix = "text_encoders.t5xxl." + for k in list(state_dict.keys()): + if k.startswith(prefix): + t5xxl_sd[k[len(prefix) :]] = state_dict.pop(k) + + # MMDiT and VAE + vae_sd = {} + if vae_path: + logger.info(f"Loading VAE from {vae_path}...") + vae_sd = load_state_dict(vae_path) + else: + # remove prefix "first_stage_model." + vae_sd = {} + vae_prefix = "first_stage_model." + for k in list(state_dict.keys()): + if k.startswith(vae_prefix): + vae_sd[k[len(vae_prefix) :]] = state_dict.pop(k) + + mmdit_prefix = "model.diffusion_model." + for k in list(state_dict.keys()): + if k.startswith(mmdit_prefix): + state_dict[k[len(mmdit_prefix) :]] = state_dict.pop(k) + else: + state_dict.pop(k) # remove other keys + + # load MMDiT + logger.info("Building MMDit") + with init_empty_weights(): + mmdit = sd3_models.create_mmdit_sd3_medium_configs(attn_mode) + + logger.info("Loading state dict...") + info = sdxl_model_util._load_state_dict_on_device(mmdit, state_dict, device, weight_dtype) + logger.info(f"Loaded MMDiT: {info}") + + # load ClipG and ClipL + if clip_l_sd is None: + clip_l = None + else: + logger.info("Building ClipL") + clip_l = sd3_models.create_clip_l(device, weight_dtype, clip_l_sd) + logger.info("Loading state dict...") + info = clip_l.load_state_dict(clip_l_sd) + logger.info(f"Loaded ClipL: {info}") + clip_l.set_attn_mode(attn_mode) + + if clip_g_sd is None: + clip_g = None + else: + logger.info("Building ClipG") + clip_g = sd3_models.create_clip_g(device, weight_dtype, clip_g_sd) + logger.info("Loading state dict...") + info = clip_g.load_state_dict(clip_g_sd) + logger.info(f"Loaded ClipG: {info}") + clip_g.set_attn_mode(attn_mode) + + # load T5XXL + if t5xxl_sd is None: + t5xxl = None + else: + logger.info("Building T5XXL") + t5xxl = sd3_models.create_t5xxl(t5xxl_device, t5xxl_dtype, t5xxl_sd) + logger.info("Loading state dict...") + info = t5xxl.load_state_dict(t5xxl_sd) + logger.info(f"Loaded T5XXL: {info}") + t5xxl.set_attn_mode(attn_mode) + + # load VAE + logger.info("Building VAE") + vae = sd3_models.SDVAE() + logger.info("Loading state dict...") + info = vae.load_state_dict(vae_sd) + logger.info(f"Loaded VAE: {info}") + + return mmdit, clip_l, clip_g, t5xxl, vae + + +# endregion +# region utils + def get_cond( prompt: str, tokenizer: sd3_models.SD3Tokenizer, clip_l: 
sd3_models.SDClipModel, clip_g: sd3_models.SDXLClipG, - t5xxl: sd3_models.T5XXLModel, + t5xxl: Optional[sd3_models.T5XXLModel] = None, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, ): l_tokens, g_tokens, t5_tokens = tokenizer.tokenize_with_weights(prompt) + return get_cond_from_tokens(l_tokens, g_tokens, t5_tokens, clip_l, clip_g, t5xxl, device=device, dtype=dtype) + + +def get_cond_from_tokens( + l_tokens, + g_tokens, + t5_tokens, + clip_l: sd3_models.SDClipModel, + clip_g: sd3_models.SDXLClipG, + t5xxl: Optional[sd3_models.T5XXLModel] = None, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, +): l_out, l_pooled = clip_l.encode_token_weights(l_tokens) g_out, g_pooled = clip_g.encode_token_weights(g_tokens) lg_out = torch.cat([l_out, g_out], dim=-1) lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1])) + if device is not None: + lg_out = lg_out.to(device=device) + l_pooled = l_pooled.to(device=device) + g_pooled = g_pooled.to(device=device) + if dtype is not None: + lg_out = lg_out.to(dtype=dtype) + l_pooled = l_pooled.to(dtype=dtype) + g_pooled = g_pooled.to(dtype=dtype) + # t5xxl may be in another device (eg. cpu) if t5_tokens is None: - t5_out = torch.zeros((lg_out.shape[0], 77, 4096), device=lg_out.device) + t5_out = torch.zeros((lg_out.shape[0], 77, 4096), device=lg_out.device, dtype=lg_out.dtype) else: - t5_out, t5_pooled = t5xxl.encode_token_weights(t5_tokens) # t5_out is [1, 77, 4096], t5_pooled is None - t5_out = t5_out.to(lg_out.dtype) + t5_out, _ = t5xxl.encode_token_weights(t5_tokens) # t5_out is [1, 77, 4096], t5_pooled is None + if device is not None: + t5_out = t5_out.to(device=device) + if dtype is not None: + t5_out = t5_out.to(dtype=dtype) - return torch.cat([lg_out, t5_out], dim=-2), torch.cat((l_pooled, g_pooled), dim=-1) + # return torch.cat([lg_out, t5_out], dim=-2), torch.cat((l_pooled, g_pooled), dim=-1) + return lg_out, t5_out, torch.cat((l_pooled, g_pooled), dim=-1) # used if other sd3 models is available @@ -111,3 +307,6 @@ def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): # assert max_denoise is False, "max_denoise not implemented" # max_denoise is always True, I'm not sure why it's there return sigma * noise + (1.0 - sigma) * latent_image + + +# endregion diff --git a/library/train_util.py b/library/train_util.py index 4736ff4ff..c67e8737c 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -58,7 +58,7 @@ KDPM2AncestralDiscreteScheduler, AutoencoderKL, ) -from library import custom_train_functions +from library import custom_train_functions, sd3_utils from library.original_unet import UNet2DConditionModel from huggingface_hub import hf_hub_download import numpy as np @@ -135,6 +135,7 @@ ) TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX = "_te_outputs.npz" +TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX_SD3 = "_sd3_te.npz" class ImageInfo: @@ -985,7 +986,7 @@ def is_text_encoder_output_cacheable(self): ] ) - def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True): + def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True, file_suffix=".npz"): # マルチGPUには対応していないので、そちらはtools/cache_latents.pyを使うこと logger.info("caching latents.") @@ -1006,7 +1007,7 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc # check disk cache exists and size of latents if cache_to_disk: - info.latents_npz = os.path.splitext(info.absolute_path)[0] + ".npz" + info.latents_npz = 
os.path.splitext(info.absolute_path)[0] + file_suffix if not is_main_process: # store to info only continue @@ -1040,14 +1041,43 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc for batch in tqdm(batches, smoothing=1, total=len(batches)): cache_batch_latents(vae, cache_to_disk, batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) - # weight_dtypeを指定するとText Encoderそのもの、およひ出力がweight_dtypeになる - # SDXLでのみ有効だが、datasetのメソッドとする必要があるので、sdxl_train_util.pyではなくこちらに実装する - # SD1/2に対応するにはv2のフラグを持つ必要があるので後回し + # if weight_dtype is specified, Text Encoder itself and output will be converted to the dtype + # this method is only for SDXL, but it should be implemented here because it needs to be a method of dataset + # to support SD1/2, it needs a flag for v2, but it is postponed def cache_text_encoder_outputs( - self, tokenizers, text_encoders, device, weight_dtype, cache_to_disk=False, is_main_process=True + self, tokenizers, text_encoders, device, output_dtype, cache_to_disk=False, is_main_process=True ): assert len(tokenizers) == 2, "only support SDXL" + return self.cache_text_encoder_outputs_common( + tokenizers, text_encoders, [device, device], output_dtype, [output_dtype], cache_to_disk, is_main_process + ) + # same as above, but for SD3 + def cache_text_encoder_outputs_sd3( + self, tokenizer, text_encoders, devices, output_dtype, te_dtypes, cache_to_disk=False, is_main_process=True + ): + return self.cache_text_encoder_outputs_common( + [tokenizer], + text_encoders, + devices, + output_dtype, + te_dtypes, + cache_to_disk, + is_main_process, + TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX_SD3, + ) + + def cache_text_encoder_outputs_common( + self, + tokenizers, + text_encoders, + devices, + output_dtype, + te_dtypes, + cache_to_disk=False, + is_main_process=True, + file_suffix=TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX, + ): # latentsのキャッシュと同様に、ディスクへのキャッシュに対応する # またマルチGPUには対応していないので、そちらはtools/cache_latents.pyを使うこと logger.info("caching text encoder outputs.") @@ -1058,13 +1088,14 @@ def cache_text_encoder_outputs( for info in tqdm(image_infos): # subset = self.image_to_subset[info.image_key] if cache_to_disk: - te_out_npz = os.path.splitext(info.absolute_path)[0] + TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX + te_out_npz = os.path.splitext(info.absolute_path)[0] + file_suffix info.text_encoder_outputs_npz = te_out_npz if not is_main_process: # store to info only continue if os.path.exists(te_out_npz): + # TODO check varidity of cache here continue image_infos_to_cache.append(info) @@ -1073,18 +1104,23 @@ def cache_text_encoder_outputs( return # prepare tokenizers and text encoders - for text_encoder in text_encoders: + for text_encoder, device, te_dtype in zip(text_encoders, devices, te_dtypes): text_encoder.to(device) - if weight_dtype is not None: - text_encoder.to(dtype=weight_dtype) + if te_dtype is not None: + text_encoder.to(dtype=te_dtype) # create batch + is_sd3 = len(tokenizers) == 1 batch = [] batches = [] for info in image_infos_to_cache: - input_ids1 = self.get_input_ids(info.caption, tokenizers[0]) - input_ids2 = self.get_input_ids(info.caption, tokenizers[1]) - batch.append((info, input_ids1, input_ids2)) + if not is_sd3: + input_ids1 = self.get_input_ids(info.caption, tokenizers[0]) + input_ids2 = self.get_input_ids(info.caption, tokenizers[1]) + batch.append((info, input_ids1, input_ids2)) + else: + l_tokens, g_tokens, t5_tokens = tokenizers[0].tokenize_with_weights(info.caption) + batch.append((info, l_tokens, g_tokens, t5_tokens)) if len(batch) >= self.batch_size: 
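+                # For SD3, each queued entry is (ImageInfo, l_tokens, g_tokens, t5_tokens)
+                # from SD3Tokenizer.tokenize_with_weights, rather than the
+                # (ImageInfo, input_ids1, input_ids2) tuples used for SDXL; the current
+                # batch is flushed below once it reaches the dataset batch size.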
batches.append(batch) @@ -1095,13 +1131,32 @@ def cache_text_encoder_outputs( # iterate batches: call text encoder and cache outputs for memory or disk logger.info("caching text encoder outputs...") - for batch in tqdm(batches): - infos, input_ids1, input_ids2 = zip(*batch) - input_ids1 = torch.stack(input_ids1, dim=0) - input_ids2 = torch.stack(input_ids2, dim=0) - cache_batch_text_encoder_outputs( - infos, tokenizers, text_encoders, self.max_token_length, cache_to_disk, input_ids1, input_ids2, weight_dtype - ) + if not is_sd3: + for batch in tqdm(batches): + infos, input_ids1, input_ids2 = zip(*batch) + input_ids1 = torch.stack(input_ids1, dim=0) + input_ids2 = torch.stack(input_ids2, dim=0) + cache_batch_text_encoder_outputs( + infos, tokenizers, text_encoders, self.max_token_length, cache_to_disk, input_ids1, input_ids2, output_dtype + ) + else: + for batch in tqdm(batches): + infos, l_tokens, g_tokens, t5_tokens = zip(*batch) + + # stack tokens + # l_tokens = [tokens[0] for tokens in l_tokens] + # g_tokens = [tokens[0] for tokens in g_tokens] + # t5_tokens = [tokens[0] for tokens in t5_tokens] + + cache_batch_text_encoder_outputs_sd3( + infos, + tokenizers[0], + text_encoders, + self.max_token_length, + cache_to_disk, + (l_tokens, g_tokens, t5_tokens), + output_dtype, + ) def get_image_size(self, image_path): return imagesize.get(image_path) @@ -1332,6 +1387,7 @@ def __getitem__(self, index): captions.append(caption) if not self.token_padding_disabled: # this option might be omitted in future + # TODO get_input_ids must support SD3 if self.XTI_layers: token_caption = self.get_input_ids(caption_layer, self.tokenizers[0]) else: @@ -2140,10 +2196,10 @@ def enable_XTI(self, *args, **kwargs): for dataset in self.datasets: dataset.enable_XTI(*args, **kwargs) - def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True): + def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True, file_suffix=".npz"): for i, dataset in enumerate(self.datasets): logger.info(f"[Dataset {i}]") - dataset.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process) + dataset.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process, file_suffix) def cache_text_encoder_outputs( self, tokenizers, text_encoders, device, weight_dtype, cache_to_disk=False, is_main_process=True @@ -2152,6 +2208,15 @@ def cache_text_encoder_outputs( logger.info(f"[Dataset {i}]") dataset.cache_text_encoder_outputs(tokenizers, text_encoders, device, weight_dtype, cache_to_disk, is_main_process) + def cache_text_encoder_outputs_sd3( + self, tokenizer, text_encoders, device, output_dtype, te_dtypes, cache_to_disk=False, is_main_process=True + ): + for i, dataset in enumerate(self.datasets): + logger.info(f"[Dataset {i}]") + dataset.cache_text_encoder_outputs_sd3( + tokenizer, text_encoders, device, output_dtype, te_dtypes, cache_to_disk, is_main_process + ) + def set_caching_mode(self, caching_mode): for dataset in self.datasets: dataset.set_caching_mode(caching_mode) @@ -2585,6 +2650,30 @@ def cache_batch_text_encoder_outputs( info.text_encoder_pool2 = pool2 +def cache_batch_text_encoder_outputs_sd3( + image_infos, tokenizer, text_encoders, max_token_length, cache_to_disk, input_ids, output_dtype +): + # make input_ids for each text encoder + l_tokens, g_tokens, t5_tokens = input_ids + + clip_l, clip_g, t5xxl = text_encoders + with torch.no_grad(): + b_lg_out, b_t5_out, b_pool = sd3_utils.get_cond_from_tokens( + l_tokens, g_tokens, t5_tokens, clip_l, clip_g, t5xxl, 
"cpu", output_dtype + ) + b_lg_out = b_lg_out.detach() + b_t5_out = b_t5_out.detach() + b_pool = b_pool.detach() + + for info, lg_out, t5_out, pool in zip(image_infos, b_lg_out, b_t5_out, b_pool): + if cache_to_disk: + save_text_encoder_outputs_to_disk(info.text_encoder_outputs_npz, lg_out, t5_out, pool) + else: + info.text_encoder_outputs1 = lg_out + info.text_encoder_outputs2 = t5_out + info.text_encoder_pool2 = pool + + def save_text_encoder_outputs_to_disk(npz_path, hidden_state1, hidden_state2, pool2): np.savez( npz_path, @@ -2907,6 +2996,7 @@ def get_sai_model_spec( lora: bool, textual_inversion: bool, is_stable_diffusion_ckpt: Optional[bool] = None, # None for TI and LoRA + sd3: str = None, ): timestamp = time.time() @@ -2940,6 +3030,7 @@ def get_sai_model_spec( tags=args.metadata_tags, timesteps=timesteps, clip_skip=args.clip_skip, # None or int + sd3=sd3, ) return metadata diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py index e14f784d4..96e9da4ac 100644 --- a/sd3_minimal_inference.py +++ b/sd3_minimal_inference.py @@ -320,8 +320,11 @@ def do_sample( # prepare embeddings logger.info("Encoding prompts...") # embeds, pooled_embed - cond = sd3_utils.get_cond(args.prompt, tokenizer, clip_l, clip_g, t5xxl) - neg_cond = sd3_utils.get_cond(args.negative_prompt, tokenizer, clip_l, clip_g, t5xxl) + lg_out, t5_out, pooled = sd3_utils.get_cond(args.prompt, tokenizer, clip_l, clip_g, t5xxl) + cond = torch.cat([lg_out, t5_out], dim=-2), pooled + + lg_out, t5_out, pooled = sd3_utils.get_cond(args.negative_prompt, tokenizer, clip_l, clip_g, t5xxl) + neg_cond = torch.cat([lg_out, t5_out], dim=-2), pooled # generate image logger.info("Generating image...") diff --git a/sd3_train.py b/sd3_train.py new file mode 100644 index 000000000..0721b2ae4 --- /dev/null +++ b/sd3_train.py @@ -0,0 +1,907 @@ +# training with captions + +import argparse +import copy +import math +import os +from multiprocessing import Value +from typing import List +import toml + +from tqdm import tqdm + +import torch +from library.device_utils import init_ipex, clean_memory_on_device + + +init_ipex() + +from accelerate.utils import set_seed +from diffusers import DDPMScheduler +from library import deepspeed_utils, sd3_models, sd3_train_utils, sd3_utils + +# , sdxl_model_util + +import library.train_util as train_util + +from library.utils import setup_logging, add_logging_arguments + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +import library.config_util as config_util + +# import library.sdxl_train_util as sdxl_train_util +from library.config_util import ( + ConfigSanitizer, + BlueprintGenerator, +) +import library.custom_train_functions as custom_train_functions + +# from library.custom_train_functions import ( +# apply_snr_weight, +# prepare_scheduler_for_custom_training, +# scale_v_prediction_loss_like_noise_prediction, +# add_v_prediction_like_loss, +# apply_debiased_estimation, +# apply_masked_loss, +# ) + + +def train(args): + train_util.verify_training_args(args) + train_util.prepare_dataset_args(args, True) + # sdxl_train_util.verify_sdxl_training_args(args) + deepspeed_utils.prepare_deepspeed_args(args) + setup_logging(args, reset=True) + + assert ( + not args.weighted_captions + ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" + assert ( + not args.train_text_encoder or not args.cache_text_encoder_outputs + ), "cache_text_encoder_outputs is not supported when training text encoder / text 
encoderを学習するときはcache_text_encoder_outputsはサポートされていません" + + # if args.block_lr: + # block_lrs = [float(lr) for lr in args.block_lr.split(",")] + # assert ( + # len(block_lrs) == UNET_NUM_BLOCKS_FOR_BLOCK_LR + # ), f"block_lr must have {UNET_NUM_BLOCKS_FOR_BLOCK_LR} values / block_lrは{UNET_NUM_BLOCKS_FOR_BLOCK_LR}個の値を指定してください" + # else: + # block_lrs = None + + cache_latents = args.cache_latents + use_dreambooth_method = args.in_json is None + + if args.seed is not None: + set_seed(args.seed) # 乱数系列を初期化する + + # load tokenizer + sd3_tokenizer = sd3_models.SD3Tokenizer() + + # データセットを準備する + if args.dataset_class is None: + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) + if args.dataset_config is not None: + logger.info(f"Load dataset config from {args.dataset_config}") + user_config = config_util.load_user_config(args.dataset_config) + ignored = ["train_data_dir", "in_json"] + if any(getattr(args, attr) is not None for attr in ignored): + logger.warning( + "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + ", ".join(ignored) + ) + ) + else: + if use_dreambooth_method: + logger.info("Using DreamBooth method.") + user_config = { + "datasets": [ + { + "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs( + args.train_data_dir, args.reg_data_dir + ) + } + ] + } + else: + logger.info("Training with captions.") + user_config = { + "datasets": [ + { + "subsets": [ + { + "image_dir": args.train_data_dir, + "metadata_file": args.in_json, + } + ] + } + ] + } + + blueprint = blueprint_generator.generate(user_config, args, tokenizer=[sd3_tokenizer]) + train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + else: + train_dataset_group = train_util.load_arbitrary_dataset(args, [sd3_tokenizer]) + + current_epoch = Value("i", 0) + current_step = Value("i", 0) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) + + train_dataset_group.verify_bucket_reso_steps(8) # TODO これでいいか確認 + + if args.debug_dataset: + train_util.debug_dataset(train_dataset_group, True) + return + if len(train_dataset_group) == 0: + logger.error( + "No data found. Please verify the metadata file and train_data_dir option. 
/ 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。" + ) + return + + if cache_latents: + assert ( + train_dataset_group.is_latent_cacheable() + ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + + if args.cache_text_encoder_outputs: + assert ( + train_dataset_group.is_text_encoder_output_cacheable() + ), "when caching text encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / text encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" + + # acceleratorを準備する + logger.info("prepare accelerator") + accelerator = train_util.prepare_accelerator(args) + + # mixed precisionに対応した型を用意しておき適宜castする + weight_dtype, save_dtype = train_util.prepare_dtype(args) + vae_dtype = weight_dtype # torch.float32 if args.no_half_vae else weight_dtype # SD3 VAE works with fp16 + + t5xxl_dtype = weight_dtype + if args.t5xxl_dtype is not None: + if args.t5xxl_dtype == "fp16": + t5xxl_dtype = torch.float16 + elif args.t5xxl_dtype == "bf16": + t5xxl_dtype = torch.bfloat16 + elif args.t5xxl_dtype == "fp32" or args.t5xxl_dtype == "float": + t5xxl_dtype = torch.float32 + else: + raise ValueError(f"unexpected t5xxl_dtype: {args.t5xxl_dtype}") + t5xxl_device = accelerator.device if args.t5xxl_device is None else args.t5xxl_device + + # モデルを読み込む + attn_mode = "xformers" if args.xformers else "torch" + + assert ( + attn_mode == "torch" + ), f"attn_mode {attn_mode} is not supported. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。" + + mmdit, clip_l, clip_g, t5xxl, vae = sd3_train_utils.load_target_model( + args, accelerator, attn_mode, weight_dtype, t5xxl_device, t5xxl_dtype + ) + assert clip_l is not None, "clip_l is required / clip_lは必須です" + assert clip_g is not None, "clip_g is required / clip_gは必須です" + # logit_scale = logit_scale.to(accelerator.device, dtype=weight_dtype) + + # 学習を準備する + if cache_latents: + vae.to(accelerator.device, dtype=vae_dtype) + vae.requires_grad_(False) + vae.eval() + vae_wrapper = sd3_models.VAEWrapper(vae) # make SD/SDXL compatible + with torch.no_grad(): + train_dataset_group.cache_latents( + vae_wrapper, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process, file_suffix="_sd3.npz" + ) + vae.to("cpu") + clean_memory_on_device(accelerator.device) + + accelerator.wait_for_everyone() + + # 学習を準備する:モデルを適切な状態にする + if args.gradient_checkpointing: + mmdit.enable_gradient_checkpointing() + train_mmdit = args.learning_rate != 0 + train_clip_l = False + train_clip_g = False + train_t5xxl = False + + # if args.train_text_encoder: + # # TODO each option for two text encoders? 
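+    # NOTE: text encoder training is not wired up for SD3 in this script yet; the
+    # surrounding block is left commented out for now, and clip_l / clip_g / t5xxl
+    # are simply frozen and switched to eval mode below.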
+ # accelerator.print("enable text encoder training") + # if args.gradient_checkpointing: + # text_encoder1.gradient_checkpointing_enable() + # text_encoder2.gradient_checkpointing_enable() + # lr_te1 = args.learning_rate_te1 if args.learning_rate_te1 is not None else args.learning_rate # 0 means not train + # lr_te2 = args.learning_rate_te2 if args.learning_rate_te2 is not None else args.learning_rate # 0 means not train + # train_clip_l = lr_te1 != 0 + # train_clip_g = lr_te2 != 0 + + # # caching one text encoder output is not supported + # if not train_clip_l: + # text_encoder1.to(weight_dtype) + # if not train_clip_g: + # text_encoder2.to(weight_dtype) + # text_encoder1.requires_grad_(train_clip_l) + # text_encoder2.requires_grad_(train_clip_g) + # text_encoder1.train(train_clip_l) + # text_encoder2.train(train_clip_g) + # else: + clip_l.to(weight_dtype) + clip_g.to(weight_dtype) + clip_l.requires_grad_(False) + clip_g.requires_grad_(False) + clip_l.eval() + clip_g.eval() + if t5xxl is not None: + t5xxl.to(t5xxl_dtype) + t5xxl.requires_grad_(False) + t5xxl.eval() + + # TextEncoderの出力をキャッシュする + if args.cache_text_encoder_outputs: + # Text Encodes are eval and no grad + + with torch.no_grad(), accelerator.autocast(): + train_dataset_group.cache_text_encoder_outputs_sd3( + sd3_tokenizer, + (clip_l, clip_g, t5xxl), + (accelerator.device, accelerator.device, t5xxl_device), + None, + (None, None, None), + args.cache_text_encoder_outputs_to_disk, + accelerator.is_main_process, + ) + accelerator.wait_for_everyone() + + if not cache_latents: + vae.requires_grad_(False) + vae.eval() + vae.to(accelerator.device, dtype=vae_dtype) + + mmdit.requires_grad_(train_mmdit) + if not train_mmdit: + mmdit.to(accelerator.device, dtype=weight_dtype) # because of unet is not prepared + + training_models = [] + params_to_optimize = [] + # if train_unet: + training_models.append(mmdit) + # if block_lrs is None: + params_to_optimize.append({"params": list(mmdit.parameters()), "lr": args.learning_rate}) + # else: + # params_to_optimize.extend(get_block_params_to_optimize(mmdit, block_lrs)) + + # if train_clip_l: + # training_models.append(text_encoder1) + # params_to_optimize.append({"params": list(text_encoder1.parameters()), "lr": args.learning_rate_te1 or args.learning_rate}) + # if train_clip_g: + # training_models.append(text_encoder2) + # params_to_optimize.append({"params": list(text_encoder2.parameters()), "lr": args.learning_rate_te2 or args.learning_rate}) + + # calculate number of trainable parameters + n_params = 0 + for group in params_to_optimize: + for p in group["params"]: + n_params += p.numel() + + accelerator.print(f"train mmdit: {train_mmdit}") # , text_encoder1: {train_clip_l}, text_encoder2: {train_clip_g}") + accelerator.print(f"number of models: {len(training_models)}") + accelerator.print(f"number of trainable parameters: {n_params}") + + # 学習に必要なクラスを準備する + accelerator.print("prepare optimizer, data loader etc.") + + if args.fused_optimizer_groups: + # fused backward pass: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html + # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each group of parameters. + # This balances memory usage and management complexity. 
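+        # Illustrative example (hypothetical numbers): with 1,200 trainable parameter
+        # tensors and --fused_optimizer_groups=4, params_per_group = ceil(1200 / 4) = 300,
+        # so four optimizers are built and, via the per-parameter gradient hooks set up
+        # later, each one calls step() and zero_grad(set_to_none=True) as soon as all of
+        # its ~300 tensors have received gradients, instead of keeping every gradient
+        # alive until the whole backward pass finishes. Groups never mix learning rates,
+        # so the actual group count can grow if parameter groups use different LRs.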
+ + # calculate total number of parameters + n_total_params = sum(len(params["params"]) for params in params_to_optimize) + params_per_group = math.ceil(n_total_params / args.fused_optimizer_groups) + + # split params into groups, keeping the learning rate the same for all params in a group + # this will increase the number of groups if the learning rate is different for different params (e.g. U-Net and text encoders) + grouped_params = [] + param_group = [] + param_group_lr = -1 + for group in params_to_optimize: + lr = group["lr"] + for p in group["params"]: + # if the learning rate is different for different params, start a new group + if lr != param_group_lr: + if param_group: + grouped_params.append({"params": param_group, "lr": param_group_lr}) + param_group = [] + param_group_lr = lr + + param_group.append(p) + + # if the group has enough parameters, start a new group + if len(param_group) == params_per_group: + grouped_params.append({"params": param_group, "lr": param_group_lr}) + param_group = [] + param_group_lr = -1 + + if param_group: + grouped_params.append({"params": param_group, "lr": param_group_lr}) + + # prepare optimizers for each group + optimizers = [] + for group in grouped_params: + _, _, optimizer = train_util.get_optimizer(args, trainable_params=[group]) + optimizers.append(optimizer) + optimizer = optimizers[0] # avoid error in the following code + + logger.info(f"using {len(optimizers)} optimizers for fused optimizer groups") + + else: + _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) + + # dataloaderを準備する + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 + n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers + train_dataloader = torch.utils.data.DataLoader( + train_dataset_group, + batch_size=1, + shuffle=True, + collate_fn=collator, + num_workers=n_workers, + persistent_workers=args.persistent_data_loader_workers, + ) + + # 学習ステップ数を計算する + if args.max_train_epochs is not None: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. 
steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + + # データセット側にも学習ステップを送信 + train_dataset_group.set_max_train_steps(args.max_train_steps) + + # lr schedulerを用意する + if args.fused_optimizer_groups: + # prepare lr schedulers for each optimizer + lr_schedulers = [train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) for optimizer in optimizers] + lr_scheduler = lr_schedulers[0] # avoid error in the following code + else: + lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) + + # 実験的機能:勾配も含めたfp16/bf16学習を行う モデル全体をfp16/bf16にする + if args.full_fp16: + assert ( + args.mixed_precision == "fp16" + ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" + accelerator.print("enable full fp16 training.") + mmdit.to(weight_dtype) + clip_l.to(weight_dtype) + clip_g.to(weight_dtype) + if t5xxl is not None: + t5xxl.to(weight_dtype) # TODO check works with fp16 or not + elif args.full_bf16: + assert ( + args.mixed_precision == "bf16" + ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" + accelerator.print("enable full bf16 training.") + mmdit.to(weight_dtype) + clip_l.to(weight_dtype) + clip_g.to(weight_dtype) + if t5xxl is not None: + t5xxl.to(weight_dtype) + + # TODO check if this is necessary. SD3 uses pool for clip_l and clip_g + # # freeze last layer and final_layer_norm in te1 since we use the output of the penultimate layer + # if train_clip_l: + # text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) + # text_encoder1.text_model.final_layer_norm.requires_grad_(False) + + if args.deepspeed: + ds_model = deepspeed_utils.prepare_deepspeed_model( + args, + mmdit=mmdit, + # mmdie=mmdit if train_mmdit else None, + # text_encoder1=text_encoder1 if train_clip_l else None, + # text_encoder2=text_encoder2 if train_clip_g else None, + ) + # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007 + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_models = [ds_model] + + else: + # acceleratorがなんかよろしくやってくれるらしい + if train_mmdit: + mmdit = accelerator.prepare(mmdit) + # if train_clip_l: + # text_encoder1 = accelerator.prepare(text_encoder1) + # if train_clip_g: + # text_encoder2 = accelerator.prepare(text_encoder2) + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + + # TextEncoderの出力をキャッシュするときには、すでに出力を取得済みなのでCPUへ移動する + if args.cache_text_encoder_outputs: + # move Text Encoders for sampling images. 
Text Encoder doesn't work on CPU with fp16 + clip_l.to("cpu", dtype=torch.float32) + clip_g.to("cpu", dtype=torch.float32) + if t5xxl is not None: + t5xxl.to("cpu", dtype=torch.float32) + clean_memory_on_device(accelerator.device) + else: + # make sure Text Encoders are on GPU + # TODO support CPU for text encoders + clip_l.to(accelerator.device) + clip_g.to(accelerator.device) + if t5xxl is not None: + t5xxl.to(accelerator.device) + + # TODO cache sample prompt's embeddings to free text encoder's memory + if args.cache_text_encoder_outputs: + if not args.save_t5xxl: + t5xxl = None # free memory + clean_memory_on_device(accelerator.device) + + # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする + if args.full_fp16: + # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. + # -> But we think it's ok to patch accelerator even if deepspeed is enabled. + train_util.patch_accelerator_for_fp16_training(accelerator) + + # resumeする + train_util.resume_from_local_or_hf_if_specified(accelerator, args) + + if args.fused_backward_pass: + # use fused optimizer for backward pass: other optimizers will be supported in the future + import library.adafactor_fused + + library.adafactor_fused.patch_adafactor_fused(optimizer) + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + + def __grad_hook(tensor: torch.Tensor, param_group=param_group): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + parameter.register_post_accumulate_grad_hook(__grad_hook) + + elif args.fused_optimizer_groups: + # prepare for additional optimizers and lr schedulers + for i in range(1, len(optimizers)): + optimizers[i] = accelerator.prepare(optimizers[i]) + lr_schedulers[i] = accelerator.prepare(lr_schedulers[i]) + + # counters are used to determine when to step the optimizer + global optimizer_hooked_count + global num_parameters_per_group + global parameter_optimizer_map + + optimizer_hooked_count = {} + num_parameters_per_group = [0] * len(optimizers) + parameter_optimizer_map = {} + + for opt_idx, optimizer in enumerate(optimizers): + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + + def optimizer_hook(parameter: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(parameter, args.max_grad_norm) + + i = parameter_optimizer_map[parameter] + optimizer_hooked_count[i] += 1 + if optimizer_hooked_count[i] == num_parameters_per_group[i]: + optimizers[i].step() + optimizers[i].zero_grad(set_to_none=True) + + parameter.register_post_accumulate_grad_hook(optimizer_hook) + parameter_optimizer_map[parameter] = opt_idx + num_parameters_per_group[opt_idx] += 1 + + # epoch数を計算する + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0): + args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1 + + # 学習する + # total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + accelerator.print("running training / 学習開始") + accelerator.print(f" num examples / サンプル数: 
{train_dataset_group.num_train_images}") + accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}") + accelerator.print(f" num epochs / epoch数: {num_train_epochs}") + accelerator.print( + f" batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}" + ) + # accelerator.print( + # f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}" + # ) + accelerator.print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") + accelerator.print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}") + + progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") + global_step = 0 + + # noise_scheduler = DDPMScheduler( + # beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False + # ) + + noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) + noise_scheduler_copy = copy.deepcopy(noise_scheduler) + + # prepare_scheduler_for_custom_training(noise_scheduler, accelerator.device) + # if args.zero_terminal_snr: + # custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler) + + if accelerator.is_main_process: + init_kwargs = {} + if args.wandb_run_name: + init_kwargs["wandb"] = {"name": args.wandb_run_name} + if args.log_tracker_config is not None: + init_kwargs = toml.load(args.log_tracker_config) + accelerator.init_trackers( + "finetuning" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, + ) + + # # For --sample_at_first + # sd3_train_utils.sample_images( + # accelerator, args, 0, global_step, accelerator.device, vae, [tokenizer1, tokenizer2], [text_encoder1, text_encoder2], mmdit + # ) + + # following function will be moved to sd3_train_utils + + def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): + sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype) + schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device) + timesteps = timesteps.to(accelerator.device) + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < n_dim: + sigma = sigma.unsqueeze(-1) + return sigma + + def compute_density_for_timestep_sampling( + weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None + ): + """Compute the density for sampling the timesteps when doing SD3 training. + + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. + """ + if weighting_scheme == "logit_normal": + # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$). + u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu") + u = torch.nn.functional.sigmoid(u) + elif weighting_scheme == "mode": + u = torch.rand(size=(batch_size,), device="cpu") + u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u) + else: + u = torch.rand(size=(batch_size,), device="cpu") + return u + + def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): + """Computes loss weighting scheme for SD3 training. 
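+
+ "sigma_sqrt" weights each sample by sigma**-2, "cosmap" uses
+ 2 / (pi * (1 - 2 * sigma + 2 * sigma**2)), and any other scheme falls back to
+ uniform weighting (ones).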
+ + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. + """ + if weighting_scheme == "sigma_sqrt": + weighting = (sigmas**-2.0).float() + elif weighting_scheme == "cosmap": + bot = 1 - 2 * sigmas + 2 * sigmas**2 + weighting = 2 / (math.pi * bot) + else: + weighting = torch.ones_like(sigmas) + return weighting + + loss_recorder = train_util.LossRecorder() + for epoch in range(num_train_epochs): + accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") + current_epoch.value = epoch + 1 + + for m in training_models: + m.train() + + for step, batch in enumerate(train_dataloader): + current_step.value = global_step + + if args.fused_optimizer_groups: + optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} # reset counter for each step + + with accelerator.accumulate(*training_models): + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) + else: + with torch.no_grad(): + # encode images to latents. images are [-1, 1] + latents = vae.encode(batch["images"].to(vae_dtype)).to(weight_dtype) + + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(latents)): + accelerator.print("NaN found in latents, replacing with zeros") + latents = torch.nan_to_num(latents, 0, out=latents) + # latents = latents * sdxl_model_util.VAE_SCALE_FACTOR + latents = sd3_models.SDVAE.process_in(latents) + + if "text_encoder_outputs1_list" not in batch or batch["text_encoder_outputs1_list"] is None: + # not cached, get text encoder outputs + # XXX This does not work yet + input_ids_clip_l, input_ids_clip_g, input_ids_t5xxl = batch["input_ids"] + with torch.set_grad_enabled(args.train_text_encoder): + # TODO support weighted captions + # TODO support length > 75 + input_ids_clip_l = input_ids_clip_l.to(accelerator.device) + input_ids_clip_g = input_ids_clip_g.to(accelerator.device) + input_ids_t5xxl = input_ids_t5xxl.to(accelerator.device) + + # get text encoder outputs: outputs are concatenated + context, pool = sd3_utils.get_cond_from_tokens( + input_ids_clip_l, input_ids_clip_g, input_ids_t5xxl, clip_l, clip_g, t5xxl + ) + else: + # encoder_hidden_states1 = batch["text_encoder_outputs1_list"].to(accelerator.device).to(weight_dtype) + # encoder_hidden_states2 = batch["text_encoder_outputs2_list"].to(accelerator.device).to(weight_dtype) + # pool2 = batch["text_encoder_pool2_list"].to(accelerator.device).to(weight_dtype) + # TODO this reuses SDXL keys, it should be fixed + lg_out = batch["text_encoder_outputs1_list"] + t5_out = batch["text_encoder_outputs2_list"] + pool = batch["text_encoder_pool2_list"] + context = torch.cat([lg_out, t5_out], dim=-2) + + # TODO support some features for noise implemented in get_noise_noisy_latents_and_timesteps + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + + # Sample a random timestep for each image + # for weighting schemes where we sample timesteps non-uniformly + u = compute_density_for_timestep_sampling( + weighting_scheme=args.weighting_scheme, + batch_size=bsz, + logit_mean=args.logit_mean, + logit_std=args.logit_std, + mode_scale=args.mode_scale, + ) + indices = (u * noise_scheduler_copy.config.num_train_timesteps).long() + timesteps = noise_scheduler_copy.timesteps[indices].to(device=accelerator.device) + + # Add noise according to flow matching. 
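+ # Flow matching forward process: the noisy input is a linear interpolation
+ # x_t = sigma * noise + (1 - sigma) * latents, so sigma=1 is pure noise and
+ # sigma=0 is the clean latent. The raw prediction is preconditioned further down
+ # as model_pred * (-sigma) + x_t, turning it into an estimate of the clean
+ # latents, which is then compared against target = latents.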
+ sigmas = get_sigmas(timesteps, n_dim=latents.ndim, dtype=weight_dtype) + noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + + # call model + with accelerator.autocast(): + model_pred = mmdit(noisy_model_input, timesteps, context=context, y=pool) + + # Follow: Section 5 of https://arxiv.org/abs/2206.00364. + # Preconditioning of the model outputs. + model_pred = model_pred * (-sigmas) + noisy_model_input + + # these weighting schemes use a uniform timestep sampling + # and instead post-weight the loss + weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) + + # flow matching loss + target = latents + + # Compute regular loss. TODO simplify this + loss = torch.mean( + (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1), + 1, + ) + loss = loss.mean() + + accelerator.backward(loss) + + if not (args.fused_backward_pass or args.fused_optimizer_groups): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + params_to_clip = [] + for m in training_models: + params_to_clip.extend(m.parameters()) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=True) + else: + # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook + lr_scheduler.step() + if args.fused_optimizer_groups: + for i in range(1, len(optimizers)): + lr_schedulers[i].step() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + # sdxl_train_util.sample_images( + # accelerator, + # args, + # None, + # global_step, + # accelerator.device, + # vae, + # [tokenizer1, tokenizer2], + # [text_encoder1, text_encoder2], + # mmdit, + # ) + + # 指定ステップごとにモデルを保存 + if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: + accelerator.wait_for_everyone() + if accelerator.is_main_process: + sd3_train_utils.save_sd3_model_on_epoch_end_or_stepwise( + args, + False, + accelerator, + save_dtype, + epoch, + num_train_epochs, + global_step, + clip_l if args.save_clip else None, + clip_g if args.save_clip else None, + t5xxl if args.save_t5xxl else None, + mmdit, + vae, + ) + + current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず + if args.logging_dir is not None: + logs = {"loss": current_loss} + train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=train_mmdit) + + accelerator.log(logs, step=global_step) + + loss_recorder.add(epoch=epoch, step=step, loss=current_loss) + avr_loss: float = loss_recorder.moving_average + logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + + if global_step >= args.max_train_steps: + break + + if args.logging_dir is not None: + logs = {"loss/epoch": loss_recorder.moving_average} + accelerator.log(logs, step=epoch + 1) + + accelerator.wait_for_everyone() + + if args.save_every_n_epochs is not None: + if accelerator.is_main_process: + sd3_train_utils.save_sd3_model_on_epoch_end_or_stepwise( + args, + True, + accelerator, + save_dtype, + epoch, + num_train_epochs, + global_step, + clip_l if args.save_clip else None, + clip_g if args.save_clip else None, + t5xxl if args.save_t5xxl else None, + mmdit, + vae, + ) + + # sdxl_train_util.sample_images( + # accelerator, + # args, + # epoch + 1, + # global_step, + # accelerator.device, + # vae, + # [tokenizer1, tokenizer2], + # 
[text_encoder1, text_encoder2], + # mmdit, + # ) + + is_main_process = accelerator.is_main_process + # if is_main_process: + mmdit = accelerator.unwrap_model(mmdit) + clip_l = accelerator.unwrap_model(clip_l) + clip_g = accelerator.unwrap_model(clip_g) + if t5xxl is not None: + t5xxl = accelerator.unwrap_model(t5xxl) + + accelerator.end_training() + + if args.save_state or args.save_state_on_train_end: + train_util.save_state_on_train_end(args, accelerator) + + del accelerator # この後メモリを使うのでこれは消す + + if is_main_process: + sd3_train_utils.save_sd3_model_on_train_end( + args, + save_dtype, + epoch, + global_step, + clip_l if args.save_clip else None, + clip_g if args.save_clip else None, + t5xxl if args.save_t5xxl else None, + mmdit, + vae, + ) + logger.info("model saved.") + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + add_logging_arguments(parser) + train_util.add_sd_models_arguments(parser) + train_util.add_dataset_arguments(parser, True, True, True) + train_util.add_training_arguments(parser, False) + train_util.add_masked_loss_arguments(parser) + deepspeed_utils.add_deepspeed_arguments(parser) + train_util.add_sd_saving_arguments(parser) + train_util.add_optimizer_arguments(parser) + config_util.add_config_arguments(parser) + custom_train_functions.add_custom_train_arguments(parser) + sd3_train_utils.add_sd3_training_arguments(parser) + + # TE training is disabled temporarily + + # parser.add_argument( + # "--learning_rate_te1", + # type=float, + # default=None, + # help="learning rate for text encoder 1 (ViT-L) / text encoder 1 (ViT-L)の学習率", + # ) + # parser.add_argument( + # "--learning_rate_te2", + # type=float, + # default=None, + # help="learning rate for text encoder 2 (BiG-G) / text encoder 2 (BiG-G)の学習率", + # ) + + # parser.add_argument( + # "--diffusers_xformers", action="store_true", help="use xformers by diffusers / Diffusersでxformersを使用する" + # ) + # parser.add_argument("--train_text_encoder", action="store_true", help="train text encoder / text encoderも学習する") + # parser.add_argument( + # "--no_half_vae", + # action="store_true", + # help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", + # ) + # parser.add_argument( + # "--block_lr", + # type=str, + # default=None, + # help=f"learning rates for each block of U-Net, comma-separated, {UNET_NUM_BLOCKS_FOR_BLOCK_LR} values / " + # + f"U-Netの各ブロックの学習率、カンマ区切り、{UNET_NUM_BLOCKS_FOR_BLOCK_LR}個の値", + # ) + parser.add_argument( + "--fused_optimizer_groups", + type=int, + default=None, + help="number of optimizers for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizer数", + ) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + train(args) From 0fe4eafac996fa5139a311aadc86aca28ddc6930 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 24 Jun 2024 23:12:48 +0900 Subject: [PATCH 029/582] fix to use zero for initial latent --- sd3_minimal_inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py index 96e9da4ac..7f5f28cea 100644 --- a/sd3_minimal_inference.py +++ b/sd3_minimal_inference.py @@ -64,7 +64,8 @@ def do_sample( device: str, ): if initial_latent is None: - latent = torch.ones(1, 16, height // 8, width // 8, device=device) * 0.0609 + # latent = 
torch.ones(1, 16, height // 8, width // 8, device=device) * 0.0609 + latent = torch.zeros(1, 16, height // 8, width // 8, device=device) else: latent = initial_latent From 4802e4aaec74429f733fae289e41c5618ebb0e92 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 24 Jun 2024 23:13:14 +0900 Subject: [PATCH 030/582] workaround for long caption ref #1382 --- library/sd3_models.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index a4fe400e3..c19aec6aa 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -56,7 +56,7 @@ def __init__( self.inv_vocab = {v: k for k, v in vocab.items()} self.max_word_length = 8 - def tokenize_with_weights(self, text: str): + def tokenize_with_weights(self, text: str, truncate_to_max_length=True, truncate_length=None): """Tokenize the text, with weight values - presume 1.0 for all and ignore other features here. The details aren't relevant for a reference impl, and weights themselves has weak effect on SD3.""" """ @@ -79,6 +79,14 @@ def tokenize_with_weights(self, text: str): batch.extend([(pad_token, 1.0)] * (self.max_length - len(batch))) if self.min_length is not None and len(batch) < self.min_length: batch.extend([(pad_token, 1.0)] * (self.min_length - len(batch))) + + # truncate to max_length + # print(f"batch: {batch}, truncate: {truncate}, len(batch): {len(batch)}, max_length: {self.max_length}") + if truncate_to_max_length and len(batch) > self.max_length: + batch = batch[: self.max_length] + if truncate_length is not None and len(batch) > truncate_length: + batch = batch[:truncate_length] + return [batch] @@ -112,10 +120,15 @@ def __init__(self, t5xxl=True): self.model_max_length = self.clip_l.max_length # 77 def tokenize_with_weights(self, text: str): + # temporary truncate to max_length even for t5xxl return ( self.clip_l.tokenize_with_weights(text), self.clip_g.tokenize_with_weights(text), - self.t5xxl.tokenize_with_weights(text) if self.t5xxl is not None else None, + ( + self.t5xxl.tokenize_with_weights(text, truncate_to_max_length=False, truncate_length=self.model_max_length) + if self.t5xxl is not None + else None + ), ) From 8f2ba27869e4c5b9225a309aeed275a47d8eed6a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 26 Jun 2024 20:36:22 +0900 Subject: [PATCH 031/582] support text_encoder_batch_size for caching --- library/sd3_train_utils.py | 7 +++++++ library/train_util.py | 14 ++++++++++---- sd3_train.py | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 4e45871f4..70c83c0ba 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -173,6 +173,13 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): action="store_true", help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", ) + parser.add_argument( + "--text_encoder_batch_size", + type=int, + default=None, + help="text encoder batch size (default: None, use dataset's batch size)" + + " / text encoderのバッチサイズ(デフォルト: None, データセットのバッチサイズを使用)", + ) parser.add_argument( "--disable_mmap_load_safetensors", action="store_true", diff --git a/library/train_util.py b/library/train_util.py index c67e8737c..96d32e3bc 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1054,7 +1054,7 @@ def cache_text_encoder_outputs( # same as above, but for SD3 def cache_text_encoder_outputs_sd3( - self, tokenizer, text_encoders, devices, output_dtype, te_dtypes, cache_to_disk=False, 
is_main_process=True + self, tokenizer, text_encoders, devices, output_dtype, te_dtypes, cache_to_disk=False, is_main_process=True, batch_size=None ): return self.cache_text_encoder_outputs_common( [tokenizer], @@ -1065,6 +1065,7 @@ def cache_text_encoder_outputs_sd3( cache_to_disk, is_main_process, TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX_SD3, + batch_size, ) def cache_text_encoder_outputs_common( @@ -1077,10 +1078,15 @@ def cache_text_encoder_outputs_common( cache_to_disk=False, is_main_process=True, file_suffix=TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX, + batch_size=None, ): # latentsのキャッシュと同様に、ディスクへのキャッシュに対応する # またマルチGPUには対応していないので、そちらはtools/cache_latents.pyを使うこと logger.info("caching text encoder outputs.") + + if batch_size is None: + batch_size = self.batch_size + image_infos = list(self.image_data.values()) logger.info("checking cache existence...") @@ -1122,7 +1128,7 @@ def cache_text_encoder_outputs_common( l_tokens, g_tokens, t5_tokens = tokenizers[0].tokenize_with_weights(info.caption) batch.append((info, l_tokens, g_tokens, t5_tokens)) - if len(batch) >= self.batch_size: + if len(batch) >= batch_size: batches.append(batch) batch = [] @@ -2209,12 +2215,12 @@ def cache_text_encoder_outputs( dataset.cache_text_encoder_outputs(tokenizers, text_encoders, device, weight_dtype, cache_to_disk, is_main_process) def cache_text_encoder_outputs_sd3( - self, tokenizer, text_encoders, device, output_dtype, te_dtypes, cache_to_disk=False, is_main_process=True + self, tokenizer, text_encoders, device, output_dtype, te_dtypes, cache_to_disk=False, is_main_process=True, batch_size=None ): for i, dataset in enumerate(self.datasets): logger.info(f"[Dataset {i}]") dataset.cache_text_encoder_outputs_sd3( - tokenizer, text_encoders, device, output_dtype, te_dtypes, cache_to_disk, is_main_process + tokenizer, text_encoders, device, output_dtype, te_dtypes, cache_to_disk, is_main_process, batch_size ) def set_caching_mode(self, caching_mode): diff --git a/sd3_train.py b/sd3_train.py index 0721b2ae4..8216a62b3 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -254,6 +254,7 @@ def train(args): (None, None, None), args.cache_text_encoder_outputs_to_disk, accelerator.is_main_process, + args.text_encoder_batch_size, ) accelerator.wait_for_everyone() From 828a581e2968935c00d22e7e03ca32c1281aa5dd Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 26 Jun 2024 20:43:31 +0900 Subject: [PATCH 032/582] fix assertion for experimental impl ref #1389 --- sd3_train.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sd3_train.py b/sd3_train.py index 8216a62b3..ea9a11049 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -60,9 +60,19 @@ def train(args): assert ( not args.weighted_captions ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" + # assert ( + # not args.train_text_encoder or not args.cache_text_encoder_outputs + # ), "cache_text_encoder_outputs is not supported when training text encoder / text encoderを学習するときはcache_text_encoder_outputsはサポートされていません" + + # training text encoder is not supported + assert ( + not args.train_text_encoder + ), "training text encoder is not supported currently / text encoderの学習は現在サポートされていません" + + # training without text encoder cache is not supported assert ( - not args.train_text_encoder or not args.cache_text_encoder_outputs - ), "cache_text_encoder_outputs is not supported when training text encoder / text encoderを学習するときはcache_text_encoder_outputsはサポートされていません" + args.cache_text_encoder_outputs + ), "training without text encoder cache 
is not supported currently / text encoderのキャッシュなしの学習は現在サポートされていません" # if args.block_lr: # block_lrs = [float(lr) for lr in args.block_lr.split(",")] From 381598c8bbd3d4e50ec4327fa27d5d0072ec2a67 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 26 Jun 2024 21:15:02 +0900 Subject: [PATCH 033/582] fix resolution in metadata for sd3 --- library/sai_model_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index f7bf644d7..af073677e 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -216,7 +216,7 @@ def build_metadata( reso = (reso[0], reso[0]) else: # resolution is defined in dataset, so use default - if sdxl: + if sdxl or sd3 is not None: reso = 1024 elif v2 and v_parameterization: reso = 768 From 66cf43547972647389fbd2addb53cff2ab478660 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 27 Jun 2024 13:14:09 +0900 Subject: [PATCH 034/582] re-fix assertion ref #1389 --- sd3_train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sd3_train.py b/sd3_train.py index ea9a11049..b6c932c4c 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -64,10 +64,10 @@ def train(args): # not args.train_text_encoder or not args.cache_text_encoder_outputs # ), "cache_text_encoder_outputs is not supported when training text encoder / text encoderを学習するときはcache_text_encoder_outputsはサポートされていません" - # training text encoder is not supported - assert ( - not args.train_text_encoder - ), "training text encoder is not supported currently / text encoderの学習は現在サポートされていません" + # # training text encoder is not supported + # assert ( + # not args.train_text_encoder + # ), "training text encoder is not supported currently / text encoderの学習は現在サポートされていません" # training without text encoder cache is not supported assert ( From 19086465e8040c01c38d38eec5c53f966f0dad8b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 29 Jun 2024 17:21:25 +0900 Subject: [PATCH 035/582] Fix fp16 mixed precision, model is in bf16 without full_bf16 --- README.md | 11 +++++++-- library/sd3_train_utils.py | 10 +++++---- library/sd3_utils.py | 46 +++++++++++++++++++++++++++++++++----- sd3_train.py | 9 +++++--- 4 files changed, 61 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 34aa2bb2f..3eed636c5 100644 --- a/README.md +++ b/README.md @@ -4,21 +4,28 @@ This repository contains training, generation and utility scripts for Stable Dif SD3 training is done with `sd3_train.py`. +__Jun 29, 2024__: Fixed mixed precision training with fp16 is not working. Fixed the model is in bf16 dtype even without `--full_bf16` option (this could worsen the training result). + +`fp16` and `bf16` are available for mixed precision training. We are not sure which is better. + `optimizer_type = "adafactor"` is recommended for 24GB VRAM GPUs. `cache_text_encoder_outputs_to_disk` and `cache_latents_to_disk` are necessary currently. `clip_l`, `clip_g` and `t5xxl` can be specified if the checkpoint does not include them. -t5xxl doesn't seem to work with `fp16`, so use`bf16` or `fp32`. +t5xxl doesn't seem to work with `fp16`, so 1) use`bf16` for mixed precision, or 2) use `bf16` or `float32` for `t5xxl_dtype`. There are `t5xxl_device` and `t5xxl_dtype` options for `t5xxl` device and dtype. +`text_encoder_batch_size` is added experimentally for caching faster. 
+ ```toml -learning_rate = 1e-5 # seems to be too high +learning_rate = 1e-6 # seems to depend on the batch size optimizer_type = "adafactor" optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ] cache_text_encoder_outputs = true cache_text_encoder_outputs_to_disk = true vae_batch_size = 1 +text_encoder_batch_size = 4 cache_latents = true cache_latents_to_disk = true ``` diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 70c83c0ba..c8d52e1c8 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -28,14 +28,14 @@ from .sdxl_train_util import match_mixed_precision -def load_target_model(args, accelerator, attn_mode, weight_dtype, t5xxl_device, t5xxl_dtype) -> Tuple[ +def load_target_model(args, accelerator, attn_mode, weight_dtype, clip_dtype, t5xxl_device, t5xxl_dtype, vae_dtype) -> Tuple[ sd3_models.MMDiT, Optional[sd3_models.SDClipModel], Optional[sd3_models.SDXLClipG], Optional[sd3_models.T5XXLModel], sd3_models.SDVAE, ]: - model_dtype = match_mixed_precision(args, weight_dtype) # prepare fp16/bf16 + model_dtype = match_mixed_precision(args, weight_dtype) # prepare fp16/bf16, None or fp16/bf16 for pi in range(accelerator.state.num_processes): if pi == accelerator.state.local_process_index: @@ -49,13 +49,15 @@ def load_target_model(args, accelerator, attn_mode, weight_dtype, t5xxl_device, args.vae, attn_mode, accelerator.device if args.lowram else "cpu", - weight_dtype, + model_dtype, args.disable_mmap_load_safetensors, + clip_dtype, t5xxl_device, t5xxl_dtype, + vae_dtype, ) - # work on low-ram device + # work on low-ram device: models are already loaded on accelerator.device, but we ensure they are on device if args.lowram: if clip_l is not None: clip_l.to(accelerator.device) diff --git a/library/sd3_utils.py b/library/sd3_utils.py index c2c914123..45b49b04b 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -28,11 +28,41 @@ def load_models( vae_path: str, attn_mode: str, device: Union[str, torch.device], - weight_dtype: torch.dtype, + default_dtype: Optional[Union[str, torch.dtype]] = None, disable_mmap: bool = False, - t5xxl_device: Optional[str] = None, - t5xxl_dtype: Optional[str] = None, + clip_dtype: Optional[Union[str, torch.dtype]] = None, + t5xxl_device: Optional[Union[str, torch.device]] = None, + t5xxl_dtype: Optional[Union[str, torch.dtype]] = None, + vae_dtype: Optional[Union[str, torch.dtype]] = None, ): + """ + Load SD3 models from checkpoint files. + + Args: + ckpt_path: Path to the SD3 checkpoint file. + clip_l_path: Path to the clip_l checkpoint file. + clip_g_path: Path to the clip_g checkpoint file. + t5xxl_path: Path to the t5xxl checkpoint file. + vae_path: Path to the VAE checkpoint file. + attn_mode: Attention mode for MMDiT model. + device: Device for MMDiT model. + default_dtype: Default dtype for each model. In training, it's usually None. None means using float32. + disable_mmap: Disable memory mapping when loading state dict. + clip_dtype: Dtype for Clip models, or None to use default dtype. + t5xxl_device: Device for T5XXL model to load T5XXL in another device (eg. gpu). Default is None to use device. + t5xxl_dtype: Dtype for T5XXL model, or None to use default dtype. + vae_dtype: Dtype for VAE model, or None to use default dtype. + + Returns: + Tuple of MMDiT, ClipL, ClipG, T5XXL, and VAE models. + """ + + # In SD1/2 and SDXL, the model is created with empty weights and then loaded with state dict. 
+ # However, in SD3, Clip and T5XXL models are created with dtype, so we need to set dtype before loading state dict. + # Therefore, we need clip_dtype and t5xxl_dtype. + + # default_dtype is used for full_fp16/full_bf16 training. + def load_state_dict(path: str, dvc: Union[str, torch.device] = device): if disable_mmap: return safetensors.torch.load(open(path, "rb").read()) @@ -43,6 +73,9 @@ def load_state_dict(path: str, dvc: Union[str, torch.device] = device): return load_file(path) # prevent device invalid Error t5xxl_device = t5xxl_device or device + clip_dtype = clip_dtype or default_dtype or torch.float32 + t5xxl_dtype = t5xxl_dtype or default_dtype or torch.float32 + vae_dtype = vae_dtype or default_dtype or torch.float32 logger.info(f"Loading SD3 models from {ckpt_path}...") state_dict = load_state_dict(ckpt_path) @@ -124,7 +157,7 @@ def load_state_dict(path: str, dvc: Union[str, torch.device] = device): mmdit = sd3_models.create_mmdit_sd3_medium_configs(attn_mode) logger.info("Loading state dict...") - info = sdxl_model_util._load_state_dict_on_device(mmdit, state_dict, device, weight_dtype) + info = sdxl_model_util._load_state_dict_on_device(mmdit, state_dict, device, default_dtype) logger.info(f"Loaded MMDiT: {info}") # load ClipG and ClipL @@ -132,7 +165,7 @@ def load_state_dict(path: str, dvc: Union[str, torch.device] = device): clip_l = None else: logger.info("Building ClipL") - clip_l = sd3_models.create_clip_l(device, weight_dtype, clip_l_sd) + clip_l = sd3_models.create_clip_l(device, clip_dtype, clip_l_sd) logger.info("Loading state dict...") info = clip_l.load_state_dict(clip_l_sd) logger.info(f"Loaded ClipL: {info}") @@ -142,7 +175,7 @@ def load_state_dict(path: str, dvc: Union[str, torch.device] = device): clip_g = None else: logger.info("Building ClipG") - clip_g = sd3_models.create_clip_g(device, weight_dtype, clip_g_sd) + clip_g = sd3_models.create_clip_g(device, clip_dtype, clip_g_sd) logger.info("Loading state dict...") info = clip_g.load_state_dict(clip_g_sd) logger.info(f"Loaded ClipG: {info}") @@ -165,6 +198,7 @@ def load_state_dict(path: str, dvc: Union[str, torch.device] = device): logger.info("Loading state dict...") info = vae.load_state_dict(vae_sd) logger.info(f"Loaded VAE: {info}") + vae.to(device=device, dtype=vae_dtype) return mmdit, clip_l, clip_g, t5xxl, vae diff --git a/sd3_train.py b/sd3_train.py index b6c932c4c..bd30cdc72 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -182,6 +182,8 @@ def train(args): raise ValueError(f"unexpected t5xxl_dtype: {args.t5xxl_dtype}") t5xxl_device = accelerator.device if args.t5xxl_device is None else args.t5xxl_device + clip_dtype = weight_dtype # if not args.train_text_encoder else None + # モデルを読み込む attn_mode = "xformers" if args.xformers else "torch" @@ -189,8 +191,9 @@ def train(args): attn_mode == "torch" ), f"attn_mode {attn_mode} is not supported. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。" + # models are usually loaded on CPU and moved to GPU later. This is to avoid OOM on GPU0. 
mmdit, clip_l, clip_g, t5xxl, vae = sd3_train_utils.load_target_model( - args, accelerator, attn_mode, weight_dtype, t5xxl_device, t5xxl_dtype + args, accelerator, attn_mode, None, clip_dtype, t5xxl_device, t5xxl_dtype, vae_dtype ) assert clip_l is not None, "clip_l is required / clip_lは必須です" assert clip_g is not None, "clip_g is required / clip_gは必須です" @@ -868,8 +871,9 @@ def setup_parser() -> argparse.ArgumentParser: custom_train_functions.add_custom_train_arguments(parser) sd3_train_utils.add_sd3_training_arguments(parser) - # TE training is disabled temporarily + # parser.add_argument("--train_text_encoder", action="store_true", help="train text encoder / text encoderも学習する") + # TE training is disabled temporarily # parser.add_argument( # "--learning_rate_te1", # type=float, @@ -886,7 +890,6 @@ def setup_parser() -> argparse.ArgumentParser: # parser.add_argument( # "--diffusers_xformers", action="store_true", help="use xformers by diffusers / Diffusersでxformersを使用する" # ) - # parser.add_argument("--train_text_encoder", action="store_true", help="train text encoder / text encoderも学習する") # parser.add_argument( # "--no_half_vae", # action="store_true", From ea18d5ba6d856995d5c44be4b449b63ac66fe5db Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 29 Jun 2024 17:45:50 +0900 Subject: [PATCH 036/582] Fix to work full_bf16 and full_fp16. --- library/sd3_models.py | 8 ++++++++ library/sd3_utils.py | 14 ++++++-------- sd3_train.py | 20 ++++++++++---------- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index c19aec6aa..7041420cb 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -891,6 +891,14 @@ def __init__( def model_type(self): return "m" # only support medium + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + def enable_gradient_checkpointing(self): self.gradient_checkpointing = True for block in self.joint_blocks: diff --git a/library/sd3_utils.py b/library/sd3_utils.py index 45b49b04b..9dc9e7967 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -28,7 +28,7 @@ def load_models( vae_path: str, attn_mode: str, device: Union[str, torch.device], - default_dtype: Optional[Union[str, torch.dtype]] = None, + weight_dtype: Optional[Union[str, torch.dtype]] = None, disable_mmap: bool = False, clip_dtype: Optional[Union[str, torch.dtype]] = None, t5xxl_device: Optional[Union[str, torch.device]] = None, @@ -46,7 +46,7 @@ def load_models( vae_path: Path to the VAE checkpoint file. attn_mode: Attention mode for MMDiT model. device: Device for MMDiT model. - default_dtype: Default dtype for each model. In training, it's usually None. None means using float32. + weight_dtype: Default dtype of weights for all models. This is weight dtype, so the model dtype may be different. disable_mmap: Disable memory mapping when loading state dict. clip_dtype: Dtype for Clip models, or None to use default dtype. t5xxl_device: Device for T5XXL model to load T5XXL in another device (eg. gpu). Default is None to use device. @@ -61,8 +61,6 @@ def load_models( # However, in SD3, Clip and T5XXL models are created with dtype, so we need to set dtype before loading state dict. # Therefore, we need clip_dtype and t5xxl_dtype. - # default_dtype is used for full_fp16/full_bf16 training. 
- def load_state_dict(path: str, dvc: Union[str, torch.device] = device): if disable_mmap: return safetensors.torch.load(open(path, "rb").read()) @@ -73,9 +71,9 @@ def load_state_dict(path: str, dvc: Union[str, torch.device] = device): return load_file(path) # prevent device invalid Error t5xxl_device = t5xxl_device or device - clip_dtype = clip_dtype or default_dtype or torch.float32 - t5xxl_dtype = t5xxl_dtype or default_dtype or torch.float32 - vae_dtype = vae_dtype or default_dtype or torch.float32 + clip_dtype = clip_dtype or weight_dtype or torch.float32 + t5xxl_dtype = t5xxl_dtype or weight_dtype or torch.float32 + vae_dtype = vae_dtype or weight_dtype or torch.float32 logger.info(f"Loading SD3 models from {ckpt_path}...") state_dict = load_state_dict(ckpt_path) @@ -157,7 +155,7 @@ def load_state_dict(path: str, dvc: Union[str, torch.device] = device): mmdit = sd3_models.create_mmdit_sd3_medium_configs(attn_mode) logger.info("Loading state dict...") - info = sdxl_model_util._load_state_dict_on_device(mmdit, state_dict, device, default_dtype) + info = sdxl_model_util._load_state_dict_on_device(mmdit, state_dict, device, weight_dtype) logger.info(f"Loaded MMDiT: {info}") # load ClipG and ClipL diff --git a/sd3_train.py b/sd3_train.py index bd30cdc72..de763ac6d 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -182,7 +182,7 @@ def train(args): raise ValueError(f"unexpected t5xxl_dtype: {args.t5xxl_dtype}") t5xxl_device = accelerator.device if args.t5xxl_device is None else args.t5xxl_device - clip_dtype = weight_dtype # if not args.train_text_encoder else None + clip_dtype = weight_dtype # if not args.train_text_encoder else None # モデルを読み込む attn_mode = "xformers" if args.xformers else "torch" @@ -193,7 +193,7 @@ def train(args): # models are usually loaded on CPU and moved to GPU later. This is to avoid OOM on GPU0. 
mmdit, clip_l, clip_g, t5xxl, vae = sd3_train_utils.load_target_model( - args, accelerator, attn_mode, None, clip_dtype, t5xxl_device, t5xxl_dtype, vae_dtype + args, accelerator, attn_mode, weight_dtype, clip_dtype, t5xxl_device, t5xxl_dtype, vae_dtype ) assert clip_l is not None, "clip_l is required / clip_lは必須です" assert clip_g is not None, "clip_g is required / clip_gは必須です" @@ -769,10 +769,10 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): epoch, num_train_epochs, global_step, - clip_l if args.save_clip else None, - clip_g if args.save_clip else None, - t5xxl if args.save_t5xxl else None, - mmdit, + accelerator.unwrap_model(clip_l) if args.save_clip else None, + accelerator.unwrap_model(clip_g) if args.save_clip else None, + accelerator.unwrap_model(t5xxl) if args.save_t5xxl else None, + accelerator.unwrap_model(mmdit), vae, ) @@ -807,10 +807,10 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): epoch, num_train_epochs, global_step, - clip_l if args.save_clip else None, - clip_g if args.save_clip else None, - t5xxl if args.save_t5xxl else None, - mmdit, + accelerator.unwrap_model(clip_l) if args.save_clip else None, + accelerator.unwrap_model(clip_g) if args.save_clip else None, + accelerator.unwrap_model(t5xxl) if args.save_t5xxl else None, + accelerator.unwrap_model(mmdit), vae, ) From 50e3d6247459c9f59facaef42e03b34cd8d6287d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 8 Jul 2024 19:46:23 +0900 Subject: [PATCH 037/582] fix to work T5XXL with fp16 --- library/sd3_models.py | 144 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 6 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index 7041420cb..e4c0790d9 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -1124,7 +1124,12 @@ def __init__(self, in_channels, dtype=torch.float32, device=None): self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device) def forward(self, x): + org_dtype = x.dtype + if x.dtype == torch.bfloat16: + x = x.to(torch.float32) x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + if x.dtype != org_dtype: + x = x.to(org_dtype) x = self.conv(x) return x @@ -1263,11 +1268,11 @@ def device(self): def dtype(self): return next(self.parameters()).dtype - @torch.autocast("cuda", dtype=torch.float16) + # @torch.autocast("cuda", dtype=torch.float16) def decode(self, latent): return self.decoder(latent) - @torch.autocast("cuda", dtype=torch.float16) + # @torch.autocast("cuda", dtype=torch.float16) def encode(self, image): hidden = self.encoder(image) mean, logvar = torch.chunk(hidden, 2, dim=1) @@ -1630,10 +1635,25 @@ def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None): self.weight = torch.nn.Parameter(torch.ones(hidden_size, dtype=dtype, device=device)) self.variance_epsilon = eps - def forward(self, x): - variance = x.pow(2).mean(-1, keepdim=True) - x = x * torch.rsqrt(variance + self.variance_epsilon) - return self.weight.to(device=x.device, dtype=x.dtype) * x + # def forward(self, x): + # variance = x.pow(2).mean(-1, keepdim=True) + # x = x * torch.rsqrt(variance + self.variance_epsilon) + # return self.weight.to(device=x.device, dtype=x.dtype) * x + + # copy from transformers' T5LayerNorm + def forward(self, hidden_states): + # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated 
+ # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states class T5DenseGatedActDense(torch.nn.Module): @@ -1775,7 +1795,27 @@ def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_b def forward(self, x, past_bias=None): x, past_bias = self.layer[0](x, past_bias) + + # copy from transformers' T5Block + # clamp inf values to enable fp16 training + if x.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(x).any(), + torch.finfo(x.dtype).max - 1000, + torch.finfo(x.dtype).max, + ) + x = torch.clamp(x, min=-clamp_value, max=clamp_value) + x = self.layer[-1](x) + # clamp inf values to enable fp16 training + if x.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(x).any(), + torch.finfo(x.dtype).max - 1000, + torch.finfo(x.dtype).max, + ) + x = torch.clamp(x, min=-clamp_value, max=clamp_value) + return x, past_bias @@ -1896,4 +1936,96 @@ def create_t5xxl(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[st return t5 +""" + # snippet for using the T5 model from transformers + + from transformers import T5EncoderModel, T5Config + import accelerate + import json + + T5_CONFIG_JSON = "" +{ + "architectures": [ + "T5EncoderModel" + ], + "classifier_dropout": 0.0, + "d_ff": 10240, + "d_kv": 64, + "d_model": 4096, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "num_decoder_layers": 24, + "num_heads": 64, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "use_cache": true, + "vocab_size": 32128 +} +"" + config = json.loads(T5_CONFIG_JSON) + config = T5Config(**config) + + # model = T5EncoderModel.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", subfolder="text_encoder_3") + # print(model.config) + # # model(**load_model.config) + + # with accelerate.init_empty_weights(): + model = T5EncoderModel._from_config(config) # , torch_dtype=dtype) + for key in list(state_dict.keys()): + if key.startswith("transformer."): + new_key = key[len("transformer.") :] + state_dict[new_key] = state_dict.pop(key) + + info = model.load_state_dict(state_dict) + print(info) + model.set_attn_mode = lambda x: None + # model.to("cpu") + + _self = model + + def enc(list_of_token_weight_pairs): + has_batch = isinstance(list_of_token_weight_pairs[0][0], list) + + if has_batch: + list_of_tokens = [] + for pairs in list_of_token_weight_pairs: + tokens = [a[0] for a in pairs[0]] # I'm not sure why this is [0] + list_of_tokens.append(tokens) + else: + list_of_tokens = [[a[0] for a in list_of_token_weight_pairs[0]]] + + list_of_tokens = np.array(list_of_tokens) + list_of_tokens = torch.from_numpy(list_of_tokens).to("cuda", dtype=torch.long) + out = _self(list_of_tokens) + pooled = None + if has_batch: + return 
out, pooled + else: + if pooled is not None: + first_pooled = pooled[0:1] + else: + first_pooled = pooled + return out[0], first_pooled + # output = [out[0:1]] + # return torch.cat(output, dim=-2), first_pooled + + model.encode_token_weights = enc + + return model +""" + # endregion From c9de7c4e9a3d02ab6f18f105c880a9ba88b667ab Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 8 Jul 2024 19:48:28 +0900 Subject: [PATCH 038/582] WIP: new latents caching --- library/sd3_train_utils.py | 94 +++++++++++++++++++++++- library/train_util.py | 147 ++++++++++++++++++++++++++++++++++++- sd3_train.py | 37 +++++++++- 3 files changed, 270 insertions(+), 8 deletions(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index c8d52e1c8..9309ee30c 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -1,7 +1,7 @@ import argparse import math import os -from typing import Optional, Tuple +from typing import List, Optional, Tuple import torch from safetensors.torch import save_file @@ -283,6 +283,98 @@ def sample_images(*args, **kwargs): return train_util.sample_images_common(SdxlStableDiffusionLongPromptWeightingPipeline, *args, **kwargs) +class Sd3LatensCachingStrategy(train_util.LatentsCachingStrategy): + SD3_LATENTS_NPZ_SUFFIX = "_sd3.npz" + + def __init__(self, vae: sd3_models.SDVAE, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check) + self.vae = vae + + def get_latents_npz_path(self, absolute_path: str): + return os.path.splitext(absolute_path)[0] + self.SD3_LATENTS_NPZ_SUFFIX + + def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): + if not self.cache_to_disk: + return False + if not os.path.exists(npz_path): + return False + if self.skip_disk_cache_validity_check: + return True + + expected_latents_size = (bucket_reso[1] // 8, bucket_reso[0] // 8) # bucket_reso is (W, H) + + try: + npz = np.load(npz_path) + if npz["latents"].shape[1:3] != expected_latents_size: + return False + + if flip_aug: + if "latents_flipped" not in npz: + return False + if npz["latents_flipped"].shape[1:3] != expected_latents_size: + return False + + if alpha_mask: + if "alpha_mask" not in npz: + return False + if npz["alpha_mask"].shape[0:2] != (bucket_reso[1], bucket_reso[0]): + return False + else: + if "alpha_mask" in npz: + return False + except Exception as e: + logger.error(f"Error loading file: {npz_path}") + raise e + + return True + + def cache_batch_latents(self, image_infos: List[train_util.ImageInfo], flip_aug: bool, alpha_mask: bool, random_crop: bool): + img_tensor, alpha_masks, original_sizes, crop_ltrbs = train_util.load_images_and_masks_for_caching( + image_infos, alpha_mask, random_crop + ) + img_tensor = img_tensor.to(device=self.vae.device, dtype=self.vae.dtype) + + with torch.no_grad(): + latents = self.vae.encode(img_tensor).to("cpu") + if flip_aug: + img_tensor = torch.flip(img_tensor, dims=[3]) + with torch.no_grad(): + flipped_latents = self.vae.encode(img_tensor).to("cpu") + else: + flipped_latents = [None] * len(latents) + + for info, latent, flipped_latent, alpha_mask in zip(image_infos, latents, flipped_latents, alpha_masks): + if self.cache_to_disk: + # save_latents_to_disk( + # info.latents_npz, + # latent, + # info.latents_original_size, + # info.latents_crop_ltrb, + # flipped_latent, + # alpha_mask, + # ) + kwargs = {} + if flipped_latent is not None: + 
kwargs["latents_flipped"] = flipped_latent.float().cpu().numpy() + if alpha_mask is not None: + kwargs["alpha_mask"] = alpha_mask.float().cpu().numpy() + np.savez( + info.latents_npz, + latents=latents.float().cpu().numpy(), + original_size=np.array(original_sizes), + crop_ltrb=np.array(crop_ltrbs), + **kwargs, + ) + else: + info.latents = latent + if flip_aug: + info.latents_flipped = flipped_latent + info.alpha_mask = alpha_mask + + if not train_util.HIGH_VRAM: + clean_memory_on_device(self.vae.device) + + # region Diffusers diff --git a/library/train_util.py b/library/train_util.py index 96d32e3bc..8444827df 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -359,6 +359,30 @@ def get_augmentor(self, use_color_aug: bool): # -> Optional[Callable[[np.ndarra return self.color_aug if use_color_aug else None +class LatentsCachingStrategy: + def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: + self._cache_to_disk = cache_to_disk + self._batch_size = batch_size + self.skip_disk_cache_validity_check = skip_disk_cache_validity_check + + @property + def cache_to_disk(self): + return self._cache_to_disk + + @property + def batch_size(self): + return self._batch_size + + def get_latents_npz_path(self, absolute_path: str): + raise NotImplementedError + + def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): + raise NotImplementedError + + def cache_batch_latents(self, batch: List[ImageInfo], flip_aug: bool, alpha_mask: bool, random_crop: bool): + raise NotImplementedError + + class BaseSubset: def __init__( self, @@ -986,6 +1010,69 @@ def is_text_encoder_output_cacheable(self): ] ) + def new_cache_latents(self, is_main_process: bool, caching_strategy: LatentsCachingStrategy): + r""" + a brand new method to cache latents. This method caches latents with caching strategy. + normal cache_latents method is used by default, but this method is used when caching strategy is specified. 
+ """ + logger.info("caching latents with caching strategy.") + image_infos = list(self.image_data.values()) + + # sort by resolution + image_infos.sort(key=lambda info: info.bucket_reso[0] * info.bucket_reso[1]) + + # split by resolution + batches = [] + batch = [] + logger.info("checking cache validity...") + for info in tqdm(image_infos): + subset = self.image_to_subset[info.image_key] + + if info.latents_npz is not None: # fine tuning dataset + continue + + # check disk cache exists and size of latents + if caching_strategy.cache_to_disk: + # info.latents_npz = os.path.splitext(info.absolute_path)[0] + file_suffix + info.latents_npz = caching_strategy.get_latents_npz_path(info.absolute_path) + if not is_main_process: # prepare for multi-gpu, only store to info + continue + + cache_available = caching_strategy.is_disk_cached_latents_expected( + info.bucket_reso, info.latents_npz, subset.flip_aug, subset.alpha_mask + ) + if cache_available: # do not add to batch + continue + + # if last member of batch has different resolution, flush the batch + if len(batch) > 0 and batch[-1].bucket_reso != info.bucket_reso: + batches.append(batch) + batch = [] + + batch.append(info) + + # if number of data in batch is enough, flush the batch + if len(batch) >= caching_strategy.batch_size: + batches.append(batch) + batch = [] + + if len(batch) > 0: + batches.append(batch) + + # if cache to disk, don't cache latents in non-main process, set to info only + if caching_strategy.cache_to_disk and not is_main_process: + return + + if len(batches) == 0: + logger.info("no latents to cache") + return + + # iterate batches: batch doesn't have image here. image will be loaded in cache_batch_latents and discarded + logger.info("caching latents...") + for batch in tqdm(batches, smoothing=1, total=len(batches)): + # cache_batch_latents(vae, cache_to_disk, batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) + caching_strategy.cache_batch_latents(batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) + def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True, file_suffix=".npz"): # マルチGPUには対応していないので、そちらはtools/cache_latents.pyを使うこと logger.info("caching latents.") @@ -1086,7 +1173,7 @@ def cache_text_encoder_outputs_common( if batch_size is None: batch_size = self.batch_size - + image_infos = list(self.image_data.values()) logger.info("checking cache existence...") @@ -2207,6 +2294,11 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc logger.info(f"[Dataset {i}]") dataset.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process, file_suffix) + def new_cache_latents(self, is_main_process: bool, strategy: LatentsCachingStrategy): + for i, dataset in enumerate(self.datasets): + logger.info(f"[Dataset {i}]") + dataset.new_cache_latents(is_main_process, strategy) + def cache_text_encoder_outputs( self, tokenizers, text_encoders, device, weight_dtype, cache_to_disk=False, is_main_process=True ): @@ -2550,6 +2642,51 @@ def trim_and_resize_if_required( return image, original_size, crop_ltrb +# for new_cache_latents +def load_images_and_masks_for_caching( + image_infos: List[ImageInfo], use_alpha_mask: bool, random_crop: bool +) -> Tuple[torch.Tensor, List[np.ndarray], List[Tuple[int, int]], List[Tuple[int, int, int, int]]]: + r""" + requires image_infos to have: [absolute_path or image], bucket_reso, resized_size + + returns: image_tensor, alpha_masks, original_sizes, crop_ltrbs + + image_tensor: torch.Tensor = torch.Size([B, 3, H, W]), 
...], normalized to [-1, 1] + alpha_masks: List[np.ndarray] = [np.ndarray([H, W]), ...], normalized to [0, 1] + original_sizes: List[Tuple[int, int]] = [(W, H), ...] + crop_ltrbs: List[Tuple[int, int, int, int]] = [(L, T, R, B), ...] + """ + images: List[torch.Tensor] = [] + alpha_masks: List[np.ndarray] = [] + original_sizes: List[Tuple[int, int]] = [] + crop_ltrbs: List[Tuple[int, int, int, int]] = [] + for info in image_infos: + image = load_image(info.absolute_path, use_alpha_mask) if info.image is None else np.array(info.image, np.uint8) + # TODO 画像のメタデータが壊れていて、メタデータから割り当てたbucketと実際の画像サイズが一致しない場合があるのでチェック追加要 + image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size) + + original_sizes.append(original_size) + crop_ltrbs.append(crop_ltrb) + + if use_alpha_mask: + if image.shape[2] == 4: + alpha_mask = image[:, :, 3] # [H,W] + alpha_mask = alpha_mask.astype(np.float32) / 255.0 + alpha_mask = torch.FloatTensor(alpha_mask) # [H,W] + else: + alpha_mask = torch.ones_like(image[:, :, 0], dtype=torch.float32) # [H,W] + else: + alpha_mask = None + alpha_masks.append(alpha_mask) + + image = image[:, :, :3] # remove alpha channel if exists + image = IMAGE_TRANSFORMS(image) + images.append(image) + + img_tensor = torch.stack(images, dim=0) + return img_tensor, alpha_masks, original_sizes, crop_ltrbs + + def cache_batch_latents( vae: AutoencoderKL, cache_to_disk: bool, image_infos: List[ImageInfo], flip_aug: bool, use_alpha_mask: bool, random_crop: bool ) -> None: @@ -2661,7 +2798,7 @@ def cache_batch_text_encoder_outputs_sd3( ): # make input_ids for each text encoder l_tokens, g_tokens, t5_tokens = input_ids - + clip_l, clip_g, t5xxl = text_encoders with torch.no_grad(): b_lg_out, b_t5_out, b_pool = sd3_utils.get_cond_from_tokens( @@ -2670,8 +2807,12 @@ def cache_batch_text_encoder_outputs_sd3( b_lg_out = b_lg_out.detach() b_t5_out = b_t5_out.detach() b_pool = b_pool.detach() - + for info, lg_out, t5_out, pool in zip(image_infos, b_lg_out, b_t5_out, b_pool): + # debug: NaN check + if torch.isnan(lg_out).any() or torch.isnan(t5_out).any() or torch.isnan(pool).any(): + raise RuntimeError(f"NaN detected in text encoder outputs: {info.absolute_path}") + if cache_to_disk: save_text_encoder_outputs_to_disk(info.text_encoder_outputs_npz, lg_out, t5_out, pool) else: diff --git a/sd3_train.py b/sd3_train.py index de763ac6d..c073ec0e2 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -204,11 +204,22 @@ def train(args): vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() - vae_wrapper = sd3_models.VAEWrapper(vae) # make SD/SDXL compatible - with torch.no_grad(): - train_dataset_group.cache_latents( - vae_wrapper, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process, file_suffix="_sd3.npz" + + if not args.new_caching: + vae_wrapper = sd3_models.VAEWrapper(vae) # make SD/SDXL compatible + with torch.no_grad(): + train_dataset_group.cache_latents( + vae_wrapper, + args.vae_batch_size, + args.cache_latents_to_disk, + accelerator.is_main_process, + file_suffix="_sd3.npz", + ) + else: + strategy = sd3_train_utils.Sd3LatensCachingStrategy( + vae, args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check ) + train_dataset_group.new_cache_latents(accelerator.is_main_process, strategy) vae.to("cpu") clean_memory_on_device(accelerator.device) @@ -699,6 +710,17 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): sigmas = get_sigmas(timesteps, n_dim=latents.ndim, 
dtype=weight_dtype) noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + # debug: NaN check for all inputs + if torch.any(torch.isnan(noisy_model_input)): + accelerator.print("NaN found in noisy_model_input, replacing with zeros") + noisy_model_input = torch.nan_to_num(noisy_model_input, 0, out=noisy_model_input) + if torch.any(torch.isnan(context)): + accelerator.print("NaN found in context, replacing with zeros") + context = torch.nan_to_num(context, 0, out=context) + if torch.any(torch.isnan(pool)): + accelerator.print("NaN found in pool, replacing with zeros") + pool = torch.nan_to_num(pool, 0, out=pool) + # call model with accelerator.autocast(): model_pred = mmdit(noisy_model_input, timesteps, context=context, y=pool) @@ -908,6 +930,13 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="number of optimizers for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizer数", ) + + parser.add_argument("--new_caching", action="store_true", help="use new caching method / 新しいキャッシング方法を使う") + parser.add_argument( + "--skip_latents_validity_check", + action="store_true", + help="skip latents validity check / latentsの正当性チェックをスキップする", + ) return parser From 3ea4fce5e0f3d1a9c2718d77f49c3b304d25e565 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 8 Jul 2024 22:04:43 +0900 Subject: [PATCH 039/582] load models one by one --- library/sd3_train_utils.py | 56 ++++++------ library/sd3_utils.py | 169 +++++++++++++++++++++++++++++++++++++ sd3_train.py | 58 +++++++++---- 3 files changed, 236 insertions(+), 47 deletions(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 9309ee30c..98ee66bf8 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -1,19 +1,17 @@ import argparse import math import os -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Union import torch from safetensors.torch import save_file +from accelerate import Accelerator from library import sd3_models, sd3_utils, train_util from library.device_utils import init_ipex, clean_memory_on_device init_ipex() -from accelerate import init_empty_weights -from tqdm import tqdm - # from transformers import CLIPTokenizer # from library import model_util # , sdxl_model_util, train_util, sdxl_original_unet @@ -28,50 +26,48 @@ from .sdxl_train_util import match_mixed_precision -def load_target_model(args, accelerator, attn_mode, weight_dtype, clip_dtype, t5xxl_device, t5xxl_dtype, vae_dtype) -> Tuple[ +def load_target_model( + model_type: str, + args: argparse.Namespace, + state_dict: dict, + accelerator: Accelerator, + attn_mode: str, + model_dtype: Optional[torch.dtype], + device: Optional[torch.device], +) -> Union[ sd3_models.MMDiT, Optional[sd3_models.SDClipModel], Optional[sd3_models.SDXLClipG], Optional[sd3_models.T5XXLModel], sd3_models.SDVAE, ]: - model_dtype = match_mixed_precision(args, weight_dtype) # prepare fp16/bf16, None or fp16/bf16 + loading_device = device if device is not None else (accelerator.device if args.lowram else "cpu") for pi in range(accelerator.state.num_processes): if pi == accelerator.state.local_process_index: logger.info(f"loading model for process {accelerator.state.local_process_index}/{accelerator.state.num_processes}") - mmdit, clip_l, clip_g, t5xxl, vae = sd3_utils.load_models( - args.pretrained_model_name_or_path, - args.clip_l, - args.clip_g, - args.t5xxl, - args.vae, - attn_mode, - accelerator.device if args.lowram else "cpu", - model_dtype, - 
args.disable_mmap_load_safetensors, - clip_dtype, - t5xxl_device, - t5xxl_dtype, - vae_dtype, - ) + if model_type == "mmdit": + model = sd3_utils.load_mmdit(state_dict, attn_mode, model_dtype, loading_device) + elif model_type == "clip_l": + model = sd3_utils.load_clip_l(state_dict, args.clip_l, attn_mode, model_dtype, loading_device) + elif model_type == "clip_g": + model = sd3_utils.load_clip_g(state_dict, args.clip_g, attn_mode, model_dtype, loading_device) + elif model_type == "t5xxl": + model = sd3_utils.load_t5xxl(state_dict, args.t5xxl, attn_mode, model_dtype, loading_device) + elif model_type == "vae": + model = sd3_utils.load_vae(state_dict, args.vae, model_dtype, loading_device) + else: + raise ValueError(f"Unknown model type: {model_type}") # work on low-ram device: models are already loaded on accelerator.device, but we ensure they are on device if args.lowram: - if clip_l is not None: - clip_l.to(accelerator.device) - if clip_g is not None: - clip_g.to(accelerator.device) - if t5xxl is not None: - t5xxl.to(accelerator.device) - vae.to(accelerator.device) - mmdit.to(accelerator.device) + model = model.to(accelerator.device) clean_memory_on_device(accelerator.device) accelerator.wait_for_everyone() - return mmdit, clip_l, clip_g, t5xxl, vae + return model def save_models( diff --git a/library/sd3_utils.py b/library/sd3_utils.py index 9dc9e7967..16f80c60d 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -20,6 +20,175 @@ # region models +def load_safetensors(path: str, dvc: Union[str, torch.device], disable_mmap: bool = False): + if disable_mmap: + return safetensors.torch.load(open(path, "rb").read()) + else: + try: + return load_file(path, device=dvc) + except: + return load_file(path) # prevent device invalid Error + + +def load_mmdit(state_dict: Dict, attn_mode: str, dtype: Optional[Union[str, torch.dtype]], device: Union[str, torch.device]): + mmdit_sd = {} + + mmdit_prefix = "model.diffusion_model." + for k in list(state_dict.keys()): + if k.startswith(mmdit_prefix): + mmdit_sd[k[len(mmdit_prefix) :]] = state_dict.pop(k) + + # load MMDiT + logger.info("Building MMDit") + with init_empty_weights(): + mmdit = sd3_models.create_mmdit_sd3_medium_configs(attn_mode) + + logger.info("Loading state dict...") + info = sdxl_model_util._load_state_dict_on_device(mmdit, mmdit_sd, device, dtype) + logger.info(f"Loaded MMDiT: {info}") + return mmdit + + +def load_clip_l( + state_dict: Dict, + clip_l_path: Optional[str], + attn_mode: str, + clip_dtype: Optional[Union[str, torch.dtype]], + device: Union[str, torch.device], + disable_mmap: bool = False, +): + clip_l_sd = None + if clip_l_path: + logger.info(f"Loading clip_l from {clip_l_path}...") + clip_l_sd = load_safetensors(clip_l_path, device, disable_mmap) + for key in list(clip_l_sd.keys()): + clip_l_sd["transformer." + key] = clip_l_sd.pop(key) + else: + if "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight" in state_dict: + # found clip_l: remove prefix "text_encoders.clip_l." + logger.info("clip_l is included in the checkpoint") + clip_l_sd = {} + prefix = "text_encoders.clip_l." 
+ for k in list(state_dict.keys()): + if k.startswith(prefix): + clip_l_sd[k[len(prefix) :]] = state_dict.pop(k) + + if clip_l_sd is None: + clip_l = None + else: + logger.info("Building ClipL") + clip_l = sd3_models.create_clip_l(device, clip_dtype, clip_l_sd) + logger.info("Loading state dict...") + info = clip_l.load_state_dict(clip_l_sd) + logger.info(f"Loaded ClipL: {info}") + clip_l.set_attn_mode(attn_mode) + return clip_l + + +def load_clip_g( + state_dict: Dict, + clip_g_path: Optional[str], + attn_mode: str, + clip_dtype: Optional[Union[str, torch.dtype]], + device: Union[str, torch.device], + disable_mmap: bool = False, +): + clip_g_sd = None + if clip_g_path: + logger.info(f"Loading clip_g from {clip_g_path}...") + clip_g_sd = load_safetensors(clip_g_path, device, disable_mmap) + for key in list(clip_g_sd.keys()): + clip_g_sd["transformer." + key] = clip_g_sd.pop(key) + else: + if "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight" in state_dict: + # found clip_g: remove prefix "text_encoders.clip_g." + logger.info("clip_g is included in the checkpoint") + clip_g_sd = {} + prefix = "text_encoders.clip_g." + for k in list(state_dict.keys()): + if k.startswith(prefix): + clip_g_sd[k[len(prefix) :]] = state_dict.pop(k) + + if clip_g_sd is None: + clip_g = None + else: + logger.info("Building ClipG") + clip_g = sd3_models.create_clip_g(device, clip_dtype, clip_g_sd) + logger.info("Loading state dict...") + info = clip_g.load_state_dict(clip_g_sd) + logger.info(f"Loaded ClipG: {info}") + clip_g.set_attn_mode(attn_mode) + return clip_g + + +def load_t5xxl( + state_dict: Dict, + t5xxl_path: Optional[str], + attn_mode: str, + dtype: Optional[Union[str, torch.dtype]], + device: Union[str, torch.device], + disable_mmap: bool = False, +): + t5xxl_sd = None + if t5xxl_path: + logger.info(f"Loading t5xxl from {t5xxl_path}...") + t5xxl_sd = load_safetensors(t5xxl_path, device, disable_mmap) + for key in list(t5xxl_sd.keys()): + t5xxl_sd["transformer." + key] = t5xxl_sd.pop(key) + else: + if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight" in state_dict: + # found t5xxl: remove prefix "text_encoders.t5xxl." + logger.info("t5xxl is included in the checkpoint") + t5xxl_sd = {} + prefix = "text_encoders.t5xxl." + for k in list(state_dict.keys()): + if k.startswith(prefix): + t5xxl_sd[k[len(prefix) :]] = state_dict.pop(k) + + if t5xxl_sd is None: + t5xxl = None + else: + logger.info("Building T5XXL") + + # workaround for T5XXL model creation: create with fp16 takes too long TODO support virtual device + t5xxl = sd3_models.create_t5xxl(device, torch.float32, t5xxl_sd) + t5xxl.to(dtype=dtype) + + logger.info("Loading state dict...") + info = t5xxl.load_state_dict(t5xxl_sd) + logger.info(f"Loaded T5XXL: {info}") + t5xxl.set_attn_mode(attn_mode) + return t5xxl + + +def load_vae( + state_dict: Dict, + vae_path: Optional[str], + vae_dtype: Optional[Union[str, torch.dtype]], + device: Optional[Union[str, torch.device]], + disable_mmap: bool = False, +): + vae_sd = {} + if vae_path: + logger.info(f"Loading VAE from {vae_path}...") + vae_sd = load_safetensors(vae_path, device, disable_mmap) + else: + # remove prefix "first_stage_model." + vae_sd = {} + vae_prefix = "first_stage_model." 
+ for k in list(state_dict.keys()): + if k.startswith(vae_prefix): + vae_sd[k[len(vae_prefix) :]] = state_dict.pop(k) + + logger.info("Building VAE") + vae = sd3_models.SDVAE() + logger.info("Loading state dict...") + info = vae.load_state_dict(vae_sd) + logger.info(f"Loaded VAE: {info}") + vae.to(device=device, dtype=vae_dtype) + return vae + + def load_models( ckpt_path: str, clip_l_path: str, diff --git a/sd3_train.py b/sd3_train.py index c073ec0e2..10cc5d57f 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -13,12 +13,12 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device - init_ipex() from accelerate.utils import set_seed from diffusers import DDPMScheduler from library import deepspeed_utils, sd3_models, sd3_train_utils, sd3_utils +from library.sdxl_train_util import match_mixed_precision # , sdxl_model_util @@ -189,18 +189,19 @@ def train(args): assert ( attn_mode == "torch" - ), f"attn_mode {attn_mode} is not supported. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。" + ), f"attn_mode {attn_mode} is not supported yet. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。" - # models are usually loaded on CPU and moved to GPU later. This is to avoid OOM on GPU0. - mmdit, clip_l, clip_g, t5xxl, vae = sd3_train_utils.load_target_model( - args, accelerator, attn_mode, weight_dtype, clip_dtype, t5xxl_device, t5xxl_dtype, vae_dtype + # SD3 state dict may contain multiple models, so we need to load it and extract one by one. annoying. + logger.info(f"Loading SD3 models from {args.pretrained_model_name_or_path}") + device_to_load = accelerator.device if args.lowram else "cpu" + sd3_state_dict = sd3_utils.load_safetensors( + args.pretrained_model_name_or_path, device_to_load, args.disable_mmap_load_safetensors ) - assert clip_l is not None, "clip_l is required / clip_lは必須です" - assert clip_g is not None, "clip_g is required / clip_gは必須です" - # logit_scale = logit_scale.to(accelerator.device, dtype=weight_dtype) - # 学習を準備する + # load VAE for caching latents + vae: sd3_models.SDVAE = None if cache_latents: + vae = sd3_train_utils.load_target_model("vae", args, sd3_state_dict, accelerator, attn_mode, vae_dtype, device_to_load) vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() @@ -220,15 +221,25 @@ def train(args): vae, args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check ) train_dataset_group.new_cache_latents(accelerator.is_main_process, strategy) - vae.to("cpu") + vae.to("cpu") # if no sampling, vae can be deleted clean_memory_on_device(accelerator.device) accelerator.wait_for_everyone() + # load clip_l, clip_g, t5xxl for caching text encoder outputs + # # models are usually loaded on CPU and moved to GPU later. This is to avoid OOM on GPU0. 
+ # mmdit, clip_l, clip_g, t5xxl, vae = sd3_train_utils.load_target_model( + # args, accelerator, attn_mode, weight_dtype, clip_dtype, t5xxl_device, t5xxl_dtype, vae_dtype + # ) + clip_l = sd3_train_utils.load_target_model("clip_l", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load) + clip_g = sd3_train_utils.load_target_model("clip_g", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load) + assert clip_l is not None, "clip_l is required / clip_lは必須です" + assert clip_g is not None, "clip_g is required / clip_gは必須です" + + t5xxl = sd3_train_utils.load_target_model("t5xxl", args, sd3_state_dict, accelerator, attn_mode, t5xxl_dtype, device_to_load) + # logit_scale = logit_scale.to(accelerator.device, dtype=weight_dtype) + # 学習を準備する:モデルを適切な状態にする - if args.gradient_checkpointing: - mmdit.enable_gradient_checkpointing() - train_mmdit = args.learning_rate != 0 train_clip_l = False train_clip_g = False train_t5xxl = False @@ -280,17 +291,30 @@ def train(args): accelerator.is_main_process, args.text_encoder_batch_size, ) + + # TODO we can delete text encoders after caching accelerator.wait_for_everyone() + # load MMDIT + # if full_fp16/bf16, model_dtype is casted to fp16/bf16. If not, model_dtype is None (float32). + # by loading with model_dtype, we can reduce memory usage. + model_dtype = match_mixed_precision(args, weight_dtype) # None (default) or fp16/bf16 (full_xxxx) + mmdit = sd3_train_utils.load_target_model("mmdit", args, sd3_state_dict, accelerator, attn_mode, model_dtype, device_to_load) + if args.gradient_checkpointing: + mmdit.enable_gradient_checkpointing() + + train_mmdit = args.learning_rate != 0 + mmdit.requires_grad_(train_mmdit) + if not train_mmdit: + mmdit.to(accelerator.device, dtype=weight_dtype) # because of mmdie will not be prepared + if not cache_latents: + # load VAE here if not cached + vae = sd3_train_utils.load_target_model("vae", args, sd3_state_dict, accelerator, attn_mode, vae_dtype, device_to_load) vae.requires_grad_(False) vae.eval() vae.to(accelerator.device, dtype=vae_dtype) - mmdit.requires_grad_(train_mmdit) - if not train_mmdit: - mmdit.to(accelerator.device, dtype=weight_dtype) # because of unet is not prepared - training_models = [] params_to_optimize = [] # if train_unet: From 9dc7997803d70c718969526352e88908e827f091 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 9 Jul 2024 20:37:00 +0900 Subject: [PATCH 040/582] fix typo --- library/sd3_models.py | 2 +- library/sd3_train_utils.py | 2 +- sd3_train.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index e4c0790d9..a1ff1e75a 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -1643,7 +1643,7 @@ def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None): # copy from transformers' T5LayerNorm def forward(self, hidden_states): # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 98ee66bf8..660342108 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -279,7 +279,7 @@ def sample_images(*args, **kwargs): return train_util.sample_images_common(SdxlStableDiffusionLongPromptWeightingPipeline, *args, **kwargs) -class Sd3LatensCachingStrategy(train_util.LatentsCachingStrategy): +class Sd3LatentsCachingStrategy(train_util.LatentsCachingStrategy): SD3_LATENTS_NPZ_SUFFIX = "_sd3.npz" def __init__(self, vae: sd3_models.SDVAE, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: diff --git a/sd3_train.py b/sd3_train.py index 10cc5d57f..30d994c78 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -217,7 +217,7 @@ def train(args): file_suffix="_sd3.npz", ) else: - strategy = sd3_train_utils.Sd3LatensCachingStrategy( + strategy = sd3_train_utils.Sd3LatentsCachingStrategy( vae, args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check ) train_dataset_group.new_cache_latents(accelerator.is_main_process, strategy) From 3d402927efb2d396f8f33fe6a1747e43f7a5f0f3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 9 Jul 2024 23:15:38 +0900 Subject: [PATCH 041/582] WIP: update new latents caching --- library/sd3_train_utils.py | 49 +++++++++++++++++++++++++------------- library/train_util.py | 39 ++++++++++++++++++++++++++---- sd3_train.py | 15 ++++++++---- 3 files changed, 77 insertions(+), 26 deletions(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 660342108..245912199 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -1,4 +1,5 @@ import argparse +import glob import math import os from typing import List, Optional, Tuple, Union @@ -282,12 +283,26 @@ def sample_images(*args, **kwargs): class Sd3LatentsCachingStrategy(train_util.LatentsCachingStrategy): SD3_LATENTS_NPZ_SUFFIX = "_sd3.npz" - def __init__(self, vae: sd3_models.SDVAE, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: + def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check) + self.vae = None + + def set_vae(self, vae: sd3_models.SDVAE): self.vae = vae - def get_latents_npz_path(self, absolute_path: str): - return os.path.splitext(absolute_path)[0] + self.SD3_LATENTS_NPZ_SUFFIX + def get_image_size_from_image_absolute_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: + npz_file = glob.glob(os.path.splitext(absolute_path)[0] + "_*" + Sd3LatentsCachingStrategy.SD3_LATENTS_NPZ_SUFFIX) + if len(npz_file) == 0: + return None, None + w, h = os.path.splitext(npz_file[0])[0].split("_")[-2].split("x") + return int(w), int(h) + + def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: + return ( + os.path.splitext(absolute_path)[0] + + f"_{image_size[0]:04d}x{image_size[1]:04d}" + + Sd3LatentsCachingStrategy.SD3_LATENTS_NPZ_SUFFIX + ) def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): if not self.cache_to_disk: @@ -331,24 +346,24 @@ def cache_batch_latents(self, image_infos: List[train_util.ImageInfo], flip_aug: img_tensor = img_tensor.to(device=self.vae.device, dtype=self.vae.dtype) 
with torch.no_grad(): - latents = self.vae.encode(img_tensor).to("cpu") + latents_tensors = self.vae.encode(img_tensor).to("cpu") if flip_aug: img_tensor = torch.flip(img_tensor, dims=[3]) with torch.no_grad(): flipped_latents = self.vae.encode(img_tensor).to("cpu") else: - flipped_latents = [None] * len(latents) + flipped_latents = [None] * len(latents_tensors) + + # for info, latents, flipped_latent, alpha_mask in zip(image_infos, latents_tensors, flipped_latents, alpha_masks): + for i in range(len(image_infos)): + info = image_infos[i] + latents = latents_tensors[i] + flipped_latent = flipped_latents[i] + alpha_mask = alpha_masks[i] + original_size = original_sizes[i] + crop_ltrb = crop_ltrbs[i] - for info, latent, flipped_latent, alpha_mask in zip(image_infos, latents, flipped_latents, alpha_masks): if self.cache_to_disk: - # save_latents_to_disk( - # info.latents_npz, - # latent, - # info.latents_original_size, - # info.latents_crop_ltrb, - # flipped_latent, - # alpha_mask, - # ) kwargs = {} if flipped_latent is not None: kwargs["latents_flipped"] = flipped_latent.float().cpu().numpy() @@ -357,12 +372,12 @@ def cache_batch_latents(self, image_infos: List[train_util.ImageInfo], flip_aug: np.savez( info.latents_npz, latents=latents.float().cpu().numpy(), - original_size=np.array(original_sizes), - crop_ltrb=np.array(crop_ltrbs), + original_size=np.array(original_size), + crop_ltrb=np.array(crop_ltrb), **kwargs, ) else: - info.latents = latent + info.latents = latents if flip_aug: info.latents_flipped = flipped_latent info.alpha_mask = alpha_mask diff --git a/library/train_util.py b/library/train_util.py index 8444827df..9db226ea8 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -360,11 +360,23 @@ def get_augmentor(self, use_color_aug: bool): # -> Optional[Callable[[np.ndarra class LatentsCachingStrategy: + _strategy = None # strategy instance: actual strategy class + def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: self._cache_to_disk = cache_to_disk self._batch_size = batch_size self.skip_disk_cache_validity_check = skip_disk_cache_validity_check + @classmethod + def set_strategy(cls, strategy): + if cls._strategy is not None: + raise RuntimeError(f"Internal error. 
{cls.__name__} strategy is already set") + cls._strategy = strategy + + @classmethod + def get_strategy(cls) -> Optional["LatentsCachingStrategy"]: + return cls._strategy + @property def cache_to_disk(self): return self._cache_to_disk @@ -373,10 +385,15 @@ def cache_to_disk(self): def batch_size(self): return self._batch_size - def get_latents_npz_path(self, absolute_path: str): + def get_image_size_from_image_absolute_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: + raise NotImplementedError + + def get_latents_npz_path(self, absolute_path: str, bucket_reso: Tuple[int, int]) -> str: raise NotImplementedError - def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): + def is_disk_cached_latents_expected( + self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool + ) -> bool: raise NotImplementedError def cache_batch_latents(self, batch: List[ImageInfo], flip_aug: bool, alpha_mask: bool, random_crop: bool): @@ -1034,7 +1051,7 @@ def new_cache_latents(self, is_main_process: bool, caching_strategy: LatentsCach # check disk cache exists and size of latents if caching_strategy.cache_to_disk: # info.latents_npz = os.path.splitext(info.absolute_path)[0] + file_suffix - info.latents_npz = caching_strategy.get_latents_npz_path(info.absolute_path) + info.latents_npz = caching_strategy.get_latents_npz_path(info.absolute_path, info.image_size) if not is_main_process: # prepare for multi-gpu, only store to info continue @@ -1730,6 +1747,18 @@ def load_dreambooth_dir(subset: DreamBoothSubset): img_paths = glob_images(subset.image_dir, "*") sizes = [None] * len(img_paths) + # new caching: get image size from cache files + strategy = LatentsCachingStrategy.get_strategy() + if strategy is not None: + logger.info("get image size from cache files") + size_set_count = 0 + for i, img_path in enumerate(tqdm(img_paths)): + w, h = strategy.get_image_size_from_image_absolute_path(img_path) + if w is not None and h is not None: + sizes[i] = [w, h] + size_set_count += 1 + logger.info(f"set image size from cache files: {size_set_count}/{len(img_paths)}") + logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") if use_cached_info_for_subset: @@ -2807,12 +2836,12 @@ def cache_batch_text_encoder_outputs_sd3( b_lg_out = b_lg_out.detach() b_t5_out = b_t5_out.detach() b_pool = b_pool.detach() - + for info, lg_out, t5_out, pool in zip(image_infos, b_lg_out, b_t5_out, b_pool): # debug: NaN check if torch.isnan(lg_out).any() or torch.isnan(t5_out).any() or torch.isnan(pool).any(): raise RuntimeError(f"NaN detected in text encoder outputs: {info.absolute_path}") - + if cache_to_disk: save_text_encoder_outputs_to_disk(info.text_encoder_outputs_npz, lg_out, t5_out, pool) else: diff --git a/sd3_train.py b/sd3_train.py index 30d994c78..e2f622e47 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -91,6 +91,15 @@ def train(args): # load tokenizer sd3_tokenizer = sd3_models.SD3Tokenizer() + # prepare caching strategy + if args.new_caching: + latents_caching_strategy = sd3_train_utils.Sd3LatentsCachingStrategy( + args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check + ) + else: + latents_caching_strategy = None + train_util.LatentsCachingStrategy.set_strategy(latents_caching_strategy) + # データセットを準備する if args.dataset_class is None: blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) @@ -217,10 +226,8 @@ def train(args): 
file_suffix="_sd3.npz", ) else: - strategy = sd3_train_utils.Sd3LatentsCachingStrategy( - vae, args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check - ) - train_dataset_group.new_cache_latents(accelerator.is_main_process, strategy) + latents_caching_strategy.set_vae(vae) + train_dataset_group.new_cache_latents(accelerator.is_main_process, latents_caching_strategy) vae.to("cpu") # if no sampling, vae can be deleted clean_memory_on_device(accelerator.device) From 6f0e235f2cb9a9829bc12280c29e12c0ae66c88f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 11 Jul 2024 08:00:45 +0900 Subject: [PATCH 042/582] Fix shift value in SD3 inference. --- sd3_minimal_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py index 7f5f28cea..ffa0d46de 100644 --- a/sd3_minimal_inference.py +++ b/sd3_minimal_inference.py @@ -64,7 +64,7 @@ def do_sample( device: str, ): if initial_latent is None: - # latent = torch.ones(1, 16, height // 8, width // 8, device=device) * 0.0609 + # latent = torch.ones(1, 16, height // 8, width // 8, device=device) * 0.0609 # this seems to be a bug in the original code. thanks to furusu for pointing it out latent = torch.zeros(1, 16, height // 8, width // 8, device=device) else: latent = initial_latent @@ -73,7 +73,7 @@ def do_sample( noise = get_noise(seed, latent).to(device) - model_sampling = sd3_utils.ModelSamplingDiscreteFlow() + model_sampling = sd3_utils.ModelSamplingDiscreteFlow(shift=3.0) # 3.0 is for SD3 sigmas = get_sigmas(model_sampling, steps).to(device) # sigmas = sigmas[int(steps * (1 - denoise)) :] # do not support i2i From b8896aad400222c8c4441b217fda0f9bb0807ffd Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 11 Jul 2024 08:01:23 +0900 Subject: [PATCH 043/582] update README --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3eed636c5..5d4f9621d 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,9 @@ This repository contains training, generation and utility scripts for Stable Dif SD3 training is done with `sd3_train.py`. -__Jun 29, 2024__: Fixed mixed precision training with fp16 is not working. Fixed the model is in bf16 dtype even without `--full_bf16` option (this could worsen the training result). +__Jul 11, 2024__: Fixed to work t5xxl with `fp16`. If you change the dtype to `fp16` for t5xxl, please remove existing latents cache files (`*_sd3.npz`). The shift in `sd3_minimum_inference.py` is fixed to 3.0. Thanks to araleza! + +Jun 29, 2024: Fixed mixed precision training with fp16 is not working. Fixed the model is in bf16 dtype even without `--full_bf16` option (this could worsen the training result). `fp16` and `bf16` are available for mixed precision training. We are not sure which is better. @@ -12,7 +14,7 @@ __Jun 29, 2024__: Fixed mixed precision training with fp16 is not working. Fixed `clip_l`, `clip_g` and `t5xxl` can be specified if the checkpoint does not include them. -t5xxl doesn't seem to work with `fp16`, so 1) use`bf16` for mixed precision, or 2) use `bf16` or `float32` for `t5xxl_dtype`. +~~t5xxl doesn't seem to work with `fp16`, so 1) use`bf16` for mixed precision, or 2) use `bf16` or `float32` for `t5xxl_dtype`. ~~ t5xxl works with `fp16` now. There are `t5xxl_device` and `t5xxl_dtype` options for `t5xxl` device and dtype. 
From 082f13658bdbaed872ede6c0a7a75ab1a5f3712d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 12 Jul 2024 21:28:01 +0900 Subject: [PATCH 044/582] reduce peak GPU memory usage before training --- library/sd3_models.py | 2 +- library/train_util.py | 1 + sd3_train.py | 44 +++++++++++++++++++++---------------------- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index a1ff1e75a..ec8e1bbdd 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -471,7 +471,7 @@ def __init__( num_heads: int = 8, qkv_bias: bool = False, pre_only: bool = False, - qk_norm: str = None, + qk_norm: Optional[str] = None, ): super().__init__() self.num_heads = num_heads diff --git a/library/train_util.py b/library/train_util.py index 9db226ea8..7af0070e1 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2410,6 +2410,7 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool, alph # 戻り値は、latents_tensor, (original_size width, original_size height), (crop left, crop top) +# TODO update to use CachingStrategy def load_latents_from_disk( npz_path, ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: diff --git a/sd3_train.py b/sd3_train.py index e2f622e47..f34e47124 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -458,6 +458,28 @@ def train(args): # text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) # text_encoder1.text_model.final_layer_norm.requires_grad_(False) + # TextEncoderの出力をキャッシュするときには、すでに出力を取得済みなのでCPUへ移動する + if args.cache_text_encoder_outputs: + # move Text Encoders for sampling images. Text Encoder doesn't work on CPU with fp16 + clip_l.to("cpu", dtype=torch.float32) + clip_g.to("cpu", dtype=torch.float32) + if t5xxl is not None: + t5xxl.to("cpu", dtype=torch.float32) + clean_memory_on_device(accelerator.device) + else: + # make sure Text Encoders are on GPU + # TODO support CPU for text encoders + clip_l.to(accelerator.device) + clip_g.to(accelerator.device) + if t5xxl is not None: + t5xxl.to(accelerator.device) + + # TODO cache sample prompt's embeddings to free text encoder's memory + if args.cache_text_encoder_outputs: + if not args.save_t5xxl: + t5xxl = None # free memory + clean_memory_on_device(accelerator.device) + if args.deepspeed: ds_model = deepspeed_utils.prepare_deepspeed_model( args, @@ -482,28 +504,6 @@ def train(args): # text_encoder2 = accelerator.prepare(text_encoder2) optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) - # TextEncoderの出力をキャッシュするときには、すでに出力を取得済みなのでCPUへ移動する - if args.cache_text_encoder_outputs: - # move Text Encoders for sampling images. 
Text Encoder doesn't work on CPU with fp16 - clip_l.to("cpu", dtype=torch.float32) - clip_g.to("cpu", dtype=torch.float32) - if t5xxl is not None: - t5xxl.to("cpu", dtype=torch.float32) - clean_memory_on_device(accelerator.device) - else: - # make sure Text Encoders are on GPU - # TODO support CPU for text encoders - clip_l.to(accelerator.device) - clip_g.to(accelerator.device) - if t5xxl is not None: - t5xxl.to(accelerator.device) - - # TODO cache sample prompt's embeddings to free text encoder's memory - if args.cache_text_encoder_outputs: - if not args.save_t5xxl: - t5xxl = None # free memory - clean_memory_on_device(accelerator.device) - # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする if args.full_fp16: # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. From 41dee60383a3b88859b80929a2c0d94b12c42068 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 27 Jul 2024 13:50:05 +0900 Subject: [PATCH 045/582] Refactor caching mechanism for latents and text encoder outputs, etc. --- README.md | 21 +- fine_tune.py | 54 +++- library/config_util.py | 2 - library/sd3_models.py | 47 +++- library/sd3_train_utils.py | 105 ------- library/sd3_utils.py | 1 + library/sdxl_train_util.py | 2 +- library/strategy_base.py | 328 ++++++++++++++++++++++ library/strategy_sd.py | 139 ++++++++++ library/strategy_sd3.py | 229 ++++++++++++++++ library/strategy_sdxl.py | 247 +++++++++++++++++ library/train_util.py | 451 +++++++++++++++---------------- sd3_minimal_inference.py | 22 +- sd3_train.py | 272 +++++++++++-------- sdxl_train.py | 108 ++++---- sdxl_train_control_net_lllite.py | 99 ++++--- sdxl_train_network.py | 48 +++- sdxl_train_textual_inversion.py | 49 ++-- train_db.py | 67 +++-- train_network.py | 122 ++++++--- train_textual_inversion.py | 118 ++++---- 21 files changed, 1792 insertions(+), 739 deletions(-) create mode 100644 library/strategy_base.py create mode 100644 library/strategy_sd.py create mode 100644 library/strategy_sd3.py create mode 100644 library/strategy_sdxl.py diff --git a/README.md b/README.md index 5d4f9621d..d406fecde 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,16 @@ This repository contains training, generation and utility scripts for Stable Dif SD3 training is done with `sd3_train.py`. -__Jul 11, 2024__: Fixed to work t5xxl with `fp16`. If you change the dtype to `fp16` for t5xxl, please remove existing latents cache files (`*_sd3.npz`). The shift in `sd3_minimum_inference.py` is fixed to 3.0. Thanks to araleza! +__Jul 27, 2024__: +- Latents and text encoder outputs caching mechanism is refactored significantly. + - Existing cache files for SD3 need to be recreated. Please delete the previous cache files. + - With this change, dataset initialization is significantly faster, especially for large datasets. -Jun 29, 2024: Fixed mixed precision training with fp16 is not working. Fixed the model is in bf16 dtype even without `--full_bf16` option (this could worsen the training result). +- Architecture-dependent parts are extracted from the dataset (`train_util.py`). This is expected to make it easier to add future architectures. + +- Architecture-dependent parts including the cache mechanism for SD1/2/SDXL are also extracted. The basic operation of SD1/2/SDXL training on the sd3 branch has been confirmed, but there may be bugs. Please use the main or dev branch for SD1/2/SDXL training. + +--- `fp16` and `bf16` are available for mixed precision training. We are not sure which is better. 
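As a concrete illustration of the refactor described above, the architecture-specific pieces are registered once as strategy objects and the shared dataset code looks them up later. A minimal sketch, pieced together from the `strategy_base.py` and `strategy_sd.py` files added in this patch (argument values are illustrative, not defaults):

```python
from library import strategy_base, strategy_sd

# register the SD1/2 tokenizer strategy once, before datasets are built
tokenize_strategy = strategy_sd.SdTokenizeStrategy(v2=False, max_length=225)
strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy)

# register the latents caching strategy; the first flag selects the "_sd.npz"
# vs "_sdxl.npz" cache-file suffix (see SdSdxlLatentsCachingStrategy)
latents_strategy = strategy_sd.SdSdxlLatentsCachingStrategy(True, True, 1, False)
strategy_base.LatentsCachingStrategy.set_strategy(latents_strategy)

# architecture-agnostic code (e.g. train_util.py) retrieves whatever was registered
current = strategy_base.LatentsCachingStrategy.get_strategy()
```

Each `set_strategy` may be called only once per process (a second call raises), which is why the training scripts register their strategies before constructing the dataset group.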
@@ -14,7 +21,7 @@ Jun 29, 2024: Fixed mixed precision training with fp16 is not working. Fixed the `clip_l`, `clip_g` and `t5xxl` can be specified if the checkpoint does not include them. -~~t5xxl doesn't seem to work with `fp16`, so 1) use`bf16` for mixed precision, or 2) use `bf16` or `float32` for `t5xxl_dtype`. ~~ t5xxl works with `fp16` now. +t5xxl works with `fp16` now. There are `t5xxl_device` and `t5xxl_dtype` options for `t5xxl` device and dtype. @@ -32,6 +39,14 @@ cache_latents = true cache_latents_to_disk = true ``` +__2024/7/27:__ + +Latents およびテキストエンコーダ出力のキャッシュの仕組みを大きくリファクタリングしました。SD3 用の既存のキャッシュファイルの再作成が必要になりますが、ご了承ください(以前のキャッシュファイルは削除してください)。これにより、特にデータセットの規模が大きい場合のデータセット初期化が大幅に高速化されます。 + +データセット (`train_util.py`) からアーキテクチャ依存の部分を切り出しました。これにより将来的なアーキテクチャ追加が容易になると期待しています。 + +SD1/2/SDXL のキャッシュ機構を含むアーキテクチャ依存の部分も切り出しました。sd3 ブランチの SD1/2/SDXL 学習について、基本的な動作は確認していますが、不具合があるかもしれません。SD1/2/SDXL の学習には main または dev ブランチをお使いください。 + --- [__Change History__](#change-history) is moved to the bottom of the page. diff --git a/fine_tune.py b/fine_tune.py index d865cd2de..c9102f6c0 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -10,7 +10,7 @@ from tqdm import tqdm import torch -from library import deepspeed_utils +from library import deepspeed_utils, strategy_base from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -39,6 +39,7 @@ scale_v_prediction_loss_like_noise_prediction, apply_debiased_estimation, ) +import library.strategy_sd as strategy_sd def train(args): @@ -52,7 +53,15 @@ def train(args): if args.seed is not None: set_seed(args.seed) # 乱数系列を初期化する - tokenizer = train_util.load_tokenizer(args) + tokenize_strategy = strategy_sd.SdTokenizeStrategy(args.v2, args.max_token_length, args.tokenizer_cache_dir) + strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
+ if cache_latents: + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + False, args.cache_latents_to_disk, args.vae_batch_size, False + ) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) # データセットを準備する if args.dataset_class is None: @@ -81,10 +90,10 @@ def train(args): ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer) + train_dataset_group = train_util.load_arbitrary_dataset(args) current_epoch = Value("i", 0) current_step = Value("i", 0) @@ -165,8 +174,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() - with torch.no_grad(): - train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + + train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + vae.to("cpu") clean_memory_on_device(accelerator.device) @@ -192,6 +202,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): else: text_encoder.eval() + text_encoding_strategy = strategy_sd.SdTextEncodingStrategy(args.clip_skip) + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + if not cache_latents: vae.requires_grad_(False) vae.eval() @@ -214,7 +227,11 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): accelerator.print("prepare optimizer, data loader etc.") _, _, optimizer = train_util.get_optimizer(args, trainable_params=trainable_params) - # dataloaderを準備する + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. 
Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers train_dataloader = torch.utils.data.DataLoader( @@ -317,7 +334,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): ) # For --sample_at_first - train_util.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) + train_util.sample_images( + accelerator, args, 0, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet + ) loss_recorder = train_util.LossRecorder() for epoch in range(num_train_epochs): @@ -342,8 +361,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): with torch.set_grad_enabled(args.train_text_encoder): # Get the text embedding for conditioning if args.weighted_captions: + # TODO move to strategy_sd.py encoder_hidden_states = get_weighted_text_embeddings( - tokenizer, + tokenize_strategy.tokenizer, text_encoder, batch["captions"], accelerator.device, @@ -351,10 +371,12 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): clip_skip=args.clip_skip, ) else: - input_ids = batch["input_ids"].to(accelerator.device) - encoder_hidden_states = train_util.get_hidden_states( - args, input_ids, tokenizer, text_encoder, None if not args.full_fp16 else weight_dtype - ) + input_ids = batch["input_ids_list"][0].to(accelerator.device) + encoder_hidden_states = text_encoding_strategy.encode_tokens( + tokenize_strategy, [text_encoder], [input_ids] + )[0] + if args.full_fp16: + encoder_hidden_states = encoder_hidden_states.to(weight_dtype) # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified @@ -409,7 +431,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): global_step += 1 train_util.sample_images( - accelerator, args, None, global_step, accelerator.device, vae, tokenizer, text_encoder, unet + accelerator, args, None, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet ) # 指定ステップごとにモデルを保存 @@ -472,7 +494,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): vae, ) - train_util.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) + train_util.sample_images( + accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet + ) is_main_process = accelerator.is_main_process if is_main_process: diff --git a/library/config_util.py b/library/config_util.py index 10b2457f3..f8cdfe60a 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -104,8 +104,6 @@ class ControlNetSubsetParams(BaseSubsetParams): @dataclass class BaseDatasetParams: - tokenizer: Union[CLIPTokenizer, List[CLIPTokenizer]] = None - max_token_length: int = None resolution: Optional[Tuple[int, int]] = None network_multiplier: float = 1.0 debug_dataset: bool = False diff --git a/library/sd3_models.py b/library/sd3_models.py index ec8e1bbdd..28378c73b 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -38,7 +38,7 @@ def __init__( サブクラスで各種の設定を行ってる。このクラスはその設定に基づき重み付きのトークン化を行うようだ。 Some settings are done in subclasses. This class seems to perform tokenization with weights based on those settings. 
""" - self.tokenizer = tokenizer + self.tokenizer: CLIPTokenizer = tokenizer self.max_length = max_length self.min_length = min_length empty = self.tokenizer("")["input_ids"] @@ -56,6 +56,19 @@ def __init__( self.inv_vocab = {v: k for k, v in vocab.items()} self.max_word_length = 8 + def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: + """ + Tokenize the text without weights. + """ + if type(text) == str: + text = [text] + batch_tokens = self.tokenizer(text, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt") + # return tokens["input_ids"] + + pad_token = self.end_token if self.pad_with_end else 0 + for tokens in batch_tokens["input_ids"]: + assert tokens[0] == self.start_token, f"tokens[0]: {tokens[0]}, start_token: {self.start_token}" + def tokenize_with_weights(self, text: str, truncate_to_max_length=True, truncate_length=None): """Tokenize the text, with weight values - presume 1.0 for all and ignore other features here. The details aren't relevant for a reference impl, and weights themselves has weak effect on SD3.""" @@ -75,13 +88,14 @@ def tokenize_with_weights(self, text: str, truncate_to_max_length=True, truncate for word in to_tokenize: batch.extend([(t, 1) for t in self.tokenizer(word)["input_ids"][self.tokens_start : -1]]) batch.append((self.end_token, 1.0)) + print(len(batch), self.max_length, self.min_length) if self.pad_to_max_length: batch.extend([(pad_token, 1.0)] * (self.max_length - len(batch))) if self.min_length is not None and len(batch) < self.min_length: batch.extend([(pad_token, 1.0)] * (self.min_length - len(batch))) # truncate to max_length - # print(f"batch: {batch}, truncate: {truncate}, len(batch): {len(batch)}, max_length: {self.max_length}") + print(f"batch: {batch}, max_length: {self.max_length}, truncate: {truncate_to_max_length}, truncate_length: {truncate_length}") if truncate_to_max_length and len(batch) > self.max_length: batch = batch[: self.max_length] if truncate_length is not None and len(batch) > truncate_length: @@ -110,27 +124,38 @@ def __init__(self, tokenizer): class SD3Tokenizer: - def __init__(self, t5xxl=True): + def __init__(self, t5xxl=True, t5xxl_max_length: Optional[int] = 256): + if t5xxl_max_length is None: + t5xxl_max_length = 256 + # TODO cache tokenizer settings locally or hold them in the repo like ComfyUI clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") self.clip_l = SDTokenizer(tokenizer=clip_tokenizer) self.clip_g = SDXLClipGTokenizer(clip_tokenizer) + # self.clip_l = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + # self.clip_g = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") self.t5xxl = T5XXLTokenizer() if t5xxl else None # t5xxl has 99999999 max length, clip has 77 - self.model_max_length = self.clip_l.max_length # 77 + self.t5xxl_max_length = t5xxl_max_length def tokenize_with_weights(self, text: str): - # temporary truncate to max_length even for t5xxl return ( self.clip_l.tokenize_with_weights(text), self.clip_g.tokenize_with_weights(text), ( - self.t5xxl.tokenize_with_weights(text, truncate_to_max_length=False, truncate_length=self.model_max_length) + self.t5xxl.tokenize_with_weights(text, truncate_to_max_length=False, truncate_length=self.t5xxl_max_length) if self.t5xxl is not None else None ), ) + def tokenize(self, text: str): + return ( + self.clip_l.tokenize(text), + self.clip_g.tokenize(text), + (self.t5xxl.tokenize(text) if self.t5xxl is not None else None), + ) + # endregion @@ 
-1474,7 +1499,10 @@ def encode_token_weights(self, list_of_token_weight_pairs): tokens = [a[0] for a in pairs[0]] # I'm not sure why this is [0] list_of_tokens.append(tokens) else: - list_of_tokens = [[a[0] for a in list_of_token_weight_pairs[0]]] + if isinstance(list_of_token_weight_pairs[0], torch.Tensor): + list_of_tokens = [list(list_of_token_weight_pairs[0])] + else: + list_of_tokens = [[a[0] for a in list_of_token_weight_pairs[0]]] out, pooled = self(list_of_tokens) if has_batch: @@ -1614,9 +1642,9 @@ def set_attn_mode(self, mode): ### T5 implementation, for the T5-XXL text encoder portion, largely pulled from upstream impl ################################################################################################# - +""" class T5XXLTokenizer(SDTokenizer): - """Wraps the T5 Tokenizer from HF into the SDTokenizer interface""" + ""Wraps the T5 Tokenizer from HF into the SDTokenizer interface"" def __init__(self): super().__init__( @@ -1627,6 +1655,7 @@ def __init__(self): max_length=99999999, min_length=77, ) +""" class T5LayerNorm(torch.nn.Module): diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 245912199..8f99d9474 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -280,111 +280,6 @@ def sample_images(*args, **kwargs): return train_util.sample_images_common(SdxlStableDiffusionLongPromptWeightingPipeline, *args, **kwargs) -class Sd3LatentsCachingStrategy(train_util.LatentsCachingStrategy): - SD3_LATENTS_NPZ_SUFFIX = "_sd3.npz" - - def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: - super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check) - self.vae = None - - def set_vae(self, vae: sd3_models.SDVAE): - self.vae = vae - - def get_image_size_from_image_absolute_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: - npz_file = glob.glob(os.path.splitext(absolute_path)[0] + "_*" + Sd3LatentsCachingStrategy.SD3_LATENTS_NPZ_SUFFIX) - if len(npz_file) == 0: - return None, None - w, h = os.path.splitext(npz_file[0])[0].split("_")[-2].split("x") - return int(w), int(h) - - def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: - return ( - os.path.splitext(absolute_path)[0] - + f"_{image_size[0]:04d}x{image_size[1]:04d}" - + Sd3LatentsCachingStrategy.SD3_LATENTS_NPZ_SUFFIX - ) - - def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): - if not self.cache_to_disk: - return False - if not os.path.exists(npz_path): - return False - if self.skip_disk_cache_validity_check: - return True - - expected_latents_size = (bucket_reso[1] // 8, bucket_reso[0] // 8) # bucket_reso is (W, H) - - try: - npz = np.load(npz_path) - if npz["latents"].shape[1:3] != expected_latents_size: - return False - - if flip_aug: - if "latents_flipped" not in npz: - return False - if npz["latents_flipped"].shape[1:3] != expected_latents_size: - return False - - if alpha_mask: - if "alpha_mask" not in npz: - return False - if npz["alpha_mask"].shape[0:2] != (bucket_reso[1], bucket_reso[0]): - return False - else: - if "alpha_mask" in npz: - return False - except Exception as e: - logger.error(f"Error loading file: {npz_path}") - raise e - - return True - - def cache_batch_latents(self, image_infos: List[train_util.ImageInfo], flip_aug: bool, alpha_mask: bool, random_crop: bool): - img_tensor, alpha_masks, original_sizes, crop_ltrbs = train_util.load_images_and_masks_for_caching( - 
image_infos, alpha_mask, random_crop - ) - img_tensor = img_tensor.to(device=self.vae.device, dtype=self.vae.dtype) - - with torch.no_grad(): - latents_tensors = self.vae.encode(img_tensor).to("cpu") - if flip_aug: - img_tensor = torch.flip(img_tensor, dims=[3]) - with torch.no_grad(): - flipped_latents = self.vae.encode(img_tensor).to("cpu") - else: - flipped_latents = [None] * len(latents_tensors) - - # for info, latents, flipped_latent, alpha_mask in zip(image_infos, latents_tensors, flipped_latents, alpha_masks): - for i in range(len(image_infos)): - info = image_infos[i] - latents = latents_tensors[i] - flipped_latent = flipped_latents[i] - alpha_mask = alpha_masks[i] - original_size = original_sizes[i] - crop_ltrb = crop_ltrbs[i] - - if self.cache_to_disk: - kwargs = {} - if flipped_latent is not None: - kwargs["latents_flipped"] = flipped_latent.float().cpu().numpy() - if alpha_mask is not None: - kwargs["alpha_mask"] = alpha_mask.float().cpu().numpy() - np.savez( - info.latents_npz, - latents=latents.float().cpu().numpy(), - original_size=np.array(original_size), - crop_ltrb=np.array(crop_ltrb), - **kwargs, - ) - else: - info.latents = latents - if flip_aug: - info.latents_flipped = flipped_latent - info.alpha_mask = alpha_mask - - if not train_util.HIGH_VRAM: - clean_memory_on_device(self.vae.device) - # region Diffusers diff --git a/library/sd3_utils.py b/library/sd3_utils.py index 16f80c60d..5849518fb 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -384,6 +384,7 @@ def get_cond( dtype: Optional[torch.dtype] = None, ): l_tokens, g_tokens, t5_tokens = tokenizer.tokenize_with_weights(prompt) + print(t5_tokens) return get_cond_from_tokens(l_tokens, g_tokens, t5_tokens, clip_l, clip_g, t5xxl, device=device, dtype=dtype) diff --git a/library/sdxl_train_util.py b/library/sdxl_train_util.py index b74bea91a..f009b5779 100644 --- a/library/sdxl_train_util.py +++ b/library/sdxl_train_util.py @@ -327,7 +327,7 @@ def diffusers_saver(out_dir): ) -def add_sdxl_training_arguments(parser: argparse.ArgumentParser): +def add_sdxl_training_arguments(parser: argparse.ArgumentParser, support_text_encoder_caching: bool = True): parser.add_argument( "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする" ) diff --git a/library/strategy_base.py b/library/strategy_base.py new file mode 100644 index 000000000..594cca5eb --- /dev/null +++ b/library/strategy_base.py @@ -0,0 +1,328 @@ +# base class for platform strategies. this file defines the interface for strategies + +import os +from typing import Any, List, Optional, Tuple, Union + +import numpy as np +import torch +from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextModelWithProjection + + +# TODO remove circular import by moving ImageInfo to a separate file +# from library.train_util import ImageInfo + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +class TokenizeStrategy: + _strategy = None # strategy instance: actual strategy class + + @classmethod + def set_strategy(cls, strategy): + if cls._strategy is not None: + raise RuntimeError(f"Internal error. 
{cls.__name__} strategy is already set") + cls._strategy = strategy + + @classmethod + def get_strategy(cls) -> Optional["TokenizeStrategy"]: + return cls._strategy + + def _load_tokenizer( + self, model_class: Any, model_id: str, subfolder: Optional[str] = None, tokenizer_cache_dir: Optional[str] = None + ) -> Any: + tokenizer = None + if tokenizer_cache_dir: + local_tokenizer_path = os.path.join(tokenizer_cache_dir, model_id.replace("/", "_")) + if os.path.exists(local_tokenizer_path): + logger.info(f"load tokenizer from cache: {local_tokenizer_path}") + tokenizer = model_class.from_pretrained(local_tokenizer_path) # same for v1 and v2 + + if tokenizer is None: + tokenizer = model_class.from_pretrained(model_id, subfolder=subfolder) + + if tokenizer_cache_dir and not os.path.exists(local_tokenizer_path): + logger.info(f"save Tokenizer to cache: {local_tokenizer_path}") + tokenizer.save_pretrained(local_tokenizer_path) + + return tokenizer + + def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: + raise NotImplementedError + + def _get_input_ids(self, tokenizer: CLIPTokenizer, text: str, max_length: Optional[int] = None) -> torch.Tensor: + """ + for SD1.5/2.0/SDXL + TODO support batch input + """ + if max_length is None: + max_length = tokenizer.model_max_length - 2 + + input_ids = tokenizer(text, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt").input_ids + + if max_length > tokenizer.model_max_length: + input_ids = input_ids.squeeze(0) + iids_list = [] + if tokenizer.pad_token_id == tokenizer.eos_token_id: + # v1 + # 77以上の時は " .... " でトータル227とかになっているので、"..."の三連に変換する + # 1111氏のやつは , で区切る、とかしているようだが とりあえず単純に + for i in range(1, max_length - tokenizer.model_max_length + 2, tokenizer.model_max_length - 2): # (1, 152, 75) + ids_chunk = ( + input_ids[0].unsqueeze(0), + input_ids[i : i + tokenizer.model_max_length - 2], + input_ids[-1].unsqueeze(0), + ) + ids_chunk = torch.cat(ids_chunk) + iids_list.append(ids_chunk) + else: + # v2 or SDXL + # 77以上の時は " .... ..." でトータル227とかになっているので、"... ..."の三連に変換する + for i in range(1, max_length - tokenizer.model_max_length + 2, tokenizer.model_max_length - 2): + ids_chunk = ( + input_ids[0].unsqueeze(0), # BOS + input_ids[i : i + tokenizer.model_max_length - 2], + input_ids[-1].unsqueeze(0), + ) # PAD or EOS + ids_chunk = torch.cat(ids_chunk) + + # 末尾が または の場合は、何もしなくてよい + # 末尾が x の場合は末尾を に変える(x なら結果的に変化なし) + if ids_chunk[-2] != tokenizer.eos_token_id and ids_chunk[-2] != tokenizer.pad_token_id: + ids_chunk[-1] = tokenizer.eos_token_id + # 先頭が ... の場合は ... に変える + if ids_chunk[1] == tokenizer.pad_token_id: + ids_chunk[1] = tokenizer.eos_token_id + + iids_list.append(ids_chunk) + + input_ids = torch.stack(iids_list) # 3,77 + return input_ids + + +class TextEncodingStrategy: + _strategy = None # strategy instance: actual strategy class + + @classmethod + def set_strategy(cls, strategy): + if cls._strategy is not None: + raise RuntimeError(f"Internal error. {cls.__name__} strategy is already set") + cls._strategy = strategy + + @classmethod + def get_strategy(cls) -> Optional["TextEncodingStrategy"]: + return cls._strategy + + def encode_tokens( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor] + ) -> List[torch.Tensor]: + """ + Encode tokens into embeddings and outputs. 
+ :param tokens: list of token tensors for each TextModel + :return: list of output embeddings for each architecture + """ + raise NotImplementedError + + +class TextEncoderOutputsCachingStrategy: + _strategy = None # strategy instance: actual strategy class + + def __init__( + self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool, is_partial: bool = False + ) -> None: + self._cache_to_disk = cache_to_disk + self._batch_size = batch_size + self.skip_disk_cache_validity_check = skip_disk_cache_validity_check + self._is_partial = is_partial + + @classmethod + def set_strategy(cls, strategy): + if cls._strategy is not None: + raise RuntimeError(f"Internal error. {cls.__name__} strategy is already set") + cls._strategy = strategy + + @classmethod + def get_strategy(cls) -> Optional["TextEncoderOutputsCachingStrategy"]: + return cls._strategy + + @property + def cache_to_disk(self): + return self._cache_to_disk + + @property + def batch_size(self): + return self._batch_size + + @property + def is_partial(self): + return self._is_partial + + def get_outputs_npz_path(self, image_abs_path: str) -> str: + raise NotImplementedError + + def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: + raise NotImplementedError + + def is_disk_cached_outputs_expected(self, npz_path: str) -> bool: + raise NotImplementedError + + def cache_batch_outputs( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, batch: List + ): + raise NotImplementedError + + +class LatentsCachingStrategy: + # TODO commonize utillity functions to this class, such as npz handling etc. + + _strategy = None # strategy instance: actual strategy class + + def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: + self._cache_to_disk = cache_to_disk + self._batch_size = batch_size + self.skip_disk_cache_validity_check = skip_disk_cache_validity_check + + @classmethod + def set_strategy(cls, strategy): + if cls._strategy is not None: + raise RuntimeError(f"Internal error. 
{cls.__name__} strategy is already set") + cls._strategy = strategy + + @classmethod + def get_strategy(cls) -> Optional["LatentsCachingStrategy"]: + return cls._strategy + + @property + def cache_to_disk(self): + return self._cache_to_disk + + @property + def batch_size(self): + return self._batch_size + + def get_image_size_from_disk_cache_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: + raise NotImplementedError + + def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: + raise NotImplementedError + + def is_disk_cached_latents_expected( + self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool + ) -> bool: + raise NotImplementedError + + def cache_batch_latents(self, model: Any, batch: List, flip_aug: bool, alpha_mask: bool, random_crop: bool): + raise NotImplementedError + + def _defualt_is_disk_cached_latents_expected( + self, latents_stride: int, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool + ): + if not self.cache_to_disk: + return False + if not os.path.exists(npz_path): + return False + if self.skip_disk_cache_validity_check: + return True + + expected_latents_size = (bucket_reso[1] // latents_stride, bucket_reso[0] // latents_stride) # bucket_reso is (W, H) + + try: + npz = np.load(npz_path) + if npz["latents"].shape[1:3] != expected_latents_size: + return False + + if flip_aug: + if "latents_flipped" not in npz: + return False + if npz["latents_flipped"].shape[1:3] != expected_latents_size: + return False + + if alpha_mask: + if "alpha_mask" not in npz: + return False + if npz["alpha_mask"].shape[0:2] != (bucket_reso[1], bucket_reso[0]): + return False + else: + if "alpha_mask" in npz: + return False + except Exception as e: + logger.error(f"Error loading file: {npz_path}") + raise e + + return True + + # TODO remove circular dependency for ImageInfo + def _default_cache_batch_latents( + self, encode_by_vae, vae_device, vae_dtype, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool + ): + """ + Default implementation for cache_batch_latents. Image loading, VAE, flipping, alpha mask handling are common. 
+ """ + from library import train_util # import here to avoid circular import + + img_tensor, alpha_masks, original_sizes, crop_ltrbs = train_util.load_images_and_masks_for_caching( + image_infos, alpha_mask, random_crop + ) + img_tensor = img_tensor.to(device=vae_device, dtype=vae_dtype) + + with torch.no_grad(): + latents_tensors = encode_by_vae(img_tensor).to("cpu") + if flip_aug: + img_tensor = torch.flip(img_tensor, dims=[3]) + with torch.no_grad(): + flipped_latents = encode_by_vae(img_tensor).to("cpu") + else: + flipped_latents = [None] * len(latents_tensors) + + # for info, latents, flipped_latent, alpha_mask in zip(image_infos, latents_tensors, flipped_latents, alpha_masks): + for i in range(len(image_infos)): + info = image_infos[i] + latents = latents_tensors[i] + flipped_latent = flipped_latents[i] + alpha_mask = alpha_masks[i] + original_size = original_sizes[i] + crop_ltrb = crop_ltrbs[i] + + if self.cache_to_disk: + self.save_latents_to_disk(info.latents_npz, latents, original_size, crop_ltrb, flipped_latent, alpha_mask) + else: + info.latents_original_size = original_size + info.latents_crop_ltrb = crop_ltrb + info.latents = latents + if flip_aug: + info.latents_flipped = flipped_latent + info.alpha_mask = alpha_mask + + def load_latents_from_disk( + self, npz_path: str + ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: + npz = np.load(npz_path) + if "latents" not in npz: + raise ValueError(f"error: npz is old format. please re-generate {npz_path}") + + latents = npz["latents"] + original_size = npz["original_size"].tolist() + crop_ltrb = npz["crop_ltrb"].tolist() + flipped_latents = npz["latents_flipped"] if "latents_flipped" in npz else None + alpha_mask = npz["alpha_mask"] if "alpha_mask" in npz else None + return latents, original_size, crop_ltrb, flipped_latents, alpha_mask + + def save_latents_to_disk( + self, npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None, alpha_mask=None + ): + kwargs = {} + if flipped_latents_tensor is not None: + kwargs["latents_flipped"] = flipped_latents_tensor.float().cpu().numpy() + if alpha_mask is not None: + kwargs["alpha_mask"] = alpha_mask.float().cpu().numpy() + np.savez( + npz_path, + latents=latents_tensor.float().cpu().numpy(), + original_size=np.array(original_size), + crop_ltrb=np.array(crop_ltrb), + **kwargs, + ) diff --git a/library/strategy_sd.py b/library/strategy_sd.py new file mode 100644 index 000000000..105816145 --- /dev/null +++ b/library/strategy_sd.py @@ -0,0 +1,139 @@ +import glob +import os +from typing import Any, List, Optional, Tuple, Union + +import torch +from transformers import CLIPTokenizer +from library import train_util +from library.strategy_base import LatentsCachingStrategy, TokenizeStrategy, TextEncodingStrategy +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +TOKENIZER_ID = "openai/clip-vit-large-patch14" +V2_STABLE_DIFFUSION_ID = "stabilityai/stable-diffusion-2" # ここからtokenizerだけ使う v2とv2.1はtokenizer仕様は同じ + + +class SdTokenizeStrategy(TokenizeStrategy): + def __init__(self, v2: bool, max_length: Optional[int], tokenizer_cache_dir: Optional[str] = None) -> None: + """ + max_length does not include and (None, 75, 150, 225) + """ + logger.info(f"Using {'v2' if v2 else 'v1'} tokenizer") + if v2: + self.tokenizer = self._load_tokenizer( + CLIPTokenizer, V2_STABLE_DIFFUSION_ID, subfolder="tokenizer", 
tokenizer_cache_dir=tokenizer_cache_dir + ) + else: + self.tokenizer = self._load_tokenizer(CLIPTokenizer, TOKENIZER_ID, tokenizer_cache_dir=tokenizer_cache_dir) + + if max_length is None: + self.max_length = self.tokenizer.model_max_length + else: + self.max_length = max_length + 2 + + def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: + text = [text] if isinstance(text, str) else text + return [torch.stack([self._get_input_ids(self.tokenizer, t, self.max_length) for t in text], dim=0)] + + +class SdTextEncodingStrategy(TextEncodingStrategy): + def __init__(self, clip_skip: Optional[int] = None) -> None: + self.clip_skip = clip_skip + + def encode_tokens( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor] + ) -> List[torch.Tensor]: + text_encoder = models[0] + tokens = tokens[0] + sd_tokenize_strategy = tokenize_strategy # type: SdTokenizeStrategy + + # tokens: b,n,77 + b_size = tokens.size()[0] + max_token_length = tokens.size()[1] * tokens.size()[2] + model_max_length = sd_tokenize_strategy.tokenizer.model_max_length + tokens = tokens.reshape((-1, model_max_length)) # batch_size*3, 77 + + if self.clip_skip is None: + encoder_hidden_states = text_encoder(tokens)[0] + else: + enc_out = text_encoder(tokens, output_hidden_states=True, return_dict=True) + encoder_hidden_states = enc_out["hidden_states"][-self.clip_skip] + encoder_hidden_states = text_encoder.text_model.final_layer_norm(encoder_hidden_states) + + # bs*3, 77, 768 or 1024 + encoder_hidden_states = encoder_hidden_states.reshape((b_size, -1, encoder_hidden_states.shape[-1])) + + if max_token_length != model_max_length: + v1 = sd_tokenize_strategy.tokenizer.pad_token_id == sd_tokenize_strategy.tokenizer.eos_token_id + if not v1: + # v2: ... ... の三連を ... ... へ戻す 正直この実装でいいのかわからん + states_list = [encoder_hidden_states[:, 0].unsqueeze(1)] # + for i in range(1, max_token_length, model_max_length): + chunk = encoder_hidden_states[:, i : i + model_max_length - 2] # の後から 最後の前まで + if i > 0: + for j in range(len(chunk)): + if tokens[j, 1] == sd_tokenize_strategy.tokenizer.eos_token: + # 空、つまり ...のパターン + chunk[j, 0] = chunk[j, 1] # 次の の値をコピーする + states_list.append(chunk) # の後から の前まで + states_list.append(encoder_hidden_states[:, -1].unsqueeze(1)) # のどちらか + encoder_hidden_states = torch.cat(states_list, dim=1) + else: + # v1: ... の三連を ... へ戻す + states_list = [encoder_hidden_states[:, 0].unsqueeze(1)] # + for i in range(1, max_token_length, model_max_length): + states_list.append(encoder_hidden_states[:, i : i + model_max_length - 2]) # の後から の前まで + states_list.append(encoder_hidden_states[:, -1].unsqueeze(1)) # + encoder_hidden_states = torch.cat(states_list, dim=1) + + return [encoder_hidden_states] + + +class SdSdxlLatentsCachingStrategy(LatentsCachingStrategy): + # sd and sdxl share the same strategy. we can make them separate, but the difference is only the suffix. + # and we keep the old npz for the backward compatibility. 
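    # Illustration of the naming scheme (hypothetical paths, for example only; the suffix depends on the sd flag passed to __init__):
    #   get_latents_npz_path("/data/img/0001.png", (768, 1024)) -> "/data/img/0001_0768x1024_sd.npz"   (SD)
    #   get_latents_npz_path("/data/img/0001.png", (768, 1024)) -> "/data/img/0001_0768x1024_sdxl.npz" (SDXL)
    #   an already existing old-format "/data/img/0001.npz" is returned unchanged.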
+ + SD_OLD_LATENTS_NPZ_SUFFIX = ".npz" + SD_LATENTS_NPZ_SUFFIX = "_sd.npz" + SDXL_LATENTS_NPZ_SUFFIX = "_sdxl.npz" + + def __init__(self, sd: bool, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check) + self.sd = sd + self.suffix = ( + SdSdxlLatentsCachingStrategy.SD_LATENTS_NPZ_SUFFIX if sd else SdSdxlLatentsCachingStrategy.SDXL_LATENTS_NPZ_SUFFIX + ) + + def get_image_size_from_disk_cache_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: + # does not include old npz + npz_file = glob.glob(os.path.splitext(absolute_path)[0] + "_*" + self.suffix) + if len(npz_file) == 0: + return None, None + w, h = os.path.splitext(npz_file[0])[0].split("_")[-2].split("x") + return int(w), int(h) + + def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: + # support old .npz + old_npz_file = os.path.splitext(absolute_path)[0] + SdSdxlLatentsCachingStrategy.SD_OLD_LATENTS_NPZ_SUFFIX + if os.path.exists(old_npz_file): + return old_npz_file + return os.path.splitext(absolute_path)[0] + f"_{image_size[0]:04d}x{image_size[1]:04d}" + self.suffix + + def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): + return self._defualt_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask) + + # TODO remove circular dependency for ImageInfo + def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool): + encode_by_vae = lambda img_tensor: vae.encode(img_tensor).latent_dist.sample() + vae_device = vae.device + vae_dtype = vae.dtype + + self._default_cache_batch_latents(encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop) + + if not train_util.HIGH_VRAM: + train_util.clean_memory_on_device(vae.device) diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py new file mode 100644 index 000000000..42630ab22 --- /dev/null +++ b/library/strategy_sd3.py @@ -0,0 +1,229 @@ +import os +import glob +from typing import Any, List, Optional, Tuple, Union +import torch +import numpy as np +from transformers import CLIPTokenizer, T5TokenizerFast + +from library import sd3_utils, train_util +from library import sd3_models +from library.strategy_base import LatentsCachingStrategy, TextEncodingStrategy, TokenizeStrategy, TextEncoderOutputsCachingStrategy + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +CLIP_L_TOKENIZER_ID = "openai/clip-vit-large-patch14" +CLIP_G_TOKENIZER_ID = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" +T5_XXL_TOKENIZER_ID = "google/t5-v1_1-xxl" + + +class Sd3TokenizeStrategy(TokenizeStrategy): + def __init__(self, t5xxl_max_length: int = 256, tokenizer_cache_dir: Optional[str] = None) -> None: + self.t5xxl_max_length = t5xxl_max_length + self.clip_l = self._load_tokenizer(CLIPTokenizer, CLIP_L_TOKENIZER_ID, tokenizer_cache_dir=tokenizer_cache_dir) + self.clip_g = self._load_tokenizer(CLIPTokenizer, CLIP_G_TOKENIZER_ID, tokenizer_cache_dir=tokenizer_cache_dir) + self.t5xxl = self._load_tokenizer(T5TokenizerFast, T5_XXL_TOKENIZER_ID, tokenizer_cache_dir=tokenizer_cache_dir) + self.clip_g.pad_token_id = 0 # use 0 as pad token for clip_g + + def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: + text = [text] if isinstance(text, str) else text + + l_tokens = self.clip_l(text, max_length=77, 
padding="max_length", truncation=True, return_tensors="pt") + g_tokens = self.clip_g(text, max_length=77, padding="max_length", truncation=True, return_tensors="pt") + t5_tokens = self.t5xxl(text, max_length=self.t5xxl_max_length, padding="max_length", truncation=True, return_tensors="pt") + + l_tokens = l_tokens["input_ids"] + g_tokens = g_tokens["input_ids"] + t5_tokens = t5_tokens["input_ids"] + + return [l_tokens, g_tokens, t5_tokens] + + +class Sd3TextEncodingStrategy(TextEncodingStrategy): + def __init__(self) -> None: + pass + + def encode_tokens( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor] + ) -> List[torch.Tensor]: + clip_l, clip_g, t5xxl = models + + l_tokens, g_tokens, t5_tokens = tokens + if l_tokens is None: + assert g_tokens is None, "g_tokens must be None if l_tokens is None" + lg_out = None + else: + assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None" + l_out, l_pooled = clip_l(l_tokens) + g_out, g_pooled = clip_g(g_tokens) + lg_out = torch.cat([l_out, g_out], dim=-1) + + if t5xxl is not None and t5_tokens is not None: + t5_out, _ = t5xxl(t5_tokens) # t5_out is [1, max length, 4096] + else: + t5_out = None + + lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) if l_tokens is not None else None + return [lg_out, t5_out, lg_pooled] + + def concat_encodings( + self, lg_out: torch.Tensor, t5_out: Optional[torch.Tensor], lg_pooled: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1])) + if t5_out is None: + t5_out = torch.zeros((lg_out.shape[0], 77, 4096), device=lg_out.device, dtype=lg_out.dtype) + return torch.cat([lg_out, t5_out], dim=-2), lg_pooled + + +class Sd3TextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): + SD3_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX = "_sd3_te.npz" + + def __init__( + self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool, is_partial: bool = False + ) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check, is_partial) + + def get_outputs_npz_path(self, image_abs_path: str) -> str: + return os.path.splitext(image_abs_path)[0] + Sd3TextEncoderOutputsCachingStrategy.SD3_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX + + def is_disk_cached_outputs_expected(self, abs_path: str): + if not self.cache_to_disk: + return False + if not os.path.exists(self.get_outputs_npz_path(abs_path)): + return False + if self.skip_disk_cache_validity_check: + return True + + try: + npz = np.load(self.get_outputs_npz_path(abs_path)) + if "clip_l" not in npz or "clip_g" not in npz: + return False + if "clip_l_pool" not in npz or "clip_g_pool" not in npz: + return False + # t5xxl is optional + except Exception as e: + logger.error(f"Error loading file: {self.get_outputs_npz_path(abs_path)}") + raise e + + return True + + def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: + data = np.load(npz_path) + lg_out = data["lg_out"] + lg_pooled = data["lg_pooled"] + t5_out = data["t5_out"] if "t5_out" in data else None + return [lg_out, t5_out, lg_pooled] + + def cache_batch_outputs( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, infos: List + ): + captions = [info.caption for info in infos] + + clip_l_tokens, clip_g_tokens, t5xxl_tokens = tokenize_strategy.tokenize(captions) + with torch.no_grad(): + lg_out, t5_out, lg_pooled = text_encoding_strategy.encode_tokens( + tokenize_strategy, models, [clip_l_tokens, 
clip_g_tokens, t5xxl_tokens] + ) + + if lg_out.dtype == torch.bfloat16: + lg_out = lg_out.float() + if lg_pooled.dtype == torch.bfloat16: + lg_pooled = lg_pooled.float() + if t5_out is not None and t5_out.dtype == torch.bfloat16: + t5_out = t5_out.float() + + lg_out = lg_out.cpu().numpy() + lg_pooled = lg_pooled.cpu().numpy() + if t5_out is not None: + t5_out = t5_out.cpu().numpy() + + for i, info in enumerate(infos): + lg_out_i = lg_out[i] + t5_out_i = t5_out[i] if t5_out is not None else None + lg_pooled_i = lg_pooled[i] + + if self.cache_to_disk: + kwargs = {} + if t5_out is not None: + kwargs["t5_out"] = t5_out_i + np.savez(info.text_encoder_outputs_npz, lg_out=lg_out_i, lg_pooled=lg_pooled_i, **kwargs) + else: + info.text_encoder_outputs = (lg_out_i, t5_out_i, lg_pooled_i) + + +class Sd3LatentsCachingStrategy(LatentsCachingStrategy): + SD3_LATENTS_NPZ_SUFFIX = "_sd3.npz" + + def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check) + + def get_image_size_from_disk_cache_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: + npz_file = glob.glob(os.path.splitext(absolute_path)[0] + "_*" + Sd3LatentsCachingStrategy.SD3_LATENTS_NPZ_SUFFIX) + if len(npz_file) == 0: + return None, None + w, h = os.path.splitext(npz_file[0])[0].split("_")[-2].split("x") + return int(w), int(h) + + def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: + return ( + os.path.splitext(absolute_path)[0] + + f"_{image_size[0]:04d}x{image_size[1]:04d}" + + Sd3LatentsCachingStrategy.SD3_LATENTS_NPZ_SUFFIX + ) + + def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): + return self._defualt_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask) + + # TODO remove circular dependency for ImageInfo + def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool): + encode_by_vae = lambda img_tensor: vae.encode(img_tensor).to("cpu") + vae_device = vae.device + vae_dtype = vae.dtype + + self._default_cache_batch_latents(encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop) + + if not train_util.HIGH_VRAM: + train_util.clean_memory_on_device(vae.device) + + +if __name__ == "__main__": + # test code for Sd3TokenizeStrategy + # tokenizer = sd3_models.SD3Tokenizer() + strategy = Sd3TokenizeStrategy(256) + text = "hello world" + + l_tokens, g_tokens, t5_tokens = strategy.tokenize(text) + # print(l_tokens.shape) + print(l_tokens) + print(g_tokens) + print(t5_tokens) + + texts = ["hello world", "the quick brown fox jumps over the lazy dog"] + l_tokens_2 = strategy.clip_l(texts, max_length=77, padding="max_length", truncation=True, return_tensors="pt") + g_tokens_2 = strategy.clip_g(texts, max_length=77, padding="max_length", truncation=True, return_tensors="pt") + t5_tokens_2 = strategy.t5xxl( + texts, max_length=strategy.t5xxl_max_length, padding="max_length", truncation=True, return_tensors="pt" + ) + print(l_tokens_2) + print(g_tokens_2) + print(t5_tokens_2) + + # compare + print(torch.allclose(l_tokens, l_tokens_2["input_ids"][0])) + print(torch.allclose(g_tokens, g_tokens_2["input_ids"][0])) + print(torch.allclose(t5_tokens, t5_tokens_2["input_ids"][0])) + + text = ",".join(["hello world! 
this is long text"] * 50) + l_tokens, g_tokens, t5_tokens = strategy.tokenize(text) + print(l_tokens) + print(g_tokens) + print(t5_tokens) + + print(f"model max length l: {strategy.clip_l.model_max_length}") + print(f"model max length g: {strategy.clip_g.model_max_length}") + print(f"model max length t5: {strategy.t5xxl.model_max_length}") diff --git a/library/strategy_sdxl.py b/library/strategy_sdxl.py new file mode 100644 index 000000000..a4513336d --- /dev/null +++ b/library/strategy_sdxl.py @@ -0,0 +1,247 @@ +import os +from typing import Any, List, Optional, Tuple, Union + +import numpy as np +import torch +from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextModelWithProjection +from library.strategy_base import TokenizeStrategy, TextEncodingStrategy, TextEncoderOutputsCachingStrategy + + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +TOKENIZER1_PATH = "openai/clip-vit-large-patch14" +TOKENIZER2_PATH = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" + + +class SdxlTokenizeStrategy(TokenizeStrategy): + def __init__(self, max_length: Optional[int], tokenizer_cache_dir: Optional[str] = None) -> None: + self.tokenizer1 = self._load_tokenizer(CLIPTokenizer, TOKENIZER1_PATH, tokenizer_cache_dir=tokenizer_cache_dir) + self.tokenizer2 = self._load_tokenizer(CLIPTokenizer, TOKENIZER2_PATH, tokenizer_cache_dir=tokenizer_cache_dir) + self.tokenizer2.pad_token_id = 0 # use 0 as pad token for tokenizer2 + + if max_length is None: + self.max_length = self.tokenizer1.model_max_length + else: + self.max_length = max_length + 2 + + def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: + text = [text] if isinstance(text, str) else text + return ( + torch.stack([self._get_input_ids(self.tokenizer1, t, self.max_length) for t in text], dim=0), + torch.stack([self._get_input_ids(self.tokenizer2, t, self.max_length) for t in text], dim=0), + ) + + +class SdxlTextEncodingStrategy(TextEncodingStrategy): + def __init__(self) -> None: + pass + + def _pool_workaround( + self, text_encoder: CLIPTextModelWithProjection, last_hidden_state: torch.Tensor, input_ids: torch.Tensor, eos_token_id: int + ): + r""" + workaround for CLIP's pooling bug: it returns the hidden states for the max token id as the pooled output + instead of the hidden states for the EOS token + If we use Textual Inversion, we need to use the hidden states for the EOS token as the pooled output + + Original code from CLIP's pooling function: + + \# text_embeds.shape = [batch_size, sequence_length, transformer.width] + \# take features from the eot embedding (eot_token is the highest number in each sequence) + \# casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1), + ] + """ + + # input_ids: b*n,77 + # find index for EOS token + + # Following code is not working if one of the input_ids has multiple EOS tokens (very odd case) + # eos_token_index = torch.where(input_ids == eos_token_id)[1] + # eos_token_index = eos_token_index.to(device=last_hidden_state.device) + + # Create a mask where the EOS tokens are + eos_token_mask = (input_ids == eos_token_id).int() + + # Use argmax to find the last index of the EOS token for each element in the batch + eos_token_index = torch.argmax(eos_token_mask, dim=1) # this will be 0 if 
there is no EOS token, it's fine + eos_token_index = eos_token_index.to(device=last_hidden_state.device) + + # get hidden states for EOS token + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), eos_token_index + ] + + # apply projection: projection may be of different dtype than last_hidden_state + pooled_output = text_encoder.text_projection(pooled_output.to(text_encoder.text_projection.weight.dtype)) + pooled_output = pooled_output.to(last_hidden_state.dtype) + + return pooled_output + + def _get_hidden_states_sdxl( + self, + input_ids1: torch.Tensor, + input_ids2: torch.Tensor, + tokenizer1: CLIPTokenizer, + tokenizer2: CLIPTokenizer, + text_encoder1: Union[CLIPTextModel, torch.nn.Module], + text_encoder2: Union[CLIPTextModelWithProjection, torch.nn.Module], + unwrapped_text_encoder2: Optional[CLIPTextModelWithProjection] = None, + ): + # input_ids: b,n,77 -> b*n, 77 + b_size = input_ids1.size()[0] + max_token_length = input_ids1.size()[1] * input_ids1.size()[2] + input_ids1 = input_ids1.reshape((-1, tokenizer1.model_max_length)) # batch_size*n, 77 + input_ids2 = input_ids2.reshape((-1, tokenizer2.model_max_length)) # batch_size*n, 77 + input_ids1 = input_ids1.to(text_encoder1.device) + input_ids2 = input_ids2.to(text_encoder2.device) + + # text_encoder1 + enc_out = text_encoder1(input_ids1, output_hidden_states=True, return_dict=True) + hidden_states1 = enc_out["hidden_states"][11] + + # text_encoder2 + enc_out = text_encoder2(input_ids2, output_hidden_states=True, return_dict=True) + hidden_states2 = enc_out["hidden_states"][-2] # penuultimate layer + + # pool2 = enc_out["text_embeds"] + unwrapped_text_encoder2 = unwrapped_text_encoder2 or text_encoder2 + pool2 = self._pool_workaround(unwrapped_text_encoder2, enc_out["last_hidden_state"], input_ids2, tokenizer2.eos_token_id) + + # b*n, 77, 768 or 1280 -> b, n*77, 768 or 1280 + n_size = 1 if max_token_length is None else max_token_length // 75 + hidden_states1 = hidden_states1.reshape((b_size, -1, hidden_states1.shape[-1])) + hidden_states2 = hidden_states2.reshape((b_size, -1, hidden_states2.shape[-1])) + + if max_token_length is not None: + # bs*3, 77, 768 or 1024 + # encoder1: ... の三連を ... へ戻す + states_list = [hidden_states1[:, 0].unsqueeze(1)] # + for i in range(1, max_token_length, tokenizer1.model_max_length): + states_list.append(hidden_states1[:, i : i + tokenizer1.model_max_length - 2]) # の後から の前まで + states_list.append(hidden_states1[:, -1].unsqueeze(1)) # + hidden_states1 = torch.cat(states_list, dim=1) + + # v2: ... ... の三連を ... ... 
へ戻す 正直この実装でいいのかわからん + states_list = [hidden_states2[:, 0].unsqueeze(1)] # + for i in range(1, max_token_length, tokenizer2.model_max_length): + chunk = hidden_states2[:, i : i + tokenizer2.model_max_length - 2] # の後から 最後の前まで + # this causes an error: + # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation + # if i > 1: + # for j in range(len(chunk)): # batch_size + # if input_ids2[n_index + j * n_size, 1] == tokenizer2.eos_token_id: # 空、つまり ...のパターン + # chunk[j, 0] = chunk[j, 1] # 次の の値をコピーする + states_list.append(chunk) # の後から の前まで + states_list.append(hidden_states2[:, -1].unsqueeze(1)) # のどちらか + hidden_states2 = torch.cat(states_list, dim=1) + + # pool はnの最初のものを使う + pool2 = pool2[::n_size] + + return hidden_states1, hidden_states2, pool2 + + def encode_tokens( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor] + ) -> List[torch.Tensor]: + """ + Args: + tokenize_strategy: TokenizeStrategy + models: List of models, [text_encoder1, text_encoder2, unwrapped text_encoder2 (optional)] + tokens: List of tokens, for text_encoder1 and text_encoder2 + """ + if len(models) == 2: + text_encoder1, text_encoder2 = models + unwrapped_text_encoder2 = None + else: + text_encoder1, text_encoder2, unwrapped_text_encoder2 = models + tokens1, tokens2 = tokens + sdxl_tokenize_strategy = tokenize_strategy # type: SdxlTokenizeStrategy + tokenizer1, tokenizer2 = sdxl_tokenize_strategy.tokenizer1, sdxl_tokenize_strategy.tokenizer2 + + hidden_states1, hidden_states2, pool2 = self._get_hidden_states_sdxl( + tokens1, tokens2, tokenizer1, tokenizer2, text_encoder1, text_encoder2, unwrapped_text_encoder2 + ) + return [hidden_states1, hidden_states2, pool2] + + +class SdxlTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): + SDXL_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX = "_te_outputs.npz" + + def __init__( + self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool, is_partial: bool = False + ) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check, is_partial) + + def get_outputs_npz_path(self, image_abs_path: str) -> str: + return os.path.splitext(image_abs_path)[0] + SdxlTextEncoderOutputsCachingStrategy.SDXL_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX + + def is_disk_cached_outputs_expected(self, abs_path: str): + if not self.cache_to_disk: + return False + if not os.path.exists(self.get_outputs_npz_path(abs_path)): + return False + if self.skip_disk_cache_validity_check: + return True + + try: + npz = np.load(self.get_outputs_npz_path(abs_path)) + if "hidden_state1" not in npz or "hidden_state2" not in npz or "pool2" not in npz: + return False + except Exception as e: + logger.error(f"Error loading file: {self.get_outputs_npz_path(abs_path)}") + raise e + + return True + + def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: + data = np.load(npz_path) + hidden_state1 = data["hidden_state1"] + hidden_state2 = data["hidden_state2"] + pool2 = data["pool2"] + return [hidden_state1, hidden_state2, pool2] + + def cache_batch_outputs( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, infos: List + ): + sdxl_text_encoding_strategy = text_encoding_strategy # type: SdxlTextEncodingStrategy + captions = [info.caption for info in infos] + + tokens1, tokens2 = tokenize_strategy.tokenize(captions) + with torch.no_grad(): + hidden_state1, hidden_state2, pool2 = sdxl_text_encoding_strategy.encode_tokens( + 
tokenize_strategy, models, [tokens1, tokens2] + ) + if hidden_state1.dtype == torch.bfloat16: + hidden_state1 = hidden_state1.float() + if hidden_state2.dtype == torch.bfloat16: + hidden_state2 = hidden_state2.float() + if pool2.dtype == torch.bfloat16: + pool2 = pool2.float() + + hidden_state1 = hidden_state1.cpu().numpy() + hidden_state2 = hidden_state2.cpu().numpy() + pool2 = pool2.cpu().numpy() + + for i, info in enumerate(infos): + hidden_state1_i = hidden_state1[i] + hidden_state2_i = hidden_state2[i] + pool2_i = pool2[i] + + if self.cache_to_disk: + np.savez( + info.text_encoder_outputs_npz, + hidden_state1=hidden_state1_i, + hidden_state2=hidden_state2_i, + pool2=pool2_i, + ) + else: + info.text_encoder_outputs = [hidden_state1_i, hidden_state2_i, pool2_i] diff --git a/library/train_util.py b/library/train_util.py index 7af0070e1..a747e0478 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -12,6 +12,7 @@ import shutil import time from typing import ( + Any, Dict, List, NamedTuple, @@ -34,6 +35,7 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device +from library.strategy_base import LatentsCachingStrategy, TokenizeStrategy, TextEncoderOutputsCachingStrategy, TextEncodingStrategy init_ipex() @@ -81,10 +83,6 @@ # from library.hypernetwork import replace_attentions_for_hypernetwork from library.original_unet import UNet2DConditionModel -# Tokenizer: checkpointから読み込むのではなくあらかじめ提供されているものを使う -TOKENIZER_PATH = "openai/clip-vit-large-patch14" -V2_STABLE_DIFFUSION_PATH = "stabilityai/stable-diffusion-2" # ここからtokenizerだけ使う v2とv2.1はtokenizer仕様は同じ - HIGH_VRAM = False # checkpointファイル名 @@ -148,18 +146,24 @@ def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, self.image_size: Tuple[int, int] = None self.resized_size: Tuple[int, int] = None self.bucket_reso: Tuple[int, int] = None - self.latents: torch.Tensor = None - self.latents_flipped: torch.Tensor = None - self.latents_npz: str = None - self.latents_original_size: Tuple[int, int] = None # original image size, not latents size - self.latents_crop_ltrb: Tuple[int, int] = None # crop left top right bottom in original pixel size, not latents size - self.cond_img_path: str = None + self.latents: Optional[torch.Tensor] = None + self.latents_flipped: Optional[torch.Tensor] = None + self.latents_npz: Optional[str] = None # set in cache_latents + self.latents_original_size: Optional[Tuple[int, int]] = None # original image size, not latents size + self.latents_crop_ltrb: Optional[Tuple[int, int]] = ( + None # crop left top right bottom in original pixel size, not latents size + ) + self.cond_img_path: Optional[str] = None self.image: Optional[Image.Image] = None # optional, original PIL Image - # SDXL, optional - self.text_encoder_outputs_npz: Optional[str] = None + self.text_encoder_outputs_npz: Optional[str] = None # set in cache_text_encoder_outputs + + # new + self.text_encoder_outputs: Optional[List[torch.Tensor]] = None + # old self.text_encoder_outputs1: Optional[torch.Tensor] = None self.text_encoder_outputs2: Optional[torch.Tensor] = None self.text_encoder_pool2: Optional[torch.Tensor] = None + self.alpha_mask: Optional[torch.Tensor] = None # alpha mask can be flipped in runtime @@ -359,47 +363,6 @@ def get_augmentor(self, use_color_aug: bool): # -> Optional[Callable[[np.ndarra return self.color_aug if use_color_aug else None -class LatentsCachingStrategy: - _strategy = None # strategy instance: actual strategy class - - def __init__(self, cache_to_disk: bool, 
batch_size: int, skip_disk_cache_validity_check: bool) -> None: - self._cache_to_disk = cache_to_disk - self._batch_size = batch_size - self.skip_disk_cache_validity_check = skip_disk_cache_validity_check - - @classmethod - def set_strategy(cls, strategy): - if cls._strategy is not None: - raise RuntimeError(f"Internal error. {cls.__name__} strategy is already set") - cls._strategy = strategy - - @classmethod - def get_strategy(cls) -> Optional["LatentsCachingStrategy"]: - return cls._strategy - - @property - def cache_to_disk(self): - return self._cache_to_disk - - @property - def batch_size(self): - return self._batch_size - - def get_image_size_from_image_absolute_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: - raise NotImplementedError - - def get_latents_npz_path(self, absolute_path: str, bucket_reso: Tuple[int, int]) -> str: - raise NotImplementedError - - def is_disk_cached_latents_expected( - self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool - ) -> bool: - raise NotImplementedError - - def cache_batch_latents(self, batch: List[ImageInfo], flip_aug: bool, alpha_mask: bool, random_crop: bool): - raise NotImplementedError - - class BaseSubset: def __init__( self, @@ -639,17 +602,12 @@ def __eq__(self, other) -> bool: class BaseDataset(torch.utils.data.Dataset): def __init__( self, - tokenizer: Union[CLIPTokenizer, List[CLIPTokenizer]], - max_token_length: int, resolution: Optional[Tuple[int, int]], network_multiplier: float, debug_dataset: bool, ) -> None: super().__init__() - self.tokenizers = tokenizer if isinstance(tokenizer, list) else [tokenizer] - - self.max_token_length = max_token_length # width/height is used when enable_bucket==False self.width, self.height = (None, None) if resolution is None else resolution self.network_multiplier = network_multiplier @@ -670,8 +628,6 @@ def __init__( self.bucket_no_upscale = None self.bucket_info = None # for metadata - self.tokenizer_max_length = self.tokenizers[0].model_max_length if max_token_length is None else max_token_length + 2 - self.current_epoch: int = 0 # インスタンスがepochごとに新しく作られるようなので外側から渡さないとダメ self.current_step: int = 0 @@ -690,6 +646,15 @@ def __init__( # caching self.caching_mode = None # None, 'latents', 'text' + + self.tokenize_strategy = None + self.text_encoder_output_caching_strategy = None + self.latents_caching_strategy = None + + def set_current_strategies(self): + self.tokenize_strategy = TokenizeStrategy.get_strategy() + self.text_encoder_output_caching_strategy = TextEncoderOutputsCachingStrategy.get_strategy() + self.latents_caching_strategy = LatentsCachingStrategy.get_strategy() def set_seed(self, seed): self.seed = seed @@ -979,22 +944,6 @@ def make_buckets(self): for batch_index in range(batch_count): self.buckets_indices.append(BucketBatchIndex(bucket_index, self.batch_size, batch_index)) - # ↓以下はbucketごとのbatch件数があまりにも増えて混乱を招くので元に戻す - #  学習時はステップ数がランダムなので、同一画像が同一batch内にあってもそれほど悪影響はないであろう、と考えられる - # - # # bucketが細分化されることにより、ひとつのbucketに一種類の画像のみというケースが増え、つまりそれは - # # ひとつのbatchが同じ画像で占められることになるので、さすがに良くないであろう - # # そのためバッチサイズを画像種類までに制限する - # # ただそれでも同一画像が同一バッチに含まれる可能性はあるので、繰り返し回数が少ないほうがshuffleの品質は良くなることは間違いない? 
- # # TO DO 正則化画像をepochまたがりで利用する仕組み - # num_of_image_types = len(set(bucket)) - # bucket_batch_size = min(self.batch_size, num_of_image_types) - # batch_count = int(math.ceil(len(bucket) / bucket_batch_size)) - # # logger.info(bucket_index, num_of_image_types, bucket_batch_size, batch_count) - # for batch_index in range(batch_count): - # self.buckets_indices.append(BucketBatchIndex(bucket_index, bucket_batch_size, batch_index)) - # ↑ここまで - self.shuffle_buckets() self._length = len(self.buckets_indices) @@ -1027,12 +976,13 @@ def is_text_encoder_output_cacheable(self): ] ) - def new_cache_latents(self, is_main_process: bool, caching_strategy: LatentsCachingStrategy): + def new_cache_latents(self, model: Any, is_main_process: bool): r""" a brand new method to cache latents. This method caches latents with caching strategy. normal cache_latents method is used by default, but this method is used when caching strategy is specified. """ logger.info("caching latents with caching strategy.") + caching_strategy = LatentsCachingStrategy.get_strategy() image_infos = list(self.image_data.values()) # sort by resolution @@ -1088,7 +1038,7 @@ def new_cache_latents(self, is_main_process: bool, caching_strategy: LatentsCach logger.info("caching latents...") for batch in tqdm(batches, smoothing=1, total=len(batches)): # cache_batch_latents(vae, cache_to_disk, batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) - caching_strategy.cache_batch_latents(batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) + caching_strategy.cache_batch_latents(model, batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True, file_suffix=".npz"): # マルチGPUには対応していないので、そちらはtools/cache_latents.pyを使うこと @@ -1145,6 +1095,56 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc for batch in tqdm(batches, smoothing=1, total=len(batches)): cache_batch_latents(vae, cache_to_disk, batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) + def new_cache_text_encoder_outputs(self, models: List[Any], is_main_process: bool): + r""" + a brand new method to cache text encoder outputs. This method caches text encoder outputs with caching strategy. 
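When caching to disk, only the main process does the work and images whose existing npz cache passes the validity check are skipped; the remaining images are grouped into batches of the caching strategy's batch size (falling back to the dataset batch size) and encoded via cache_batch_outputs.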
+ """ + tokenize_strategy = TokenizeStrategy.get_strategy() + text_encoding_strategy = TextEncodingStrategy.get_strategy() + caching_strategy = TextEncoderOutputsCachingStrategy.get_strategy() + batch_size = caching_strategy.batch_size or self.batch_size + + # if cache to disk, don't cache TE outputs in non-main process + if caching_strategy.cache_to_disk and not is_main_process: + return + + logger.info("caching Text Encoder outputs with caching strategy.") + image_infos = list(self.image_data.values()) + + # split by resolution + batches = [] + batch = [] + logger.info("checking cache validity...") + for info in tqdm(image_infos): + te_out_npz = caching_strategy.get_outputs_npz_path(info.absolute_path) + + # check disk cache exists and size of latents + if caching_strategy.cache_to_disk: + info.text_encoder_outputs_npz = te_out_npz + cache_available = caching_strategy.is_disk_cached_outputs_expected(te_out_npz) + if cache_available: # do not add to batch + continue + + batch.append(info) + + # if number of data in batch is enough, flush the batch + if len(batch) >= batch_size: + batches.append(batch) + batch = [] + + if len(batch) > 0: + batches.append(batch) + + if len(batches) == 0: + logger.info("no Text Encoder outputs to cache") + return + + # iterate batches + logger.info("caching Text Encoder outputs...") + for batch in tqdm(batches, smoothing=1, total=len(batches)): + # cache_batch_latents(vae, cache_to_disk, batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) + caching_strategy.cache_batch_outputs(tokenize_strategy, models, text_encoding_strategy, batch) + # if weight_dtype is specified, Text Encoder itself and output will be converted to the dtype # this method is only for SDXL, but it should be implemented here because it needs to be a method of dataset # to support SD1/2, it needs a flag for v2, but it is postponed @@ -1188,6 +1188,8 @@ def cache_text_encoder_outputs_common( # またマルチGPUには対応していないので、そちらはtools/cache_latents.pyを使うこと logger.info("caching text encoder outputs.") + tokenize_strategy = TokenizeStrategy.get_strategy() + if batch_size is None: batch_size = self.batch_size @@ -1229,7 +1231,7 @@ def cache_text_encoder_outputs_common( input_ids2 = self.get_input_ids(info.caption, tokenizers[1]) batch.append((info, input_ids1, input_ids2)) else: - l_tokens, g_tokens, t5_tokens = tokenizers[0].tokenize_with_weights(info.caption) + l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(info.caption) batch.append((info, l_tokens, g_tokens, t5_tokens)) if len(batch) >= batch_size: @@ -1347,7 +1349,6 @@ def __getitem__(self, index): loss_weights = [] captions = [] input_ids_list = [] - input_ids2_list = [] latents_list = [] alpha_mask_list = [] images = [] @@ -1355,16 +1356,14 @@ def __getitem__(self, index): crop_top_lefts = [] target_sizes_hw = [] flippeds = [] # 変数名が微妙 - text_encoder_outputs1_list = [] - text_encoder_outputs2_list = [] - text_encoder_pool2_list = [] + text_encoder_outputs_list = [] for image_key in bucket[image_index : image_index + bucket_batch_size]: image_info = self.image_data[image_key] subset = self.image_to_subset[image_key] - loss_weights.append( - self.prior_loss_weight if image_info.is_reg else 1.0 - ) # in case of fine tuning, is_reg is always False + + # in case of fine tuning, is_reg is always False + loss_weights.append(self.prior_loss_weight if image_info.is_reg else 1.0) flipped = subset.flip_aug and random.random() < 0.5 # not flipped or flipped with 50% chance @@ -1381,7 +1380,9 @@ def __getitem__(self, index): image = None 
elif image_info.latents_npz is not None: # FineTuningDatasetまたはcache_latents_to_disk=Trueの場合 - latents, original_size, crop_ltrb, flipped_latents, alpha_mask = load_latents_from_disk(image_info.latents_npz) + latents, original_size, crop_ltrb, flipped_latents, alpha_mask = ( + self.latents_caching_strategy.load_latents_from_disk(image_info.latents_npz) + ) if flipped: latents = flipped_latents alpha_mask = None if alpha_mask is None else alpha_mask[:, ::-1].copy() # copy to avoid negative stride problem @@ -1470,75 +1471,67 @@ def __getitem__(self, index): # captionとtext encoder outputを処理する caption = image_info.caption # default - if image_info.text_encoder_outputs1 is not None: - text_encoder_outputs1_list.append(image_info.text_encoder_outputs1) - text_encoder_outputs2_list.append(image_info.text_encoder_outputs2) - text_encoder_pool2_list.append(image_info.text_encoder_pool2) - captions.append(caption) + + tokenization_required = ( + self.text_encoder_output_caching_strategy is None or self.text_encoder_output_caching_strategy.is_partial + ) + text_encoder_outputs = None + input_ids = None + + if image_info.text_encoder_outputs is not None: + # cached + text_encoder_outputs = image_info.text_encoder_outputs elif image_info.text_encoder_outputs_npz is not None: - text_encoder_outputs1, text_encoder_outputs2, text_encoder_pool2 = load_text_encoder_outputs_from_disk( + # on disk + text_encoder_outputs = self.text_encoder_output_caching_strategy.load_outputs_npz( image_info.text_encoder_outputs_npz ) - text_encoder_outputs1_list.append(text_encoder_outputs1) - text_encoder_outputs2_list.append(text_encoder_outputs2) - text_encoder_pool2_list.append(text_encoder_pool2) - captions.append(caption) else: - caption = self.process_caption(subset, image_info.caption) - if self.XTI_layers: - caption_layer = [] - for layer in self.XTI_layers: - token_strings_from = " ".join(self.token_strings) - token_strings_to = " ".join([f"{x}_{layer}" for x in self.token_strings]) - caption_ = caption.replace(token_strings_from, token_strings_to) - caption_layer.append(caption_) - captions.append(caption_layer) - else: - captions.append(caption) + tokenization_required = True + text_encoder_outputs_list.append(text_encoder_outputs) - if not self.token_padding_disabled: # this option might be omitted in future - # TODO get_input_ids must support SD3 - if self.XTI_layers: - token_caption = self.get_input_ids(caption_layer, self.tokenizers[0]) - else: - token_caption = self.get_input_ids(caption, self.tokenizers[0]) - input_ids_list.append(token_caption) + if tokenization_required: + caption = self.process_caption(subset, image_info.caption) + input_ids = [ids[0] for ids in self.tokenize_strategy.tokenize(caption)] # remove batch dimension + # if self.XTI_layers: + # caption_layer = [] + # for layer in self.XTI_layers: + # token_strings_from = " ".join(self.token_strings) + # token_strings_to = " ".join([f"{x}_{layer}" for x in self.token_strings]) + # caption_ = caption.replace(token_strings_from, token_strings_to) + # caption_layer.append(caption_) + # captions.append(caption_layer) + # else: + # captions.append(caption) + + # if not self.token_padding_disabled: # this option might be omitted in future + # # TODO get_input_ids must support SD3 + # if self.XTI_layers: + # token_caption = self.get_input_ids(caption_layer, self.tokenizers[0]) + # else: + # token_caption = self.get_input_ids(caption, self.tokenizers[0]) + # input_ids_list.append(token_caption) + + # if len(self.tokenizers) > 1: + # if 
self.XTI_layers: + # token_caption2 = self.get_input_ids(caption_layer, self.tokenizers[1]) + # else: + # token_caption2 = self.get_input_ids(caption, self.tokenizers[1]) + # input_ids2_list.append(token_caption2) + + input_ids_list.append(input_ids) + captions.append(caption) - if len(self.tokenizers) > 1: - if self.XTI_layers: - token_caption2 = self.get_input_ids(caption_layer, self.tokenizers[1]) - else: - token_caption2 = self.get_input_ids(caption, self.tokenizers[1]) - input_ids2_list.append(token_caption2) + def none_or_stack_elements(tensors_list, converter): + # [[clip_l, clip_g, t5xxl], [clip_l, clip_g, t5xxl], ...] -> [torch.stack(clip_l), torch.stack(clip_g), torch.stack(t5xxl)] + if len(tensors_list) == 0 or tensors_list[0] == None or len(tensors_list[0]) == 0 or tensors_list[0][0] is None: + return None + return [torch.stack([converter(x[i]) for x in tensors_list]) for i in range(len(tensors_list[0]))] example = {} example["loss_weights"] = torch.FloatTensor(loss_weights) - - if len(text_encoder_outputs1_list) == 0: - if self.token_padding_disabled: - # padding=True means pad in the batch - example["input_ids"] = self.tokenizer[0](captions, padding=True, truncation=True, return_tensors="pt").input_ids - if len(self.tokenizers) > 1: - example["input_ids2"] = self.tokenizer[1]( - captions, padding=True, truncation=True, return_tensors="pt" - ).input_ids - else: - example["input_ids2"] = None - else: - example["input_ids"] = torch.stack(input_ids_list) - example["input_ids2"] = torch.stack(input_ids2_list) if len(self.tokenizers) > 1 else None - example["text_encoder_outputs1_list"] = None - example["text_encoder_outputs2_list"] = None - example["text_encoder_pool2_list"] = None - else: - example["input_ids"] = None - example["input_ids2"] = None - # # for assertion - # example["input_ids"] = torch.stack([self.get_input_ids(cap, self.tokenizers[0]) for cap in captions]) - # example["input_ids2"] = torch.stack([self.get_input_ids(cap, self.tokenizers[1]) for cap in captions]) - example["text_encoder_outputs1_list"] = torch.stack(text_encoder_outputs1_list) - example["text_encoder_outputs2_list"] = torch.stack(text_encoder_outputs2_list) - example["text_encoder_pool2_list"] = torch.stack(text_encoder_pool2_list) + example["text_encoder_outputs_list"] = none_or_stack_elements(text_encoder_outputs_list, torch.FloatTensor) + example["input_ids_list"] = none_or_stack_elements(input_ids_list, lambda x: x) # if one of alpha_masks is not None, we need to replace None with ones none_or_not = [x is None for x in alpha_mask_list] @@ -1652,8 +1645,6 @@ def __init__( self, subsets: Sequence[DreamBoothSubset], batch_size: int, - tokenizer, - max_token_length, resolution, network_multiplier: float, enable_bucket: bool, @@ -1664,7 +1655,7 @@ def __init__( prior_loss_weight: float, debug_dataset: bool, ) -> None: - super().__init__(tokenizer, max_token_length, resolution, network_multiplier, debug_dataset) + super().__init__(resolution, network_multiplier, debug_dataset) assert resolution is not None, f"resolution is required / resolution(解像度)指定は必須です" @@ -1750,10 +1741,10 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # new caching: get image size from cache files strategy = LatentsCachingStrategy.get_strategy() if strategy is not None: - logger.info("get image size from cache files") + logger.info("get image size from name of cache files") size_set_count = 0 for i, img_path in enumerate(tqdm(img_paths)): - w, h = strategy.get_image_size_from_image_absolute_path(img_path) + w, h = 
strategy.get_image_size_from_disk_cache_path(img_path) if w is not None and h is not None: sizes[i] = [w, h] size_set_count += 1 @@ -1886,8 +1877,6 @@ def __init__( self, subsets: Sequence[FineTuningSubset], batch_size: int, - tokenizer, - max_token_length, resolution, network_multiplier: float, enable_bucket: bool, @@ -1897,7 +1886,7 @@ def __init__( bucket_no_upscale: bool, debug_dataset: bool, ) -> None: - super().__init__(tokenizer, max_token_length, resolution, network_multiplier, debug_dataset) + super().__init__(resolution, network_multiplier, debug_dataset) self.batch_size = batch_size @@ -2111,8 +2100,6 @@ def __init__( self, subsets: Sequence[ControlNetSubset], batch_size: int, - tokenizer, - max_token_length, resolution, network_multiplier: float, enable_bucket: bool, @@ -2122,7 +2109,7 @@ def __init__( bucket_no_upscale: bool, debug_dataset: float, ) -> None: - super().__init__(tokenizer, max_token_length, resolution, network_multiplier, debug_dataset) + super().__init__(resolution, network_multiplier, debug_dataset) db_subsets = [] for subset in subsets: @@ -2160,8 +2147,6 @@ def __init__( self.dreambooth_dataset_delegate = DreamBoothDataset( db_subsets, batch_size, - tokenizer, - max_token_length, resolution, network_multiplier, enable_bucket, @@ -2221,6 +2206,9 @@ def __init__( self.conditioning_image_transforms = IMAGE_TRANSFORMS + def set_current_strategies(self): + return self.dreambooth_dataset_delegate.set_current_strategies() + def make_buckets(self): self.dreambooth_dataset_delegate.make_buckets() self.bucket_manager = self.dreambooth_dataset_delegate.bucket_manager @@ -2229,6 +2217,12 @@ def make_buckets(self): def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True): return self.dreambooth_dataset_delegate.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process) + def new_cache_latents(self, model: Any, is_main_process: bool): + return self.dreambooth_dataset_delegate.new_cache_latents(model, is_main_process) + + def new_cache_text_encoder_outputs(self, models: List[Any], is_main_process: bool): + return self.dreambooth_dataset_delegate.new_cache_text_encoder_outputs(models, is_main_process) + def __len__(self): return self.dreambooth_dataset_delegate.__len__() @@ -2314,6 +2308,13 @@ def add_replacement(self, str_from, str_to): # for dataset in self.datasets: # dataset.make_buckets() + def set_text_encoder_output_caching_strategy(self, strategy: TextEncoderOutputsCachingStrategy): + """ + DataLoader is run in multiple processes, so we need to set the strategy manually. 
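Applies the given strategy to every dataset in this group.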
+ """ + for dataset in self.datasets: + dataset.set_text_encoder_output_caching_strategy(strategy) + def enable_XTI(self, *args, **kwargs): for dataset in self.datasets: dataset.enable_XTI(*args, **kwargs) @@ -2323,10 +2324,10 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc logger.info(f"[Dataset {i}]") dataset.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process, file_suffix) - def new_cache_latents(self, is_main_process: bool, strategy: LatentsCachingStrategy): + def new_cache_latents(self, model: Any, is_main_process: bool): for i, dataset in enumerate(self.datasets): logger.info(f"[Dataset {i}]") - dataset.new_cache_latents(is_main_process, strategy) + dataset.new_cache_latents(model, is_main_process) def cache_text_encoder_outputs( self, tokenizers, text_encoders, device, weight_dtype, cache_to_disk=False, is_main_process=True @@ -2344,6 +2345,11 @@ def cache_text_encoder_outputs_sd3( tokenizer, text_encoders, device, output_dtype, te_dtypes, cache_to_disk, is_main_process, batch_size ) + def new_cache_text_encoder_outputs(self, models: List[Any], is_main_process: bool): + for i, dataset in enumerate(self.datasets): + logger.info(f"[Dataset {i}]") + dataset.new_cache_text_encoder_outputs(models, is_main_process) + def set_caching_mode(self, caching_mode): for dataset in self.datasets: dataset.set_caching_mode(caching_mode) @@ -2358,6 +2364,10 @@ def is_latent_cacheable(self) -> bool: def is_text_encoder_output_cacheable(self) -> bool: return all([dataset.is_text_encoder_output_cacheable() for dataset in self.datasets]) + def set_current_strategies(self): + for dataset in self.datasets: + dataset.set_current_strategies() + def set_current_epoch(self, epoch): for dataset in self.datasets: dataset.set_current_epoch(epoch) @@ -2411,34 +2421,34 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool, alph # 戻り値は、latents_tensor, (original_size width, original_size height), (crop left, crop top) # TODO update to use CachingStrategy -def load_latents_from_disk( - npz_path, -) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: - npz = np.load(npz_path) - if "latents" not in npz: - raise ValueError(f"error: npz is old format. please re-generate {npz_path}") - - latents = npz["latents"] - original_size = npz["original_size"].tolist() - crop_ltrb = npz["crop_ltrb"].tolist() - flipped_latents = npz["latents_flipped"] if "latents_flipped" in npz else None - alpha_mask = npz["alpha_mask"] if "alpha_mask" in npz else None - return latents, original_size, crop_ltrb, flipped_latents, alpha_mask - - -def save_latents_to_disk(npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None, alpha_mask=None): - kwargs = {} - if flipped_latents_tensor is not None: - kwargs["latents_flipped"] = flipped_latents_tensor.float().cpu().numpy() - if alpha_mask is not None: - kwargs["alpha_mask"] = alpha_mask.float().cpu().numpy() - np.savez( - npz_path, - latents=latents_tensor.float().cpu().numpy(), - original_size=np.array(original_size), - crop_ltrb=np.array(crop_ltrb), - **kwargs, - ) +# def load_latents_from_disk( +# npz_path, +# ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: +# npz = np.load(npz_path) +# if "latents" not in npz: +# raise ValueError(f"error: npz is old format. 
please re-generate {npz_path}") + +# latents = npz["latents"] +# original_size = npz["original_size"].tolist() +# crop_ltrb = npz["crop_ltrb"].tolist() +# flipped_latents = npz["latents_flipped"] if "latents_flipped" in npz else None +# alpha_mask = npz["alpha_mask"] if "alpha_mask" in npz else None +# return latents, original_size, crop_ltrb, flipped_latents, alpha_mask + + +# def save_latents_to_disk(npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None, alpha_mask=None): +# kwargs = {} +# if flipped_latents_tensor is not None: +# kwargs["latents_flipped"] = flipped_latents_tensor.float().cpu().numpy() +# if alpha_mask is not None: +# kwargs["alpha_mask"] = alpha_mask.float().cpu().numpy() +# np.savez( +# npz_path, +# latents=latents_tensor.float().cpu().numpy(), +# original_size=np.array(original_size), +# crop_ltrb=np.array(crop_ltrb), +# **kwargs, +# ) def debug_dataset(train_dataset, show_input_ids=False): @@ -2465,12 +2475,12 @@ def debug_dataset(train_dataset, show_input_ids=False): example = train_dataset[idx] if example["latents"] is not None: logger.info(f"sample has latents from npz file: {example['latents'].size()}") - for j, (ik, cap, lw, iid, orgsz, crptl, trgsz, flpdz) in enumerate( + for j, (ik, cap, lw, orgsz, crptl, trgsz, flpdz) in enumerate( zip( example["image_keys"], example["captions"], example["loss_weights"], - example["input_ids"], + # example["input_ids"], example["original_sizes_hw"], example["crop_top_lefts"], example["target_sizes_hw"], @@ -2483,10 +2493,10 @@ def debug_dataset(train_dataset, show_input_ids=False): if "network_multipliers" in example: print(f"network multiplier: {example['network_multipliers'][j]}") - if show_input_ids: - logger.info(f"input ids: {iid}") - if "input_ids2" in example: - logger.info(f"input ids2: {example['input_ids2'][j]}") + # if show_input_ids: + # logger.info(f"input ids: {iid}") + # if "input_ids2" in example: + # logger.info(f"input ids2: {example['input_ids2'][j]}") if example["images"] is not None: im = example["images"][j] logger.info(f"image size: {im.size()}") @@ -2555,8 +2565,8 @@ def glob_images_pathlib(dir_path, recursive): class MinimalDataset(BaseDataset): - def __init__(self, tokenizer, max_token_length, resolution, network_multiplier, debug_dataset=False): - super().__init__(tokenizer, max_token_length, resolution, network_multiplier, debug_dataset) + def __init__(self, resolution, network_multiplier, debug_dataset=False): + super().__init__(resolution, network_multiplier, debug_dataset) self.num_train_images = 0 # update in subclass self.num_reg_images = 0 # update in subclass @@ -2773,14 +2783,15 @@ def cache_batch_latents( raise RuntimeError(f"NaN detected in latents: {info.absolute_path}") if cache_to_disk: - save_latents_to_disk( - info.latents_npz, - latent, - info.latents_original_size, - info.latents_crop_ltrb, - flipped_latent, - alpha_mask, - ) + # save_latents_to_disk( + # info.latents_npz, + # latent, + # info.latents_original_size, + # info.latents_crop_ltrb, + # flipped_latent, + # alpha_mask, + # ) + pass else: info.latents = latent if flip_aug: @@ -4662,33 +4673,6 @@ def prepare_dataset_args(args: argparse.Namespace, support_metadata: bool): ) -def load_tokenizer(args: argparse.Namespace): - logger.info("prepare tokenizer") - original_path = V2_STABLE_DIFFUSION_PATH if args.v2 else TOKENIZER_PATH - - tokenizer: CLIPTokenizer = None - if args.tokenizer_cache_dir: - local_tokenizer_path = os.path.join(args.tokenizer_cache_dir, original_path.replace("/", "_")) - if 
os.path.exists(local_tokenizer_path): - logger.info(f"load tokenizer from cache: {local_tokenizer_path}") - tokenizer = CLIPTokenizer.from_pretrained(local_tokenizer_path) # same for v1 and v2 - - if tokenizer is None: - if args.v2: - tokenizer = CLIPTokenizer.from_pretrained(original_path, subfolder="tokenizer") - else: - tokenizer = CLIPTokenizer.from_pretrained(original_path) - - if hasattr(args, "max_token_length") and args.max_token_length is not None: - logger.info(f"update token length: {args.max_token_length}") - - if args.tokenizer_cache_dir and not os.path.exists(local_tokenizer_path): - logger.info(f"save Tokenizer to cache: {local_tokenizer_path}") - tokenizer.save_pretrained(local_tokenizer_path) - - return tokenizer - - def prepare_accelerator(args: argparse.Namespace): """ this function also prepares deepspeed plugin @@ -5550,6 +5534,7 @@ def sample_images_common( ): """ StableDiffusionLongPromptWeightingPipelineの改造版を使うようにしたので、clip skipおよびプロンプトの重みづけに対応した + TODO Use strategies here """ if steps == 0: diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py index ffa0d46de..e9e61af1b 100644 --- a/sd3_minimal_inference.py +++ b/sd3_minimal_inference.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) -from library import sd3_models, sd3_utils +from library import sd3_models, sd3_utils, strategy_sd3 def get_noise(seed, latent): @@ -145,6 +145,7 @@ def do_sample( parser.add_argument("--clip_g", type=str, required=False) parser.add_argument("--clip_l", type=str, required=False) parser.add_argument("--t5xxl", type=str, required=False) + parser.add_argument("--t5xxl_token_length", type=int, default=77, help="t5xxl token length, default: 77") parser.add_argument("--prompt", type=str, default="A photo of a cat") # parser.add_argument("--prompt2", type=str, default=None) # do not support different prompts for text encoders parser.add_argument("--negative_prompt", type=str, default="") @@ -247,7 +248,7 @@ def do_sample( # load tokenizers logger.info("Loading tokenizers...") - tokenizer = sd3_models.SD3Tokenizer(use_t5xxl) # combined tokenizer + tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_token_length) # load models # logger.info("Create MMDiT from SD3 checkpoint...") @@ -320,12 +321,19 @@ def do_sample( # prepare embeddings logger.info("Encoding prompts...") - # embeds, pooled_embed - lg_out, t5_out, pooled = sd3_utils.get_cond(args.prompt, tokenizer, clip_l, clip_g, t5xxl) - cond = torch.cat([lg_out, t5_out], dim=-2), pooled + encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy() - lg_out, t5_out, pooled = sd3_utils.get_cond(args.negative_prompt, tokenizer, clip_l, clip_g, t5xxl) - neg_cond = torch.cat([lg_out, t5_out], dim=-2), pooled + l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(args.prompt) + lg_out, t5_out, pooled = encoding_strategy.encode_tokens( + tokenize_strategy, [clip_l, clip_g, t5xxl], [l_tokens, g_tokens, t5_tokens] + ) + cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) + + l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(args.negative_prompt) + lg_out, t5_out, pooled = encoding_strategy.encode_tokens( + tokenize_strategy, [clip_l, clip_g, t5xxl], [l_tokens, g_tokens, t5_tokens] + ) + neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) # generate image logger.info("Generating image...") diff --git a/sd3_train.py b/sd3_train.py index f34e47124..617e30271 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -17,7 +17,7 @@ from accelerate.utils import set_seed from diffusers import 
DDPMScheduler -from library import deepspeed_utils, sd3_models, sd3_train_utils, sd3_utils +from library import deepspeed_utils, sd3_models, sd3_train_utils, sd3_utils, strategy_base, strategy_sd3 from library.sdxl_train_util import match_mixed_precision # , sdxl_model_util @@ -69,10 +69,22 @@ def train(args): # not args.train_text_encoder # ), "training text encoder is not supported currently / text encoderの学習は現在サポートされていません" - # training without text encoder cache is not supported - assert ( - args.cache_text_encoder_outputs - ), "training without text encoder cache is not supported currently / text encoderのキャッシュなしの学習は現在サポートされていません" + # # training without text encoder cache is not supported: because T5XXL must be cached + # assert ( + # args.cache_text_encoder_outputs + # ), "training without text encoder cache is not supported currently / text encoderのキャッシュなしの学習は現在サポートされていません" + + assert not args.train_text_encoder or (args.use_t5xxl_cache_only or not args.cache_text_encoder_outputs), ( + "when training text encoder, text encoder outputs must not be cached (except for T5XXL)" + + " / text encoderの学習時はtext encoderの出力はキャッシュできません(t5xxlのみキャッシュすることは可能です)" + ) + + if args.use_t5xxl_cache_only and not args.cache_text_encoder_outputs: + logger.warning( + "use_t5xxl_cache_only is enabled, so cache_text_encoder_outputs is automatically enabled." + + " / use_t5xxl_cache_onlyが有効なため、cache_text_encoder_outputsも自動的に有効になります" + ) + args.cache_text_encoder_outputs = True # if args.block_lr: # block_lrs = [float(lr) for lr in args.block_lr.split(",")] @@ -88,17 +100,17 @@ def train(args): if args.seed is not None: set_seed(args.seed) # 乱数系列を初期化する - # load tokenizer - sd3_tokenizer = sd3_models.SD3Tokenizer() - - # prepare caching strategy - if args.new_caching: - latents_caching_strategy = sd3_train_utils.Sd3LatentsCachingStrategy( + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
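    # Rough sketch of how this registry is consumed (simplified, based on the train_util changes above):
    #   strategy = LatentsCachingStrategy.get_strategy()                  # inside dataset construction
    #   w, h = strategy.get_image_size_from_disk_cache_path(img_path)     # image sizes read from cache file names
    # which is why set_strategy() must run before the dataset group is built.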
+ if args.cache_latents: + latents_caching_strategy = strategy_sd3.Sd3LatentsCachingStrategy( args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check ) - else: - latents_caching_strategy = None - train_util.LatentsCachingStrategy.set_strategy(latents_caching_strategy) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) + + # load tokenizer and prepare tokenize strategy + sd3_tokenizer = sd3_models.SD3Tokenizer(t5xxl_max_length=args.t5xxl_max_token_length) + sd3_tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_max_token_length) + strategy_base.TokenizeStrategy.set_strategy(sd3_tokenize_strategy) # データセットを準備する if args.dataset_class is None: @@ -153,6 +165,16 @@ def train(args): train_dataset_group.verify_bucket_reso_steps(8) # TODO これでいいか確認 if args.debug_dataset: + if args.cache_text_encoder_outputs: + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy( + strategy_sd3.Sd3TextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, + args.text_encoder_batch_size, + False, + False, + ) + ) + train_dataset_group.set_current_strategies() train_util.debug_dataset(train_dataset_group, True) return if len(train_dataset_group) == 0: @@ -215,19 +237,8 @@ def train(args): vae.requires_grad_(False) vae.eval() - if not args.new_caching: - vae_wrapper = sd3_models.VAEWrapper(vae) # make SD/SDXL compatible - with torch.no_grad(): - train_dataset_group.cache_latents( - vae_wrapper, - args.vae_batch_size, - args.cache_latents_to_disk, - accelerator.is_main_process, - file_suffix="_sd3.npz", - ) - else: - latents_caching_strategy.set_vae(vae) - train_dataset_group.new_cache_latents(accelerator.is_main_process, latents_caching_strategy) + train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + vae.to("cpu") # if no sampling, vae can be deleted clean_memory_on_device(accelerator.device) @@ -246,60 +257,70 @@ def train(args): t5xxl = sd3_train_utils.load_target_model("t5xxl", args, sd3_state_dict, accelerator, attn_mode, t5xxl_dtype, device_to_load) # logit_scale = logit_scale.to(accelerator.device, dtype=weight_dtype) + # should be deleted after caching text encoder outputs when not training text encoder + # this strategy should not be used other than this process + text_encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy() + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + # 学習を準備する:モデルを適切な状態にする train_clip_l = False train_clip_g = False train_t5xxl = False - # if args.train_text_encoder: - # # TODO each option for two text encoders? 
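For readers untangling the combined assertion added above ("when training text encoder, text encoder outputs must not be cached (except for T5XXL)"), an equivalent explicit guard looks like this; the argument names come from the patch, the error message is illustrative only:

if args.train_text_encoder and args.cache_text_encoder_outputs and not args.use_t5xxl_cache_only:
    raise ValueError(
        "when training the text encoders, only the T5-XXL outputs may be cached: "
        "pass --use_t5xxl_cache_only or drop --cache_text_encoder_outputs"
    )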
- # accelerator.print("enable text encoder training") - # if args.gradient_checkpointing: - # text_encoder1.gradient_checkpointing_enable() - # text_encoder2.gradient_checkpointing_enable() - # lr_te1 = args.learning_rate_te1 if args.learning_rate_te1 is not None else args.learning_rate # 0 means not train - # lr_te2 = args.learning_rate_te2 if args.learning_rate_te2 is not None else args.learning_rate # 0 means not train - # train_clip_l = lr_te1 != 0 - # train_clip_g = lr_te2 != 0 - - # # caching one text encoder output is not supported - # if not train_clip_l: - # text_encoder1.to(weight_dtype) - # if not train_clip_g: - # text_encoder2.to(weight_dtype) - # text_encoder1.requires_grad_(train_clip_l) - # text_encoder2.requires_grad_(train_clip_g) - # text_encoder1.train(train_clip_l) - # text_encoder2.train(train_clip_g) - # else: - clip_l.to(weight_dtype) - clip_g.to(weight_dtype) - clip_l.requires_grad_(False) - clip_g.requires_grad_(False) - clip_l.eval() - clip_g.eval() + if args.train_text_encoder: + accelerator.print("enable text encoder training") + if args.gradient_checkpointing: + clip_l.gradient_checkpointing_enable() + clip_g.gradient_checkpointing_enable() + lr_te1 = args.learning_rate_te1 if args.learning_rate_te1 is not None else args.learning_rate # 0 means not train + lr_te2 = args.learning_rate_te2 if args.learning_rate_te2 is not None else args.learning_rate # 0 means not train + train_clip_l = lr_te1 != 0 + train_clip_g = lr_te2 != 0 + + if not train_clip_l: + clip_l.to(weight_dtype) + if not train_clip_g: + clip_g.to(weight_dtype) + clip_l.requires_grad_(train_clip_l) + clip_g.requires_grad_(train_clip_g) + clip_l.train(train_clip_l) + clip_g.train(train_clip_g) + else: + clip_l.to(weight_dtype) + clip_g.to(weight_dtype) + clip_l.requires_grad_(False) + clip_g.requires_grad_(False) + clip_l.eval() + clip_g.eval() + if t5xxl is not None: t5xxl.to(t5xxl_dtype) t5xxl.requires_grad_(False) t5xxl.eval() - # TextEncoderの出力をキャッシュする + # cache text encoder outputs if args.cache_text_encoder_outputs: - # Text Encodes are eval and no grad - - with torch.no_grad(), accelerator.autocast(): - train_dataset_group.cache_text_encoder_outputs_sd3( - sd3_tokenizer, - (clip_l, clip_g, t5xxl), - (accelerator.device, accelerator.device, t5xxl_device), - None, - (None, None, None), - args.cache_text_encoder_outputs_to_disk, - accelerator.is_main_process, - args.text_encoder_batch_size, - ) + # Text Encodes are eval and no grad here + clip_l.to(accelerator.device) + clip_g.to(accelerator.device) + if t5xxl is not None: + t5xxl.to(t5xxl_device) + + text_encoder_caching_strategy = strategy_sd3.Sd3TextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, + args.text_encoder_batch_size, + False, + train_clip_g or train_clip_l or args.use_t5xxl_cache_only, + ) + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy) + + clip_l.to(accelerator.device, dtype=weight_dtype) + clip_g.to(accelerator.device, dtype=weight_dtype) + if t5xxl is not None: + t5xxl.to(t5xxl_device, dtype=t5xxl_dtype) - # TODO we can delete text encoders after caching + with accelerator.autocast(): + train_dataset_group.new_cache_text_encoder_outputs([clip_l, clip_g, t5xxl], accelerator.is_main_process) accelerator.wait_for_everyone() # load MMDIT @@ -332,11 +353,11 @@ def train(args): # params_to_optimize.extend(get_block_params_to_optimize(mmdit, block_lrs)) # if train_clip_l: - # training_models.append(text_encoder1) - # params_to_optimize.append({"params": 
list(text_encoder1.parameters()), "lr": args.learning_rate_te1 or args.learning_rate}) + # training_models.append(clip_l) + # params_to_optimize.append({"params": list(clip_l.parameters()), "lr": args.learning_rate_te1 or args.learning_rate}) # if train_clip_g: - # training_models.append(text_encoder2) - # params_to_optimize.append({"params": list(text_encoder2.parameters()), "lr": args.learning_rate_te2 or args.learning_rate}) + # training_models.append(clip_g) + # params_to_optimize.append({"params": list(clip_g.parameters()), "lr": args.learning_rate_te2 or args.learning_rate}) # calculate number of trainable parameters n_params = 0 @@ -344,7 +365,7 @@ def train(args): for p in group["params"]: n_params += p.numel() - accelerator.print(f"train mmdit: {train_mmdit}") # , text_encoder1: {train_clip_l}, text_encoder2: {train_clip_g}") + accelerator.print(f"train mmdit: {train_mmdit}") # , clip_l: {train_clip_l}, clip_g: {train_clip_g}") accelerator.print(f"number of models: {len(training_models)}") accelerator.print(f"number of trainable parameters: {n_params}") @@ -398,7 +419,11 @@ def train(args): else: _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) - # dataloaderを準備する + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers train_dataloader = torch.utils.data.DataLoader( @@ -455,8 +480,8 @@ def train(args): # TODO check if this is necessary. SD3 uses pool for clip_l and clip_g # # freeze last layer and final_layer_norm in te1 since we use the output of the penultimate layer # if train_clip_l: - # text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) - # text_encoder1.text_model.final_layer_norm.requires_grad_(False) + # clip_l.text_model.encoder.layers[-1].requires_grad_(False) + # clip_l.text_model.final_layer_norm.requires_grad_(False) # TextEncoderの出力をキャッシュするときには、すでに出力を取得済みなのでCPUへ移動する if args.cache_text_encoder_outputs: @@ -484,9 +509,8 @@ def train(args): ds_model = deepspeed_utils.prepare_deepspeed_model( args, mmdit=mmdit, - # mmdie=mmdit if train_mmdit else None, - # text_encoder1=text_encoder1 if train_clip_l else None, - # text_encoder2=text_encoder2 if train_clip_g else None, + clip_l=clip_l if train_clip_l else None, + clip_g=clip_g if train_clip_g else None, ) # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. 
# pull/1139#issuecomment-1986790007 ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( @@ -498,10 +522,10 @@ def train(args): # acceleratorがなんかよろしくやってくれるらしい if train_mmdit: mmdit = accelerator.prepare(mmdit) - # if train_clip_l: - # text_encoder1 = accelerator.prepare(text_encoder1) - # if train_clip_g: - # text_encoder2 = accelerator.prepare(text_encoder2) + if train_clip_l: + clip_l = accelerator.prepare(clip_l) + if train_clip_g: + clip_g = accelerator.prepare(clip_g) optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする @@ -613,7 +637,7 @@ def optimizer_hook(parameter: torch.Tensor): # # For --sample_at_first # sd3_train_utils.sample_images( - # accelerator, args, 0, global_step, accelerator.device, vae, [tokenizer1, tokenizer2], [text_encoder1, text_encoder2], mmdit + # accelerator, args, 0, global_step, accelerator.device, vae, [tokenizer1, tokenizer2], [clip_l, clip_g], mmdit # ) # following function will be moved to sd3_train_utils @@ -666,6 +690,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): return weighting loss_recorder = train_util.LossRecorder() + epoch = 0 # avoid error when max_train_steps is 0 for epoch in range(num_train_epochs): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 @@ -687,37 +712,45 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): # encode images to latents. images are [-1, 1] latents = vae.encode(batch["images"].to(vae_dtype)).to(weight_dtype) - # NaNが含まれていれば警告を表示し0に置き換える - if torch.any(torch.isnan(latents)): - accelerator.print("NaN found in latents, replacing with zeros") - latents = torch.nan_to_num(latents, 0, out=latents) + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(latents)): + accelerator.print("NaN found in latents, replacing with zeros") + latents = torch.nan_to_num(latents, 0, out=latents) + # latents = latents * sdxl_model_util.VAE_SCALE_FACTOR latents = sd3_models.SDVAE.process_in(latents) - if "text_encoder_outputs1_list" not in batch or batch["text_encoder_outputs1_list"] is None: - # not cached, get text encoder outputs - # XXX This does not work yet - input_ids_clip_l, input_ids_clip_g, input_ids_t5xxl = batch["input_ids"] + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + lg_out, t5_out, lg_pooled = text_encoder_outputs_list + if args.use_t5xxl_cache_only: + lg_out = None + lg_pooled = None + else: + lg_out = None + t5_out = None + lg_pooled = None + + if lg_out is None or (train_clip_l or train_clip_g): + # not cached or training, so get from text encoders + input_ids_clip_l, input_ids_clip_g, _ = batch["input_ids_list"] with torch.set_grad_enabled(args.train_text_encoder): # TODO support weighted captions - # TODO support length > 75 input_ids_clip_l = input_ids_clip_l.to(accelerator.device) input_ids_clip_g = input_ids_clip_g.to(accelerator.device) - input_ids_t5xxl = input_ids_t5xxl.to(accelerator.device) + lg_out, _, lg_pooled = text_encoding_strategy.encode_tokens( + sd3_tokenize_strategy, [clip_l, clip_g, None], [input_ids_clip_l, input_ids_clip_g, None] + ) - # get text encoder outputs: outputs are concatenated - context, pool = sd3_utils.get_cond_from_tokens( - input_ids_clip_l, input_ids_clip_g, input_ids_t5xxl, clip_l, clip_g, t5xxl + if t5_out is None: + _, _, input_ids_t5xxl = batch["input_ids_list"] + with 
torch.no_grad(): + input_ids_t5xxl = input_ids_t5xxl.to(accelerator.device) if t5_out is None else None + _, t5_out, _ = text_encoding_strategy.encode_tokens( + sd3_tokenize_strategy, [None, None, t5xxl], [None, None, input_ids_t5xxl] ) - else: - # encoder_hidden_states1 = batch["text_encoder_outputs1_list"].to(accelerator.device).to(weight_dtype) - # encoder_hidden_states2 = batch["text_encoder_outputs2_list"].to(accelerator.device).to(weight_dtype) - # pool2 = batch["text_encoder_pool2_list"].to(accelerator.device).to(weight_dtype) - # TODO this reuses SDXL keys, it should be fixed - lg_out = batch["text_encoder_outputs1_list"] - t5_out = batch["text_encoder_outputs2_list"] - pool = batch["text_encoder_pool2_list"] - context = torch.cat([lg_out, t5_out], dim=-2) + + context, lg_pooled = text_encoding_strategy.concat_encodings(lg_out, t5_out, lg_pooled) # TODO support some features for noise implemented in get_noise_noisy_latents_and_timesteps @@ -748,13 +781,13 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): if torch.any(torch.isnan(context)): accelerator.print("NaN found in context, replacing with zeros") context = torch.nan_to_num(context, 0, out=context) - if torch.any(torch.isnan(pool)): + if torch.any(torch.isnan(lg_pooled)): accelerator.print("NaN found in pool, replacing with zeros") - pool = torch.nan_to_num(pool, 0, out=pool) + lg_pooled = torch.nan_to_num(lg_pooled, 0, out=lg_pooled) # call model with accelerator.autocast(): - model_pred = mmdit(noisy_model_input, timesteps, context=context, y=pool) + model_pred = mmdit(noisy_model_input, timesteps, context=context, y=lg_pooled) # Follow: Section 5 of https://arxiv.org/abs/2206.00364. # Preconditioning of the model outputs. @@ -806,7 +839,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): # accelerator.device, # vae, # [tokenizer1, tokenizer2], - # [text_encoder1, text_encoder2], + # [clip_l, clip_g], # mmdit, # ) @@ -875,7 +908,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): # accelerator.device, # vae, # [tokenizer1, tokenizer2], - # [text_encoder1, text_encoder2], + # [clip_l, clip_g], # mmdit, # ) @@ -924,7 +957,19 @@ def setup_parser() -> argparse.ArgumentParser: custom_train_functions.add_custom_train_arguments(parser) sd3_train_utils.add_sd3_training_arguments(parser) - # parser.add_argument("--train_text_encoder", action="store_true", help="train text encoder / text encoderも学習する") + parser.add_argument( + "--train_text_encoder", action="store_true", help="train text encoder (CLIP-L and G) / text encoderも学習する" + ) + # parser.add_argument("--train_t5xxl", action="store_true", help="train T5-XXL / T5-XXLも学習する") + parser.add_argument( + "--use_t5xxl_cache_only", action="store_true", help="cache T5-XXL outputs only / T5-XXLの出力のみキャッシュする" + ) + parser.add_argument( + "--t5xxl_max_token_length", + type=int, + default=None, + help="maximum token length for T5-XXL. 
256 if omitted / T5-XXLの最大トークン数。省略時は256", + ) # TE training is disabled temporarily # parser.add_argument( @@ -962,7 +1007,6 @@ def setup_parser() -> argparse.ArgumentParser: help="number of optimizers for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizer数", ) - parser.add_argument("--new_caching", action="store_true", help="use new caching method / 新しいキャッシング方法を使う") parser.add_argument( "--skip_latents_validity_check", action="store_true", diff --git a/sdxl_train.py b/sdxl_train.py index ae92d6a3d..b6d4afd6a 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -17,7 +17,7 @@ from accelerate.utils import set_seed from diffusers import DDPMScheduler -from library import deepspeed_utils, sdxl_model_util +from library import deepspeed_utils, sdxl_model_util, strategy_base, strategy_sd, strategy_sdxl import library.train_util as train_util @@ -124,7 +124,16 @@ def train(args): if args.seed is not None: set_seed(args.seed) # 乱数系列を初期化する - tokenizer1, tokenizer2 = sdxl_train_util.load_tokenizers(args) + tokenize_strategy = strategy_sdxl.SdxlTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) + strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + tokenizers = [tokenize_strategy.tokenizer1, tokenize_strategy.tokenizer2] # will be removed in the future + + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. + if args.cache_latents: + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + False, args.cache_latents_to_disk, args.vae_batch_size, False + ) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) # データセットを準備する if args.dataset_class is None: @@ -166,10 +175,10 @@ def train(args): ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=[tokenizer1, tokenizer2]) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group = train_util.load_arbitrary_dataset(args, [tokenizer1, tokenizer2]) + train_dataset_group = train_util.load_arbitrary_dataset(args) current_epoch = Value("i", 0) current_step = Value("i", 0) @@ -262,8 +271,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() - with torch.no_grad(): - train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + + train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + vae.to("cpu") clean_memory_on_device(accelerator.device) @@ -276,6 +286,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): train_text_encoder1 = False train_text_encoder2 = False + text_encoding_strategy = strategy_sdxl.SdxlTextEncodingStrategy() + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + if args.train_text_encoder: # TODO each option for two text encoders? 
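Note that --t5xxl_max_token_length defaults to None while its help text promises 256 when omitted; the patch passes the value straight into SD3Tokenizer and Sd3TokenizeStrategy, so the fallback presumably lives there. The documented behaviour amounts to:

t5xxl_max_token_length = args.t5xxl_max_token_length if args.t5xxl_max_token_length is not None else 256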
accelerator.print("enable text encoder training") @@ -307,16 +320,17 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # TextEncoderの出力をキャッシュする if args.cache_text_encoder_outputs: # Text Encodes are eval and no grad - with torch.no_grad(), accelerator.autocast(): - train_dataset_group.cache_text_encoder_outputs( - (tokenizer1, tokenizer2), - (text_encoder1, text_encoder2), - accelerator.device, - None, - args.cache_text_encoder_outputs_to_disk, - accelerator.is_main_process, - ) - accelerator.wait_for_everyone() + text_encoder_output_caching_strategy = strategy_sdxl.SdxlTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, None, False + ) + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_output_caching_strategy) + + text_encoder1.to(accelerator.device) + text_encoder2.to(accelerator.device) + with accelerator.autocast(): + train_dataset_group.new_cache_text_encoder_outputs([text_encoder1, text_encoder2], accelerator.is_main_process) + + accelerator.wait_for_everyone() if not cache_latents: vae.requires_grad_(False) @@ -403,7 +417,11 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): else: _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) - # dataloaderを準備する + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers train_dataloader = torch.utils.data.DataLoader( @@ -597,7 +615,7 @@ def optimizer_hook(parameter: torch.Tensor): # For --sample_at_first sdxl_train_util.sample_images( - accelerator, args, 0, global_step, accelerator.device, vae, [tokenizer1, tokenizer2], [text_encoder1, text_encoder2], unet + accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, [text_encoder1, text_encoder2], unet ) loss_recorder = train_util.LossRecorder() @@ -628,9 +646,15 @@ def optimizer_hook(parameter: torch.Tensor): latents = torch.nan_to_num(latents, 0, out=latents) latents = latents * sdxl_model_util.VAE_SCALE_FACTOR - if "text_encoder_outputs1_list" not in batch or batch["text_encoder_outputs1_list"] is None: - input_ids1 = batch["input_ids"] - input_ids2 = batch["input_ids2"] + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + # Text Encoder outputs are cached + encoder_hidden_states1, encoder_hidden_states2, pool2 = text_encoder_outputs_list + encoder_hidden_states1 = encoder_hidden_states1.to(accelerator.device, dtype=weight_dtype) + encoder_hidden_states2 = encoder_hidden_states2.to(accelerator.device, dtype=weight_dtype) + pool2 = pool2.to(accelerator.device, dtype=weight_dtype) + else: + input_ids1, input_ids2 = batch["input_ids_list"] with torch.set_grad_enabled(args.train_text_encoder): # Get the text embedding for conditioning # TODO support weighted captions @@ -646,39 +670,13 @@ def optimizer_hook(parameter: torch.Tensor): # else: input_ids1 = input_ids1.to(accelerator.device) input_ids2 = input_ids2.to(accelerator.device) - # unwrap_model is fine for models not wrapped by accelerator - encoder_hidden_states1, encoder_hidden_states2, pool2 = train_util.get_hidden_states_sdxl( - args.max_token_length, - input_ids1, - input_ids2, - tokenizer1, - tokenizer2, - text_encoder1, - text_encoder2, - None if 
not args.full_fp16 else weight_dtype, - accelerator=accelerator, + encoder_hidden_states1, encoder_hidden_states2, pool2 = text_encoding_strategy.encode_tokens( + tokenize_strategy, [text_encoder1, text_encoder2], [input_ids1, input_ids2] ) - else: - encoder_hidden_states1 = batch["text_encoder_outputs1_list"].to(accelerator.device).to(weight_dtype) - encoder_hidden_states2 = batch["text_encoder_outputs2_list"].to(accelerator.device).to(weight_dtype) - pool2 = batch["text_encoder_pool2_list"].to(accelerator.device).to(weight_dtype) - - # # verify that the text encoder outputs are correct - # ehs1, ehs2, p2 = train_util.get_hidden_states_sdxl( - # args.max_token_length, - # batch["input_ids"].to(text_encoder1.device), - # batch["input_ids2"].to(text_encoder1.device), - # tokenizer1, - # tokenizer2, - # text_encoder1, - # text_encoder2, - # None if not args.full_fp16 else weight_dtype, - # ) - # b_size = encoder_hidden_states1.shape[0] - # assert ((encoder_hidden_states1.to("cpu") - ehs1.to(dtype=weight_dtype)).abs().max() > 1e-2).sum() <= b_size * 2 - # assert ((encoder_hidden_states2.to("cpu") - ehs2.to(dtype=weight_dtype)).abs().max() > 1e-2).sum() <= b_size * 2 - # assert ((pool2.to("cpu") - p2.to(dtype=weight_dtype)).abs().max() > 1e-2).sum() <= b_size * 2 - # logger.info("text encoder outputs verified") + if args.full_fp16: + encoder_hidden_states1 = encoder_hidden_states1.to(weight_dtype) + encoder_hidden_states2 = encoder_hidden_states2.to(weight_dtype) + pool2 = pool2.to(weight_dtype) # get size embeddings orig_size = batch["original_sizes_hw"] @@ -765,7 +763,7 @@ def optimizer_hook(parameter: torch.Tensor): global_step, accelerator.device, vae, - [tokenizer1, tokenizer2], + tokenizers, [text_encoder1, text_encoder2], unet, ) @@ -847,7 +845,7 @@ def optimizer_hook(parameter: torch.Tensor): global_step, accelerator.device, vae, - [tokenizer1, tokenizer2], + tokenizers, [text_encoder1, text_encoder2], unet, ) diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 5ff060a9f..0eaec29b8 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -23,7 +23,16 @@ import accelerate from diffusers import DDPMScheduler, ControlNetModel from safetensors.torch import load_file -from library import deepspeed_utils, sai_model_spec, sdxl_model_util, sdxl_original_unet, sdxl_train_util +from library import ( + deepspeed_utils, + sai_model_spec, + sdxl_model_util, + sdxl_original_unet, + sdxl_train_util, + strategy_base, + strategy_sd, + strategy_sdxl, +) import library.model_util as model_util import library.train_util as train_util @@ -79,7 +88,14 @@ def train(args): args.seed = random.randint(0, 2**32) set_seed(args.seed) - tokenizer1, tokenizer2 = sdxl_train_util.load_tokenizers(args) + tokenize_strategy = strategy_sdxl.SdxlTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) + strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
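The four positional arguments in the SdSdxlLatentsCachingStrategy(...) call that follows are easier to read with names attached; this interpretation is inferred from the SD, SDXL and SD3 call sites in this patch and is not authoritative:

latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy(
    False,                       # SD1/2 mode flag: False here because this is an SDXL script (assumption)
    args.cache_latents_to_disk,  # write latent caches to disk instead of keeping them in memory
    args.vae_batch_size,         # batch size used while encoding latents with the VAE
    False,                       # do not skip the validity check of existing disk caches (assumption)
)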
+ latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + False, args.cache_latents_to_disk, args.vae_batch_size, False + ) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) # データセットを準備する blueprint_generator = BlueprintGenerator(ConfigSanitizer(False, False, True, True)) @@ -106,7 +122,7 @@ def train(args): ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=[tokenizer1, tokenizer2]) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) current_epoch = Value("i", 0) @@ -164,30 +180,30 @@ def train(args): vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() - with torch.no_grad(): - train_dataset_group.cache_latents( - vae, - args.vae_batch_size, - args.cache_latents_to_disk, - accelerator.is_main_process, - ) + + train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + vae.to("cpu") clean_memory_on_device(accelerator.device) accelerator.wait_for_everyone() + text_encoding_strategy = strategy_sdxl.SdxlTextEncodingStrategy() + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + # TextEncoderの出力をキャッシュする if args.cache_text_encoder_outputs: # Text Encodes are eval and no grad - with torch.no_grad(): - train_dataset_group.cache_text_encoder_outputs( - (tokenizer1, tokenizer2), - (text_encoder1, text_encoder2), - accelerator.device, - None, - args.cache_text_encoder_outputs_to_disk, - accelerator.is_main_process, - ) + text_encoder_output_caching_strategy = strategy_sdxl.SdxlTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, None, False + ) + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_output_caching_strategy) + + text_encoder1.to(accelerator.device) + text_encoder2.to(accelerator.device) + with accelerator.autocast(): + train_dataset_group.new_cache_text_encoder_outputs([text_encoder1, text_encoder2], accelerator.is_main_process) + accelerator.wait_for_everyone() # prepare ControlNet-LLLite @@ -242,7 +258,11 @@ def train(args): _, _, optimizer = train_util.get_optimizer(args, trainable_params) - # dataloaderを準備する + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. 
Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers @@ -290,7 +310,7 @@ def train(args): unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) if isinstance(unet, DDP): - unet._set_static_graph() # avoid error for multiple use of the parameter + unet._set_static_graph() # avoid error for multiple use of the parameter if args.gradient_checkpointing: unet.train() # according to TI example in Diffusers, train is required -> これオリジナルのU-Netしたので本当は外せる @@ -357,7 +377,9 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs + "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, ) loss_recorder = train_util.LossRecorder() @@ -409,27 +431,26 @@ def remove_model(old_ckpt_name): latents = torch.nan_to_num(latents, 0, out=latents) latents = latents * sdxl_model_util.VAE_SCALE_FACTOR - if "text_encoder_outputs1_list" not in batch or batch["text_encoder_outputs1_list"] is None: - input_ids1 = batch["input_ids"] - input_ids2 = batch["input_ids2"] + + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + # Text Encoder outputs are cached + encoder_hidden_states1, encoder_hidden_states2, pool2 = text_encoder_outputs_list + encoder_hidden_states1 = encoder_hidden_states1.to(accelerator.device, dtype=weight_dtype) + encoder_hidden_states2 = encoder_hidden_states2.to(accelerator.device, dtype=weight_dtype) + pool2 = pool2.to(accelerator.device, dtype=weight_dtype) + else: + input_ids1, input_ids2 = batch["input_ids_list"] with torch.no_grad(): - # Get the text embedding for conditioning input_ids1 = input_ids1.to(accelerator.device) input_ids2 = input_ids2.to(accelerator.device) - encoder_hidden_states1, encoder_hidden_states2, pool2 = train_util.get_hidden_states_sdxl( - args.max_token_length, - input_ids1, - input_ids2, - tokenizer1, - tokenizer2, - text_encoder1, - text_encoder2, - None if not args.full_fp16 else weight_dtype, + encoder_hidden_states1, encoder_hidden_states2, pool2 = text_encoding_strategy.encode_tokens( + tokenize_strategy, [text_encoder1, text_encoder2], [input_ids1, input_ids2] ) - else: - encoder_hidden_states1 = batch["text_encoder_outputs1_list"].to(accelerator.device).to(weight_dtype) - encoder_hidden_states2 = batch["text_encoder_outputs2_list"].to(accelerator.device).to(weight_dtype) - pool2 = batch["text_encoder_pool2_list"].to(accelerator.device).to(weight_dtype) + if args.full_fp16: + encoder_hidden_states1 = encoder_hidden_states1.to(weight_dtype) + encoder_hidden_states2 = encoder_hidden_states2.to(weight_dtype) + pool2 = pool2.to(weight_dtype) # get size embeddings orig_size = batch["original_sizes_hw"] diff --git a/sdxl_train_network.py b/sdxl_train_network.py index 83969bb1d..67ccae62c 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -1,16 +1,21 @@ import argparse import torch +from accelerate import Accelerator from library.device_utils import init_ipex, 
clean_memory_on_device + init_ipex() -from library import sdxl_model_util, sdxl_train_util, train_util +from library import sdxl_model_util, sdxl_train_util, strategy_base, strategy_sd, strategy_sdxl, train_util import train_network from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) + class SdxlNetworkTrainer(train_network.NetworkTrainer): def __init__(self): super().__init__() @@ -49,15 +54,32 @@ def load_target_model(self, args, weight_dtype, accelerator): return sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, [text_encoder1, text_encoder2], vae, unet - def load_tokenizer(self, args): - tokenizer = sdxl_train_util.load_tokenizers(args) - return tokenizer + def get_tokenize_strategy(self, args): + return strategy_sdxl.SdxlTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) + + def get_tokenizers(self, tokenize_strategy: strategy_sdxl.SdxlTokenizeStrategy): + return [tokenize_strategy.tokenizer1, tokenize_strategy.tokenizer2] + + def get_latents_caching_strategy(self, args): + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + False, args.cache_latents_to_disk, args.vae_batch_size, False + ) + return latents_caching_strategy - def is_text_encoder_outputs_cached(self, args): - return args.cache_text_encoder_outputs + def get_text_encoding_strategy(self, args): + return strategy_sdxl.SdxlTextEncodingStrategy() + + def get_models_for_text_encoding(self, args, accelerator, text_encoders): + return text_encoders + [accelerator.unwrap_model(text_encoders[-1])] + + def get_text_encoder_outputs_caching_strategy(self, args): + if args.cache_text_encoder_outputs: + return strategy_sdxl.SdxlTextEncoderOutputsCachingStrategy(args.cache_text_encoder_outputs_to_disk, None, False) + else: + return None def cache_text_encoder_outputs_if_needed( - self, args, accelerator, unet, vae, tokenizers, text_encoders, dataset: train_util.DatasetGroup, weight_dtype + self, args, accelerator: Accelerator, unet, vae, text_encoders, dataset: train_util.DatasetGroup, weight_dtype ): if args.cache_text_encoder_outputs: if not args.lowram: @@ -70,15 +92,13 @@ def cache_text_encoder_outputs_if_needed( clean_memory_on_device(accelerator.device) # When TE is not be trained, it will not be prepared so we need to use explicit autocast + text_encoders[0].to(accelerator.device, dtype=weight_dtype) + text_encoders[1].to(accelerator.device, dtype=weight_dtype) with accelerator.autocast(): - dataset.cache_text_encoder_outputs( - tokenizers, - text_encoders, - accelerator.device, - weight_dtype, - args.cache_text_encoder_outputs_to_disk, - accelerator.is_main_process, + dataset.new_cache_text_encoder_outputs( + text_encoders + [accelerator.unwrap_model(text_encoders[-1])], accelerator.is_main_process ) + accelerator.wait_for_everyone() text_encoders[0].to("cpu", dtype=torch.float32) # Text Encoder doesn't work with fp16 on CPU text_encoders[1].to("cpu", dtype=torch.float32) diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index 5df739e28..cbfcef554 100644 --- a/sdxl_train_textual_inversion.py +++ b/sdxl_train_textual_inversion.py @@ -5,10 +5,10 @@ import torch from library.device_utils import init_ipex -init_ipex() -from library import sdxl_model_util, sdxl_train_util, train_util +init_ipex() +from library import sdxl_model_util, sdxl_train_util, strategy_sd, strategy_sdxl, train_util import train_textual_inversion @@ -41,28 +41,20 @@ def load_target_model(self, args, weight_dtype, accelerator): return 
sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, [text_encoder1, text_encoder2], vae, unet - def load_tokenizer(self, args): - tokenizer = sdxl_train_util.load_tokenizers(args) - return tokenizer - - def get_text_cond(self, args, accelerator, batch, tokenizers, text_encoders, weight_dtype): - input_ids1 = batch["input_ids"] - input_ids2 = batch["input_ids2"] - with torch.enable_grad(): - input_ids1 = input_ids1.to(accelerator.device) - input_ids2 = input_ids2.to(accelerator.device) - encoder_hidden_states1, encoder_hidden_states2, pool2 = train_util.get_hidden_states_sdxl( - args.max_token_length, - input_ids1, - input_ids2, - tokenizers[0], - tokenizers[1], - text_encoders[0], - text_encoders[1], - None if not args.full_fp16 else weight_dtype, - accelerator=accelerator, - ) - return encoder_hidden_states1, encoder_hidden_states2, pool2 + def get_tokenize_strategy(self, args): + return strategy_sdxl.SdxlTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) + + def get_tokenizers(self, tokenize_strategy: strategy_sdxl.SdxlTokenizeStrategy): + return [tokenize_strategy.tokenizer1, tokenize_strategy.tokenizer2] + + def get_latents_caching_strategy(self, args): + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + False, args.cache_latents_to_disk, args.vae_batch_size, False + ) + return latents_caching_strategy + + def get_text_encoding_strategy(self, args): + return strategy_sdxl.SdxlTextEncodingStrategy() def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype @@ -81,9 +73,11 @@ def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_cond noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) return noise_pred - def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet, prompt_replacement): + def sample_images( + self, accelerator, args, epoch, global_step, device, vae, tokenizers, text_encoders, unet, prompt_replacement + ): sdxl_train_util.sample_images( - accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet, prompt_replacement + accelerator, args, epoch, global_step, device, vae, tokenizers, text_encoders, unet, prompt_replacement ) def save_weights(self, file, updated_embs, save_dtype, metadata): @@ -122,8 +116,7 @@ def load_weights(self, file): def setup_parser() -> argparse.ArgumentParser: parser = train_textual_inversion.setup_parser() - # don't add sdxl_train_util.add_sdxl_training_arguments(parser): because it only adds text encoder caching - # sdxl_train_util.add_sdxl_training_arguments(parser) + sdxl_train_util.add_sdxl_training_arguments(parser, support_text_encoder_caching=False) return parser diff --git a/train_db.py b/train_db.py index 39d8ea6ed..7caee6647 100644 --- a/train_db.py +++ b/train_db.py @@ -11,7 +11,7 @@ from tqdm import tqdm import torch -from library import deepspeed_utils +from library import deepspeed_utils, strategy_base from library.device_utils import init_ipex, clean_memory_on_device @@ -38,6 +38,7 @@ apply_masked_loss, ) from library.utils import setup_logging, add_logging_arguments +import library.strategy_sd as strategy_sd setup_logging() import logging @@ -58,7 +59,14 @@ def train(args): if args.seed is not None: set_seed(args.seed) # 乱数系列を初期化する - tokenizer = train_util.load_tokenizer(args) + tokenize_strategy = strategy_sd.SdTokenizeStrategy(args.v2, 
args.max_token_length, args.tokenizer_cache_dir) + strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + False, args.cache_latents_to_disk, args.vae_batch_size, False + ) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) # データセットを準備する if args.dataset_class is None: @@ -80,10 +88,10 @@ def train(args): ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer) + train_dataset_group = train_util.load_arbitrary_dataset(args) current_epoch = Value("i", 0) current_step = Value("i", 0) @@ -145,13 +153,17 @@ def train(args): vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() - with torch.no_grad(): - train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + + train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + vae.to("cpu") clean_memory_on_device(accelerator.device) accelerator.wait_for_everyone() + text_encoding_strategy = strategy_sd.SdTextEncodingStrategy(args.clip_skip) + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + # 学習を準備する:モデルを適切な状態にする train_text_encoder = args.stop_text_encoder_training is None or args.stop_text_encoder_training >= 0 unet.requires_grad_(True) # 念のため追加 @@ -184,8 +196,11 @@ def train(args): _, _, optimizer = train_util.get_optimizer(args, trainable_params) - # dataloaderを準備する - # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. 
Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers train_dataloader = torch.utils.data.DataLoader( train_dataset_group, @@ -290,10 +305,16 @@ def train(args): init_kwargs["wandb"] = {"name": args.wandb_run_name} if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers("dreambooth" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs) + accelerator.init_trackers( + "dreambooth" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, + ) # For --sample_at_first - train_util.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) + train_util.sample_images( + accelerator, args, 0, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet + ) loss_recorder = train_util.LossRecorder() for epoch in range(num_train_epochs): @@ -331,7 +352,7 @@ def train(args): with torch.set_grad_enabled(global_step < args.stop_text_encoder_training): if args.weighted_captions: encoder_hidden_states = get_weighted_text_embeddings( - tokenizer, + tokenize_strategy.tokenizer, text_encoder, batch["captions"], accelerator.device, @@ -339,14 +360,18 @@ def train(args): clip_skip=args.clip_skip, ) else: - input_ids = batch["input_ids"].to(accelerator.device) - encoder_hidden_states = train_util.get_hidden_states( - args, input_ids, tokenizer, text_encoder, None if not args.full_fp16 else weight_dtype - ) + input_ids = batch["input_ids_list"][0].to(accelerator.device) + encoder_hidden_states = text_encoding_strategy.encode_tokens( + tokenize_strategy, [text_encoder], [input_ids] + )[0] + if args.full_fp16: + encoder_hidden_states = encoder_hidden_states.to(weight_dtype) # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) + noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents + ) # Predict the noise residual with accelerator.autocast(): @@ -358,7 +383,9 @@ def train(args): else: target = noise - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + ) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) @@ -393,7 +420,7 @@ def train(args): global_step += 1 train_util.sample_images( - accelerator, args, None, global_step, accelerator.device, vae, tokenizer, text_encoder, unet + accelerator, args, None, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet ) # 指定ステップごとにモデルを保存 @@ -457,7 +484,9 @@ def train(args): vae, ) - train_util.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) + train_util.sample_images( + accelerator, args, epoch + 1, global_step, accelerator.device, 
vae, tokenize_strategy.tokenizer, text_encoder, unet + ) is_main_process = accelerator.is_main_process if is_main_process: diff --git a/train_network.py b/train_network.py index 7ba073855..3828fed19 100644 --- a/train_network.py +++ b/train_network.py @@ -7,6 +7,7 @@ import time import json from multiprocessing import Value +from typing import Any, List import toml from tqdm import tqdm @@ -18,7 +19,7 @@ from accelerate.utils import set_seed from diffusers import DDPMScheduler -from library import deepspeed_utils, model_util +from library import deepspeed_utils, model_util, strategy_base, strategy_sd import library.train_util as train_util from library.train_util import DreamBoothDataset @@ -101,19 +102,31 @@ def load_target_model(self, args, weight_dtype, accelerator): text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) return model_util.get_model_version_str_for_sd1_sd2(args.v2, args.v_parameterization), text_encoder, vae, unet - def load_tokenizer(self, args): - tokenizer = train_util.load_tokenizer(args) - return tokenizer + def get_tokenize_strategy(self, args): + return strategy_sd.SdTokenizeStrategy(args.v2, args.max_token_length, args.tokenizer_cache_dir) - def is_text_encoder_outputs_cached(self, args): - return False + def get_tokenizers(self, tokenize_strategy: strategy_sd.SdTokenizeStrategy) -> List[Any]: + return [tokenize_strategy.tokenizer] + + def get_latents_caching_strategy(self, args): + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + True, args.cache_latents_to_disk, args.vae_batch_size, False + ) + return latents_caching_strategy + + def get_text_encoding_strategy(self, args): + return strategy_sd.SdTextEncodingStrategy(args.clip_skip) + + def get_text_encoder_outputs_caching_strategy(self, args): + return None + + def get_models_for_text_encoding(self, args, accelerator, text_encoders): + return text_encoders def is_train_text_encoder(self, args): - return not args.network_train_unet_only and not self.is_text_encoder_outputs_cached(args) + return not args.network_train_unet_only - def cache_text_encoder_outputs_if_needed( - self, args, accelerator, unet, vae, tokenizers, text_encoders, data_loader, weight_dtype - ): + def cache_text_encoder_outputs_if_needed(self, args, accelerator, unet, vae, text_encoders, dataset, weight_dtype): for t_enc in text_encoders: t_enc.to(accelerator.device, dtype=weight_dtype) @@ -123,7 +136,7 @@ def get_text_cond(self, args, accelerator, batch, tokenizers, text_encoders, wei return encoder_hidden_states def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): - noise_pred = unet(noisy_latents, timesteps, text_conds).sample + noise_pred = unet(noisy_latents, timesteps, text_conds[0]).sample return noise_pred def all_reduce_network(self, accelerator, network): @@ -131,8 +144,8 @@ def all_reduce_network(self, accelerator, network): if param.grad is not None: param.grad = accelerator.reduce(param.grad, reduction="mean") - def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet): - train_util.sample_images(accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet) + def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizers, text_encoder, unet): + train_util.sample_images(accelerator, args, epoch, global_step, device, vae, tokenizers[0], text_encoder, unet) def train(self, args): session_id = random.randint(0, 2**32) @@ -150,9 
+163,13 @@ def train(self, args): args.seed = random.randint(0, 2**32) set_seed(args.seed) - # tokenizerは単体またはリスト、tokenizersは必ずリスト:既存のコードとの互換性のため - tokenizer = self.load_tokenizer(args) - tokenizers = tokenizer if isinstance(tokenizer, list) else [tokenizer] + tokenize_strategy = self.get_tokenize_strategy(args) + strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + tokenizers = self.get_tokenizers(tokenize_strategy) # will be removed after sample_image is refactored + + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. + latents_caching_strategy = self.get_latents_caching_strategy(args) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) # データセットを準備する if args.dataset_class is None: @@ -194,11 +211,11 @@ def train(self, args): ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: # use arbitrary dataset class - train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer) + train_dataset_group = train_util.load_arbitrary_dataset(args) current_epoch = Value("i", 0) current_step = Value("i", 0) @@ -268,8 +285,9 @@ def train(self, args): vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() - with torch.no_grad(): - train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + + train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + vae.to("cpu") clean_memory_on_device(accelerator.device) @@ -277,9 +295,13 @@ def train(self, args): # 必要ならテキストエンコーダーの出力をキャッシュする: Text Encoderはcpuまたはgpuへ移される # cache text encoder outputs if needed: Text Encoder is moved to cpu or gpu - self.cache_text_encoder_outputs_if_needed( - args, accelerator, unet, vae, tokenizers, text_encoders, train_dataset_group, weight_dtype - ) + text_encoding_strategy = self.get_text_encoding_strategy(args) + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + + text_encoder_outputs_caching_strategy = self.get_text_encoder_outputs_caching_strategy(args) + if text_encoder_outputs_caching_strategy is not None: + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_outputs_caching_strategy) + self.cache_text_encoder_outputs_if_needed(args, accelerator, unet, vae, text_encoders, train_dataset_group, weight_dtype) # prepare network net_kwargs = {} @@ -366,7 +388,11 @@ def train(self, args): optimizer_name, optimizer_args, optimizer = train_util.get_optimizer(args, trainable_params) - # dataloaderを準備する + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. 
Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers @@ -878,7 +904,7 @@ def remove_model(old_ckpt_name): os.remove(old_ckpt_file) # For --sample_at_first - self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) + self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) # training loop if initial_step > 0: # only if skip_until_initial_step is specified @@ -933,21 +959,31 @@ def remove_model(old_ckpt_name): # print(f"set multiplier: {multipliers}") accelerator.unwrap_model(network).set_multiplier(multipliers) - with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): - # Get the text embedding for conditioning - if args.weighted_captions: - text_encoder_conds = get_weighted_text_embeddings( - tokenizer, - text_encoder, - batch["captions"], - accelerator.device, - args.max_token_length // 75 if args.max_token_length else 1, - clip_skip=args.clip_skip, - ) - else: - text_encoder_conds = self.get_text_cond( - args, accelerator, batch, tokenizers, text_encoders, weight_dtype - ) + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + text_encoder_conds = text_encoder_outputs_list # List of text encoder outputs + else: + with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): + # Get the text embedding for conditioning + if args.weighted_captions: + # SD only + text_encoder_conds = get_weighted_text_embeddings( + tokenizers[0], + text_encoder, + batch["captions"], + accelerator.device, + args.max_token_length // 75 if args.max_token_length else 1, + clip_skip=args.clip_skip, + ) + else: + input_ids = [ids.to(accelerator.device) for ids in batch["input_ids_list"]] + text_encoder_conds = text_encoding_strategy.encode_tokens( + tokenize_strategy, + self.get_models_for_text_encoding(args, accelerator, text_encoders), + input_ids, + ) + if args.full_fp16: + text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified @@ -1026,7 +1062,9 @@ def remove_model(old_ckpt_name): progress_bar.update(1) global_step += 1 - self.sample_images(accelerator, args, None, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) + self.sample_images( + accelerator, args, None, global_step, accelerator.device, vae, tokenizers, text_encoder, unet + ) # 指定ステップごとにモデルを保存 if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: @@ -1082,7 +1120,7 @@ def remove_model(old_ckpt_name): if args.save_state: train_util.save_and_remove_state_on_epoch_end(args, accelerator, epoch + 1) - self.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) + self.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) # end of epoch diff --git a/train_textual_inversion.py b/train_textual_inversion.py index ade077c36..9044f50df 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -2,6 +2,7 @@ import math import os from multiprocessing import Value +from typing import Any, List import toml from tqdm import 
tqdm @@ -15,7 +16,7 @@ from accelerate.utils import set_seed from diffusers import DDPMScheduler from transformers import CLIPTokenizer -from library import deepspeed_utils, model_util +from library import deepspeed_utils, model_util, strategy_base, strategy_sd import library.train_util as train_util import library.huggingface_util as huggingface_util @@ -103,28 +104,38 @@ def assert_extra_args(self, args, train_dataset_group): def load_target_model(self, args, weight_dtype, accelerator): text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) - return model_util.get_model_version_str_for_sd1_sd2(args.v2, args.v_parameterization), text_encoder, vae, unet + return model_util.get_model_version_str_for_sd1_sd2(args.v2, args.v_parameterization), [text_encoder], vae, unet - def load_tokenizer(self, args): - tokenizer = train_util.load_tokenizer(args) - return tokenizer + def get_tokenize_strategy(self, args): + return strategy_sd.SdTokenizeStrategy(args.v2, args.max_token_length, args.tokenizer_cache_dir) + + def get_tokenizers(self, tokenize_strategy: strategy_sd.SdTokenizeStrategy) -> List[Any]: + return [tokenize_strategy.tokenizer] + + def get_latents_caching_strategy(self, args): + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + True, args.cache_latents_to_disk, args.vae_batch_size, False + ) + return latents_caching_strategy def assert_token_string(self, token_string, tokenizers: CLIPTokenizer): pass - def get_text_cond(self, args, accelerator, batch, tokenizers, text_encoders, weight_dtype): - with torch.enable_grad(): - input_ids = batch["input_ids"].to(accelerator.device) - encoder_hidden_states = train_util.get_hidden_states(args, input_ids, tokenizers[0], text_encoders[0], None) - return encoder_hidden_states + def get_text_encoding_strategy(self, args): + return strategy_sd.SdTextEncodingStrategy(args.clip_skip) + + def get_models_for_text_encoding(self, args, accelerator, text_encoders) -> List[Any]: + return text_encoders def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): - noise_pred = unet(noisy_latents, timesteps, text_conds).sample + noise_pred = unet(noisy_latents, timesteps, text_conds[0]).sample return noise_pred - def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet, prompt_replacement): + def sample_images( + self, accelerator, args, epoch, global_step, device, vae, tokenizers, text_encoders, unet, prompt_replacement + ): train_util.sample_images( - accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet, prompt_replacement + accelerator, args, epoch, global_step, device, vae, tokenizers[0], text_encoders[0], unet, prompt_replacement ) def save_weights(self, file, updated_embs, save_dtype, metadata): @@ -182,8 +193,13 @@ def train(self, args): if args.seed is not None: set_seed(args.seed) - tokenizer_or_list = self.load_tokenizer(args) # list of tokenizer or tokenizer - tokenizers = tokenizer_or_list if isinstance(tokenizer_or_list, list) else [tokenizer_or_list] + tokenize_strategy = self.get_tokenize_strategy(args) + strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + tokenizers = self.get_tokenizers(tokenize_strategy) # will be removed after sample_image is refactored + + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
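The call_unet() change above feeds the UNet text_conds[0] rather than text_conds: the encoding strategy returns a list with one entry per text encoder, and SD1/2 has a single encoder, so the first (and only) element is used. A condensed illustration combining the pieces from this patch:

input_ids = batch["input_ids_list"][0].to(accelerator.device)
conds = text_encoding_strategy.encode_tokens(tokenize_strategy, [text_encoder], [input_ids])
noise_pred = unet(noisy_latents, timesteps, conds[0]).sample  # single text encoder, take the first entry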
+ latents_caching_strategy = self.get_latents_caching_strategy(args) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) # acceleratorを準備する logger.info("prepare accelerator") @@ -194,14 +210,7 @@ def train(self, args): vae_dtype = torch.float32 if args.no_half_vae else weight_dtype # モデルを読み込む - model_version, text_encoder_or_list, vae, unet = self.load_target_model(args, weight_dtype, accelerator) - text_encoders = [text_encoder_or_list] if not isinstance(text_encoder_or_list, list) else text_encoder_or_list - - if len(text_encoders) > 1 and args.gradient_accumulation_steps > 1: - accelerator.print( - "accelerate doesn't seem to support gradient_accumulation_steps for multiple models (text encoders) / " - + "accelerateでは複数のモデル(テキストエンコーダー)のgradient_accumulation_stepsはサポートされていないようです" - ) + model_version, text_encoders, vae, unet = self.load_target_model(args, weight_dtype, accelerator) # Convert the init_word to token_id init_token_ids_list = [] @@ -310,10 +319,10 @@ def train(self, args): ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer_or_list) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer_or_list) + train_dataset_group = train_util.load_arbitrary_dataset(args) self.assert_extra_args(args, train_dataset_group) @@ -368,11 +377,10 @@ def train(self, args): vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() - with torch.no_grad(): - train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) - vae.to("cpu") - clean_memory_on_device(accelerator.device) + train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + + clean_memory_on_device(accelerator.device) accelerator.wait_for_everyone() if args.gradient_checkpointing: @@ -387,7 +395,11 @@ def train(self, args): trainable_params += text_encoder.get_input_embeddings().parameters() _, _, optimizer = train_util.get_optimizer(args, trainable_params) - # dataloaderを準備する + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. 
Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers train_dataloader = torch.utils.data.DataLoader( @@ -415,20 +427,8 @@ def train(self, args): lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) # acceleratorがなんかよろしくやってくれるらしい - if len(text_encoders) == 1: - text_encoder_or_list, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - text_encoder_or_list, optimizer, train_dataloader, lr_scheduler - ) - - elif len(text_encoders) == 2: - text_encoder1, text_encoder2, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - text_encoders[0], text_encoders[1], optimizer, train_dataloader, lr_scheduler - ) - - text_encoder_or_list = text_encoders = [text_encoder1, text_encoder2] - - else: - raise NotImplementedError() + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + text_encoders = [accelerator.prepare(text_encoder) for text_encoder in text_encoders] index_no_updates_list = [] orig_embeds_params_list = [] @@ -456,6 +456,9 @@ def train(self, args): else: unet.eval() + text_encoding_strategy = self.get_text_encoding_strategy(args) + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + if not cache_latents: # キャッシュしない場合はVAEを使うのでVAEを準備する vae.requires_grad_(False) vae.eval() @@ -510,7 +513,9 @@ def train(self, args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs + "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, ) # function for saving/removing @@ -540,8 +545,8 @@ def remove_model(old_ckpt_name): global_step, accelerator.device, vae, - tokenizer_or_list, - text_encoder_or_list, + tokenizers, + text_encoders, unet, prompt_replacement, ) @@ -568,7 +573,12 @@ def remove_model(old_ckpt_name): latents = latents * self.vae_scale_factor # Get the text embedding for conditioning - text_encoder_conds = self.get_text_cond(args, accelerator, batch, tokenizers, text_encoders, weight_dtype) + input_ids = [ids.to(accelerator.device) for ids in batch["input_ids_list"]] + text_encoder_conds = text_encoding_strategy.encode_tokens( + tokenize_strategy, self.get_models_for_text_encoding(args, accelerator, text_encoders), input_ids + ) + if args.full_fp16: + text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified @@ -588,7 +598,9 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + ) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) @@ -639,8 +651,8 @@ def remove_model(old_ckpt_name): global_step, 
accelerator.device, vae, - tokenizer_or_list, - text_encoder_or_list, + tokenizers, + text_encoders, unet, prompt_replacement, ) @@ -722,8 +734,8 @@ def remove_model(old_ckpt_name): global_step, accelerator.device, vae, - tokenizer_or_list, - text_encoder_or_list, + tokenizers, + text_encoders, unet, prompt_replacement, ) From 1a977e847a10975c042c0fdacd871a33c9e93900 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 27 Jul 2024 13:51:50 +0900 Subject: [PATCH 046/582] fix typos --- library/strategy_base.py | 2 +- library/strategy_sd.py | 2 +- library/strategy_sd3.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/library/strategy_base.py b/library/strategy_base.py index 594cca5eb..a99a08290 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -218,7 +218,7 @@ def is_disk_cached_latents_expected( def cache_batch_latents(self, model: Any, batch: List, flip_aug: bool, alpha_mask: bool, random_crop: bool): raise NotImplementedError - def _defualt_is_disk_cached_latents_expected( + def _default_is_disk_cached_latents_expected( self, latents_stride: int, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool ): if not self.cache_to_disk: diff --git a/library/strategy_sd.py b/library/strategy_sd.py index 105816145..83ffaa31b 100644 --- a/library/strategy_sd.py +++ b/library/strategy_sd.py @@ -125,7 +125,7 @@ def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) return os.path.splitext(absolute_path)[0] + f"_{image_size[0]:04d}x{image_size[1]:04d}" + self.suffix def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): - return self._defualt_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask) + return self._default_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask) # TODO remove circular dependency for ImageInfo def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool): diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py index 42630ab22..7491e814f 100644 --- a/library/strategy_sd3.py +++ b/library/strategy_sd3.py @@ -177,7 +177,7 @@ def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) ) def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): - return self._defualt_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask) + return self._default_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask) # TODO remove circular dependency for ImageInfo def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool): From 002d75179ae5a3b165a65c5cf49c00bf8f98e2df Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 29 Jul 2024 23:18:34 +0900 Subject: [PATCH 047/582] sample images for training --- library/sd3_train_utils.py | 348 ++++++++++++++++++++++++++++++++++++- sd3_train.py | 51 +++--- 2 files changed, 367 insertions(+), 32 deletions(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 8f99d9474..da0729506 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -1,14 +1,18 @@ import argparse -import glob import math import os -from typing import List, Optional, Tuple, Union +import toml +import json +import time +from typing import Dict, List, Optional, Tuple, Union import torch from safetensors.torch import save_file 
-from accelerate import Accelerator +from accelerate import Accelerator, PartialState +from tqdm import tqdm +from PIL import Image -from library import sd3_models, sd3_utils, train_util +from library import sd3_models, sd3_utils, strategy_base, train_util from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -276,10 +280,342 @@ def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCachin ) -def sample_images(*args, **kwargs): - return train_util.sample_images_common(SdxlStableDiffusionLongPromptWeightingPipeline, *args, **kwargs) +# temporary copied from sd3_minimal_inferece.py +def get_sigmas(sampling: sd3_utils.ModelSamplingDiscreteFlow, steps): + start = sampling.timestep(sampling.sigma_max) + end = sampling.timestep(sampling.sigma_min) + timesteps = torch.linspace(start, end, steps) + sigs = [] + for x in range(len(timesteps)): + ts = timesteps[x] + sigs.append(sampling.sigma(ts)) + sigs += [0.0] + return torch.FloatTensor(sigs) + + +def max_denoise(model_sampling, sigmas): + max_sigma = float(model_sampling.sigma_max) + sigma = float(sigmas[0]) + return math.isclose(max_sigma, sigma, rel_tol=1e-05) or sigma > max_sigma + + +def do_sample( + height: int, + width: int, + seed: int, + cond: Tuple[torch.Tensor, torch.Tensor], + neg_cond: Tuple[torch.Tensor, torch.Tensor], + mmdit: sd3_models.MMDiT, + steps: int, + guidance_scale: float, + dtype: torch.dtype, + device: str, +): + latent = torch.zeros(1, 16, height // 8, width // 8, device=device) + latent = latent.to(dtype).to(device) + + # noise = get_noise(seed, latent).to(device) + if seed is not None: + generator = torch.manual_seed(seed) + noise = ( + torch.randn(latent.size(), dtype=torch.float32, layout=latent.layout, generator=generator, device="cpu") + .to(latent.dtype) + .to(device) + ) + + model_sampling = sd3_utils.ModelSamplingDiscreteFlow(shift=3.0) # 3.0 is for SD3 + + sigmas = get_sigmas(model_sampling, steps).to(device) + + noise_scaled = model_sampling.noise_scaling(sigmas[0], noise, latent, max_denoise(model_sampling, sigmas)) + + c_crossattn = torch.cat([cond[0], neg_cond[0]]).to(device).to(dtype) + y = torch.cat([cond[1], neg_cond[1]]).to(device).to(dtype) + + x = noise_scaled.to(device).to(dtype) + # print(x.shape) + + with torch.no_grad(): + for i in tqdm(range(len(sigmas) - 1)): + sigma_hat = sigmas[i] + + timestep = model_sampling.timestep(sigma_hat).float() + timestep = torch.FloatTensor([timestep, timestep]).to(device) + + x_c_nc = torch.cat([x, x], dim=0) + # print(x_c_nc.shape, timestep.shape, c_crossattn.shape, y.shape) + + model_output = mmdit(x_c_nc, timestep, context=c_crossattn, y=y) + model_output = model_output.float() + batched = model_sampling.calculate_denoised(sigma_hat, model_output, x) + + pos_out, neg_out = batched.chunk(2) + denoised = neg_out + (pos_out - neg_out) * guidance_scale + # print(denoised.shape) + + # d = to_d(x, sigma_hat, denoised) + dims_to_append = x.ndim - sigma_hat.ndim + sigma_hat_dims = sigma_hat[(...,) + (None,) * dims_to_append] + # print(dims_to_append, x.shape, sigma_hat.shape, denoised.shape, sigma_hat_dims.shape) + """Converts a denoiser output to a Karras ODE derivative.""" + d = (x - denoised) / sigma_hat_dims + + dt = sigmas[i + 1] - sigma_hat + + # Euler method + x = x + d * dt + x = x.to(dtype) + + return x + + +def load_prompts(prompt_file: str) -> List[Dict]: + # read prompts + if prompt_file.endswith(".txt"): + with open(prompt_file, "r", encoding="utf-8") as f: + lines = f.readlines() + prompts = [line.strip() for 
line in lines if len(line.strip()) > 0 and line[0] != "#"] + elif prompt_file.endswith(".toml"): + with open(prompt_file, "r", encoding="utf-8") as f: + data = toml.load(f) + prompts = [dict(**data["prompt"], **subset) for subset in data["prompt"]["subset"]] + elif prompt_file.endswith(".json"): + with open(prompt_file, "r", encoding="utf-8") as f: + prompts = json.load(f) + + # preprocess prompts + for i in range(len(prompts)): + prompt_dict = prompts[i] + if isinstance(prompt_dict, str): + from library.train_util import line_to_prompt_dict + + prompt_dict = line_to_prompt_dict(prompt_dict) + prompts[i] = prompt_dict + assert isinstance(prompt_dict, dict) + + # Adds an enumerator to the dict based on prompt position. Used later to name image files. Also cleanup of extra data in original prompt dict. + prompt_dict["enum"] = i + prompt_dict.pop("subset", None) + + return prompts + + +def sample_images( + accelerator: Accelerator, + args: argparse.Namespace, + epoch, + steps, + mmdit, + vae, + text_encoders, + sample_prompts_te_outputs, + prompt_replacement=None, +): + if steps == 0: + if not args.sample_at_first: + return + else: + if args.sample_every_n_steps is None and args.sample_every_n_epochs is None: + return + if args.sample_every_n_epochs is not None: + # sample_every_n_steps は無視する + if epoch is None or epoch % args.sample_every_n_epochs != 0: + return + else: + if steps % args.sample_every_n_steps != 0 or epoch is not None: # steps is not divisible or end of epoch + return + + logger.info("") + logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}") + if not os.path.isfile(args.sample_prompts): + logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}") + return + + distributed_state = PartialState() # for multi gpu distributed inference. this is a singleton, so it's safe to use it here + + # unwrap unet and text_encoder(s) + mmdit = accelerator.unwrap_model(mmdit) + text_encoders = [accelerator.unwrap_model(te) for te in text_encoders] + # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) + + prompts = load_prompts(args.sample_prompts) + + save_dir = args.output_dir + "/sample" + os.makedirs(save_dir, exist_ok=True) + + # save random state to restore later + rng_state = torch.get_rng_state() + cuda_rng_state = None + try: + cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None + except Exception: + pass + + org_vae_device = vae.device # will be on cpu + vae.to(distributed_state.device) # distributed_state.device is same as accelerator.device + + if distributed_state.num_processes <= 1: + # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts. + with torch.no_grad(): + for prompt_dict in prompts: + sample_image_inference( + accelerator, + args, + mmdit, + text_encoders, + vae, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_te_outputs, + prompt_replacement, + ) + else: + # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available) + # prompt_dicts are assigned to lists based on order of processes, to attempt to time the image creation time to match enum order. Probably only works when steps and sampler are identical. 
+ per_process_prompts = [] # list of lists + for i in range(distributed_state.num_processes): + per_process_prompts.append(prompts[i :: distributed_state.num_processes]) + + with torch.no_grad(): + with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists: + for prompt_dict in prompt_dict_lists[0]: + sample_image_inference( + accelerator, + args, + mmdit, + text_encoders, + vae, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_te_outputs, + prompt_replacement, + ) + + torch.set_rng_state(rng_state) + if cuda_rng_state is not None: + torch.cuda.set_rng_state(cuda_rng_state) + + vae.to(org_vae_device) + + clean_memory_on_device(accelerator.device) + + +def sample_image_inference( + accelerator: Accelerator, + args: argparse.Namespace, + mmdit: sd3_models.MMDiT, + text_encoders: List[Union[sd3_models.SDClipModel, sd3_models.SDXLClipG, sd3_models.T5XXLModel]], + vae: sd3_models.SDVAE, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_te_outputs, + prompt_replacement, +): + assert isinstance(prompt_dict, dict) + negative_prompt = prompt_dict.get("negative_prompt") + sample_steps = prompt_dict.get("sample_steps", 30) + width = prompt_dict.get("width", 512) + height = prompt_dict.get("height", 512) + scale = prompt_dict.get("scale", 7.5) + seed = prompt_dict.get("seed") + # controlnet_image = prompt_dict.get("controlnet_image") + prompt: str = prompt_dict.get("prompt", "") + # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) + + if prompt_replacement is not None: + prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) + if negative_prompt is not None: + negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) + + if seed is not None: + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + else: + # True random sample image generation + torch.seed() + torch.cuda.seed() + + if negative_prompt is None: + negative_prompt = "" + + height = max(64, height - height % 8) # round to divisible by 8 + width = max(64, width - width % 8) # round to divisible by 8 + logger.info(f"prompt: {prompt}") + logger.info(f"negative_prompt: {negative_prompt}") + logger.info(f"height: {height}") + logger.info(f"width: {width}") + logger.info(f"sample_steps: {sample_steps}") + logger.info(f"scale: {scale}") + # logger.info(f"sample_sampler: {sampler_name}") + if seed is not None: + logger.info(f"seed: {seed}") + + # encode prompts + tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() + encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() + + if sample_prompts_te_outputs and prompt in sample_prompts_te_outputs: + te_outputs = sample_prompts_te_outputs[prompt] + else: + l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(prompt) + te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, [l_tokens, g_tokens, t5_tokens]) + + lg_out, t5_out, pooled = te_outputs + cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) + + # encode negative prompts + if sample_prompts_te_outputs and negative_prompt in sample_prompts_te_outputs: + neg_te_outputs = sample_prompts_te_outputs[negative_prompt] + else: + l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(negative_prompt) + neg_te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, [l_tokens, g_tokens, t5_tokens]) + + lg_out, t5_out, pooled = neg_te_outputs + neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) + + # sample image + 
latents = do_sample(height, width, seed, cond, neg_cond, mmdit, sample_steps, scale, mmdit.dtype, accelerator.device) + latents = vae.process_out(latents.to(vae.device, dtype=vae.dtype)) + + # latent to image + with torch.no_grad(): + image = vae.decode(latents) + image = image.float() + image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0] + decoded_np = 255.0 * np.moveaxis(image.cpu().numpy(), 0, 2) + decoded_np = decoded_np.astype(np.uint8) + + image = Image.fromarray(decoded_np) + # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are saved in the same order as the original prompt list + # but adding 'enum' to the filename should be enough + + ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime()) + num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}" + seed_suffix = "" if seed is None else f"_{seed}" + i: int = prompt_dict["enum"] + img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png" + image.save(os.path.join(save_dir, img_filename)) + + # wandb有効時のみログを送信 + try: + wandb_tracker = accelerator.get_tracker("wandb") + try: + import wandb + except ImportError: # 事前に一度確認するのでここはエラー出ないはず + raise ImportError("No wandb / wandb がインストールされていないようです") + + wandb_tracker.log({f"sample_{i}": wandb.Image(image)}) + except: # wandb 無効時 + pass + # region Diffusers diff --git a/sd3_train.py b/sd3_train.py index 617e30271..2f4ea8cb2 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -299,6 +299,7 @@ def train(args): t5xxl.eval() # cache text encoder outputs + sample_prompts_te_outputs = None if args.cache_text_encoder_outputs: # Text Encodes are eval and no grad here clip_l.to(accelerator.device) @@ -321,6 +322,22 @@ def train(args): with accelerator.autocast(): train_dataset_group.new_cache_text_encoder_outputs([clip_l, clip_g, t5xxl], accelerator.is_main_process) + + # cache sample prompt's embeddings to free text encoder's memory + if args.sample_prompts is not None: + logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") + prompts = sd3_train_utils.load_prompts(args.sample_prompts) + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + with accelerator.autocast(), torch.no_grad(): + for prompt_dict in prompts: + for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: + if p not in sample_prompts_te_outputs: + logger.info(f"cache Text Encoder outputs for prompt: {p}") + tokens_list = sd3_tokenize_strategy.tokenize(p) + sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( + sd3_tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_list + ) + accelerator.wait_for_everyone() # load MMDIT @@ -635,10 +652,8 @@ def optimizer_hook(parameter: torch.Tensor): init_kwargs=init_kwargs, ) - # # For --sample_at_first - # sd3_train_utils.sample_images( - # accelerator, args, 0, global_step, accelerator.device, vae, [tokenizer1, tokenizer2], [clip_l, clip_g], mmdit - # ) + # For --sample_at_first + sd3_train_utils.sample_images(accelerator, args, 0, global_step, mmdit, vae, [clip_l, clip_g, t5xxl], sample_prompts_te_outputs) # following function will be moved to sd3_train_utils @@ -831,17 +846,9 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): progress_bar.update(1) global_step += 1 - # sdxl_train_util.sample_images( - # accelerator, - # args, - # None, - # global_step, - # accelerator.device, - # vae, - # [tokenizer1, tokenizer2], - # [clip_l, clip_g], - # mmdit, - # ) 
+ sd3_train_utils.sample_images( + accelerator, args, None, global_step, mmdit, vae, [clip_l, clip_g, t5xxl], sample_prompts_te_outputs + ) # 指定ステップごとにモデルを保存 if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: @@ -900,17 +907,9 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): vae, ) - # sdxl_train_util.sample_images( - # accelerator, - # args, - # epoch + 1, - # global_step, - # accelerator.device, - # vae, - # [tokenizer1, tokenizer2], - # [clip_l, clip_g], - # mmdit, - # ) + sd3_train_utils.sample_images( + accelerator, args, epoch + 1, global_step, mmdit, vae, [clip_l, clip_g, t5xxl], sample_prompts_te_outputs + ) is_main_process = accelerator.is_main_process # if is_main_process: From 31507b9901d1d9ab65ba79ebd747b7f35c7e0fc1 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Fri, 2 Aug 2024 13:15:21 +0800 Subject: [PATCH 048/582] Remove unnecessary is_train changes and use apply_debiased_estimation to calculate validation loss. Balances the influence of different time steps on training performance (without affecting actual training results) --- train_network.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/train_network.py b/train_network.py index 2a3a44824..4a5940cd5 100644 --- a/train_network.py +++ b/train_network.py @@ -135,7 +135,7 @@ def all_reduce_network(self, accelerator, network): def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet): train_util.sample_images(accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet) - def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, train_text_encoder=True): + def process_val_batch(self, batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, train_text_encoder=True): total_loss = 0.0 timesteps_list = [10, 350, 500, 650, 990] @@ -153,7 +153,7 @@ def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, va latents = latents * self.vae_scale_factor b_size = latents.shape[0] - with torch.set_grad_enabled(is_train and train_text_encoder), accelerator.autocast(): + with torch.set_grad_enabled(False), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: text_encoder_conds = get_weighted_text_embeddings( @@ -173,7 +173,7 @@ def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, va # with noise offset and/or multires noise if specified for fixed_timesteps in timesteps_list: - with torch.set_grad_enabled(is_train), accelerator.autocast(): + with torch.set_grad_enabled(False), accelerator.autocast(): noise = torch.randn_like(latents, device=latents.device) b_size = latents.shape[0] timesteps = torch.full((b_size,), fixed_timesteps, dtype=torch.long, device=latents.device) @@ -189,6 +189,7 @@ def process_val_batch(self, batch, is_train, tokenizers, text_encoders, unet, va loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") loss = loss.mean([1, 2, 3]) + loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし total_loss += loss @@ -885,8 +886,7 @@ def remove_model(old_ckpt_name): for step, batch in enumerate(train_dataloader): current_step.value = global_step with accelerator.accumulate(training_model): - 
on_step_start(text_encoder, unet) - is_train = True + on_step_start(text_encoder, unet) if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: @@ -911,7 +911,7 @@ def remove_model(old_ckpt_name): # print(f"set multiplier: {multipliers}") accelerator.unwrap_model(network).set_multiplier(multipliers) - with torch.set_grad_enabled(is_train and train_text_encoder), accelerator.autocast(): + with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: text_encoder_conds = get_weighted_text_embeddings( @@ -941,7 +941,7 @@ def remove_model(old_ckpt_name): t.requires_grad_(True) # Predict the noise residual - with torch.set_grad_enabled(is_train), accelerator.autocast(): + with accelerator.autocast(): noise_pred = self.call_unet( args, accelerator, @@ -1040,10 +1040,9 @@ def remove_model(old_ckpt_name): total_loss = 0.0 with torch.no_grad(): validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - is_train = False + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) total_loss += loss.detach().item() current_loss = total_loss / validation_steps val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) From 1db495127f25c1b17694780f635a4760b4e345d0 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 4 Aug 2024 14:53:46 +0800 Subject: [PATCH 049/582] Update train_db.py --- train_db.py | 132 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 126 insertions(+), 6 deletions(-) diff --git a/train_db.py b/train_db.py index 1de504ed8..9f8ec777c 100644 --- a/train_db.py +++ b/train_db.py @@ -2,7 +2,6 @@ # XXX dropped option: fine_tune import argparse -import itertools import math import os from multiprocessing import Value @@ -41,11 +40,73 @@ setup_logging() import logging +import itertools logger = logging.getLogger(__name__) # perlin_noise, - +def process_val_batch(*training_models, batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args): + total_loss = 0.0 + timesteps_list = [10, 350, 500, 650, 990] + + with accelerator.accumulate(*training_models): + with torch.no_grad(): + # latentに変換 + if cache_latents: + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) + else: + latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * 0.18215 + b_size = latents.shape[0] + + with torch.set_grad_enabled(False), accelerator.autocast(): + if args.weighted_captions: + encoder_hidden_states = get_weighted_text_embeddings( + tokenizer, + text_encoder, + batch["captions"], + accelerator.device, + args.max_token_length // 75 if args.max_token_length else 1, + clip_skip=args.clip_skip, + ) + else: + input_ids = batch["input_ids"].to(accelerator.device) + encoder_hidden_states = train_util.get_hidden_states( + args, input_ids, tokenizer, text_encoder, None if not args.full_fp16 else 
weight_dtype + ) + + # Sample noise, sample a random timestep for each image, and add noise to the latents, + # with noise offset and/or multires noise if specified + + for fixed_timesteps in timesteps_list: + with torch.set_grad_enabled(False), accelerator.autocast(): + # Sample noise, sample a random timestep for each image, and add noise to the latents, + # with noise offset and/or multires noise if specified + noise = torch.randn_like(latents, device=latents.device) + b_size = latents.shape[0] + timesteps = torch.full((b_size,), fixed_timesteps, dtype=torch.long, device=latents.device) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Predict the noise residual + with accelerator.autocast(): + noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise + + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + if args.masked_loss: + loss = apply_masked_loss(loss, batch) + loss = loss.mean([1, 2, 3]) + loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) + loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + total_loss += loss + + average_loss = total_loss / len(timesteps_list) + return average_loss def train(args): train_util.verify_training_args(args) @@ -81,9 +142,10 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer) + val_dataset_group = None current_epoch = Value("i", 0) current_step = Value("i", 0) @@ -148,6 +210,9 @@ def train(args): with torch.no_grad(): train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) vae.to("cpu") + if val_dataset_group is not None: + print("Cache validation latents...") + val_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) clean_memory_on_device(accelerator.device) accelerator.wait_for_everyone() @@ -195,6 +260,15 @@ def train(args): num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) + val_dataloader = torch.utils.data.DataLoader( + val_dataset_group if val_dataset_group is not None else [], + shuffle=False, + batch_size=1, + collate_fn=collator, + num_workers=n_workers, + persistent_workers=args.persistent_data_loader_workers, + ) + cyclic_val_dataloader = itertools.cycle(val_dataloader) # 学習ステップ数を計算する if args.max_train_epochs is not None: @@ -296,6 +370,8 @@ def train(args): train_util.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) loss_recorder = train_util.LossRecorder() + val_loss_recorder = train_util.LossRecorder() + for epoch in range(num_train_epochs): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 @@ -427,12 +503,33 @@ def train(args): avr_loss: float = loss_recorder.moving_average logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) - + + if len(val_dataloader) > 0: + if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or 
(args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: + accelerator.print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / validation_steps + val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + logs = {"loss/current_val_loss": current_loss} + accelerator.log(logs, step=global_step) + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/average_val_loss": avr_loss} + accelerator.log(logs, step=global_step) + + if global_step >= args.max_train_steps: break if args.logging_dir is not None: - logs = {"loss/epoch": loss_recorder.moving_average} + logs = {"loss/epoch_average": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) accelerator.wait_for_everyone() @@ -515,7 +612,30 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) - + parser.add_argument( + "--validation_seed", + type=int, + default=None, + help="Validation seed" + ) + parser.add_argument( + "--validation_split", + type=float, + default=0.0, + help="Split for validation images out of the training dataset" + ) + parser.add_argument( + "--validation_every_n_step", + type=int, + default=None, + help="Number of train steps for counting validation loss. By default, validation per train epoch is performed" + ) + parser.add_argument( + "--max_validation_steps", + type=int, + default=None, + help="Number of max validation steps for counting validation loss. 
By default, validation will run entire validation dataset" + ) return parser From 68162172ebf9afa21ad526fc833fcc04f74aeb5f Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:03:56 +0800 Subject: [PATCH 050/582] Update train_db.py --- train_db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_db.py b/train_db.py index 9f8ec777c..e98434dba 100644 --- a/train_db.py +++ b/train_db.py @@ -209,10 +209,10 @@ def train(args): vae.eval() with torch.no_grad(): train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) - vae.to("cpu") if val_dataset_group is not None: print("Cache validation latents...") - val_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + val_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + vae.to("cpu") clean_memory_on_device(accelerator.device) accelerator.wait_for_everyone() From 96eb74f0cba3253ba29c8e87d7479c355916cca5 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:06:05 +0800 Subject: [PATCH 051/582] Update train_db.py --- train_db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_db.py b/train_db.py index e98434dba..80fdff3e7 100644 --- a/train_db.py +++ b/train_db.py @@ -210,8 +210,8 @@ def train(args): with torch.no_grad(): train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) if val_dataset_group is not None: - print("Cache validation latents...") - val_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + print("Cache validation latents...") + val_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) vae.to("cpu") clean_memory_on_device(accelerator.device) From b9bdd101296b8dc3c60b25e31d04d39b57eaee71 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:11:26 +0800 Subject: [PATCH 052/582] Update train_network.py --- train_network.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/train_network.py b/train_network.py index 4a5940cd5..d7b24dae9 100644 --- a/train_network.py +++ b/train_network.py @@ -1034,25 +1034,25 @@ def remove_model(old_ckpt_name): logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) accelerator.log(logs, step=global_step) - if len(val_dataloader) > 0: - if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: - accelerator.print("Validating バリデーション処理...") - total_loss = 0.0 - with torch.no_grad(): - validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - total_loss += loss.detach().item() - current_loss = total_loss / validation_steps - 
val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) - - if args.logging_dir is not None: - logs = {"loss/current_val_loss": current_loss} - accelerator.log(logs, step=global_step) - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/average_val_loss": avr_loss} - accelerator.log(logs, step=global_step) + if len(val_dataloader) > 0: + if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: + accelerator.print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / validation_steps + val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + logs = {"loss/current_val_loss": current_loss} + accelerator.log(logs, step=global_step) + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/average_val_loss": avr_loss} + accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break From 3d68754defde57b10f96d9c934dd78bf25c39235 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:15:42 +0800 Subject: [PATCH 053/582] Update train_db.py --- train_db.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/train_db.py b/train_db.py index 80fdff3e7..800a157bf 100644 --- a/train_db.py +++ b/train_db.py @@ -503,28 +503,26 @@ def train(args): avr_loss: float = loss_recorder.moving_average logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) - if len(val_dataloader) > 0: if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: - accelerator.print("Validating バリデーション処理...") - total_loss = 0.0 - with torch.no_grad(): - validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - total_loss += loss.detach().item() - current_loss = total_loss / validation_steps - val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) - - if args.logging_dir is not None: - logs = {"loss/current_val_loss": current_loss} - accelerator.log(logs, step=global_step) - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/average_val_loss": avr_loss} - accelerator.log(logs, step=global_step) - - + accelerator.print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + for val_step 
in tqdm(range(validation_steps), desc='Validation Steps'): + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / validation_steps + val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + logs = {"loss/current_val_loss": current_loss} + accelerator.log(logs, step=global_step) + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/average_val_loss": avr_loss} + accelerator.log(logs, step=global_step) + if global_step >= args.max_train_steps: break From a593e837f36b6299101dc85a367c0986501ecc0a Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:17:30 +0800 Subject: [PATCH 054/582] Update train_network.py --- train_network.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/train_network.py b/train_network.py index d7b24dae9..7d9134638 100644 --- a/train_network.py +++ b/train_network.py @@ -1034,26 +1034,26 @@ def remove_model(old_ckpt_name): logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) accelerator.log(logs, step=global_step) - if len(val_dataloader) > 0: - if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: - accelerator.print("Validating バリデーション処理...") - total_loss = 0.0 - with torch.no_grad(): - validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - total_loss += loss.detach().item() - current_loss = total_loss / validation_steps - val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) - - if args.logging_dir is not None: - logs = {"loss/current_val_loss": current_loss} - accelerator.log(logs, step=global_step) - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/average_val_loss": avr_loss} - accelerator.log(logs, step=global_step) - + if len(val_dataloader) > 0: + if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: + accelerator.print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / validation_steps + val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + logs = {"loss/current_val_loss": current_loss} + accelerator.log(logs, step=global_step) + 
avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/average_val_loss": avr_loss} + accelerator.log(logs, step=global_step) + if global_step >= args.max_train_steps: break From f6dbf7c419bbcf2e51c82a6bffa8d30cad2e3512 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:18:53 +0800 Subject: [PATCH 055/582] Update train_network.py --- train_network.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/train_network.py b/train_network.py index 7d9134638..fa6407eef 100644 --- a/train_network.py +++ b/train_network.py @@ -1034,26 +1034,26 @@ def remove_model(old_ckpt_name): logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) accelerator.log(logs, step=global_step) - if len(val_dataloader) > 0: - if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: - accelerator.print("Validating バリデーション処理...") - total_loss = 0.0 - with torch.no_grad(): - validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - total_loss += loss.detach().item() - current_loss = total_loss / validation_steps - val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) + if len(val_dataloader) > 0: + if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: + accelerator.print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / validation_steps + val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) - if args.logging_dir is not None: - logs = {"loss/current_val_loss": current_loss} - accelerator.log(logs, step=global_step) - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/average_val_loss": avr_loss} - accelerator.log(logs, step=global_step) - + if args.logging_dir is not None: + logs = {"loss/current_val_loss": current_loss} + accelerator.log(logs, step=global_step) + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/average_val_loss": avr_loss} + accelerator.log(logs, step=global_step) + if global_step >= args.max_train_steps: break From aa850aa531b0e396b6f2fbd68cd1e6f1319d1d0b Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 4 Aug 2024 17:34:20 +0800 Subject: [PATCH 056/582] Update train_network.py --- train_network.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff 
--git a/train_network.py b/train_network.py index fa6407eef..938e41938 100644 --- a/train_network.py +++ b/train_network.py @@ -1034,25 +1034,25 @@ def remove_model(old_ckpt_name): logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) accelerator.log(logs, step=global_step) - if len(val_dataloader) > 0: - if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: - accelerator.print("Validating バリデーション処理...") - total_loss = 0.0 - with torch.no_grad(): - validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - total_loss += loss.detach().item() - current_loss = total_loss / validation_steps - val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) - - if args.logging_dir is not None: - logs = {"loss/current_val_loss": current_loss} - accelerator.log(logs, step=global_step) - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/average_val_loss": avr_loss} - accelerator.log(logs, step=global_step) + if len(val_dataloader) > 0: + if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: + accelerator.print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / validation_steps + val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + logs = {"loss/current_val_loss": current_loss} + accelerator.log(logs, step=global_step) + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/average_val_loss": avr_loss} + accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break From cdb2d9c516fbffe0faa9788b8174e5d418fb766b Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 4 Aug 2024 17:36:34 +0800 Subject: [PATCH 057/582] Update train_network.py --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 938e41938..e10c17c0c 100644 --- a/train_network.py +++ b/train_network.py @@ -192,7 +192,7 @@ def process_val_batch(self, batch, tokenizers, text_encoders, unet, vae, noise_s loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし total_loss += loss - + average_loss = total_loss / len(timesteps_list) return average_loss From 231df197ddf4372b3d90751146927f33e1965d1a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 5 Aug 2024 20:26:30 +0900 Subject: [PATCH 
058/582] Fix npz path for verification --- library/strategy_sdxl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/library/strategy_sdxl.py b/library/strategy_sdxl.py index a4513336d..3eb0ab6f6 100644 --- a/library/strategy_sdxl.py +++ b/library/strategy_sdxl.py @@ -184,20 +184,20 @@ def __init__( def get_outputs_npz_path(self, image_abs_path: str) -> str: return os.path.splitext(image_abs_path)[0] + SdxlTextEncoderOutputsCachingStrategy.SDXL_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX - def is_disk_cached_outputs_expected(self, abs_path: str): + def is_disk_cached_outputs_expected(self, npz_path: str): if not self.cache_to_disk: return False - if not os.path.exists(self.get_outputs_npz_path(abs_path)): + if not os.path.exists(npz_path): return False if self.skip_disk_cache_validity_check: return True try: - npz = np.load(self.get_outputs_npz_path(abs_path)) + npz = np.load(npz_path) if "hidden_state1" not in npz or "hidden_state2" not in npz or "pool2" not in npz: return False except Exception as e: - logger.error(f"Error loading file: {self.get_outputs_npz_path(abs_path)}") + logger.error(f"Error loading file: {npz_path}") raise e return True From da4d0fe0165b3e0143c237de8cf307d53a9de45a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 5 Aug 2024 20:51:34 +0900 Subject: [PATCH 059/582] support attn mask for l+g/t5 --- library/strategy_sd3.py | 88 +++++++++++++++++++++++++++++++++------- library/train_util.py | 3 +- sd3_minimal_inference.py | 10 +++-- sd3_train.py | 30 +++++++++++--- 4 files changed, 107 insertions(+), 24 deletions(-) diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py index 7491e814f..a22818903 100644 --- a/library/strategy_sd3.py +++ b/library/strategy_sd3.py @@ -37,11 +37,14 @@ def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: g_tokens = self.clip_g(text, max_length=77, padding="max_length", truncation=True, return_tensors="pt") t5_tokens = self.t5xxl(text, max_length=self.t5xxl_max_length, padding="max_length", truncation=True, return_tensors="pt") + l_attn_mask = l_tokens["attention_mask"] + g_attn_mask = g_tokens["attention_mask"] + t5_attn_mask = t5_tokens["attention_mask"] l_tokens = l_tokens["input_ids"] g_tokens = g_tokens["input_ids"] t5_tokens = t5_tokens["input_ids"] - return [l_tokens, g_tokens, t5_tokens] + return [l_tokens, g_tokens, t5_tokens, l_attn_mask, g_attn_mask, t5_attn_mask] class Sd3TextEncodingStrategy(TextEncodingStrategy): @@ -49,11 +52,20 @@ def __init__(self) -> None: pass def encode_tokens( - self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor] + self, + tokenize_strategy: TokenizeStrategy, + models: List[Any], + tokens: List[torch.Tensor], + apply_lg_attn_mask: bool = False, + apply_t5_attn_mask: bool = False, ) -> List[torch.Tensor]: + """ + returned embeddings are not masked + """ clip_l, clip_g, t5xxl = models - l_tokens, g_tokens, t5_tokens = tokens + l_tokens, g_tokens, t5_tokens = tokens[:3] + l_attn_mask, g_attn_mask, t5_attn_mask = tokens[3:] if len(tokens) > 3 else [None, None, None] if l_tokens is None: assert g_tokens is None, "g_tokens must be None if l_tokens is None" lg_out = None @@ -61,10 +73,15 @@ def encode_tokens( assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None" l_out, l_pooled = clip_l(l_tokens) g_out, g_pooled = clip_g(g_tokens) + if apply_lg_attn_mask: + l_out = l_out * l_attn_mask.to(l_out.device).unsqueeze(-1) + g_out = g_out * g_attn_mask.to(g_out.device).unsqueeze(-1) lg_out = torch.cat([l_out, 
g_out], dim=-1) if t5xxl is not None and t5_tokens is not None: t5_out, _ = t5xxl(t5_tokens) # t5_out is [1, max length, 4096] + if apply_t5_attn_mask: + t5_out = t5_out * t5_attn_mask.to(t5_out.device).unsqueeze(-1) else: t5_out = None @@ -84,50 +101,81 @@ class Sd3TextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): SD3_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX = "_sd3_te.npz" def __init__( - self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool, is_partial: bool = False + self, + cache_to_disk: bool, + batch_size: int, + skip_disk_cache_validity_check: bool, + is_partial: bool = False, + apply_lg_attn_mask: bool = False, + apply_t5_attn_mask: bool = False, ) -> None: super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check, is_partial) + self.apply_lg_attn_mask = apply_lg_attn_mask + self.apply_t5_attn_mask = apply_t5_attn_mask def get_outputs_npz_path(self, image_abs_path: str) -> str: return os.path.splitext(image_abs_path)[0] + Sd3TextEncoderOutputsCachingStrategy.SD3_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX - def is_disk_cached_outputs_expected(self, abs_path: str): + def is_disk_cached_outputs_expected(self, npz_path: str): if not self.cache_to_disk: return False - if not os.path.exists(self.get_outputs_npz_path(abs_path)): + if not os.path.exists(npz_path): return False if self.skip_disk_cache_validity_check: return True try: - npz = np.load(self.get_outputs_npz_path(abs_path)) - if "clip_l" not in npz or "clip_g" not in npz: + npz = np.load(npz_path) + if "lg_out" not in npz: return False - if "clip_l_pool" not in npz or "clip_g_pool" not in npz: + if "lg_pooled" not in npz: + return False + if "clip_l_attn_mask" not in npz or "clip_g_attn_mask" not in npz: # necessary even if not used return False # t5xxl is optional except Exception as e: - logger.error(f"Error loading file: {self.get_outputs_npz_path(abs_path)}") + logger.error(f"Error loading file: {npz_path}") raise e return True + def mask_lg_attn(self, lg_out: np.ndarray, l_attn_mask: np.ndarray, g_attn_mask: np.ndarray) -> np.ndarray: + l_out = lg_out[..., :768] + g_out = lg_out[..., 768:] # 1280 + l_out = l_out * np.expand_dims(l_attn_mask, -1) # l_out = l_out * l_attn_mask. + g_out = g_out * np.expand_dims(g_attn_mask, -1) # g_out = g_out * g_attn_mask. 
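+        # note: lg_out stores CLIP-L (first 768 channels) and CLIP-G (remaining 1280 channels) side by side,
+        # so each half is zeroed with its own attention mask before being concatenated back together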
+ return np.concatenate([l_out, g_out], axis=-1) + + def mask_t5_attn(self, t5_out: np.ndarray, t5_attn_mask: np.ndarray) -> np.ndarray: + return t5_out * np.expand_dims(t5_attn_mask, -1) + def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: data = np.load(npz_path) lg_out = data["lg_out"] lg_pooled = data["lg_pooled"] t5_out = data["t5_out"] if "t5_out" in data else None + + if self.apply_lg_attn_mask: + l_attn_mask = data["clip_l_attn_mask"] + g_attn_mask = data["clip_g_attn_mask"] + lg_out = self.mask_lg_attn(lg_out, l_attn_mask, g_attn_mask) + + if self.apply_t5_attn_mask and t5_out is not None: + t5_attn_mask = data["t5_attn_mask"] + t5_out = self.mask_t5_attn(t5_out, t5_attn_mask) + return [lg_out, t5_out, lg_pooled] def cache_batch_outputs( self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, infos: List ): + sd3_text_encoding_strategy: Sd3TextEncodingStrategy = text_encoding_strategy captions = [info.caption for info in infos] - clip_l_tokens, clip_g_tokens, t5xxl_tokens = tokenize_strategy.tokenize(captions) + tokens_and_masks = tokenize_strategy.tokenize(captions) with torch.no_grad(): - lg_out, t5_out, lg_pooled = text_encoding_strategy.encode_tokens( - tokenize_strategy, models, [clip_l_tokens, clip_g_tokens, t5xxl_tokens] + lg_out, t5_out, lg_pooled = sd3_text_encoding_strategy.encode_tokens( + tokenize_strategy, models, tokens_and_masks, self.apply_lg_attn_mask, self.apply_t5_attn_mask ) if lg_out.dtype == torch.bfloat16: @@ -148,10 +196,22 @@ def cache_batch_outputs( lg_pooled_i = lg_pooled[i] if self.cache_to_disk: + clip_l_attn_mask, clip_g_attn_mask, t5_attn_mask = tokens_and_masks[3:6] + clip_l_attn_mask_i = clip_l_attn_mask[i].cpu().numpy() + clip_g_attn_mask_i = clip_g_attn_mask[i].cpu().numpy() + t5_attn_mask_i = t5_attn_mask[i].cpu().numpy() if t5_attn_mask is not None else None # shouldn't be None kwargs = {} if t5_out is not None: kwargs["t5_out"] = t5_out_i - np.savez(info.text_encoder_outputs_npz, lg_out=lg_out_i, lg_pooled=lg_pooled_i, **kwargs) + np.savez( + info.text_encoder_outputs_npz, + lg_out=lg_out_i, + lg_pooled=lg_pooled_i, + clip_l_attn_mask=clip_l_attn_mask_i, + clip_g_attn_mask=clip_g_attn_mask_i, + t5_attn_mask=t5_attn_mask_i, + **kwargs, + ) else: info.text_encoder_outputs = (lg_out_i, t5_out_i, lg_pooled_i) diff --git a/library/train_util.py b/library/train_util.py index a747e0478..fc458a884 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -646,7 +646,7 @@ def __init__( # caching self.caching_mode = None # None, 'latents', 'text' - + self.tokenize_strategy = None self.text_encoder_output_caching_strategy = None self.latents_caching_strategy = None @@ -1486,6 +1486,7 @@ def __getitem__(self, index): text_encoder_outputs = self.text_encoder_output_caching_strategy.load_outputs_npz( image_info.text_encoder_outputs_npz ) + text_encoder_outputs = [torch.FloatTensor(x) for x in text_encoder_outputs] else: tokenization_required = True text_encoder_outputs_list.append(text_encoder_outputs) diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py index e9e61af1b..630da7e08 100644 --- a/sd3_minimal_inference.py +++ b/sd3_minimal_inference.py @@ -146,6 +146,8 @@ def do_sample( parser.add_argument("--clip_l", type=str, required=False) parser.add_argument("--t5xxl", type=str, required=False) parser.add_argument("--t5xxl_token_length", type=int, default=77, help="t5xxl token length, default: 77") + parser.add_argument("--apply_lg_attn_mask", action="store_true") + 
parser.add_argument("--apply_t5_attn_mask", action="store_true") parser.add_argument("--prompt", type=str, default="A photo of a cat") # parser.add_argument("--prompt2", type=str, default=None) # do not support different prompts for text encoders parser.add_argument("--negative_prompt", type=str, default="") @@ -323,15 +325,15 @@ def do_sample( logger.info("Encoding prompts...") encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy() - l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(args.prompt) + tokens_and_masks = tokenize_strategy.tokenize(args.prompt) lg_out, t5_out, pooled = encoding_strategy.encode_tokens( - tokenize_strategy, [clip_l, clip_g, t5xxl], [l_tokens, g_tokens, t5_tokens] + tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask ) cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) - l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(args.negative_prompt) + tokens_and_masks = tokenize_strategy.tokenize(args.negative_prompt) lg_out, t5_out, pooled = encoding_strategy.encode_tokens( - tokenize_strategy, [clip_l, clip_g, t5xxl], [l_tokens, g_tokens, t5_tokens] + tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask ) neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) diff --git a/sd3_train.py b/sd3_train.py index 2f4ea8cb2..9c37cbce6 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -172,6 +172,8 @@ def train(args): args.text_encoder_batch_size, False, False, + False, + False, ) ) train_dataset_group.set_current_strategies() @@ -312,6 +314,8 @@ def train(args): args.text_encoder_batch_size, False, train_clip_g or train_clip_l or args.use_t5xxl_cache_only, + args.apply_lg_attn_mask, + args.apply_t5_attn_mask, ) strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy) @@ -335,7 +339,11 @@ def train(args): logger.info(f"cache Text Encoder outputs for prompt: {p}") tokens_list = sd3_tokenize_strategy.tokenize(p) sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( - sd3_tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_list + sd3_tokenize_strategy, + [clip_l, clip_g, t5xxl], + tokens_list, + args.apply_lg_attn_mask, + args.apply_t5_attn_mask, ) accelerator.wait_for_everyone() @@ -748,21 +756,23 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): if lg_out is None or (train_clip_l or train_clip_g): # not cached or training, so get from text encoders - input_ids_clip_l, input_ids_clip_g, _ = batch["input_ids_list"] + input_ids_clip_l, input_ids_clip_g, _, l_attn_mask, g_attn_mask, _ = batch["input_ids_list"] with torch.set_grad_enabled(args.train_text_encoder): # TODO support weighted captions input_ids_clip_l = input_ids_clip_l.to(accelerator.device) input_ids_clip_g = input_ids_clip_g.to(accelerator.device) lg_out, _, lg_pooled = text_encoding_strategy.encode_tokens( - sd3_tokenize_strategy, [clip_l, clip_g, None], [input_ids_clip_l, input_ids_clip_g, None] + sd3_tokenize_strategy, + [clip_l, clip_g, None], + [input_ids_clip_l, input_ids_clip_g, None, l_attn_mask, g_attn_mask, None], ) if t5_out is None: - _, _, input_ids_t5xxl = batch["input_ids_list"] + _, _, input_ids_t5xxl, _, _, t5_attn_mask = batch["input_ids_list"] with torch.no_grad(): input_ids_t5xxl = input_ids_t5xxl.to(accelerator.device) if t5_out is None else None _, t5_out, _ = text_encoding_strategy.encode_tokens( - sd3_tokenize_strategy, [None, None, t5xxl], [None, None, 
input_ids_t5xxl] + sd3_tokenize_strategy, [None, None, t5xxl], [None, None, input_ids_t5xxl, None, None, t5_attn_mask] ) context, lg_pooled = text_encoding_strategy.concat_encodings(lg_out, t5_out, lg_pooled) @@ -969,6 +979,16 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="maximum token length for T5-XXL. 256 if omitted / T5-XXLの最大トークン数。省略時は256", ) + parser.add_argument( + "--apply_lg_attn_mask", + action="store_true", + help="apply attention mask (zero embs) to CLIP-L and G / CLIP-LとGにアテンションマスク(ゼロ埋め)を適用する", + ) + parser.add_argument( + "--apply_t5_attn_mask", + action="store_true", + help="apply attention mask (zero embs) to T5-XXL / T5-XXLにアテンションマスク(ゼロ埋め)を適用する", + ) # TE training is disabled temporarily # parser.add_argument( From 36b2e6fc288c57f496a061e4d638f5641c32c9ea Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 9 Aug 2024 22:56:48 +0900 Subject: [PATCH 060/582] add FLUX.1 LoRA training --- README.md | 20 + flux_minimal_inference.py | 390 ++++++++++++++++ flux_train_network.py | 332 ++++++++++++++ library/flux_models.py | 920 ++++++++++++++++++++++++++++++++++++++ library/flux_utils.py | 215 +++++++++ library/sd3_models.py | 22 +- library/strategy_flux.py | 244 ++++++++++ networks/lora_flux.py | 730 ++++++++++++++++++++++++++++++ sdxl_train_network.py | 5 + train_network.py | 169 ++++--- 10 files changed, 2992 insertions(+), 55 deletions(-) create mode 100644 flux_minimal_inference.py create mode 100644 flux_train_network.py create mode 100644 library/flux_models.py create mode 100644 library/flux_utils.py create mode 100644 library/strategy_flux.py create mode 100644 networks/lora_flux.py diff --git a/README.md b/README.md index d406fecde..a0b02f108 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,25 @@ This repository contains training, generation and utility scripts for Stable Diffusion. +## FLUX.1 LoRA training (WIP) + +__Aug 9, 2024__: + +Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. + +We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. + +``` +accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py --pretrained_model_name_or_path flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 --network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml --output_dir path/to/output/dir --output_name flux-lora-name +``` + +The inference script is also available. The script is `flux_minimal_inference.py`. See `--help` for options. + +``` +python flux_minimal_inference.py --ckpt flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --dtype bf16 --prompt "a cat holding a sign that says hello world" --out path/to/output/dir --seed 1 --flux_dtype fp8 --offload --lora lora-flux-name.safetensors +``` + +Unfortnately the training result is not good. Please let us know if you have any idea to improve the training. 
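The sampling in `flux_minimal_inference.py` boils down to Euler integration of a rectified-flow velocity over a shifted timestep schedule. A minimal sketch under simplifying assumptions (the `model(img, t)` signature is illustrative only; the real script also passes text conditioning, positional ids and guidance):

```
import math
import torch

def time_shift(mu: float, sigma: float, t: torch.Tensor) -> torch.Tensor:
    # shift the schedule toward high-noise timesteps (larger mu = stronger shift)
    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

def sample(model, img: torch.Tensor, num_steps: int, mu: float = 1.0) -> torch.Tensor:
    # schedule runs from t=1 (pure noise) down to t=0
    timesteps = time_shift(mu, 1.0, torch.linspace(1, 0, num_steps + 1)).tolist()
    for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
        pred = model(img, t_vec)              # assumed to predict the flow velocity at t_curr
        img = img + (t_prev - t_curr) * pred  # Euler step from t_curr to t_prev
    return img
```

The shift keeps more of the step budget at high noise levels; in the script, `mu` is estimated from the packed image sequence length, and the shift is skipped for the schnell variant.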
+ ## SD3 training SD3 training is done with `sd3_train.py`. diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py new file mode 100644 index 000000000..f3affca80 --- /dev/null +++ b/flux_minimal_inference.py @@ -0,0 +1,390 @@ +# Minimum Inference Code for FLUX + +import argparse +import datetime +import math +import os +import random +from typing import Callable, Optional, Tuple +import einops +import numpy as np + +import torch +from safetensors.torch import safe_open, load_file +from tqdm import tqdm +from PIL import Image +import accelerate + +from library import device_utils +from library.device_utils import init_ipex, get_preferred_device + +init_ipex() + + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +import networks.lora_flux as lora_flux +from library import flux_models, flux_utils, sd3_utils, strategy_flux + + +def time_shift(mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + +def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]: + m = (y2 - y1) / (x2 - x1) + b = y1 - m * x1 + return lambda x: m * x + b + + +def get_schedule( + num_steps: int, + image_seq_len: int, + base_shift: float = 0.5, + max_shift: float = 1.15, + shift: bool = True, +) -> list[float]: + # extra step for zero + timesteps = torch.linspace(1, 0, num_steps + 1) + + # shifting the schedule to favor high timesteps for higher signal images + if shift: + # eastimate mu based on linear estimation between two points + mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len) + timesteps = time_shift(mu, 1.0, timesteps) + + return timesteps.tolist() + + +def denoise( + model: flux_models.Flux, + img: torch.Tensor, + img_ids: torch.Tensor, + txt: torch.Tensor, + txt_ids: torch.Tensor, + vec: torch.Tensor, + timesteps: list[float], + guidance: float = 4.0, +): + # this is ignored for schnell + guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) + for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): + t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) + pred = model(img=img, img_ids=img_ids, txt=txt, txt_ids=txt_ids, y=vec, timesteps=t_vec, guidance=guidance_vec) + + img = img + (t_prev - t_curr) * pred + + return img + + +def do_sample( + accelerator: Optional[accelerate.Accelerator], + model: flux_models.Flux, + img: torch.Tensor, + img_ids: torch.Tensor, + l_pooled: torch.Tensor, + t5_out: torch.Tensor, + txt_ids: torch.Tensor, + num_steps: int, + guidance: float, + is_schnell: bool, + device: torch.device, + flux_dtype: torch.dtype, +): + timesteps = get_schedule(num_steps, img.shape[1], shift=not is_schnell) + + # denoise initial noise + if accelerator: + with accelerator.autocast(), torch.no_grad(): + x = denoise(model, img, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=guidance) + else: + with torch.autocast(device_type=device.type, dtype=flux_dtype), torch.no_grad(): + x = denoise(model, img, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=guidance) + + return x + + +def generate_image( + model, + clip_l, + t5xxl, + ae, + prompt: str, + seed: Optional[int], + image_width: int, + image_height: int, + steps: Optional[int], + guidance: float, +): + # make first noise with packed shape + # original: b,16,2*h//16,2*w//16, packed: b,h//16*w//16,16*2*2 + packed_latent_height, 
packed_latent_width = math.ceil(image_height / 16), math.ceil(image_width / 16) + noise = torch.randn( + 1, + packed_latent_height * packed_latent_width, + 16 * 2 * 2, + device=device, + dtype=dtype, + generator=torch.Generator(device=device).manual_seed(seed), + ) + + # prepare img and img ids + + # this is needed only for img2img + # img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) + # if img.shape[0] == 1 and bs > 1: + # img = repeat(img, "1 ... -> bs ...", bs=bs) + + # txt2img only needs img_ids + img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width) + + # prepare embeddings + logger.info("Encoding prompts...") + tokens_and_masks = tokenize_strategy.tokenize(prompt) + clip_l = clip_l.to(device) + t5xxl = t5xxl.to(device) + with torch.no_grad(): + if is_fp8(clip_l_dtype) or is_fp8(t5xxl_dtype): + clip_l.to(clip_l_dtype) + t5xxl.to(t5xxl_dtype) + with accelerator.autocast(): + _, t5_out, txt_ids = encoding_strategy.encode_tokens( + tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask + ) + else: + with torch.autocast(device_type=device.type, dtype=clip_l_dtype): + l_pooled, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) + with torch.autocast(device_type=device.type, dtype=t5xxl_dtype): + _, t5_out, txt_ids = encoding_strategy.encode_tokens( + tokenize_strategy, [None, t5xxl], tokens_and_masks, args.apply_t5_attn_mask + ) + + # NaN check + if torch.isnan(l_pooled).any(): + raise ValueError("NaN in l_pooled") + if torch.isnan(t5_out).any(): + raise ValueError("NaN in t5_out") + + if args.offload: + clip_l = clip_l.cpu() + t5xxl = t5xxl.cpu() + # del clip_l, t5xxl + device_utils.clean_memory() + + # generate image + logger.info("Generating image...") + model = model.to(device) + if steps is None: + steps = 4 if is_schnell else 50 + + img_ids = img_ids.to(device) + x = do_sample( + accelerator, model, noise, img_ids, l_pooled, t5_out, txt_ids, steps, guidance_scale, is_schnell, device, flux_dtype + ) + if args.offload: + model = model.cpu() + # del model + device_utils.clean_memory() + + # unpack + x = x.float() + x = einops.rearrange(x, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=packed_latent_height, w=packed_latent_width, ph=2, pw=2) + + # decode + logger.info("Decoding image...") + ae = ae.to(device) + with torch.no_grad(): + if is_fp8(ae_dtype): + with accelerator.autocast(): + x = ae.decode(x) + else: + with torch.autocast(device_type=device.type, dtype=ae_dtype): + x = ae.decode(x) + if args.offload: + ae = ae.cpu() + + x = x.clamp(-1, 1) + x = x.permute(0, 2, 3, 1) + img = Image.fromarray((127.5 * (x + 1.0)).float().cpu().numpy().astype(np.uint8)[0]) + + # save image + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, f"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.png") + img.save(output_path) + + logger.info(f"Saved image to {output_path}") + + +if __name__ == "__main__": + target_height = 768 # 1024 + target_width = 1360 # 1024 + + # steps = 50 # 28 # 50 + # guidance_scale = 5 + # seed = 1 # None # 1 + + device = get_preferred_device() + + parser = argparse.ArgumentParser() + parser.add_argument("--ckpt_path", type=str, required=True) + parser.add_argument("--clip_l", type=str, required=False) + parser.add_argument("--t5xxl", type=str, required=False) + parser.add_argument("--ae", type=str, required=False) + parser.add_argument("--apply_t5_attn_mask", action="store_true") + 
parser.add_argument("--prompt", type=str, default="A photo of a cat") + parser.add_argument("--output_dir", type=str, default=".") + parser.add_argument("--dtype", type=str, default="bfloat16", help="base dtype") + parser.add_argument("--clip_l_dtype", type=str, default=None, help="dtype for clip_l") + parser.add_argument("--ae_dtype", type=str, default=None, help="dtype for ae") + parser.add_argument("--t5xxl_dtype", type=str, default=None, help="dtype for t5xxl") + parser.add_argument("--flux_dtype", type=str, default=None, help="dtype for flux") + parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--steps", type=int, default=None, help="Number of steps. Default is 4 for schnell, 50 for dev") + parser.add_argument("--guidance", type=float, default=3.5) + parser.add_argument("--offload", action="store_true", help="Offload to CPU") + parser.add_argument( + "--lora_weights", + type=str, + nargs="*", + default=[], + help="LoRA weights, only supports networks.lora_flux, each argument is a `path;multiplier` (semi-colon separated)", + ) + parser.add_argument("--width", type=int, default=target_width) + parser.add_argument("--height", type=int, default=target_height) + parser.add_argument("--interactive", action="store_true") + args = parser.parse_args() + + seed = args.seed + steps = args.steps + guidance_scale = args.guidance + + name = "schnell" if "schnell" in args.ckpt_path else "dev" # TODO change this to a more robust way + is_schnell = name == "schnell" + + def str_to_dtype(s: Optional[str], default_dtype: Optional[torch.dtype] = None) -> torch.dtype: + if s is None: + return default_dtype + if s in ["bf16", "bfloat16"]: + return torch.bfloat16 + elif s in ["fp16", "float16"]: + return torch.float16 + elif s in ["fp32", "float32"]: + return torch.float32 + elif s in ["fp8_e4m3fn", "e4m3fn", "float8_e4m3fn"]: + return torch.float8_e4m3fn + elif s in ["fp8_e4m3fnuz", "e4m3fnuz", "float8_e4m3fnuz"]: + return torch.float8_e4m3fnuz + elif s in ["fp8_e5m2", "e5m2", "float8_e5m2"]: + return torch.float8_e5m2 + elif s in ["fp8_e5m2fnuz", "e5m2fnuz", "float8_e5m2fnuz"]: + return torch.float8_e5m2fnuz + elif s in ["fp8", "float8"]: + return torch.float8_e4m3fn # default fp8 + else: + raise ValueError(f"Unsupported dtype: {s}") + + def is_fp8(dt): + return dt in [torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz] + + dtype = str_to_dtype(args.dtype) + clip_l_dtype = str_to_dtype(args.clip_l_dtype, dtype) + t5xxl_dtype = str_to_dtype(args.t5xxl_dtype, dtype) + ae_dtype = str_to_dtype(args.ae_dtype, dtype) + flux_dtype = str_to_dtype(args.flux_dtype, dtype) + + logger.info(f"Dtypes for clip_l, t5xxl, ae, flux: {clip_l_dtype}, {t5xxl_dtype}, {ae_dtype}, {flux_dtype}") + + loading_device = "cpu" if args.offload else device + + use_fp8 = [is_fp8(d) for d in [dtype, clip_l_dtype, t5xxl_dtype, ae_dtype, flux_dtype]] + if any(use_fp8): + accelerator = accelerate.Accelerator(mixed_precision="bf16") + else: + accelerator = None + + # load clip_l + logger.info(f"Loading clip_l from {args.clip_l}...") + clip_l = flux_utils.load_clip_l(args.clip_l, clip_l_dtype, loading_device) + clip_l.eval() + + logger.info(f"Loading t5xxl from {args.t5xxl}...") + t5xxl = flux_utils.load_t5xxl(args.t5xxl, t5xxl_dtype, loading_device) + t5xxl.eval() + + if is_fp8(clip_l_dtype): + clip_l = accelerator.prepare(clip_l) + if is_fp8(t5xxl_dtype): + t5xxl = accelerator.prepare(t5xxl) + + t5xxl_max_length = 256 if is_schnell else 512 + tokenize_strategy = 
strategy_flux.FluxTokenizeStrategy(t5xxl_max_length) + encoding_strategy = strategy_flux.FluxTextEncodingStrategy() + + # DiT + model = flux_utils.load_flow_model(name, args.ckpt_path, flux_dtype, loading_device) + model.eval() + logger.info(f"Casting model to {flux_dtype}") + model.to(flux_dtype) # make sure model is dtype + if is_fp8(flux_dtype): + model = accelerator.prepare(model) + + # AE + ae = flux_utils.load_ae(name, args.ae, ae_dtype, loading_device) + ae.eval() + if is_fp8(ae_dtype): + ae = accelerator.prepare(ae) + + # LoRA + for weights_file in args.lora_weights: + if ";" in weights_file: + weights_file, multiplier = weights_file.split(";") + multiplier = float(multiplier) + else: + multiplier = 1.0 + + lora_model, weights_sd = lora_flux.create_network_from_weights( + multiplier, weights_file, ae, [clip_l, t5xxl], model, None, True + ) + lora_model.merge_to([clip_l, t5xxl], model, weights_sd) + + if not args.interactive: + generate_image(model, clip_l, t5xxl, ae, args.prompt, args.seed, args.width, args.height, args.steps, args.guidance) + else: + # loop for interactive + width = target_width + height = target_height + steps = None + guidance = args.guidance + + while True: + print("Enter prompt (empty to exit). Options: --w --h --s --d --g ") + prompt = input() + if prompt == "": + break + + # parse options + options = prompt.split("--") + prompt = options[0].strip() + seed = None + for opt in options[1:]: + opt = opt.strip() + if opt.startswith("w"): + width = int(opt[1:].strip()) + elif opt.startswith("h"): + height = int(opt[1:].strip()) + elif opt.startswith("s"): + steps = int(opt[1:].strip()) + elif opt.startswith("d"): + seed = int(opt[1:].strip()) + elif opt.startswith("g"): + guidance = float(opt[1:].strip()) + + generate_image(model, clip_l, t5xxl, ae, prompt, seed, width, height, steps, guidance) + + logger.info("Done!") diff --git a/flux_train_network.py b/flux_train_network.py new file mode 100644 index 000000000..7c762c86d --- /dev/null +++ b/flux_train_network.py @@ -0,0 +1,332 @@ +import argparse +import copy +import math +import random +from typing import Any + +import torch +from accelerate import Accelerator +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() + +from library import flux_models, flux_utils, sd3_train_utils, sd3_utils, sdxl_model_util, sdxl_train_util, strategy_flux, train_util +import train_network +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +class FluxNetworkTrainer(train_network.NetworkTrainer): + def __init__(self): + super().__init__() + + def assert_extra_args(self, args, train_dataset_group): + super().assert_extra_args(args, train_dataset_group) + # sdxl_train_util.verify_sdxl_training_args(args) + + if args.cache_text_encoder_outputs: + assert ( + train_dataset_group.is_text_encoder_output_cacheable() + ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" + + assert ( + args.network_train_unet_only or not args.cache_text_encoder_outputs + ), "network for Text Encoder cannot be trained with caching Text Encoder outputs / Text Encoderの出力をキャッシュしながらText Encoderのネットワークを学習することはできません" + + train_dataset_group.verify_bucket_reso_steps(32) + + def load_target_model(self, args, weight_dtype, accelerator): + # currently offload to cpu 
for some models + + clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu") + clip_l.eval() + + # loading t5xxl to cpu takes a long time, so we should load to gpu in future + t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu") + t5xxl.eval() + + name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" # TODO change this to a more robust way + # if we load to cpu, flux.to(fp8) takes a long time + model = flux_utils.load_flow_model(name, args.pretrained_model_name_or_path, weight_dtype, "cpu") + ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu") + + return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model + + def get_tokenize_strategy(self, args): + return strategy_flux.FluxTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) + + def get_tokenizers(self, tokenize_strategy: strategy_flux.FluxTokenizeStrategy): + return [tokenize_strategy.clip_l, tokenize_strategy.t5xxl] + + def get_latents_caching_strategy(self, args): + latents_caching_strategy = strategy_flux.FluxLatentsCachingStrategy(args.cache_latents_to_disk, args.vae_batch_size, False) + return latents_caching_strategy + + def get_text_encoding_strategy(self, args): + return strategy_flux.FluxTextEncodingStrategy() + + def get_models_for_text_encoding(self, args, accelerator, text_encoders): + return text_encoders # + [accelerator.unwrap_model(text_encoders[-1])] + + def get_text_encoder_outputs_caching_strategy(self, args): + if args.cache_text_encoder_outputs: + return strategy_flux.FluxTextEncoderOutputsCachingStrategy(args.cache_text_encoder_outputs_to_disk, None, False) + else: + return None + + def cache_text_encoder_outputs_if_needed( + self, args, accelerator: Accelerator, unet, vae, text_encoders, dataset: train_util.DatasetGroup, weight_dtype + ): + if args.cache_text_encoder_outputs: + if not args.lowram: + # メモリ消費を減らす + logger.info("move vae and unet to cpu to save memory") + org_vae_device = vae.device + org_unet_device = unet.device + vae.to("cpu") + unet.to("cpu") + clean_memory_on_device(accelerator.device) + + # When TE is not be trained, it will not be prepared so we need to use explicit autocast + logger.info("move text encoders to gpu") + text_encoders[0].to(accelerator.device, dtype=weight_dtype) + text_encoders[1].to(accelerator.device, dtype=weight_dtype) + with accelerator.autocast(): + dataset.new_cache_text_encoder_outputs(text_encoders, accelerator.is_main_process) + accelerator.wait_for_everyone() + + logger.info("move text encoders back to cpu") + text_encoders[0].to("cpu") # , dtype=torch.float32) # Text Encoder doesn't work with fp16 on CPU + text_encoders[1].to("cpu") # , dtype=torch.float32) + clean_memory_on_device(accelerator.device) + + if not args.lowram: + logger.info("move vae and unet back to original device") + vae.to(org_vae_device) + unet.to(org_unet_device) + else: + # Text Encoderから毎回出力を取得するので、GPUに乗せておく + text_encoders[0].to(accelerator.device, dtype=weight_dtype) + text_encoders[1].to(accelerator.device, dtype=weight_dtype) + + # def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): + # noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype + + # # get size embeddings + # orig_size = batch["original_sizes_hw"] + # crop_size = batch["crop_top_lefts"] + # target_size = batch["target_sizes_hw"] + # embs = sdxl_train_util.get_size_embeddings(orig_size, crop_size, target_size, accelerator.device).to(weight_dtype) + + # # concat 
embeddings + # encoder_hidden_states1, encoder_hidden_states2, pool2 = text_conds + # vector_embedding = torch.cat([pool2, embs], dim=1).to(weight_dtype) + # text_embedding = torch.cat([encoder_hidden_states1, encoder_hidden_states2], dim=2).to(weight_dtype) + + # noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) + # return noise_pred + + def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet): + # logger.warning("Sampling images is not supported for Flux model") + pass + + def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: + noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) + self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) + return noise_scheduler + + def encode_images_to_latents(self, args, accelerator, vae, images): + return vae.encode(images).latent_dist.sample() + + def shift_scale_latents(self, args, latents): + return latents + + def get_noise_pred_and_target( + self, + args, + accelerator, + noise_scheduler, + latents, + batch, + text_encoder_conds, + unet: flux_models.Flux, + network, + weight_dtype, + train_unet, + ): + # copy from sd3_train.py and modified + + def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): + sigmas = self.noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype) + schedule_timesteps = self.noise_scheduler_copy.timesteps.to(accelerator.device) + timesteps = timesteps.to(accelerator.device) + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < n_dim: + sigma = sigma.unsqueeze(-1) + return sigma + + def compute_density_for_timestep_sampling( + weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None + ): + """Compute the density for sampling the timesteps when doing SD3 training. + + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. + """ + if weighting_scheme == "logit_normal": + # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$). + u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu") + u = torch.nn.functional.sigmoid(u) + elif weighting_scheme == "mode": + u = torch.rand(size=(batch_size,), device="cpu") + u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u) + else: + u = torch.rand(size=(batch_size,), device="cpu") + return u + + def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): + """Computes loss weighting scheme for SD3 training. + + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. 
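+            Supported schemes: "sigma_sqrt" weights the loss by 1/sigma**2, "cosmap" by 2/(pi * (1 - 2*sigma + 2*sigma**2));
+            any other value falls back to uniform weighting.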
+ """ + if weighting_scheme == "sigma_sqrt": + weighting = (sigmas**-2.0).float() + elif weighting_scheme == "cosmap": + bot = 1 - 2 * sigmas + 2 * sigmas**2 + weighting = 2 / (math.pi * bot) + else: + weighting = torch.ones_like(sigmas) + return weighting + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + + # Sample a random timestep for each image + # for weighting schemes where we sample timesteps non-uniformly + u = compute_density_for_timestep_sampling( + weighting_scheme=args.weighting_scheme, + batch_size=bsz, + logit_mean=args.logit_mean, + logit_std=args.logit_std, + mode_scale=args.mode_scale, + ) + indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long() + timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=accelerator.device) + + # Add noise according to flow matching. + sigmas = get_sigmas(timesteps, n_dim=latents.ndim, dtype=weight_dtype) + noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + + # pack latents and get img_ids + packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4 + packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2 + img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device) + + # get guidance + guidance_vec = torch.full((bsz,), args.guidance_scale, device=accelerator.device) + + # ensure the hidden state will require grad + if args.gradient_checkpointing: + noisy_model_input.requires_grad_(True) + for t in text_encoder_conds: + t.requires_grad_(True) + img_ids.requires_grad_(True) + guidance_vec.requires_grad_(True) + + # Predict the noise residual + l_pooled, t5_out, txt_ids = text_encoder_conds + # print( + # f"model_input: {noisy_model_input.shape}, img_ids: {img_ids.shape}, t5_out: {t5_out.shape}, txt_ids: {txt_ids.shape}, l_pooled: {l_pooled.shape}, timesteps: {timesteps.shape}, guidance_vec: {guidance_vec.shape}" + # ) + + with accelerator.autocast(): + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing) + model_pred = unet( + img=packed_noisy_model_input, + img_ids=img_ids, + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + timesteps=timesteps / 1000, + guidance=guidance_vec, + ) + + # unpack latents + model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) + + model_pred = model_pred * (-sigmas) + noisy_model_input + + # these weighting schemes use a uniform timestep sampling + # and instead post-weight the loss + weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) + + # flow matching loss: this is different from SD3 + target = noise - latents + + return model_pred, target, timesteps, None, weighting + + def post_process_loss(self, loss, args, timesteps, noise_scheduler): + return loss + + +def setup_parser() -> argparse.ArgumentParser: + parser = train_network.setup_parser() + # sdxl_train_util.add_sdxl_training_arguments(parser) + parser.add_argument("--clip_l", type=str, help="path to clip_l") + parser.add_argument("--t5xxl", type=str, help="path to t5xxl") + parser.add_argument("--ae", type=str, help="path to ae") + parser.add_argument("--apply_t5_attn_mask", action="store_true") + parser.add_argument( + "--cache_text_encoder_outputs", action="store_true", help="cache text 
encoder outputs / text encoderの出力をキャッシュする" + ) + parser.add_argument( + "--cache_text_encoder_outputs_to_disk", + action="store_true", + help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", + ) + + # copy from Diffusers + parser.add_argument( + "--weighting_scheme", + type=str, + default="none", + choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"], + ) + parser.add_argument( + "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme." + ) + parser.add_argument("--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme.") + parser.add_argument( + "--mode_scale", + type=float, + default=1.29, + help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.", + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=3.5, + help="the FLUX.1 dev variant is a guidance distilled model", + ) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + trainer = FluxNetworkTrainer() + trainer.train(args) diff --git a/library/flux_models.py b/library/flux_models.py new file mode 100644 index 000000000..d0955e375 --- /dev/null +++ b/library/flux_models.py @@ -0,0 +1,920 @@ +# copy from FLUX repo: https://github.com/black-forest-labs/flux +# license: Apache-2.0 License + + +from dataclasses import dataclass +import math + +import torch +from einops import rearrange +from torch import Tensor, nn +from torch.utils.checkpoint import checkpoint + +# USE_REENTRANT = True + + +@dataclass +class FluxParams: + in_channels: int + vec_in_dim: int + context_in_dim: int + hidden_size: int + mlp_ratio: float + num_heads: int + depth: int + depth_single_blocks: int + axes_dim: list[int] + theta: int + qkv_bias: bool + guidance_embed: bool + + +# region autoencoder + + +@dataclass +class AutoEncoderParams: + resolution: int + in_channels: int + ch: int + out_ch: int + ch_mult: list[int] + num_res_blocks: int + z_channels: int + scale_factor: float + shift_factor: float + + +def swish(x: Tensor) -> Tensor: + return x * torch.sigmoid(x) + + +class AttnBlock(nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.in_channels = in_channels + + self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + + self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1) + self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1) + + def attention(self, h_: Tensor) -> Tensor: + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + b, c, h, w = q.shape + q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous() + k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous() + v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous() + h_ = nn.functional.scaled_dot_product_attention(q, k, v) + + return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x: Tensor) -> Tensor: + return x + self.proj_out(self.attention(x)) + + +class ResnetBlock(nn.Module): + def __init__(self, in_channels: int, out_channels: int): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = 
out_channels + + self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True) + self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h = x + h = self.norm1(h) + h = swish(h) + h = self.conv1(h) + + h = self.norm2(h) + h = swish(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + x = self.nin_shortcut(x) + + return x + h + + +class Downsample(nn.Module): + def __init__(self, in_channels: int): + super().__init__() + # no asymmetric padding in torch conv, must do it ourselves + self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x: Tensor): + pad = (0, 1, 0, 1) + x = nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + return x + + +class Upsample(nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x: Tensor): + x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + x = self.conv(x) + return x + + +class Encoder(nn.Module): + def __init__( + self, + resolution: int, + in_channels: int, + ch: int, + ch_mult: list[int], + num_res_blocks: int, + z_channels: int, + ): + super().__init__() + self.ch = ch + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + # downsampling + self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + block_in = self.ch + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for _ in range(self.num_res_blocks): + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out)) + block_in = block_out + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in) + self.mid.attn_1 = AttnBlock(block_in) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in) + + # end + self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) + self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x: Tensor) -> Tensor: + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1]) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h) + h = self.mid.attn_1(h) + h = self.mid.block_2(h) + # end + h = self.norm_out(h) + h = swish(h) + h = 
self.conv_out(h) + return h + + +class Decoder(nn.Module): + def __init__( + self, + ch: int, + out_ch: int, + ch_mult: list[int], + num_res_blocks: int, + in_channels: int, + resolution: int, + z_channels: int, + ): + super().__init__() + self.ch = ch + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.ffactor = 2 ** (self.num_resolutions - 1) + + # compute in_ch_mult, block_in and curr_res at lowest res + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + + # z to block_in + self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in) + self.mid.attn_1 = AttnBlock(block_in) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for _ in range(self.num_res_blocks + 1): + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out)) + block_in = block_out + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) + self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, z: Tensor) -> Tensor: + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h) + h = self.mid.attn_1(h) + h = self.mid.block_2(h) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + h = self.norm_out(h) + h = swish(h) + h = self.conv_out(h) + return h + + +class DiagonalGaussian(nn.Module): + def __init__(self, sample: bool = True, chunk_dim: int = 1): + super().__init__() + self.sample = sample + self.chunk_dim = chunk_dim + + def forward(self, z: Tensor) -> Tensor: + mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim) + if self.sample: + std = torch.exp(0.5 * logvar) + return mean + std * torch.randn_like(mean) + else: + return mean + + +class AutoEncoder(nn.Module): + def __init__(self, params: AutoEncoderParams): + super().__init__() + self.encoder = Encoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + z_channels=params.z_channels, + ) + self.decoder = Decoder( + resolution=params.resolution, + in_channels=params.in_channels, + ch=params.ch, + out_ch=params.out_ch, + ch_mult=params.ch_mult, + num_res_blocks=params.num_res_blocks, + z_channels=params.z_channels, + ) + self.reg = DiagonalGaussian() + + self.scale_factor = params.scale_factor + self.shift_factor = params.shift_factor + + @property + def device(self) -> torch.device: + return next(self.parameters()).device + + @property + def dtype(self) -> torch.dtype: + return next(self.parameters()).dtype + + def encode(self, x: Tensor) -> Tensor: + z = 
self.reg(self.encoder(x)) + z = self.scale_factor * (z - self.shift_factor) + return z + + def decode(self, z: Tensor) -> Tensor: + z = z / self.scale_factor + self.shift_factor + return self.decoder(z) + + def forward(self, x: Tensor) -> Tensor: + return self.decode(self.encode(x)) + + +# endregion +# region config + + +@dataclass +class ModelSpec: + params: FluxParams + ae_params: AutoEncoderParams + ckpt_path: str | None + ae_path: str | None + # repo_id: str | None + # repo_flow: str | None + # repo_ae: str | None + + +configs = { + "dev": ModelSpec( + # repo_id="black-forest-labs/FLUX.1-dev", + # repo_flow="flux1-dev.sft", + # repo_ae="ae.sft", + ckpt_path=None, # os.getenv("FLUX_DEV"), + params=FluxParams( + in_channels=64, + vec_in_dim=768, + context_in_dim=4096, + hidden_size=3072, + mlp_ratio=4.0, + num_heads=24, + depth=19, + depth_single_blocks=38, + axes_dim=[16, 56, 56], + theta=10_000, + qkv_bias=True, + guidance_embed=True, + ), + ae_path=None, # os.getenv("AE"), + ae_params=AutoEncoderParams( + resolution=256, + in_channels=3, + ch=128, + out_ch=3, + ch_mult=[1, 2, 4, 4], + num_res_blocks=2, + z_channels=16, + scale_factor=0.3611, + shift_factor=0.1159, + ), + ), + "schnell": ModelSpec( + # repo_id="black-forest-labs/FLUX.1-schnell", + # repo_flow="flux1-schnell.sft", + # repo_ae="ae.sft", + ckpt_path=None, # os.getenv("FLUX_SCHNELL"), + params=FluxParams( + in_channels=64, + vec_in_dim=768, + context_in_dim=4096, + hidden_size=3072, + mlp_ratio=4.0, + num_heads=24, + depth=19, + depth_single_blocks=38, + axes_dim=[16, 56, 56], + theta=10_000, + qkv_bias=True, + guidance_embed=False, + ), + ae_path=None, # os.getenv("AE"), + ae_params=AutoEncoderParams( + resolution=256, + in_channels=3, + ch=128, + out_ch=3, + ch_mult=[1, 2, 4, 4], + num_res_blocks=2, + z_channels=16, + scale_factor=0.3611, + shift_factor=0.1159, + ), + ), +} + + +# endregion + +# region math + + +def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor: + q, k = apply_rope(q, k, pe) + + x = torch.nn.functional.scaled_dot_product_attention(q, k, v) + x = rearrange(x, "B H L D -> B L (H D)") + + return x + + +def rope(pos: Tensor, dim: int, theta: int) -> Tensor: + assert dim % 2 == 0 + scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim + omega = 1.0 / (theta**scale) + out = torch.einsum("...n,d->...nd", pos, omega) + out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1) + out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2) + return out.float() + + +def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]: + xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2) + xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2) + xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] + xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1] + return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk) + + +# endregion + + +# region layers +class EmbedND(nn.Module): + def __init__(self, dim: int, theta: int, axes_dim: list[int]): + super().__init__() + self.dim = dim + self.theta = theta + self.axes_dim = axes_dim + + def forward(self, ids: Tensor) -> Tensor: + n_axes = ids.shape[-1] + emb = torch.cat( + [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], + dim=-3, + ) + + return emb.unsqueeze(1) + + +def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0): + """ + Create sinusoidal timestep embeddings. 
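+    Note: t is multiplied by time_factor (default 1000) before the sinusoidal frequencies are applied.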
+ :param t: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an (N, D) Tensor of positional embeddings. + """ + t = time_factor * t + half = dim // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(t.device) + + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + if torch.is_floating_point(t): + embedding = embedding.to(t) + return embedding + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + self.gradient_checkpointing = False + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + + def _forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + def forward(self, *args, **kwargs): + if self.training and self.gradient_checkpointing: + return checkpoint(self._forward, *args, use_reentrant=False, **kwargs) + else: + return self._forward(*args, **kwargs) + + # def forward(self, x): + # if self.training and self.gradient_checkpointing: + # def create_custom_forward(func): + # def custom_forward(*inputs): + # return func(*inputs) + # return custom_forward + # return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), x, use_reentrant=USE_REENTRANT) + # else: + # return self._forward(x) + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.scale = nn.Parameter(torch.ones(dim)) + + def forward(self, x: Tensor): + x_dtype = x.dtype + x = x.float() + rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) + # return (x * rrms).to(dtype=x_dtype) * self.scale + return ((x * rrms) * self.scale.float()).to(dtype=x_dtype) + + +class QKNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.query_norm = RMSNorm(dim) + self.key_norm = RMSNorm(dim) + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]: + q = self.query_norm(q) + k = self.key_norm(k) + return q.to(v), k.to(v) + + +class SelfAttention(nn.Module): + def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.norm = QKNorm(head_dim) + self.proj = nn.Linear(dim, dim) + + # self.gradient_checkpointing = False + + # def enable_gradient_checkpointing(self): + # self.gradient_checkpointing = True + + def forward(self, x: Tensor, pe: Tensor) -> Tensor: + qkv = self.qkv(x) + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + x = attention(q, k, v, pe=pe) + x = self.proj(x) + return x + + # def forward(self, *args, **kwargs): + # if self.training and self.gradient_checkpointing: + # return checkpoint(self._forward, *args, use_reentrant=False, **kwargs) + # else: + # return self._forward(*args, **kwargs) + + +@dataclass +class ModulationOut: + shift: Tensor + scale: Tensor + gate: Tensor + + +class 
Modulation(nn.Module): + def __init__(self, dim: int, double: bool): + super().__init__() + self.is_double = double + self.multiplier = 6 if double else 3 + self.lin = nn.Linear(dim, self.multiplier * dim, bias=True) + + def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]: + out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1) + + return ( + ModulationOut(*out[:3]), + ModulationOut(*out[3:]) if self.is_double else None, + ) + + +class DoubleStreamBlock(nn.Module): + def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False): + super().__init__() + + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.num_heads = num_heads + self.hidden_size = hidden_size + self.img_mod = Modulation(hidden_size, double=True) + self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + nn.GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + self.txt_mod = Modulation(hidden_size, double=True) + self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + nn.GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + self.gradient_checkpointing = False + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + # self.img_attn.enable_gradient_checkpointing() + # self.txt_attn.enable_gradient_checkpointing() + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + # self.img_attn.disable_gradient_checkpointing() + # self.txt_attn.disable_gradient_checkpointing() + + def _forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]: + img_mod1, img_mod2 = self.img_mod(vec) + txt_mod1, txt_mod2 = self.txt_mod(vec) + + # prepare image for attention + img_modulated = self.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = self.img_attn.qkv(img_modulated) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) + + # prepare txt for attention + txt_modulated = self.txt_norm1(txt) + txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = self.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) + + # run actual attention + q = torch.cat((txt_q, img_q), dim=2) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + attn = attention(q, k, v, pe=pe) + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] + + # calculate the img bloks + img = img + img_mod1.gate * self.img_attn.proj(img_attn) + img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) + + # calculate the txt bloks + txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) + txt = txt + txt_mod2.gate * 
self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) + return img, txt + + def forward(self, *args, **kwargs): + if self.training and self.gradient_checkpointing: + return checkpoint(self._forward, *args, use_reentrant=False, **kwargs) + else: + return self._forward(*args, **kwargs) + + # def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor): + # if self.training and self.gradient_checkpointing: + # def create_custom_forward(func): + # def custom_forward(*inputs): + # return func(*inputs) + # return custom_forward + # return torch.utils.checkpoint.checkpoint( + # create_custom_forward(self._forward), img, txt, vec, pe, use_reentrant=USE_REENTRANT + # ) + # else: + # return self._forward(img, txt, vec, pe) + + +class SingleStreamBlock(nn.Module): + """ + A DiT block with parallel linear layers as described in + https://arxiv.org/abs/2302.05442 and adapted modulation interface. + """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: float | None = None, + ): + super().__init__() + self.hidden_dim = hidden_size + self.num_heads = num_heads + head_dim = hidden_size // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + # qkv and mlp_in + self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim) + # proj and mlp_out + self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + self.norm = QKNorm(head_dim) + + self.hidden_size = hidden_size + self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + + self.mlp_act = nn.GELU(approximate="tanh") + self.modulation = Modulation(hidden_size, double=False) + + self.gradient_checkpointing = False + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + + def _forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: + mod, _ = self.modulation(vec) + x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift + qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) + + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + + # compute attention + attn = attention(q, k, v, pe=pe) + # compute activation in mlp stream, cat again and run second linear layer + output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + return x + mod.gate * output + + def forward(self, *args, **kwargs): + if self.training and self.gradient_checkpointing: + return checkpoint(self._forward, *args, use_reentrant=False, **kwargs) + else: + return self._forward(*args, **kwargs) + + # def forward(self, x: Tensor, vec: Tensor, pe: Tensor): + # if self.training and self.gradient_checkpointing: + # def create_custom_forward(func): + # def custom_forward(*inputs): + # return func(*inputs) + # return custom_forward + # return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), x, vec, pe, use_reentrant=USE_REENTRANT) + # else: + # return self._forward(x, vec, pe) + + +class LastLayer(nn.Module): + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) + + 
def forward(self, x: Tensor, vec: Tensor) -> Tensor: + shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) + x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.linear(x) + return x + + +# endregion + + +class Flux(nn.Module): + """ + Transformer model for flow matching on sequences. + """ + + def __init__(self, params: FluxParams): + super().__init__() + + self.params = params + self.in_channels = params.in_channels + self.out_channels = self.in_channels + if params.hidden_size % params.num_heads != 0: + raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}") + pe_dim = params.hidden_size // params.num_heads + if sum(params.axes_dim) != pe_dim: + raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size) + self.guidance_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity() + self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size) + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + qkv_bias=params.qkv_bias, + ) + for _ in range(params.depth) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio) + for _ in range(params.depth_single_blocks) + ] + ) + + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + + self.gradient_checkpointing = False + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + + self.time_in.enable_gradient_checkpointing() + self.vector_in.enable_gradient_checkpointing() + self.guidance_in.enable_gradient_checkpointing() + + for block in self.double_blocks + self.single_blocks: + block.enable_gradient_checkpointing() + + print("FLUX: Gradient checkpointing enabled.") + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + + self.time_in.disable_gradient_checkpointing() + self.vector_in.disable_gradient_checkpointing() + self.guidance_in.disable_gradient_checkpointing() + + for block in self.double_blocks + self.single_blocks: + block.disable_gradient_checkpointing() + + print("FLUX: Gradient checkpointing disabled.") + + def forward( + self, + img: Tensor, + img_ids: Tensor, + txt: Tensor, + txt_ids: Tensor, + timesteps: Tensor, + y: Tensor, + guidance: Tensor | None = None, + ) -> Tensor: + if img.ndim != 3 or txt.ndim != 3: + raise ValueError("Input img and txt tensors must have 3 dimensions.") + + # running on sequences img + img = self.img_in(img) + vec = self.time_in(timestep_embedding(timesteps, 256)) + if self.params.guidance_embed: + if guidance is None: + raise ValueError("Didn't get guidance strength for guidance distilled model.") + vec = vec + self.guidance_in(timestep_embedding(guidance, 256)) + vec = vec + self.vector_in(y) + txt = self.txt_in(txt) + + ids = torch.cat((txt_ids, img_ids), dim=1) + pe = self.pe_embedder(ids) + + for block in 
self.double_blocks: + img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + + img = torch.cat((txt, img), 1) + for block in self.single_blocks: + img = block(img, vec=vec, pe=pe) + img = img[:, txt.shape[1] :, ...] + + img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) + return img diff --git a/library/flux_utils.py b/library/flux_utils.py new file mode 100644 index 000000000..ba828d508 --- /dev/null +++ b/library/flux_utils.py @@ -0,0 +1,215 @@ +import json +from typing import Union +import einops +import torch + +from safetensors.torch import load_file +from accelerate import init_empty_weights +from transformers import CLIPTextModel, CLIPConfig, T5EncoderModel, T5Config + +from library import flux_models + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +MODEL_VERSION_FLUX_V1 = "flux1" + + +def load_flow_model(name: str, ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device]) -> flux_models.Flux: + logger.info(f"Bulding Flux model {name}") + with torch.device("meta"): + model = flux_models.Flux(flux_models.configs[name].params).to(dtype) + + # load_sft doesn't support torch.device + logger.info(f"Loading state dict from {ckpt_path}") + sd = load_file(ckpt_path, device=str(device)) + info = model.load_state_dict(sd, strict=False, assign=True) + logger.info(f"Loaded Flux: {info}") + return model + + +def load_ae(name: str, ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device]) -> flux_models.AutoEncoder: + logger.info("Building AutoEncoder") + with torch.device("meta"): + ae = flux_models.AutoEncoder(flux_models.configs[name].ae_params).to(dtype) + + logger.info(f"Loading state dict from {ckpt_path}") + sd = load_file(ckpt_path, device=str(device)) + info = ae.load_state_dict(sd, strict=False, assign=True) + logger.info(f"Loaded AE: {info}") + return ae + + +def load_clip_l(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device]) -> CLIPTextModel: + logger.info("Building CLIP") + CLIPL_CONFIG = { + "_name_or_path": "clip-vit-large-patch14/", + "architectures": ["CLIPModel"], + "initializer_factor": 1.0, + "logit_scale_init_value": 2.6592, + "model_type": "clip", + "projection_dim": 768, + # "text_config": { + "_name_or_path": "", + "add_cross_attention": False, + "architectures": None, + "attention_dropout": 0.0, + "bad_words_ids": None, + "bos_token_id": 0, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": None, + "decoder_start_token_id": None, + "diversity_penalty": 0.0, + "do_sample": False, + "dropout": 0.0, + "early_stopping": False, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "finetuning_task": None, + "forced_bos_token_id": None, + "forced_eos_token_id": None, + "hidden_act": "quick_gelu", + "hidden_size": 768, + "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 3072, + "is_decoder": False, + "is_encoder_decoder": False, + "label2id": {"LABEL_0": 0, "LABEL_1": 1}, + "layer_norm_eps": 1e-05, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 77, + "min_length": 0, + "model_type": "clip_text_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 12, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 12, + "num_return_sequences": 1, + "output_attentions": False, + "output_hidden_states": False, + "output_scores": False, + "pad_token_id": 1, + "prefix": None, + "problem_type": None, + "projection_dim": 
768, + "pruned_heads": {}, + "remove_invalid_values": False, + "repetition_penalty": 1.0, + "return_dict": True, + "return_dict_in_generate": False, + "sep_token_id": None, + "task_specific_params": None, + "temperature": 1.0, + "tie_encoder_decoder": False, + "tie_word_embeddings": True, + "tokenizer_class": None, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": None, + "torchscript": False, + "transformers_version": "4.16.0.dev0", + "use_bfloat16": False, + "vocab_size": 49408, + "hidden_act": "gelu", + "hidden_size": 1280, + "intermediate_size": 5120, + "num_attention_heads": 20, + "num_hidden_layers": 32, + # }, + # "text_config_dict": { + "hidden_size": 768, + "intermediate_size": 3072, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "projection_dim": 768, + # }, + # "torch_dtype": "float32", + # "transformers_version": None, + } + config = CLIPConfig(**CLIPL_CONFIG) + with init_empty_weights(): + clip = CLIPTextModel._from_config(config) + + logger.info(f"Loading state dict from {ckpt_path}") + sd = load_file(ckpt_path, device=str(device)) + info = clip.load_state_dict(sd, strict=False, assign=True) + logger.info(f"Loaded CLIP: {info}") + return clip + + +def load_t5xxl(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device]) -> T5EncoderModel: + T5_CONFIG_JSON = """ +{ + "architectures": [ + "T5EncoderModel" + ], + "classifier_dropout": 0.0, + "d_ff": 10240, + "d_kv": 64, + "d_model": 4096, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "num_decoder_layers": 24, + "num_heads": 64, + "num_layers": 24, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "use_cache": true, + "vocab_size": 32128 +} +""" + config = json.loads(T5_CONFIG_JSON) + config = T5Config(**config) + with init_empty_weights(): + t5xxl = T5EncoderModel._from_config(config) + + logger.info(f"Loading state dict from {ckpt_path}") + sd = load_file(ckpt_path, device=str(device)) + info = t5xxl.load_state_dict(sd, strict=False, assign=True) + logger.info(f"Loaded T5xxl: {info}") + return t5xxl + + +def prepare_img_ids(batch_size: int, packed_latent_height: int, packed_latent_width: int): + img_ids = torch.zeros(packed_latent_height, packed_latent_width, 3) + img_ids[..., 1] = img_ids[..., 1] + torch.arange(packed_latent_height)[:, None] + img_ids[..., 2] = img_ids[..., 2] + torch.arange(packed_latent_width)[None, :] + img_ids = einops.repeat(img_ids, "h w c -> b (h w) c", b=batch_size) + return img_ids + + +def unpack_latents(x: torch.Tensor, packed_latent_height: int, packed_latent_width: int) -> torch.Tensor: + """ + x: [b (h w) (c ph pw)] -> [b c (h ph) (w pw)], ph=2, pw=2 + """ + x = einops.rearrange(x, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=packed_latent_height, w=packed_latent_width, ph=2, pw=2) + return x + + +def pack_latents(x: torch.Tensor) -> torch.Tensor: + """ + x: [b c (h ph) (w pw)] -> [b (h w) (c ph pw)], ph=2, pw=2 + """ + x = einops.rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) + return x diff --git a/library/sd3_models.py b/library/sd3_models.py index 28378c73b..ec704dcba 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -15,6 +15,12 @@ 
import torch.nn.functional as F from torch.utils.checkpoint import checkpoint from transformers import CLIPTokenizer, T5TokenizerFast +from .utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) memory_efficient_attention = None @@ -95,7 +101,9 @@ def tokenize_with_weights(self, text: str, truncate_to_max_length=True, truncate batch.extend([(pad_token, 1.0)] * (self.min_length - len(batch))) # truncate to max_length - print(f"batch: {batch}, max_length: {self.max_length}, truncate: {truncate_to_max_length}, truncate_length: {truncate_length}") + print( + f"batch: {batch}, max_length: {self.max_length}, truncate: {truncate_to_max_length}, truncate_length: {truncate_length}" + ) if truncate_to_max_length and len(batch) > self.max_length: batch = batch[: self.max_length] if truncate_length is not None and len(batch) > truncate_length: @@ -1554,6 +1562,17 @@ def __init__( self.set_clip_options({"layer": layer_idx}) self.options_default = (self.layer, self.layer_idx, self.return_projected_pooled) + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + def gradient_checkpointing_enable(self): + logger.warning("Gradient checkpointing is not supported for this model") + def set_attn_mode(self, mode): raise NotImplementedError("This model does not support setting the attention mode") @@ -1925,6 +1944,7 @@ def create_clip_l(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[s return_projected_pooled=False, textmodel_json_config=CLIPL_CONFIG, ) + clip_l.gradient_checkpointing_enable() if state_dict is not None: # update state_dict if provided to include logit_scale and text_projection.weight avoid errors if "logit_scale" not in state_dict: diff --git a/library/strategy_flux.py b/library/strategy_flux.py new file mode 100644 index 000000000..f194ccf6e --- /dev/null +++ b/library/strategy_flux.py @@ -0,0 +1,244 @@ +import os +import glob +from typing import Any, List, Optional, Tuple, Union +import torch +import numpy as np +from transformers import CLIPTokenizer, T5TokenizerFast + +from library import sd3_utils, train_util +from library import sd3_models +from library.strategy_base import LatentsCachingStrategy, TextEncodingStrategy, TokenizeStrategy, TextEncoderOutputsCachingStrategy + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +CLIP_L_TOKENIZER_ID = "openai/clip-vit-large-patch14" +T5_XXL_TOKENIZER_ID = "google/t5-v1_1-xxl" + + +class FluxTokenizeStrategy(TokenizeStrategy): + def __init__(self, t5xxl_max_length: int = 256, tokenizer_cache_dir: Optional[str] = None) -> None: + self.t5xxl_max_length = t5xxl_max_length + self.clip_l = self._load_tokenizer(CLIPTokenizer, CLIP_L_TOKENIZER_ID, tokenizer_cache_dir=tokenizer_cache_dir) + self.t5xxl = self._load_tokenizer(T5TokenizerFast, T5_XXL_TOKENIZER_ID, tokenizer_cache_dir=tokenizer_cache_dir) + + def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: + text = [text] if isinstance(text, str) else text + + l_tokens = self.clip_l(text, max_length=77, padding="max_length", truncation=True, return_tensors="pt") + t5_tokens = self.t5xxl(text, max_length=self.t5xxl_max_length, padding="max_length", truncation=True, return_tensors="pt") + + t5_attn_mask = t5_tokens["attention_mask"] + l_tokens = l_tokens["input_ids"] + t5_tokens = t5_tokens["input_ids"] + + return [l_tokens, t5_tokens, t5_attn_mask] + + +class 
FluxTextEncodingStrategy(TextEncodingStrategy): + def __init__(self) -> None: + pass + + def encode_tokens( + self, + tokenize_strategy: TokenizeStrategy, + models: List[Any], + tokens: List[torch.Tensor], + apply_t5_attn_mask: bool = False, + ) -> List[torch.Tensor]: + # supports single model inference only + + clip_l, t5xxl = models + l_tokens, t5_tokens = tokens[:2] + t5_attn_mask = tokens[2] if len(tokens) > 2 else None + + if clip_l is not None and l_tokens is not None: + l_pooled = clip_l(l_tokens.to(clip_l.device))["pooler_output"] + else: + l_pooled = None + + if t5xxl is not None and t5_tokens is not None: + # t5_out is [1, max length, 4096] + t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), return_dict=False, output_hidden_states=True) + if apply_t5_attn_mask: + t5_out = t5_out * t5_attn_mask.to(t5_out.device).unsqueeze(-1) + txt_ids = torch.zeros(1, t5_out.shape[1], 3, device=t5_out.device) + else: + t5_out = None + txt_ids = None + + return [l_pooled, t5_out, txt_ids] + + +class FluxTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): + FLUX_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX = "_flux_te.npz" + + def __init__( + self, + cache_to_disk: bool, + batch_size: int, + skip_disk_cache_validity_check: bool, + is_partial: bool = False, + apply_t5_attn_mask: bool = False, + ) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check, is_partial) + self.apply_t5_attn_mask = apply_t5_attn_mask + + def get_outputs_npz_path(self, image_abs_path: str) -> str: + return os.path.splitext(image_abs_path)[0] + FluxTextEncoderOutputsCachingStrategy.FLUX_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX + + def is_disk_cached_outputs_expected(self, npz_path: str): + if not self.cache_to_disk: + return False + if not os.path.exists(npz_path): + return False + if self.skip_disk_cache_validity_check: + return True + + try: + npz = np.load(npz_path) + if "l_pooled" not in npz: + return False + if "t5_out" not in npz: + return False + if "txt_ids" not in npz: + return False + except Exception as e: + logger.error(f"Error loading file: {npz_path}") + raise e + + return True + + def mask_t5_attn(self, t5_out: np.ndarray, t5_attn_mask: np.ndarray) -> np.ndarray: + return t5_out * np.expand_dims(t5_attn_mask, -1) + + def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: + data = np.load(npz_path) + l_pooled = data["l_pooled"] + t5_out = data["t5_out"] + txt_ids = data["txt_ids"] + + if self.apply_t5_attn_mask: + t5_attn_mask = data["t5_attn_mask"] + t5_out = self.mask_t5_attn(t5_out, t5_attn_mask) + + return [l_pooled, t5_out, txt_ids] + + def cache_batch_outputs( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, infos: List + ): + flux_text_encoding_strategy: FluxTextEncodingStrategy = text_encoding_strategy + captions = [info.caption for info in infos] + + tokens_and_masks = tokenize_strategy.tokenize(captions) + with torch.no_grad(): + l_pooled, t5_out, txt_ids = flux_text_encoding_strategy.encode_tokens( + tokenize_strategy, models, tokens_and_masks, self.apply_t5_attn_mask + ) + + if l_pooled.dtype == torch.bfloat16: + l_pooled = l_pooled.float() + if t5_out.dtype == torch.bfloat16: + t5_out = t5_out.float() + if txt_ids.dtype == torch.bfloat16: + txt_ids = txt_ids.float() + + l_pooled = l_pooled.cpu().numpy() + t5_out = t5_out.cpu().numpy() + txt_ids = txt_ids.cpu().numpy() + + for i, info in enumerate(infos): + l_pooled_i = l_pooled[i] + t5_out_i = t5_out[i] + txt_ids_i = txt_ids[i] + + if self.cache_to_disk: + 
t5_attn_mask = tokens_and_masks[2] + t5_attn_mask_i = t5_attn_mask[i].cpu().numpy() + np.savez( + info.text_encoder_outputs_npz, + l_pooled=l_pooled_i, + t5_out=t5_out_i, + txt_ids=txt_ids_i, + t5_attn_mask=t5_attn_mask_i, + ) + else: + info.text_encoder_outputs = (l_pooled_i, t5_out_i, txt_ids_i) + + +class FluxLatentsCachingStrategy(LatentsCachingStrategy): + FLUX_LATENTS_NPZ_SUFFIX = "_flux.npz" + + def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check) + + def get_image_size_from_disk_cache_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: + npz_file = glob.glob(os.path.splitext(absolute_path)[0] + "_*" + FluxLatentsCachingStrategy.FLUX_LATENTS_NPZ_SUFFIX) + if len(npz_file) == 0: + return None, None + w, h = os.path.splitext(npz_file[0])[0].split("_")[-2].split("x") + return int(w), int(h) + + def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: + return ( + os.path.splitext(absolute_path)[0] + + f"_{image_size[0]:04d}x{image_size[1]:04d}" + + FluxLatentsCachingStrategy.FLUX_LATENTS_NPZ_SUFFIX + ) + + def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): + return self._default_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask) + + # TODO remove circular dependency for ImageInfo + def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool): + encode_by_vae = lambda img_tensor: vae.encode(img_tensor).to("cpu") + vae_device = vae.device + vae_dtype = vae.dtype + + self._default_cache_batch_latents(encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop) + + if not train_util.HIGH_VRAM: + train_util.clean_memory_on_device(vae.device) + + +if __name__ == "__main__": + # test code for FluxTokenizeStrategy + # tokenizer = sd3_models.SD3Tokenizer() + strategy = FluxTokenizeStrategy(256) + text = "hello world" + + l_tokens, g_tokens, t5_tokens = strategy.tokenize(text) + # print(l_tokens.shape) + print(l_tokens) + print(g_tokens) + print(t5_tokens) + + texts = ["hello world", "the quick brown fox jumps over the lazy dog"] + l_tokens_2 = strategy.clip_l(texts, max_length=77, padding="max_length", truncation=True, return_tensors="pt") + g_tokens_2 = strategy.clip_g(texts, max_length=77, padding="max_length", truncation=True, return_tensors="pt") + t5_tokens_2 = strategy.t5xxl( + texts, max_length=strategy.t5xxl_max_length, padding="max_length", truncation=True, return_tensors="pt" + ) + print(l_tokens_2) + print(g_tokens_2) + print(t5_tokens_2) + + # compare + print(torch.allclose(l_tokens, l_tokens_2["input_ids"][0])) + print(torch.allclose(g_tokens, g_tokens_2["input_ids"][0])) + print(torch.allclose(t5_tokens, t5_tokens_2["input_ids"][0])) + + text = ",".join(["hello world! 
this is long text"] * 50) + l_tokens, g_tokens, t5_tokens = strategy.tokenize(text) + print(l_tokens) + print(g_tokens) + print(t5_tokens) + + print(f"model max length l: {strategy.clip_l.model_max_length}") + print(f"model max length g: {strategy.clip_g.model_max_length}") + print(f"model max length t5: {strategy.t5xxl.model_max_length}") diff --git a/networks/lora_flux.py b/networks/lora_flux.py new file mode 100644 index 000000000..141137b46 --- /dev/null +++ b/networks/lora_flux.py @@ -0,0 +1,730 @@ +# temporary minimum implementation of LoRA +# FLUX doesn't have Conv2d, so we ignore it +# TODO commonize with the original implementation + +# LoRA network module +# reference: +# https://github.com/microsoft/LoRA/blob/main/loralib/layers.py +# https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py + +import math +import os +from typing import Dict, List, Optional, Tuple, Type, Union +from diffusers import AutoencoderKL +from transformers import CLIPTextModel +import numpy as np +import torch +import re +from library.utils import setup_logging +from library.sdxl_original_unet import SdxlUNet2DConditionModel + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +class LoRAModule(torch.nn.Module): + """ + replaces forward method of the original Linear, instead of replacing the original Linear module. + """ + + def __init__( + self, + lora_name, + org_module: torch.nn.Module, + multiplier=1.0, + lora_dim=4, + alpha=1, + dropout=None, + rank_dropout=None, + module_dropout=None, + ): + """if alpha == 0 or None, alpha is rank (no scaling).""" + super().__init__() + self.lora_name = lora_name + + if org_module.__class__.__name__ == "Conv2d": + in_dim = org_module.in_channels + out_dim = org_module.out_channels + else: + in_dim = org_module.in_features + out_dim = org_module.out_features + + self.lora_dim = lora_dim + + if org_module.__class__.__name__ == "Conv2d": + kernel_size = org_module.kernel_size + stride = org_module.stride + padding = org_module.padding + self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) + self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) + else: + self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) + self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) + + if type(alpha) == torch.Tensor: + alpha = alpha.detach().float().numpy() # without casting, bf16 causes error + alpha = self.lora_dim if alpha is None or alpha == 0 else alpha + self.scale = alpha / self.lora_dim + self.register_buffer("alpha", torch.tensor(alpha)) # 定数として扱える + + # same as microsoft's + torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) + torch.nn.init.zeros_(self.lora_up.weight) + + self.multiplier = multiplier + self.org_module = org_module # remove in applying + self.dropout = dropout + self.rank_dropout = rank_dropout + self.module_dropout = module_dropout + + def apply_to(self): + self.org_forward = self.org_module.forward + self.org_module.forward = self.forward + del self.org_module + + def forward(self, x): + org_forwarded = self.org_forward(x) + + # module dropout + if self.module_dropout is not None and self.training: + if torch.rand(1) < self.module_dropout: + return org_forwarded + + lx = self.lora_down(x) + + # normal dropout + if self.dropout is not None and self.training: + lx = torch.nn.functional.dropout(lx, p=self.dropout) + + # rank dropout + if self.rank_dropout is not None and self.training: + mask = 
torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout + if len(lx.size()) == 3: + mask = mask.unsqueeze(1) # for Text Encoder + elif len(lx.size()) == 4: + mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d + lx = lx * mask + + # scaling for rank dropout: treat as if the rank is changed + # maskから計算することも考えられるが、augmentation的な効果を期待してrank_dropoutを用いる + scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability + else: + scale = self.scale + + lx = self.lora_up(lx) + + return org_forwarded + lx * self.multiplier * scale + + +class LoRAInfModule(LoRAModule): + def __init__( + self, + lora_name, + org_module: torch.nn.Module, + multiplier=1.0, + lora_dim=4, + alpha=1, + **kwargs, + ): + # no dropout for inference + super().__init__(lora_name, org_module, multiplier, lora_dim, alpha) + + self.org_module_ref = [org_module] # 後から参照できるように + self.enabled = True + self.network: LoRANetwork = None + + def set_network(self, network): + self.network = network + + # freezeしてマージする + def merge_to(self, sd, dtype, device): + # extract weight from org_module + org_sd = self.org_module.state_dict() + weight = org_sd["weight"] + org_dtype = weight.dtype + org_device = weight.device + weight = weight.to(torch.float) # calc in float + + if dtype is None: + dtype = org_dtype + if device is None: + device = org_device + + # get up/down weight + up_weight = sd["lora_up.weight"].to(torch.float).to(device) + down_weight = sd["lora_down.weight"].to(torch.float).to(device) + + # merge weight + if len(weight.size()) == 2: + # linear + weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + weight + + self.multiplier + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * self.scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + # logger.info(conved.size(), weight.size(), module.stride, module.padding) + weight = weight + self.multiplier * conved * self.scale + + # set weight to org_module + org_sd["weight"] = weight.to(dtype) + self.org_module.load_state_dict(org_sd) + + # 復元できるマージのため、このモジュールのweightを返す + def get_weight(self, multiplier=None): + if multiplier is None: + multiplier = self.multiplier + + # get up/down weight from module + up_weight = self.lora_up.weight.to(torch.float) + down_weight = self.lora_down.weight.to(torch.float) + + # pre-calculated weight + if len(down_weight.size()) == 2: + # linear + weight = self.multiplier * (up_weight @ down_weight) * self.scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + self.multiplier + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * self.scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + weight = self.multiplier * conved * self.scale + + return weight + + def set_region(self, region): + self.region = region + self.region_mask = None + + def default_forward(self, x): + # logger.info(f"default_forward {self.lora_name} {x.size()}") + return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale + + def forward(self, x): + if not self.enabled: + return self.org_forward(x) + return self.default_forward(x) + + +def create_network( + multiplier: float, + network_dim: Optional[int], + network_alpha: Optional[float], 
+ ae: AutoencoderKL, + text_encoders: List[CLIPTextModel], + flux, + neuron_dropout: Optional[float] = None, + **kwargs, +): + if network_dim is None: + network_dim = 4 # default + if network_alpha is None: + network_alpha = 1.0 + + # extract dim/alpha for conv2d, and block dim + conv_dim = kwargs.get("conv_dim", None) + conv_alpha = kwargs.get("conv_alpha", None) + if conv_dim is not None: + conv_dim = int(conv_dim) + if conv_alpha is None: + conv_alpha = 1.0 + else: + conv_alpha = float(conv_alpha) + + # rank/module dropout + rank_dropout = kwargs.get("rank_dropout", None) + if rank_dropout is not None: + rank_dropout = float(rank_dropout) + module_dropout = kwargs.get("module_dropout", None) + if module_dropout is not None: + module_dropout = float(module_dropout) + + # すごく引数が多いな ( ^ω^)・・・ + network = LoRANetwork( + text_encoders, + flux, + multiplier=multiplier, + lora_dim=network_dim, + alpha=network_alpha, + dropout=neuron_dropout, + rank_dropout=rank_dropout, + module_dropout=module_dropout, + conv_lora_dim=conv_dim, + conv_alpha=conv_alpha, + varbose=True, + ) + + loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) + loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None) + loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None) + loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None + loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None + loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None + if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None: + network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) + + return network + + +# Create network from weights for inference, weights are not loaded here (because can be merged) +def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weights_sd=None, for_inference=False, **kwargs): + # if unet is an instance of SdxlUNet2DConditionModel or subclass, set is_sdxl to True + if weights_sd is None: + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file, safe_open + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + # get dim/alpha mapping + modules_dim = {} + modules_alpha = {} + for key, value in weights_sd.items(): + if "." 
not in key: + continue + + lora_name = key.split(".")[0] + if "alpha" in key: + modules_alpha[lora_name] = value + elif "lora_down" in key: + dim = value.size()[0] + modules_dim[lora_name] = dim + # logger.info(lora_name, value.size(), dim) + + module_class = LoRAInfModule if for_inference else LoRAModule + + network = LoRANetwork(text_encoders, flux, multiplier=multiplier, module_class=module_class) + return network, weights_sd + + +class LoRANetwork(torch.nn.Module): + FLUX_TARGET_REPLACE_MODULE = ["DoubleStreamBlock", "SingleStreamBlock"] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"] + LORA_PREFIX_FLUX = "lora_flux" + LORA_PREFIX_TEXT_ENCODER_CLIP = "lora_te1" + LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te2" + + def __init__( + self, + text_encoders: Union[List[CLIPTextModel], CLIPTextModel], + unet, + multiplier: float = 1.0, + lora_dim: int = 4, + alpha: float = 1, + dropout: Optional[float] = None, + rank_dropout: Optional[float] = None, + module_dropout: Optional[float] = None, + conv_lora_dim: Optional[int] = None, + conv_alpha: Optional[float] = None, + module_class: Type[object] = LoRAModule, + varbose: Optional[bool] = False, + ) -> None: + super().__init__() + self.multiplier = multiplier + + self.lora_dim = lora_dim + self.alpha = alpha + self.conv_lora_dim = conv_lora_dim + self.conv_alpha = conv_alpha + self.dropout = dropout + self.rank_dropout = rank_dropout + self.module_dropout = module_dropout + + self.loraplus_lr_ratio = None + self.loraplus_unet_lr_ratio = None + self.loraplus_text_encoder_lr_ratio = None + + logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}") + logger.info( + f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" + ) + if self.conv_lora_dim is not None: + logger.info(f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}") + + # create module instances + def create_modules( + is_flux: bool, text_encoder_idx: Optional[int], root_module: torch.nn.Module, target_replace_modules: List[str] + ) -> List[LoRAModule]: + prefix = ( + self.LORA_PREFIX_FLUX + if is_flux + else (self.LORA_PREFIX_TEXT_ENCODER_CLIP if text_encoder_idx == 0 else self.LORA_PREFIX_TEXT_ENCODER_T5) + ) + + loras = [] + skipped = [] + for name, module in root_module.named_modules(): + if module.__class__.__name__ in target_replace_modules: + for child_name, child_module in module.named_modules(): + is_linear = child_module.__class__.__name__ == "Linear" + is_conv2d = child_module.__class__.__name__ == "Conv2d" + is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) + + if is_linear or is_conv2d: + lora_name = prefix + "." + name + "." 
+ child_name + lora_name = lora_name.replace(".", "_") + + dim = None + alpha = None + + # 通常、すべて対象とする + if is_linear or is_conv2d_1x1: + dim = self.lora_dim + alpha = self.alpha + elif self.conv_lora_dim is not None: + dim = self.conv_lora_dim + alpha = self.conv_alpha + + if dim is None or dim == 0: + # skipした情報を出力 + if is_linear or is_conv2d_1x1 or (self.conv_lora_dim is not None): + skipped.append(lora_name) + continue + + lora = module_class( + lora_name, + child_module, + self.multiplier, + dim, + alpha, + dropout=dropout, + rank_dropout=rank_dropout, + module_dropout=module_dropout, + ) + loras.append(lora) + return loras, skipped + + # create LoRA for text encoder + # 毎回すべてのモジュールを作るのは無駄なので要検討 + self.text_encoder_loras: List[Union[LoRAModule, LoRAInfModule]] = [] + skipped_te = [] + for i, text_encoder in enumerate(text_encoders): + index = i + logger.info(f"create LoRA for Text Encoder {index+1}:") + + text_encoder_loras, skipped = create_modules(False, index, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) + self.text_encoder_loras.extend(text_encoder_loras) + skipped_te += skipped + logger.info(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.") + + self.unet_loras: List[Union[LoRAModule, LoRAInfModule]] + self.unet_loras, skipped_un = create_modules(True, None, unet, LoRANetwork.FLUX_TARGET_REPLACE_MODULE) + logger.info(f"create LoRA for U-Net: {len(self.unet_loras)} modules.") + + skipped = skipped_te + skipped_un + if varbose and len(skipped) > 0: + logger.warning( + f"because dim (rank) is 0, {len(skipped)} LoRA modules are skipped / dim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:" + ) + for name in skipped: + logger.info(f"\t{name}") + + # assertion + names = set() + for lora in self.text_encoder_loras + self.unet_loras: + assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" + names.add(lora.lora_name) + + def set_multiplier(self, multiplier): + self.multiplier = multiplier + for lora in self.text_encoder_loras + self.unet_loras: + lora.multiplier = self.multiplier + + def set_enabled(self, is_enabled): + for lora in self.text_encoder_loras + self.unet_loras: + lora.enabled = is_enabled + + def load_weights(self, file): + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + info = self.load_state_dict(weights_sd, False) + return info + + def apply_to(self, text_encoders, flux, apply_text_encoder=True, apply_unet=True): + if apply_text_encoder: + logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules") + else: + self.text_encoder_loras = [] + + if apply_unet: + logger.info(f"enable LoRA for U-Net: {len(self.unet_loras)} modules") + else: + self.unet_loras = [] + + for lora in self.text_encoder_loras + self.unet_loras: + lora.apply_to() + self.add_module(lora.lora_name, lora) + + # マージできるかどうかを返す + def is_mergeable(self): + return True + + # TODO refactor to common function with apply_to + def merge_to(self, text_encoders, flux, weights_sd, dtype=None, device=None): + apply_text_encoder = apply_unet = False + for key in weights_sd.keys(): + if key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_CLIP) or key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_T5): + apply_text_encoder = True + elif key.startswith(LoRANetwork.LORA_PREFIX_FLUX): + apply_unet = True + + if apply_text_encoder: + logger.info("enable LoRA for text encoder") + else: + 
self.text_encoder_loras = [] + + if apply_unet: + logger.info("enable LoRA for U-Net") + else: + self.unet_loras = [] + + for lora in self.text_encoder_loras + self.unet_loras: + sd_for_lora = {} + for key in weights_sd.keys(): + if key.startswith(lora.lora_name): + sd_for_lora[key[len(lora.lora_name) + 1 :]] = weights_sd[key] + lora.merge_to(sd_for_lora, dtype, device) + + logger.info(f"weights are merged") + + def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): + self.loraplus_lr_ratio = loraplus_lr_ratio + self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio + self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio + + logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}") + logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}") + + # 二つのText Encoderに別々の学習率を設定できるようにするといいかも + def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): + # TODO warn if optimizer is not compatible with LoRA+ (but it will cause error so we don't need to check it here?) + # if ( + # self.loraplus_lr_ratio is not None + # or self.loraplus_text_encoder_lr_ratio is not None + # or self.loraplus_unet_lr_ratio is not None + # ): + # assert ( + # optimizer_type.lower() != "prodigy" and "dadapt" not in optimizer_type.lower() + # ), "LoRA+ and Prodigy/DAdaptation is not supported / LoRA+とProdigy/DAdaptationの組み合わせはサポートされていません" + + self.requires_grad_(True) + + all_params = [] + lr_descriptions = [] + + def assemble_params(loras, lr, ratio): + param_groups = {"lora": {}, "plus": {}} + for lora in loras: + for name, param in lora.named_parameters(): + if ratio is not None and "lora_up" in name: + param_groups["plus"][f"{lora.lora_name}.{name}"] = param + else: + param_groups["lora"][f"{lora.lora_name}.{name}"] = param + + params = [] + descriptions = [] + for key in param_groups.keys(): + param_data = {"params": param_groups[key].values()} + + if len(param_data["params"]) == 0: + continue + + if lr is not None: + if key == "plus": + param_data["lr"] = lr * ratio + else: + param_data["lr"] = lr + + if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: + logger.info("NO LR skipping!") + continue + + params.append(param_data) + descriptions.append("plus" if key == "plus" else "") + + return params, descriptions + + if self.text_encoder_loras: + params, descriptions = assemble_params( + self.text_encoder_loras, + text_encoder_lr if text_encoder_lr is not None else default_lr, + self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio, + ) + all_params.extend(params) + lr_descriptions.extend(["textencoder" + (" " + d if d else "") for d in descriptions]) + + if self.unet_loras: + # if self.block_lr: + # is_sdxl = False + # for lora in self.unet_loras: + # if "input_blocks" in lora.lora_name or "output_blocks" in lora.lora_name: + # is_sdxl = True + # break + + # # 学習率のグラフをblockごとにしたいので、blockごとにloraを分類 + # block_idx_to_lora = {} + # for lora in self.unet_loras: + # idx = get_block_index(lora.lora_name, is_sdxl) + # if idx not in block_idx_to_lora: + # block_idx_to_lora[idx] = [] + # block_idx_to_lora[idx].append(lora) + + # # blockごとにパラメータを設定する + # for idx, block_loras in block_idx_to_lora.items(): + # params, descriptions = assemble_params( + # block_loras, + # (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(idx), + # self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio, + # ) + # 
all_params.extend(params) + # lr_descriptions.extend([f"unet_block{idx}" + (" " + d if d else "") for d in descriptions]) + + # else: + params, descriptions = assemble_params( + self.unet_loras, + unet_lr if unet_lr is not None else default_lr, + self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio, + ) + all_params.extend(params) + lr_descriptions.extend(["unet" + (" " + d if d else "") for d in descriptions]) + + return all_params, lr_descriptions + + def enable_gradient_checkpointing(self): + # not supported + pass + + def prepare_grad_etc(self, text_encoder, unet): + self.requires_grad_(True) + + def on_epoch_start(self, text_encoder, unet): + self.train() + + def get_trainable_params(self): + return self.parameters() + + def save_weights(self, file, dtype, metadata): + if metadata is not None and len(metadata) == 0: + metadata = None + + state_dict = self.state_dict() + + if dtype is not None: + for key in list(state_dict.keys()): + v = state_dict[key] + v = v.detach().clone().to("cpu").to(dtype) + state_dict[key] = v + + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import save_file + from library import train_util + + # Precalculate model hashes to save time on indexing + if metadata is None: + metadata = {} + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + save_file(state_dict, file, metadata) + else: + torch.save(state_dict, file) + + def backup_weights(self): + # 重みのバックアップを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + if not hasattr(org_module, "_lora_org_weight"): + sd = org_module.state_dict() + org_module._lora_org_weight = sd["weight"].detach().clone() + org_module._lora_restored = True + + def restore_weights(self): + # 重みのリストアを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + if not org_module._lora_restored: + sd = org_module.state_dict() + sd["weight"] = org_module._lora_org_weight + org_module.load_state_dict(sd) + org_module._lora_restored = True + + def pre_calculation(self): + # 事前計算を行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + sd = org_module.state_dict() + + org_weight = sd["weight"] + lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype) + sd["weight"] = org_weight + lora_weight + assert sd["weight"].shape == org_weight.shape + org_module.load_state_dict(sd) + + org_module._lora_restored = False + lora.enabled = False + + def apply_max_norm_regularization(self, max_norm_value, device): + downkeys = [] + upkeys = [] + alphakeys = [] + norms = [] + keys_scaled = 0 + + state_dict = self.state_dict() + for key in state_dict.keys(): + if "lora_down" in key and "weight" in key: + downkeys.append(key) + upkeys.append(key.replace("lora_down", "lora_up")) + alphakeys.append(key.replace("lora_down.weight", "alpha")) + + for i in range(len(downkeys)): + down = state_dict[downkeys[i]].to(device) + up = state_dict[upkeys[i]].to(device) + alpha = state_dict[alphakeys[i]].to(device) + dim = down.shape[0] + scale = alpha / dim + + if up.shape[2:] == (1, 1) and down.shape[2:] == (1, 1): + updown = (up.squeeze(2).squeeze(2) @ down.squeeze(2).squeeze(2)).unsqueeze(2).unsqueeze(3) + elif up.shape[2:] == (3, 3) or down.shape[2:] == (3, 
3): + updown = torch.nn.functional.conv2d(down.permute(1, 0, 2, 3), up).permute(1, 0, 2, 3) + else: + updown = up @ down + + updown *= scale + + norm = updown.norm().clamp(min=max_norm_value / 2) + desired = torch.clamp(norm, max=max_norm_value) + ratio = desired.cpu() / norm.cpu() + sqrt_ratio = ratio**0.5 + if ratio != 1: + keys_scaled += 1 + state_dict[upkeys[i]] *= sqrt_ratio + state_dict[downkeys[i]] *= sqrt_ratio + scalednorm = updown.norm() * ratio + norms.append(scalednorm.item()) + + return keys_scaled, sum(norms) / len(norms), max(norms) diff --git a/sdxl_train_network.py b/sdxl_train_network.py index 67ccae62c..4d6e3f184 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -52,6 +52,11 @@ def load_target_model(self, args, weight_dtype, accelerator): self.logit_scale = logit_scale self.ckpt_info = ckpt_info + # モデルに xformers とか memory efficient attention を組み込む + train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) + if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える + vae.set_use_memory_efficient_attention_xformers(args.xformers) + return sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, [text_encoder1, text_encoder2], vae, unet def get_tokenize_strategy(self, args): diff --git a/train_network.py b/train_network.py index 3828fed19..48d988624 100644 --- a/train_network.py +++ b/train_network.py @@ -100,6 +100,12 @@ def assert_extra_args(self, args, train_dataset_group): def load_target_model(self, args, weight_dtype, accelerator): text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) + + # モデルに xformers とか memory efficient attention を組み込む + train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) + if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える + vae.set_use_memory_efficient_attention_xformers(args.xformers) + return model_util.get_model_version_str_for_sd1_sd2(args.v2, args.v_parameterization), text_encoder, vae, unet def get_tokenize_strategy(self, args): @@ -147,6 +153,81 @@ def all_reduce_network(self, accelerator, network): def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizers, text_encoder, unet): train_util.sample_images(accelerator, args, epoch, global_step, device, vae, tokenizers[0], text_encoder, unet) + # region SD/SDXL + + def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: + noise_scheduler = DDPMScheduler( + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False + ) + prepare_scheduler_for_custom_training(noise_scheduler, device) + if args.zero_terminal_snr: + custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler) + return noise_scheduler + + def encode_images_to_latents(self, args, accelerator, vae, images): + return vae.encode(images).latent_dist.sample() + + def shift_scale_latents(self, args, latents): + return latents * self.vae_scale_factor + + def get_noise_pred_and_target( + self, + args, + accelerator, + noise_scheduler, + latents, + batch, + text_encoder_conds, + unet, + network, + weight_dtype, + train_unet, + ): + # Sample noise, sample a random timestep for each image, and add noise to the latents, + # with noise offset and/or multires noise if specified + noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) + + # ensure the hidden state will require grad + if 
args.gradient_checkpointing: + for x in noisy_latents: + x.requires_grad_(True) + for t in text_encoder_conds: + t.requires_grad_(True) + + # Predict the noise residual + with accelerator.autocast(): + noise_pred = self.call_unet( + args, + accelerator, + unet, + noisy_latents.requires_grad_(train_unet), + timesteps, + text_encoder_conds, + batch, + weight_dtype, + ) + + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise + + return noise_pred, target, timesteps, huber_c, None + + def post_process_loss(self, loss, args, timesteps, noise_scheduler): + if args.min_snr_gamma: + loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma, args.v_parameterization) + if args.scale_v_pred_loss_like_noise_pred: + loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) + if args.v_pred_like_loss: + loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss) + if args.debiased_estimation_loss: + loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) + return loss + + # endregion + def train(self, args): session_id = random.randint(0, 2**32) training_started_at = time.time() @@ -253,11 +334,6 @@ def train(self, args): # text_encoder is List[CLIPTextModel] or CLIPTextModel text_encoders = text_encoder if isinstance(text_encoder, list) else [text_encoder] - # モデルに xformers とか memory efficient attention を組み込む - train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) - if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える - vae.set_use_memory_efficient_attention_xformers(args.xformers) - # 差分追加学習のためにモデルを読み込む sys.path.append(os.path.dirname(__file__)) accelerator.print("import network module:", args.network_module) @@ -445,16 +521,19 @@ def train(self, args): unet_weight_dtype = torch.float8_e4m3fn te_weight_dtype = torch.float8_e4m3fn + unet.to(accelerator.device) # this makes faster `to(dtype)` below + unet.requires_grad_(False) - unet.to(dtype=unet_weight_dtype) + unet.to(dtype=unet_weight_dtype) # this takes long time and large memory for t_enc in text_encoders: t_enc.requires_grad_(False) # in case of cpu, dtype is already set to fp32 because cpu does not support fp8/fp16/bf16 if t_enc.device.type != "cpu": t_enc.to(dtype=te_weight_dtype) - # nn.Embedding not support FP8 - t_enc.text_model.embeddings.to(dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) + if hasattr(t_enc.text_model, "embeddings"): + # nn.Embedding not support FP8 + t_enc.text_model.embeddings.to(dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: @@ -851,12 +930,7 @@ def load_model_hook(models, input_dir): global_step = 0 - noise_scheduler = DDPMScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False - ) - prepare_scheduler_for_custom_training(noise_scheduler, accelerator.device) - if args.zero_terminal_snr: - custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler) + noise_scheduler = self.get_noise_scheduler(args, accelerator.device) if accelerator.is_main_process: init_kwargs = {} @@ -913,6 +987,13 @@ def remove_model(old_ckpt_name): initial_step -= len(train_dataloader) global_step = initial_step + # log device and dtype for each model + logger.info(f"unet dtype: 
{unet_weight_dtype}, device: {unet.device}") + for t_enc in text_encoders: + logger.info(f"text_encoder dtype: {te_weight_dtype}, device: {t_enc.device}") + + clean_memory_on_device(accelerator.device) + for epoch in range(epoch_to_start, num_train_epochs): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 @@ -940,13 +1021,15 @@ def remove_model(old_ckpt_name): else: with torch.no_grad(): # latentに変換 - latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(dtype=weight_dtype) + latents = self.encode_images_to_latents(args, accelerator, vae, batch["images"].to(vae_dtype)) + latents = latents.to(dtype=weight_dtype) # NaNが含まれていれば警告を表示し0に置き換える if torch.any(torch.isnan(latents)): accelerator.print("NaN found in latents, replacing with zeros") latents = torch.nan_to_num(latents, 0, out=latents) - latents = latents * self.vae_scale_factor + + latents = self.shift_scale_latents(args, latents) # get multiplier for each sample if network_has_multiplier: @@ -985,41 +1068,25 @@ def remove_model(old_ckpt_name): if args.full_fp16: text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] - # Sample noise, sample a random timestep for each image, and add noise to the latents, - # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents + # sample noise, call unet, get target + noise_pred, target, timesteps, huber_c, weighting = self.get_noise_pred_and_target( + args, + accelerator, + noise_scheduler, + latents, + batch, + text_encoder_conds, + unet, + network, + weight_dtype, + train_unet, ) - # ensure the hidden state will require grad - if args.gradient_checkpointing: - for x in noisy_latents: - x.requires_grad_(True) - for t in text_encoder_conds: - t.requires_grad_(True) - - # Predict the noise residual - with accelerator.autocast(): - noise_pred = self.call_unet( - args, - accelerator, - unet, - noisy_latents.requires_grad_(train_unet), - timesteps, - text_encoder_conds, - batch, - weight_dtype, - ) - - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - target = noise - loss = train_util.conditional_loss( noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c ) + if weighting is not None: + loss = loss * weighting if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) @@ -1027,14 +1094,8 @@ def remove_model(old_ckpt_name): loss_weights = batch["loss_weights"] # 各sampleごとのweight loss = loss * loss_weights - if args.min_snr_gamma: - loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma, args.v_parameterization) - if args.scale_v_pred_loss_like_noise_pred: - loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) - if args.v_pred_like_loss: - loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss) - if args.debiased_estimation_loss: - loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) + # min snr gamma, scale v pred loss like noise pred, v pred like loss, debiased estimation etc. 
+ loss = self.post_process_loss(loss, args, timesteps, noise_scheduler) loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし From 808d2d1f48e2f4e544d47464edb2727c03da2f53 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 9 Aug 2024 23:02:51 +0900 Subject: [PATCH 061/582] fix typos --- flux_train_network.py | 2 +- library/flux_models.py | 4 ++-- library/flux_utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index 7c762c86d..e4be97ad8 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -250,7 +250,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): # ) with accelerator.autocast(): - # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing) + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) model_pred = unet( img=packed_noisy_model_input, img_ids=img_ids, diff --git a/library/flux_models.py b/library/flux_models.py index d0955e375..92c79bcca 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -685,11 +685,11 @@ def _forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[T attn = attention(q, k, v, pe=pe) txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] - # calculate the img bloks + # calculate the img blocks img = img + img_mod1.gate * self.img_attn.proj(img_attn) img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) - # calculate the txt bloks + # calculate the txt blocks txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) return img, txt diff --git a/library/flux_utils.py b/library/flux_utils.py index ba828d508..166cd833b 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -20,7 +20,7 @@ def load_flow_model(name: str, ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device]) -> flux_models.Flux: - logger.info(f"Bulding Flux model {name}") + logger.info(f"Building Flux model {name}") with torch.device("meta"): model = flux_models.Flux(flux_models.configs[name].params).to(dtype) From 358f13f2c92a04fb524006f124fc029a9edb0eaf Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 10 Aug 2024 14:03:59 +0900 Subject: [PATCH 062/582] fix alpha is ignored --- networks/lora_flux.py | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 141137b46..332a73d97 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -307,7 +307,9 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh module_class = LoRAInfModule if for_inference else LoRAModule - network = LoRANetwork(text_encoders, flux, multiplier=multiplier, module_class=module_class) + network = LoRANetwork( + text_encoders, flux, multiplier=multiplier, modules_dim=modules_dim, modules_alpha=modules_alpha, module_class=module_class + ) return network, weights_sd @@ -331,6 +333,8 @@ def __init__( conv_lora_dim: Optional[int] = None, conv_alpha: Optional[float] = None, module_class: Type[object] = LoRAModule, + modules_dim: Optional[Dict[str, int]] = None, + modules_alpha: Optional[Dict[str, int]] = None, varbose: Optional[bool] = 
False, ) -> None: super().__init__() @@ -348,12 +352,15 @@ def __init__( self.loraplus_unet_lr_ratio = None self.loraplus_text_encoder_lr_ratio = None - logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}") - logger.info( - f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" - ) - if self.conv_lora_dim is not None: - logger.info(f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}") + if modules_dim is not None: + logger.info(f"create LoRA network from weights") + else: + logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}") + logger.info( + f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" + ) + if self.conv_lora_dim is not None: + logger.info(f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}") # create module instances def create_modules( @@ -381,13 +388,19 @@ def create_modules( dim = None alpha = None - # 通常、すべて対象とする - if is_linear or is_conv2d_1x1: - dim = self.lora_dim - alpha = self.alpha - elif self.conv_lora_dim is not None: - dim = self.conv_lora_dim - alpha = self.conv_alpha + if modules_dim is not None: + # モジュール指定あり + if lora_name in modules_dim: + dim = modules_dim[lora_name] + alpha = modules_alpha[lora_name] + else: + # 通常、すべて対象とする + if is_linear or is_conv2d_1x1: + dim = self.lora_dim + alpha = self.alpha + elif self.conv_lora_dim is not None: + dim = self.conv_lora_dim + alpha = self.conv_alpha if dim is None or dim == 0: # skipした情報を出力 From 8a0f12dde812994ec3facdcdb7c08b362dbceb0f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 10 Aug 2024 23:42:05 +0900 Subject: [PATCH 063/582] update FLUX LoRA training --- README.md | 29 ++++++++--- flux_train_network.py | 105 ++++++++++++++++++++++++++++++-------- library/sai_model_spec.py | 24 +++++++-- library/strategy_flux.py | 4 +- library/train_util.py | 9 ++-- networks/lora_flux.py | 2 +- train_network.py | 18 +++++-- 7 files changed, 150 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index a0b02f108..1089dd001 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,41 @@ This repository contains training, generation and utility scripts for Stable Dif ## FLUX.1 LoRA training (WIP) -__Aug 9, 2024__: +This feature is experimental. The options and the training script may change in the future. Please let us know if you have any idea to improve the training. + +Aug 10, 2024: LoRA key prefix is changed to `lora_unet` from `lora_flex` to make it compatible with ComfyUI. Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. -We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. +We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. Sample command is below. It will work with 24GB VRAM GPUs. 
``` -accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py --pretrained_model_name_or_path flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 --network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml --output_dir path/to/output/dir --output_name flux-lora-name +accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py --pretrained_model_name_or_path flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 --network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml --output_dir path/to/output/dir --output_name flux-lora-name --timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0 --loss_type l2 ``` +LoRAs for Text Encoders are not tested yet. + +We have added some new options (Aug 10, 2024): `--time_sampling`, `--sigmoid_scale`, `--model_prediction_type` and `--discrete_flow_shift`. The options are as follows: + +- `--timestep_sampling` is the method to sample timesteps (0-1): `sigma` (sigma-based, same as SD3), `uniform` (uniform random), or `sigmoid` (sigmoid of random normal, same as x-flux). +- `--sigmoid_scale` is the scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). The default is 1.0. Larger values will make the sampling more uniform. +- `--model_prediction_type` is how to interpret and process the model prediction: `raw` (use as is, same as x-flux), `additive` (add to noisy input), `sigma_scaled` (apply sigma scaling, same as SD3). +- `--discrete_flow_shift` is the discrete flow shift for the Euler Discrete Scheduler, default is 3.0 (same as SD3). + +`--loss_type` may be useful for FLUX.1 training. The default is `l2`. + +In our experiments, `--timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0` with `--loss_type l2` seems to work better than the default (SD3) settings. The multiplier of LoRA should be adjusted. Other settings may work better, so please try different settings. + +We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. + +The trained LoRA model can be used with ComfyUI. + The inference script is also available. The script is `flux_minimal_inference.py`. See `--help` for options. 
``` -python flux_minimal_inference.py --ckpt flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --dtype bf16 --prompt "a cat holding a sign that says hello world" --out path/to/output/dir --seed 1 --flux_dtype fp8 --offload --lora lora-flux-name.safetensors +python flux_minimal_inference.py --ckpt flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --dtype bf16 --prompt "a cat holding a sign that says hello world" --out path/to/output/dir --seed 1 --flux_dtype fp8 --offload --lora lora-flux-name.safetensors;1.0 ``` -Unfortnately the training result is not good. Please let us know if you have any idea to improve the training. - ## SD3 training SD3 training is done with `sd3_train.py`. diff --git a/flux_train_network.py b/flux_train_network.py index e4be97ad8..69b6e8eaf 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -135,7 +135,7 @@ def sample_images(self, accelerator, args, epoch, global_step, device, vae, toke pass def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: - noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) + noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) return noise_scheduler @@ -211,21 +211,32 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): noise = torch.randn_like(latents) bsz = latents.shape[0] - # Sample a random timestep for each image - # for weighting schemes where we sample timesteps non-uniformly - u = compute_density_for_timestep_sampling( - weighting_scheme=args.weighting_scheme, - batch_size=bsz, - logit_mean=args.logit_mean, - logit_std=args.logit_std, - mode_scale=args.mode_scale, - ) - indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long() - timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=accelerator.device) - - # Add noise according to flow matching. - sigmas = get_sigmas(timesteps, n_dim=latents.ndim, dtype=weight_dtype) - noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": + # Simple random t-based noise sampling + if args.timestep_sampling == "sigmoid": + # https://github.com/XLabs-AI/x-flux/tree/main + t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=accelerator.device)) + else: + t = torch.rand((bsz,), device=accelerator.device) + timesteps = t * 1000.0 + t = t.view(-1, 1, 1, 1) + noisy_model_input = (1 - t) * latents + t * noise + else: + # Sample a random timestep for each image + # for weighting schemes where we sample timesteps non-uniformly + u = compute_density_for_timestep_sampling( + weighting_scheme=args.weighting_scheme, + batch_size=bsz, + logit_mean=args.logit_mean, + logit_std=args.logit_std, + mode_scale=args.mode_scale, + ) + indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long() + timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=accelerator.device) + + # Add noise according to flow matching. 
+ sigmas = get_sigmas(timesteps, n_dim=latents.ndim, dtype=weight_dtype) + noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents # pack latents and get img_ids packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4 @@ -264,11 +275,20 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): # unpack latents model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) - model_pred = model_pred * (-sigmas) + noisy_model_input - - # these weighting schemes use a uniform timestep sampling - # and instead post-weight the loss - weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) + if args.model_prediction_type == "raw": + # use model_pred as is + weighting = None + elif args.model_prediction_type == "additive": + # add the model_pred to the noisy_model_input + model_pred = model_pred + noisy_model_input + weighting = None + elif args.model_prediction_type == "sigma_scaled": + # apply sigma scaling + model_pred = model_pred * (-sigmas) + noisy_model_input + + # these weighting schemes use a uniform timestep sampling + # and instead post-weight the loss + weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) # flow matching loss: this is different from SD3 target = noise - latents @@ -278,6 +298,21 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): def post_process_loss(self, loss, args, timesteps, noise_scheduler): return loss + def get_sai_model_spec(self, args): + return train_util.get_sai_model_spec(None, args, False, True, False, flux="dev") + + def update_metadata(self, metadata, args): + metadata["ss_apply_t5_attn_mask"] = args.apply_t5_attn_mask + metadata["ss_weighting_scheme"] = args.weighting_scheme + metadata["ss_logit_mean"] = args.logit_mean + metadata["ss_logit_std"] = args.logit_std + metadata["ss_mode_scale"] = args.mode_scale + metadata["ss_guidance_scale"] = args.guidance_scale + metadata["ss_timestep_sampling"] = args.timestep_sampling + metadata["ss_sigmoid_scale"] = args.sigmoid_scale + metadata["ss_model_prediction_type"] = args.model_prediction_type + metadata["ss_discrete_flow_shift"] = args.discrete_flow_shift + def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() @@ -318,6 +353,34 @@ def setup_parser() -> argparse.ArgumentParser: default=3.5, help="the FLUX.1 dev variant is a guidance distilled model", ) + + parser.add_argument( + "--timestep_sampling", + choices=["sigma", "uniform", "sigmoid"], + default="sigma", + help="Method to sample timesteps: sigma-based, uniform random, or sigmoid of random normal. / タイムステップをサンプリングする方法:sigma、random uniform、またはrandom normalのsigmoid。", + ) + parser.add_argument( + "--sigmoid_scale", + type=float, + default=1.0, + help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). / sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"の場合のみ有効)。', + ) + parser.add_argument( + "--model_prediction_type", + choices=["raw", "additive", "sigma_scaled"], + default="sigma_scaled", + help="How to interpret and process the model prediction: " + "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)." 
+ " / モデル予測の解釈と処理方法:" + "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。", + ) + parser.add_argument( + "--discrete_flow_shift", + type=float, + default=3.0, + help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。", + ) return parser diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index af073677e..ad72ec00d 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -59,6 +59,8 @@ ARCH_SD_XL_V1_BASE = "stable-diffusion-xl-v1-base" ARCH_SD3_M = "stable-diffusion-3-medium" ARCH_SD3_UNKNOWN = "stable-diffusion-3" +ARCH_FLUX_1_DEV = "flux-1-dev" +ARCH_FLUX_1_UNKNOWN = "flux-1" ADAPTER_LORA = "lora" ADAPTER_TEXTUAL_INVERSION = "textual-inversion" @@ -66,6 +68,7 @@ IMPL_STABILITY_AI = "https://github.com/Stability-AI/generative-models" IMPL_COMFY_UI = "https://github.com/comfyanonymous/ComfyUI" IMPL_DIFFUSERS = "diffusers" +IMPL_FLUX = "https://github.com/black-forest-labs/flux" PRED_TYPE_EPSILON = "epsilon" PRED_TYPE_V = "v" @@ -118,10 +121,11 @@ def build_metadata( merged_from: Optional[str] = None, timesteps: Optional[Tuple[int, int]] = None, clip_skip: Optional[int] = None, - sd3: str = None, + sd3: Optional[str] = None, + flux: Optional[str] = None, ): """ - sd3: only supports "m" + sd3: only supports "m", flux: only supports "dev" """ # if state_dict is None, hash is not calculated @@ -140,6 +144,11 @@ def build_metadata( arch = ARCH_SD3_M else: arch = ARCH_SD3_UNKNOWN + elif flux is not None: + if flux == "dev": + arch = ARCH_FLUX_1_DEV + else: + arch = ARCH_FLUX_1_UNKNOWN elif v2: if v_parameterization: arch = ARCH_SD_V2_768_V @@ -158,7 +167,10 @@ def build_metadata( if not lora and not textual_inversion and is_stable_diffusion_ckpt is None: is_stable_diffusion_ckpt = True # default is stable diffusion ckpt if not lora and not textual_inversion - if (lora and sdxl) or textual_inversion or is_stable_diffusion_ckpt: + if flux is not None: + # Flux + impl = IMPL_FLUX + elif (lora and sdxl) or textual_inversion or is_stable_diffusion_ckpt: # Stable Diffusion ckpt, TI, SDXL LoRA impl = IMPL_STABILITY_AI else: @@ -216,7 +228,7 @@ def build_metadata( reso = (reso[0], reso[0]) else: # resolution is defined in dataset, so use default - if sdxl or sd3 is not None: + if sdxl or sd3 is not None or flux is not None: reso = 1024 elif v2 and v_parameterization: reso = 768 @@ -227,7 +239,9 @@ def build_metadata( metadata["modelspec.resolution"] = f"{reso[0]}x{reso[1]}" - if v_parameterization: + if flux is not None: + del metadata["modelspec.prediction_type"] + elif v_parameterization: metadata["modelspec.prediction_type"] = PRED_TYPE_V else: metadata["modelspec.prediction_type"] = PRED_TYPE_EPSILON diff --git a/library/strategy_flux.py b/library/strategy_flux.py index f194ccf6e..13459d32f 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -63,11 +63,11 @@ def encode_tokens( l_pooled = None if t5xxl is not None and t5_tokens is not None: - # t5_out is [1, max length, 4096] + # t5_out is [b, max length, 4096] t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), return_dict=False, output_hidden_states=True) if apply_t5_attn_mask: t5_out = t5_out * t5_attn_mask.to(t5_out.device).unsqueeze(-1) - txt_ids = torch.zeros(1, t5_out.shape[1], 3, device=t5_out.device) + txt_ids = torch.zeros(t5_out.shape[0], t5_out.shape[1], 3, device=t5_out.device) else: t5_out = None txt_ids = None diff --git a/library/train_util.py b/library/train_util.py index fc458a884..6b74bb3fa 100644 --- 
a/library/train_util.py +++ b/library/train_util.py @@ -3186,6 +3186,7 @@ def get_sai_model_spec( textual_inversion: bool, is_stable_diffusion_ckpt: Optional[bool] = None, # None for TI and LoRA sd3: str = None, + flux: str = None, ): timestamp = time.time() @@ -3220,6 +3221,7 @@ def get_sai_model_spec( timesteps=timesteps, clip_skip=args.clip_skip, # None or int sd3=sd3, + flux=flux, ) return metadata @@ -3642,8 +3644,8 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: "--loss_type", type=str, default="l2", - choices=["l2", "huber", "smooth_l1"], - help="The type of loss function to use (L2, Huber, or smooth L1), default is L2 / 使用する損失関数の種類(L2、Huber、またはsmooth L1)、デフォルトはL2", + choices=["l1", "l2", "huber", "smooth_l1"], + help="The type of loss function to use (L1, L2, Huber, or smooth L1), default is L2 / 使用する損失関数の種類(L1、L2、Huber、またはsmooth L1)、デフォルトはL2", ) parser.add_argument( "--huber_schedule", @@ -5359,9 +5361,10 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents): def conditional_loss( model_pred: torch.Tensor, target: torch.Tensor, reduction: str = "mean", loss_type: str = "l2", huber_c: float = 0.1 ): - if loss_type == "l2": loss = torch.nn.functional.mse_loss(model_pred, target, reduction=reduction) + elif loss_type == "l1": + loss = torch.nn.functional.l1_loss(model_pred, target, reduction=reduction) elif loss_type == "huber": loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c) if reduction == "mean": diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 332a73d97..a4dab287a 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -316,7 +316,7 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh class LoRANetwork(torch.nn.Module): FLUX_TARGET_REPLACE_MODULE = ["DoubleStreamBlock", "SingleStreamBlock"] TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"] - LORA_PREFIX_FLUX = "lora_flux" + LORA_PREFIX_FLUX = "lora_unet" # make ComfyUI compatible LORA_PREFIX_TEXT_ENCODER_CLIP = "lora_te1" LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te2" diff --git a/train_network.py b/train_network.py index 48d988624..367203f54 100644 --- a/train_network.py +++ b/train_network.py @@ -226,6 +226,12 @@ def post_process_loss(self, loss, args, timesteps, noise_scheduler): loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) return loss + def get_sai_model_spec(self, args): + return train_util.get_sai_model_spec(None, args, self.is_sdxl, True, False) + + def update_metadata(self, metadata, args): + pass + # endregion def train(self, args): @@ -521,10 +527,13 @@ def train(self, args): unet_weight_dtype = torch.float8_e4m3fn te_weight_dtype = torch.float8_e4m3fn - unet.to(accelerator.device) # this makes faster `to(dtype)` below + # unet.to(accelerator.device) # this makes faster `to(dtype)` below, but consumes 23 GB VRAM + # unet.to(dtype=unet_weight_dtype) # without moving to gpu, this takes a lot of time and main memory + + unet.to(accelerator.device, dtype=unet_weight_dtype) # this seems to be safer than above unet.requires_grad_(False) - unet.to(dtype=unet_weight_dtype) # this takes long time and large memory + unet.to(dtype=unet_weight_dtype) for t_enc in text_encoders: t_enc.requires_grad_(False) @@ -718,8 +727,11 @@ def load_model_hook(models, input_dir): "ss_loss_type": args.loss_type, "ss_huber_schedule": args.huber_schedule, "ss_huber_c": args.huber_c, + "ss_fp8_base": args.fp8_base, } + self.update_metadata(metadata, args) # architecture 
specific metadata + if use_user_config: # save metadata of multiple datasets # NOTE: pack "ss_datasets" value as json one time @@ -964,7 +976,7 @@ def save_model(ckpt_name, unwrapped_nw, steps, epoch_no, force_sync_upload=False metadata["ss_epoch"] = str(epoch_no) metadata_to_save = minimum_metadata if args.no_metadata else metadata - sai_metadata = train_util.get_sai_model_spec(None, args, self.is_sdxl, True, False) + sai_metadata = self.get_sai_model_spec(args) metadata_to_save.update(sai_metadata) unwrapped_nw.save_weights(ckpt_file, save_dtype, metadata_to_save) From 82314ac2e7926ed15eac6306bebe4ffb78280346 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 11 Aug 2024 11:14:08 +0900 Subject: [PATCH 064/582] update readme for ai toolkit settings --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1089dd001..d016bcec4 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,11 @@ We have added some new options (Aug 10, 2024): `--time_sampling`, `--sigmoid_sca `--loss_type` may be useful for FLUX.1 training. The default is `l2`. -In our experiments, `--timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0` with `--loss_type l2` seems to work better than the default (SD3) settings. The multiplier of LoRA should be adjusted. Other settings may work better, so please try different settings. +In our experiments, `--timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0` with `--loss_type l2` seems to work better than the default (SD3) settings. The multiplier of LoRA should be adjusted. + +additional note (Aug 11): A quick check shows that the settings in [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit) seems to be equivalent to `--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0` (with the default `l2` loss_type). + +Other settings may work better, so please try different settings. We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. From d25ae361d06bb6f49c104ca2e6b4a9188a88c95f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 11 Aug 2024 19:07:07 +0900 Subject: [PATCH 065/582] fix apply_t5_attn_mask to work --- README.md | 2 ++ flux_train_network.py | 6 ++++-- library/strategy_flux.py | 18 +++++++++++++----- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d016bcec4..d47776ca6 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ This repository contains training, generation and utility scripts for Stable Dif This feature is experimental. The options and the training script may change in the future. Please let us know if you have any idea to improve the training. +Aug 11, 2024: Fix `--apply_t5_attn_mask` option to work. Please remove and re-generate the latents cache file if you have used the option before. + Aug 10, 2024: LoRA key prefix is changed to `lora_unet` from `lora_flex` to make it compatible with ComfyUI. Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. 
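For context on what `--apply_t5_attn_mask` does: the idea is to zero out the T5-XXL hidden states at padded token positions so that padding does not contribute to the conditioning. A minimal, self-contained sketch with hypothetical shapes (an illustration only, not the repository's exact code):

```
import torch

# Hypothetical example: 2 prompts, 512 T5 tokens, 4096-dim hidden states.
t5_out = torch.randn(2, 512, 4096)   # T5-XXL encoder outputs
t5_attn_mask = torch.zeros(2, 512)   # 1 = real token, 0 = padding
t5_attn_mask[0, :20] = 1             # first prompt uses 20 tokens
t5_attn_mask[1, :7] = 1              # second prompt uses 7 tokens

# Broadcast the mask over the hidden dimension and zero out padded positions.
masked_t5_out = t5_out * t5_attn_mask.unsqueeze(-1)
```

Whether the mask is applied when caching to disk or when loading the cache is handled by the training code; the visible effect is simply that masked positions become zeros.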
diff --git a/flux_train_network.py b/flux_train_network.py index 69b6e8eaf..59a666aae 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -67,14 +67,16 @@ def get_latents_caching_strategy(self, args): return latents_caching_strategy def get_text_encoding_strategy(self, args): - return strategy_flux.FluxTextEncodingStrategy() + return strategy_flux.FluxTextEncodingStrategy(apply_t5_attn_mask=args.apply_t5_attn_mask) def get_models_for_text_encoding(self, args, accelerator, text_encoders): return text_encoders # + [accelerator.unwrap_model(text_encoders[-1])] def get_text_encoder_outputs_caching_strategy(self, args): if args.cache_text_encoder_outputs: - return strategy_flux.FluxTextEncoderOutputsCachingStrategy(args.cache_text_encoder_outputs_to_disk, None, False) + return strategy_flux.FluxTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, None, False, apply_t5_attn_mask=args.apply_t5_attn_mask + ) else: return None diff --git a/library/strategy_flux.py b/library/strategy_flux.py index 13459d32f..3880a1e1b 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -41,17 +41,24 @@ def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: class FluxTextEncodingStrategy(TextEncodingStrategy): - def __init__(self) -> None: - pass + def __init__(self, apply_t5_attn_mask: Optional[bool] = None) -> None: + """ + Args: + apply_t5_attn_mask: Default value for apply_t5_attn_mask. + """ + self.apply_t5_attn_mask = apply_t5_attn_mask def encode_tokens( self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor], - apply_t5_attn_mask: bool = False, + apply_t5_attn_mask: Optional[bool] = None, ) -> List[torch.Tensor]: - # supports single model inference only + # supports single model inference + + if apply_t5_attn_mask is None: + apply_t5_attn_mask = self.apply_t5_attn_mask clip_l, t5xxl = models l_tokens, t5_tokens = tokens[:2] @@ -137,8 +144,9 @@ def cache_batch_outputs( tokens_and_masks = tokenize_strategy.tokenize(captions) with torch.no_grad(): + # attn_mask is not applied when caching to disk: it is applied when loading from disk l_pooled, t5_out, txt_ids = flux_text_encoding_strategy.encode_tokens( - tokenize_strategy, models, tokens_and_masks, self.apply_t5_attn_mask + tokenize_strategy, models, tokens_and_masks, not self.cache_to_disk ) if l_pooled.dtype == torch.bfloat16: From 9e09a69df1ea8aa76ec98df3b2eed961c66432e4 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 12 Aug 2024 08:19:45 +0900 Subject: [PATCH 066/582] update README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d47776ca6..ccc83e6e8 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,10 @@ Aug 10, 2024: LoRA key prefix is changed to `lora_unet` from `lora_flex` to mak Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. -We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. Sample command is below. It will work with 24GB VRAM GPUs. +We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. Sample command is below, settings are based on [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit). It will work with 24GB VRAM GPUs. 
``` -accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py --pretrained_model_name_or_path flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 --network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml --output_dir path/to/output/dir --output_name flux-lora-name --timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0 --loss_type l2 +accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py --pretrained_model_name_or_path flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 --network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml --output_dir path/to/output/dir --output_name flux-lora-name --timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0 --loss_type l2 ``` LoRAs for Text Encoders are not tested yet. @@ -29,7 +29,7 @@ We have added some new options (Aug 10, 2024): `--time_sampling`, `--sigmoid_sca In our experiments, `--timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0` with `--loss_type l2` seems to work better than the default (SD3) settings. The multiplier of LoRA should be adjusted. -additional note (Aug 11): A quick check shows that the settings in [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit) seems to be equivalent to `--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0` (with the default `l2` loss_type). +additional note (Aug 11): A quick check shows that the settings in [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit) seems to be equivalent to `--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0` (with the default `l2` loss_type). This seems to be a good starting point. Thanks to Ostris for the great work! Other settings may work better, so please try different settings. From 4af36f96320d553025cfdf067cae1e346af44a67 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Mon, 12 Aug 2024 13:24:10 +0900 Subject: [PATCH 067/582] update to work interactive mode --- README.md | 2 ++ flux_minimal_inference.py | 33 +++++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ccc83e6e8..c0d50a5a2 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,8 @@ The trained LoRA model can be used with ComfyUI. The inference script is also available. The script is `flux_minimal_inference.py`. See `--help` for options. +Aug 12: `--interactive` option is now working. 
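A hypothetical interactive invocation (paths are placeholders, mirroring the non-interactive example below):

```
python flux_minimal_inference.py --ckpt flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --dtype bf16 --flux_dtype fp8 --offload --interactive --lora lora-flux-name.safetensors;1.0
```

Inside the session, enter a prompt optionally followed by options such as `--w 1024 --h 1024 --s 20 --d 1 --g 3.5` (width, height, steps, seed, guidance) and `--m 0.8` to change the LoRA multiplier; an empty prompt exits.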
+ ``` python flux_minimal_inference.py --ckpt flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --dtype bf16 --prompt "a cat holding a sign that says hello world" --out path/to/output/dir --seed 1 --flux_dtype fp8 --offload --lora lora-flux-name.safetensors;1.0 ``` diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index f3affca80..b09f63808 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -5,7 +5,7 @@ import math import os import random -from typing import Callable, Optional, Tuple +from typing import Callable, List, Optional, Tuple import einops import numpy as np @@ -121,6 +121,9 @@ def generate_image( steps: Optional[int], guidance: float, ): + seed = seed if seed is not None else random.randint(0, 2**32 - 1) + logger.info(f"Seed: {seed}") + # make first noise with packed shape # original: b,16,2*h//16,2*w//16, packed: b,h//16*w//16,16*2*2 packed_latent_height, packed_latent_width = math.ceil(image_height / 16), math.ceil(image_width / 16) @@ -183,9 +186,7 @@ def generate_image( steps = 4 if is_schnell else 50 img_ids = img_ids.to(device) - x = do_sample( - accelerator, model, noise, img_ids, l_pooled, t5_out, txt_ids, steps, guidance_scale, is_schnell, device, flux_dtype - ) + x = do_sample(accelerator, model, noise, img_ids, l_pooled, t5_out, txt_ids, steps, guidance, is_schnell, device, flux_dtype) if args.offload: model = model.cpu() # del model @@ -255,6 +256,7 @@ def generate_image( default=[], help="LoRA weights, only supports networks.lora_flux, each argument is a `path;multiplier` (semi-colon separated)", ) + parser.add_argument("--merge_lora_weights", action="store_true", help="Merge LoRA weights to model") parser.add_argument("--width", type=int, default=target_width) parser.add_argument("--height", type=int, default=target_height) parser.add_argument("--interactive", action="store_true") @@ -341,6 +343,7 @@ def is_fp8(dt): ae = accelerator.prepare(ae) # LoRA + lora_models: List[lora_flux.LoRANetwork] = [] for weights_file in args.lora_weights: if ";" in weights_file: weights_file, multiplier = weights_file.split(";") @@ -351,7 +354,16 @@ def is_fp8(dt): lora_model, weights_sd = lora_flux.create_network_from_weights( multiplier, weights_file, ae, [clip_l, t5xxl], model, None, True ) - lora_model.merge_to([clip_l, t5xxl], model, weights_sd) + if args.merge_lora_weights: + lora_model.merge_to([clip_l, t5xxl], model, weights_sd) + else: + lora_model.apply_to([clip_l, t5xxl], model) + info = lora_model.load_state_dict(weights_sd, strict=True) + logger.info(f"Loaded LoRA weights from {weights_file}: {info}") + lora_model.eval() + lora_model.to(device) + + lora_models.append(lora_model) if not args.interactive: generate_image(model, clip_l, t5xxl, ae, args.prompt, args.seed, args.width, args.height, args.steps, args.guidance) @@ -363,7 +375,9 @@ def is_fp8(dt): guidance = args.guidance while True: - print("Enter prompt (empty to exit). Options: --w --h --s --d --g ") + print( + "Enter prompt (empty to exit). 
Options: --w --h --s --d --g --m " + ) prompt = input() if prompt == "": break @@ -384,6 +398,13 @@ def is_fp8(dt): seed = int(opt[1:].strip()) elif opt.startswith("g"): guidance = float(opt[1:].strip()) + elif opt.startswith("m"): + mutipliers = opt[1:].strip().split(",") + if len(mutipliers) != len(lora_models): + logger.error(f"Invalid number of multipliers, expected {len(lora_models)}") + continue + for i, lora_model in enumerate(lora_models): + lora_model.set_multiplier(float(mutipliers[i])) generate_image(model, clip_l, t5xxl, ae, prompt, seed, width, height, steps, guidance) From a7d5dabde3facb57d069eba0aa91e961e04303ad Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 12 Aug 2024 17:09:19 +0900 Subject: [PATCH 068/582] Update readme --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index c0d50a5a2..19aed2212 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,12 @@ We have added a new training script for LoRA training. The script is `flux_train accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py --pretrained_model_name_or_path flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 --network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml --output_dir path/to/output/dir --output_name flux-lora-name --timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0 --loss_type l2 ``` +The training can be done with 16GB VRAM GPUs with Adafactor optimizer. Please use settings like below: + +``` +--optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False"` +``` + LoRAs for Text Encoders are not tested yet. We have added some new options (Aug 10, 2024): `--time_sampling`, `--sigmoid_scale`, `--model_prediction_type` and `--discrete_flow_shift`. The options are as follows: From 0415d200f5f3db89e33b33c9b36cb3c3e15d0266 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Tue, 13 Aug 2024 21:00:16 +0900 Subject: [PATCH 069/582] update dependencies closes #1450 --- requirements.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index e99775b8a..4ee19b3ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ -accelerate==0.25.0 -transformers==4.36.2 +accelerate==0.33.0 +transformers==4.44.0 diffusers[torch]==0.25.0 ftfy==6.1.1 # albumentations==1.3.0 opencv-python==4.7.0.68 einops==0.7.0 pytorch-lightning==1.9.0 -bitsandbytes==0.43.0 +bitsandbytes==0.43.3 prodigyopt==1.0 lion-pytorch==0.0.6 tensorboard @@ -16,7 +16,7 @@ altair==4.2.2 easygui==0.98.3 toml==0.10.2 voluptuous==0.13.1 -huggingface-hub==0.20.1 +huggingface-hub==0.24.5 # for Image utils imagesize==1.4.1 # for BLIP captioning @@ -38,5 +38,7 @@ imagesize==1.4.1 # open-clip-torch==2.20.0 # For logging rich==13.7.0 +# for T5XXL tokenizer (SD3/FLUX) +sentencepiece==0.2.0 # for kohya_ss library -e . 
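After updating the requirements, a quick sanity check of the environment can catch mismatched versions before launching training. A minimal sketch, assuming the packages from the updated `requirements.txt` are installed:

```
# Illustrative environment check for the updated dependencies.
from importlib.metadata import version

for pkg in ["accelerate", "transformers", "diffusers", "bitsandbytes", "sentencepiece"]:
    print(pkg, version(pkg))

import torch
print("torch", torch.__version__, "cuda", torch.version.cuda)
```

As the README notes, PyTorch 2.4.0 with CUDA 12.4 is the tested combination.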
From 9711c96f96038df5fa1a15d073244198b93ef0a2 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Tue, 13 Aug 2024 21:03:17 +0900 Subject: [PATCH 070/582] update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 19aed2212..3eb034ed4 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Aug 11, 2024: Fix `--apply_t5_attn_mask` option to work. Please remove and re-ge Aug 10, 2024: LoRA key prefix is changed to `lora_unet` from `lora_flex` to make it compatible with ComfyUI. -Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. +__Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. `requirements.txt` is also updated, so please update the requirements.__ We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. Sample command is below, settings are based on [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit). It will work with 24GB VRAM GPUs. From 56d7651f0895c805c403a8db01083a522503eb7d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 13 Aug 2024 22:28:39 +0900 Subject: [PATCH 071/582] add experimental split mode for FLUX --- README.md | 22 +++++- flux_train_network.py | 110 +++++++++++++++++++++++---- library/flux_models.py | 165 +++++++++++++++++++++++++++++++++++++++++ networks/lora_flux.py | 30 ++++++-- 4 files changed, 304 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 3eb034ed4..64b018804 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,22 @@ This repository contains training, generation and utility scripts for Stable Dif This feature is experimental. The options and the training script may change in the future. Please let us know if you have any idea to improve the training. +__Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. `requirements.txt` is also updated, so please update the requirements.__ + +Aug 13, 2024: + +__Experimental__ A network argument `train_blocks` is added to `lora_flux`. This is to select the target blocks of LoRA from FLUX double blocks and single blocks. Specify like `--network_args "train_blocks=single"`. `all` trains both double blocks and single blocks, `double` trains only double blocks, and `single` trains only single blocks. The default (omission) is `all`. + +This argument is available even if `--split_mode` is not specified. + +__Experimental__ `--split_mode` option is added to `flux_train_network.py`. This splits FLUX into double blocks and single blocks for training. By enabling gradients only for the single blocks part, memory usage is reduced. When this option is specified, you need to specify `"train_blocks=single"` in the network arguments. + +This option enables training with 12GB VRAM GPUs, but the training speed is 2-3 times slower than the default. + Aug 11, 2024: Fix `--apply_t5_attn_mask` option to work. Please remove and re-generate the latents cache file if you have used the option before. Aug 10, 2024: LoRA key prefix is changed to `lora_unet` from `lora_flex` to make it compatible with ComfyUI. -__Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. 
`requirements.txt` is also updated, so please update the requirements.__ - We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. Sample command is below, settings are based on [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit). It will work with 24GB VRAM GPUs. ``` @@ -19,7 +29,13 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_t The training can be done with 16GB VRAM GPUs with Adafactor optimizer. Please use settings like below: ``` ---optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False"` +--optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" +``` + +The training can be done with 12GB VRAM GPUs with Adafactor optimizer, `--split_mode` and `train_blocks=single` options. Please use settings like below: + +``` +--optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --split_mode --network_args "train_blocks=single" ``` LoRAs for Text Encoders are not tested yet. diff --git a/flux_train_network.py b/flux_train_network.py index 59a666aae..1d1f00d84 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -37,10 +37,16 @@ def assert_extra_args(self, args, train_dataset_group): args.network_train_unet_only or not args.cache_text_encoder_outputs ), "network for Text Encoder cannot be trained with caching Text Encoder outputs / Text Encoderの出力をキャッシュしながらText Encoderのネットワークを学習することはできません" - train_dataset_group.verify_bucket_reso_steps(32) + train_dataset_group.verify_bucket_reso_steps(32) # TODO check this def load_target_model(self, args, weight_dtype, accelerator): # currently offload to cpu for some models + name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" # TODO change this to a more robust way + # if we load to cpu, flux.to(fp8) takes a long time + model = flux_utils.load_flow_model(name, args.pretrained_model_name_or_path, weight_dtype, "cpu") + + if args.split_mode: + model = self.prepare_split_model(model, weight_dtype, accelerator) clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu") clip_l.eval() @@ -49,13 +55,47 @@ def load_target_model(self, args, weight_dtype, accelerator): t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu") t5xxl.eval() - name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" # TODO change this to a more robust way - # if we load to cpu, flux.to(fp8) takes a long time - model = flux_utils.load_flow_model(name, args.pretrained_model_name_or_path, weight_dtype, "cpu") ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu") return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model + def prepare_split_model(self, model, weight_dtype, accelerator): + from accelerate import init_empty_weights + + logger.info("prepare split model") + with init_empty_weights(): + flux_upper = flux_models.FluxUpper(model.params) + flux_lower = flux_models.FluxLower(model.params) + sd = model.state_dict() + + # lower (trainable) + logger.info("load state dict for lower") + flux_lower.load_state_dict(sd, strict=False, assign=True) + flux_lower.to(dtype=weight_dtype) + + # upper (frozen) + logger.info("load state dict for upper") + flux_upper.load_state_dict(sd, strict=False, assign=True) + + logger.info("prepare upper model") + target_dtype = torch.float8_e4m3fn if args.fp8_base else weight_dtype + 
flux_upper.to(accelerator.device, dtype=target_dtype) + flux_upper.eval() + + if args.fp8_base: + # this is required to run on fp8 + flux_upper = accelerator.prepare(flux_upper) + + flux_upper.to("cpu") + + self.flux_upper = flux_upper + del model # we don't need model anymore + clean_memory_on_device(accelerator.device) + + logger.info("split model prepared") + + return flux_lower + def get_tokenize_strategy(self, args): return strategy_flux.FluxTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) @@ -262,17 +302,51 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): # f"model_input: {noisy_model_input.shape}, img_ids: {img_ids.shape}, t5_out: {t5_out.shape}, txt_ids: {txt_ids.shape}, l_pooled: {l_pooled.shape}, timesteps: {timesteps.shape}, guidance_vec: {guidance_vec.shape}" # ) - with accelerator.autocast(): - # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) - model_pred = unet( - img=packed_noisy_model_input, - img_ids=img_ids, - txt=t5_out, - txt_ids=txt_ids, - y=l_pooled, - timesteps=timesteps / 1000, - guidance=guidance_vec, - ) + if not args.split_mode: + # normal forward + with accelerator.autocast(): + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) + model_pred = unet( + img=packed_noisy_model_input, + img_ids=img_ids, + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + timesteps=timesteps / 1000, + guidance=guidance_vec, + ) + else: + # split forward to reduce memory usage + assert network.train_blocks == "single", "train_blocks must be single for split mode" + with accelerator.autocast(): + # move flux lower to cpu, and then move flux upper to gpu + unet.to("cpu") + clean_memory_on_device(accelerator.device) + self.flux_upper.to(accelerator.device) + + # upper model does not require grad + with torch.no_grad(): + intermediate_img, intermediate_txt, vec, pe = self.flux_upper( + img=packed_noisy_model_input, + img_ids=img_ids, + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + timesteps=timesteps / 1000, + guidance=guidance_vec, + ) + + # move flux upper back to cpu, and then move flux lower to gpu + self.flux_upper.to("cpu") + clean_memory_on_device(accelerator.device) + unet.to(accelerator.device) + + # lower model requires grad + intermediate_img.requires_grad_(True) + intermediate_txt.requires_grad_(True) + vec.requires_grad_(True) + pe.requires_grad_(True) + model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe) # unpack latents model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) @@ -331,6 +405,12 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", ) + parser.add_argument( + "--split_mode", + action="store_true", + help="[EXPERIMENTAL] use split mode for Flux model, network arg `train_blocks=single` is required" + + "/[実験的] Fluxモデルの分割モードを使用する。ネットワーク引数`train_blocks=single`が必要", + ) # copy from Diffusers parser.add_argument( diff --git a/library/flux_models.py b/library/flux_models.py index 92c79bcca..3c7766b85 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -918,3 +918,168 @@ def forward( img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) return img + + +class FluxUpper(nn.Module): + """ + 
Transformer model for flow matching on sequences. + """ + + def __init__(self, params: FluxParams): + super().__init__() + + self.params = params + self.in_channels = params.in_channels + self.out_channels = self.in_channels + if params.hidden_size % params.num_heads != 0: + raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}") + pe_dim = params.hidden_size // params.num_heads + if sum(params.axes_dim) != pe_dim: + raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size) + self.guidance_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity() + self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size) + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + qkv_bias=params.qkv_bias, + ) + for _ in range(params.depth) + ] + ) + + self.gradient_checkpointing = False + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + + self.time_in.enable_gradient_checkpointing() + self.vector_in.enable_gradient_checkpointing() + self.guidance_in.enable_gradient_checkpointing() + + for block in self.double_blocks: + block.enable_gradient_checkpointing() + + print("FLUX: Gradient checkpointing enabled.") + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + + self.time_in.disable_gradient_checkpointing() + self.vector_in.disable_gradient_checkpointing() + self.guidance_in.disable_gradient_checkpointing() + + for block in self.double_blocks: + block.disable_gradient_checkpointing() + + print("FLUX: Gradient checkpointing disabled.") + + def forward( + self, + img: Tensor, + img_ids: Tensor, + txt: Tensor, + txt_ids: Tensor, + timesteps: Tensor, + y: Tensor, + guidance: Tensor | None = None, + ) -> Tensor: + if img.ndim != 3 or txt.ndim != 3: + raise ValueError("Input img and txt tensors must have 3 dimensions.") + + # running on sequences img + img = self.img_in(img) + vec = self.time_in(timestep_embedding(timesteps, 256)) + if self.params.guidance_embed: + if guidance is None: + raise ValueError("Didn't get guidance strength for guidance distilled model.") + vec = vec + self.guidance_in(timestep_embedding(guidance, 256)) + vec = vec + self.vector_in(y) + txt = self.txt_in(txt) + + ids = torch.cat((txt_ids, img_ids), dim=1) + pe = self.pe_embedder(ids) + + for block in self.double_blocks: + img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + + return img, txt, vec, pe + + +class FluxLower(nn.Module): + """ + Transformer model for flow matching on sequences. 
+ """ + + def __init__(self, params: FluxParams): + super().__init__() + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.out_channels = params.in_channels + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio) + for _ in range(params.depth_single_blocks) + ] + ) + + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + + self.gradient_checkpointing = False + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + + for block in self.single_blocks: + block.enable_gradient_checkpointing() + + print("FLUX: Gradient checkpointing enabled.") + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + + for block in self.single_blocks: + block.disable_gradient_checkpointing() + + print("FLUX: Gradient checkpointing disabled.") + + def forward( + self, + img: Tensor, + txt: Tensor, + vec: Tensor | None = None, + pe: Tensor | None = None, + ) -> Tensor: + img = torch.cat((txt, img), 1) + for block in self.single_blocks: + img = block(img, vec=vec, pe=pe) + img = img[:, txt.shape[1] :, ...] + + img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) + return img diff --git a/networks/lora_flux.py b/networks/lora_flux.py index a4dab287a..4da33542f 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -252,6 +252,11 @@ def create_network( if module_dropout is not None: module_dropout = float(module_dropout) + # single or double blocks + train_blocks = kwargs.get("train_blocks", None) # None (default), "all" (same as None), "single", "double" + if train_blocks is not None: + assert train_blocks in ["all", "single", "double"], f"invalid train_blocks: {train_blocks}" + # すごく引数が多いな ( ^ω^)・・・ network = LoRANetwork( text_encoders, @@ -264,6 +269,7 @@ def create_network( module_dropout=module_dropout, conv_lora_dim=conv_dim, conv_alpha=conv_alpha, + train_blocks=train_blocks, varbose=True, ) @@ -314,9 +320,11 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh class LoRANetwork(torch.nn.Module): - FLUX_TARGET_REPLACE_MODULE = ["DoubleStreamBlock", "SingleStreamBlock"] + # FLUX_TARGET_REPLACE_MODULE = ["DoubleStreamBlock", "SingleStreamBlock"] + FLUX_TARGET_REPLACE_MODULE_DOUBLE = ["DoubleStreamBlock"] + FLUX_TARGET_REPLACE_MODULE_SINGLE = ["SingleStreamBlock"] TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"] - LORA_PREFIX_FLUX = "lora_unet" # make ComfyUI compatible + LORA_PREFIX_FLUX = "lora_unet" # make ComfyUI compatible LORA_PREFIX_TEXT_ENCODER_CLIP = "lora_te1" LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te2" @@ -335,6 +343,7 @@ def __init__( module_class: Type[object] = LoRAModule, modules_dim: Optional[Dict[str, int]] = None, modules_alpha: Optional[Dict[str, int]] = None, + train_blocks: Optional[str] = None, varbose: Optional[bool] = False, ) -> None: super().__init__() @@ -347,6 +356,7 @@ def __init__( self.dropout = dropout self.rank_dropout = rank_dropout self.module_dropout = module_dropout + self.train_blocks = train_blocks if train_blocks is not None else "all" self.loraplus_lr_ratio = None self.loraplus_unet_lr_ratio = None @@ -360,7 +370,9 @@ def __init__( f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" ) if self.conv_lora_dim is not None: - 
logger.info(f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}") + logger.info( + f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" + ) # create module instances def create_modules( @@ -434,9 +446,17 @@ def create_modules( skipped_te += skipped logger.info(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.") + # create LoRA for U-Net + if self.train_blocks == "all": + target_replace_modules = LoRANetwork.FLUX_TARGET_REPLACE_MODULE_DOUBLE + LoRANetwork.FLUX_TARGET_REPLACE_MODULE_SINGLE + elif self.train_blocks == "single": + target_replace_modules = LoRANetwork.FLUX_TARGET_REPLACE_MODULE_SINGLE + elif self.train_blocks == "double": + target_replace_modules = LoRANetwork.FLUX_TARGET_REPLACE_MODULE_DOUBLE + self.unet_loras: List[Union[LoRAModule, LoRAInfModule]] - self.unet_loras, skipped_un = create_modules(True, None, unet, LoRANetwork.FLUX_TARGET_REPLACE_MODULE) - logger.info(f"create LoRA for U-Net: {len(self.unet_loras)} modules.") + self.unet_loras, skipped_un = create_modules(True, None, unet, target_replace_modules) + logger.info(f"create LoRA for FLUX {self.train_blocks} blocks: {len(self.unet_loras)} modules.") skipped = skipped_te + skipped_un if varbose and len(skipped) > 0: From 9760d097b0bd7efbeb065d4320b2216a94e76efd Mon Sep 17 00:00:00 2001 From: DukeG Date: Wed, 14 Aug 2024 19:58:54 +0800 Subject: [PATCH 072/582] Fix AttributeError: 'T5EncoderModel' object has no attribute 'text_model' While loading T5 model in GPU. --- train_network.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/train_network.py b/train_network.py index 367203f54..405aa747c 100644 --- a/train_network.py +++ b/train_network.py @@ -540,9 +540,13 @@ def train(self, args): # in case of cpu, dtype is already set to fp32 because cpu does not support fp8/fp16/bf16 if t_enc.device.type != "cpu": t_enc.to(dtype=te_weight_dtype) - if hasattr(t_enc.text_model, "embeddings"): + if hasattr(t_enc, "text_model") and hasattr(t_enc.text_model, "embeddings"): # nn.Embedding not support FP8 - t_enc.text_model.embeddings.to(dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) + t_enc.text_model.embeddings.to( + dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) + elif hasattr(t_enc, "encoder") and hasattr(t_enc.encoder, "embeddings"): + t_enc.encoder.embeddings.to( + dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: From 7db422211907df3c50703b419655202276a53301 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 14 Aug 2024 22:15:26 +0900 Subject: [PATCH 073/582] add sample image generation during training --- README.md | 2 + flux_train_network.py | 67 +++++++- library/flux_train_utils.py | 297 ++++++++++++++++++++++++++++++++++++ train_network.py | 13 +- 4 files changed, 374 insertions(+), 5 deletions(-) create mode 100644 library/flux_train_utils.py diff --git a/README.md b/README.md index 64b018804..7dc954fbc 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ This feature is experimental. The options and the training script may change in __Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. 
`requirements.txt` is also updated, so please update the requirements.__ +Aug 14, 2024: Sample image generation during training is now supported. Specify options such as `--sample_prompts` and `--sample_every_n_epochs`. It will be very slow when `--split_mode` is specified. + Aug 13, 2024: __Experimental__ A network argument `train_blocks` is added to `lora_flux`. This is to select the target blocks of LoRA from FLUX double blocks and single blocks. Specify like `--network_args "train_blocks=single"`. `all` trains both double blocks and single blocks, `double` trains only double blocks, and `single` trains only single blocks. The default (omission) is `all`. diff --git a/flux_train_network.py b/flux_train_network.py index 1d1f00d84..b8ea56223 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -10,7 +10,7 @@ init_ipex() -from library import flux_models, flux_utils, sd3_train_utils, sd3_utils, sdxl_model_util, sdxl_train_util, strategy_flux, train_util +from library import flux_models, flux_train_utils, flux_utils, sd3_train_utils, strategy_base, strategy_flux, train_util import train_network from library.utils import setup_logging @@ -28,6 +28,12 @@ def assert_extra_args(self, args, train_dataset_group): super().assert_extra_args(args, train_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: + logger.warning( + "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" + ) + args.cache_text_encoder_outputs = True + if args.cache_text_encoder_outputs: assert ( train_dataset_group.is_text_encoder_output_cacheable() @@ -139,8 +145,31 @@ def cache_text_encoder_outputs_if_needed( text_encoders[1].to(accelerator.device, dtype=weight_dtype) with accelerator.autocast(): dataset.new_cache_text_encoder_outputs(text_encoders, accelerator.is_main_process) + + # cache sample prompts + self.sample_prompts_te_outputs = None + if args.sample_prompts is not None: + logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") + + tokenize_strategy: strategy_flux.FluxTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() + text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + + prompts = sd3_train_utils.load_prompts(args.sample_prompts) + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + with accelerator.autocast(), torch.no_grad(): + for prompt_dict in prompts: + for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: + if p not in sample_prompts_te_outputs: + logger.info(f"cache Text Encoder outputs for prompt: {p}") + tokens_and_masks = tokenize_strategy.tokenize(p) + sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( + tokenize_strategy, text_encoders, tokens_and_masks, args.apply_t5_attn_mask + ) + self.sample_prompts_te_outputs = sample_prompts_te_outputs + accelerator.wait_for_everyone() + # move back to cpu logger.info("move text encoders back to cpu") text_encoders[0].to("cpu") # , dtype=torch.float32) # Text Encoder doesn't work with fp16 on CPU text_encoders[1].to("cpu") # , dtype=torch.float32) @@ -172,9 +201,36 @@ def cache_text_encoder_outputs_if_needed( # noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) # return noise_pred - def sample_images(self, accelerator, args, epoch, 
global_step, device, vae, tokenizer, text_encoder, unet): - # logger.warning("Sampling images is not supported for Flux model") - pass + def sample_images(self, accelerator, args, epoch, global_step, device, ae, tokenizer, text_encoder, flux): + if not args.split_mode: + flux_train_utils.sample_images( + accelerator, args, epoch, global_step, flux, ae, text_encoder, self.sample_prompts_te_outputs + ) + return + + class FluxUpperLowerWrapper(torch.nn.Module): + def __init__(self, flux_upper: flux_models.FluxUpper, flux_lower: flux_models.FluxLower, device: torch.device): + super().__init__() + self.flux_upper = flux_upper + self.flux_lower = flux_lower + self.target_device = device + + def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None): + self.flux_lower.to("cpu") + clean_memory_on_device(self.target_device) + self.flux_upper.to(self.target_device) + img, txt, vec, pe = self.flux_upper(img, img_ids, txt, txt_ids, timesteps, y, guidance) + self.flux_upper.to("cpu") + clean_memory_on_device(self.target_device) + self.flux_lower.to(self.target_device) + return self.flux_lower(img, txt, vec, pe) + + wrapper = FluxUpperLowerWrapper(self.flux_upper, flux, accelerator.device) + clean_memory_on_device(accelerator.device) + flux_train_utils.sample_images( + accelerator, args, epoch, global_step, wrapper, ae, text_encoder, self.sample_prompts_te_outputs + ) + clean_memory_on_device(accelerator.device) def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) @@ -389,6 +445,9 @@ def update_metadata(self, metadata, args): metadata["ss_model_prediction_type"] = args.model_prediction_type metadata["ss_discrete_flow_shift"] = args.discrete_flow_shift + def is_text_encoder_not_needed_for_training(self, args): + return args.cache_text_encoder_outputs + def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py new file mode 100644 index 000000000..91f522389 --- /dev/null +++ b/library/flux_train_utils.py @@ -0,0 +1,297 @@ +import argparse +import math +import os +import numpy as np +import toml +import json +import time +from typing import Callable, Dict, List, Optional, Tuple, Union + +import torch +from accelerate import Accelerator, PartialState +from transformers import CLIPTextModel +from tqdm import tqdm +from PIL import Image + +from library import flux_models, flux_utils, strategy_base +from library.sd3_train_utils import load_prompts +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() + +from .utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def sample_images( + accelerator: Accelerator, + args: argparse.Namespace, + epoch, + steps, + flux, + ae, + text_encoders, + sample_prompts_te_outputs, + prompt_replacement=None, +): + if steps == 0: + if not args.sample_at_first: + return + else: + if args.sample_every_n_steps is None and args.sample_every_n_epochs is None: + return + if args.sample_every_n_epochs is not None: + # sample_every_n_steps は無視する + if epoch is None or epoch % args.sample_every_n_epochs != 0: + return + else: + if steps % args.sample_every_n_steps != 0 or epoch is not None: # steps is not divisible or end of epoch + return + + logger.info("") + logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}") + if not 
os.path.isfile(args.sample_prompts): + logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}") + return + + distributed_state = PartialState() # for multi gpu distributed inference. this is a singleton, so it's safe to use it here + + # unwrap unet and text_encoder(s) + flux = accelerator.unwrap_model(flux) + text_encoders = [accelerator.unwrap_model(te) for te in text_encoders] + # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) + + prompts = load_prompts(args.sample_prompts) + + save_dir = args.output_dir + "/sample" + os.makedirs(save_dir, exist_ok=True) + + # save random state to restore later + rng_state = torch.get_rng_state() + cuda_rng_state = None + try: + cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None + except Exception: + pass + + if distributed_state.num_processes <= 1: + # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts. + with torch.no_grad(): + for prompt_dict in prompts: + sample_image_inference( + accelerator, + args, + flux, + text_encoders, + ae, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_te_outputs, + prompt_replacement, + ) + else: + # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available) + # prompt_dicts are assigned to lists based on order of processes, to attempt to time the image creation time to match enum order. Probably only works when steps and sampler are identical. + per_process_prompts = [] # list of lists + for i in range(distributed_state.num_processes): + per_process_prompts.append(prompts[i :: distributed_state.num_processes]) + + with torch.no_grad(): + with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists: + for prompt_dict in prompt_dict_lists[0]: + sample_image_inference( + accelerator, + args, + flux, + text_encoders, + ae, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_te_outputs, + prompt_replacement, + ) + + torch.set_rng_state(rng_state) + if cuda_rng_state is not None: + torch.cuda.set_rng_state(cuda_rng_state) + + clean_memory_on_device(accelerator.device) + + +def sample_image_inference( + accelerator: Accelerator, + args: argparse.Namespace, + flux: flux_models.Flux, + text_encoders: List[CLIPTextModel], + ae: flux_models.AutoEncoder, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_te_outputs, + prompt_replacement, +): + assert isinstance(prompt_dict, dict) + # negative_prompt = prompt_dict.get("negative_prompt") + sample_steps = prompt_dict.get("sample_steps", 20) + width = prompt_dict.get("width", 512) + height = prompt_dict.get("height", 512) + scale = prompt_dict.get("scale", 3.5) + seed = prompt_dict.get("seed") + # controlnet_image = prompt_dict.get("controlnet_image") + prompt: str = prompt_dict.get("prompt", "") + # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) + + if prompt_replacement is not None: + prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) + # if negative_prompt is not None: + # negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) + + if seed is not None: + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + else: + # True random sample image generation + torch.seed() + torch.cuda.seed() + + # if negative_prompt is None: + # negative_prompt = "" + + height = max(64, height - height % 
16) # round to divisible by 16 + width = max(64, width - width % 16) # round to divisible by 16 + logger.info(f"prompt: {prompt}") + # logger.info(f"negative_prompt: {negative_prompt}") + logger.info(f"height: {height}") + logger.info(f"width: {width}") + logger.info(f"sample_steps: {sample_steps}") + logger.info(f"scale: {scale}") + # logger.info(f"sample_sampler: {sampler_name}") + if seed is not None: + logger.info(f"seed: {seed}") + + # encode prompts + tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() + encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() + + if sample_prompts_te_outputs and prompt in sample_prompts_te_outputs: + te_outputs = sample_prompts_te_outputs[prompt] + else: + tokens_and_masks = tokenize_strategy.tokenize(prompt) + te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) + + l_pooled, t5_out, txt_ids = te_outputs + + # sample image + weight_dtype = ae.dtype # TOFO give dtype as argument + packed_latent_height = height // 16 + packed_latent_width = width // 16 + noise = torch.randn( + 1, + packed_latent_height * packed_latent_width, + 16 * 2 * 2, + device=accelerator.device, + dtype=weight_dtype, + generator=torch.Generator(device=accelerator.device).manual_seed(seed) if seed is not None else None, + ) + timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) # FLUX.1 dev -> shift=True + img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype) + + with accelerator.autocast(), torch.no_grad(): + x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale) + + x = x.float() + x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width) + + # latent to image + clean_memory_on_device(accelerator.device) + org_vae_device = ae.device # will be on cpu + ae.to(accelerator.device) # distributed_state.device is same as accelerator.device + with accelerator.autocast(), torch.no_grad(): + x = ae.decode(x) + ae.to(org_vae_device) + clean_memory_on_device(accelerator.device) + + x = x.clamp(-1, 1) + x = x.permute(0, 2, 3, 1) + image = Image.fromarray((127.5 * (x + 1.0)).float().cpu().numpy().astype(np.uint8)[0]) + + # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are saved in the same order as the original prompt list + # but adding 'enum' to the filename should be enough + + ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime()) + num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}" + seed_suffix = "" if seed is None else f"_{seed}" + i: int = prompt_dict["enum"] + img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png" + image.save(os.path.join(save_dir, img_filename)) + + # wandb有効時のみログを送信 + try: + wandb_tracker = accelerator.get_tracker("wandb") + try: + import wandb + except ImportError: # 事前に一度確認するのでここはエラー出ないはず + raise ImportError("No wandb / wandb がインストールされていないようです") + + wandb_tracker.log({f"sample_{i}": wandb.Image(image)}) + except: # wandb 無効時 + pass + + +def time_shift(mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + +def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]: + m = (y2 - y1) / (x2 - x1) + b = y1 - m * x1 + return lambda x: m * x + b + + +def get_schedule( + num_steps: int, + image_seq_len: int, + 
base_shift: float = 0.5, + max_shift: float = 1.15, + shift: bool = True, +) -> list[float]: + # extra step for zero + timesteps = torch.linspace(1, 0, num_steps + 1) + + # shifting the schedule to favor high timesteps for higher signal images + if shift: + # eastimate mu based on linear estimation between two points + mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len) + timesteps = time_shift(mu, 1.0, timesteps) + + return timesteps.tolist() + + +def denoise( + model: flux_models.Flux, + img: torch.Tensor, + img_ids: torch.Tensor, + txt: torch.Tensor, + txt_ids: torch.Tensor, + vec: torch.Tensor, + timesteps: list[float], + guidance: float = 4.0, +): + # this is ignored for schnell + guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) + for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): + t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) + pred = model(img=img, img_ids=img_ids, txt=txt, txt_ids=txt_ids, y=vec, timesteps=t_vec, guidance=guidance_vec) + + img = img + (t_prev - t_curr) * pred + + return img diff --git a/train_network.py b/train_network.py index 367203f54..53d71b57d 100644 --- a/train_network.py +++ b/train_network.py @@ -232,6 +232,9 @@ def get_sai_model_spec(self, args): def update_metadata(self, metadata, args): pass + def is_text_encoder_not_needed_for_training(self, args): + return False # use for sample images + # endregion def train(self, args): @@ -529,7 +532,7 @@ def train(self, args): # unet.to(accelerator.device) # this makes faster `to(dtype)` below, but consumes 23 GB VRAM # unet.to(dtype=unet_weight_dtype) # without moving to gpu, this takes a lot of time and main memory - + unet.to(accelerator.device, dtype=unet_weight_dtype) # this seems to be safer than above unet.requires_grad_(False) @@ -989,6 +992,14 @@ def remove_model(old_ckpt_name): accelerator.print(f"removing old checkpoint: {old_ckpt_file}") os.remove(old_ckpt_file) + # if text_encoder is not needed for training, delete it to save memory. + # TODO this can be automated after SDXL sample prompt cache is implemented + if self.is_text_encoder_not_needed_for_training(args): + logger.info("text_encoder is not needed for training. 
deleting to save memory.") + for t_enc in text_encoders: + del t_enc + text_encoders = [] + # For --sample_at_first self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) From 8aaa1967bd3d3a9b4b44e97e5432d23f2101cf51 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 15 Aug 2024 22:07:23 +0900 Subject: [PATCH 074/582] fix encoding latents closes #1456 --- flux_train_network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index b8ea56223..daa65c857 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -238,8 +238,8 @@ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> return noise_scheduler def encode_images_to_latents(self, args, accelerator, vae, images): - return vae.encode(images).latent_dist.sample() - + return vae.encode(images) + def shift_scale_latents(self, args, latents): return latents From 35b6cb0cd1b319d5f34b44a8c24c81c42895fa2e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 15 Aug 2024 22:07:35 +0900 Subject: [PATCH 075/582] update for torchvision --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7dc954fbc..bdb6bf2ed 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,10 @@ This repository contains training, generation and utility scripts for Stable Dif This feature is experimental. The options and the training script may change in the future. Please let us know if you have any idea to improve the training. -__Please update PyTorch to 2.4.0. We have tested with PyTorch 2.4.0 with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. `requirements.txt` is also updated, so please update the requirements.__ +__Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchvision==0.19.0` with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. `requirements.txt` is also updated, so please update the requirements.__ + +The command to install PyTorch is as follows: +`pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` Aug 14, 2024: Sample image generation during training is now supported. Specify options such as `--sample_prompts` and `--sample_every_n_epochs`. It will be very slow when `--split_mode` is specified. From 08ef886bfeb058aa6d6f7e0a19589c0fd80b3757 Mon Sep 17 00:00:00 2001 From: DukeG Date: Fri, 16 Aug 2024 11:00:08 +0800 Subject: [PATCH 076/582] Fix AttributeError: 'FluxNetworkTrainer' object has no attribute 'sample_prompts_te_outputs' Move "self.sample_prompts_te_outputs = None" from Line 150 to Line 26. 
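Presumably the attribute was only ever assigned inside `cache_text_encoder_outputs_if_needed`, so `sample_images` could read it before it existed whenever that code path was skipped; initializing it in `__init__` makes the later check safe. A minimal, self-contained sketch of the failure mode and the fix, using a hypothetical `Trainer` class rather than the real one:

```
class Trainer:
    def __init__(self):
        # the fix: the attribute always exists, even if caching never runs
        self.sample_prompts_te_outputs = None

    def cache_text_encoder_outputs(self, prompts):
        # only populated when text encoder output caching actually runs
        self.sample_prompts_te_outputs = {p: f"embeddings for {p}" for p in prompts}

    def sample_images(self):
        # without the assignment in __init__, this attribute access raises
        # AttributeError whenever cache_text_encoder_outputs() was never called
        if self.sample_prompts_te_outputs is None:
            return "no cached text encoder outputs"
        return list(self.sample_prompts_te_outputs.keys())


trainer = Trainer()
print(trainer.sample_images())  # works even though nothing was cached
```
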
--- flux_train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flux_train_network.py b/flux_train_network.py index daa65c857..59b9d84b5 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -23,6 +23,7 @@ class FluxNetworkTrainer(train_network.NetworkTrainer): def __init__(self): super().__init__() + self.sample_prompts_te_outputs = None def assert_extra_args(self, args, train_dataset_group): super().assert_extra_args(args, train_dataset_group) @@ -147,7 +148,6 @@ def cache_text_encoder_outputs_if_needed( dataset.new_cache_text_encoder_outputs(text_encoders, accelerator.is_main_process) # cache sample prompts - self.sample_prompts_te_outputs = None if args.sample_prompts is not None: logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") From 3921a4efda1cd1d7d873177ea7f51b77c3f15d3d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 16 Aug 2024 17:06:05 +0900 Subject: [PATCH 077/582] add t5xxl max token length, support schnell --- README.md | 8 ++++++++ flux_train_network.py | 32 ++++++++++++++++++++++++++++---- library/flux_models.py | 12 ++++++++---- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index bdb6bf2ed..6fb050dff 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,14 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 16, 2024: + +FLUX.1 schnell model based training is now supported (but not tested). If the name of the model file contains `schnell`, the model is treated as a schnell model. + +Added `--t5xxl_max_token_length` option to specify the maximum token length of T5XXL. The default is 512 in dev and 256 in schnell. + +Previously, when `--max_token_length` was specified, that value was used, and 512 was used when omitted (default). Therefore, there is no impact if `--max_token_length` was not specified. If `--max_token_length` was specified, please specify `--t5xxl_max_token_length` instead. `--max_token_length` is ignored during FLUX.1 training. + Aug 14, 2024: Sample image generation during training is now supported. Specify options such as `--sample_prompts` and `--sample_every_n_epochs`. It will be very slow when `--split_mode` is specified. 
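
As a rough sketch, sample image generation and the new token length option can be combined like this; the model paths and `prompts.txt` are placeholders, and the trailing ellipsis stands for the rest of your usual FLUX.1 LoRA training options:

```
accelerate launch flux_train_network.py --pretrained_model_name_or_path flux1-dev.sft --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --ae ae.sft --t5xxl_max_token_length 512 --sample_prompts prompts.txt --sample_every_n_epochs 1 --sample_at_first ...
```
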
Aug 13, 2024: diff --git a/flux_train_network.py b/flux_train_network.py index 59b9d84b5..b9a29c160 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -44,11 +44,18 @@ def assert_extra_args(self, args, train_dataset_group): args.network_train_unet_only or not args.cache_text_encoder_outputs ), "network for Text Encoder cannot be trained with caching Text Encoder outputs / Text Encoderの出力をキャッシュしながらText Encoderのネットワークを学習することはできません" + if args.max_token_length is not None: + logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません") + train_dataset_group.verify_bucket_reso_steps(32) # TODO check this + def get_flux_model_name(self, args): + return "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" + def load_target_model(self, args, weight_dtype, accelerator): # currently offload to cpu for some models - name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" # TODO change this to a more robust way + name = self.get_flux_model_name(args) + # if we load to cpu, flux.to(fp8) takes a long time model = flux_utils.load_flow_model(name, args.pretrained_model_name_or_path, weight_dtype, "cpu") @@ -104,7 +111,18 @@ def prepare_split_model(self, model, weight_dtype, accelerator): return flux_lower def get_tokenize_strategy(self, args): - return strategy_flux.FluxTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) + name = self.get_flux_model_name(args) + + if args.t5xxl_max_token_length is None: + if name == "schnell": + t5xxl_max_token_length = 256 + else: + t5xxl_max_token_length = 512 + else: + t5xxl_max_token_length = args.t5xxl_max_token_length + + logger.info(f"t5xxl_max_token_length: {t5xxl_max_token_length}") + return strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length, args.tokenizer_cache_dir) def get_tokenizers(self, tokenize_strategy: strategy_flux.FluxTokenizeStrategy): return [tokenize_strategy.clip_l, tokenize_strategy.t5xxl] @@ -239,7 +257,7 @@ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> def encode_images_to_latents(self, args, accelerator, vae, images): return vae.encode(images) - + def shift_scale_latents(self, args, latents): return latents @@ -470,7 +488,13 @@ def setup_parser() -> argparse.ArgumentParser: help="[EXPERIMENTAL] use split mode for Flux model, network arg `train_blocks=single` is required" + "/[実験的] Fluxモデルの分割モードを使用する。ネットワーク引数`train_blocks=single`が必要", ) - + parser.add_argument( + "--t5xxl_max_token_length", + type=int, + default=None, + help="maximum token length for T5-XXL. 
if omitted, 256 for schnell and 512 for dev" + " / T5-XXLの最大トークン長。省略された場合、schnellの場合は256、devの場合は512", + ) # copy from Diffusers parser.add_argument( "--weighting_scheme", diff --git a/library/flux_models.py b/library/flux_models.py index 3c7766b85..ed0bc8c7d 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -863,7 +863,8 @@ def enable_gradient_checkpointing(self): self.time_in.enable_gradient_checkpointing() self.vector_in.enable_gradient_checkpointing() - self.guidance_in.enable_gradient_checkpointing() + if self.guidance_in.__class__ != nn.Identity: + self.guidance_in.enable_gradient_checkpointing() for block in self.double_blocks + self.single_blocks: block.enable_gradient_checkpointing() @@ -875,7 +876,8 @@ def disable_gradient_checkpointing(self): self.time_in.disable_gradient_checkpointing() self.vector_in.disable_gradient_checkpointing() - self.guidance_in.disable_gradient_checkpointing() + if self.guidance_in.__class__ != nn.Identity: + self.guidance_in.disable_gradient_checkpointing() for block in self.double_blocks + self.single_blocks: block.disable_gradient_checkpointing() @@ -972,7 +974,8 @@ def enable_gradient_checkpointing(self): self.time_in.enable_gradient_checkpointing() self.vector_in.enable_gradient_checkpointing() - self.guidance_in.enable_gradient_checkpointing() + if self.guidance_in.__class__ != nn.Identity: + self.guidance_in.enable_gradient_checkpointing() for block in self.double_blocks: block.enable_gradient_checkpointing() @@ -984,7 +987,8 @@ def disable_gradient_checkpointing(self): self.time_in.disable_gradient_checkpointing() self.vector_in.disable_gradient_checkpointing() - self.guidance_in.disable_gradient_checkpointing() + if self.guidance_in.__class__ != nn.Identity: + self.guidance_in.disable_gradient_checkpointing() for block in self.double_blocks: block.disable_gradient_checkpointing() From e45d3f8634c6dd4e358a8c7972f7c851f18f94d3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 16 Aug 2024 22:19:21 +0900 Subject: [PATCH 078/582] add merge LoRA script --- README.md | 24 +++ library/train_util.py | 2 +- networks/flux_merge_lora.py | 361 ++++++++++++++++++++++++++++++++++++ 3 files changed, 386 insertions(+), 1 deletion(-) create mode 100644 networks/flux_merge_lora.py diff --git a/README.md b/README.md index 6fb050dff..e231cc24e 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ The command to install PyTorch is as follows: Aug 16, 2024: +Added a script `networks/flux_merge_lora.py` to merge LoRA into FLUX.1 checkpoint. See [Merge LoRA to FLUX.1 checkpoint](#merge-lora-to-flux1-checkpoint) for details. + FLUX.1 schnell model based training is now supported (but not tested). If the name of the model file contains `schnell`, the model is treated as a schnell model. Added `--t5xxl_max_token_length` option to specify the maximum token length of T5XXL. The default is 512 in dev and 256 in schnell. @@ -80,6 +82,28 @@ Aug 12: `--interactive` option is now working. python flux_minimal_inference.py --ckpt flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --dtype bf16 --prompt "a cat holding a sign that says hello world" --out path/to/output/dir --seed 1 --flux_dtype fp8 --offload --lora lora-flux-name.safetensors;1.0 ``` +### Merge LoRA to FLUX.1 checkpoint + +`networks/flux_merge_lora.py` merges LoRA to FLUX.1 checkpoint. 
__The script is experimental.__ + +``` +python networks/flux_merge_lora.py --flux_model flux1-dev.sft --save_to output.safetensors --models lora1.safetensors --ratios 2.0 --save_precision fp16 --loading_device cuda --working_device cpu +``` + +You can also merge multiple LoRA models into a FLUX.1 model. Specify multiple LoRA models in `--models`. Specify the same number of ratios in `--ratios`. + +`--loading_device` is the device to load the LoRA models. `--working_device` is the device to merge (calculate) the models. Default is `cpu` for both. Loading / working device examples are below (in the case of `--save_precision fp16` or `--save_precision bf16`): + +- 'cpu' / 'cpu': Uses >50GB of RAM, but works on any machine. +- 'cuda' / 'cpu': Uses 24GB of VRAM, but requires 30GB of RAM. +- 'cuda' / 'cuda': Uses 30GB of VRAM, but requires 30GB of RAM, faster than 'cuda' / 'cpu'. + +In the case of LoRA models are trained with `bf16`, we are not sure which is better, `fp16` or `bf16` for `--save_precision`. + +The script can merge multiple LoRA models. If you want to merge multiple LoRA models, specify `--concat` option to work the merged LoRA model properly. + +``` + ## SD3 training SD3 training is done with `sd3_train.py`. diff --git a/library/train_util.py b/library/train_util.py index 59ec3e56d..fa0eb9e51 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3160,7 +3160,7 @@ def load_metadata_from_safetensors(safetensors_file: str) -> dict: def build_minimum_network_metadata( - v2: Optional[bool], + v2: Optional[str], base_model: Optional[str], network_module: str, network_dim: str, diff --git a/networks/flux_merge_lora.py b/networks/flux_merge_lora.py new file mode 100644 index 000000000..c3986ef1f --- /dev/null +++ b/networks/flux_merge_lora.py @@ -0,0 +1,361 @@ +import math +import argparse +import os +import time +import torch +from safetensors import safe_open +from safetensors.torch import load_file, save_file +from tqdm import tqdm +from library import sai_model_spec, train_util +import networks.lora_flux as lora_flux +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def load_state_dict(file_name, dtype): + if os.path.splitext(file_name)[1] == ".safetensors": + sd = load_file(file_name) + metadata = train_util.load_metadata_from_safetensors(file_name) + else: + sd = torch.load(file_name, map_location="cpu") + metadata = {} + + for key in list(sd.keys()): + if type(sd[key]) == torch.Tensor: + sd[key] = sd[key].to(dtype) + + return sd, metadata + + +def save_to_file(file_name, state_dict, dtype, metadata): + if dtype is not None: + logger.info(f"converting to {dtype}...") + for key in list(state_dict.keys()): + if type(state_dict[key]) == torch.Tensor: + state_dict[key] = state_dict[key].to(dtype) + + logger.info(f"saving to: {file_name}") + save_file(state_dict, file_name, metadata=metadata) + + +def merge_to_flux_model(loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype): + # create module map without loading state_dict + logger.info(f"loading keys from FLUX.1 model: {flux_model}") + lora_name_to_module_key = {} + with safe_open(flux_model, framework="pt", device=loading_device) as flux_file: + keys = list(flux_file.keys()) + for key in keys: + if key.endswith(".weight"): + module_name = ".".join(key.split(".")[:-1]) + lora_name = lora_flux.LoRANetwork.LORA_PREFIX_FLUX + "_" + module_name.replace(".", "_") + lora_name_to_module_key[lora_name] = key + + flux_state_dict = 
load_file(flux_model, device=loading_device) + for model, ratio in zip(models, ratios): + logger.info(f"loading: {model}") + lora_sd, _ = load_state_dict(model, merge_dtype) # loading on CPU + + logger.info(f"merging...") + for key in tqdm(lora_sd.keys()): + if "lora_down" in key: + lora_name = key[: key.rfind(".lora_down")] + up_key = key.replace("lora_down", "lora_up") + alpha_key = key[: key.index("lora_down")] + "alpha" + + if lora_name not in lora_name_to_module_key: + logger.warning(f"no module found for LoRA weight: {key}. LoRA for Text Encoder is not supported yet.") + continue + + down_weight = lora_sd[key] + up_weight = lora_sd[up_key] + + dim = down_weight.size()[0] + alpha = lora_sd.get(alpha_key, dim) + scale = alpha / dim + + # W <- W + U * D + module_weight_key = lora_name_to_module_key[lora_name] + if module_weight_key not in flux_state_dict: + weight = flux_file.get_tensor(module_weight_key) + else: + weight = flux_state_dict[module_weight_key] + + weight = weight.to(working_device, merge_dtype) + up_weight = up_weight.to(working_device, merge_dtype) + down_weight = down_weight.to(working_device, merge_dtype) + + # logger.info(module_name, down_weight.size(), up_weight.size()) + if len(weight.size()) == 2: + # linear + weight = weight + ratio * (up_weight @ down_weight) * scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + weight + + ratio + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + # logger.info(conved.size(), weight.size(), module.stride, module.padding) + weight = weight + ratio * conved * scale + + flux_state_dict[module_weight_key] = weight.to(loading_device, save_dtype) + del up_weight + del down_weight + del weight + + return flux_state_dict + + +def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): + base_alphas = {} # alpha for merged model + base_dims = {} + + merged_sd = {} + base_model = None + for model, ratio in zip(models, ratios): + logger.info(f"loading: {model}") + lora_sd, lora_metadata = load_state_dict(model, merge_dtype) + + if lora_metadata is not None: + if base_model is None: + base_model = lora_metadata.get(train_util.SS_METADATA_KEY_BASE_MODEL_VERSION, None) + + # get alpha and dim + alphas = {} # alpha for current model + dims = {} # dims for current model + for key in lora_sd.keys(): + if "alpha" in key: + lora_module_name = key[: key.rfind(".alpha")] + alpha = float(lora_sd[key].detach().numpy()) + alphas[lora_module_name] = alpha + if lora_module_name not in base_alphas: + base_alphas[lora_module_name] = alpha + elif "lora_down" in key: + lora_module_name = key[: key.rfind(".lora_down")] + dim = lora_sd[key].size()[0] + dims[lora_module_name] = dim + if lora_module_name not in base_dims: + base_dims[lora_module_name] = dim + + for lora_module_name in dims.keys(): + if lora_module_name not in alphas: + alpha = dims[lora_module_name] + alphas[lora_module_name] = alpha + if lora_module_name not in base_alphas: + base_alphas[lora_module_name] = alpha + + logger.info(f"dim: {list(set(dims.values()))}, alpha: {list(set(alphas.values()))}") + + # merge + logger.info(f"merging...") + for key in tqdm(lora_sd.keys()): + if "alpha" in key: + continue + + if "lora_up" in key and concat: + concat_dim = 1 + elif "lora_down" in key and concat: + concat_dim = 0 + else: + concat_dim = None + + lora_module_name = 
key[: key.rfind(".lora_")] + + base_alpha = base_alphas[lora_module_name] + alpha = alphas[lora_module_name] + + scale = math.sqrt(alpha / base_alpha) * ratio + scale = abs(scale) if "lora_up" in key else scale # マイナスの重みに対応する。 + + if key in merged_sd: + assert ( + merged_sd[key].size() == lora_sd[key].size() or concat_dim is not None + ), f"weights shape mismatch, different dims? / 重みのサイズが合いません。dimが異なる可能性があります。" + if concat_dim is not None: + merged_sd[key] = torch.cat([merged_sd[key], lora_sd[key] * scale], dim=concat_dim) + else: + merged_sd[key] = merged_sd[key] + lora_sd[key] * scale + else: + merged_sd[key] = lora_sd[key] * scale + + # set alpha to sd + for lora_module_name, alpha in base_alphas.items(): + key = lora_module_name + ".alpha" + merged_sd[key] = torch.tensor(alpha) + if shuffle: + key_down = lora_module_name + ".lora_down.weight" + key_up = lora_module_name + ".lora_up.weight" + dim = merged_sd[key_down].shape[0] + perm = torch.randperm(dim) + merged_sd[key_down] = merged_sd[key_down][perm] + merged_sd[key_up] = merged_sd[key_up][:, perm] + + logger.info("merged model") + logger.info(f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}") + + # check all dims are same + dims_list = list(set(base_dims.values())) + alphas_list = list(set(base_alphas.values())) + all_same_dims = True + all_same_alphas = True + for dims in dims_list: + if dims != dims_list[0]: + all_same_dims = False + break + for alphas in alphas_list: + if alphas != alphas_list[0]: + all_same_alphas = False + break + + # build minimum metadata + dims = f"{dims_list[0]}" if all_same_dims else "Dynamic" + alphas = f"{alphas_list[0]}" if all_same_alphas else "Dynamic" + metadata = train_util.build_minimum_network_metadata(str(False), base_model, "networks.lora", dims, alphas, None) + + return merged_sd, metadata + + +def merge(args): + assert len(args.models) == len( + args.ratios + ), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" + + def str_to_dtype(p): + if p == "float": + return torch.float + if p == "fp16": + return torch.float16 + if p == "bf16": + return torch.bfloat16 + return None + + merge_dtype = str_to_dtype(args.precision) + save_dtype = str_to_dtype(args.save_precision) + if save_dtype is None: + save_dtype = merge_dtype + + dest_dir = os.path.dirname(args.save_to) + if not os.path.exists(dest_dir): + logger.info(f"creating directory: {dest_dir}") + os.makedirs(dest_dir) + + if args.flux_model is not None: + state_dict = merge_to_flux_model( + args.loading_device, args.working_device, args.flux_model, args.models, args.ratios, merge_dtype, save_dtype + ) + + if args.no_metadata: + sai_metadata = None + else: + merged_from = sai_model_spec.build_merged_from([args.flux_model] + args.models) + title = os.path.splitext(os.path.basename(args.save_to))[0] + sai_metadata = sai_model_spec.build_metadata( + None, False, False, False, False, False, time.time(), title=title, merged_from=merged_from, flux="dev" + ) + + logger.info(f"saving FLUX model to: {args.save_to}") + save_to_file(args.save_to, state_dict, save_dtype, sai_metadata) + + else: + state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype, args.concat, args.shuffle) + + logger.info(f"calculating hashes and creating metadata...") + + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + if not args.no_metadata: + merged_from = 
sai_model_spec.build_merged_from(args.models) + title = os.path.splitext(os.path.basename(args.save_to))[0] + sai_metadata = sai_model_spec.build_metadata( + state_dict, False, False, False, True, False, time.time(), title=title, merged_from=merged_from, flux="dev" + ) + metadata.update(sai_metadata) + + logger.info(f"saving model to: {args.save_to}") + save_to_file(args.save_to, state_dict, save_dtype, metadata) + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_precision", + type=str, + default=None, + choices=[None, "float", "fp16", "bf16"], + help="precision in saving, same to merging if omitted / 保存時に精度を変更して保存する、省略時はマージ時の精度と同じ", + ) + parser.add_argument( + "--precision", + type=str, + default="float", + choices=["float", "fp16", "bf16"], + help="precision in merging (float is recommended) / マージの計算時の精度(floatを推奨)", + ) + parser.add_argument( + "--flux_model", + type=str, + default=None, + help="FLUX.1 model to load, merge LoRA models if omitted / 読み込むモデル、指定しない場合はLoRAモデルをマージする", + ) + parser.add_argument( + "--loading_device", + type=str, + default="cpu", + help="device to load FLUX.1 model. LoRA models are loaded on CPU / FLUX.1モデルを読み込むデバイス。LoRAモデルはCPUで読み込まれます", + ) + parser.add_argument( + "--working_device", + type=str, + default="cpu", + help="device to work (merge). Merging LoRA models are done on CPU." + + " / 作業(マージ)するデバイス。LoRAモデルのマージはCPUで行われます。", + ) + parser.add_argument( + "--save_to", + type=str, + default=None, + help="destination file name: safetensors file / 保存先のファイル名、safetensorsファイル", + ) + parser.add_argument( + "--models", + type=str, + nargs="*", + help="LoRA models to merge: safetensors file / マージするLoRAモデル、safetensorsファイル", + ) + parser.add_argument("--ratios", type=float, nargs="*", help="ratios for each model / それぞれのLoRAモデルの比率") + parser.add_argument( + "--no_metadata", + action="store_true", + help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / " + + "sai modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)", + ) + parser.add_argument( + "--concat", + action="store_true", + help="concat lora instead of merge (The dim(rank) of the output LoRA is the sum of the input dims) / " + + "マージの代わりに結合する(LoRAのdim(rank)は入力dimの合計になる)", + ) + parser.add_argument( + "--shuffle", + action="store_true", + help="shuffle lora weight./ " + "LoRAの重みをシャッフルする", + ) + + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + merge(args) From 7367584e6749448cb9b012df0d3bcbe4f0531ea5 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 17 Aug 2024 14:38:34 +0900 Subject: [PATCH 079/582] fix sd3 training to work without cachine TE outputs #1465 --- sd3_train.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sd3_train.py b/sd3_train.py index 9c37cbce6..3b6c8a118 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -759,8 +759,9 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): input_ids_clip_l, input_ids_clip_g, _, l_attn_mask, g_attn_mask, _ = batch["input_ids_list"] with torch.set_grad_enabled(args.train_text_encoder): # TODO support weighted captions - input_ids_clip_l = input_ids_clip_l.to(accelerator.device) - input_ids_clip_g = input_ids_clip_g.to(accelerator.device) + # text models in sd3_models require "cpu" for input_ids + input_ids_clip_l = input_ids_clip_l.to("cpu") + input_ids_clip_g = input_ids_clip_g.to("cpu") lg_out, _, lg_pooled = text_encoding_strategy.encode_tokens( sd3_tokenize_strategy, 
[clip_l, clip_g, None], @@ -770,7 +771,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): if t5_out is None: _, _, input_ids_t5xxl, _, _, t5_attn_mask = batch["input_ids_list"] with torch.no_grad(): - input_ids_t5xxl = input_ids_t5xxl.to(accelerator.device) if t5_out is None else None + input_ids_t5xxl = input_ids_t5xxl.to("cpu") if t5_out is None else None _, t5_out, _ = text_encoding_strategy.encode_tokens( sd3_tokenize_strategy, [None, None, t5xxl], [None, None, input_ids_t5xxl, None, None, t5_attn_mask] ) From 400955d3ea4088e8da7a3917dec9b0664424e24a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 17 Aug 2024 15:36:18 +0900 Subject: [PATCH 080/582] add fine tuning FLUX.1 (WIP) --- flux_train.py | 729 ++++++++++++++++++++++++++++++++++++ flux_train_network.py | 168 +-------- library/flux_train_utils.py | 270 ++++++++++++- library/train_util.py | 2 +- 4 files changed, 1007 insertions(+), 162 deletions(-) create mode 100644 flux_train.py diff --git a/flux_train.py b/flux_train.py new file mode 100644 index 000000000..2ca20ded2 --- /dev/null +++ b/flux_train.py @@ -0,0 +1,729 @@ +# training with captions + +import argparse +import copy +import math +import os +from multiprocessing import Value +from typing import List +import toml + +from tqdm import tqdm + +import torch +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() + +from accelerate.utils import set_seed +from library import deepspeed_utils, flux_train_utils, flux_utils, strategy_base, strategy_flux +from library.sd3_train_utils import load_prompts, FlowMatchEulerDiscreteScheduler + +import library.train_util as train_util + +from library.utils import setup_logging, add_logging_arguments + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +import library.config_util as config_util + +# import library.sdxl_train_util as sdxl_train_util +from library.config_util import ( + ConfigSanitizer, + BlueprintGenerator, +) +from library.custom_train_functions import apply_masked_loss, add_custom_train_arguments + + +def train(args): + train_util.verify_training_args(args) + train_util.prepare_dataset_args(args, True) + # sdxl_train_util.verify_sdxl_training_args(args) + deepspeed_utils.prepare_deepspeed_args(args) + setup_logging(args, reset=True) + + # assert ( + # not args.weighted_captions + # ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: + logger.warning( + "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" + ) + args.cache_text_encoder_outputs = True + + cache_latents = args.cache_latents + use_dreambooth_method = args.in_json is None + + if args.seed is not None: + set_seed(args.seed) # 乱数系列を初期化する + + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
+ if args.cache_latents: + latents_caching_strategy = strategy_flux.FluxLatentsCachingStrategy( + args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check + ) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) + + # データセットを準備する + if args.dataset_class is None: + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) + if args.dataset_config is not None: + logger.info(f"Load dataset config from {args.dataset_config}") + user_config = config_util.load_user_config(args.dataset_config) + ignored = ["train_data_dir", "in_json"] + if any(getattr(args, attr) is not None for attr in ignored): + logger.warning( + "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + ", ".join(ignored) + ) + ) + else: + if use_dreambooth_method: + logger.info("Using DreamBooth method.") + user_config = { + "datasets": [ + { + "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs( + args.train_data_dir, args.reg_data_dir + ) + } + ] + } + else: + logger.info("Training with captions.") + user_config = { + "datasets": [ + { + "subsets": [ + { + "image_dir": args.train_data_dir, + "metadata_file": args.in_json, + } + ] + } + ] + } + + blueprint = blueprint_generator.generate(user_config, args) + train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + else: + train_dataset_group = train_util.load_arbitrary_dataset(args) + + current_epoch = Value("i", 0) + current_step = Value("i", 0) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) + + train_dataset_group.verify_bucket_reso_steps(16) # TODO これでいいか確認 + + if args.debug_dataset: + if args.cache_text_encoder_outputs: + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy( + strategy_flux.FluxTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, False, False + ) + ) + train_dataset_group.set_current_strategies() + train_util.debug_dataset(train_dataset_group, True) + return + if len(train_dataset_group) == 0: + logger.error( + "No data found. Please verify the metadata file and train_data_dir option. 
/ 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。" + ) + return + + if cache_latents: + assert ( + train_dataset_group.is_latent_cacheable() + ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + + if args.cache_text_encoder_outputs: + assert ( + train_dataset_group.is_text_encoder_output_cacheable() + ), "when caching text encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / text encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" + + # acceleratorを準備する + logger.info("prepare accelerator") + accelerator = train_util.prepare_accelerator(args) + + # mixed precisionに対応した型を用意しておき適宜castする + weight_dtype, save_dtype = train_util.prepare_dtype(args) + + # モデルを読み込む + name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" + + # load VAE for caching latents + ae = None + if cache_latents: + ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu") + ae.to(accelerator.device, dtype=weight_dtype) + ae.requires_grad_(False) + ae.eval() + + train_dataset_group.new_cache_latents(ae, accelerator.is_main_process) + + ae.to("cpu") # if no sampling, vae can be deleted + clean_memory_on_device(accelerator.device) + + accelerator.wait_for_everyone() + + # prepare tokenize strategy + if args.t5xxl_max_token_length is None: + if name == "schnell": + t5xxl_max_token_length = 256 + else: + t5xxl_max_token_length = 512 + else: + t5xxl_max_token_length = args.t5xxl_max_token_length + + flux_tokenize_strategy = strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length) + strategy_base.TokenizeStrategy.set_strategy(flux_tokenize_strategy) + + # load clip_l, t5xxl for caching text encoder outputs + clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu") + t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu") + clip_l.eval() + t5xxl.eval() + clip_l.requires_grad_(False) + t5xxl.requires_grad_(False) + + text_encoding_strategy = strategy_flux.FluxTextEncodingStrategy(args.apply_t5_attn_mask) + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + + # cache text encoder outputs + sample_prompts_te_outputs = None + if args.cache_text_encoder_outputs: + # Text Encodes are eval and no grad here + clip_l.to(accelerator.device) + t5xxl.to(accelerator.device) + + text_encoder_caching_strategy = strategy_flux.FluxTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, False, False, args.apply_t5_attn_mask + ) + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy) + + with accelerator.autocast(): + train_dataset_group.new_cache_text_encoder_outputs([clip_l, t5xxl], accelerator.is_main_process) + + # cache sample prompt's embeddings to free text encoder's memory + if args.sample_prompts is not None: + logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") + + tokenize_strategy: strategy_flux.FluxTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() + text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + + prompts = load_prompts(args.sample_prompts) + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + with accelerator.autocast(), torch.no_grad(): + for prompt_dict in prompts: + for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", 
"")]: + if p not in sample_prompts_te_outputs: + logger.info(f"cache Text Encoder outputs for prompt: {p}") + tokens_and_masks = tokenize_strategy.tokenize(p) + sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( + tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask + ) + + accelerator.wait_for_everyone() + + # now we can delete Text Encoders to free memory + clip_l = None + t5xxl = None + + # load FLUX + # if we load to cpu, flux.to(fp8) takes a long time + flux = flux_utils.load_flow_model(name, args.pretrained_model_name_or_path, weight_dtype, "cpu") + + if args.gradient_checkpointing: + flux.enable_gradient_checkpointing() + + flux.requires_grad_(True) + + if not cache_latents: + # load VAE here if not cached + ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu") + ae.requires_grad_(False) + ae.eval() + ae.to(accelerator.device, dtype=weight_dtype) + + training_models = [] + params_to_optimize = [] + training_models.append(flux) + params_to_optimize.append({"params": list(flux.parameters()), "lr": args.learning_rate}) + + # calculate number of trainable parameters + n_params = 0 + for group in params_to_optimize: + for p in group["params"]: + n_params += p.numel() + + accelerator.print(f"number of trainable parameters: {n_params}") + + # 学習に必要なクラスを準備する + accelerator.print("prepare optimizer, data loader etc.") + + if args.fused_optimizer_groups: + # fused backward pass: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html + # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each group of parameters. + # This balances memory usage and management complexity. + + # calculate total number of parameters + n_total_params = sum(len(params["params"]) for params in params_to_optimize) + params_per_group = math.ceil(n_total_params / args.fused_optimizer_groups) + + # split params into groups, keeping the learning rate the same for all params in a group + # this will increase the number of groups if the learning rate is different for different params (e.g. U-Net and text encoders) + grouped_params = [] + param_group = [] + param_group_lr = -1 + for group in params_to_optimize: + lr = group["lr"] + for p in group["params"]: + # if the learning rate is different for different params, start a new group + if lr != param_group_lr: + if param_group: + grouped_params.append({"params": param_group, "lr": param_group_lr}) + param_group = [] + param_group_lr = lr + + param_group.append(p) + + # if the group has enough parameters, start a new group + if len(param_group) == params_per_group: + grouped_params.append({"params": param_group, "lr": param_group_lr}) + param_group = [] + param_group_lr = -1 + + if param_group: + grouped_params.append({"params": param_group, "lr": param_group_lr}) + + # prepare optimizers for each group + optimizers = [] + for group in grouped_params: + _, _, optimizer = train_util.get_optimizer(args, trainable_params=[group]) + optimizers.append(optimizer) + optimizer = optimizers[0] # avoid error in the following code + + logger.info(f"using {len(optimizers)} optimizers for fused optimizer groups") + + else: + _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) + + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. 
Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 + n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers + train_dataloader = torch.utils.data.DataLoader( + train_dataset_group, + batch_size=1, + shuffle=True, + collate_fn=collator, + num_workers=n_workers, + persistent_workers=args.persistent_data_loader_workers, + ) + + # 学習ステップ数を計算する + if args.max_train_epochs is not None: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + + # データセット側にも学習ステップを送信 + train_dataset_group.set_max_train_steps(args.max_train_steps) + + # lr schedulerを用意する + if args.fused_optimizer_groups: + # prepare lr schedulers for each optimizer + lr_schedulers = [train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) for optimizer in optimizers] + lr_scheduler = lr_schedulers[0] # avoid error in the following code + else: + lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) + + # 実験的機能:勾配も含めたfp16/bf16学習を行う モデル全体をfp16/bf16にする + if args.full_fp16: + assert ( + args.mixed_precision == "fp16" + ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" + accelerator.print("enable full fp16 training.") + flux.to(weight_dtype) + if clip_l is not None: + clip_l.to(weight_dtype) + t5xxl.to(weight_dtype) # TODO check works with fp16 or not + elif args.full_bf16: + assert ( + args.mixed_precision == "bf16" + ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" + accelerator.print("enable full bf16 training.") + flux.to(weight_dtype) + if clip_l is not None: + clip_l.to(weight_dtype) + t5xxl.to(weight_dtype) + + # if we don't cache text encoder outputs, move them to device + if not args.cache_text_encoder_outputs: + clip_l.to(accelerator.device) + t5xxl.to(accelerator.device) + + clean_memory_on_device(accelerator.device) + + if args.deepspeed: + ds_model = deepspeed_utils.prepare_deepspeed_model(args, mmdit=flux) + # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007 + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_models = [ds_model] + + else: + # acceleratorがなんかよろしくやってくれるらしい + flux = accelerator.prepare(flux) + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + + # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする + if args.full_fp16: + # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. + # -> But we think it's ok to patch accelerator even if deepspeed is enabled. 
+ train_util.patch_accelerator_for_fp16_training(accelerator) + + # resumeする + train_util.resume_from_local_or_hf_if_specified(accelerator, args) + + if args.fused_backward_pass: + # use fused optimizer for backward pass: other optimizers will be supported in the future + import library.adafactor_fused + + library.adafactor_fused.patch_adafactor_fused(optimizer) + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + + def __grad_hook(tensor: torch.Tensor, param_group=param_group): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + parameter.register_post_accumulate_grad_hook(__grad_hook) + + elif args.fused_optimizer_groups: + # prepare for additional optimizers and lr schedulers + for i in range(1, len(optimizers)): + optimizers[i] = accelerator.prepare(optimizers[i]) + lr_schedulers[i] = accelerator.prepare(lr_schedulers[i]) + + # counters are used to determine when to step the optimizer + global optimizer_hooked_count + global num_parameters_per_group + global parameter_optimizer_map + + optimizer_hooked_count = {} + num_parameters_per_group = [0] * len(optimizers) + parameter_optimizer_map = {} + + for opt_idx, optimizer in enumerate(optimizers): + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + + def optimizer_hook(parameter: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(parameter, args.max_grad_norm) + + i = parameter_optimizer_map[parameter] + optimizer_hooked_count[i] += 1 + if optimizer_hooked_count[i] == num_parameters_per_group[i]: + optimizers[i].step() + optimizers[i].zero_grad(set_to_none=True) + + parameter.register_post_accumulate_grad_hook(optimizer_hook) + parameter_optimizer_map[parameter] = opt_idx + num_parameters_per_group[opt_idx] += 1 + + # epoch数を計算する + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0): + args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1 + + # 学習する + # total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + accelerator.print("running training / 学習開始") + accelerator.print(f" num examples / サンプル数: {train_dataset_group.num_train_images}") + accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}") + accelerator.print(f" num epochs / epoch数: {num_train_epochs}") + accelerator.print( + f" batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}" + ) + # accelerator.print( + # f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}" + # ) + accelerator.print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") + accelerator.print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}") + + progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") + global_step = 0 + + noise_scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) + noise_scheduler_copy = 
copy.deepcopy(noise_scheduler) + + if accelerator.is_main_process: + init_kwargs = {} + if args.wandb_run_name: + init_kwargs["wandb"] = {"name": args.wandb_run_name} + if args.log_tracker_config is not None: + init_kwargs = toml.load(args.log_tracker_config) + accelerator.init_trackers( + "finetuning" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, + ) + + # For --sample_at_first + flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs) + + loss_recorder = train_util.LossRecorder() + epoch = 0 # avoid error when max_train_steps is 0 + for epoch in range(num_train_epochs): + accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") + current_epoch.value = epoch + 1 + + for m in training_models: + m.train() + + for step, batch in enumerate(train_dataloader): + current_step.value = global_step + + if args.fused_optimizer_groups: + optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} # reset counter for each step + + with accelerator.accumulate(*training_models): + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device, dtype=weight_dtype) + else: + with torch.no_grad(): + # encode images to latents. images are [-1, 1] + latents = ae.encode(batch["images"]) + + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(latents)): + accelerator.print("NaN found in latents, replacing with zeros") + latents = torch.nan_to_num(latents, 0, out=latents) + + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + text_encoder_conds = text_encoder_outputs_list + else: + # not cached or training, so get from text encoders + tokens_and_masks = batch["input_ids_list"] + with torch.no_grad(): + input_ids = [ids.to(accelerator.device) for ids in batch["input_ids_list"]] + text_encoder_conds = text_encoding_strategy.encode_tokens( + tokenize_strategy, [clip_l, t5xxl], input_ids, args.apply_t5_attn_mask + ) + if args.full_fp16: + text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] + + # TODO support some features for noise implemented in get_noise_noisy_latents_and_timesteps + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, accelerator.device, weight_dtype + ) + + # pack latents and get img_ids + packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4 + packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2 + img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device) + + # get guidance + guidance_vec = torch.full((bsz,), args.guidance_scale, device=accelerator.device) + + # call model + l_pooled, t5_out, txt_ids = text_encoder_conds + with accelerator.autocast(): + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) + model_pred = flux( + img=packed_noisy_model_input, + img_ids=img_ids, + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + timesteps=timesteps / 1000, + guidance=guidance_vec, + ) + + # unpack 
latents + model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) + + # apply model prediction type + model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) + + # flow matching loss: this is different from SD3 + target = noise - latents + + # calculate loss + loss = train_util.conditional_loss( + model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=None + ) + if weighting is not None: + loss = loss * weighting + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) + loss = loss.mean([1, 2, 3]) + + loss_weights = batch["loss_weights"] # 各sampleごとのweight + loss = loss * loss_weights + loss = loss.mean() + + # backward + accelerator.backward(loss) + + if not (args.fused_backward_pass or args.fused_optimizer_groups): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + params_to_clip = [] + for m in training_models: + params_to_clip.extend(m.parameters()) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=True) + else: + # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook + lr_scheduler.step() + if args.fused_optimizer_groups: + for i in range(1, len(optimizers)): + lr_schedulers[i].step() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + flux_train_utils.sample_images( + accelerator, args, epoch, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs + ) + + # 指定ステップごとにモデルを保存 + if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: + accelerator.wait_for_everyone() + if accelerator.is_main_process: + flux_train_utils.save_flux_model_on_epoch_end_or_stepwise( + args, + False, + accelerator, + save_dtype, + epoch, + num_train_epochs, + global_step, + accelerator.unwrap_model(flux), + ) + + current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず + if args.logging_dir is not None: + logs = {"loss": current_loss} + train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True) + + accelerator.log(logs, step=global_step) + + loss_recorder.add(epoch=epoch, step=step, loss=current_loss) + avr_loss: float = loss_recorder.moving_average + logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + + if global_step >= args.max_train_steps: + break + + if args.logging_dir is not None: + logs = {"loss/epoch": loss_recorder.moving_average} + accelerator.log(logs, step=epoch + 1) + + accelerator.wait_for_everyone() + + if args.save_every_n_epochs is not None: + if accelerator.is_main_process: + flux_train_utils.save_flux_model_on_epoch_end_or_stepwise( + args, + True, + accelerator, + save_dtype, + epoch, + num_train_epochs, + global_step, + accelerator.unwrap_model(flux), + ) + + flux_train_utils.sample_images( + accelerator, args, epoch + 1, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs + ) + + is_main_process = accelerator.is_main_process + # if is_main_process: + flux = accelerator.unwrap_model(flux) + clip_l = accelerator.unwrap_model(clip_l) + clip_g = accelerator.unwrap_model(clip_g) + if t5xxl is not None: + t5xxl = accelerator.unwrap_model(t5xxl) + + accelerator.end_training() + + if args.save_state or 
args.save_state_on_train_end: + train_util.save_state_on_train_end(args, accelerator) + + del accelerator # この後メモリを使うのでこれは消す + + if is_main_process: + flux_train_utils.save_flux_model_on_train_end(args, save_dtype, epoch, global_step, flux, ae) + logger.info("model saved.") + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + add_logging_arguments(parser) + train_util.add_sd_models_arguments(parser) # TODO split this + train_util.add_dataset_arguments(parser, True, True, True) + train_util.add_training_arguments(parser, False) + train_util.add_masked_loss_arguments(parser) + deepspeed_utils.add_deepspeed_arguments(parser) + train_util.add_sd_saving_arguments(parser) + train_util.add_optimizer_arguments(parser) + config_util.add_config_arguments(parser) + add_custom_train_arguments(parser) # TODO remove this from here + flux_train_utils.add_flux_train_arguments(parser) + + parser.add_argument( + "--fused_optimizer_groups", + type=int, + default=None, + help="number of optimizers for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizer数", + ) + parser.add_argument( + "--skip_latents_validity_check", + action="store_true", + help="skip latents validity check / latentsの正当性チェックをスキップする", + ) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + train(args) diff --git a/flux_train_network.py b/flux_train_network.py index b9a29c160..002252c87 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -274,85 +274,14 @@ def get_noise_pred_and_target( weight_dtype, train_unet, ): - # copy from sd3_train.py and modified - - def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): - sigmas = self.noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype) - schedule_timesteps = self.noise_scheduler_copy.timesteps.to(accelerator.device) - timesteps = timesteps.to(accelerator.device) - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < n_dim: - sigma = sigma.unsqueeze(-1) - return sigma - - def compute_density_for_timestep_sampling( - weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None - ): - """Compute the density for sampling the timesteps when doing SD3 training. - - Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. - - SD3 paper reference: https://arxiv.org/abs/2403.03206v1. - """ - if weighting_scheme == "logit_normal": - # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$). - u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu") - u = torch.nn.functional.sigmoid(u) - elif weighting_scheme == "mode": - u = torch.rand(size=(batch_size,), device="cpu") - u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u) - else: - u = torch.rand(size=(batch_size,), device="cpu") - return u - - def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): - """Computes loss weighting scheme for SD3 training. - - Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. - - SD3 paper reference: https://arxiv.org/abs/2403.03206v1. 
- """ - if weighting_scheme == "sigma_sqrt": - weighting = (sigmas**-2.0).float() - elif weighting_scheme == "cosmap": - bot = 1 - 2 * sigmas + 2 * sigmas**2 - weighting = 2 / (math.pi * bot) - else: - weighting = torch.ones_like(sigmas) - return weighting - # Sample noise that we'll add to the latents noise = torch.randn_like(latents) bsz = latents.shape[0] - if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": - # Simple random t-based noise sampling - if args.timestep_sampling == "sigmoid": - # https://github.com/XLabs-AI/x-flux/tree/main - t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=accelerator.device)) - else: - t = torch.rand((bsz,), device=accelerator.device) - timesteps = t * 1000.0 - t = t.view(-1, 1, 1, 1) - noisy_model_input = (1 - t) * latents + t * noise - else: - # Sample a random timestep for each image - # for weighting schemes where we sample timesteps non-uniformly - u = compute_density_for_timestep_sampling( - weighting_scheme=args.weighting_scheme, - batch_size=bsz, - logit_mean=args.logit_mean, - logit_std=args.logit_std, - mode_scale=args.mode_scale, - ) - indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long() - timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=accelerator.device) - - # Add noise according to flow matching. - sigmas = get_sigmas(timesteps, n_dim=latents.ndim, dtype=weight_dtype) - noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, accelerator.device, weight_dtype + ) # pack latents and get img_ids packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4 @@ -425,20 +354,8 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): # unpack latents model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) - if args.model_prediction_type == "raw": - # use model_pred as is - weighting = None - elif args.model_prediction_type == "additive": - # add the model_pred to the noisy_model_input - model_pred = model_pred + noisy_model_input - weighting = None - elif args.model_prediction_type == "sigma_scaled": - # apply sigma scaling - model_pred = model_pred * (-sigmas) + noisy_model_input - - # these weighting schemes use a uniform timestep sampling - # and instead post-weight the loss - weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) + # apply model prediction type + model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) # flow matching loss: this is different from SD3 target = noise - latents @@ -469,83 +386,14 @@ def is_text_encoder_not_needed_for_training(self, args): def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() - # sdxl_train_util.add_sdxl_training_arguments(parser) - parser.add_argument("--clip_l", type=str, help="path to clip_l") - parser.add_argument("--t5xxl", type=str, help="path to t5xxl") - parser.add_argument("--ae", type=str, help="path to ae") - parser.add_argument("--apply_t5_attn_mask", action="store_true") - parser.add_argument( - "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする" - ) - parser.add_argument( - "--cache_text_encoder_outputs_to_disk", - action="store_true", - 
help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", - ) + flux_train_utils.add_flux_train_arguments(parser) + parser.add_argument( "--split_mode", action="store_true", help="[EXPERIMENTAL] use split mode for Flux model, network arg `train_blocks=single` is required" + "/[実験的] Fluxモデルの分割モードを使用する。ネットワーク引数`train_blocks=single`が必要", ) - parser.add_argument( - "--t5xxl_max_token_length", - type=int, - default=None, - help="maximum token length for T5-XXL. if omitted, 256 for schnell and 512 for dev" - " / T5-XXLの最大トークン長。省略された場合、schnellの場合は256、devの場合は512", - ) - # copy from Diffusers - parser.add_argument( - "--weighting_scheme", - type=str, - default="none", - choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"], - ) - parser.add_argument( - "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme." - ) - parser.add_argument("--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme.") - parser.add_argument( - "--mode_scale", - type=float, - default=1.29, - help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.", - ) - parser.add_argument( - "--guidance_scale", - type=float, - default=3.5, - help="the FLUX.1 dev variant is a guidance distilled model", - ) - - parser.add_argument( - "--timestep_sampling", - choices=["sigma", "uniform", "sigmoid"], - default="sigma", - help="Method to sample timesteps: sigma-based, uniform random, or sigmoid of random normal. / タイムステップをサンプリングする方法:sigma、random uniform、またはrandom normalのsigmoid。", - ) - parser.add_argument( - "--sigmoid_scale", - type=float, - default=1.0, - help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). / sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"の場合のみ有効)。', - ) - parser.add_argument( - "--model_prediction_type", - choices=["raw", "additive", "sigma_scaled"], - default="sigma_scaled", - help="How to interpret and process the model prediction: " - "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)." - " / モデル予測の解釈と処理方法:" - "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。", - ) - parser.add_argument( - "--discrete_flow_shift", - type=float, - default=3.0, - help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. 
/ Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。", - ) return parser diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 91f522389..167d61c7e 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -12,8 +12,9 @@ from transformers import CLIPTextModel from tqdm import tqdm from PIL import Image +from safetensors.torch import save_file -from library import flux_models, flux_utils, strategy_base +from library import flux_models, flux_utils, strategy_base, train_util from library.sd3_train_utils import load_prompts from library.device_utils import init_ipex, clean_memory_on_device @@ -27,6 +28,9 @@ logger = logging.getLogger(__name__) +# region sample images + + def sample_images( accelerator: Accelerator, args: argparse.Namespace, @@ -295,3 +299,267 @@ def denoise( img = img + (t_prev - t_curr) * pred return img + + +# endregion + + +# region train +def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32): + sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype) + schedule_timesteps = noise_scheduler.timesteps.to(device) + timesteps = timesteps.to(device) + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < n_dim: + sigma = sigma.unsqueeze(-1) + return sigma + + +def compute_density_for_timestep_sampling( + weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None +): + """Compute the density for sampling the timesteps when doing SD3 training. + + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. + """ + if weighting_scheme == "logit_normal": + # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$). + u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu") + u = torch.nn.functional.sigmoid(u) + elif weighting_scheme == "mode": + u = torch.rand(size=(batch_size,), device="cpu") + u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u) + else: + u = torch.rand(size=(batch_size,), device="cpu") + return u + + +def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): + """Computes loss weighting scheme for SD3 training. + + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. 
+ """ + if weighting_scheme == "sigma_sqrt": + weighting = (sigmas**-2.0).float() + elif weighting_scheme == "cosmap": + bot = 1 - 2 * sigmas + 2 * sigmas**2 + weighting = 2 / (math.pi * bot) + else: + weighting = torch.ones_like(sigmas) + return weighting + + +def get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, device, dtype +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + bsz = latents.shape[0] + sigmas = None + + if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": + # Simple random t-based noise sampling + if args.timestep_sampling == "sigmoid": + # https://github.com/XLabs-AI/x-flux/tree/main + t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=device)) + else: + t = torch.rand((bsz,), device=device) + timesteps = t * 1000.0 + t = t.view(-1, 1, 1, 1) + noisy_model_input = (1 - t) * latents + t * noise + else: + # Sample a random timestep for each image + # for weighting schemes where we sample timesteps non-uniformly + u = compute_density_for_timestep_sampling( + weighting_scheme=args.weighting_scheme, + batch_size=bsz, + logit_mean=args.logit_mean, + logit_std=args.logit_std, + mode_scale=args.mode_scale, + ) + indices = (u * noise_scheduler.config.num_train_timesteps).long() + timesteps = noise_scheduler.timesteps[indices].to(device=device) + + # Add noise according to flow matching. + sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) + noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + + return noisy_model_input, timesteps, sigmas + + +def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas): + weighting = None + if args.model_prediction_type == "raw": + pass + elif args.model_prediction_type == "additive": + # add the model_pred to the noisy_model_input + model_pred = model_pred + noisy_model_input + elif args.model_prediction_type == "sigma_scaled": + # apply sigma scaling + model_pred = model_pred * (-sigmas) + noisy_model_input + + # these weighting schemes use a uniform timestep sampling + # and instead post-weight the loss + weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) + + return model_pred, weighting + + +def save_models(ckpt_path: str, flux: flux_models.Flux, sai_metadata: Optional[dict], save_dtype: Optional[torch.dtype] = None): + state_dict = {} + + def update_sd(prefix, sd): + for k, v in sd.items(): + key = prefix + k + if save_dtype is not None: + v = v.detach().clone().to("cpu").to(save_dtype) + state_dict[key] = v + + update_sd("", flux.state_dict()) + + save_file(state_dict, ckpt_path, metadata=sai_metadata) + + +def save_flux_model_on_train_end( + args: argparse.Namespace, save_dtype: torch.dtype, epoch: int, global_step: int, flux: flux_models.Flux +): + def sd_saver(ckpt_file, epoch_no, global_step): + sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev") + save_models(ckpt_file, flux, sai_metadata, save_dtype) + + train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None) + + +# epochとstepの保存、メタデータにepoch/stepが含まれ引数が同じになるため、統合している +# on_epoch_end: Trueならepoch終了時、Falseならstep経過時 +def save_flux_model_on_epoch_end_or_stepwise( + args: argparse.Namespace, + on_epoch_end: bool, + accelerator, + save_dtype: torch.dtype, + epoch: int, + num_train_epochs: int, + global_step: int, + flux: flux_models.Flux, +): + def sd_saver(ckpt_file, epoch_no, global_step): + 
sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev") + save_models(ckpt_file, flux, sai_metadata, save_dtype) + + train_util.save_sd_model_on_epoch_end_or_stepwise_common( + args, + on_epoch_end, + accelerator, + True, + True, + epoch, + num_train_epochs, + global_step, + sd_saver, + None, + ) + + +# endregion + + +def add_flux_train_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--clip_l", + type=str, + help="path to clip_l (*.sft or *.safetensors), should be float16 / clip_lのパス(*.sftまたは*.safetensors)、float16が前提", + ) + parser.add_argument( + "--t5xxl", + type=str, + help="path to t5xxl (*.sft or *.safetensors), should be float16 / t5xxlのパス(*.sftまたは*.safetensors)、float16が前提", + ) + parser.add_argument("--ae", type=str, help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)") + parser.add_argument( + "--t5xxl_max_token_length", + type=int, + default=None, + help="maximum token length for T5-XXL. if omitted, 256 for schnell and 512 for dev" + " / T5-XXLの最大トークン長。省略された場合、schnellの場合は256、devの場合は512", + ) + parser.add_argument( + "--apply_t5_attn_mask", + action="store_true", + help="apply attention mask (zero embs) to T5-XXL / T5-XXLにアテンションマスク(ゼロ埋め)を適用する", + ) + parser.add_argument( + "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする" + ) + parser.add_argument( + "--cache_text_encoder_outputs_to_disk", + action="store_true", + help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", + ) + parser.add_argument( + "--text_encoder_batch_size", + type=int, + default=None, + help="text encoder batch size (default: None, use dataset's batch size)" + + " / text encoderのバッチサイズ(デフォルト: None, データセットのバッチサイズを使用)", + ) + parser.add_argument( + "--disable_mmap_load_safetensors", + action="store_true", + help="disable mmap load for safetensors. Speed up model loading in WSL environment / safetensorsのmmapロードを無効にする。WSL環境等でモデル読み込みを高速化できる", + ) + + # copy from Diffusers + parser.add_argument( + "--weighting_scheme", + type=str, + default="none", + choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"], + ) + parser.add_argument( + "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme." + ) + parser.add_argument("--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme.") + parser.add_argument( + "--mode_scale", + type=float, + default=1.29, + help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.", + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=3.5, + help="the FLUX.1 dev variant is a guidance distilled model", + ) + + parser.add_argument( + "--timestep_sampling", + choices=["sigma", "uniform", "sigmoid"], + default="sigma", + help="Method to sample timesteps: sigma-based, uniform random, or sigmoid of random normal. / タイムステップをサンプリングする方法:sigma、random uniform、またはrandom normalのsigmoid。", + ) + parser.add_argument( + "--sigmoid_scale", + type=float, + default=1.0, + help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). 
/ sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"の場合のみ有効)。', + ) + parser.add_argument( + "--model_prediction_type", + choices=["raw", "additive", "sigma_scaled"], + default="sigma_scaled", + help="How to interpret and process the model prediction: " + "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)." + " / モデル予測の解釈と処理方法:" + "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。", + ) + parser.add_argument( + "--discrete_flow_shift", + type=float, + default=3.0, + help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。", + ) diff --git a/library/train_util.py b/library/train_util.py index fa0eb9e51..f4ac8740a 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2629,7 +2629,7 @@ def __getitem__(self, idx): raise NotImplementedError -def load_arbitrary_dataset(args, tokenizer) -> MinimalDataset: +def load_arbitrary_dataset(args, tokenizer=None) -> MinimalDataset: module = ".".join(args.dataset_class.split(".")[:-1]) dataset_class = args.dataset_class.split(".")[-1] module = importlib.import_module(module) From 25f77f6ef04ee760506338e7e7f9835c28657c59 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sat, 17 Aug 2024 15:54:32 +0900 Subject: [PATCH 081/582] fix flux fine tuning to work --- README.md | 4 ++++ flux_train.py | 6 ++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e231cc24e..2b7b110f3 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,10 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` + +Aug 17. 2024: +Added a script `flux_train.py` to train FLUX.1. The script is experimental and not an optimized version. It needs >28GB VRAM for training. + Aug 16, 2024: Added a script `networks/flux_merge_lora.py` to merge LoRA into FLUX.1 checkpoint. See [Merge LoRA to FLUX.1 checkpoint](#merge-lora-to-flux1-checkpoint) for details. 
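For reference, the timestep sampling and flow-matching pieces wired up by the arguments above (`--timestep_sampling sigmoid`, `--sigmoid_scale`, `--model_prediction_type`) reduce to a few lines of plain PyTorch. The sketch below is illustrative only — the shapes and values are made up for the example, and the actual implementation is `get_noisy_model_input_and_timesteps` and the `target = noise - latents` computation in `library/flux_train_utils.py` and `flux_train.py` shown above.

```
import torch

def sigmoid_flow_matching_example(latents, sigmoid_scale=1.0):
    # latents: (B, C, H, W) latent tensor; values here are arbitrary for illustration
    bsz = latents.shape[0]
    noise = torch.randn_like(latents)

    # --timestep_sampling sigmoid: t in (0, 1) from the sigmoid of a normal sample
    t = torch.sigmoid(sigmoid_scale * torch.randn((bsz,), device=latents.device))
    timesteps = t * 1000.0  # the scheduler side works with timesteps in [0, 1000)

    # rectified-flow interpolation between data and noise
    t_ = t.view(-1, 1, 1, 1)
    noisy_model_input = (1 - t_) * latents + t_ * noise

    # flow matching regression target: the velocity the model should predict
    target = noise - latents
    return noisy_model_input, timesteps, target

# toy usage with random stand-in latents
noisy, ts, tgt = sigmoid_flow_matching_example(torch.randn(2, 16, 64, 64))
print(noisy.shape, ts.shape, tgt.shape)
```

With `--model_prediction_type raw` the model output is compared to this target directly; `sigma_scaled` instead applies `model_pred * (-sigmas) + noisy_model_input` and post-weights the loss, as done by `apply_model_prediction_type` above.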
diff --git a/flux_train.py b/flux_train.py index 2ca20ded2..d2a9b3f32 100644 --- a/flux_train.py +++ b/flux_train.py @@ -674,9 +674,7 @@ def optimizer_hook(parameter: torch.Tensor): # if is_main_process: flux = accelerator.unwrap_model(flux) clip_l = accelerator.unwrap_model(clip_l) - clip_g = accelerator.unwrap_model(clip_g) - if t5xxl is not None: - t5xxl = accelerator.unwrap_model(t5xxl) + t5xxl = accelerator.unwrap_model(t5xxl) accelerator.end_training() @@ -686,7 +684,7 @@ def optimizer_hook(parameter: torch.Tensor): del accelerator # この後メモリを使うのでこれは消す if is_main_process: - flux_train_utils.save_flux_model_on_train_end(args, save_dtype, epoch, global_step, flux, ae) + flux_train_utils.save_flux_model_on_train_end(args, save_dtype, epoch, global_step, flux) logger.info("model saved.") From 7e688913aef4c852f54a703c9f91d135b17dff87 Mon Sep 17 00:00:00 2001 From: exveria1015 Date: Sun, 18 Aug 2024 12:38:05 +0900 Subject: [PATCH 082/582] =?UTF-8?q?fix:=20Flux=20=E3=81=AE=20LoRA=20?= =?UTF-8?q?=E3=83=9E=E3=83=BC=E3=82=B8=E6=A9=9F=E8=83=BD=E3=82=92=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- networks/flux_merge_lora.py | 364 +++++++++++++++++++++++++++++------- 1 file changed, 297 insertions(+), 67 deletions(-) diff --git a/networks/flux_merge_lora.py b/networks/flux_merge_lora.py index c3986ef1f..df0ba606a 100644 --- a/networks/flux_merge_lora.py +++ b/networks/flux_merge_lora.py @@ -1,13 +1,14 @@ -import math import argparse +import math import os import time + import torch -from safetensors import safe_open from safetensors.torch import load_file, save_file from tqdm import tqdm + +import lora_flux as lora_flux from library import sai_model_spec, train_util -import networks.lora_flux as lora_flux from library.utils import setup_logging setup_logging() @@ -42,34 +43,181 @@ def save_to_file(file_name, state_dict, dtype, metadata): save_file(state_dict, file_name, metadata=metadata) -def merge_to_flux_model(loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype): - # create module map without loading state_dict +def merge_to_flux_model( + loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype +): logger.info(f"loading keys from FLUX.1 model: {flux_model}") - lora_name_to_module_key = {} - with safe_open(flux_model, framework="pt", device=loading_device) as flux_file: - keys = list(flux_file.keys()) - for key in keys: - if key.endswith(".weight"): - module_name = ".".join(key.split(".")[:-1]) - lora_name = lora_flux.LoRANetwork.LORA_PREFIX_FLUX + "_" + module_name.replace(".", "_") - lora_name_to_module_key[lora_name] = key - flux_state_dict = load_file(flux_model, device=loading_device) + + def create_key_map(n_double_layers, n_single_layers, hidden_size): + key_map = {} + for index in range(n_double_layers): + prefix_from = f"transformer_blocks.{index}" + prefix_to = f"double_blocks.{index}" + + for end in ("weight", "bias"): + k = f"{prefix_from}.attn." 
+ qkv_img = f"{prefix_to}.img_attn.qkv.{end}" + qkv_txt = f"{prefix_to}.txt_attn.qkv.{end}" + + key_map[f"{k}to_q.{end}"] = (qkv_img, (0, 0, hidden_size)) + key_map[f"{k}to_k.{end}"] = (qkv_img, (0, hidden_size, hidden_size)) + key_map[f"{k}to_v.{end}"] = (qkv_img, (0, hidden_size * 2, hidden_size)) + key_map[f"{k}add_q_proj.{end}"] = (qkv_txt, (0, 0, hidden_size)) + key_map[f"{k}add_k_proj.{end}"] = ( + qkv_txt, + (0, hidden_size, hidden_size), + ) + key_map[f"{k}add_v_proj.{end}"] = ( + qkv_txt, + (0, hidden_size * 2, hidden_size), + ) + + block_map = { + "attn.to_out.0.weight": "img_attn.proj.weight", + "attn.to_out.0.bias": "img_attn.proj.bias", + "norm1.linear.weight": "img_mod.lin.weight", + "norm1.linear.bias": "img_mod.lin.bias", + "norm1_context.linear.weight": "txt_mod.lin.weight", + "norm1_context.linear.bias": "txt_mod.lin.bias", + "attn.to_add_out.weight": "txt_attn.proj.weight", + "attn.to_add_out.bias": "txt_attn.proj.bias", + "ff.net.0.proj.weight": "img_mlp.0.weight", + "ff.net.0.proj.bias": "img_mlp.0.bias", + "ff.net.2.weight": "img_mlp.2.weight", + "ff.net.2.bias": "img_mlp.2.bias", + "ff_context.net.0.proj.weight": "txt_mlp.0.weight", + "ff_context.net.0.proj.bias": "txt_mlp.0.bias", + "ff_context.net.2.weight": "txt_mlp.2.weight", + "ff_context.net.2.bias": "txt_mlp.2.bias", + "attn.norm_q.weight": "img_attn.norm.query_norm.scale", + "attn.norm_k.weight": "img_attn.norm.key_norm.scale", + "attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale", + "attn.norm_added_k.weight": "txt_attn.norm.key_norm.scale", + } + + for k, v in block_map.items(): + key_map[f"{prefix_from}.{k}"] = f"{prefix_to}.{v}" + + for index in range(n_single_layers): + prefix_from = f"single_transformer_blocks.{index}" + prefix_to = f"single_blocks.{index}" + + for end in ("weight", "bias"): + k = f"{prefix_from}.attn." 
+ qkv = f"{prefix_to}.linear1.{end}" + key_map[f"{k}to_q.{end}"] = (qkv, (0, 0, hidden_size)) + key_map[f"{k}to_k.{end}"] = (qkv, (0, hidden_size, hidden_size)) + key_map[f"{k}to_v.{end}"] = (qkv, (0, hidden_size * 2, hidden_size)) + key_map[f"{prefix_from}.proj_mlp.{end}"] = ( + qkv, + (0, hidden_size * 3, hidden_size * 4), + ) + + block_map = { + "norm.linear.weight": "modulation.lin.weight", + "norm.linear.bias": "modulation.lin.bias", + "proj_out.weight": "linear2.weight", + "proj_out.bias": "linear2.bias", + "attn.norm_q.weight": "norm.query_norm.scale", + "attn.norm_k.weight": "norm.key_norm.scale", + } + + for k, v in block_map.items(): + key_map[f"{prefix_from}.{k}"] = f"{prefix_to}.{v}" + + return key_map + + key_map = create_key_map( + 18, 1, 2048 + ) # Assuming 18 double layers, 1 single layer, and hidden size of 2048 + + def find_matching_key(flux_dict, lora_key): + lora_key = lora_key.replace("diffusion_model.", "") + lora_key = lora_key.replace("transformer.", "") + lora_key = lora_key.replace("lora_A", "lora_down").replace("lora_B", "lora_up") + lora_key = lora_key.replace("single_transformer_blocks", "single_blocks") + lora_key = lora_key.replace("transformer_blocks", "double_blocks") + + double_block_map = { + "attn.to_out.0": "img_attn.proj", + "norm1.linear": "img_mod.lin", + "norm1_context.linear": "txt_mod.lin", + "attn.to_add_out": "txt_attn.proj", + "ff.net.0.proj": "img_mlp.0", + "ff.net.2": "img_mlp.2", + "ff_context.net.0.proj": "txt_mlp.0", + "ff_context.net.2": "txt_mlp.2", + "attn.norm_q": "img_attn.norm.query_norm", + "attn.norm_k": "img_attn.norm.key_norm", + "attn.norm_added_q": "txt_attn.norm.query_norm", + "attn.norm_added_k": "txt_attn.norm.key_norm", + "attn.to_q": "img_attn.qkv", + "attn.to_k": "img_attn.qkv", + "attn.to_v": "img_attn.qkv", + "attn.add_q_proj": "txt_attn.qkv", + "attn.add_k_proj": "txt_attn.qkv", + "attn.add_v_proj": "txt_attn.qkv", + } + + single_block_map = { + "norm.linear": "modulation.lin", + "proj_out": "linear2", + "attn.norm_q": "norm.query_norm", + "attn.norm_k": "norm.key_norm", + "attn.to_q": "linear1", + "attn.to_k": "linear1", + "attn.to_v": "linear1", + } + + for old, new in double_block_map.items(): + lora_key = lora_key.replace(old, new) + + for old, new in single_block_map.items(): + lora_key = lora_key.replace(old, new) + + if lora_key in key_map: + flux_key = key_map[lora_key] + if isinstance(flux_key, tuple): + flux_key = flux_key[0] + logger.info(f"Found matching key: {flux_key}") + return flux_key + + # If not found in key_map, try partial matching + potential_key = lora_key + ".weight" + logger.info(f"Searching for key: {potential_key}") + matches = [k for k in flux_dict.keys() if potential_key in k] + if matches: + logger.info(f"Found matching key: {matches[0]}") + return matches[0] + return None + + merged_keys = set() for model, ratio in zip(models, ratios): logger.info(f"loading: {model}") - lora_sd, _ = load_state_dict(model, merge_dtype) # loading on CPU + lora_sd, _ = load_state_dict(model, merge_dtype) - logger.info(f"merging...") + logger.info("merging...") for key in tqdm(lora_sd.keys()): - if "lora_down" in key: - lora_name = key[: key.rfind(".lora_down")] - up_key = key.replace("lora_down", "lora_up") - alpha_key = key[: key.index("lora_down")] + "alpha" - - if lora_name not in lora_name_to_module_key: - logger.warning(f"no module found for LoRA weight: {key}. 
LoRA for Text Encoder is not supported yet.") + if "lora_down" in key or "lora_A" in key: + lora_name = key[ + : key.rfind(".lora_down" if "lora_down" in key else ".lora_A") + ] + up_key = key.replace("lora_down", "lora_up").replace("lora_A", "lora_B") + alpha_key = ( + key[: key.index("lora_down" if "lora_down" in key else "lora_A")] + + "alpha" + ) + + logger.info(f"Processing LoRA key: {lora_name}") + flux_key = find_matching_key(flux_state_dict, lora_name) + + if flux_key is None: + logger.warning(f"no module found for LoRA weight: {key}") continue + logger.info(f"Merging LoRA key {lora_name} into Flux key {flux_key}") + down_weight = lora_sd[key] up_weight = lora_sd[up_key] @@ -77,40 +225,74 @@ def merge_to_flux_model(loading_device, working_device, flux_model, models, rati alpha = lora_sd.get(alpha_key, dim) scale = alpha / dim - # W <- W + U * D - module_weight_key = lora_name_to_module_key[lora_name] - if module_weight_key not in flux_state_dict: - weight = flux_file.get_tensor(module_weight_key) - else: - weight = flux_state_dict[module_weight_key] + weight = flux_state_dict[flux_key] weight = weight.to(working_device, merge_dtype) up_weight = up_weight.to(working_device, merge_dtype) down_weight = down_weight.to(working_device, merge_dtype) - # logger.info(module_name, down_weight.size(), up_weight.size()) - if len(weight.size()) == 2: - # linear - weight = weight + ratio * (up_weight @ down_weight) * scale - elif down_weight.size()[2:4] == (1, 1): - # conv2d 1x1 - weight = ( - weight - + ratio - * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) - * scale - ) + if lora_name.startswith("transformer."): + if "qkv" in flux_key: + hidden_size = weight.size(-1) // 3 + update = ratio * (up_weight @ down_weight) * scale + + if "img_attn" in flux_key or "txt_attn" in flux_key: + q, k, v = torch.chunk(weight, 3, dim=-1) + if "to_q" in lora_name or "add_q_proj" in lora_name: + q += update.reshape(q.shape) + elif "to_k" in lora_name or "add_k_proj" in lora_name: + k += update.reshape(k.shape) + elif "to_v" in lora_name or "add_v_proj" in lora_name: + v += update.reshape(v.shape) + weight = torch.cat([q, k, v], dim=-1) + else: + if len(weight.size()) == 2: + weight = weight + ratio * (up_weight @ down_weight) * scale + elif down_weight.size()[2:4] == (1, 1): + weight = ( + weight + + ratio + * ( + up_weight.squeeze(3).squeeze(2) + @ down_weight.squeeze(3).squeeze(2) + ) + .unsqueeze(2) + .unsqueeze(3) + * scale + ) + else: + conved = torch.nn.functional.conv2d( + down_weight.permute(1, 0, 2, 3), up_weight + ).permute(1, 0, 2, 3) + weight = weight + ratio * conved * scale else: - # conv2d 3x3 - conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) - # logger.info(conved.size(), weight.size(), module.stride, module.padding) - weight = weight + ratio * conved * scale - - flux_state_dict[module_weight_key] = weight.to(loading_device, save_dtype) + if len(weight.size()) == 2: + weight = weight + ratio * (up_weight @ down_weight) * scale + elif down_weight.size()[2:4] == (1, 1): + weight = ( + weight + + ratio + * ( + up_weight.squeeze(3).squeeze(2) + @ down_weight.squeeze(3).squeeze(2) + ) + .unsqueeze(2) + .unsqueeze(3) + * scale + ) + else: + conved = torch.nn.functional.conv2d( + down_weight.permute(1, 0, 2, 3), up_weight + ).permute(1, 0, 2, 3) + weight = weight + ratio * conved * scale + + flux_state_dict[flux_key] = weight.to(loading_device, save_dtype) + merged_keys.add(flux_key) del up_weight del 
down_weight del weight + logger.info(f"Merged keys: {sorted(list(merged_keys))}") return flux_state_dict @@ -126,7 +308,9 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): if lora_metadata is not None: if base_model is None: - base_model = lora_metadata.get(train_util.SS_METADATA_KEY_BASE_MODEL_VERSION, None) + base_model = lora_metadata.get( + train_util.SS_METADATA_KEY_BASE_MODEL_VERSION, None + ) # get alpha and dim alphas = {} # alpha for current model @@ -152,10 +336,12 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): if lora_module_name not in base_alphas: base_alphas[lora_module_name] = alpha - logger.info(f"dim: {list(set(dims.values()))}, alpha: {list(set(alphas.values()))}") + logger.info( + f"dim: {list(set(dims.values()))}, alpha: {list(set(alphas.values()))}" + ) # merge - logger.info(f"merging...") + logger.info("merging...") for key in tqdm(lora_sd.keys()): if "alpha" in key: continue @@ -173,14 +359,19 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): alpha = alphas[lora_module_name] scale = math.sqrt(alpha / base_alpha) * ratio - scale = abs(scale) if "lora_up" in key else scale # マイナスの重みに対応する。 + scale = ( + abs(scale) if "lora_up" in key else scale + ) # マイナスの重みに対応する。 if key in merged_sd: assert ( - merged_sd[key].size() == lora_sd[key].size() or concat_dim is not None - ), f"weights shape mismatch, different dims? / 重みのサイズが合いません。dimが異なる可能性があります。" + merged_sd[key].size() == lora_sd[key].size() + or concat_dim is not None + ), "weights shape mismatch, different dims? / 重みのサイズが合いません。dimが異なる可能性があります。" if concat_dim is not None: - merged_sd[key] = torch.cat([merged_sd[key], lora_sd[key] * scale], dim=concat_dim) + merged_sd[key] = torch.cat( + [merged_sd[key], lora_sd[key] * scale], dim=concat_dim + ) else: merged_sd[key] = merged_sd[key] + lora_sd[key] * scale else: @@ -199,7 +390,9 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): merged_sd[key_up] = merged_sd[key_up][:, perm] logger.info("merged model") - logger.info(f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}") + logger.info( + f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}" + ) # check all dims are same dims_list = list(set(base_dims.values())) @@ -218,15 +411,17 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): # build minimum metadata dims = f"{dims_list[0]}" if all_same_dims else "Dynamic" alphas = f"{alphas_list[0]}" if all_same_alphas else "Dynamic" - metadata = train_util.build_minimum_network_metadata(str(False), base_model, "networks.lora", dims, alphas, None) + metadata = train_util.build_minimum_network_metadata( + str(False), base_model, "networks.lora", dims, alphas, None + ) return merged_sd, metadata def merge(args): - assert len(args.models) == len( - args.ratios - ), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" + assert ( + len(args.models) == len(args.ratios) + ), "number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" def str_to_dtype(p): if p == "float": @@ -249,27 +444,48 @@ def str_to_dtype(p): if args.flux_model is not None: state_dict = merge_to_flux_model( - args.loading_device, args.working_device, args.flux_model, args.models, args.ratios, merge_dtype, save_dtype + args.loading_device, + args.working_device, + args.flux_model, + args.models, + args.ratios, + merge_dtype, + save_dtype, ) if args.no_metadata: 
sai_metadata = None else: - merged_from = sai_model_spec.build_merged_from([args.flux_model] + args.models) + merged_from = sai_model_spec.build_merged_from( + [args.flux_model] + args.models + ) title = os.path.splitext(os.path.basename(args.save_to))[0] sai_metadata = sai_model_spec.build_metadata( - None, False, False, False, False, False, time.time(), title=title, merged_from=merged_from, flux="dev" + None, + False, + False, + False, + False, + False, + time.time(), + title=title, + merged_from=merged_from, + flux="dev", ) logger.info(f"saving FLUX model to: {args.save_to}") save_to_file(args.save_to, state_dict, save_dtype, sai_metadata) else: - state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype, args.concat, args.shuffle) + state_dict, metadata = merge_lora_models( + args.models, args.ratios, merge_dtype, args.concat, args.shuffle + ) - logger.info(f"calculating hashes and creating metadata...") + logger.info("calculating hashes and creating metadata...") - model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes( + state_dict, metadata + ) metadata["sshs_model_hash"] = model_hash metadata["sshs_legacy_hash"] = legacy_hash @@ -277,7 +493,16 @@ def str_to_dtype(p): merged_from = sai_model_spec.build_merged_from(args.models) title = os.path.splitext(os.path.basename(args.save_to))[0] sai_metadata = sai_model_spec.build_metadata( - state_dict, False, False, False, True, False, time.time(), title=title, merged_from=merged_from, flux="dev" + state_dict, + False, + False, + False, + True, + False, + time.time(), + title=title, + merged_from=merged_from, + flux="dev", ) metadata.update(sai_metadata) @@ -332,7 +557,12 @@ def setup_parser() -> argparse.ArgumentParser: nargs="*", help="LoRA models to merge: safetensors file / マージするLoRAモデル、safetensorsファイル", ) - parser.add_argument("--ratios", type=float, nargs="*", help="ratios for each model / それぞれのLoRAモデルの比率") + parser.add_argument( + "--ratios", + type=float, + nargs="*", + help="ratios for each model / それぞれのLoRAモデルの比率", + ) parser.add_argument( "--no_metadata", action="store_true", From ef535ec6bb99918027afc1e31efa72cd3761d453 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 18 Aug 2024 16:54:18 +0900 Subject: [PATCH 083/582] add memory efficient training for FLUX.1 --- README.md | 64 ++++++++++++-- flux_train.py | 187 +++++++++++++++++++++++++++++------------ library/flux_models.py | 182 ++++++++++++++++++++++++++++++++++----- 3 files changed, 354 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 2b7b110f3..521e82e86 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,11 @@ The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` -Aug 17. 2024: +Aug 18, 2024: +Memory-efficient training based on 2kpr's implementation is implemented in `flux_train.py`. Thanks to 2kpr. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. + + +Aug 17, 2024: Added a script `flux_train.py` to train FLUX.1. The script is experimental and not an optimized version. It needs >28GB VRAM for training. Aug 16, 2024: @@ -39,11 +43,23 @@ Aug 11, 2024: Fix `--apply_t5_attn_mask` option to work. Please remove and re-ge Aug 10, 2024: LoRA key prefix is changed to `lora_unet` from `lora_flex` to make it compatible with ComfyUI. -We have added a new training script for LoRA training. The script is `flux_train_network.py`. 
See `--help` for options. Sample command is below, settings are based on [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit). It will work with 24GB VRAM GPUs. + +### FLUX.1 LoRA training + +We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. Sample command is below, settings are based on [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit). It will work with 24GB VRAM GPUs. ``` -accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py --pretrained_model_name_or_path flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 --network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml --output_dir path/to/output/dir --output_name flux-lora-name --timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0 --loss_type l2 +accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py +--pretrained_model_name_or_path flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors +--ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers +--max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 +--network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 +--network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base +--highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml +--output_dir path/to/output/dir --output_name flux-lora-name --timestep_sampling sigmoid +--model_prediction_type raw --guidance_scale 1.0 --loss_type l2 ``` +(The command is multi-line for readability. Please combine it into one line.) The training can be done with 16GB VRAM GPUs with Adafactor optimizer. Please use settings like below: @@ -80,12 +96,44 @@ The trained LoRA model can be used with ComfyUI. The inference script is also available. The script is `flux_minimal_inference.py`. See `--help` for options. -Aug 12: `--interactive` option is now working. - ``` python flux_minimal_inference.py --ckpt flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --dtype bf16 --prompt "a cat holding a sign that says hello world" --out path/to/output/dir --seed 1 --flux_dtype fp8 --offload --lora lora-flux-name.safetensors;1.0 ``` +### FLUX.1 fine-tuning + +Sample command for FLUX.1 fine-tuning is below. This will work with 24GB VRAM GPUs, and 64GB main memory is recommended. 
+ +``` +accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train.py +--pretrained_model_name_or_path flux1-dev.sft --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --ae ae_dev.sft +--mixed_precision bf16 --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 +--seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 +--dataset_config dataset_1024_bs1.toml --output_dir path/to/output/dir --output_name test-bf16 +--learning_rate 5e-5 --max_train_epochs 4 --sdpa --highvram --cache_text_encoder_outputs_to_disk --cache_latents_to_disk --save_every_n_epochs 1 +--optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" +--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0 +--blockwise_fused_optimizer --double_blocks_to_swap 6 --cpu_offload_checkpointing +``` + +(Combine the command into one line.) + +Options are almost the same as LoRA training. The difference is `--blockwise_fused_optimizer`, `--double_blocks_to_swap` and `--cpu_offload_checkpointing`. `--single_blocks_to_swap` is also available. + +`--blockwise_fused_optimizer` enables the fusing of the optimizer for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency. `--fused_optimizer_groups` is deprecated due to the addition of this option for FLUX.1 training. + +`--double_blocks_to_swap` and `--single_blocks_to_swap` are the number of double blocks and single blocks to swap. The default is None (no swap). These options must be combined with `--blockwise_fused_optimizer`. + +`--cpu_offload_checkpointing` is to offload the gradient checkpointing to CPU. This reduces about 2GB of VRAM usage. + +All these options are experimental and may change in the future. + +The increasing the number of blocks to swap may reduce the memory usage, but the training speed will be slower. `--cpu_offload_checkpointing` also slows down the training. + +Swap 6 double blocks and use cpu offload checkpointing may be a good starting point. Please try different settings according to VRAM usage and training speed. + +The learning rate and the number of epochs are not optimized yet. Please adjust them according to the training results. + ### Merge LoRA to FLUX.1 checkpoint `networks/flux_merge_lora.py` merges LoRA to FLUX.1 checkpoint. __The script is experimental.__ @@ -298,7 +346,7 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - Fused optimizer is available for SDXL training. PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) Thanks to 2kpr! - The memory usage during training is significantly reduced by integrating the optimizer's backward pass with step. The training results are the same as before, but if you have plenty of memory, the speed will be slower. - - Specify the `--fused_backward_pass` option in `sdxl_train.py`. At this time, only AdaFactor is supported. Gradient accumulation is not available. + - Specify the `--fused_backward_pass` option in `sdxl_train.py`. At this time, only Adafactor is supported. Gradient accumulation is not available. - Setting mixed precision to `no` seems to use less memory than `fp16` or `bf16`. - Training is possible with a memory usage of about 17GB with a batch size of 1 and fp32. If you specify the `--full_bf16` option, you can further reduce the memory usage (but the accuracy will be lower). 
With the same memory usage as before, you can increase the batch size. - PyTorch 2.1 or later is required because it uses the new API `Tensor.register_post_accumulate_grad_hook(hook)`. @@ -308,7 +356,7 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - Memory usage is reduced by the same principle as Fused optimizer. The training results and speed are the same as Fused optimizer. - Specify the number of groups like `--fused_optimizer_groups 10` in `sdxl_train.py`. Increasing the number of groups reduces memory usage but slows down training. Since the effect is limited to a certain number, it is recommended to specify 4-10. - Any optimizer can be used, but optimizers that automatically calculate the learning rate (such as D-Adaptation and Prodigy) cannot be used. Gradient accumulation is not available. - - `--fused_optimizer_groups` cannot be used with `--fused_backward_pass`. When using AdaFactor, the memory usage is slightly larger than with Fused optimizer. PyTorch 2.1 or later is required. + - `--fused_optimizer_groups` cannot be used with `--fused_backward_pass`. When using Adafactor, the memory usage is slightly larger than with Fused optimizer. PyTorch 2.1 or later is required. - Mechanism: While Fused optimizer performs backward/step for individual parameters within the optimizer, optimizer groups reduce memory usage by grouping parameters and creating multiple optimizers to perform backward/step for each group. Fused optimizer requires implementation on the optimizer side, while optimizer groups are implemented only on the training script side. - LoRA+ is supported. PR [#1233](https://github.com/kohya-ss/sd-scripts/pull/1233) Thanks to rockerBOO! @@ -361,7 +409,7 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! - SDXL の学習時に Fused optimizer が使えるようになりました。PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) 2kpr 氏に感謝します。 - optimizer の backward pass に step を統合することで学習時のメモリ使用量を大きく削減します。学習結果は未適用時と同一ですが、メモリが潤沢にある場合は速度は遅くなります。 - - `sdxl_train.py` に `--fused_backward_pass` オプションを指定してください。現時点では optimizer は AdaFactor のみ対応しています。また gradient accumulation は使えません。 + - `sdxl_train.py` に `--fused_backward_pass` オプションを指定してください。現時点では optimizer は Adafactor のみ対応しています。また gradient accumulation は使えません。 - mixed precision は `no` のほうが `fp16` や `bf16` よりも使用メモリ量が少ないようです。 - バッチサイズ 1、fp32 で 17GB 程度で学習可能なようです。`--full_bf16` オプションを指定するとさらに削減できます(精度は劣ります)。以前と同じメモリ使用量ではバッチサイズを増やせます。 - PyTorch 2.1 以降の新 API `Tensor.register_post_accumulate_grad_hook(hook)` を使用しているため、PyTorch 2.1 以降が必要です。 diff --git a/flux_train.py b/flux_train.py index d2a9b3f32..ecb3c7dda 100644 --- a/flux_train.py +++ b/flux_train.py @@ -1,5 +1,15 @@ # training with captions +# Swap blocks between CPU and GPU: +# This implementation is inspired by and based on the work of 2kpr. +# Many thanks to 2kpr for the original concept and implementation of memory-efficient offloading. +# The original idea has been adapted and extended to fit the current project's needs. 
+ +# Key features: +# - CPU offloading during forward and backward passes +# - Use of fused optimizer and grad_hook for efficient gradient processing +# - Per-block fused optimizer instances + import argparse import copy import math @@ -54,6 +64,12 @@ def train(args): ) args.cache_text_encoder_outputs = True + if args.cpu_offload_checkpointing and not args.gradient_checkpointing: + logger.warning( + "cpu_offload_checkpointing is enabled, so gradient_checkpointing is also enabled / cpu_offload_checkpointingが有効になっているため、gradient_checkpointingも有効になります" + ) + args.gradient_checkpointing = True + cache_latents = args.cache_latents use_dreambooth_method = args.in_json is None @@ -232,16 +248,25 @@ def train(args): # now we can delete Text Encoders to free memory clip_l = None t5xxl = None + clean_memory_on_device(accelerator.device) # load FLUX # if we load to cpu, flux.to(fp8) takes a long time flux = flux_utils.load_flow_model(name, args.pretrained_model_name_or_path, weight_dtype, "cpu") if args.gradient_checkpointing: - flux.enable_gradient_checkpointing() + flux.enable_gradient_checkpointing(args.cpu_offload_checkpointing) flux.requires_grad_(True) + if args.double_blocks_to_swap is not None or args.single_blocks_to_swap is not None: + # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. + # This idea is based on 2kpr's great work. Thank you! + logger.info( + f"enable block swap: double_blocks_to_swap={args.double_blocks_to_swap}, single_blocks_to_swap={args.single_blocks_to_swap}" + ) + flux.enable_block_swap(args.double_blocks_to_swap, args.single_blocks_to_swap) + if not cache_latents: # load VAE here if not cached ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu") @@ -265,40 +290,43 @@ def train(args): # 学習に必要なクラスを準備する accelerator.print("prepare optimizer, data loader etc.") - if args.fused_optimizer_groups: + if args.blockwise_fused_optimizers: # fused backward pass: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html - # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each group of parameters. + # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each block of parameters. # This balances memory usage and management complexity. - # calculate total number of parameters - n_total_params = sum(len(params["params"]) for params in params_to_optimize) - params_per_group = math.ceil(n_total_params / args.fused_optimizer_groups) - - # split params into groups, keeping the learning rate the same for all params in a group - # this will increase the number of groups if the learning rate is different for different params (e.g. U-Net and text encoders) + # split params into groups. 
currently different learning rates are not supported grouped_params = [] - param_group = [] - param_group_lr = -1 + param_group = {} for group in params_to_optimize: - lr = group["lr"] - for p in group["params"]: - # if the learning rate is different for different params, start a new group - if lr != param_group_lr: - if param_group: - grouped_params.append({"params": param_group, "lr": param_group_lr}) - param_group = [] - param_group_lr = lr - - param_group.append(p) - - # if the group has enough parameters, start a new group - if len(param_group) == params_per_group: - grouped_params.append({"params": param_group, "lr": param_group_lr}) - param_group = [] - param_group_lr = -1 - - if param_group: - grouped_params.append({"params": param_group, "lr": param_group_lr}) + named_parameters = list(flux.named_parameters()) + assert len(named_parameters) == len(group["params"]), "number of parameters does not match" + for p, np in zip(group["params"], named_parameters): + # determine target layer and block index for each parameter + block_type = "other" # double, single or other + if np[0].startswith("double_blocks"): + block_idx = int(np[0].split(".")[1]) + block_type = "double" + elif np[0].startswith("single_blocks"): + block_idx = int(np[0].split(".")[1]) + block_type = "single" + else: + block_idx = -1 + + param_group_key = (block_type, block_idx) + if param_group_key not in param_group: + param_group[param_group_key] = [] + param_group[param_group_key].append(p) + + block_types_and_indices = [] + for param_group_key, param_group in param_group.items(): + block_types_and_indices.append(param_group_key) + grouped_params.append({"params": param_group, "lr": args.learning_rate}) + + num_params = 0 + for p in param_group: + num_params += p.numel() + accelerator.print(f"block {param_group_key}: {num_params} parameters") # prepare optimizers for each group optimizers = [] @@ -307,7 +335,7 @@ def train(args): optimizers.append(optimizer) optimizer = optimizers[0] # avoid error in the following code - logger.info(f"using {len(optimizers)} optimizers for fused optimizer groups") + logger.info(f"using {len(optimizers)} optimizers for blockwise fused optimizers") else: _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) @@ -341,7 +369,7 @@ def train(args): train_dataset_group.set_max_train_steps(args.max_train_steps) # lr schedulerを用意する - if args.fused_optimizer_groups: + if args.blockwise_fused_optimizers: # prepare lr schedulers for each optimizer lr_schedulers = [train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) for optimizer in optimizers] lr_scheduler = lr_schedulers[0] # avoid error in the following code @@ -414,7 +442,7 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): parameter.register_post_accumulate_grad_hook(__grad_hook) - elif args.fused_optimizer_groups: + elif args.blockwise_fused_optimizers: # prepare for additional optimizers and lr schedulers for i in range(1, len(optimizers)): optimizers[i] = accelerator.prepare(optimizers[i]) @@ -429,22 +457,46 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): num_parameters_per_group = [0] * len(optimizers) parameter_optimizer_map = {} + double_blocks_to_swap = args.double_blocks_to_swap + single_blocks_to_swap = args.single_blocks_to_swap + num_double_blocks = len(flux.double_blocks) + num_single_blocks = len(flux.single_blocks) + for opt_idx, optimizer in enumerate(optimizers): for param_group in optimizer.param_groups: for parameter in 
param_group["params"]: if parameter.requires_grad: - - def optimizer_hook(parameter: torch.Tensor): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(parameter, args.max_grad_norm) - - i = parameter_optimizer_map[parameter] - optimizer_hooked_count[i] += 1 - if optimizer_hooked_count[i] == num_parameters_per_group[i]: - optimizers[i].step() - optimizers[i].zero_grad(set_to_none=True) - - parameter.register_post_accumulate_grad_hook(optimizer_hook) + block_type, block_idx = block_types_and_indices[opt_idx] + + def create_optimizer_hook(btype, bidx): + def optimizer_hook(parameter: torch.Tensor): + # print(f"optimizer_hook: {btype}, {bidx}") + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(parameter, args.max_grad_norm) + + i = parameter_optimizer_map[parameter] + optimizer_hooked_count[i] += 1 + if optimizer_hooked_count[i] == num_parameters_per_group[i]: + optimizers[i].step() + optimizers[i].zero_grad(set_to_none=True) + + # swap blocks if necessary + if btype == "double" and double_blocks_to_swap: + if bidx >= num_double_blocks - double_blocks_to_swap: + bidx_cuda = double_blocks_to_swap - (num_double_blocks - bidx) + flux.double_blocks[bidx].to("cpu") + flux.double_blocks[bidx_cuda].to(accelerator.device) + # print(f"Move double block {bidx} to cpu and {bidx_cuda} to device") + elif btype == "single" and single_blocks_to_swap: + if bidx >= num_single_blocks - single_blocks_to_swap: + bidx_cuda = single_blocks_to_swap - (num_single_blocks - bidx) + flux.single_blocks[bidx].to("cpu") + flux.single_blocks[bidx_cuda].to(accelerator.device) + # print(f"Move single block {bidx} to cpu and {bidx_cuda} to device") + + return optimizer_hook + + parameter.register_post_accumulate_grad_hook(create_optimizer_hook(block_type, block_idx)) parameter_optimizer_map[parameter] = opt_idx num_parameters_per_group[opt_idx] += 1 @@ -487,6 +539,9 @@ def optimizer_hook(parameter: torch.Tensor): init_kwargs=init_kwargs, ) + if args.double_blocks_to_swap is not None or args.single_blocks_to_swap is not None: + flux.prepare_block_swap_before_forward() + # For --sample_at_first flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs) @@ -502,7 +557,7 @@ def optimizer_hook(parameter: torch.Tensor): for step, batch in enumerate(train_dataloader): current_step.value = global_step - if args.fused_optimizer_groups: + if args.blockwise_fused_optimizers: optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} # reset counter for each step with accelerator.accumulate(*training_models): @@ -591,7 +646,7 @@ def optimizer_hook(parameter: torch.Tensor): # backward accelerator.backward(loss) - if not (args.fused_backward_pass or args.fused_optimizer_groups): + if not (args.fused_backward_pass or args.blockwise_fused_optimizers): if accelerator.sync_gradients and args.max_grad_norm != 0.0: params_to_clip = [] for m in training_models: @@ -604,7 +659,7 @@ def optimizer_hook(parameter: torch.Tensor): else: # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook lr_scheduler.step() - if args.fused_optimizer_groups: + if args.blockwise_fused_optimizers: for i in range(1, len(optimizers)): lr_schedulers[i].step() @@ -614,7 +669,7 @@ def optimizer_hook(parameter: torch.Tensor): global_step += 1 flux_train_utils.sample_images( - accelerator, args, epoch, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs + accelerator, args, None, global_step, 
flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs ) # 指定ステップごとにモデルを保存 @@ -673,8 +728,6 @@ def optimizer_hook(parameter: torch.Tensor): is_main_process = accelerator.is_main_process # if is_main_process: flux = accelerator.unwrap_model(flux) - clip_l = accelerator.unwrap_model(clip_l) - t5xxl = accelerator.unwrap_model(t5xxl) accelerator.end_training() @@ -707,13 +760,43 @@ def setup_parser() -> argparse.ArgumentParser: "--fused_optimizer_groups", type=int, default=None, - help="number of optimizers for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizer数", + help="**this option is not working** will be removed in the future / このオプションは動作しません。将来削除されます", + ) + parser.add_argument( + "--blockwise_fused_optimizers", + action="store_true", + help="enable blockwise optimizers for fused backward pass and optimizer step / fused backward passとoptimizer step のためブロック単位のoptimizerを有効にする", ) parser.add_argument( "--skip_latents_validity_check", action="store_true", help="skip latents validity check / latentsの正当性チェックをスキップする", ) + parser.add_argument( + "--double_blocks_to_swap", + type=int, + default=None, + help="[EXPERIMENTAL] " + "Sets the number of 'double_blocks' (~640MB) to swap during the forward and backward passes." + "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." + " / 順伝播および逆伝播中にスワップする'変換ブロック'(約640MB)の数を設定します。" + "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", + ) + parser.add_argument( + "--single_blocks_to_swap", + type=int, + default=None, + help="[EXPERIMENTAL] " + "Sets the number of 'single_blocks' (~320MB) to swap during the forward and backward passes." + "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." 
+ " / 順伝播および逆伝播中にスワップする'変換ブロック'(約320MB)の数を設定します。" + "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", + ) + parser.add_argument( + "--cpu_offload_checkpointing", + action="store_true", + help="[EXPERIMENTAL] enable offloading of tensors to CPU during checkpointing / チェックポイント時にテンソルをCPUにオフロードする", + ) return parser diff --git a/library/flux_models.py b/library/flux_models.py index ed0bc8c7d..3f44068f9 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -4,6 +4,11 @@ from dataclasses import dataclass import math +from typing import Optional + +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() import torch from einops import rearrange @@ -466,6 +471,33 @@ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tenso # region layers + + +# for cpu_offload_checkpointing + + +def to_cuda(x): + if isinstance(x, torch.Tensor): + return x.cuda() + elif isinstance(x, (list, tuple)): + return [to_cuda(elem) for elem in x] + elif isinstance(x, dict): + return {k: to_cuda(v) for k, v in x.items()} + else: + return x + + +def to_cpu(x): + if isinstance(x, torch.Tensor): + return x.cpu() + elif isinstance(x, (list, tuple)): + return [to_cpu(elem) for elem in x] + elif isinstance(x, dict): + return {k: to_cpu(v) for k, v in x.items()} + else: + return x + + class EmbedND(nn.Module): def __init__(self, dim: int, theta: int, axes_dim: list[int]): super().__init__() @@ -648,16 +680,15 @@ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: ) self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False - def enable_gradient_checkpointing(self): + def enable_gradient_checkpointing(self, cpu_offload: bool = False): self.gradient_checkpointing = True - # self.img_attn.enable_gradient_checkpointing() - # self.txt_attn.enable_gradient_checkpointing() + self.cpu_offload_checkpointing = cpu_offload def disable_gradient_checkpointing(self): self.gradient_checkpointing = False - # self.img_attn.disable_gradient_checkpointing() - # self.txt_attn.disable_gradient_checkpointing() + self.cpu_offload_checkpointing = False def _forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]: img_mod1, img_mod2 = self.img_mod(vec) @@ -694,11 +725,24 @@ def _forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[T txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) return img, txt - def forward(self, *args, **kwargs): + def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]: if self.training and self.gradient_checkpointing: - return checkpoint(self._forward, *args, use_reentrant=False, **kwargs) + if not self.cpu_offload_checkpointing: + return checkpoint(self._forward, img, txt, vec, pe, use_reentrant=False) + # cpu offload checkpointing + + def create_custom_forward(func): + def custom_forward(*inputs): + cuda_inputs = to_cuda(inputs) + outputs = func(*cuda_inputs) + return to_cpu(outputs) + + return custom_forward + + return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), img, txt, vec, pe) + else: - return self._forward(*args, **kwargs) + return self._forward(img, txt, vec, pe) # def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor): # if self.training and self.gradient_checkpointing: @@ -747,12 +791,15 @@ def __init__( self.modulation = Modulation(hidden_size, double=False) self.gradient_checkpointing = False + 
self.cpu_offload_checkpointing = False - def enable_gradient_checkpointing(self): + def enable_gradient_checkpointing(self, cpu_offload: bool = False): self.gradient_checkpointing = True + self.cpu_offload_checkpointing = cpu_offload def disable_gradient_checkpointing(self): self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False def _forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: mod, _ = self.modulation(vec) @@ -768,11 +815,24 @@ def _forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) return x + mod.gate * output - def forward(self, *args, **kwargs): + def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: if self.training and self.gradient_checkpointing: - return checkpoint(self._forward, *args, use_reentrant=False, **kwargs) + if not self.cpu_offload_checkpointing: + return checkpoint(self._forward, x, vec, pe, use_reentrant=False) + + # cpu offload checkpointing + + def create_custom_forward(func): + def custom_forward(*inputs): + cuda_inputs = to_cuda(inputs) + outputs = func(*cuda_inputs) + return to_cpu(outputs) + + return custom_forward + + return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), x, vec, pe) else: - return self._forward(*args, **kwargs) + return self._forward(x, vec, pe) # def forward(self, x: Tensor, vec: Tensor, pe: Tensor): # if self.training and self.gradient_checkpointing: @@ -849,6 +909,9 @@ def __init__(self, params: FluxParams): self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False + self.double_blocks_to_swap = None + self.single_blocks_to_swap = None @property def device(self): @@ -858,8 +921,9 @@ def device(self): def dtype(self): return next(self.parameters()).dtype - def enable_gradient_checkpointing(self): + def enable_gradient_checkpointing(self, cpu_offload: bool = False): self.gradient_checkpointing = True + self.cpu_offload_checkpointing = cpu_offload self.time_in.enable_gradient_checkpointing() self.vector_in.enable_gradient_checkpointing() @@ -867,12 +931,13 @@ def enable_gradient_checkpointing(self): self.guidance_in.enable_gradient_checkpointing() for block in self.double_blocks + self.single_blocks: - block.enable_gradient_checkpointing() + block.enable_gradient_checkpointing(cpu_offload=cpu_offload) - print("FLUX: Gradient checkpointing enabled.") + print(f"FLUX: Gradient checkpointing enabled. 
CPU offload: {cpu_offload}") def disable_gradient_checkpointing(self): self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False self.time_in.disable_gradient_checkpointing() self.vector_in.disable_gradient_checkpointing() @@ -884,6 +949,24 @@ def disable_gradient_checkpointing(self): print("FLUX: Gradient checkpointing disabled.") + def enable_block_swap(self, double_blocks: Optional[int], single_blocks: Optional[int]): + self.double_blocks_to_swap = double_blocks + self.single_blocks_to_swap = single_blocks + + def prepare_block_swap_before_forward(self): + # move last n blocks to cpu: they are on cuda + if self.double_blocks_to_swap: + for i in range(len(self.double_blocks) - self.double_blocks_to_swap): + self.double_blocks[i].to(self.device) + for i in range(len(self.double_blocks) - self.double_blocks_to_swap, len(self.double_blocks)): + self.double_blocks[i].to("cpu") # , non_blocking=True) + if self.single_blocks_to_swap: + for i in range(len(self.single_blocks) - self.single_blocks_to_swap): + self.single_blocks[i].to(self.device) + for i in range(len(self.single_blocks) - self.single_blocks_to_swap, len(self.single_blocks)): + self.single_blocks[i].to("cpu") # , non_blocking=True) + clean_memory_on_device(self.device) + def forward( self, img: Tensor, @@ -910,14 +993,75 @@ def forward( ids = torch.cat((txt_ids, img_ids), dim=1) pe = self.pe_embedder(ids) - for block in self.double_blocks: - img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + if not self.double_blocks_to_swap: + for block in self.double_blocks: + img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + else: + # make sure first n blocks are on cuda, and last n blocks are on cpu at beginning + for block_idx in range(self.double_blocks_to_swap): + block = self.double_blocks[len(self.double_blocks) - self.double_blocks_to_swap + block_idx] + if block.parameters().__next__().device.type != "cpu": + block.to("cpu") # , non_blocking=True) + # print(f"Moved double block {len(self.double_blocks) - self.double_blocks_to_swap + block_idx} to cpu.") + + block = self.double_blocks[block_idx] + if block.parameters().__next__().device.type == "cpu": + block.to(self.device) + # print(f"Moved double block {block_idx} to cuda.") + + to_cpu_block_index = 0 + for block_idx, block in enumerate(self.double_blocks): + # move last n blocks to cuda: they are on cpu, and move first n blocks to cpu: they are on cuda + moving = block_idx >= len(self.double_blocks) - self.double_blocks_to_swap + if moving: + block.to(self.device) # move to cuda + # print(f"Moved double block {block_idx} to cuda.") + + img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + + if moving: + self.double_blocks[to_cpu_block_index].to("cpu") # , non_blocking=True) + # print(f"Moved double block {to_cpu_block_index} to cpu.") + to_cpu_block_index += 1 img = torch.cat((txt, img), 1) - for block in self.single_blocks: - img = block(img, vec=vec, pe=pe) + + if not self.single_blocks_to_swap: + for block in self.single_blocks: + img = block(img, vec=vec, pe=pe) + else: + # make sure first n blocks are on cuda, and last n blocks are on cpu at beginning + for block_idx in range(self.single_blocks_to_swap): + block = self.single_blocks[len(self.single_blocks) - self.single_blocks_to_swap + block_idx] + if block.parameters().__next__().device.type != "cpu": + block.to("cpu") # , non_blocking=True) + # print(f"Moved single block {len(self.single_blocks) - self.single_blocks_to_swap + block_idx} to cpu.") + + block = self.single_blocks[block_idx] + if 
block.parameters().__next__().device.type == "cpu": + block.to(self.device) + # print(f"Moved single block {block_idx} to cuda.") + + to_cpu_block_index = 0 + for block_idx, block in enumerate(self.single_blocks): + # move last n blocks to cuda: they are on cpu, and move first n blocks to cpu: they are on cuda + moving = block_idx >= len(self.single_blocks) - self.single_blocks_to_swap + if moving: + block.to(self.device) # move to cuda + # print(f"Moved single block {block_idx} to cuda.") + + img = block(img, vec=vec, pe=pe) + + if moving: + self.single_blocks[to_cpu_block_index].to("cpu") # , non_blocking=True) + # print(f"Moved single block {to_cpu_block_index} to cpu.") + img = img[:, txt.shape[1] :, ...] + if self.training and self.cpu_offload_checkpointing: + img = img.to(self.device) + vec = vec.to(self.device) + img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) return img From a45048892802dce43e86a7e377ba84e89b51fdf5 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 18 Aug 2024 16:56:50 +0900 Subject: [PATCH 084/582] update readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 521e82e86..df2a612d7 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,8 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` - Aug 18, 2024: -Memory-efficient training based on 2kpr's implementation is implemented in `flux_train.py`. Thanks to 2kpr. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. - +Memory-efficient training based on 2kpr's implementation is implemented in `flux_train.py`. Thanks to 2kpr! See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. Aug 17, 2024: Added a script `flux_train.py` to train FLUX.1. The script is experimental and not an optimized version. It needs >28GB VRAM for training. @@ -118,6 +116,8 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_t (Combine the command into one line.) +Sample image generation during training is not tested yet. + Options are almost the same as LoRA training. The difference is `--blockwise_fused_optimizer`, `--double_blocks_to_swap` and `--cpu_offload_checkpointing`. `--single_blocks_to_swap` is also available. `--blockwise_fused_optimizer` enables the fusing of the optimizer for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency. `--fused_optimizer_groups` is deprecated due to the addition of this option for FLUX.1 training. From d034032a5dff4a5ee1a108e4f1cec41d8efadab0 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 19 Aug 2024 13:08:49 +0900 Subject: [PATCH 085/582] update README fix option name --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index df2a612d7..9a603b281 100644 --- a/README.md +++ b/README.md @@ -105,24 +105,24 @@ Sample command for FLUX.1 fine-tuning is below. 
This will work with 24GB VRAM GP ``` accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train.py --pretrained_model_name_or_path flux1-dev.sft --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --ae ae_dev.sft ---mixed_precision bf16 --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 +--save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 ---dataset_config dataset_1024_bs1.toml --output_dir path/to/output/dir --output_name test-bf16 +--dataset_config dataset_1024_bs1.toml --output_dir path/to/output/dir --output_name output-name --learning_rate 5e-5 --max_train_epochs 4 --sdpa --highvram --cache_text_encoder_outputs_to_disk --cache_latents_to_disk --save_every_n_epochs 1 --optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0 ---blockwise_fused_optimizer --double_blocks_to_swap 6 --cpu_offload_checkpointing +--blockwise_fused_optimizers --double_blocks_to_swap 6 --cpu_offload_checkpointing ``` (Combine the command into one line.) Sample image generation during training is not tested yet. -Options are almost the same as LoRA training. The difference is `--blockwise_fused_optimizer`, `--double_blocks_to_swap` and `--cpu_offload_checkpointing`. `--single_blocks_to_swap` is also available. +Options are almost the same as LoRA training. The difference is `--blockwise_fused_optimizers`, `--double_blocks_to_swap` and `--cpu_offload_checkpointing`. `--single_blocks_to_swap` is also available. -`--blockwise_fused_optimizer` enables the fusing of the optimizer for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency. `--fused_optimizer_groups` is deprecated due to the addition of this option for FLUX.1 training. +`--blockwise_fused_optimizers` enables the fusing of the optimizer for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency. `--fused_optimizer_groups` is deprecated due to the addition of this option for FLUX.1 training. -`--double_blocks_to_swap` and `--single_blocks_to_swap` are the number of double blocks and single blocks to swap. The default is None (no swap). These options must be combined with `--blockwise_fused_optimizer`. +`--double_blocks_to_swap` and `--single_blocks_to_swap` are the number of double blocks and single blocks to swap. The default is None (no swap). These options must be combined with `--blockwise_fused_optimizers`. `--cpu_offload_checkpointing` is to offload the gradient checkpointing to CPU. This reduces about 2GB of VRAM usage. 
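The mechanism behind `--blockwise_fused_optimizers` and block swapping is easier to see outside the training script. The sketch below is an illustrative toy example, not the actual `flux_train.py` code: it uses `Tensor.register_post_accumulate_grad_hook` (PyTorch 2.1+) to step a per-block optimizer as soon as the last gradient of that block has been accumulated, which is also the point where the real script moves a finished block to CPU and brings the next swapped block onto the GPU. The block names, the toy model and the SGD optimizer here are placeholders.

```python
import torch
import torch.nn as nn

# Toy "model" with two blocks standing in for FLUX double/single blocks.
model = nn.ModuleDict({
    "block0": nn.Linear(16, 16),
    "block1": nn.Linear(16, 16),
})

# One optimizer per block (the real script recommends Adafactor; SGD keeps the sketch dependency-free).
optimizers = {name: torch.optim.SGD(block.parameters(), lr=1e-4) for name, block in model.items()}

# Count parameters per block so we know when the last gradient of a block has arrived.
params_per_block = {name: sum(1 for _ in block.parameters()) for name, block in model.items()}
hooked_count = {name: 0 for name in model}

def make_hook(block_name):
    def hook(param: torch.Tensor):
        # Called right after the gradient of `param` has been accumulated.
        hooked_count[block_name] += 1
        if hooked_count[block_name] == params_per_block[block_name]:
            # All gradients of this block are ready: step and free them immediately,
            # so only one block's worth of gradient/optimizer work peaks in memory.
            optimizers[block_name].step()
            optimizers[block_name].zero_grad(set_to_none=True)
            hooked_count[block_name] = 0
            # With --double_blocks_to_swap / --single_blocks_to_swap, a finished block
            # can additionally be moved to CPU here and the next swapped block moved to GPU.
    return hook

for name, block in model.items():
    for p in block.parameters():
        p.register_post_accumulate_grad_hook(make_hook(name))

# Usage: the optimizer steps happen inside the hooks during backward; there is no separate optimizer.step().
x = torch.randn(4, 16)
loss = model["block1"](model["block0"](x)).pow(2).mean()
loss.backward()
```

Because the step is fused into the backward pass, the training loop itself only calls `backward()`, which is why memory for gradients and optimizer work never accumulates across all blocks at once.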
From 6e72a799c8f55f148a248693d2c0c3fb1912b04e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 19 Aug 2024 21:55:28 +0900 Subject: [PATCH 086/582] reduce peak VRAM usage by excluding some blocks to cuda --- flux_train.py | 15 +++++++++------ library/flux_models.py | 16 ++++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/flux_train.py b/flux_train.py index ecb3c7dda..b294ce42a 100644 --- a/flux_train.py +++ b/flux_train.py @@ -251,7 +251,6 @@ def train(args): clean_memory_on_device(accelerator.device) # load FLUX - # if we load to cpu, flux.to(fp8) takes a long time flux = flux_utils.load_flow_model(name, args.pretrained_model_name_or_path, weight_dtype, "cpu") if args.gradient_checkpointing: @@ -259,7 +258,8 @@ def train(args): flux.requires_grad_(True) - if args.double_blocks_to_swap is not None or args.single_blocks_to_swap is not None: + is_swapping_blocks = args.double_blocks_to_swap is not None or args.single_blocks_to_swap is not None + if is_swapping_blocks: # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. # This idea is based on 2kpr's great work. Thank you! logger.info( @@ -412,8 +412,11 @@ def train(args): training_models = [ds_model] else: - # acceleratorがなんかよろしくやってくれるらしい - flux = accelerator.prepare(flux) + # accelerator does some magic + # if we doesn't swap blocks, we can move the model to device + flux = accelerator.prepare(flux, device_placement=[not is_swapping_blocks]) + if is_swapping_blocks: + flux.move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする @@ -539,7 +542,7 @@ def optimizer_hook(parameter: torch.Tensor): init_kwargs=init_kwargs, ) - if args.double_blocks_to_swap is not None or args.single_blocks_to_swap is not None: + if is_swapping_blocks: flux.prepare_block_swap_before_forward() # For --sample_at_first @@ -595,7 +598,7 @@ def optimizer_hook(parameter: torch.Tensor): # get noisy model input and timesteps noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps( - args, noise_scheduler, latents, noise, accelerator.device, weight_dtype + args, noise_scheduler_copy, latents, noise, accelerator.device, weight_dtype ) # pack latents and get img_ids diff --git a/library/flux_models.py b/library/flux_models.py index 3f44068f9..11ef647ad 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -953,6 +953,22 @@ def enable_block_swap(self, double_blocks: Optional[int], single_blocks: Optiona self.double_blocks_to_swap = double_blocks self.single_blocks_to_swap = single_blocks + def move_to_device_except_swap_blocks(self, device: torch.device): + # assume model is on cpu + if self.double_blocks_to_swap: + save_double_blocks = self.double_blocks + self.double_blocks = None + if self.single_blocks_to_swap: + save_single_blocks = self.single_blocks + self.single_blocks = None + + self.to(device) + + if self.double_blocks_to_swap: + self.double_blocks = save_double_blocks + if self.single_blocks_to_swap: + self.single_blocks = save_single_blocks + def prepare_block_swap_before_forward(self): # move last n blocks to cpu: they are on cuda if self.double_blocks_to_swap: From 486fe8f70a53166f21f08b1c896bd9ba1e31d7e7 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 19 Aug 2024 22:30:24 +0900 Subject: [PATCH 087/582] feat: reduce memory usage and add memory efficient 
option for model saving --- README.md | 5 +++ flux_train.py | 6 +++ library/flux_train_utils.py | 21 ++++++++--- library/utils.py | 75 ++++++++++++++++++++++++++++++++++++- 4 files changed, 100 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 9a603b281..51e4635bb 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,11 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 19, 2024: +In `flux_train.py`, the memory consumption during model saving is reduced when `--save_precision` is set to the same value as `--mixed_precision` (about 22GB). Please set the same value unless there is a reason. + +An experimental option `--mem_eff_save` is also added. When specified, it can further reduce memory consumption (about 22GB), but since it is a custom implementation, unexpected problems may occur. We do not recommend using it unless you are familiar with the code. + Aug 18, 2024: Memory-efficient training based on 2kpr's implementation is implemented in `flux_train.py`. Thanks to 2kpr! See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. diff --git a/flux_train.py b/flux_train.py index b294ce42a..669963856 100644 --- a/flux_train.py +++ b/flux_train.py @@ -759,6 +759,12 @@ def setup_parser() -> argparse.ArgumentParser: add_custom_train_arguments(parser) # TODO remove this from here flux_train_utils.add_flux_train_arguments(parser) + parser.add_argument( + "--mem_eff_save", + action="store_true", + help="[EXPERIMENTAL] use memory efficient custom model saving method / メモリ効率の良い独自のモデル保存方法を使う", + ) + parser.add_argument( "--fused_optimizer_groups", type=int, diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 167d61c7e..3f9e8660f 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -20,7 +20,7 @@ init_ipex() -from .utils import setup_logging +from .utils import setup_logging, mem_eff_save_file setup_logging() import logging @@ -409,19 +409,28 @@ def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas): return model_pred, weighting -def save_models(ckpt_path: str, flux: flux_models.Flux, sai_metadata: Optional[dict], save_dtype: Optional[torch.dtype] = None): +def save_models( + ckpt_path: str, + flux: flux_models.Flux, + sai_metadata: Optional[dict], + save_dtype: Optional[torch.dtype] = None, + use_mem_eff_save: bool = False, +): state_dict = {} def update_sd(prefix, sd): for k, v in sd.items(): key = prefix + k - if save_dtype is not None: + if save_dtype is not None and v.dtype != save_dtype: v = v.detach().clone().to("cpu").to(save_dtype) state_dict[key] = v update_sd("", flux.state_dict()) - save_file(state_dict, ckpt_path, metadata=sai_metadata) + if not use_mem_eff_save: + save_file(state_dict, ckpt_path, metadata=sai_metadata) + else: + mem_eff_save_file(state_dict, ckpt_path, metadata=sai_metadata) def save_flux_model_on_train_end( @@ -429,7 +438,7 @@ def save_flux_model_on_train_end( ): def sd_saver(ckpt_file, epoch_no, global_step): sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev") - save_models(ckpt_file, flux, sai_metadata, save_dtype) + save_models(ckpt_file, flux, sai_metadata, save_dtype, args.mem_eff_save) train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None) @@ -448,7 +457,7 @@ def 
save_flux_model_on_epoch_end_or_stepwise( ): def sd_saver(ckpt_file, epoch_no, global_step): sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev") - save_models(ckpt_file, flux, sai_metadata, save_dtype) + save_models(ckpt_file, flux, sai_metadata, save_dtype, args.mem_eff_save) train_util.save_sd_model_on_epoch_end_or_stepwise_common( args, diff --git a/library/utils.py b/library/utils.py index 3037c055d..7de22d5a9 100644 --- a/library/utils.py +++ b/library/utils.py @@ -1,9 +1,12 @@ import logging import sys import threading +from typing import * +import json +import struct + import torch from torchvision import transforms -from typing import * from diffusers import EulerAncestralDiscreteScheduler import diffusers.schedulers.scheduling_euler_ancestral_discrete from diffusers.schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteSchedulerOutput @@ -79,6 +82,76 @@ def setup_logging(args=None, log_level=None, reset=False): logger.info(msg_init) +def mem_eff_save_file(tensors: Dict[str, torch.Tensor], filename: str, metadata: Dict[str, Any] = None): + """ + memory efficient save file + """ + + _TYPES = { + torch.float64: "F64", + torch.float32: "F32", + torch.float16: "F16", + torch.bfloat16: "BF16", + torch.int64: "I64", + torch.int32: "I32", + torch.int16: "I16", + torch.int8: "I8", + torch.uint8: "U8", + torch.bool: "BOOL", + getattr(torch, "float8_e5m2", None): "F8_E5M2", + getattr(torch, "float8_e4m3fn", None): "F8_E4M3", + } + _ALIGN = 256 + + def validate_metadata(metadata: Dict[str, Any]) -> Dict[str, str]: + validated = {} + for key, value in metadata.items(): + if not isinstance(key, str): + raise ValueError(f"Metadata key must be a string, got {type(key)}") + if not isinstance(value, str): + print(f"Warning: Metadata value for key '{key}' is not a string. 
Converting to string.") + validated[key] = str(value) + else: + validated[key] = value + return validated + + print(f"Using memory efficient save file: {filename}") + + header = {} + offset = 0 + if metadata: + header["__metadata__"] = validate_metadata(metadata) + for k, v in tensors.items(): + if v.numel() == 0: # empty tensor + header[k] = {"dtype": _TYPES[v.dtype], "shape": list(v.shape), "data_offsets": [offset, offset]} + else: + size = v.numel() * v.element_size() + header[k] = {"dtype": _TYPES[v.dtype], "shape": list(v.shape), "data_offsets": [offset, offset + size]} + offset += size + + hjson = json.dumps(header).encode("utf-8") + hjson += b" " * (-(len(hjson) + 8) % _ALIGN) + + with open(filename, "wb") as f: + f.write(struct.pack(" Date: Tue, 20 Aug 2024 08:19:00 +0900 Subject: [PATCH 088/582] Fix debug_dataset to work --- train_network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train_network.py b/train_network.py index 086b314a5..cab0ec52e 100644 --- a/train_network.py +++ b/train_network.py @@ -313,6 +313,7 @@ def train(self, args): collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) if args.debug_dataset: + train_dataset_group.set_current_strategies() # dasaset needs to know the strategies explicitly train_util.debug_dataset(train_dataset_group) return if len(train_dataset_group) == 0: From c62c95e8626bdb727cedc8f037c82ab3a8e66059 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 20 Aug 2024 08:21:01 +0900 Subject: [PATCH 089/582] update about multi-resolution training in FLUX.1 --- README.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/README.md b/README.md index 51e4635bb..165eed341 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,13 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 20, 2024: +FLUX.1 supports multi-resolution inference, so training at multiple resolutions may be possible and the results may be improved (like 1024x1024, 768x768 and 512x512 ... you can use any resolution). + +The script seems to support multi-resolution even in the current version, __if `--cache_latents_to_disk` is not specified__. Please try if you are interested. See [FLUX.1 Multi-resolution training](#flux1-multi-resolution-training) for details. + +We will support multi-resolution caching to disk in the near future. + Aug 19, 2024: In `flux_train.py`, the memory consumption during model saving is reduced when `--save_precision` is set to the same value as `--mixed_precision` (about 22GB). Please set the same value unless there is a reason. @@ -159,6 +166,51 @@ In the case of LoRA models are trained with `bf16`, we are not sure which is bet The script can merge multiple LoRA models. If you want to merge multiple LoRA models, specify `--concat` option to work the merged LoRA model properly. +### FLUX.1 Multi-resolution training + +You can define multiple resolutions in the dataset configuration file. __Caching latents to disk is not supported yet.__ + +The dataset configuration file is like below. You can define multiple resolutions with different batch sizes. The resolutions are defined in the `[[datasets]]` section. The `[[datasets.subsets]]` section is for the dataset directory. Please specify the same directory for each resolution. 
+ +``` +[general] +# define common settings here +flip_aug = true +color_aug = false +keep_tokens_separator= "|||" +shuffle_caption = false +caption_tag_dropout_rate = 0 +caption_extension = ".txt" + +[[datasets]] +# define the first resolution here +batch_size = 2 +enable_bucket = true +resolution = [1024, 1024] + + [[datasets.subsets]] + image_dir = "path/to/image/dir" + num_repeats = 1 + +[[datasets]] +# define the second resolution here +batch_size = 3 +enable_bucket = true +resolution = [768, 768] + + [[datasets.subsets]] + image_dir = "path/to/image/dir" + num_repeats = 1 + +[[datasets]] +# define the third resolution here +batch_size = 4 +enable_bucket = true +resolution = [512, 512] + + [[datasets.subsets]] + image_dir = "path/to/image/dir" + num_repeats = 1 ``` ## SD3 training From 6f6faf9b5a99b7f741f657a06a42f63754e450c0 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 20 Aug 2024 19:16:25 +0900 Subject: [PATCH 090/582] fix to work with ai-toolkit LoRA --- networks/flux_merge_lora.py | 163 +++++++++++++++--------------------- 1 file changed, 68 insertions(+), 95 deletions(-) diff --git a/networks/flux_merge_lora.py b/networks/flux_merge_lora.py index df0ba606a..1ba1f314d 100644 --- a/networks/flux_merge_lora.py +++ b/networks/flux_merge_lora.py @@ -7,8 +7,6 @@ from safetensors.torch import load_file, save_file from tqdm import tqdm -import lora_flux as lora_flux -from library import sai_model_spec, train_util from library.utils import setup_logging setup_logging() @@ -16,6 +14,9 @@ logger = logging.getLogger(__name__) +import lora_flux as lora_flux +from library import sai_model_spec, train_util + def load_state_dict(file_name, dtype): if os.path.splitext(file_name)[1] == ".safetensors": @@ -43,13 +44,11 @@ def save_to_file(file_name, state_dict, dtype, metadata): save_file(state_dict, file_name, metadata=metadata) -def merge_to_flux_model( - loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype -): +def merge_to_flux_model(loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype): logger.info(f"loading keys from FLUX.1 model: {flux_model}") flux_state_dict = load_file(flux_model, device=loading_device) - def create_key_map(n_double_layers, n_single_layers, hidden_size): + def create_key_map(n_double_layers, n_single_layers): key_map = {} for index in range(n_double_layers): prefix_from = f"transformer_blocks.{index}" @@ -60,18 +59,12 @@ def create_key_map(n_double_layers, n_single_layers, hidden_size): qkv_img = f"{prefix_to}.img_attn.qkv.{end}" qkv_txt = f"{prefix_to}.txt_attn.qkv.{end}" - key_map[f"{k}to_q.{end}"] = (qkv_img, (0, 0, hidden_size)) - key_map[f"{k}to_k.{end}"] = (qkv_img, (0, hidden_size, hidden_size)) - key_map[f"{k}to_v.{end}"] = (qkv_img, (0, hidden_size * 2, hidden_size)) - key_map[f"{k}add_q_proj.{end}"] = (qkv_txt, (0, 0, hidden_size)) - key_map[f"{k}add_k_proj.{end}"] = ( - qkv_txt, - (0, hidden_size, hidden_size), - ) - key_map[f"{k}add_v_proj.{end}"] = ( - qkv_txt, - (0, hidden_size * 2, hidden_size), - ) + key_map[f"{k}to_q.{end}"] = qkv_img + key_map[f"{k}to_k.{end}"] = qkv_img + key_map[f"{k}to_v.{end}"] = qkv_img + key_map[f"{k}add_q_proj.{end}"] = qkv_txt + key_map[f"{k}add_k_proj.{end}"] = qkv_txt + key_map[f"{k}add_v_proj.{end}"] = qkv_txt block_map = { "attn.to_out.0.weight": "img_attn.proj.weight", @@ -106,13 +99,10 @@ def create_key_map(n_double_layers, n_single_layers, hidden_size): for end in ("weight", "bias"): k = f"{prefix_from}.attn." 
qkv = f"{prefix_to}.linear1.{end}" - key_map[f"{k}to_q.{end}"] = (qkv, (0, 0, hidden_size)) - key_map[f"{k}to_k.{end}"] = (qkv, (0, hidden_size, hidden_size)) - key_map[f"{k}to_v.{end}"] = (qkv, (0, hidden_size * 2, hidden_size)) - key_map[f"{prefix_from}.proj_mlp.{end}"] = ( - qkv, - (0, hidden_size * 3, hidden_size * 4), - ) + key_map[f"{k}to_q.{end}"] = qkv + key_map[f"{k}to_k.{end}"] = qkv + key_map[f"{k}to_v.{end}"] = qkv + key_map[f"{prefix_from}.proj_mlp.{end}"] = qkv block_map = { "norm.linear.weight": "modulation.lin.weight", @@ -126,11 +116,14 @@ def create_key_map(n_double_layers, n_single_layers, hidden_size): for k, v in block_map.items(): key_map[f"{prefix_from}.{k}"] = f"{prefix_to}.{v}" + # add as-is keys + values = list([(v if isinstance(v, str) else v[0]) for v in set(key_map.values())]) + values.sort() + key_map.update({v: v for v in values}) + return key_map - key_map = create_key_map( - 18, 1, 2048 - ) # Assuming 18 double layers, 1 single layer, and hidden size of 2048 + key_map = create_key_map(18, 38) # 18 double layers, 38 single layers def find_matching_key(flux_dict, lora_key): lora_key = lora_key.replace("diffusion_model.", "") @@ -159,7 +152,6 @@ def find_matching_key(flux_dict, lora_key): "attn.add_k_proj": "txt_attn.qkv", "attn.add_v_proj": "txt_attn.qkv", } - single_block_map = { "norm.linear": "modulation.lin", "proj_out": "linear2", @@ -168,18 +160,22 @@ def find_matching_key(flux_dict, lora_key): "attn.to_q": "linear1", "attn.to_k": "linear1", "attn.to_v": "linear1", + "proj_mlp": "linear1", } + # same key exists in both single_block_map and double_block_map, so we must care about single/double + # print("lora_key before double_block_map", lora_key) for old, new in double_block_map.items(): - lora_key = lora_key.replace(old, new) - + if "double" in lora_key: + lora_key = lora_key.replace(old, new) + # print("lora_key before single_block_map", lora_key) for old, new in single_block_map.items(): - lora_key = lora_key.replace(old, new) + if "single" in lora_key: + lora_key = lora_key.replace(old, new) + # print("lora_key after mapping", lora_key) if lora_key in key_map: flux_key = key_map[lora_key] - if isinstance(flux_key, tuple): - flux_key = flux_key[0] logger.info(f"Found matching key: {flux_key}") return flux_key @@ -198,16 +194,11 @@ def find_matching_key(flux_dict, lora_key): lora_sd, _ = load_state_dict(model, merge_dtype) logger.info("merging...") - for key in tqdm(lora_sd.keys()): + for key in lora_sd.keys(): if "lora_down" in key or "lora_A" in key: - lora_name = key[ - : key.rfind(".lora_down" if "lora_down" in key else ".lora_A") - ] + lora_name = key[: key.rfind(".lora_down" if "lora_down" in key else ".lora_A")] up_key = key.replace("lora_down", "lora_up").replace("lora_A", "lora_B") - alpha_key = ( - key[: key.index("lora_down" if "lora_down" in key else "lora_A")] - + "alpha" - ) + alpha_key = key[: key.index("lora_down" if "lora_down" in key else "lora_A")] + "alpha" logger.info(f"Processing LoRA key: {lora_name}") flux_key = find_matching_key(flux_state_dict, lora_name) @@ -231,20 +222,35 @@ def find_matching_key(flux_dict, lora_key): up_weight = up_weight.to(working_device, merge_dtype) down_weight = down_weight.to(working_device, merge_dtype) + # print(up_weight.size(), down_weight.size(), weight.size()) + if lora_name.startswith("transformer."): - if "qkv" in flux_key: - hidden_size = weight.size(-1) // 3 + if "qkv" in flux_key or "linear1" in flux_key: # combined qkv or qkv+mlp update = ratio * (up_weight @ down_weight) * scale + # 
print(update.shape) if "img_attn" in flux_key or "txt_attn" in flux_key: - q, k, v = torch.chunk(weight, 3, dim=-1) + q, k, v = torch.chunk(weight, 3, dim=0) if "to_q" in lora_name or "add_q_proj" in lora_name: q += update.reshape(q.shape) elif "to_k" in lora_name or "add_k_proj" in lora_name: k += update.reshape(k.shape) elif "to_v" in lora_name or "add_v_proj" in lora_name: v += update.reshape(v.shape) - weight = torch.cat([q, k, v], dim=-1) + weight = torch.cat([q, k, v], dim=0) + elif "linear1" in flux_key: + q, k, v = torch.chunk(weight[: int(update.shape[-1] * 3)], 3, dim=0) + mlp = weight[int(update.shape[-1] * 3) :] + # print(q.shape, k.shape, v.shape, mlp.shape) + if "to_q" in lora_name: + q += update.reshape(q.shape) + elif "to_k" in lora_name: + k += update.reshape(k.shape) + elif "to_v" in lora_name: + v += update.reshape(v.shape) + elif "proj_mlp" in lora_name: + mlp += update.reshape(mlp.shape) + weight = torch.cat([q, k, v, mlp], dim=0) else: if len(weight.size()) == 2: weight = weight + ratio * (up_weight @ down_weight) * scale @@ -252,18 +258,11 @@ def find_matching_key(flux_dict, lora_key): weight = ( weight + ratio - * ( - up_weight.squeeze(3).squeeze(2) - @ down_weight.squeeze(3).squeeze(2) - ) - .unsqueeze(2) - .unsqueeze(3) + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) * scale ) else: - conved = torch.nn.functional.conv2d( - down_weight.permute(1, 0, 2, 3), up_weight - ).permute(1, 0, 2, 3) + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) weight = weight + ratio * conved * scale else: if len(weight.size()) == 2: @@ -272,18 +271,11 @@ def find_matching_key(flux_dict, lora_key): weight = ( weight + ratio - * ( - up_weight.squeeze(3).squeeze(2) - @ down_weight.squeeze(3).squeeze(2) - ) - .unsqueeze(2) - .unsqueeze(3) + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) * scale ) else: - conved = torch.nn.functional.conv2d( - down_weight.permute(1, 0, 2, 3), up_weight - ).permute(1, 0, 2, 3) + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) weight = weight + ratio * conved * scale flux_state_dict[flux_key] = weight.to(loading_device, save_dtype) @@ -308,9 +300,7 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): if lora_metadata is not None: if base_model is None: - base_model = lora_metadata.get( - train_util.SS_METADATA_KEY_BASE_MODEL_VERSION, None - ) + base_model = lora_metadata.get(train_util.SS_METADATA_KEY_BASE_MODEL_VERSION, None) # get alpha and dim alphas = {} # alpha for current model @@ -336,9 +326,7 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): if lora_module_name not in base_alphas: base_alphas[lora_module_name] = alpha - logger.info( - f"dim: {list(set(dims.values()))}, alpha: {list(set(alphas.values()))}" - ) + logger.info(f"dim: {list(set(dims.values()))}, alpha: {list(set(alphas.values()))}") # merge logger.info("merging...") @@ -359,19 +347,14 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): alpha = alphas[lora_module_name] scale = math.sqrt(alpha / base_alpha) * ratio - scale = ( - abs(scale) if "lora_up" in key else scale - ) # マイナスの重みに対応する。 + scale = abs(scale) if "lora_up" in key else scale # マイナスの重みに対応する。 if key in merged_sd: assert ( - merged_sd[key].size() == lora_sd[key].size() - or concat_dim is not None + merged_sd[key].size() == 
lora_sd[key].size() or concat_dim is not None ), "weights shape mismatch, different dims? / 重みのサイズが合いません。dimが異なる可能性があります。" if concat_dim is not None: - merged_sd[key] = torch.cat( - [merged_sd[key], lora_sd[key] * scale], dim=concat_dim - ) + merged_sd[key] = torch.cat([merged_sd[key], lora_sd[key] * scale], dim=concat_dim) else: merged_sd[key] = merged_sd[key] + lora_sd[key] * scale else: @@ -390,9 +373,7 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): merged_sd[key_up] = merged_sd[key_up][:, perm] logger.info("merged model") - logger.info( - f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}" - ) + logger.info(f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}") # check all dims are same dims_list = list(set(base_dims.values())) @@ -411,16 +392,14 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): # build minimum metadata dims = f"{dims_list[0]}" if all_same_dims else "Dynamic" alphas = f"{alphas_list[0]}" if all_same_alphas else "Dynamic" - metadata = train_util.build_minimum_network_metadata( - str(False), base_model, "networks.lora", dims, alphas, None - ) + metadata = train_util.build_minimum_network_metadata(str(False), base_model, "networks.lora", dims, alphas, None) return merged_sd, metadata def merge(args): - assert ( - len(args.models) == len(args.ratios) + assert len(args.models) == len( + args.ratios ), "number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" def str_to_dtype(p): @@ -456,9 +435,7 @@ def str_to_dtype(p): if args.no_metadata: sai_metadata = None else: - merged_from = sai_model_spec.build_merged_from( - [args.flux_model] + args.models - ) + merged_from = sai_model_spec.build_merged_from([args.flux_model] + args.models) title = os.path.splitext(os.path.basename(args.save_to))[0] sai_metadata = sai_model_spec.build_metadata( None, @@ -477,15 +454,11 @@ def str_to_dtype(p): save_to_file(args.save_to, state_dict, save_dtype, sai_metadata) else: - state_dict, metadata = merge_lora_models( - args.models, args.ratios, merge_dtype, args.concat, args.shuffle - ) + state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype, args.concat, args.shuffle) logger.info("calculating hashes and creating metadata...") - model_hash, legacy_hash = train_util.precalculate_safetensors_hashes( - state_dict, metadata - ) + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) metadata["sshs_model_hash"] = model_hash metadata["sshs_legacy_hash"] = legacy_hash From 9381332020b7089a41eb8d041938f8ba417528d1 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 20 Aug 2024 19:32:26 +0900 Subject: [PATCH 091/582] revert merge function add add option to use new func --- README.md | 3 + networks/flux_merge_lora.py | 120 +++++++++++++++++++++++++++--------- 2 files changed, 94 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 165eed341..3f5c4daa5 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,9 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 20, 2024 (update 2): +`flux_merge_lora.py` now supports LoRA from AI-toolkit (Diffusers based keys). Specify `--diffusers` option to merge LoRA with Diffusers based keys. Thanks to exveria1015! 
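Whichever key naming the LoRA file uses, the per-layer arithmetic of the merge for a linear layer is the standard LoRA update used in the script: `W <- W + ratio * (up @ down) * (alpha / rank)`. The following is a minimal, self-contained illustration with made-up toy shapes and names, not code taken from `flux_merge_lora.py`:

```python
import torch

# Toy shapes: a 16x16 linear weight with a rank-4 LoRA on top of it (all values are placeholders).
rank, out_features, in_features = 4, 16, 16
weight = torch.randn(out_features, in_features)   # base model weight W
lora_down = torch.randn(rank, in_features)        # "lora_down" / "lora_A"
lora_up = torch.randn(out_features, rank)         # "lora_up" / "lora_B"
alpha, ratio = 4.0, 1.0                           # alpha stored in the LoRA file, user-specified merge ratio

# W <- W + ratio * (up @ down) * (alpha / rank)
scale = alpha / rank
merged_weight = weight + ratio * (lora_up @ lora_down) * scale
```

For the fused qkv (and qkv+mlp) weights of FLUX.1, the same update is applied to the corresponding slice of the combined weight, which is what the chunking logic in the Diffusers merge path handles.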
+ Aug 20, 2024: FLUX.1 supports multi-resolution inference, so training at multiple resolutions may be possible and the results may be improved (like 1024x1024, 768x768 and 512x512 ... you can use any resolution). diff --git a/networks/flux_merge_lora.py b/networks/flux_merge_lora.py index 1ba1f314d..fd9cc4e3a 100644 --- a/networks/flux_merge_lora.py +++ b/networks/flux_merge_lora.py @@ -4,6 +4,7 @@ import time import torch +from safetensors import safe_open from safetensors.torch import load_file, save_file from tqdm import tqdm @@ -45,6 +46,81 @@ def save_to_file(file_name, state_dict, dtype, metadata): def merge_to_flux_model(loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype): + # create module map without loading state_dict + logger.info(f"loading keys from FLUX.1 model: {flux_model}") + lora_name_to_module_key = {} + with safe_open(flux_model, framework="pt", device=loading_device) as flux_file: + keys = list(flux_file.keys()) + for key in keys: + if key.endswith(".weight"): + module_name = ".".join(key.split(".")[:-1]) + lora_name = lora_flux.LoRANetwork.LORA_PREFIX_FLUX + "_" + module_name.replace(".", "_") + lora_name_to_module_key[lora_name] = key + + flux_state_dict = load_file(flux_model, device=loading_device) + for model, ratio in zip(models, ratios): + logger.info(f"loading: {model}") + lora_sd, _ = load_state_dict(model, merge_dtype) # loading on CPU + + logger.info(f"merging...") + for key in tqdm(list(lora_sd.keys())): + if "lora_down" in key: + lora_name = key[: key.rfind(".lora_down")] + up_key = key.replace("lora_down", "lora_up") + alpha_key = key[: key.index("lora_down")] + "alpha" + + if lora_name not in lora_name_to_module_key: + logger.warning(f"no module found for LoRA weight: {key}. LoRA for Text Encoder is not supported yet.") + continue + + down_weight = lora_sd.pop(key) + up_weight = lora_sd.pop(up_key) + + dim = down_weight.size()[0] + alpha = lora_sd.pop(alpha_key, dim) + scale = alpha / dim + + # W <- W + U * D + module_weight_key = lora_name_to_module_key[lora_name] + if module_weight_key not in flux_state_dict: + weight = flux_file.get_tensor(module_weight_key) + else: + weight = flux_state_dict[module_weight_key] + + weight = weight.to(working_device, merge_dtype) + up_weight = up_weight.to(working_device, merge_dtype) + down_weight = down_weight.to(working_device, merge_dtype) + + # logger.info(module_name, down_weight.size(), up_weight.size()) + if len(weight.size()) == 2: + # linear + weight = weight + ratio * (up_weight @ down_weight) * scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + weight + + ratio + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + # logger.info(conved.size(), weight.size(), module.stride, module.padding) + weight = weight + ratio * conved * scale + + flux_state_dict[module_weight_key] = weight.to(loading_device, save_dtype) + del up_weight + del down_weight + del weight + + if len(lora_sd) > 0: + logger.warning(f"Unused keys in LoRA model: {list(lora_sd.keys())}") + + return flux_state_dict + + +def merge_to_flux_model_diffusers(loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype): logger.info(f"loading keys from FLUX.1 model: {flux_model}") flux_state_dict = load_file(flux_model, device=loading_device) @@ -422,15 +498,14 @@ def str_to_dtype(p): 
os.makedirs(dest_dir) if args.flux_model is not None: - state_dict = merge_to_flux_model( - args.loading_device, - args.working_device, - args.flux_model, - args.models, - args.ratios, - merge_dtype, - save_dtype, - ) + if not args.diffusers: + state_dict = merge_to_flux_model( + args.loading_device, args.working_device, args.flux_model, args.models, args.ratios, merge_dtype, save_dtype + ) + else: + state_dict = merge_to_flux_model_diffusers( + args.loading_device, args.working_device, args.flux_model, args.models, args.ratios, merge_dtype, save_dtype + ) if args.no_metadata: sai_metadata = None @@ -438,16 +513,7 @@ def str_to_dtype(p): merged_from = sai_model_spec.build_merged_from([args.flux_model] + args.models) title = os.path.splitext(os.path.basename(args.save_to))[0] sai_metadata = sai_model_spec.build_metadata( - None, - False, - False, - False, - False, - False, - time.time(), - title=title, - merged_from=merged_from, - flux="dev", + None, False, False, False, False, False, time.time(), title=title, merged_from=merged_from, flux="dev" ) logger.info(f"saving FLUX model to: {args.save_to}") @@ -466,16 +532,7 @@ def str_to_dtype(p): merged_from = sai_model_spec.build_merged_from(args.models) title = os.path.splitext(os.path.basename(args.save_to))[0] sai_metadata = sai_model_spec.build_metadata( - state_dict, - False, - False, - False, - True, - False, - time.time(), - title=title, - merged_from=merged_from, - flux="dev", + state_dict, False, False, False, True, False, time.time(), title=title, merged_from=merged_from, flux="dev" ) metadata.update(sai_metadata) @@ -553,6 +610,11 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="shuffle lora weight./ " + "LoRAの重みをシャッフルする", ) + parser.add_argument( + "--diffusers", + action="store_true", + help="merge Diffusers (?) LoRA models / Diffusers (?) 
LoRAモデルをマージする", + ) return parser From dbed5126bd1133da832dae31ce73ba6c41afc9d3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 20 Aug 2024 19:33:47 +0900 Subject: [PATCH 092/582] chore: formatting --- networks/flux_merge_lora.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/networks/flux_merge_lora.py b/networks/flux_merge_lora.py index fd9cc4e3a..d5e82920d 100644 --- a/networks/flux_merge_lora.py +++ b/networks/flux_merge_lora.py @@ -113,7 +113,7 @@ def merge_to_flux_model(loading_device, working_device, flux_model, models, rati del up_weight del down_weight del weight - + if len(lora_sd) > 0: logger.warning(f"Unused keys in LoRA model: {list(lora_sd.keys())}") @@ -587,12 +587,7 @@ def setup_parser() -> argparse.ArgumentParser: nargs="*", help="LoRA models to merge: safetensors file / マージするLoRAモデル、safetensorsファイル", ) - parser.add_argument( - "--ratios", - type=float, - nargs="*", - help="ratios for each model / それぞれのLoRAモデルの比率", - ) + parser.add_argument("--ratios", type=float, nargs="*", help="ratios for each model / それぞれのLoRAモデルの比率") parser.add_argument( "--no_metadata", action="store_true", From 6ab48b09d8e46973d5e5fa47baeae3a464d06d04 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 20 Aug 2024 21:39:43 +0900 Subject: [PATCH 093/582] feat: Support multi-resolution training with caching latents to disk --- README.md | 11 +++- library/strategy_base.py | 112 ++++++++++++++++++++++++++------------- library/strategy_flux.py | 11 +++- library/train_util.py | 2 +- 4 files changed, 93 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 3f5c4daa5..1d44c9e58 100644 --- a/README.md +++ b/README.md @@ -9,13 +9,20 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 20, 2024 (update 3): +__Experimental__ The multi-resolution training is now supported with caching latents to disk. + +The cache files now hold latents for multiple resolutions. Since the latents are appended to the current cache file, it is recommended to delete the cache file in advance (if not, the old latents is kept in .npz file). + +See [FLUX.1 Multi-resolution training](#flux1-multi-resolution-training) for details. + Aug 20, 2024 (update 2): `flux_merge_lora.py` now supports LoRA from AI-toolkit (Diffusers based keys). Specify `--diffusers` option to merge LoRA with Diffusers based keys. Thanks to exveria1015! Aug 20, 2024: FLUX.1 supports multi-resolution inference, so training at multiple resolutions may be possible and the results may be improved (like 1024x1024, 768x768 and 512x512 ... you can use any resolution). -The script seems to support multi-resolution even in the current version, __if `--cache_latents_to_disk` is not specified__. Please try if you are interested. See [FLUX.1 Multi-resolution training](#flux1-multi-resolution-training) for details. +The script seems to support multi-resolution even in the current version, ~~if `--cache_latents_to_disk` is not specified~~ -> `--cache_latents_to_disk` is now supported for multi-resolution training. Please try if you are interested. See [FLUX.1 Multi-resolution training](#flux1-multi-resolution-training) for details. We will support multi-resolution caching to disk in the near future. @@ -171,7 +178,7 @@ The script can merge multiple LoRA models. 
If you want to merge multiple LoRA mo ### FLUX.1 Multi-resolution training -You can define multiple resolutions in the dataset configuration file. __Caching latents to disk is not supported yet.__ +You can define multiple resolutions in the dataset configuration file. The dataset configuration file is like below. You can define multiple resolutions with different batch sizes. The resolutions are defined in the `[[datasets]]` section. The `[[datasets.subsets]]` section is for the dataset directory. Please specify the same directory for each resolution. diff --git a/library/strategy_base.py b/library/strategy_base.py index a99a08290..e7d3a97ef 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -219,7 +219,13 @@ def cache_batch_latents(self, model: Any, batch: List, flip_aug: bool, alpha_mas raise NotImplementedError def _default_is_disk_cached_latents_expected( - self, latents_stride: int, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool + self, + latents_stride: int, + bucket_reso: Tuple[int, int], + npz_path: str, + flip_aug: bool, + alpha_mask: bool, + multi_resolution: bool = False, ): if not self.cache_to_disk: return False @@ -230,25 +236,17 @@ def _default_is_disk_cached_latents_expected( expected_latents_size = (bucket_reso[1] // latents_stride, bucket_reso[0] // latents_stride) # bucket_reso is (W, H) + # e.g. "_32x64", HxW + key_reso_suffix = f"_{expected_latents_size[0]}x{expected_latents_size[1]}" if multi_resolution else "" + try: npz = np.load(npz_path) - if npz["latents"].shape[1:3] != expected_latents_size: + if "latents" + key_reso_suffix not in npz: + return False + if flip_aug and "latents_flipped" + key_reso_suffix not in npz: + return False + if alpha_mask and "alpha_mask" + key_reso_suffix not in npz: return False - - if flip_aug: - if "latents_flipped" not in npz: - return False - if npz["latents_flipped"].shape[1:3] != expected_latents_size: - return False - - if alpha_mask: - if "alpha_mask" not in npz: - return False - if npz["alpha_mask"].shape[0:2] != (bucket_reso[1], bucket_reso[0]): - return False - else: - if "alpha_mask" in npz: - return False except Exception as e: logger.error(f"Error loading file: {npz_path}") raise e @@ -257,7 +255,15 @@ def _default_is_disk_cached_latents_expected( # TODO remove circular dependency for ImageInfo def _default_cache_batch_latents( - self, encode_by_vae, vae_device, vae_dtype, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool + self, + encode_by_vae, + vae_device, + vae_dtype, + image_infos: List, + flip_aug: bool, + alpha_mask: bool, + random_crop: bool, + multi_resolution: bool = False, ): """ Default implementation for cache_batch_latents. Image loading, VAE, flipping, alpha mask handling are common. @@ -287,8 +293,13 @@ def _default_cache_batch_latents( original_size = original_sizes[i] crop_ltrb = crop_ltrbs[i] + latents_size = latents.shape[1:3] # H, W + key_reso_suffix = f"_{latents_size[0]}x{latents_size[1]}" if multi_resolution else "" # e.g. 
"_32x64", HxW + if self.cache_to_disk: - self.save_latents_to_disk(info.latents_npz, latents, original_size, crop_ltrb, flipped_latent, alpha_mask) + self.save_latents_to_disk( + info.latents_npz, latents, original_size, crop_ltrb, flipped_latent, alpha_mask, key_reso_suffix + ) else: info.latents_original_size = original_size info.latents_crop_ltrb = crop_ltrb @@ -298,31 +309,56 @@ def _default_cache_batch_latents( info.alpha_mask = alpha_mask def load_latents_from_disk( - self, npz_path: str + self, npz_path: str, bucket_reso: Tuple[int, int] + ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: + """ + for SD/SDXL/SD3.0 + """ + return self._default_load_latents_from_disk(None, npz_path, bucket_reso) + + def _default_load_latents_from_disk( + self, latents_stride: Optional[int], npz_path: str, bucket_reso: Tuple[int, int] ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: + if latents_stride is None: + key_reso_suffix = "" + else: + latents_size = (bucket_reso[1] // latents_stride, bucket_reso[0] // latents_stride) # bucket_reso is (W, H) + key_reso_suffix = f"_{latents_size[0]}x{latents_size[1]}" # e.g. "_32x64", HxW + npz = np.load(npz_path) - if "latents" not in npz: - raise ValueError(f"error: npz is old format. please re-generate {npz_path}") - - latents = npz["latents"] - original_size = npz["original_size"].tolist() - crop_ltrb = npz["crop_ltrb"].tolist() - flipped_latents = npz["latents_flipped"] if "latents_flipped" in npz else None - alpha_mask = npz["alpha_mask"] if "alpha_mask" in npz else None + if "latents" + key_reso_suffix not in npz: + raise ValueError(f"latents{key_reso_suffix} not found in {npz_path}") + + latents = npz["latents" + key_reso_suffix] + original_size = npz["original_size" + key_reso_suffix].tolist() + crop_ltrb = npz["crop_ltrb" + key_reso_suffix].tolist() + flipped_latents = npz["latents_flipped" + key_reso_suffix] if "latents_flipped" + key_reso_suffix in npz else None + alpha_mask = npz["alpha_mask" + key_reso_suffix] if "alpha_mask" + key_reso_suffix in npz else None return latents, original_size, crop_ltrb, flipped_latents, alpha_mask def save_latents_to_disk( - self, npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None, alpha_mask=None + self, + npz_path, + latents_tensor, + original_size, + crop_ltrb, + flipped_latents_tensor=None, + alpha_mask=None, + key_reso_suffix="", ): kwargs = {} + + if os.path.exists(npz_path): + # load existing npz and update it + npz = np.load(npz_path) + for key in npz.files: + kwargs[key] = npz[key] + + kwargs["latents" + key_reso_suffix] = latents_tensor.float().cpu().numpy() + kwargs["original_size" + key_reso_suffix] = np.array(original_size) + kwargs["crop_ltrb" + key_reso_suffix] = np.array(crop_ltrb) if flipped_latents_tensor is not None: - kwargs["latents_flipped"] = flipped_latents_tensor.float().cpu().numpy() + kwargs["latents_flipped" + key_reso_suffix] = flipped_latents_tensor.float().cpu().numpy() if alpha_mask is not None: - kwargs["alpha_mask"] = alpha_mask.float().cpu().numpy() - np.savez( - npz_path, - latents=latents_tensor.float().cpu().numpy(), - original_size=np.array(original_size), - crop_ltrb=np.array(crop_ltrb), - **kwargs, - ) + kwargs["alpha_mask" + key_reso_suffix] = alpha_mask.float().cpu().numpy() + np.savez(npz_path, **kwargs) diff --git a/library/strategy_flux.py b/library/strategy_flux.py index 3880a1e1b..5c620f3d6 100644 --- 
a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -200,7 +200,12 @@ def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) ) def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): - return self._default_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask) + return self._default_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask, True) + + def load_latents_from_disk( + self, npz_path: str, bucket_reso: Tuple[int, int] + ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: + return self._default_load_latents_from_disk(8, npz_path, bucket_reso) # support multi-resolution # TODO remove circular dependency for ImageInfo def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool): @@ -208,7 +213,9 @@ def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask vae_device = vae.device vae_dtype = vae.dtype - self._default_cache_batch_latents(encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop) + self._default_cache_batch_latents( + encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop, True + ) if not train_util.HIGH_VRAM: train_util.clean_memory_on_device(vae.device) diff --git a/library/train_util.py b/library/train_util.py index f4ac8740a..8929c192f 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1381,7 +1381,7 @@ def __getitem__(self, index): image = None elif image_info.latents_npz is not None: # FineTuningDatasetまたはcache_latents_to_disk=Trueの場合 latents, original_size, crop_ltrb, flipped_latents, alpha_mask = ( - self.latents_caching_strategy.load_latents_from_disk(image_info.latents_npz) + self.latents_caching_strategy.load_latents_from_disk(image_info.latents_npz, image_info.bucket_reso) ) if flipped: latents = flipped_latents From 7e459c00b2e142e40a9452341934c2eb9f70a172 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 21 Aug 2024 08:02:33 +0900 Subject: [PATCH 094/582] Update T5 attention mask handling in FLUX --- README.md | 3 +++ flux_minimal_inference.py | 33 +++++++++++++++++++----- flux_train.py | 6 ++++- flux_train_network.py | 13 +++++----- library/flux_models.py | 51 +++++++++++++++++++++---------------- library/flux_train_utils.py | 20 ++++++++++++--- library/strategy_flux.py | 25 ++++++++++-------- 7 files changed, 101 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 1d44c9e58..43edbbed6 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,9 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 21, 2024: +The specification of `--apply_t5_attn_mask` has been changed. Previously, the T5 output was zero-padded, but now, two steps are taken: "1. Apply mask when encoding T5" and "2. Apply mask in the attention of Double Block". Fine tuning, LoRA training, and inference in `flux_mini_inference.py` have been changed. + Aug 20, 2024 (update 3): __Experimental__ The multi-resolution training is now supported with caching latents to disk. 
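To illustrate the two steps described above, here is a minimal sketch of how the T5 attention mask can be threaded through encoding and double-block attention. It mirrors the changes in the diffs around this patch; tensor names (`t5_tokens`, `t5_attn_mask`, `img`) are illustrative and RoPE handling is omitted.

```python
import torch
import torch.nn.functional as F


def encode_t5_with_mask(t5xxl, t5_tokens, t5_attn_mask, apply_mask: bool):
    # Step 1: pass the padding mask to the T5 encoder instead of zero-padding its output.
    attention_mask = t5_attn_mask.to(t5xxl.device) if apply_mask else None
    t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), attention_mask, return_dict=False, output_hidden_states=True)
    return t5_out


def masked_double_block_attention(q, k, v, img, txt_attention_mask):
    # Step 2: extend the text mask with ones for the image tokens and use it as a
    # boolean mask in scaled_dot_product_attention (q, k, v are (B, H, L, D)).
    attn_mask = txt_attention_mask.to(torch.bool)  # (B, txt_len)
    ones = torch.ones(attn_mask.shape[0], img.shape[1], device=attn_mask.device, dtype=torch.bool)
    attn_mask = torch.cat((attn_mask, ones), dim=1)  # (B, txt_len + img_len)
    attn_mask = attn_mask[:, None, None, :].expand(-1, q.shape[1], q.shape[2], -1)
    return F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
```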
diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index b09f63808..5b8aa2506 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -70,12 +70,22 @@ def denoise( vec: torch.Tensor, timesteps: list[float], guidance: float = 4.0, + t5_attn_mask: Optional[torch.Tensor] = None, ): # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) - pred = model(img=img, img_ids=img_ids, txt=txt, txt_ids=txt_ids, y=vec, timesteps=t_vec, guidance=guidance_vec) + pred = model( + img=img, + img_ids=img_ids, + txt=txt, + txt_ids=txt_ids, + y=vec, + timesteps=t_vec, + guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, + ) img = img + (t_prev - t_curr) * pred @@ -92,6 +102,7 @@ def do_sample( txt_ids: torch.Tensor, num_steps: int, guidance: float, + t5_attn_mask: Optional[torch.Tensor], is_schnell: bool, device: torch.device, flux_dtype: torch.dtype, @@ -101,10 +112,14 @@ def do_sample( # denoise initial noise if accelerator: with accelerator.autocast(), torch.no_grad(): - x = denoise(model, img, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=guidance) + x = denoise( + model, img, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=guidance, t5_attn_mask=t5_attn_mask + ) else: with torch.autocast(device_type=device.type, dtype=flux_dtype), torch.no_grad(): - x = denoise(model, img, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=guidance) + x = denoise( + model, img, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=guidance, t5_attn_mask=t5_attn_mask + ) return x @@ -156,14 +171,14 @@ def generate_image( clip_l.to(clip_l_dtype) t5xxl.to(t5xxl_dtype) with accelerator.autocast(): - _, t5_out, txt_ids = encoding_strategy.encode_tokens( + _, t5_out, txt_ids, t5_attn_mask = encoding_strategy.encode_tokens( tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask ) else: with torch.autocast(device_type=device.type, dtype=clip_l_dtype): - l_pooled, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) + l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) with torch.autocast(device_type=device.type, dtype=t5xxl_dtype): - _, t5_out, txt_ids = encoding_strategy.encode_tokens( + _, t5_out, txt_ids, t5_attn_mask = encoding_strategy.encode_tokens( tokenize_strategy, [None, t5xxl], tokens_and_masks, args.apply_t5_attn_mask ) @@ -186,7 +201,11 @@ def generate_image( steps = 4 if is_schnell else 50 img_ids = img_ids.to(device) - x = do_sample(accelerator, model, noise, img_ids, l_pooled, t5_out, txt_ids, steps, guidance, is_schnell, device, flux_dtype) + t5_attn_mask = t5_attn_mask.to(device) if args.apply_t5_attn_mask else None + + x = do_sample( + accelerator, model, noise, img_ids, l_pooled, t5_out, txt_ids, steps, guidance, t5_attn_mask, is_schnell, device, flux_dtype + ) if args.offload: model = model.cpu() # del model diff --git a/flux_train.py b/flux_train.py index 669963856..ecb8a1086 100644 --- a/flux_train.py +++ b/flux_train.py @@ -610,7 +610,10 @@ def optimizer_hook(parameter: torch.Tensor): guidance_vec = torch.full((bsz,), args.guidance_scale, device=accelerator.device) # call model - l_pooled, t5_out, txt_ids = text_encoder_conds + l_pooled, t5_out, txt_ids, t5_attn_mask = 
text_encoder_conds + if not args.apply_t5_attn_mask: + t5_attn_mask = None + with accelerator.autocast(): # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) model_pred = flux( @@ -621,6 +624,7 @@ def optimizer_hook(parameter: torch.Tensor): y=l_pooled, timesteps=timesteps / 1000, guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, ) # unpack latents diff --git a/flux_train_network.py b/flux_train_network.py index 002252c87..49bd270c7 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -233,11 +233,11 @@ def __init__(self, flux_upper: flux_models.FluxUpper, flux_lower: flux_models.Fl self.flux_lower = flux_lower self.target_device = device - def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None): + def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None, txt_attention_mask=None): self.flux_lower.to("cpu") clean_memory_on_device(self.target_device) self.flux_upper.to(self.target_device) - img, txt, vec, pe = self.flux_upper(img, img_ids, txt, txt_ids, timesteps, y, guidance) + img, txt, vec, pe = self.flux_upper(img, img_ids, txt, txt_ids, timesteps, y, guidance, txt_attention_mask) self.flux_upper.to("cpu") clean_memory_on_device(self.target_device) self.flux_lower.to(self.target_device) @@ -300,10 +300,9 @@ def get_noise_pred_and_target( guidance_vec.requires_grad_(True) # Predict the noise residual - l_pooled, t5_out, txt_ids = text_encoder_conds - # print( - # f"model_input: {noisy_model_input.shape}, img_ids: {img_ids.shape}, t5_out: {t5_out.shape}, txt_ids: {txt_ids.shape}, l_pooled: {l_pooled.shape}, timesteps: {timesteps.shape}, guidance_vec: {guidance_vec.shape}" - # ) + l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds + if not args.apply_t5_attn_mask: + t5_attn_mask = None if not args.split_mode: # normal forward @@ -317,6 +316,7 @@ def get_noise_pred_and_target( y=l_pooled, timesteps=timesteps / 1000, guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, ) else: # split forward to reduce memory usage @@ -337,6 +337,7 @@ def get_noise_pred_and_target( y=l_pooled, timesteps=timesteps / 1000, guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, ) # move flux upper back to cpu, and then move flux lower to gpu diff --git a/library/flux_models.py b/library/flux_models.py index 11ef647ad..6f28da603 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -440,10 +440,10 @@ class ModelSpec: # region math -def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor: +def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, attn_mask: Optional[Tensor] = None) -> Tensor: q, k = apply_rope(q, k, pe) - x = torch.nn.functional.scaled_dot_product_attention(q, k, v) + x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) x = rearrange(x, "B H L D -> B L (H D)") return x @@ -607,11 +607,7 @@ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): self.norm = QKNorm(head_dim) self.proj = nn.Linear(dim, dim) - # self.gradient_checkpointing = False - - # def enable_gradient_checkpointing(self): - # self.gradient_checkpointing = True - + # this is not called from DoubleStreamBlock/SingleStreamBlock because they uses attention function directly def forward(self, x: Tensor, pe: Tensor) -> Tensor: qkv = self.qkv(x) q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) @@ -620,12 +616,6 @@ def forward(self, x: 
Tensor, pe: Tensor) -> Tensor: x = self.proj(x) return x - # def forward(self, *args, **kwargs): - # if self.training and self.gradient_checkpointing: - # return checkpoint(self._forward, *args, use_reentrant=False, **kwargs) - # else: - # return self._forward(*args, **kwargs) - @dataclass class ModulationOut: @@ -690,7 +680,9 @@ def disable_gradient_checkpointing(self): self.gradient_checkpointing = False self.cpu_offload_checkpointing = False - def _forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]: + def _forward( + self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None + ) -> tuple[Tensor, Tensor]: img_mod1, img_mod2 = self.img_mod(vec) txt_mod1, txt_mod2 = self.txt_mod(vec) @@ -713,7 +705,18 @@ def _forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[T k = torch.cat((txt_k, img_k), dim=2) v = torch.cat((txt_v, img_v), dim=2) - attn = attention(q, k, v, pe=pe) + # make attention mask if not None + attn_mask = None + if txt_attention_mask is not None: + attn_mask = txt_attention_mask # b, seq_len + attn_mask = torch.cat( + (attn_mask, torch.ones(attn_mask.shape[0], img.shape[1]).to(attn_mask.device)), dim=1 + ) # b, seq_len + img_len + + # broadcast attn_mask to all heads + attn_mask = attn_mask[:, None, None, :].expand(-1, q.shape[1], q.shape[2], -1) + + attn = attention(q, k, v, pe=pe, attn_mask=attn_mask) txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] # calculate the img blocks @@ -725,10 +728,12 @@ def _forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[T txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) return img, txt - def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]: + def forward( + self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None + ) -> tuple[Tensor, Tensor]: if self.training and self.gradient_checkpointing: if not self.cpu_offload_checkpointing: - return checkpoint(self._forward, img, txt, vec, pe, use_reentrant=False) + return checkpoint(self._forward, img, txt, vec, pe, txt_attention_mask, use_reentrant=False) # cpu offload checkpointing def create_custom_forward(func): @@ -739,10 +744,10 @@ def custom_forward(*inputs): return custom_forward - return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), img, txt, vec, pe) + return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), img, txt, vec, pe, txt_attention_mask) else: - return self._forward(img, txt, vec, pe) + return self._forward(img, txt, vec, pe, txt_attention_mask) # def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor): # if self.training and self.gradient_checkpointing: @@ -992,6 +997,7 @@ def forward( timesteps: Tensor, y: Tensor, guidance: Tensor | None = None, + txt_attention_mask: Tensor | None = None, ) -> Tensor: if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") @@ -1011,7 +1017,7 @@ def forward( if not self.double_blocks_to_swap: for block in self.double_blocks: - img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) else: # make sure first n blocks are on cuda, and last n blocks are on cpu at beginning for block_idx in range(self.double_blocks_to_swap): @@ -1033,7 +1039,7 @@ def forward( block.to(self.device) # 
move to cuda # print(f"Moved double block {block_idx} to cuda.") - img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) if moving: self.double_blocks[to_cpu_block_index].to("cpu") # , non_blocking=True) @@ -1164,6 +1170,7 @@ def forward( timesteps: Tensor, y: Tensor, guidance: Tensor | None = None, + txt_attention_mask: Tensor | None = None, ) -> Tensor: if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") @@ -1182,7 +1189,7 @@ def forward( pe = self.pe_embedder(ids) for block in self.double_blocks: - img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) return img, txt, vec, pe diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 3f9e8660f..1d3f80d72 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -190,9 +190,10 @@ def sample_image_inference( te_outputs = sample_prompts_te_outputs[prompt] else: tokens_and_masks = tokenize_strategy.tokenize(prompt) + # strategy has apply_t5_attn_mask option te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) - l_pooled, t5_out, txt_ids = te_outputs + l_pooled, t5_out, txt_ids, t5_attn_mask = te_outputs # sample image weight_dtype = ae.dtype # TOFO give dtype as argument @@ -208,9 +209,10 @@ def sample_image_inference( ) timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) # FLUX.1 dev -> shift=True img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype) + t5_attn_mask = t5_attn_mask.to(accelerator.device) if args.apply_t5_attn_mask else None with accelerator.autocast(), torch.no_grad(): - x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale) + x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask) x = x.float() x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width) @@ -289,12 +291,22 @@ def denoise( vec: torch.Tensor, timesteps: list[float], guidance: float = 4.0, + t5_attn_mask: Optional[torch.Tensor] = None, ): # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) - pred = model(img=img, img_ids=img_ids, txt=txt, txt_ids=txt_ids, y=vec, timesteps=t_vec, guidance=guidance_vec) + pred = model( + img=img, + img_ids=img_ids, + txt=txt, + txt_ids=txt_ids, + y=vec, + timesteps=t_vec, + guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, + ) img = img + (t_prev - t_curr) * pred @@ -498,7 +510,7 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--apply_t5_attn_mask", action="store_true", - help="apply attention mask (zero embs) to T5-XXL / T5-XXLにアテンションマスク(ゼロ埋め)を適用する", + help="apply attention mask to T5-XXL encode and FLUX double blocks / T5-XXLエンコードとFLUXダブルブロックにアテンションマスクを適用する", ) parser.add_argument( "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする" diff --git a/library/strategy_flux.py b/library/strategy_flux.py index 5c620f3d6..737af390a 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -64,22 
+64,25 @@ def encode_tokens( l_tokens, t5_tokens = tokens[:2] t5_attn_mask = tokens[2] if len(tokens) > 2 else None + # clip_l is None when using T5 only if clip_l is not None and l_tokens is not None: l_pooled = clip_l(l_tokens.to(clip_l.device))["pooler_output"] else: l_pooled = None + # t5xxl is None when using CLIP only if t5xxl is not None and t5_tokens is not None: # t5_out is [b, max length, 4096] - t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), return_dict=False, output_hidden_states=True) - if apply_t5_attn_mask: - t5_out = t5_out * t5_attn_mask.to(t5_out.device).unsqueeze(-1) + attention_mask = None if not apply_t5_attn_mask else t5_attn_mask.to(t5xxl.device) + t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), attention_mask, return_dict=False, output_hidden_states=True) + # if zero_pad_t5_output: + # t5_out = t5_out * t5_attn_mask.to(t5_out.device).unsqueeze(-1) txt_ids = torch.zeros(t5_out.shape[0], t5_out.shape[1], 3, device=t5_out.device) else: t5_out = None txt_ids = None - return [l_pooled, t5_out, txt_ids] + return [l_pooled, t5_out, txt_ids, t5_attn_mask] # returns t5_attn_mask for attention mask in transformer class FluxTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): @@ -115,6 +118,8 @@ def is_disk_cached_outputs_expected(self, npz_path: str): return False if "txt_ids" not in npz: return False + if "t5_attn_mask" not in npz: + return False except Exception as e: logger.error(f"Error loading file: {npz_path}") raise e @@ -129,12 +134,12 @@ def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: l_pooled = data["l_pooled"] t5_out = data["t5_out"] txt_ids = data["txt_ids"] + t5_attn_mask = data["t5_attn_mask"] if self.apply_t5_attn_mask: - t5_attn_mask = data["t5_attn_mask"] t5_out = self.mask_t5_attn(t5_out, t5_attn_mask) - return [l_pooled, t5_out, txt_ids] + return [l_pooled, t5_out, txt_ids, t5_attn_mask] def cache_batch_outputs( self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, infos: List @@ -145,7 +150,7 @@ def cache_batch_outputs( tokens_and_masks = tokenize_strategy.tokenize(captions) with torch.no_grad(): # attn_mask is not applied when caching to disk: it is applied when loading from disk - l_pooled, t5_out, txt_ids = flux_text_encoding_strategy.encode_tokens( + l_pooled, t5_out, txt_ids, _ = flux_text_encoding_strategy.encode_tokens( tokenize_strategy, models, tokens_and_masks, not self.cache_to_disk ) @@ -159,15 +164,15 @@ def cache_batch_outputs( l_pooled = l_pooled.cpu().numpy() t5_out = t5_out.cpu().numpy() txt_ids = txt_ids.cpu().numpy() + t5_attn_mask = tokens_and_masks[2].cpu().numpy() for i, info in enumerate(infos): l_pooled_i = l_pooled[i] t5_out_i = t5_out[i] txt_ids_i = txt_ids[i] + t5_attn_mask_i = t5_attn_mask[i] if self.cache_to_disk: - t5_attn_mask = tokens_and_masks[2] - t5_attn_mask_i = t5_attn_mask[i].cpu().numpy() np.savez( info.text_encoder_outputs_npz, l_pooled=l_pooled_i, @@ -176,7 +181,7 @@ def cache_batch_outputs( t5_attn_mask=t5_attn_mask_i, ) else: - info.text_encoder_outputs = (l_pooled_i, t5_out_i, txt_ids_i) + info.text_encoder_outputs = (l_pooled_i, t5_out_i, txt_ids_i, t5_attn_mask_i) class FluxLatentsCachingStrategy(LatentsCachingStrategy): From e17c42cb0de8a1303a607ecc75af092dc12dc272 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 21 Aug 2024 12:28:45 +0900 Subject: [PATCH 095/582] Add BFL/Diffusers LoRA converter #1467 #1458 #1483 --- networks/convert_flux_lora.py | 403 ++++++++++++++++++++++++++++++++++ 1 file changed, 403 insertions(+) create 
mode 100644 networks/convert_flux_lora.py diff --git a/networks/convert_flux_lora.py b/networks/convert_flux_lora.py new file mode 100644 index 000000000..dd962ebfe --- /dev/null +++ b/networks/convert_flux_lora.py @@ -0,0 +1,403 @@ +# convert key mapping and data format from some LoRA format to another +""" +Original LoRA format: Based on Black Forest Labs, QKV and MLP are unified into one module +alpha is scalar for each LoRA module + +0 to 18 +lora_unet_double_blocks_0_img_attn_proj.alpha torch.Size([]) +lora_unet_double_blocks_0_img_attn_proj.lora_down.weight torch.Size([4, 3072]) +lora_unet_double_blocks_0_img_attn_proj.lora_up.weight torch.Size([3072, 4]) +lora_unet_double_blocks_0_img_attn_qkv.alpha torch.Size([]) +lora_unet_double_blocks_0_img_attn_qkv.lora_down.weight torch.Size([4, 3072]) +lora_unet_double_blocks_0_img_attn_qkv.lora_up.weight torch.Size([9216, 4]) +lora_unet_double_blocks_0_img_mlp_0.alpha torch.Size([]) +lora_unet_double_blocks_0_img_mlp_0.lora_down.weight torch.Size([4, 3072]) +lora_unet_double_blocks_0_img_mlp_0.lora_up.weight torch.Size([12288, 4]) +lora_unet_double_blocks_0_img_mlp_2.alpha torch.Size([]) +lora_unet_double_blocks_0_img_mlp_2.lora_down.weight torch.Size([4, 12288]) +lora_unet_double_blocks_0_img_mlp_2.lora_up.weight torch.Size([3072, 4]) +lora_unet_double_blocks_0_img_mod_lin.alpha torch.Size([]) +lora_unet_double_blocks_0_img_mod_lin.lora_down.weight torch.Size([4, 3072]) +lora_unet_double_blocks_0_img_mod_lin.lora_up.weight torch.Size([18432, 4]) +lora_unet_double_blocks_0_txt_attn_proj.alpha torch.Size([]) +lora_unet_double_blocks_0_txt_attn_proj.lora_down.weight torch.Size([4, 3072]) +lora_unet_double_blocks_0_txt_attn_proj.lora_up.weight torch.Size([3072, 4]) +lora_unet_double_blocks_0_txt_attn_qkv.alpha torch.Size([]) +lora_unet_double_blocks_0_txt_attn_qkv.lora_down.weight torch.Size([4, 3072]) +lora_unet_double_blocks_0_txt_attn_qkv.lora_up.weight torch.Size([9216, 4]) +lora_unet_double_blocks_0_txt_mlp_0.alpha torch.Size([]) +lora_unet_double_blocks_0_txt_mlp_0.lora_down.weight torch.Size([4, 3072]) +lora_unet_double_blocks_0_txt_mlp_0.lora_up.weight torch.Size([12288, 4]) +lora_unet_double_blocks_0_txt_mlp_2.alpha torch.Size([]) +lora_unet_double_blocks_0_txt_mlp_2.lora_down.weight torch.Size([4, 12288]) +lora_unet_double_blocks_0_txt_mlp_2.lora_up.weight torch.Size([3072, 4]) +lora_unet_double_blocks_0_txt_mod_lin.alpha torch.Size([]) +lora_unet_double_blocks_0_txt_mod_lin.lora_down.weight torch.Size([4, 3072]) +lora_unet_double_blocks_0_txt_mod_lin.lora_up.weight torch.Size([18432, 4]) + +0 to 37 +lora_unet_single_blocks_0_linear1.alpha torch.Size([]) +lora_unet_single_blocks_0_linear1.lora_down.weight torch.Size([4, 3072]) +lora_unet_single_blocks_0_linear1.lora_up.weight torch.Size([21504, 4]) +lora_unet_single_blocks_0_linear2.alpha torch.Size([]) +lora_unet_single_blocks_0_linear2.lora_down.weight torch.Size([4, 15360]) +lora_unet_single_blocks_0_linear2.lora_up.weight torch.Size([3072, 4]) +lora_unet_single_blocks_0_modulation_lin.alpha torch.Size([]) +lora_unet_single_blocks_0_modulation_lin.lora_down.weight torch.Size([4, 3072]) +lora_unet_single_blocks_0_modulation_lin.lora_up.weight torch.Size([9216, 4]) +""" +""" +ai-toolkit: Based on Diffusers, QKV and MLP are separated into 3 modules. +A is down, B is up. No alpha for each LoRA module. 
+ +0 to 18 +transformer.transformer_blocks.0.attn.add_k_proj.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.attn.add_k_proj.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.attn.add_q_proj.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.attn.add_q_proj.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.attn.add_v_proj.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.attn.add_v_proj.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.attn.to_add_out.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.attn.to_add_out.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.attn.to_k.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.attn.to_k.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.attn.to_out.0.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.attn.to_out.0.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.attn.to_q.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.attn.to_q.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.attn.to_v.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.attn.to_v.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.ff.net.0.proj.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.ff.net.0.proj.lora_B.weight torch.Size([12288, 16]) +transformer.transformer_blocks.0.ff.net.2.lora_A.weight torch.Size([16, 12288]) +transformer.transformer_blocks.0.ff.net.2.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.ff_context.net.0.proj.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.ff_context.net.0.proj.lora_B.weight torch.Size([12288, 16]) +transformer.transformer_blocks.0.ff_context.net.2.lora_A.weight torch.Size([16, 12288]) +transformer.transformer_blocks.0.ff_context.net.2.lora_B.weight torch.Size([3072, 16]) +transformer.transformer_blocks.0.norm1.linear.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.norm1.linear.lora_B.weight torch.Size([18432, 16]) +transformer.transformer_blocks.0.norm1_context.linear.lora_A.weight torch.Size([16, 3072]) +transformer.transformer_blocks.0.norm1_context.linear.lora_B.weight torch.Size([18432, 16]) + +0 to 37 +transformer.single_transformer_blocks.0.attn.to_k.lora_A.weight torch.Size([16, 3072]) +transformer.single_transformer_blocks.0.attn.to_k.lora_B.weight torch.Size([3072, 16]) +transformer.single_transformer_blocks.0.attn.to_q.lora_A.weight torch.Size([16, 3072]) +transformer.single_transformer_blocks.0.attn.to_q.lora_B.weight torch.Size([3072, 16]) +transformer.single_transformer_blocks.0.attn.to_v.lora_A.weight torch.Size([16, 3072]) +transformer.single_transformer_blocks.0.attn.to_v.lora_B.weight torch.Size([3072, 16]) +transformer.single_transformer_blocks.0.norm.linear.lora_A.weight torch.Size([16, 3072]) +transformer.single_transformer_blocks.0.norm.linear.lora_B.weight torch.Size([9216, 16]) +transformer.single_transformer_blocks.0.proj_mlp.lora_A.weight torch.Size([16, 3072]) +transformer.single_transformer_blocks.0.proj_mlp.lora_B.weight torch.Size([12288, 16]) +transformer.single_transformer_blocks.0.proj_out.lora_A.weight torch.Size([16, 15360]) +transformer.single_transformer_blocks.0.proj_out.lora_B.weight torch.Size([3072, 16]) +""" +""" +xlabs: Unknown format. 
+0 to 18 +double_blocks.0.processor.proj_lora1.down.weight torch.Size([16, 3072]) +double_blocks.0.processor.proj_lora1.up.weight torch.Size([3072, 16]) +double_blocks.0.processor.proj_lora2.down.weight torch.Size([16, 3072]) +double_blocks.0.processor.proj_lora2.up.weight torch.Size([3072, 16]) +double_blocks.0.processor.qkv_lora1.down.weight torch.Size([16, 3072]) +double_blocks.0.processor.qkv_lora1.up.weight torch.Size([9216, 16]) +double_blocks.0.processor.qkv_lora2.down.weight torch.Size([16, 3072]) +double_blocks.0.processor.qkv_lora2.up.weight torch.Size([9216, 16]) +""" + + +import argparse +from safetensors.torch import save_file +from safetensors import safe_open +import torch + + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def convert_to_sd_scripts(sds_sd, ait_sd, sds_key, ait_key): + ait_down_key = ait_key + ".lora_A.weight" + if ait_down_key not in ait_sd: + return + ait_up_key = ait_key + ".lora_B.weight" + + down_weight = ait_sd.pop(ait_down_key) + sds_sd[sds_key + ".lora_down.weight"] = down_weight + sds_sd[sds_key + ".lora_up.weight"] = ait_sd.pop(ait_up_key) + rank = down_weight.shape[0] + sds_sd[sds_key + ".alpha"] = torch.scalar_tensor(rank, dtype=down_weight.dtype, device=down_weight.device) + + +def convert_to_sd_scripts_cat(sds_sd, ait_sd, sds_key, ait_keys): + ait_down_keys = [k + ".lora_A.weight" for k in ait_keys] + if ait_down_keys[0] not in ait_sd: + return + ait_up_keys = [k + ".lora_B.weight" for k in ait_keys] + + down_weights = [ait_sd.pop(k) for k in ait_down_keys] + up_weights = [ait_sd.pop(k) for k in ait_up_keys] + + # lora_down is concatenated along dim=0, so rank is multiplied by the number of splits + rank = down_weights[0].shape[0] + num_splits = len(ait_keys) + sds_sd[sds_key + ".lora_down.weight"] = torch.cat(down_weights, dim=0) + + merged_up_weights = torch.zeros( + (sum(w.shape[0] for w in up_weights), rank * num_splits), + dtype=up_weights[0].dtype, + device=up_weights[0].device, + ) + + i = 0 + for j, up_weight in enumerate(up_weights): + merged_up_weights[i : i + up_weight.shape[0], j * rank : (j + 1) * rank] = up_weight + i += up_weight.shape[0] + + sds_sd[sds_key + ".lora_up.weight"] = merged_up_weights + + # set alpha to new_rank + new_rank = rank * num_splits + sds_sd[sds_key + ".alpha"] = torch.scalar_tensor(new_rank, dtype=down_weights[0].dtype, device=down_weights[0].device) + + +def convert_ai_toolkit_to_sd_scripts(ait_sd): + sds_sd = {} + for i in range(19): + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_img_attn_proj", f"transformer.transformer_blocks.{i}.attn.to_out.0" + ) + convert_to_sd_scripts_cat( + sds_sd, + ait_sd, + f"lora_unet_double_blocks_{i}_img_attn_qkv", + [ + f"transformer.transformer_blocks.{i}.attn.to_q", + f"transformer.transformer_blocks.{i}.attn.to_k", + f"transformer.transformer_blocks.{i}.attn.to_v", + ], + ) + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_img_mlp_0", f"transformer.transformer_blocks.{i}.ff.net.0.proj" + ) + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_img_mlp_2", f"transformer.transformer_blocks.{i}.ff.net.2" + ) + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_img_mod_lin", f"transformer.transformer_blocks.{i}.norm1.linear" + ) + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_txt_attn_proj", f"transformer.transformer_blocks.{i}.attn.to_add_out" + ) + convert_to_sd_scripts_cat( + sds_sd, + 
ait_sd, + f"lora_unet_double_blocks_{i}_txt_attn_qkv", + [ + f"transformer.transformer_blocks.{i}.attn.add_q_proj", + f"transformer.transformer_blocks.{i}.attn.add_k_proj", + f"transformer.transformer_blocks.{i}.attn.add_v_proj", + ], + ) + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_txt_mlp_0", f"transformer.transformer_blocks.{i}.ff_context.net.0.proj" + ) + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_txt_mlp_2", f"transformer.transformer_blocks.{i}.ff_context.net.2" + ) + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_txt_mod_lin", f"transformer.transformer_blocks.{i}.norm1_context.linear" + ) + + for i in range(38): + convert_to_sd_scripts_cat( + sds_sd, + ait_sd, + f"lora_unet_single_blocks_{i}_linear1", + [ + f"transformer.single_transformer_blocks.{i}.attn.to_q", + f"transformer.single_transformer_blocks.{i}.attn.to_k", + f"transformer.single_transformer_blocks.{i}.attn.to_v", + f"transformer.single_transformer_blocks.{i}.proj_mlp", + ], + ) + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_single_blocks_{i}_linear2", f"transformer.single_transformer_blocks.{i}.proj_out" + ) + convert_to_sd_scripts( + sds_sd, ait_sd, f"lora_unet_single_blocks_{i}_modulation_lin", f"transformer.single_transformer_blocks.{i}.norm.linear" + ) + + if len(ait_sd) > 0: + logger.warning(f"Unsuppored keys for sd-scripts: {ait_sd.keys()}") + return sds_sd + + +def convert_to_ai_toolkit(sds_sd, ait_sd, sds_key, ait_key): + if sds_key + ".lora_down.weight" not in sds_sd: + return + down_weight = sds_sd.pop(sds_key + ".lora_down.weight") + + # scale weight by alpha and dim + rank = down_weight.shape[0] + alpha = sds_sd.pop(sds_key + ".alpha").item() # alpha is scalar + scale = alpha / rank # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here + print(f"rank: {rank}, alpha: {alpha}, scale: {scale}") + + # calculate scale_down and scale_up to keep the same value. 
if scale is 4, scale_down is 2 and scale_up is 2 + scale_down = scale + scale_up = 1.0 + while scale_down * 2 < scale_up: + scale_down *= 2 + scale_up /= 2 + # print(f"scale: {scale}, scale_down: {scale_down}, scale_up: {scale_up}") + + ait_sd[ait_key + ".lora_A.weight"] = down_weight * scale_down + ait_sd[ait_key + ".lora_B.weight"] = sds_sd.pop(sds_key + ".lora_up.weight") * scale_up + + +def convert_to_ai_toolkit_cat(sds_sd, ait_sd, sds_key, ait_keys, dims=None): + if sds_key + ".lora_down.weight" not in sds_sd: + return + down_weight = sds_sd.pop(sds_key + ".lora_down.weight") + + # scale weight by alpha and dim + rank = down_weight.shape[0] + alpha = sds_sd.pop(sds_key + ".alpha") + scale = alpha / rank + + # calculate scale_down and scale_up + scale_down = scale + scale_up = 1.0 + while scale_down * 2 < scale_up: + scale_down *= 2 + scale_up /= 2 + + ait_down_keys = [k + ".lora_A.weight" for k in ait_keys] + ait_up_keys = [k + ".lora_B.weight" for k in ait_keys] + + num_splits = len(ait_keys) + up_weight = sds_sd.pop(sds_key + ".lora_up.weight") + + # down_weight is copied to each split + ait_sd.update({k: down_weight * scale_down for k in ait_down_keys}) + + # calculate dims if not provided + if dims is None: + dims = [up_weight.shape[0] // num_splits] * num_splits + else: + assert sum(dims) == up_weight.shape[0] + + # up_weight is split to each split + ait_sd.update({k: v * scale_up for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))}) + + +def convert_sd_scripts_to_ai_toolkit(sds_sd): + ait_sd = {} + for i in range(19): + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_img_attn_proj", f"transformer.transformer_blocks.{i}.attn.to_out.0" + ) + convert_to_ai_toolkit_cat( + sds_sd, + ait_sd, + f"lora_unet_double_blocks_{i}_img_attn_qkv", + [ + f"transformer.transformer_blocks.{i}.attn.to_q", + f"transformer.transformer_blocks.{i}.attn.to_k", + f"transformer.transformer_blocks.{i}.attn.to_v", + ], + ) + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_img_mlp_0", f"transformer.transformer_blocks.{i}.ff.net.0.proj" + ) + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_img_mlp_2", f"transformer.transformer_blocks.{i}.ff.net.2" + ) + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_img_mod_lin", f"transformer.transformer_blocks.{i}.norm1.linear" + ) + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_txt_attn_proj", f"transformer.transformer_blocks.{i}.attn.to_add_out" + ) + convert_to_ai_toolkit_cat( + sds_sd, + ait_sd, + f"lora_unet_double_blocks_{i}_txt_attn_qkv", + [ + f"transformer.transformer_blocks.{i}.attn.add_q_proj", + f"transformer.transformer_blocks.{i}.attn.add_k_proj", + f"transformer.transformer_blocks.{i}.attn.add_v_proj", + ], + ) + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_txt_mlp_0", f"transformer.transformer_blocks.{i}.ff_context.net.0.proj" + ) + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_txt_mlp_2", f"transformer.transformer_blocks.{i}.ff_context.net.2" + ) + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_double_blocks_{i}_txt_mod_lin", f"transformer.transformer_blocks.{i}.norm1_context.linear" + ) + + for i in range(38): + convert_to_ai_toolkit_cat( + sds_sd, + ait_sd, + f"lora_unet_single_blocks_{i}_linear1", + [ + f"transformer.single_transformer_blocks.{i}.attn.to_q", + f"transformer.single_transformer_blocks.{i}.attn.to_k", + 
f"transformer.single_transformer_blocks.{i}.attn.to_v", + f"transformer.single_transformer_blocks.{i}.proj_mlp", + ], + dims=[3072, 3072, 3072, 12288], + ) + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_single_blocks_{i}_linear2", f"transformer.single_transformer_blocks.{i}.proj_out" + ) + convert_to_ai_toolkit( + sds_sd, ait_sd, f"lora_unet_single_blocks_{i}_modulation_lin", f"transformer.single_transformer_blocks.{i}.norm.linear" + ) + + if len(sds_sd) > 0: + logger.warning(f"Unsuppored keys for ai-toolkit: {sds_sd.keys()}") + return ait_sd + + +def main(args): + # load source safetensors + logger.info(f"Loading source file {args.src_path}") + state_dict = {} + with safe_open(args.src_path, framework="pt") as f: + metadata = f.metadata() + for k in f.keys(): + state_dict[k] = f.get_tensor(k) + + logger.info(f"Converting {args.src} to {args.dst} format") + if args.src == "ai-toolkit" and args.dst == "sd-scripts": + state_dict = convert_ai_toolkit_to_sd_scripts(state_dict) + elif args.src == "sd-scripts" and args.dst == "ai-toolkit": + state_dict = convert_sd_scripts_to_ai_toolkit(state_dict) + else: + raise NotImplementedError(f"Conversion from {args.src} to {args.dst} is not supported") + + # save destination safetensors + logger.info(f"Saving destination file {args.dst_path}") + save_file(state_dict, args.dst_path, metadata=metadata) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert LoRA format") + parser.add_argument("--src", type=str, default="ai-toolkit", help="source format, ai-toolkit or sd-scripts") + parser.add_argument("--dst", type=str, default="sd-scripts", help="destination format, ai-toolkit or sd-scripts") + parser.add_argument("--src_path", type=str, default=None, help="source path") + parser.add_argument("--dst_path", type=str, default=None, help="destination path") + args = parser.parse_args() + main(args) From 2b07a92c8d970a8538a47dd1bcad3122da4e195a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 21 Aug 2024 12:30:23 +0900 Subject: [PATCH 096/582] Fix error in applying mask in Attention and add LoRA converter script --- README.md | 6 ++++++ library/flux_models.py | 5 +++-- networks/convert_flux_lora.py | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 43edbbed6..f4056851f 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,12 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 21, 2024 (update 2): +Fixed an error in applying mask in Attention. The attention mask was float, but it should be bool. + +Added a script `convert_flux_lora.py` to convert LoRA between sd-scripts format (BFL-based) and AI-toolkit format (Diffusers-based). See `--help` for details. BFL-based LoRA has a large module, so converting it to Diffusers format may reduce temporary memory usage in the inference environment. Note that re-conversion will increase the size of LoRA. + + Aug 21, 2024: The specification of `--apply_t5_attn_mask` has been changed. Previously, the T5 output was zero-padded, but now, two steps are taken: "1. Apply mask when encoding T5" and "2. Apply mask in the attention of Double Block". Fine tuning, LoRA training, and inference in `flux_mini_inference.py` have been changed. 
diff --git a/library/flux_models.py b/library/flux_models.py index 6f28da603..e38119cd7 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -708,9 +708,10 @@ def _forward( # make attention mask if not None attn_mask = None if txt_attention_mask is not None: - attn_mask = txt_attention_mask # b, seq_len + # F.scaled_dot_product_attention expects attn_mask to be bool for binary mask + attn_mask = txt_attention_mask.to(torch.bool) # b, seq_len attn_mask = torch.cat( - (attn_mask, torch.ones(attn_mask.shape[0], img.shape[1]).to(attn_mask.device)), dim=1 + (attn_mask, torch.ones(attn_mask.shape[0], img.shape[1], device=attn_mask.device, dtype=torch.bool)), dim=1 ) # b, seq_len + img_len # broadcast attn_mask to all heads diff --git a/networks/convert_flux_lora.py b/networks/convert_flux_lora.py index dd962ebfe..e9743534d 100644 --- a/networks/convert_flux_lora.py +++ b/networks/convert_flux_lora.py @@ -248,7 +248,7 @@ def convert_to_ai_toolkit(sds_sd, ait_sd, sds_key, ait_key): rank = down_weight.shape[0] alpha = sds_sd.pop(sds_key + ".alpha").item() # alpha is scalar scale = alpha / rank # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here - print(f"rank: {rank}, alpha: {alpha}, scale: {scale}") + # print(f"rank: {rank}, alpha: {alpha}, scale: {scale}") # calculate scale_down and scale_up to keep the same value. if scale is 4, scale_down is 2 and scale_up is 2 scale_down = scale From e1cd19c0c0ef55709e8eb1e5babe25045f65031f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 21 Aug 2024 21:04:10 +0900 Subject: [PATCH 097/582] add stochastic rounding, fix single block --- README.md | 19 ++++++-- flux_train.py | 95 ++++++++++++++++++++++++++++++++++---- library/adafactor_fused.py | 36 ++++++++++++++- library/flux_models.py | 1 + 4 files changed, 136 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index f4056851f..45349ba38 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,15 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 21, 2024 (update 3): +- There is a bug that `--full_bf16` option is enabled even if it is not specified in `flux_train.py`. The bug will be fixed sooner. __Please specify the `--full_bf16` option explicitly, especially when training with 24GB VRAM.__ +- Stochastic rounding is now implemented when `--fused_backward_pass` is specified. The implementation is +based on the code provided by 2kpr. Thank you so much! + - With this change, `--fused_backward_pass` is recommended over `--blockwise_fused_optimizers` when `--full_bf16` is specified. + - Please note that `--fused_backward_pass` is only supported with Adafactor. +- The sample command in [FLUX.1 fine-tuning](#flux1-fine-tuning) is updated to reflect these changes. +- Fixed `--single_blocks_to_swap` is not working in `flux_train.py`. + Aug 21, 2024 (update 2): Fixed an error in applying mask in Attention. The attention mask was float, but it should be bool. 
@@ -142,7 +151,7 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_t --learning_rate 5e-5 --max_train_epochs 4 --sdpa --highvram --cache_text_encoder_outputs_to_disk --cache_latents_to_disk --save_every_n_epochs 1 --optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0 ---blockwise_fused_optimizers --double_blocks_to_swap 6 --cpu_offload_checkpointing +--fused_backward_pass --double_blocks_to_swap 6 --cpu_offload_checkpointing --full_bf16 ``` (Combine the command into one line.) @@ -151,9 +160,13 @@ Sample image generation during training is not tested yet. Options are almost the same as LoRA training. The difference is `--blockwise_fused_optimizers`, `--double_blocks_to_swap` and `--cpu_offload_checkpointing`. `--single_blocks_to_swap` is also available. -`--blockwise_fused_optimizers` enables the fusing of the optimizer for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency. `--fused_optimizer_groups` is deprecated due to the addition of this option for FLUX.1 training. +`--full_bf16` enables the training with bf16 (weights and gradients). + +`--fused_backward_pass` enables the fusing of the optimizer step into the backward pass for each parameter. This reduces the memory usage during training. Only Adafactor optimizer is supported for now. Stochastic rounding is also enabled when `--fused_backward_pass` and `--full_bf16` are specified. + +`--blockwise_fused_optimizers` enables the fusing of the optimizer step into the backward pass for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency. `--blockwise_fused_optimizers` cannot be used with `--fused_backward_pass`. Stochastic rounding is not supported for now. -`--double_blocks_to_swap` and `--single_blocks_to_swap` are the number of double blocks and single blocks to swap. The default is None (no swap). These options must be combined with `--blockwise_fused_optimizers`. +`--double_blocks_to_swap` and `--single_blocks_to_swap` are the number of double blocks and single blocks to swap. The default is None (no swap). These options must be combined with `--fused_backward_pass` or `--blockwise_fused_optimizers`. `--double_blocks_to_swap` can be specified with `--single_blocks_to_swap`. The recommended maximum number of blocks to swap is 9 for double blocks and 18 for single blocks. `--cpu_offload_checkpointing` is to offload the gradient checkpointing to CPU. This reduces about 2GB of VRAM usage. 
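For readers unfamiliar with what "fusing the optimizer step into the backward pass" means mechanically, the following is a minimal, self-contained sketch using plain SGD and PyTorch's `register_post_accumulate_grad_hook`; the actual implementation in this patch uses Adafactor's per-parameter `step_param`, gradient clipping, and block swapping instead of this toy update:

```python
import torch

# placeholder model and learning rate; the real code hooks every FLUX parameter
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 1))
lr = 1e-2

def fused_sgd_hook(p: torch.Tensor):
    # runs as soon as this parameter's gradient is ready, inside backward()
    p.data.add_(p.grad, alpha=-lr)
    p.grad = None  # free the gradient immediately, so all grads never coexist in memory

for p in model.parameters():
    if p.requires_grad:
        p.register_post_accumulate_grad_hook(fused_sgd_hook)

loss = model(torch.randn(4, 8)).mean()
loss.backward()  # parameters are already updated here; no optimizer.step() call is made
```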
diff --git a/flux_train.py b/flux_train.py index ecb8a1086..bcf4b9564 100644 --- a/flux_train.py +++ b/flux_train.py @@ -277,7 +277,10 @@ def train(args): training_models = [] params_to_optimize = [] training_models.append(flux) - params_to_optimize.append({"params": list(flux.parameters()), "lr": args.learning_rate}) + name_and_params = list(flux.named_parameters()) + # single param group for now + params_to_optimize.append({"params": [p for _, p in name_and_params], "lr": args.learning_rate}) + param_names = [[n for n, _ in name_and_params]] # calculate number of trainable parameters n_params = 0 @@ -433,17 +436,89 @@ def train(args): import library.adafactor_fused library.adafactor_fused.patch_adafactor_fused(optimizer) - for param_group in optimizer.param_groups: - for parameter in param_group["params"]: - if parameter.requires_grad: - def __grad_hook(tensor: torch.Tensor, param_group=param_group): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None + double_blocks_to_swap = args.double_blocks_to_swap + single_blocks_to_swap = args.single_blocks_to_swap + num_double_blocks = len(flux.double_blocks) + num_single_blocks = len(flux.single_blocks) + handled_double_block_indices = set() + handled_single_block_indices = set() - parameter.register_post_accumulate_grad_hook(__grad_hook) + for param_group, param_name_group in zip(optimizer.param_groups, param_names): + for parameter, param_name in zip(param_group["params"], param_name_group): + if parameter.requires_grad: + grad_hook = None + + if double_blocks_to_swap: + if param_name.startswith("double_blocks"): + block_idx = int(param_name.split(".")[1]) + if ( + block_idx not in handled_double_block_indices + and block_idx >= (num_double_blocks - double_blocks_to_swap) - 1 + and block_idx < num_double_blocks - 1 + ): + # swap next (already backpropagated) block + handled_double_block_indices.add(block_idx) + block_idx_cpu = block_idx + 1 + block_idx_cuda = double_blocks_to_swap - (num_double_blocks - block_idx_cpu) + + # create swap hook + def create_double_swap_grad_hook(bidx, bidx_cuda): + def __grad_hook(tensor: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + # swap blocks if necessary + flux.double_blocks[bidx].to("cpu") + flux.double_blocks[bidx_cuda].to(accelerator.device) + # print(f"Move double block {bidx} to cpu and {bidx_cuda} to device") + + return __grad_hook + + grad_hook = create_double_swap_grad_hook(block_idx_cpu, block_idx_cuda) + if single_blocks_to_swap: + if param_name.startswith("single_blocks"): + block_idx = int(param_name.split(".")[1]) + if ( + block_idx not in handled_single_block_indices + and block_idx >= (num_single_blocks - single_blocks_to_swap) - 1 + and block_idx < num_single_blocks - 1 + ): + handled_single_block_indices.add(block_idx) + block_idx_cpu = block_idx + 1 + block_idx_cuda = single_blocks_to_swap - (num_single_blocks - block_idx_cpu) + # print(param_name, block_idx_cpu, block_idx_cuda) + + # create swap hook + def create_single_swap_grad_hook(bidx, bidx_cuda): + def __grad_hook(tensor: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + # swap blocks if 
necessary + flux.single_blocks[bidx].to("cpu") + flux.single_blocks[bidx_cuda].to(accelerator.device) + # print(f"Move single block {bidx} to cpu and {bidx_cuda} to device") + + return __grad_hook + + grad_hook = create_single_swap_grad_hook(block_idx_cpu, block_idx_cuda) + + if grad_hook is None: + + def __grad_hook(tensor: torch.Tensor, param_group=param_group): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + grad_hook = __grad_hook + + parameter.register_post_accumulate_grad_hook(grad_hook) elif args.blockwise_fused_optimizers: # prepare for additional optimizers and lr schedulers diff --git a/library/adafactor_fused.py b/library/adafactor_fused.py index bdfc32ced..b5afa236b 100644 --- a/library/adafactor_fused.py +++ b/library/adafactor_fused.py @@ -2,6 +2,32 @@ import torch from transformers import Adafactor +# stochastic rounding for bfloat16 +# The implementation was provided by 2kpr. Thank you very much! + +def copy_stochastic_(target: torch.Tensor, source: torch.Tensor): + """ + copies source into target using stochastic rounding + + Args: + target: the target tensor with dtype=bfloat16 + source: the target tensor with dtype=float32 + """ + # create a random 16 bit integer + result = torch.randint_like(source, dtype=torch.int32, low=0, high=(1 << 16)) + + # add the random number to the lower 16 bit of the mantissa + result.add_(source.view(dtype=torch.int32)) + + # mask off the lower 16 bit of the mantissa + result.bitwise_and_(-65536) # -65536 = FFFF0000 as a signed int32 + + # copy the higher 16 bit into the target tensor + target.copy_(result.view(dtype=torch.float32)) + + del result + + @torch.no_grad() def adafactor_step_param(self, p, group): if p.grad is None: @@ -48,7 +74,7 @@ def adafactor_step_param(self, p, group): lr = Adafactor._get_lr(group, state) beta2t = 1.0 - math.pow(state["step"], group["decay_rate"]) - update = (grad ** 2) + group["eps"][0] + update = (grad**2) + group["eps"][0] if factored: exp_avg_sq_row = state["exp_avg_sq_row"] exp_avg_sq_col = state["exp_avg_sq_col"] @@ -78,7 +104,12 @@ def adafactor_step_param(self, p, group): p_data_fp32.add_(-update) - if p.dtype in {torch.float16, torch.bfloat16}: + # if p.dtype in {torch.float16, torch.bfloat16}: + # p.copy_(p_data_fp32) + + if p.dtype == torch.bfloat16: + copy_stochastic_(p, p_data_fp32) + elif p.dtype == torch.float16: p.copy_(p_data_fp32) @@ -101,6 +132,7 @@ def adafactor_step(self, closure=None): return loss + def patch_adafactor_fused(optimizer: Adafactor): optimizer.step_param = adafactor_step_param.__get__(optimizer) optimizer.step = adafactor_step.__get__(optimizer) diff --git a/library/flux_models.py b/library/flux_models.py index e38119cd7..c98d52ec0 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1078,6 +1078,7 @@ def forward( if moving: self.single_blocks[to_cpu_block_index].to("cpu") # , non_blocking=True) # print(f"Moved single block {to_cpu_block_index} to cpu.") + to_cpu_block_index += 1 img = img[:, txt.shape[1] :, ...] 
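The value of stochastic rounding for `--full_bf16` training can be checked with a small standalone experiment (made-up magnitudes; `copy_stochastic_` is reproduced from the patch above so the snippet runs on its own): an update smaller than the bf16 spacing is lost entirely by round-to-nearest casting, but survives in expectation under stochastic rounding.

```python
import torch

def copy_stochastic_(target: torch.Tensor, source: torch.Tensor):
    # same recipe as the patched Adafactor: add 16 random low bits, then truncate the mantissa
    result = torch.randint_like(source, dtype=torch.int32, low=0, high=(1 << 16))
    result.add_(source.view(dtype=torch.int32))
    result.bitwise_and_(-65536)
    target.copy_(result.view(dtype=torch.float32))

n = 100_000
update = 1e-4  # much smaller than the bf16 spacing around 1.0 (2**-7 = 0.0078125)
param_fp32 = torch.full((n,), 1.0) + update

naive = param_fp32.bfloat16()                  # round-to-nearest loses the update
stochastic = torch.empty(n, dtype=torch.bfloat16)
copy_stochastic_(stochastic, param_fp32)

print(naive.float().mean().item() - 1.0)       # 0.0: the update vanished
print(stochastic.float().mean().item() - 1.0)  # ~1e-4: preserved in expectation
```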
From 98c91a762513bbce9ebce137da720a448a3da6c9 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Thu, 22 Aug 2024 12:37:41 +0900 Subject: [PATCH 098/582] Fix bug in FLUX multi GPU training --- README.md | 6 +++ flux_train.py | 29 ++++++------- flux_train_network.py | 10 +++-- library/flux_models.py | 6 ++- library/flux_utils.py | 40 ++++++++++++++---- library/strategy_flux.py | 4 +- library/train_util.py | 10 ++--- library/utils.py | 89 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 156 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 45349ba38..5125c6631 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,12 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 22, 2024: +Fixed a bug in multi-GPU training. It should work with fine-tuning and LoRA training. `--double_blocks_to_swap` and `--single_blocks_to_swap` cannot be used in multi-GPU training. + +`--disable_mmap_load_safetensors` option now works in `flux_train.py`. It speeds up model loading during training in WSL2. It is also effective in reducing memory usage when loading models during multi-GPU training. Please always check if the model is loaded correctly, as it uses a custom implementation of safetensors loading. + + Aug 21, 2024 (update 3): - There is a bug that `--full_bf16` option is enabled even if it is not specified in `flux_train.py`. The bug will be fixed sooner. __Please specify the `--full_bf16` option explicitly, especially when training with 24GB VRAM.__ - Stochastic rounding is now implemented when `--fused_backward_pass` is specified. The implementation is diff --git a/flux_train.py b/flux_train.py index bcf4b9564..e7d45e04d 100644 --- a/flux_train.py +++ b/flux_train.py @@ -174,7 +174,7 @@ def train(args): # load VAE for caching latents ae = None if cache_latents: - ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu") + ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors) ae.to(accelerator.device, dtype=weight_dtype) ae.requires_grad_(False) ae.eval() @@ -199,8 +199,8 @@ def train(args): strategy_base.TokenizeStrategy.set_strategy(flux_tokenize_strategy) # load clip_l, t5xxl for caching text encoder outputs - clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu") - t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu") + clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", args.disable_mmap_load_safetensors) + t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu", args.disable_mmap_load_safetensors) clip_l.eval() t5xxl.eval() clip_l.requires_grad_(False) @@ -228,7 +228,6 @@ def train(args): if args.sample_prompts is not None: logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") - tokenize_strategy: strategy_flux.FluxTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() prompts = load_prompts(args.sample_prompts) @@ -238,9 +237,9 @@ def train(args): for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: if p not in sample_prompts_te_outputs: logger.info(f"cache Text Encoder outputs for prompt: {p}") - tokens_and_masks = tokenize_strategy.tokenize(p) + tokens_and_masks = flux_tokenize_strategy.tokenize(p) sample_prompts_te_outputs[p] = 
text_encoding_strategy.encode_tokens( - tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask + flux_tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask ) accelerator.wait_for_everyone() @@ -251,7 +250,9 @@ def train(args): clean_memory_on_device(accelerator.device) # load FLUX - flux = flux_utils.load_flow_model(name, args.pretrained_model_name_or_path, weight_dtype, "cpu") + flux = flux_utils.load_flow_model( + name, args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors + ) if args.gradient_checkpointing: flux.enable_gradient_checkpointing(args.cpu_offload_checkpointing) @@ -419,7 +420,7 @@ def train(args): # if we doesn't swap blocks, we can move the model to device flux = accelerator.prepare(flux, device_placement=[not is_swapping_blocks]) if is_swapping_blocks: - flux.move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage + accelerator.unwrap_model(flux).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする @@ -439,8 +440,8 @@ def train(args): double_blocks_to_swap = args.double_blocks_to_swap single_blocks_to_swap = args.single_blocks_to_swap - num_double_blocks = len(flux.double_blocks) - num_single_blocks = len(flux.single_blocks) + num_double_blocks = 19 # len(flux.double_blocks) + num_single_blocks = 38 # len(flux.single_blocks) handled_double_block_indices = set() handled_single_block_indices = set() @@ -537,8 +538,8 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): double_blocks_to_swap = args.double_blocks_to_swap single_blocks_to_swap = args.single_blocks_to_swap - num_double_blocks = len(flux.double_blocks) - num_single_blocks = len(flux.single_blocks) + num_double_blocks = 19 # len(flux.double_blocks) + num_single_blocks = 38 # len(flux.single_blocks) for opt_idx, optimizer in enumerate(optimizers): for param_group in optimizer.param_groups: @@ -618,7 +619,7 @@ def optimizer_hook(parameter: torch.Tensor): ) if is_swapping_blocks: - flux.prepare_block_swap_before_forward() + accelerator.unwrap_model(flux).prepare_block_swap_before_forward() # For --sample_at_first flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs) @@ -660,7 +661,7 @@ def optimizer_hook(parameter: torch.Tensor): with torch.no_grad(): input_ids = [ids.to(accelerator.device) for ids in batch["input_ids_list"]] text_encoder_conds = text_encoding_strategy.encode_tokens( - tokenize_strategy, [clip_l, t5xxl], input_ids, args.apply_t5_attn_mask + flux_tokenize_strategy, [clip_l, t5xxl], input_ids, args.apply_t5_attn_mask ) if args.full_fp16: text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] diff --git a/flux_train_network.py b/flux_train_network.py index 49bd270c7..3e2057e91 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -57,19 +57,21 @@ def load_target_model(self, args, weight_dtype, accelerator): name = self.get_flux_model_name(args) # if we load to cpu, flux.to(fp8) takes a long time - model = flux_utils.load_flow_model(name, args.pretrained_model_name_or_path, weight_dtype, "cpu") + model = flux_utils.load_flow_model( + name, args.pretrained_model_name_or_path, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors + ) if args.split_mode: model = 
self.prepare_split_model(model, weight_dtype, accelerator) - clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu") + clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) clip_l.eval() # loading t5xxl to cpu takes a long time, so we should load to gpu in future - t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu") + t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) t5xxl.eval() - ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu") + ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model diff --git a/library/flux_models.py b/library/flux_models.py index c98d52ec0..c045aef6b 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -745,7 +745,9 @@ def custom_forward(*inputs): return custom_forward - return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), img, txt, vec, pe, txt_attention_mask) + return torch.utils.checkpoint.checkpoint( + create_custom_forward(self._forward), img, txt, vec, pe, txt_attention_mask, use_reentrant=False + ) else: return self._forward(img, txt, vec, pe, txt_attention_mask) @@ -836,7 +838,7 @@ def custom_forward(*inputs): return custom_forward - return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), x, vec, pe) + return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), x, vec, pe, use_reentrant=False) else: return self._forward(x, vec, pe) diff --git a/library/flux_utils.py b/library/flux_utils.py index 166cd833b..37166933a 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -9,7 +9,7 @@ from library import flux_models -from library.utils import setup_logging +from library.utils import setup_logging, MemoryEfficientSafeOpen setup_logging() import logging @@ -19,32 +19,54 @@ MODEL_VERSION_FLUX_V1 = "flux1" -def load_flow_model(name: str, ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device]) -> flux_models.Flux: +# temporary copy from sd3_utils TODO refactor +def load_safetensors(path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: torch.dtype = torch.float32): + if disable_mmap: + # return safetensors.torch.load(open(path, "rb").read()) + # use experimental loader + logger.info(f"Loading without mmap (experimental)") + state_dict = {} + with MemoryEfficientSafeOpen(path) as f: + for key in f.keys(): + state_dict[key] = f.get_tensor(key).to(device, dtype=dtype) + return state_dict + else: + try: + return load_file(path, device=device) + except: + return load_file(path) # prevent device invalid Error + + +def load_flow_model( + name: str, ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False +) -> flux_models.Flux: logger.info(f"Building Flux model {name}") with torch.device("meta"): model = flux_models.Flux(flux_models.configs[name].params).to(dtype) # load_sft doesn't support torch.device logger.info(f"Loading state dict from {ckpt_path}") - sd = load_file(ckpt_path, device=str(device)) + sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) info = model.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded Flux: {info}") return model -def load_ae(name: str, ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device]) -> flux_models.AutoEncoder: +def 
load_ae( + name: str, ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False +) -> flux_models.AutoEncoder: logger.info("Building AutoEncoder") with torch.device("meta"): ae = flux_models.AutoEncoder(flux_models.configs[name].ae_params).to(dtype) logger.info(f"Loading state dict from {ckpt_path}") - sd = load_file(ckpt_path, device=str(device)) + sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) info = ae.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded AE: {info}") return ae -def load_clip_l(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device]) -> CLIPTextModel: +def load_clip_l(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False) -> CLIPTextModel: logger.info("Building CLIP") CLIPL_CONFIG = { "_name_or_path": "clip-vit-large-patch14/", @@ -139,13 +161,13 @@ def load_clip_l(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.dev clip = CLIPTextModel._from_config(config) logger.info(f"Loading state dict from {ckpt_path}") - sd = load_file(ckpt_path, device=str(device)) + sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) info = clip.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded CLIP: {info}") return clip -def load_t5xxl(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device]) -> T5EncoderModel: +def load_t5xxl(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False) -> T5EncoderModel: T5_CONFIG_JSON = """ { "architectures": [ @@ -185,7 +207,7 @@ def load_t5xxl(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.devi t5xxl = T5EncoderModel._from_config(config) logger.info(f"Loading state dict from {ckpt_path}") - sd = load_file(ckpt_path, device=str(device)) + sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) info = t5xxl.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded T5xxl: {info}") return t5xxl diff --git a/library/strategy_flux.py b/library/strategy_flux.py index 737af390a..b3643cbfc 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -137,7 +137,7 @@ def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: t5_attn_mask = data["t5_attn_mask"] if self.apply_t5_attn_mask: - t5_out = self.mask_t5_attn(t5_out, t5_attn_mask) + t5_out = self.mask_t5_attn(t5_out, t5_attn_mask) # FIXME do not mask here!!! 
return [l_pooled, t5_out, txt_ids, t5_attn_mask] @@ -149,7 +149,7 @@ def cache_batch_outputs( tokens_and_masks = tokenize_strategy.tokenize(captions) with torch.no_grad(): - # attn_mask is not applied when caching to disk: it is applied when loading from disk + # attn_mask is not applied when caching to disk: it is applied when loading from disk FIXME apply mask when loading l_pooled, t5_out, txt_ids, _ = flux_text_encoding_strategy.encode_tokens( tokenize_strategy, models, tokens_and_masks, not self.cache_to_disk ) diff --git a/library/train_util.py b/library/train_util.py index 8929c192f..989758ad5 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1104,10 +1104,6 @@ def new_cache_text_encoder_outputs(self, models: List[Any], is_main_process: boo caching_strategy = TextEncoderOutputsCachingStrategy.get_strategy() batch_size = caching_strategy.batch_size or self.batch_size - # if cache to disk, don't cache TE outputs in non-main process - if caching_strategy.cache_to_disk and not is_main_process: - return - logger.info("caching Text Encoder outputs with caching strategy.") image_infos = list(self.image_data.values()) @@ -1120,9 +1116,9 @@ def new_cache_text_encoder_outputs(self, models: List[Any], is_main_process: boo # check disk cache exists and size of latents if caching_strategy.cache_to_disk: - info.text_encoder_outputs_npz = te_out_npz + info.text_encoder_outputs_npz = te_out_npz # set npz filename regardless of cache availability/main process cache_available = caching_strategy.is_disk_cached_outputs_expected(te_out_npz) - if cache_available: # do not add to batch + if cache_available or not is_main_process: # do not add to batch continue batch.append(info) @@ -2638,7 +2634,7 @@ def load_arbitrary_dataset(args, tokenizer=None) -> MinimalDataset: return train_dataset_group -def load_image(image_path, alpha=False): +def load_image(image_path, alpha=False): try: with Image.open(image_path) as image: if alpha: diff --git a/library/utils.py b/library/utils.py index 7de22d5a9..a16209979 100644 --- a/library/utils.py +++ b/library/utils.py @@ -153,6 +153,95 @@ def validate_metadata(metadata: Dict[str, Any]) -> Dict[str, str]: v.contiguous().view(torch.uint8).numpy().tofile(f) +class MemoryEfficientSafeOpen: + # does not support metadata loading + def __init__(self, filename): + self.filename = filename + self.header, self.header_size = self._read_header() + self.file = open(filename, "rb") + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.close() + + def keys(self): + return [k for k in self.header.keys() if k != "__metadata__"] + + def get_tensor(self, key): + if key not in self.header: + raise KeyError(f"Tensor '{key}' not found in the file") + + metadata = self.header[key] + offset_start, offset_end = metadata["data_offsets"] + + if offset_start == offset_end: + tensor_bytes = None + else: + # adjust offset by header size + self.file.seek(self.header_size + 8 + offset_start) + tensor_bytes = self.file.read(offset_end - offset_start) + + return self._deserialize_tensor(tensor_bytes, metadata) + + def _read_header(self): + with open(self.filename, "rb") as f: + header_size = struct.unpack(" Date: Thu, 22 Aug 2024 19:55:31 +0900 Subject: [PATCH 099/582] Fix --debug_dataset to work. 
--- flux_train.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/flux_train.py b/flux_train.py index e7d45e04d..410728d44 100644 --- a/flux_train.py +++ b/flux_train.py @@ -142,6 +142,12 @@ def train(args): args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, False, False ) ) + name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" + t5xxl_max_token_length = ( + args.t5xxl_max_token_length if args.t5xxl_max_token_length is not None else (256 if name == "schnell" else 512) + ) + strategy_base.TokenizeStrategy.set_strategy(strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length)) + train_dataset_group.set_current_strategies() train_util.debug_dataset(train_dataset_group, True) return From 2d8fa3387a4adfdc2e36f2582e4ffc21864569f0 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 22 Aug 2024 19:56:27 +0900 Subject: [PATCH 100/582] Fix to remove zero pad for t5xxl output --- README.md | 5 +++++ library/strategy_flux.py | 23 +++++++++++------------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 5125c6631..33b3a9a99 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,11 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 22, 2024 (update 2): +Fixed a bug that the embedding was zero-padded when `--apply_t5_attn_mask` option was applied. Also, the cache file for text encoder outputs now records whether the mask is applied or not. Please note that the cache file will be recreated when switching the `--apply_t5_attn_mask` option. + +Added a script to extract LoRA from the difference between the two models of FLUX.1. Use `networks/flux_extract_lora.py`. See `--help` for details. Normally, more than 50GB of memory is required, but specifying the `--mem_eff_safe_open` option significantly reduces memory usage. However, this option is a custom implementation, so unexpected problems may occur. Please always check if the model is loaded correctly. + Aug 22, 2024: Fixed a bug in multi-GPU training. It should work with fine-tuning and LoRA training. `--double_blocks_to_swap` and `--single_blocks_to_swap` cannot be used in multi-GPU training. 
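The LoRA extraction mentioned above is, at its core, a truncated SVD of the weight difference between the two models; the full script added in the next patch also filters keys, clamps outliers by quantile, and writes sd-scripts metadata. A sketch of just the math, with illustrative shapes and a synthetic low-rank drift:

```python
import torch

rank = 4
w_org = torch.randn(768, 768)
w_tuned = w_org + 0.01 * (torch.randn(768, 16) @ torch.randn(16, 768))  # low-rank-ish drift

diff = (w_tuned - w_org).float()
U, S, Vh = torch.linalg.svd(diff)
up = (U[:, :rank] @ torch.diag(S[:rank])).contiguous()  # becomes lora_up   (out_dim, rank)
down = Vh[:rank, :].contiguous()                        # becomes lora_down (rank, in_dim)

rel_err = torch.norm(diff - up @ down) / torch.norm(diff)
print(f"relative error of the rank-{rank} approximation: {rel_err.item():.3f}")
```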
diff --git a/library/strategy_flux.py b/library/strategy_flux.py index b3643cbfc..d52b3b8dd 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -22,7 +22,7 @@ class FluxTokenizeStrategy(TokenizeStrategy): - def __init__(self, t5xxl_max_length: int = 256, tokenizer_cache_dir: Optional[str] = None) -> None: + def __init__(self, t5xxl_max_length: int = 512, tokenizer_cache_dir: Optional[str] = None) -> None: self.t5xxl_max_length = t5xxl_max_length self.clip_l = self._load_tokenizer(CLIPTokenizer, CLIP_L_TOKENIZER_ID, tokenizer_cache_dir=tokenizer_cache_dir) self.t5xxl = self._load_tokenizer(T5TokenizerFast, T5_XXL_TOKENIZER_ID, tokenizer_cache_dir=tokenizer_cache_dir) @@ -120,25 +120,24 @@ def is_disk_cached_outputs_expected(self, npz_path: str): return False if "t5_attn_mask" not in npz: return False + if "apply_t5_attn_mask" not in npz: + return False + npz_apply_t5_attn_mask = npz["apply_t5_attn_mask"] + if npz_apply_t5_attn_mask != self.apply_t5_attn_mask: + return False except Exception as e: logger.error(f"Error loading file: {npz_path}") raise e return True - def mask_t5_attn(self, t5_out: np.ndarray, t5_attn_mask: np.ndarray) -> np.ndarray: - return t5_out * np.expand_dims(t5_attn_mask, -1) - def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: data = np.load(npz_path) l_pooled = data["l_pooled"] t5_out = data["t5_out"] txt_ids = data["txt_ids"] t5_attn_mask = data["t5_attn_mask"] - - if self.apply_t5_attn_mask: - t5_out = self.mask_t5_attn(t5_out, t5_attn_mask) # FIXME do not mask here!!! - + # apply_t5_attn_mask should be same as self.apply_t5_attn_mask return [l_pooled, t5_out, txt_ids, t5_attn_mask] def cache_batch_outputs( @@ -149,10 +148,8 @@ def cache_batch_outputs( tokens_and_masks = tokenize_strategy.tokenize(captions) with torch.no_grad(): - # attn_mask is not applied when caching to disk: it is applied when loading from disk FIXME apply mask when loading - l_pooled, t5_out, txt_ids, _ = flux_text_encoding_strategy.encode_tokens( - tokenize_strategy, models, tokens_and_masks, not self.cache_to_disk - ) + # attn_mask is applied in text_encoding_strategy.encode_tokens if apply_t5_attn_mask is True + l_pooled, t5_out, txt_ids, _ = flux_text_encoding_strategy.encode_tokens(tokenize_strategy, models, tokens_and_masks) if l_pooled.dtype == torch.bfloat16: l_pooled = l_pooled.float() @@ -171,6 +168,7 @@ def cache_batch_outputs( t5_out_i = t5_out[i] txt_ids_i = txt_ids[i] t5_attn_mask_i = t5_attn_mask[i] + apply_t5_attn_mask_i = self.apply_t5_attn_mask if self.cache_to_disk: np.savez( @@ -179,6 +177,7 @@ def cache_batch_outputs( t5_out=t5_out_i, txt_ids=txt_ids_i, t5_attn_mask=t5_attn_mask_i, + apply_t5_attn_mask=apply_t5_attn_mask_i, ) else: info.text_encoder_outputs = (l_pooled_i, t5_out_i, txt_ids_i, t5_attn_mask_i) From b0a980844a2e02b1b1ae4cf615ae489dbf8ece67 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 22 Aug 2024 19:57:29 +0900 Subject: [PATCH 101/582] added a script to extract LoRA --- networks/flux_extract_lora.py | 219 ++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 networks/flux_extract_lora.py diff --git a/networks/flux_extract_lora.py b/networks/flux_extract_lora.py new file mode 100644 index 000000000..3ee6e816d --- /dev/null +++ b/networks/flux_extract_lora.py @@ -0,0 +1,219 @@ +# extract approximating LoRA by svd from two FLUX models +# The code is based on https://github.com/cloneofsimo/lora/blob/develop/lora_diffusion/cli_svd.py +# Thanks to cloneofsimo! 
+ +import argparse +import json +import os +import time +import torch +from safetensors.torch import load_file, save_file +from safetensors import safe_open +from tqdm import tqdm +from library import flux_utils, sai_model_spec, model_util, sdxl_model_util +import lora +from library.utils import MemoryEfficientSafeOpen +from library.utils import setup_logging +from networks import lora_flux + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +# CLAMP_QUANTILE = 0.99 +# MIN_DIFF = 1e-1 + + +def save_to_file(file_name, state_dict, metadata, dtype): + if dtype is not None: + for key in list(state_dict.keys()): + if type(state_dict[key]) == torch.Tensor: + state_dict[key] = state_dict[key].to(dtype) + + save_file(state_dict, file_name, metadata=metadata) + + +def svd( + model_org=None, + model_tuned=None, + save_to=None, + dim=4, + device=None, + save_precision=None, + clamp_quantile=0.99, + min_diff=0.01, + no_metadata=False, + mem_eff_safe_open=False, +): + def str_to_dtype(p): + if p == "float": + return torch.float + if p == "fp16": + return torch.float16 + if p == "bf16": + return torch.bfloat16 + return None + + calc_dtype = torch.float + save_dtype = str_to_dtype(save_precision) + store_device = "cpu" + + # open models + lora_weights = {} + if not mem_eff_safe_open: + # use original safetensors.safe_open + open_fn = lambda fn: safe_open(fn, framework="pt") + else: + logger.info("Using memory efficient safe_open") + open_fn = lambda fn: MemoryEfficientSafeOpen(fn) + + with open_fn(model_org) as fo: + # filter keys + keys = [] + for key in fo.keys(): + if not ("single_block" in key or "double_block" in key): + continue + if ".bias" in key: + continue + if "norm" in key: + continue + keys.append(key) + + with open_fn(model_tuned) as ft: + for key in tqdm(keys): + # get tensors and calculate difference + value_o = fo.get_tensor(key) + value_t = ft.get_tensor(key) + mat = value_t.to(calc_dtype) - value_o.to(calc_dtype) + del value_o, value_t + + # extract LoRA weights + if device: + mat = mat.to(device) + out_dim, in_dim = mat.size()[0:2] + rank = min(dim, in_dim, out_dim) # LoRA rank cannot exceed the original dim + + mat = mat.squeeze() + + U, S, Vh = torch.linalg.svd(mat) + + U = U[:, :rank] + S = S[:rank] + U = U @ torch.diag(S) + + Vh = Vh[:rank, :] + + dist = torch.cat([U.flatten(), Vh.flatten()]) + hi_val = torch.quantile(dist, clamp_quantile) + low_val = -hi_val + + U = U.clamp(low_val, hi_val) + Vh = Vh.clamp(low_val, hi_val) + + U = U.to(store_device, dtype=save_dtype).contiguous() + Vh = Vh.to(store_device, dtype=save_dtype).contiguous() + + print(f"key: {key}, U: {U.size()}, Vh: {Vh.size()}") + lora_weights[key] = (U, Vh) + del mat, U, S, Vh + + # make state dict for LoRA + lora_sd = {} + for key, (up_weight, down_weight) in lora_weights.items(): + lora_name = key.replace(".weight", "").replace(".", "_") + lora_name = lora_flux.LoRANetwork.LORA_PREFIX_FLUX + "_" + lora_name + lora_sd[lora_name + ".lora_up.weight"] = up_weight + lora_sd[lora_name + ".lora_down.weight"] = down_weight + lora_sd[lora_name + ".alpha"] = torch.tensor(down_weight.size()[0]) # same as rank + + # minimum metadata + net_kwargs = {} + metadata = { + "ss_v2": str(False), + "ss_base_model_version": flux_utils.MODEL_VERSION_FLUX_V1, + "ss_network_module": "networks.lora_flux", + "ss_network_dim": str(dim), + "ss_network_alpha": str(float(dim)), + "ss_network_args": json.dumps(net_kwargs), + } + + if not no_metadata: + title = os.path.splitext(os.path.basename(save_to))[0] + 
sai_metadata = sai_model_spec.build_metadata(lora_sd, False, False, False, True, False, time.time(), title, flux="dev") + metadata.update(sai_metadata) + + save_to_file(save_to, lora_sd, metadata, save_dtype) + + logger.info(f"LoRA weights saved to {save_to}") + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_precision", + type=str, + default=None, + choices=[None, "float", "fp16", "bf16"], + help="precision in saving, same to merging if omitted / 保存時に精度を変更して保存する、省略時はfloat", + ) + parser.add_argument( + "--model_org", + type=str, + default=None, + required=True, + help="Original model: safetensors file / 元モデル、safetensors", + ) + parser.add_argument( + "--model_tuned", + type=str, + default=None, + required=True, + help="Tuned model, LoRA is difference of `original to tuned`: safetensors file / 派生モデル(生成されるLoRAは元→派生の差分になります)、ckptまたはsafetensors", + ) + parser.add_argument( + "--mem_eff_safe_open", + action="store_true", + help="use memory efficient safe_open. This is an experimental feature, use only when memory is not enough." + " / メモリ効率の良いsafe_openを使用する。実装は実験的なものなので、メモリが足りない場合のみ使用してください。", + ) + parser.add_argument( + "--save_to", + type=str, + default=None, + required=True, + help="destination file name: safetensors file / 保存先のファイル名、safetensors", + ) + parser.add_argument( + "--dim", type=int, default=4, help="dimension (rank) of LoRA (default 4) / LoRAの次元数(rank)(デフォルト4)" + ) + parser.add_argument( + "--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う" + ) + parser.add_argument( + "--clamp_quantile", + type=float, + default=0.99, + help="Quantile clamping value, float, (0-1). Default = 0.99 / 値をクランプするための分位点、float、(0-1)。デフォルトは0.99", + ) + # parser.add_argument( + # "--min_diff", + # type=float, + # default=0.01, + # help="Minimum difference between finetuned model and base to consider them different enough to extract, float, (0-1). 
Default = 0.01 /" + # + "LoRAを抽出するために元モデルと派生モデルの差分の最小値、float、(0-1)。デフォルトは0.01", + # ) + parser.add_argument( + "--no_metadata", + action="store_true", + help="do not save sai modelspec metadata (minimum ss_metadata for LoRA is saved) / " + + "sai modelspecのメタデータを保存しない(LoRAの最低限のss_metadataは保存される)", + ) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + svd(**vars(args)) From bf9f798985dd75fc2dd1fbc8c8dc775c92176854 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 22 Aug 2024 19:59:38 +0900 Subject: [PATCH 102/582] chore: fix typos, remove debug print --- networks/flux_extract_lora.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/networks/flux_extract_lora.py b/networks/flux_extract_lora.py index 3ee6e816d..63ab2960c 100644 --- a/networks/flux_extract_lora.py +++ b/networks/flux_extract_lora.py @@ -68,10 +68,10 @@ def str_to_dtype(p): logger.info("Using memory efficient safe_open") open_fn = lambda fn: MemoryEfficientSafeOpen(fn) - with open_fn(model_org) as fo: + with open_fn(model_org) as f_org: # filter keys keys = [] - for key in fo.keys(): + for key in f_org.keys(): if not ("single_block" in key or "double_block" in key): continue if ".bias" in key: @@ -80,11 +80,11 @@ def str_to_dtype(p): continue keys.append(key) - with open_fn(model_tuned) as ft: + with open_fn(model_tuned) as f_tuned: for key in tqdm(keys): # get tensors and calculate difference - value_o = fo.get_tensor(key) - value_t = ft.get_tensor(key) + value_o = f_org.get_tensor(key) + value_t = f_tuned.get_tensor(key) mat = value_t.to(calc_dtype) - value_o.to(calc_dtype) del value_o, value_t @@ -114,7 +114,7 @@ def str_to_dtype(p): U = U.to(store_device, dtype=save_dtype).contiguous() Vh = Vh.to(store_device, dtype=save_dtype).contiguous() - print(f"key: {key}, U: {U.size()}, Vh: {Vh.size()}") + # print(f"key: {key}, U: {U.size()}, Vh: {Vh.size()}") lora_weights[key] = (U, Vh) del mat, U, S, Vh From 81411a398eb4ce28d84cc2da8238ff013d40d62f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 22 Aug 2024 22:02:29 +0900 Subject: [PATCH 103/582] speed up getting image sizes --- library/strategy_base.py | 7 ++++++- library/strategy_flux.py | 9 +++------ library/strategy_sd.py | 12 ++++-------- library/strategy_sd3.py | 9 +++------ library/train_util.py | 23 ++++++++++++++++++++++- 5 files changed, 38 insertions(+), 22 deletions(-) diff --git a/library/strategy_base.py b/library/strategy_base.py index e7d3a97ef..6a01c30a5 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -204,9 +204,14 @@ def cache_to_disk(self): def batch_size(self): return self._batch_size - def get_image_size_from_disk_cache_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: + @property + def cache_suffix(self): raise NotImplementedError + def get_image_size_from_disk_cache_path(self, absolute_path: str, npz_path: str) -> Tuple[Optional[int], Optional[int]]: + w, h = os.path.splitext(npz_path)[0].split("_")[-2].split("x") + return int(w), int(h) + def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: raise NotImplementedError diff --git a/library/strategy_flux.py b/library/strategy_flux.py index d52b3b8dd..887113ca1 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -189,12 +189,9 @@ class FluxLatentsCachingStrategy(LatentsCachingStrategy): def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: super().__init__(cache_to_disk, 
batch_size, skip_disk_cache_validity_check) - def get_image_size_from_disk_cache_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: - npz_file = glob.glob(os.path.splitext(absolute_path)[0] + "_*" + FluxLatentsCachingStrategy.FLUX_LATENTS_NPZ_SUFFIX) - if len(npz_file) == 0: - return None, None - w, h = os.path.splitext(npz_file[0])[0].split("_")[-2].split("x") - return int(w), int(h) + @property + def cache_suffix(self) -> str: + return FluxLatentsCachingStrategy.FLUX_LATENTS_NPZ_SUFFIX def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: return ( diff --git a/library/strategy_sd.py b/library/strategy_sd.py index 83ffaa31b..af472e491 100644 --- a/library/strategy_sd.py +++ b/library/strategy_sd.py @@ -108,14 +108,10 @@ def __init__(self, sd: bool, cache_to_disk: bool, batch_size: int, skip_disk_cac self.suffix = ( SdSdxlLatentsCachingStrategy.SD_LATENTS_NPZ_SUFFIX if sd else SdSdxlLatentsCachingStrategy.SDXL_LATENTS_NPZ_SUFFIX ) - - def get_image_size_from_disk_cache_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: - # does not include old npz - npz_file = glob.glob(os.path.splitext(absolute_path)[0] + "_*" + self.suffix) - if len(npz_file) == 0: - return None, None - w, h = os.path.splitext(npz_file[0])[0].split("_")[-2].split("x") - return int(w), int(h) + + @property + def cache_suffix(self) -> str: + return self.suffix def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: # support old .npz diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py index a22818903..9fde02084 100644 --- a/library/strategy_sd3.py +++ b/library/strategy_sd3.py @@ -222,12 +222,9 @@ class Sd3LatentsCachingStrategy(LatentsCachingStrategy): def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check) - def get_image_size_from_disk_cache_path(self, absolute_path: str) -> Tuple[Optional[int], Optional[int]]: - npz_file = glob.glob(os.path.splitext(absolute_path)[0] + "_*" + Sd3LatentsCachingStrategy.SD3_LATENTS_NPZ_SUFFIX) - if len(npz_file) == 0: - return None, None - w, h = os.path.splitext(npz_file[0])[0].split("_")[-2].split("x") - return int(w), int(h) + @property + def cache_suffix(self) -> str: + return Sd3LatentsCachingStrategy.SD3_LATENTS_NPZ_SUFFIX def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: return ( diff --git a/library/train_util.py b/library/train_util.py index 989758ad5..dcc01f6f7 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1739,9 +1739,30 @@ def load_dreambooth_dir(subset: DreamBoothSubset): strategy = LatentsCachingStrategy.get_strategy() if strategy is not None: logger.info("get image size from name of cache files") + + # make image path to npz path mapping + npz_paths = glob.glob(os.path.join(subset.image_dir, "*" + strategy.cache_suffix)) + npz_paths.sort() + npz_path_index = 0 + size_set_count = 0 for i, img_path in enumerate(tqdm(img_paths)): - w, h = strategy.get_image_size_from_disk_cache_path(img_path) + l = len(os.path.splitext(img_path)[0]) # remove extension + found = False + while npz_path_index < len(npz_paths): # until found or end of npz_paths + # npz_paths are sorted, so if npz_path > img_path, img_path is not found + if npz_paths[npz_path_index][:l] > img_path[:l]: + break + if npz_paths[npz_path_index][:l] == img_path[:l]: # found + found = True + break + npz_path_index += 1 # next 
npz_path + + if found: + w, h = strategy.get_image_size_from_disk_cache_path(img_path, npz_paths[npz_path_index]) + else: + w, h = None, None + if w is not None and h is not None: sizes[i] = [w, h] size_set_count += 1 From 2e89cd2cc634c27add7a04c21fcb6d0e16716a2b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 24 Aug 2024 12:39:54 +0900 Subject: [PATCH 104/582] Fix issue with attention mask not being applied in single blocks --- README.md | 3 ++ flux_train_network.py | 4 +-- library/flux_models.py | 62 +++++++++++++++++++++--------------------- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 33b3a9a99..4151bf44e 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,9 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 24, 2024: +Fixed an issue where the attention mask was not applied in single blocks when `--apply_t5_attn_mask` was specified. + Aug 22, 2024 (update 2): Fixed a bug that the embedding was zero-padded when `--apply_t5_attn_mask` option was applied. Also, the cache file for text encoder outputs now records whether the mask is applied or not. Please note that the cache file will be recreated when switching the `--apply_t5_attn_mask` option. diff --git a/flux_train_network.py b/flux_train_network.py index 3e2057e91..82f77a77e 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -243,7 +243,7 @@ def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None, txt_a self.flux_upper.to("cpu") clean_memory_on_device(self.target_device) self.flux_lower.to(self.target_device) - return self.flux_lower(img, txt, vec, pe) + return self.flux_lower(img, txt, vec, pe, txt_attention_mask) wrapper = FluxUpperLowerWrapper(self.flux_upper, flux, accelerator.device) clean_memory_on_device(accelerator.device) @@ -352,7 +352,7 @@ def get_noise_pred_and_target( intermediate_txt.requires_grad_(True) vec.requires_grad_(True) pe.requires_grad_(True) - model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe) + model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask) # unpack latents model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) diff --git a/library/flux_models.py b/library/flux_models.py index c045aef6b..b5726c298 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -752,18 +752,6 @@ def custom_forward(*inputs): else: return self._forward(img, txt, vec, pe, txt_attention_mask) - # def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor): - # if self.training and self.gradient_checkpointing: - # def create_custom_forward(func): - # def custom_forward(*inputs): - # return func(*inputs) - # return custom_forward - # return torch.utils.checkpoint.checkpoint( - # create_custom_forward(self._forward), img, txt, vec, pe, use_reentrant=USE_REENTRANT - # ) - # else: - # return self._forward(img, txt, vec, pe) - class SingleStreamBlock(nn.Module): """ @@ -809,7 +797,7 @@ def disable_gradient_checkpointing(self): self.gradient_checkpointing = False self.cpu_offload_checkpointing = False - def _forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: + def _forward(self, x: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None) -> Tensor: mod, _ = self.modulation(vec) x_mod = (1 + mod.scale) * 
self.pre_norm(x) + mod.shift qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) @@ -817,16 +805,35 @@ def _forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) q, k = self.norm(q, k, v) + # make attention mask if not None + attn_mask = None + if txt_attention_mask is not None: + # F.scaled_dot_product_attention expects attn_mask to be bool for binary mask + attn_mask = txt_attention_mask.to(torch.bool) # b, seq_len + attn_mask = torch.cat( + ( + attn_mask, + torch.ones( + attn_mask.shape[0], x.shape[1] - txt_attention_mask.shape[1], device=attn_mask.device, dtype=torch.bool + ), + ), + dim=1, + ) # b, seq_len + img_len = x_len + + # broadcast attn_mask to all heads + attn_mask = attn_mask[:, None, None, :].expand(-1, q.shape[1], q.shape[2], -1) + # compute attention - attn = attention(q, k, v, pe=pe) + attn = attention(q, k, v, pe=pe, attn_mask=attn_mask) + # compute activation in mlp stream, cat again and run second linear layer output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) return x + mod.gate * output - def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: + def forward(self, x: Tensor, vec: Tensor, pe: Tensor, txt_attention_mask: Optional[Tensor] = None) -> Tensor: if self.training and self.gradient_checkpointing: if not self.cpu_offload_checkpointing: - return checkpoint(self._forward, x, vec, pe, use_reentrant=False) + return checkpoint(self._forward, x, vec, pe, txt_attention_mask, use_reentrant=False) # cpu offload checkpointing @@ -838,19 +845,11 @@ def custom_forward(*inputs): return custom_forward - return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), x, vec, pe, use_reentrant=False) + return torch.utils.checkpoint.checkpoint( + create_custom_forward(self._forward), x, vec, pe, txt_attention_mask, use_reentrant=False + ) else: - return self._forward(x, vec, pe) - - # def forward(self, x: Tensor, vec: Tensor, pe: Tensor): - # if self.training and self.gradient_checkpointing: - # def create_custom_forward(func): - # def custom_forward(*inputs): - # return func(*inputs) - # return custom_forward - # return torch.utils.checkpoint.checkpoint(create_custom_forward(self._forward), x, vec, pe, use_reentrant=USE_REENTRANT) - # else: - # return self._forward(x, vec, pe) + return self._forward(x, vec, pe, txt_attention_mask) class LastLayer(nn.Module): @@ -1053,7 +1052,7 @@ def forward( if not self.single_blocks_to_swap: for block in self.single_blocks: - img = block(img, vec=vec, pe=pe) + img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) else: # make sure first n blocks are on cuda, and last n blocks are on cpu at beginning for block_idx in range(self.single_blocks_to_swap): @@ -1075,7 +1074,7 @@ def forward( block.to(self.device) # move to cuda # print(f"Moved single block {block_idx} to cuda.") - img = block(img, vec=vec, pe=pe) + img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) if moving: self.single_blocks[to_cpu_block_index].to("cpu") # , non_blocking=True) @@ -1250,10 +1249,11 @@ def forward( txt: Tensor, vec: Tensor | None = None, pe: Tensor | None = None, + txt_attention_mask: Tensor | None = None, ) -> Tensor: img = torch.cat((txt, img), 1) for block in self.single_blocks: - img = block(img, vec=vec, pe=pe) + img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) img = img[:, txt.shape[1] :, ...] 
img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) From cf689e7aa697877a0eee58622035ab702ce59d3e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 24 Aug 2024 16:35:43 +0900 Subject: [PATCH 105/582] feat: Add option to split projection layers and apply LoRA --- README.md | 14 ++ networks/check_lora_weights.py | 2 +- networks/convert_flux_lora.py | 51 ++++-- networks/lora_flux.py | 326 +++++++++++++++++++++++++++------ 4 files changed, 325 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index 4151bf44e..7d326a867 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,20 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 24, 2024 (update 2): + +__Experimental__ Added an option to split the projection layers of q/k/v/txt in the attention and apply LoRA to each of them in FLUX.1 LoRA training. Specify `"split_qkv=True"` in network_args like `--network_args "split_qkv=True"` (`train_blocks` is also available). + +The number of parameters may increase slightly, so the expressiveness may increase, but the training time may be longer. No detailed verification has been done. + +This implementation is experimental, so it may be deprecated or changed in the future. + +The .safetensors file of the trained model is compatible with the normal LoRA model of sd-scripts, so it should be usable in inference environments such as ComfyUI as it is. Also, converting it to AI-toolkit (Diffusers) format with `convert_flux_lora.py` will reduce the size. It should be no problem to convert it if you use it in the inference environment. + +Technical details: In the implementation of Black Forest Labs' model, the projection layers of q/k/v (and txt in single blocks) are concatenated into one. If LoRA is added there as it is, the LoRA module is only one, and the dimension is large. In contrast, in the implementation of Diffusers, the projection layers of q/k/v/txt are separated. Therefore, the LoRA module is applied to q/k/v/txt separately, and the dimension is smaller. This option is for training LoRA similar to the latter. + +The compatibility of the saved model (state dict) is ensured by concatenating the weights of multiple LoRAs. However, since there are zero weights in some parts, the model size will be large. + Aug 24, 2024: Fixed an issue where the attention mask was not applied in single blocks when `--apply_t5_attn_mask` was specified. 
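The "Technical details" above can be illustrated with a small sketch (assumed rank and random values, not the repository's exact save-time code): with `split_qkv`, each of q/k/v gets its own small LoRA pair, and saving in the usual sd-scripts format concatenates them back into a single pair for the fused projection, where the up weight becomes block-diagonal. Those zero blocks are the extra parameters the note warns about, and the `is_sparse` check in the converter change that follows detects exactly this structure.

```python
import torch

# fused qkv projection: one Linear(3072 -> 3*3072); per-split LoRA pairs of rank 4
rank, in_dim, out_dims = 4, 3072, [3072, 3072, 3072]
downs = [0.02 * torch.randn(rank, in_dim) for _ in out_dims]       # per-split lora_down
ups = [0.02 * torch.randn(out_dim, rank) for out_dim in out_dims]  # per-split lora_up

merged_down = torch.cat(downs, dim=0)                              # (3*rank, in_dim)
merged_up = torch.zeros(sum(out_dims), len(out_dims) * rank)       # (9216, 3*rank), mostly zeros
row = 0
for i, (out_dim, up) in enumerate(zip(out_dims, ups)):
    merged_up[row : row + out_dim, i * rank : (i + 1) * rank] = up  # block-diagonal layout
    row += out_dim

# the concatenated pair produces the same delta as the three splits applied separately
x = torch.randn(2, in_dim)
split_delta = torch.cat([x @ d.T @ u.T for d, u in zip(downs, ups)], dim=-1)
fused_delta = x @ merged_down.T @ merged_up.T
assert torch.allclose(split_delta, fused_delta, atol=1e-4)
```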
diff --git a/networks/check_lora_weights.py b/networks/check_lora_weights.py index 794659c94..b5b5e61ae 100644 --- a/networks/check_lora_weights.py +++ b/networks/check_lora_weights.py @@ -18,7 +18,7 @@ def main(file): keys = list(sd.keys()) for key in keys: - if "lora_up" in key or "lora_down" in key: + if "lora_up" in key or "lora_down" in key or "lora_A" in key or "lora_B" in key: values.append((key, sd[key])) print(f"number of LoRA modules: {len(values)}") diff --git a/networks/convert_flux_lora.py b/networks/convert_flux_lora.py index e9743534d..bd4c1cf78 100644 --- a/networks/convert_flux_lora.py +++ b/networks/convert_flux_lora.py @@ -266,11 +266,12 @@ def convert_to_ai_toolkit_cat(sds_sd, ait_sd, sds_key, ait_keys, dims=None): if sds_key + ".lora_down.weight" not in sds_sd: return down_weight = sds_sd.pop(sds_key + ".lora_down.weight") + up_weight = sds_sd.pop(sds_key + ".lora_up.weight") + sd_lora_rank = down_weight.shape[0] # scale weight by alpha and dim - rank = down_weight.shape[0] alpha = sds_sd.pop(sds_key + ".alpha") - scale = alpha / rank + scale = alpha / sd_lora_rank # calculate scale_down and scale_up scale_down = scale @@ -279,23 +280,49 @@ def convert_to_ai_toolkit_cat(sds_sd, ait_sd, sds_key, ait_keys, dims=None): scale_down *= 2 scale_up /= 2 - ait_down_keys = [k + ".lora_A.weight" for k in ait_keys] - ait_up_keys = [k + ".lora_B.weight" for k in ait_keys] - - num_splits = len(ait_keys) - up_weight = sds_sd.pop(sds_key + ".lora_up.weight") - - # down_weight is copied to each split - ait_sd.update({k: down_weight * scale_down for k in ait_down_keys}) + down_weight = down_weight * scale_down + up_weight = up_weight * scale_up # calculate dims if not provided + num_splits = len(ait_keys) if dims is None: dims = [up_weight.shape[0] // num_splits] * num_splits else: assert sum(dims) == up_weight.shape[0] - # up_weight is split to each split - ait_sd.update({k: v * scale_up for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))}) + # check upweight is sparse or not + is_sparse = False + if sd_lora_rank % num_splits == 0: + ait_rank = sd_lora_rank // num_splits + is_sparse = True + i = 0 + for j in range(len(dims)): + for k in range(len(dims)): + if j == k: + continue + is_sparse = is_sparse and torch.all(up_weight[i : i + dims[j], k * ait_rank : (k + 1) * ait_rank] == 0) + i += dims[j] + if is_sparse: + logger.info(f"weight is sparse: {sds_key}") + + # make ai-toolkit weight + ait_down_keys = [k + ".lora_A.weight" for k in ait_keys] + ait_up_keys = [k + ".lora_B.weight" for k in ait_keys] + if not is_sparse: + # down_weight is copied to each split + ait_sd.update({k: down_weight for k in ait_down_keys}) + + # up_weight is split to each split + ait_sd.update({k: v for k, v in zip(ait_up_keys, torch.split(up_weight, dims, dim=0))}) + else: + # down_weight is chunked to each split + ait_sd.update({k: v for k, v in zip(ait_down_keys, torch.chunk(down_weight, num_splits, dim=0))}) + + # up_weight is sparse: only non-zero values are copied to each split + i = 0 + for j in range(len(dims)): + ait_sd[ait_up_keys[j]] = up_weight[i : i + dims[j], j * ait_rank : (j + 1) * ait_rank].contiguous() + i += dims[j] def convert_sd_scripts_to_ai_toolkit(sds_sd): diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 4da33542f..efc7847ed 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -39,6 +39,7 @@ def __init__( dropout=None, rank_dropout=None, module_dropout=None, + split_dims: Optional[List[int]] = None, ): """if alpha == 0 or None, alpha is rank (no 
scaling).""" super().__init__() @@ -52,16 +53,34 @@ def __init__( out_dim = org_module.out_features self.lora_dim = lora_dim + self.split_dims = split_dims + + if split_dims is None: + if org_module.__class__.__name__ == "Conv2d": + kernel_size = org_module.kernel_size + stride = org_module.stride + padding = org_module.padding + self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) + self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) + else: + self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) + self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) - if org_module.__class__.__name__ == "Conv2d": - kernel_size = org_module.kernel_size - stride = org_module.stride - padding = org_module.padding - self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) - self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) + torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) + torch.nn.init.zeros_(self.lora_up.weight) else: - self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) - self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) + # conv2d not supported + assert sum(split_dims) == out_dim, "sum of split_dims must be equal to out_dim" + assert org_module.__class__.__name__ == "Linear", "split_dims is only supported for Linear" + # print(f"split_dims: {split_dims}") + self.lora_down = torch.nn.ModuleList( + [torch.nn.Linear(in_dim, self.lora_dim, bias=False) for _ in range(len(split_dims))] + ) + self.lora_up = torch.nn.ModuleList([torch.nn.Linear(self.lora_dim, split_dim, bias=False) for split_dim in split_dims]) + for lora_down in self.lora_down: + torch.nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5)) + for lora_up in self.lora_up: + torch.nn.init.zeros_(lora_up.weight) if type(alpha) == torch.Tensor: alpha = alpha.detach().float().numpy() # without casting, bf16 causes error @@ -70,9 +89,6 @@ def __init__( self.register_buffer("alpha", torch.tensor(alpha)) # 定数として扱える # same as microsoft's - torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) - torch.nn.init.zeros_(self.lora_up.weight) - self.multiplier = multiplier self.org_module = org_module # remove in applying self.dropout = dropout @@ -92,30 +108,56 @@ def forward(self, x): if torch.rand(1) < self.module_dropout: return org_forwarded - lx = self.lora_down(x) - - # normal dropout - if self.dropout is not None and self.training: - lx = torch.nn.functional.dropout(lx, p=self.dropout) + if self.split_dims is None: + lx = self.lora_down(x) + + # normal dropout + if self.dropout is not None and self.training: + lx = torch.nn.functional.dropout(lx, p=self.dropout) + + # rank dropout + if self.rank_dropout is not None and self.training: + mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout + if len(lx.size()) == 3: + mask = mask.unsqueeze(1) # for Text Encoder + elif len(lx.size()) == 4: + mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d + lx = lx * mask + + # scaling for rank dropout: treat as if the rank is changed + # maskから計算することも考えられるが、augmentation的な効果を期待してrank_dropoutを用いる + scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability + else: + scale = self.scale - # rank dropout - if self.rank_dropout is not None and self.training: - mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout - if len(lx.size()) == 3: 
- mask = mask.unsqueeze(1) # for Text Encoder - elif len(lx.size()) == 4: - mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d - lx = lx * mask + lx = self.lora_up(lx) - # scaling for rank dropout: treat as if the rank is changed - # maskから計算することも考えられるが、augmentation的な効果を期待してrank_dropoutを用いる - scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability + return org_forwarded + lx * self.multiplier * scale else: - scale = self.scale + lxs = [lora_down(x) for lora_down in self.lora_down] + + # normal dropout + if self.dropout is not None and self.training: + lxs = [torch.nn.functional.dropout(lx, p=self.dropout) for lx in lxs] + + # rank dropout + if self.rank_dropout is not None and self.training: + masks = [torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout for lx in lxs] + for i in range(len(lxs)): + if len(lx.size()) == 3: + masks[i] = masks[i].unsqueeze(1) + elif len(lx.size()) == 4: + masks[i] = masks[i].unsqueeze(-1).unsqueeze(-1) + lxs[i] = lxs[i] * masks[i] + + # scaling for rank dropout: treat as if the rank is changed + scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability + else: + scale = self.scale - lx = self.lora_up(lx) + lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] - return org_forwarded + lx * self.multiplier * scale + return org_forwarded + torch.cat(lxs, dim=-1) * self.multiplier * scale class LoRAInfModule(LoRAModule): @@ -152,31 +194,50 @@ def merge_to(self, sd, dtype, device): if device is None: device = org_device - # get up/down weight - up_weight = sd["lora_up.weight"].to(torch.float).to(device) - down_weight = sd["lora_down.weight"].to(torch.float).to(device) - - # merge weight - if len(weight.size()) == 2: - # linear - weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale - elif down_weight.size()[2:4] == (1, 1): - # conv2d 1x1 - weight = ( - weight - + self.multiplier - * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) - * self.scale - ) + if self.split_dims is None: + # get up/down weight + down_weight = sd["lora_down.weight"].to(torch.float).to(device) + up_weight = sd["lora_up.weight"].to(torch.float).to(device) + + # merge weight + if len(weight.size()) == 2: + # linear + weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + weight + + self.multiplier + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * self.scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + # logger.info(conved.size(), weight.size(), module.stride, module.padding) + weight = weight + self.multiplier * conved * self.scale + + # set weight to org_module + org_sd["weight"] = weight.to(dtype) + self.org_module.load_state_dict(org_sd) else: - # conv2d 3x3 - conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) - # logger.info(conved.size(), weight.size(), module.stride, module.padding) - weight = weight + self.multiplier * conved * self.scale + # split_dims + total_dims = sum(self.split_dims) + for i in range(len(self.split_dims)): + # get up/down weight + down_weight = sd[f"lora_down.{i}.weight"].to(torch.float).to(device) # (rank, in_dim) + up_weight = sd[f"lora_up.{i}.weight"].to(torch.float).to(device) # (split dim, rank) + + # pad up_weight -> (total_dims, 
rank) + padded_up_weight = torch.zeros((total_dims, up_weight.size(0)), device=device, dtype=torch.float) + padded_up_weight[sum(self.split_dims[:i]) : sum(self.split_dims[: i + 1])] = up_weight + + # merge weight + weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale - # set weight to org_module - org_sd["weight"] = weight.to(dtype) - self.org_module.load_state_dict(org_sd) + # set weight to org_module + org_sd["weight"] = weight.to(dtype) + self.org_module.load_state_dict(org_sd) # 復元できるマージのため、このモジュールのweightを返す def get_weight(self, multiplier=None): @@ -211,7 +272,14 @@ def set_region(self, region): def default_forward(self, x): # logger.info(f"default_forward {self.lora_name} {x.size()}") - return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale + if self.split_dims is None: + lx = self.lora_down(x) + lx = self.lora_up(lx) + return self.org_forward(x) + lx * self.multiplier * self.scale + else: + lxs = [lora_down(x) for lora_down in self.lora_down] + lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] + return self.org_forward(x) + torch.cat(lxs, dim=-1) * self.multiplier * self.scale def forward(self, x): if not self.enabled: @@ -257,6 +325,11 @@ def create_network( if train_blocks is not None: assert train_blocks in ["all", "single", "double"], f"invalid train_blocks: {train_blocks}" + # split qkv + split_qkv = kwargs.get("split_qkv", False) + if split_qkv is not None: + split_qkv = True if split_qkv == "True" else False + # すごく引数が多いな ( ^ω^)・・・ network = LoRANetwork( text_encoders, @@ -270,6 +343,7 @@ def create_network( conv_lora_dim=conv_dim, conv_alpha=conv_alpha, train_blocks=train_blocks, + split_qkv=split_qkv, varbose=True, ) @@ -311,10 +385,34 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh modules_dim[lora_name] = dim # logger.info(lora_name, value.size(), dim) + # # split qkv + # double_qkv_rank = None + # single_qkv_rank = None + # rank = None + # for lora_name, dim in modules_dim.items(): + # if "double" in lora_name and "qkv" in lora_name: + # double_qkv_rank = dim + # elif "single" in lora_name and "linear1" in lora_name: + # single_qkv_rank = dim + # elif rank is None: + # rank = dim + # if double_qkv_rank is not None and single_qkv_rank is not None and rank is not None: + # break + # split_qkv = (double_qkv_rank is not None and double_qkv_rank != rank) or ( + # single_qkv_rank is not None and single_qkv_rank != rank + # ) + split_qkv = False # split_qkv is not needed to care, because state_dict is qkv combined + module_class = LoRAInfModule if for_inference else LoRAModule network = LoRANetwork( - text_encoders, flux, multiplier=multiplier, modules_dim=modules_dim, modules_alpha=modules_alpha, module_class=module_class + text_encoders, + flux, + multiplier=multiplier, + modules_dim=modules_dim, + modules_alpha=modules_alpha, + module_class=module_class, + split_qkv=split_qkv, ) return network, weights_sd @@ -344,6 +442,7 @@ def __init__( modules_dim: Optional[Dict[str, int]] = None, modules_alpha: Optional[Dict[str, int]] = None, train_blocks: Optional[str] = None, + split_qkv: bool = False, varbose: Optional[bool] = False, ) -> None: super().__init__() @@ -357,6 +456,7 @@ def __init__( self.rank_dropout = rank_dropout self.module_dropout = module_dropout self.train_blocks = train_blocks if train_blocks is not None else "all" + self.split_qkv = split_qkv self.loraplus_lr_ratio = None self.loraplus_unet_lr_ratio = None @@ -373,6 +473,8 @@ def __init__( logger.info( 
f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" ) + if self.split_qkv: + logger.info(f"split qkv for LoRA") # create module instances def create_modules( @@ -420,6 +522,14 @@ def create_modules( skipped.append(lora_name) continue + # qkv split + split_dims = None + if is_flux and split_qkv: + if "double" in lora_name and "qkv" in lora_name: + split_dims = [3072] * 3 + elif "single" in lora_name and "linear1" in lora_name: + split_dims = [3072] * 3 + [12288] + lora = module_class( lora_name, child_module, @@ -429,6 +539,7 @@ def create_modules( dropout=dropout, rank_dropout=rank_dropout, module_dropout=module_dropout, + split_dims=split_dims, ) loras.append(lora) return loras, skipped @@ -492,6 +603,111 @@ def load_weights(self, file): info = self.load_state_dict(weights_sd, False) return info + def load_state_dict(self, state_dict, strict=True): + # override to convert original weight to splitted qkv weight + if not self.split_qkv: + return super().load_state_dict(state_dict, strict) + + # split qkv + for key in list(state_dict.keys()): + if "double" in key and "qkv" in key: + split_dims = [3072] * 3 + elif "single" in key and "linear1" in key: + split_dims = [3072] * 3 + [12288] + else: + continue + + weight = state_dict[key] + lora_name = key.split(".")[0] + if "lora_down" in key and "weight" in key: + # dense weight (rank*3, in_dim) + split_weight = torch.chunk(weight, len(split_dims), dim=0) + for i, split_w in enumerate(split_weight): + state_dict[f"{lora_name}.lora_down.{i}.weight"] = split_w + + del state_dict[key] + # print(f"split {key}: {weight.shape} to {[w.shape for w in split_weight]}") + elif "lora_up" in key and "weight" in key: + # sparse weight (out_dim=sum(split_dims), rank*3) + rank = weight.size(1) // len(split_dims) + i = 0 + for j in range(len(split_dims)): + state_dict[f"{lora_name}.lora_up.{j}.weight"] = weight[i : i + split_dims[j], j * rank : (j + 1) * rank] + i += split_dims[j] + del state_dict[key] + + # # check is sparse + # i = 0 + # is_zero = True + # for j in range(len(split_dims)): + # for k in range(len(split_dims)): + # if j == k: + # continue + # is_zero = is_zero and torch.all(weight[i : i + split_dims[j], k * rank : (k + 1) * rank] == 0) + # i += split_dims[j] + # if not is_zero: + # logger.warning(f"weight is not sparse: {key}") + # else: + # logger.info(f"weight is sparse: {key}") + + # print( + # f"split {key}: {weight.shape} to {[state_dict[k].shape for k in [f'{lora_name}.lora_up.{j}.weight' for j in range(len(split_dims))]]}" + # ) + + # alpha is unchanged + + return super().load_state_dict(state_dict, strict) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + if not self.split_qkv: + return super().state_dict(destination, prefix, keep_vars) + + # merge qkv + state_dict = super().state_dict(destination, prefix, keep_vars) + new_state_dict = {} + for key in list(state_dict.keys()): + if "double" in key and "qkv" in key: + split_dims = [3072] * 3 + elif "single" in key and "linear1" in key: + split_dims = [3072] * 3 + [12288] + else: + new_state_dict[key] = state_dict[key] + continue + + if key not in state_dict: + continue # already merged + + lora_name = key.split(".")[0] + + # (rank, in_dim) * 3 + down_weights = [state_dict.pop(f"{lora_name}.lora_down.{i}.weight") for i in range(len(split_dims))] + # (split dim, rank) * 3 + up_weights = [state_dict.pop(f"{lora_name}.lora_up.{i}.weight") for i in range(len(split_dims))] + + alpha = 
state_dict.pop(f"{lora_name}.alpha") + + # merge down weight + down_weight = torch.cat(down_weights, dim=0) # (rank, split_dim) * 3 -> (rank*3, sum of split_dim) + + # merge up weight (sum of split_dim, rank*3) + rank = up_weights[0].size(1) + up_weight = torch.zeros((sum(split_dims), down_weight.size(0)), device=down_weight.device, dtype=down_weight.dtype) + i = 0 + for j in range(len(split_dims)): + up_weight[i : i + split_dims[j], j * rank : (j + 1) * rank] = up_weights[j] + i += split_dims[j] + + new_state_dict[f"{lora_name}.lora_down.weight"] = down_weight + new_state_dict[f"{lora_name}.lora_up.weight"] = up_weight + new_state_dict[f"{lora_name}.alpha"] = alpha + + # print( + # f"merged {lora_name}: {lora_name}, {[w.shape for w in down_weights]}, {[w.shape for w in up_weights]} to {down_weight.shape}, {up_weight.shape}" + # ) + print(f"new key: {lora_name}.lora_down.weight, {lora_name}.lora_up.weight, {lora_name}.alpha") + + return new_state_dict + def apply_to(self, text_encoders, flux, apply_text_encoder=True, apply_unet=True): if apply_text_encoder: logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules") From 5639c2adc0085e2e995bb3eee5a278aace397e7a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 24 Aug 2024 16:37:49 +0900 Subject: [PATCH 106/582] fix typo --- networks/lora_flux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index efc7847ed..07a80f0bf 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -604,7 +604,7 @@ def load_weights(self, file): return info def load_state_dict(self, state_dict, strict=True): - # override to convert original weight to splitted qkv weight + # override to convert original weight to split qkv if not self.split_qkv: return super().load_state_dict(state_dict, strict) From 72287d39c76176c0e1c16e8da4f5ddc6f94ea7d6 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 25 Aug 2024 16:01:24 +0900 Subject: [PATCH 107/582] feat: Add `shift` option to `--timestep_sampling` in FLUX.1 fine-tuning and LoRA training --- README.md | 4 ++++ library/flux_train_utils.py | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 282f3b3bd..562dcdb2a 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,10 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 25, 2024: +Added `shift` option to `--timestep_sampling` in FLUX.1 fine-tuning and LoRA training. Shifts timesteps according to the value of `--discrete_flow_shift` (shifts the value of sigmoid of normal distribution random number). It may be good to start with a value of 3.1582 (=e^1.15) for `--discrete_flow_shift`. +Sample command: `--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0` + Aug 24, 2024 (update 2): __Experimental__ Added an option to split the projection layers of q/k/v/txt in the attention and apply LoRA to each of them in FLUX.1 LoRA training. Specify `"split_qkv=True"` in network_args like `--network_args "split_qkv=True"` (`train_blocks` is also available). 
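For intuition, here is a minimal standalone sketch of the `shift` timestep sampling that the `library/flux_train_utils.py` hunk below introduces. The helper name, batch size, and the printed output are illustrative only; the 3.1582 default mirrors the `--discrete_flow_shift` value suggested above.

```python
# Minimal sketch (not the training script itself) of the "shift" timestep sampling
# added to get_noisy_model_input_and_timesteps in the hunk below.
import torch

def sample_shifted_timesteps(bsz: int, discrete_flow_shift: float = 3.1582):
    shift = discrete_flow_shift
    logits_norm = torch.randn(bsz)            # standard normal sample
    timesteps = logits_norm.sigmoid()         # squash into (0, 1)
    # shift > 1 pushes timesteps toward 1.0, i.e. toward the high-noise end
    timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps)
    t = timesteps.view(-1, 1, 1, 1)           # broadcastable mixing factor for latents/noise
    return t, timesteps * 1000.0              # fraction in (0, 1) and 0-1000 timestep scale

t, timesteps = sample_shifted_timesteps(4)
print(timesteps)  # with shift > 1, values cluster near the noisy end of the schedule
```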
diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 1d3f80d72..75f70a54f 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -380,9 +380,19 @@ def get_noisy_model_input_and_timesteps( t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=device)) else: t = torch.rand((bsz,), device=device) + timesteps = t * 1000.0 t = t.view(-1, 1, 1, 1) noisy_model_input = (1 - t) * latents + t * noise + elif args.timestep_sampling == "shift": + shift = args.discrete_flow_shift + logits_norm = torch.randn(bsz, device=device) + timesteps = logits_norm.sigmoid() + timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps) + + t = timesteps.view(-1, 1, 1, 1) + timesteps = timesteps * 1000.0 + noisy_model_input = (1 - t) * latents + t * noise else: # Sample a random timestep for each image # for weighting schemes where we sample timesteps non-uniformly @@ -559,9 +569,10 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--timestep_sampling", - choices=["sigma", "uniform", "sigmoid"], + choices=["sigma", "uniform", "sigmoid", "shift"], default="sigma", - help="Method to sample timesteps: sigma-based, uniform random, or sigmoid of random normal. / タイムステップをサンプリングする方法:sigma、random uniform、またはrandom normalのsigmoid。", + help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal and shift of sigmoid." + " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト。", ) parser.add_argument( "--sigmoid_scale", From 0087a46e14c8e568982cbe3a5d9b9c561b175abf Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 27 Aug 2024 19:59:40 +0900 Subject: [PATCH 108/582] FLUX.1 LoRA supports CLIP-L --- README.md | 8 ++++ flux_train_network.py | 40 +++++++++++++----- library/flux_train_utils.py | 8 ++-- library/strategy_flux.py | 3 +- networks/lora_flux.py | 4 +- train_network.py | 81 ++++++++++++++++++++++++------------- 6 files changed, 101 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 562dcdb2a..1203b5ebc 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,14 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 27, 2024: + +- FLUX.1 LoRA training now supports CLIP-L LoRA. Please remove `--network_train_unet_only`. T5XXL is not trained. The output of T5XXL is still cached, so `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is still required. The trained LoRA can be used with ComfyUI. + - `flux_extract_lora.py` and `convert_flux_lora.py` do not support CLIP-L LoRA. +- `--sigmoid_scale` is now effective even when `--timestep_sampling shift` is specified. Normally, leave it at 1.0. Larger values make the value before shift closer to a uniform distribution. + +- __Experimental__ `--fp8_base_unet` option is added to `flux_train_network.py`. Flux can be trained with fp8, and CLIP-L can be trained with bf16/fp16. When specifying this option, the `--fp8_base` option is not required (Flux is fp8, and CLIP-L is bf16/fp16, regardless of the `--fp8_base` option). + Aug 25, 2024: Added `shift` option to `--timestep_sampling` in FLUX.1 fine-tuning and LoRA training. Shifts timesteps according to the value of `--discrete_flow_shift` (shifts the value of sigmoid of normal distribution random number). 
It may be good to start with a value of 3.1582 (=e^1.15) for `--discrete_flow_shift`. Sample command: `--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0` diff --git a/flux_train_network.py b/flux_train_network.py index 82f77a77e..1a40de61a 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -40,9 +40,13 @@ def assert_extra_args(self, args, train_dataset_group): train_dataset_group.is_text_encoder_output_cacheable() ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" - assert ( - args.network_train_unet_only or not args.cache_text_encoder_outputs - ), "network for Text Encoder cannot be trained with caching Text Encoder outputs / Text Encoderの出力をキャッシュしながらText Encoderのネットワークを学習することはできません" + # assert ( + # args.network_train_unet_only or not args.cache_text_encoder_outputs + # ), "network for Text Encoder cannot be trained with caching Text Encoder outputs / Text Encoderの出力をキャッシュしながらText Encoderのネットワークを学習することはできません" + if not args.network_train_unet_only: + logger.info( + "network for CLIP-L only will be trained. T5XXL will not be trained / CLIP-Lのネットワークのみが学習されます。T5XXLは学習されません" + ) if args.max_token_length is not None: logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません") @@ -137,12 +141,25 @@ def get_text_encoding_strategy(self, args): return strategy_flux.FluxTextEncodingStrategy(apply_t5_attn_mask=args.apply_t5_attn_mask) def get_models_for_text_encoding(self, args, accelerator, text_encoders): - return text_encoders # + [accelerator.unwrap_model(text_encoders[-1])] + if args.cache_text_encoder_outputs: + if self.is_train_text_encoder(args): + return text_encoders[0:1] # only CLIP-L is needed for encoding because T5XXL is cached + else: + return text_encoders # ignored + else: + return text_encoders # both CLIP-L and T5XXL are needed for encoding + + def get_text_encoders_train_flags(self, args, text_encoders): + return [True, False] if self.is_train_text_encoder(args) else [False, False] def get_text_encoder_outputs_caching_strategy(self, args): if args.cache_text_encoder_outputs: return strategy_flux.FluxTextEncoderOutputsCachingStrategy( - args.cache_text_encoder_outputs_to_disk, None, False, apply_t5_attn_mask=args.apply_t5_attn_mask + args.cache_text_encoder_outputs_to_disk, + None, + False, + is_partial=self.is_train_text_encoder(args), + apply_t5_attn_mask=args.apply_t5_attn_mask, ) else: return None @@ -190,9 +207,11 @@ def cache_text_encoder_outputs_if_needed( accelerator.wait_for_everyone() # move back to cpu - logger.info("move text encoders back to cpu") - text_encoders[0].to("cpu") # , dtype=torch.float32) # Text Encoder doesn't work with fp16 on CPU - text_encoders[1].to("cpu") # , dtype=torch.float32) + if not self.is_train_text_encoder(args): + logger.info("move CLIP-L back to cpu") + text_encoders[0].to("cpu") + logger.info("move t5XXL back to cpu") + text_encoders[1].to("cpu") clean_memory_on_device(accelerator.device) if not args.lowram: @@ -297,7 +316,8 @@ def get_noise_pred_and_target( if args.gradient_checkpointing: noisy_model_input.requires_grad_(True) for t in text_encoder_conds: - t.requires_grad_(True) + if t.dtype.is_floating_point: + t.requires_grad_(True) img_ids.requires_grad_(True) guidance_vec.requires_grad_(True) @@ -384,7 +404,7 @@ 
def update_metadata(self, metadata, args): metadata["ss_discrete_flow_shift"] = args.discrete_flow_shift def is_text_encoder_not_needed_for_training(self, args): - return args.cache_text_encoder_outputs + return args.cache_text_encoder_outputs and not self.is_train_text_encoder(args) def setup_parser() -> argparse.ArgumentParser: diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 75f70a54f..a8e94ac00 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -58,7 +58,7 @@ def sample_images( logger.info("") logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}") - if not os.path.isfile(args.sample_prompts): + if not os.path.isfile(args.sample_prompts) and sample_prompts_te_outputs is None: logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}") return @@ -66,7 +66,8 @@ def sample_images( # unwrap unet and text_encoder(s) flux = accelerator.unwrap_model(flux) - text_encoders = [accelerator.unwrap_model(te) for te in text_encoders] + if text_encoders is not None: + text_encoders = [accelerator.unwrap_model(te) for te in text_encoders] # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) prompts = load_prompts(args.sample_prompts) @@ -134,7 +135,7 @@ def sample_image_inference( accelerator: Accelerator, args: argparse.Namespace, flux: flux_models.Flux, - text_encoders: List[CLIPTextModel], + text_encoders: Optional[List[CLIPTextModel]], ae: flux_models.AutoEncoder, save_dir, prompt_dict, @@ -387,6 +388,7 @@ def get_noisy_model_input_and_timesteps( elif args.timestep_sampling == "shift": shift = args.discrete_flow_shift logits_norm = torch.randn(bsz, device=device) + logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling timesteps = logits_norm.sigmoid() timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps) diff --git a/library/strategy_flux.py b/library/strategy_flux.py index d52b3b8dd..5d0839132 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -60,7 +60,7 @@ def encode_tokens( if apply_t5_attn_mask is None: apply_t5_attn_mask = self.apply_t5_attn_mask - clip_l, t5xxl = models + clip_l, t5xxl = models if len(models) == 2 else (models[0], None) l_tokens, t5_tokens = tokens[:2] t5_attn_mask = tokens[2] if len(tokens) > 2 else None @@ -81,6 +81,7 @@ def encode_tokens( else: t5_out = None txt_ids = None + t5_attn_mask = None # caption may be dropped/shuffled, so t5_attn_mask should not be used to make sure the mask is same as the cached one return [l_pooled, t5_out, txt_ids, t5_attn_mask] # returns t5_attn_mask for attention mask in transformer diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 07a80f0bf..fcb56a467 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -401,7 +401,7 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh # split_qkv = (double_qkv_rank is not None and double_qkv_rank != rank) or ( # single_qkv_rank is not None and single_qkv_rank != rank # ) - split_qkv = False # split_qkv is not needed to care, because state_dict is qkv combined + split_qkv = False # split_qkv is not needed to care, because state_dict is qkv combined module_class = LoRAInfModule if for_inference else LoRAModule @@ -421,7 +421,7 @@ class LoRANetwork(torch.nn.Module): # FLUX_TARGET_REPLACE_MODULE = ["DoubleStreamBlock", "SingleStreamBlock"] FLUX_TARGET_REPLACE_MODULE_DOUBLE = ["DoubleStreamBlock"] FLUX_TARGET_REPLACE_MODULE_SINGLE = ["SingleStreamBlock"] - 
TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP"] LORA_PREFIX_FLUX = "lora_unet" # make ComfyUI compatible LORA_PREFIX_TEXT_ENCODER_CLIP = "lora_te1" LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te2" diff --git a/train_network.py b/train_network.py index cab0ec52e..048c7e7bd 100644 --- a/train_network.py +++ b/train_network.py @@ -127,8 +127,15 @@ def get_text_encoder_outputs_caching_strategy(self, args): return None def get_models_for_text_encoding(self, args, accelerator, text_encoders): + """ + Returns a list of models that will be used for text encoding. SDXL uses wrapped and unwrapped models. + """ return text_encoders + # returns a list of bool values indicating whether each text encoder should be trained + def get_text_encoders_train_flags(self, args, text_encoders): + return [True] * len(text_encoders) if self.is_train_text_encoder(args) else [False] * len(text_encoders) + def is_train_text_encoder(self, args): return not args.network_train_unet_only @@ -136,11 +143,6 @@ def cache_text_encoder_outputs_if_needed(self, args, accelerator, unet, vae, tex for t_enc in text_encoders: t_enc.to(accelerator.device, dtype=weight_dtype) - def get_text_cond(self, args, accelerator, batch, tokenizers, text_encoders, weight_dtype): - input_ids = batch["input_ids"].to(accelerator.device) - encoder_hidden_states = train_util.get_hidden_states(args, input_ids, tokenizers[0], text_encoders[0], weight_dtype) - return encoder_hidden_states - def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): noise_pred = unet(noisy_latents, timesteps, text_conds[0]).sample return noise_pred @@ -313,7 +315,7 @@ def train(self, args): collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) if args.debug_dataset: - train_dataset_group.set_current_strategies() # dasaset needs to know the strategies explicitly + train_dataset_group.set_current_strategies() # dasaset needs to know the strategies explicitly train_util.debug_dataset(train_dataset_group) return if len(train_dataset_group) == 0: @@ -437,8 +439,10 @@ def train(self, args): if args.gradient_checkpointing: unet.enable_gradient_checkpointing() - for t_enc in text_encoders: - t_enc.gradient_checkpointing_enable() + for t_enc, flag in zip(text_encoders, self.get_text_encoders_train_flags(args, text_encoders)): + if flag: + if t_enc.supports_gradient_checkpointing: + t_enc.gradient_checkpointing_enable() del t_enc network.enable_gradient_checkpointing() # may have no effect @@ -522,14 +526,17 @@ def train(self, args): unet_weight_dtype = te_weight_dtype = weight_dtype # Experimental Feature: Put base model into fp8 to save vram - if args.fp8_base: + if args.fp8_base or args.fp8_base_unet: assert torch.__version__ >= "2.1.0", "fp8_base requires torch>=2.1.0 / fp8を使う場合はtorch>=2.1.0が必要です。" assert ( args.mixed_precision != "no" ), "fp8_base requires mixed precision='fp16' or 'bf16' / fp8を使う場合はmixed_precision='fp16'または'bf16'が必要です。" - accelerator.print("enable fp8 training.") + accelerator.print("enable fp8 training for U-Net.") unet_weight_dtype = torch.float8_e4m3fn - te_weight_dtype = torch.float8_e4m3fn + + if not args.fp8_base_unet: + accelerator.print("enable fp8 training for Text Encoder.") + te_weight_dtype = weight_dtype if args.fp8_base_unet else torch.float8_e4m3fn # unet.to(accelerator.device) # this makes faster `to(dtype)` below, but consumes 23 GB VRAM # 
unet.to(dtype=unet_weight_dtype) # without moving to gpu, this takes a lot of time and main memory @@ -546,19 +553,18 @@ def train(self, args): t_enc.to(dtype=te_weight_dtype) if hasattr(t_enc, "text_model") and hasattr(t_enc.text_model, "embeddings"): # nn.Embedding not support FP8 - t_enc.text_model.embeddings.to( - dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) + t_enc.text_model.embeddings.to(dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) elif hasattr(t_enc, "encoder") and hasattr(t_enc.encoder, "embeddings"): - t_enc.encoder.embeddings.to( - dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) + t_enc.encoder.embeddings.to(dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: + flags = self.get_text_encoders_train_flags(args, text_encoders) ds_model = deepspeed_utils.prepare_deepspeed_model( args, unet=unet if train_unet else None, - text_encoder1=text_encoders[0] if train_text_encoder else None, - text_encoder2=text_encoders[1] if train_text_encoder and len(text_encoders) > 1 else None, + text_encoder1=text_encoders[0] if flags[0] else None, + text_encoder2=(text_encoders[1] if flags[1] else None) if len(text_encoders) > 1 else None, network=network, ) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( @@ -571,11 +577,14 @@ def train(self, args): else: unet.to(accelerator.device, dtype=unet_weight_dtype) # move to device because unet is not prepared by accelerator if train_text_encoder: + text_encoders = [ + (accelerator.prepare(t_enc) if flag else t_enc) + for t_enc, flag in zip(text_encoders, self.get_text_encoders_train_flags(args, text_encoders)) + ] if len(text_encoders) > 1: - text_encoder = text_encoders = [accelerator.prepare(t_enc) for t_enc in text_encoders] + text_encoder = text_encoders else: - text_encoder = accelerator.prepare(text_encoder) - text_encoders = [text_encoder] + text_encoder = text_encoders[0] else: pass # if text_encoder is not trained, no need to prepare. 
and device and dtype are already set @@ -587,11 +596,11 @@ def train(self, args): if args.gradient_checkpointing: # according to TI example in Diffusers, train is required unet.train() - for t_enc in text_encoders: + for t_enc, frag in zip(text_encoders, self.get_text_encoders_train_flags(args, text_encoders)): t_enc.train() # set top parameter requires_grad = True for gradient checkpointing works - if train_text_encoder: + if frag: t_enc.text_model.embeddings.requires_grad_(True) else: @@ -736,6 +745,7 @@ def load_model_hook(models, input_dir): "ss_huber_schedule": args.huber_schedule, "ss_huber_c": args.huber_c, "ss_fp8_base": args.fp8_base, + "ss_fp8_base_unet": args.fp8_base_unet, } self.update_metadata(metadata, args) # architecture specific metadata @@ -1004,6 +1014,7 @@ def remove_model(old_ckpt_name): for t_enc in text_encoders: del t_enc text_encoders = [] + text_encoder = None # For --sample_at_first self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) @@ -1018,7 +1029,7 @@ def remove_model(old_ckpt_name): # log device and dtype for each model logger.info(f"unet dtype: {unet_weight_dtype}, device: {unet.device}") for t_enc in text_encoders: - logger.info(f"text_encoder dtype: {te_weight_dtype}, device: {t_enc.device}") + logger.info(f"text_encoder dtype: {t_enc.dtype}, device: {t_enc.device}") clean_memory_on_device(accelerator.device) @@ -1073,12 +1084,17 @@ def remove_model(old_ckpt_name): text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: text_encoder_conds = text_encoder_outputs_list # List of text encoder outputs - else: + if ( + text_encoder_conds is None + or len(text_encoder_conds) == 0 + or text_encoder_conds[0] is None + or train_text_encoder + ): with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: # SD only - text_encoder_conds = get_weighted_text_embeddings( + encoded_text_encoder_conds = get_weighted_text_embeddings( tokenizers[0], text_encoder, batch["captions"], @@ -1088,13 +1104,18 @@ def remove_model(old_ckpt_name): ) else: input_ids = [ids.to(accelerator.device) for ids in batch["input_ids_list"]] - text_encoder_conds = text_encoding_strategy.encode_tokens( + encoded_text_encoder_conds = text_encoding_strategy.encode_tokens( tokenize_strategy, self.get_models_for_text_encoding(args, accelerator, text_encoders), input_ids, ) if args.full_fp16: - text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] + encoded_text_encoder_conds = [c.to(weight_dtype) for c in encoded_text_encoder_conds] + + # if encoded_text_encoder_conds is not None, update cached text_encoder_conds + for i in range(len(encoded_text_encoder_conds)): + if encoded_text_encoder_conds[i] is not None: + text_encoder_conds[i] = encoded_text_encoder_conds[i] # sample noise, call unet, get target noise_pred, target, timesteps, huber_c, weighting = self.get_noise_pred_and_target( @@ -1257,6 +1278,12 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument("--unet_lr", type=float, default=None, help="learning rate for U-Net / U-Netの学習率") parser.add_argument("--text_encoder_lr", type=float, default=None, help="learning rate for Text Encoder / Text Encoderの学習率") + parser.add_argument( + "--fp8_base_unet", + action="store_true", + help="use fp8 for U-Net (or DiT), Text Encoder is fp16 or bf16" + " / U-Net(またはDiT)にfp8を使用する。Text Encoderはfp16またはbf16", + ) parser.add_argument( 
"--network_weights", type=str, default=None, help="pretrained weights for network / 学習するネットワークの初期重み" From 3be712e3e011b0378fad389641cec0c1869555ab Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 27 Aug 2024 21:40:02 +0900 Subject: [PATCH 109/582] feat: Update direct loading fp8 ckpt for LoRA training --- README.md | 7 +++- flux_minimal_inference.py | 27 +----------- flux_train_network.py | 16 +++++++- library/flux_utils.py | 12 ++++-- library/utils.py | 62 +++++++++++++++++++++++++++- networks/flux_merge_lora.py | 82 ++++++++++++++++++++++++++----------- 6 files changed, 151 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 1203b5ebc..0108ada59 100644 --- a/README.md +++ b/README.md @@ -9,13 +9,18 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +Aug 27, 2024 (update 2): +In FLUX.1 LoRA training, when `--fp8_base` is specified, the FLUX.1 model file with fp8 (`float8_e4m3fn` type) can be loaded directly. Also, in `flux_minimal_inference.py`, it is possible to load it by specifying `fp8 (float8_e4m3fn)` in `--flux_dtype`. + +In `flux_merge_lora.py`, you can now specify the precision at save time with `fp8` (see `--help` for details). Also, if you do not specify the merge model, only the model type conversion will be performed. + Aug 27, 2024: - FLUX.1 LoRA training now supports CLIP-L LoRA. Please remove `--network_train_unet_only`. T5XXL is not trained. The output of T5XXL is still cached, so `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is still required. The trained LoRA can be used with ComfyUI. - `flux_extract_lora.py` and `convert_flux_lora.py` do not support CLIP-L LoRA. - `--sigmoid_scale` is now effective even when `--timestep_sampling shift` is specified. Normally, leave it at 1.0. Larger values make the value before shift closer to a uniform distribution. -- __Experimental__ `--fp8_base_unet` option is added to `flux_train_network.py`. Flux can be trained with fp8, and CLIP-L can be trained with bf16/fp16. When specifying this option, the `--fp8_base` option is not required (Flux is fp8, and CLIP-L is bf16/fp16, regardless of the `--fp8_base` option). +- __Experimental__ `--fp8_base_unet` option is added to `flux_train_network.py`. Flux can be trained with fp8, and CLIP-L can be trained with bf16/fp16. When specifying this option, the `--fp8_base` option is automatically enabled. Aug 25, 2024: Added `shift` option to `--timestep_sampling` in FLUX.1 fine-tuning and LoRA training. Shifts timesteps according to the value of `--discrete_flow_shift` (shifts the value of sigmoid of normal distribution random number). It may be good to start with a value of 3.1582 (=e^1.15) for `--discrete_flow_shift`. 
diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index 5b8aa2506..56c1b1982 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -10,7 +10,6 @@ import numpy as np import torch -from safetensors.torch import safe_open, load_file from tqdm import tqdm from PIL import Image import accelerate @@ -21,7 +20,7 @@ init_ipex() -from library.utils import setup_logging +from library.utils import setup_logging, str_to_dtype setup_logging() import logging @@ -288,28 +287,6 @@ def generate_image( name = "schnell" if "schnell" in args.ckpt_path else "dev" # TODO change this to a more robust way is_schnell = name == "schnell" - def str_to_dtype(s: Optional[str], default_dtype: Optional[torch.dtype] = None) -> torch.dtype: - if s is None: - return default_dtype - if s in ["bf16", "bfloat16"]: - return torch.bfloat16 - elif s in ["fp16", "float16"]: - return torch.float16 - elif s in ["fp32", "float32"]: - return torch.float32 - elif s in ["fp8_e4m3fn", "e4m3fn", "float8_e4m3fn"]: - return torch.float8_e4m3fn - elif s in ["fp8_e4m3fnuz", "e4m3fnuz", "float8_e4m3fnuz"]: - return torch.float8_e4m3fnuz - elif s in ["fp8_e5m2", "e5m2", "float8_e5m2"]: - return torch.float8_e5m2 - elif s in ["fp8_e5m2fnuz", "e5m2fnuz", "float8_e5m2fnuz"]: - return torch.float8_e5m2fnuz - elif s in ["fp8", "float8"]: - return torch.float8_e4m3fn # default fp8 - else: - raise ValueError(f"Unsupported dtype: {s}") - def is_fp8(dt): return dt in [torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz] @@ -348,7 +325,7 @@ def is_fp8(dt): encoding_strategy = strategy_flux.FluxTextEncodingStrategy() # DiT - model = flux_utils.load_flow_model(name, args.ckpt_path, flux_dtype, loading_device) + model = flux_utils.load_flow_model(name, args.ckpt_path, None, loading_device) model.eval() logger.info(f"Casting model to {flux_dtype}") model.to(flux_dtype) # make sure model is dtype diff --git a/flux_train_network.py b/flux_train_network.py index 1a40de61a..4a63c2de4 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -29,6 +29,9 @@ def assert_extra_args(self, args, train_dataset_group): super().assert_extra_args(args, train_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) + if args.fp8_base_unet: + args.fp8_base = True # if fp8_base_unet is enabled, fp8_base is also enabled for FLUX.1 + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: logger.warning( "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" @@ -61,9 +64,20 @@ def load_target_model(self, args, weight_dtype, accelerator): name = self.get_flux_model_name(args) # if we load to cpu, flux.to(fp8) takes a long time + if args.fp8_base: + loading_dtype = None # as is + else: + loading_dtype = weight_dtype + model = flux_utils.load_flow_model( - name, args.pretrained_model_name_or_path, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors + name, args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors ) + if args.fp8_base: + # check dtype of model + if model.dtype == torch.float8_e4m3fnuz or model.dtype == torch.float8_e5m2 or model.dtype == torch.float8_e5m2fnuz: + raise ValueError(f"Unsupported fp8 model dtype: {model.dtype}") + elif model.dtype == torch.float8_e4m3fn: + logger.info("Loaded fp8 FLUX model") if args.split_mode: model = self.prepare_split_model(model, 
weight_dtype, accelerator) diff --git a/library/flux_utils.py b/library/flux_utils.py index 37166933a..680836168 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -1,5 +1,5 @@ import json -from typing import Union +from typing import Optional, Union import einops import torch @@ -20,7 +20,9 @@ # temporary copy from sd3_utils TODO refactor -def load_safetensors(path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: torch.dtype = torch.float32): +def load_safetensors( + path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = torch.float32 +): if disable_mmap: # return safetensors.torch.load(open(path, "rb").read()) # use experimental loader @@ -38,11 +40,13 @@ def load_safetensors(path: str, device: Union[str, torch.device], disable_mmap: def load_flow_model( - name: str, ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False + name: str, ckpt_path: str, dtype: Optional[torch.dtype], device: Union[str, torch.device], disable_mmap: bool = False ) -> flux_models.Flux: logger.info(f"Building Flux model {name}") with torch.device("meta"): - model = flux_models.Flux(flux_models.configs[name].params).to(dtype) + model = flux_models.Flux(flux_models.configs[name].params) + if dtype is not None: + model = model.to(dtype) # load_sft doesn't support torch.device logger.info(f"Loading state dict from {ckpt_path}") diff --git a/library/utils.py b/library/utils.py index a16209979..d355cb109 100644 --- a/library/utils.py +++ b/library/utils.py @@ -82,6 +82,66 @@ def setup_logging(args=None, log_level=None, reset=False): logger.info(msg_init) +def str_to_dtype(s: Optional[str], default_dtype: Optional[torch.dtype] = None) -> torch.dtype: + """ + Convert a string to a torch.dtype + + Args: + s: string representation of the dtype + default_dtype: default dtype to return if s is None + + Returns: + torch.dtype: the corresponding torch.dtype + + Raises: + ValueError: if the dtype is not supported + + Examples: + >>> str_to_dtype("float32") + torch.float32 + >>> str_to_dtype("fp32") + torch.float32 + >>> str_to_dtype("float16") + torch.float16 + >>> str_to_dtype("fp16") + torch.float16 + >>> str_to_dtype("bfloat16") + torch.bfloat16 + >>> str_to_dtype("bf16") + torch.bfloat16 + >>> str_to_dtype("fp8") + torch.float8_e4m3fn + >>> str_to_dtype("fp8_e4m3fn") + torch.float8_e4m3fn + >>> str_to_dtype("fp8_e4m3fnuz") + torch.float8_e4m3fnuz + >>> str_to_dtype("fp8_e5m2") + torch.float8_e5m2 + >>> str_to_dtype("fp8_e5m2fnuz") + torch.float8_e5m2fnuz + """ + if s is None: + return default_dtype + if s in ["bf16", "bfloat16"]: + return torch.bfloat16 + elif s in ["fp16", "float16"]: + return torch.float16 + elif s in ["fp32", "float32", "float"]: + return torch.float32 + elif s in ["fp8_e4m3fn", "e4m3fn", "float8_e4m3fn"]: + return torch.float8_e4m3fn + elif s in ["fp8_e4m3fnuz", "e4m3fnuz", "float8_e4m3fnuz"]: + return torch.float8_e4m3fnuz + elif s in ["fp8_e5m2", "e5m2", "float8_e5m2"]: + return torch.float8_e5m2 + elif s in ["fp8_e5m2fnuz", "e5m2fnuz", "float8_e5m2fnuz"]: + return torch.float8_e5m2fnuz + elif s in ["fp8", "float8"]: + return torch.float8_e4m3fn # default fp8 + else: + raise ValueError(f"Unsupported dtype: {s}") + + def mem_eff_save_file(tensors: Dict[str, torch.Tensor], filename: str, metadata: Dict[str, Any] = None): """ memory efficient save file @@ -198,7 +258,7 @@ def _deserialize_tensor(self, tensor_bytes, metadata): if tensor_bytes is None: byte_tensor = 
torch.empty(0, dtype=torch.uint8) else: - tensor_bytes = bytearray(tensor_bytes) # make it writable + tensor_bytes = bytearray(tensor_bytes) # make it writable byte_tensor = torch.frombuffer(tensor_bytes, dtype=torch.uint8) # process float8 types diff --git a/networks/flux_merge_lora.py b/networks/flux_merge_lora.py index d5e82920d..2e0d4c297 100644 --- a/networks/flux_merge_lora.py +++ b/networks/flux_merge_lora.py @@ -8,7 +8,7 @@ from safetensors.torch import load_file, save_file from tqdm import tqdm -from library.utils import setup_logging +from library.utils import setup_logging, str_to_dtype, MemoryEfficientSafeOpen, mem_eff_save_file setup_logging() import logging @@ -34,18 +34,23 @@ def load_state_dict(file_name, dtype): return sd, metadata -def save_to_file(file_name, state_dict, dtype, metadata): +def save_to_file(file_name, state_dict, dtype, metadata, mem_eff_save=False): if dtype is not None: logger.info(f"converting to {dtype}...") - for key in list(state_dict.keys()): + for key in tqdm(list(state_dict.keys())): if type(state_dict[key]) == torch.Tensor: state_dict[key] = state_dict[key].to(dtype) logger.info(f"saving to: {file_name}") - save_file(state_dict, file_name, metadata=metadata) + if mem_eff_save: + mem_eff_save_file(state_dict, file_name, metadata=metadata) + else: + save_file(state_dict, file_name, metadata=metadata) -def merge_to_flux_model(loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype): +def merge_to_flux_model( + loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype, mem_eff_load_save=False +): # create module map without loading state_dict logger.info(f"loading keys from FLUX.1 model: {flux_model}") lora_name_to_module_key = {} @@ -57,7 +62,14 @@ def merge_to_flux_model(loading_device, working_device, flux_model, models, rati lora_name = lora_flux.LoRANetwork.LORA_PREFIX_FLUX + "_" + module_name.replace(".", "_") lora_name_to_module_key[lora_name] = key - flux_state_dict = load_file(flux_model, device=loading_device) + if mem_eff_load_save: + flux_state_dict = {} + with MemoryEfficientSafeOpen(flux_model) as flux_file: + for key in tqdm(flux_file.keys()): + flux_state_dict[key] = flux_file.get_tensor(key).to(loading_device) # dtype is not changed + else: + flux_state_dict = load_file(flux_model, device=loading_device) + for model, ratio in zip(models, ratios): logger.info(f"loading: {model}") lora_sd, _ = load_state_dict(model, merge_dtype) # loading on CPU @@ -120,9 +132,17 @@ def merge_to_flux_model(loading_device, working_device, flux_model, models, rati return flux_state_dict -def merge_to_flux_model_diffusers(loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype): +def merge_to_flux_model_diffusers( + loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype, mem_eff_load_save=False +): logger.info(f"loading keys from FLUX.1 model: {flux_model}") - flux_state_dict = load_file(flux_model, device=loading_device) + if mem_eff_load_save: + flux_state_dict = {} + with MemoryEfficientSafeOpen(flux_model) as flux_file: + for key in tqdm(flux_file.keys()): + flux_state_dict[key] = flux_file.get_tensor(key).to(loading_device) # dtype is not changed + else: + flux_state_dict = load_file(flux_model, device=loading_device) def create_key_map(n_double_layers, n_single_layers): key_map = {} @@ -474,19 +494,15 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): def merge(args): + if args.models is None: + args.models 
= [] + if args.ratios is None: + args.ratios = [] + assert len(args.models) == len( args.ratios ), "number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" - def str_to_dtype(p): - if p == "float": - return torch.float - if p == "fp16": - return torch.float16 - if p == "bf16": - return torch.bfloat16 - return None - merge_dtype = str_to_dtype(args.precision) save_dtype = str_to_dtype(args.save_precision) if save_dtype is None: @@ -500,11 +516,25 @@ def str_to_dtype(p): if args.flux_model is not None: if not args.diffusers: state_dict = merge_to_flux_model( - args.loading_device, args.working_device, args.flux_model, args.models, args.ratios, merge_dtype, save_dtype + args.loading_device, + args.working_device, + args.flux_model, + args.models, + args.ratios, + merge_dtype, + save_dtype, + args.mem_eff_load_save, ) else: state_dict = merge_to_flux_model_diffusers( - args.loading_device, args.working_device, args.flux_model, args.models, args.ratios, merge_dtype, save_dtype + args.loading_device, + args.working_device, + args.flux_model, + args.models, + args.ratios, + merge_dtype, + save_dtype, + args.mem_eff_load_save, ) if args.no_metadata: @@ -517,7 +547,7 @@ def str_to_dtype(p): ) logger.info(f"saving FLUX model to: {args.save_to}") - save_to_file(args.save_to, state_dict, save_dtype, sai_metadata) + save_to_file(args.save_to, state_dict, save_dtype, sai_metadata, args.mem_eff_load_save) else: state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype, args.concat, args.shuffle) @@ -546,14 +576,14 @@ def setup_parser() -> argparse.ArgumentParser: "--save_precision", type=str, default=None, - choices=[None, "float", "fp16", "bf16"], - help="precision in saving, same to merging if omitted / 保存時に精度を変更して保存する、省略時はマージ時の精度と同じ", + help="precision in saving, same to merging if omitted. supported types: " + "float32, fp16, bf16, fp8 (same as fp8_e4m3fn), fp8_e4m3fn, fp8_e4m3fnuz, fp8_e5m2, fp8_e5m2fnuz" + " / 保存時に精度を変更して保存する、省略時はマージ時の精度と同じ", ) parser.add_argument( "--precision", type=str, default="float", - choices=["float", "fp16", "bf16"], help="precision in merging (float is recommended) / マージの計算時の精度(floatを推奨)", ) parser.add_argument( @@ -562,6 +592,12 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="FLUX.1 model to load, merge LoRA models if omitted / 読み込むモデル、指定しない場合はLoRAモデルをマージする", ) + parser.add_argument( + "--mem_eff_load_save", + action="store_true", + help="use custom memory efficient load and save functions for FLUX.1 model" + " / カスタムのメモリ効率の良い読み込みと保存関数をFLUX.1モデルに使用する", + ) parser.add_argument( "--loading_device", type=str, From a61cf73a5cb5209c3f4d1a3688dd276a4dfd1ecb Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 27 Aug 2024 21:44:10 +0900 Subject: [PATCH 110/582] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0108ada59..7b1d9cc6c 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ The command to install PyTorch is as follows: Aug 27, 2024 (update 2): In FLUX.1 LoRA training, when `--fp8_base` is specified, the FLUX.1 model file with fp8 (`float8_e4m3fn` type) can be loaded directly. Also, in `flux_minimal_inference.py`, it is possible to load it by specifying `fp8 (float8_e4m3fn)` in `--flux_dtype`. -In `flux_merge_lora.py`, you can now specify the precision at save time with `fp8` (see `--help` for details). Also, if you do not specify the merge model, only the model type conversion will be performed. 
+In `flux_merge_lora.py`, you can now specify `fp8` for the save precision (see `--help` for details). Also, if you do not specify the merge model, only the dtype conversion will be performed. Aug 27, 2024: From 6c0e8a5a1740dbd50a0a45ec1f08983877605cd7 Mon Sep 17 00:00:00 2001 From: Akegarasu Date: Thu, 29 Aug 2024 14:50:29 +0800 Subject: [PATCH 111/582] make guidance_scale keep float in args --- flux_train_network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flux_train_network.py b/flux_train_network.py index 4a63c2de4..354a8c6f3 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -324,7 +324,8 @@ def get_noise_pred_and_target( img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device) # get guidance - guidance_vec = torch.full((bsz,), args.guidance_scale, device=accelerator.device) + # ensure guidance_scale in args is float + guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) # ensure the hidden state will require grad if args.gradient_checkpointing: From a0cfb0894c4be4ea27412e4c12ed13f68b57094b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 29 Aug 2024 21:20:33 +0900 Subject: [PATCH 112/582] Cleaned up README --- README.md | 281 +++++++++++++++++++++++++++--------------------------- 1 file changed, 143 insertions(+), 138 deletions(-) diff --git a/README.md b/README.md index 7b1d9cc6c..a73eead0b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ This repository contains training, generation and utility scripts for Stable Diffusion. -## FLUX.1 LoRA training (WIP) +## FLUX.1 training (WIP) This feature is experimental. The options and the training script may change in the future. Please let us know if you have any idea to improve the training. @@ -9,127 +9,24 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` -Aug 27, 2024 (update 2): -In FLUX.1 LoRA training, when `--fp8_base` is specified, the FLUX.1 model file with fp8 (`float8_e4m3fn` type) can be loaded directly. Also, in `flux_minimal_inference.py`, it is possible to load it by specifying `fp8 (float8_e4m3fn)` in `--flux_dtype`. +- [FLUX.1 LoRA training](#flux1-lora-training) + - [Key Options for FLUX.1 LoRA training](#key-options-for-flux1-lora-training) + - [Inference for FLUX.1 LoRA model](#inference-for-flux1-lora-model) + - [Key Features for FLUX.1 LoRA training](#key-features-for-flux1-lora-training) +- [FLUX.1 fine-tuning](#flux1-fine-tuning) + - [Key Features for FLUX.1 fine-tuning](#key-features-for-flux1-fine-tuning) +- [Extract LoRA from FLUX.1 Models](#extract-lora-from-flux1-models) +- [Convert FLUX LoRA](#convert-flux-lora) +- [Merge LoRA to FLUX.1 checkpoint](#merge-lora-to-flux1-checkpoint) +- [FLUX.1 Multi-resolution training](#flux1-multi-resolution-training) -In `flux_merge_lora.py`, you can now specify `fp8` for the save precision (see `--help` for details). Also, if you do not specify the merge model, only the dtype conversion will be performed. - -Aug 27, 2024: - -- FLUX.1 LoRA training now supports CLIP-L LoRA. Please remove `--network_train_unet_only`. T5XXL is not trained. The output of T5XXL is still cached, so `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is still required. The trained LoRA can be used with ComfyUI. 
- - `flux_extract_lora.py` and `convert_flux_lora.py` do not support CLIP-L LoRA. -- `--sigmoid_scale` is now effective even when `--timestep_sampling shift` is specified. Normally, leave it at 1.0. Larger values make the value before shift closer to a uniform distribution. - -- __Experimental__ `--fp8_base_unet` option is added to `flux_train_network.py`. Flux can be trained with fp8, and CLIP-L can be trained with bf16/fp16. When specifying this option, the `--fp8_base` option is automatically enabled. - -Aug 25, 2024: -Added `shift` option to `--timestep_sampling` in FLUX.1 fine-tuning and LoRA training. Shifts timesteps according to the value of `--discrete_flow_shift` (shifts the value of sigmoid of normal distribution random number). It may be good to start with a value of 3.1582 (=e^1.15) for `--discrete_flow_shift`. -Sample command: `--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0` - -Aug 24, 2024 (update 2): - -__Experimental__ Added an option to split the projection layers of q/k/v/txt in the attention and apply LoRA to each of them in FLUX.1 LoRA training. Specify `"split_qkv=True"` in network_args like `--network_args "split_qkv=True"` (`train_blocks` is also available). - -The number of parameters may increase slightly, so the expressiveness may increase, but the training time may be longer. No detailed verification has been done. - -This implementation is experimental, so it may be deprecated or changed in the future. - -The .safetensors file of the trained model is compatible with the normal LoRA model of sd-scripts, so it should be usable in inference environments such as ComfyUI as it is. Also, converting it to AI-toolkit (Diffusers) format with `convert_flux_lora.py` will reduce the size. It should be no problem to convert it if you use it in the inference environment. - -Technical details: In the implementation of Black Forest Labs' model, the projection layers of q/k/v (and txt in single blocks) are concatenated into one. If LoRA is added there as it is, the LoRA module is only one, and the dimension is large. In contrast, in the implementation of Diffusers, the projection layers of q/k/v/txt are separated. Therefore, the LoRA module is applied to q/k/v/txt separately, and the dimension is smaller. This option is for training LoRA similar to the latter. - -The compatibility of the saved model (state dict) is ensured by concatenating the weights of multiple LoRAs. However, since there are zero weights in some parts, the model size will be large. - -Aug 24, 2024: -Fixed an issue where the attention mask was not applied in single blocks when `--apply_t5_attn_mask` was specified. - -Aug 22, 2024 (update 2): -Fixed a bug that the embedding was zero-padded when `--apply_t5_attn_mask` option was applied. Also, the cache file for text encoder outputs now records whether the mask is applied or not. Please note that the cache file will be recreated when switching the `--apply_t5_attn_mask` option. - -Added a script to extract LoRA from the difference between the two models of FLUX.1. Use `networks/flux_extract_lora.py`. See `--help` for details. Normally, more than 50GB of memory is required, but specifying the `--mem_eff_safe_open` option significantly reduces memory usage. However, this option is a custom implementation, so unexpected problems may occur. Please always check if the model is loaded correctly. - -Aug 22, 2024: -Fixed a bug in multi-GPU training. It should work with fine-tuning and LoRA training. 
`--double_blocks_to_swap` and `--single_blocks_to_swap` cannot be used in multi-GPU training. - -`--disable_mmap_load_safetensors` option now works in `flux_train.py`. It speeds up model loading during training in WSL2. It is also effective in reducing memory usage when loading models during multi-GPU training. Please always check if the model is loaded correctly, as it uses a custom implementation of safetensors loading. - - -Aug 21, 2024 (update 3): -- There is a bug that `--full_bf16` option is enabled even if it is not specified in `flux_train.py`. The bug will be fixed sooner. __Please specify the `--full_bf16` option explicitly, especially when training with 24GB VRAM.__ -- Stochastic rounding is now implemented when `--fused_backward_pass` is specified. The implementation is -based on the code provided by 2kpr. Thank you so much! - - With this change, `--fused_backward_pass` is recommended over `--blockwise_fused_optimizers` when `--full_bf16` is specified. - - Please note that `--fused_backward_pass` is only supported with Adafactor. -- The sample command in [FLUX.1 fine-tuning](#flux1-fine-tuning) is updated to reflect these changes. -- Fixed `--single_blocks_to_swap` is not working in `flux_train.py`. - -Aug 21, 2024 (update 2): -Fixed an error in applying mask in Attention. The attention mask was float, but it should be bool. - -Added a script `convert_flux_lora.py` to convert LoRA between sd-scripts format (BFL-based) and AI-toolkit format (Diffusers-based). See `--help` for details. BFL-based LoRA has a large module, so converting it to Diffusers format may reduce temporary memory usage in the inference environment. Note that re-conversion will increase the size of LoRA. - - -Aug 21, 2024: -The specification of `--apply_t5_attn_mask` has been changed. Previously, the T5 output was zero-padded, but now, two steps are taken: "1. Apply mask when encoding T5" and "2. Apply mask in the attention of Double Block". Fine tuning, LoRA training, and inference in `flux_mini_inference.py` have been changed. - -Aug 20, 2024 (update 3): -__Experimental__ The multi-resolution training is now supported with caching latents to disk. - -The cache files now hold latents for multiple resolutions. Since the latents are appended to the current cache file, it is recommended to delete the cache file in advance (if not, the old latents is kept in .npz file). - -See [FLUX.1 Multi-resolution training](#flux1-multi-resolution-training) for details. - -Aug 20, 2024 (update 2): -`flux_merge_lora.py` now supports LoRA from AI-toolkit (Diffusers based keys). Specify `--diffusers` option to merge LoRA with Diffusers based keys. Thanks to exveria1015! - -Aug 20, 2024: -FLUX.1 supports multi-resolution inference, so training at multiple resolutions may be possible and the results may be improved (like 1024x1024, 768x768 and 512x512 ... you can use any resolution). - -The script seems to support multi-resolution even in the current version, ~~if `--cache_latents_to_disk` is not specified~~ -> `--cache_latents_to_disk` is now supported for multi-resolution training. Please try if you are interested. See [FLUX.1 Multi-resolution training](#flux1-multi-resolution-training) for details. - -We will support multi-resolution caching to disk in the near future. - -Aug 19, 2024: -In `flux_train.py`, the memory consumption during model saving is reduced when `--save_precision` is set to the same value as `--mixed_precision` (about 22GB). Please set the same value unless there is a reason. 
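A simplified sketch of why matching the two precisions helps: when a tensor already has the target dtype, `Tensor.to()` returns it unchanged, so the per-tensor converted copy is skipped at save time. The code below is an illustration under that assumption, not the actual saving path in `flux_train.py`:

```
# Rough illustration: cast-on-save only allocates extra copies when the dtypes differ.
from typing import Dict, Optional

import torch
from safetensors.torch import save_file

def save_weights(state_dict: Dict[str, torch.Tensor], path: str, save_dtype: Optional[torch.dtype]) -> None:
    out = {}
    for key, tensor in state_dict.items():
        t = tensor.detach().cpu()
        if save_dtype is not None and t.dtype != save_dtype:
            t = t.to(save_dtype)  # conversion materializes another copy of this weight
        out[key] = t
    save_file(out, path)
```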
- -An experimental option `--mem_eff_save` is also added. When specified, it can further reduce memory consumption (about 22GB), but since it is a custom implementation, unexpected problems may occur. We do not recommend using it unless you are familiar with the code. - -Aug 18, 2024: -Memory-efficient training based on 2kpr's implementation is implemented in `flux_train.py`. Thanks to 2kpr! See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. - -Aug 17, 2024: -Added a script `flux_train.py` to train FLUX.1. The script is experimental and not an optimized version. It needs >28GB VRAM for training. - -Aug 16, 2024: - -Added a script `networks/flux_merge_lora.py` to merge LoRA into FLUX.1 checkpoint. See [Merge LoRA to FLUX.1 checkpoint](#merge-lora-to-flux1-checkpoint) for details. - -FLUX.1 schnell model based training is now supported (but not tested). If the name of the model file contains `schnell`, the model is treated as a schnell model. - -Added `--t5xxl_max_token_length` option to specify the maximum token length of T5XXL. The default is 512 in dev and 256 in schnell. - -Previously, when `--max_token_length` was specified, that value was used, and 512 was used when omitted (default). Therefore, there is no impact if `--max_token_length` was not specified. If `--max_token_length` was specified, please specify `--t5xxl_max_token_length` instead. `--max_token_length` is ignored during FLUX.1 training. - -Aug 14, 2024: Sample image generation during training is now supported. Specify options such as `--sample_prompts` and `--sample_every_n_epochs`. It will be very slow when `--split_mode` is specified. - -Aug 13, 2024: - -__Experimental__ A network argument `train_blocks` is added to `lora_flux`. This is to select the target blocks of LoRA from FLUX double blocks and single blocks. Specify like `--network_args "train_blocks=single"`. `all` trains both double blocks and single blocks, `double` trains only double blocks, and `single` trains only single blocks. The default (omission) is `all`. - -This argument is available even if `--split_mode` is not specified. - -__Experimental__ `--split_mode` option is added to `flux_train_network.py`. This splits FLUX into double blocks and single blocks for training. By enabling gradients only for the single blocks part, memory usage is reduced. When this option is specified, you need to specify `"train_blocks=single"` in the network arguments. - -This option enables training with 12GB VRAM GPUs, but the training speed is 2-3 times slower than the default. - -Aug 11, 2024: Fix `--apply_t5_attn_mask` option to work. Please remove and re-generate the latents cache file if you have used the option before. - -Aug 10, 2024: LoRA key prefix is changed to `lora_unet` from `lora_flex` to make it compatible with ComfyUI. +### FLUX.1 LoRA training +We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. -### FLUX.1 LoRA training +FLUX.1 model, CLIP-L, and T5XXL models are recommended to be in bf16/fp16 format. If you specify `--fp8_base`, you can use fp8 models for FLUX.1. The fp8 model is only compatible with `float8_e4m3fn` format. -We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. Sample command is below, settings are based on [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit). It will work with 24GB VRAM GPUs. +Sample command is below. It will work with 24GB VRAM GPUs. 
``` accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py @@ -137,45 +34,106 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_t --ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 ---network_train_unet_only --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base +--cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml ---output_dir path/to/output/dir --output_name flux-lora-name --timestep_sampling sigmoid ---model_prediction_type raw --guidance_scale 1.0 --loss_type l2 +--output_dir path/to/output/dir --output_name flux-lora-name +--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 ``` (The command is multi-line for readability. Please combine it into one line.) The training can be done with 16GB VRAM GPUs with Adafactor optimizer. Please use settings like below: ``` ---optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" +--optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 ``` The training can be done with 12GB VRAM GPUs with Adafactor optimizer, `--split_mode` and `train_blocks=single` options. Please use settings like below: ``` ---optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --split_mode --network_args "train_blocks=single" +--optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --split_mode --network_args "train_blocks=single" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 ``` -LoRAs for Text Encoders are not tested yet. +We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. + +The trained LoRA model can be used with ComfyUI. + +#### Key Options for FLUX.1 LoRA training -We have added some new options (Aug 10, 2024): `--time_sampling`, `--sigmoid_scale`, `--model_prediction_type` and `--discrete_flow_shift`. The options are as follows: +There are many unknown points in FLUX.1 training, so some settings can be specified by arguments. Here are the arguments. The arguments and sample settings are still experimental and may change in the future. Feedback on the settings is welcome. -- `--timestep_sampling` is the method to sample timesteps (0-1): `sigma` (sigma-based, same as SD3), `uniform` (uniform random), or `sigmoid` (sigmoid of random normal, same as x-flux). +- `--timestep_sampling` is the method to sample timesteps (0-1): + - `sigma`: sigma-based, same as SD3 + - `uniform`: uniform random + - `sigmoid`: sigmoid of random normal, same as x-flux, AI-toolkit etc. + - `shift`: shifts the value of sigmoid of normal distribution random number - `--sigmoid_scale` is the scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). The default is 1.0. Larger values will make the sampling more uniform. 
-- `--model_prediction_type` is how to interpret and process the model prediction: `raw` (use as is, same as x-flux), `additive` (add to noisy input), `sigma_scaled` (apply sigma scaling, same as SD3). + - This option is effective even when`--timestep_sampling shift` is specified. + - Normally, leave it at 1.0. Larger values make the value before shift closer to a uniform distribution. +- `--model_prediction_type` is how to interpret and process the model prediction: + - `raw`: use as is, same as x-flux + - `additive`: add to noisy input + - `sigma_scaled`: apply sigma scaling, same as SD3 - `--discrete_flow_shift` is the discrete flow shift for the Euler Discrete Scheduler, default is 3.0 (same as SD3). -`--loss_type` may be useful for FLUX.1 training. The default is `l2`. +The existing `--loss_type` option may be useful for FLUX.1 training. The default is `l2`. -In our experiments, `--timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0` with `--loss_type l2` seems to work better than the default (SD3) settings. The multiplier of LoRA should be adjusted. +~~In our experiments, `--timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0` with `--loss_type l2` seems to work better than the default (SD3) settings. The multiplier of LoRA should be adjusted. ~~ -additional note (Aug 11): A quick check shows that the settings in [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit) seems to be equivalent to `--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0` (with the default `l2` loss_type). This seems to be a good starting point. Thanks to Ostris for the great work! +In our experiments, `--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0` with `--loss_type l2` seems to work better than other settings. + +The settings in [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit) seems to be equivalent to `--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0` (with the default `l2` loss_type). Other settings may work better, so please try different settings. -We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. +Other options are described below. -The trained LoRA model can be used with ComfyUI. +#### Distribution of timesteps + +`--timestep_sampling` and `--sigmoid_scale`, `--discrete_flow_shift` adjust the distribution of timesteps. The distribution is shown in the figures below. + +The effect of `--discrete_flow_shift` with `--timestep_sampling shift` (when `--sigmoid_scale` is not specified, the default is 1.0): + +The difference between `--timestep_sampling uniform` and `--timestep_sampling sigma`: + +The effect of `--timestep_sampling sigmoid` and `--sigmoid_scale` (when `--timestep_sampling sigmoid` is specified, `--discrete_flow_shift` is ignored): + +#### Key Features for FLUX.1 LoRA training + +1. CLIP-L LoRA Support: + - FLUX.1 LoRA training now supports CLIP-L LoRA. + - Remove `--network_train_unet_only` from your command. + - T5XXL is not trained. Its output is still cached, so `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is still required. + - The trained LoRA can be used with ComfyUI. + - Note: `flux_extract_lora.py` and `convert_flux_lora.py` do not support CLIP-L LoRA. + +2. Experimental FP8/FP16 mixed training: + - `--fp8_base_unet` enables training with fp8 for FLUX and bf16/fp16 for CLIP-L. 
+ - FLUX can be trained with fp8, and CLIP-L can be trained with bf16/fp16. + - When specifying this option, the `--fp8_base` option is automatically enabled. + +3. Split Q/K/V Projection Layers (Experimental): + - Added an option to split the projection layers of q/k/v/txt in the attention and apply LoRA to each of them. + - Specify `"split_qkv=True"` in network_args like `--network_args "split_qkv=True"` (`train_blocks` is also available). + - May increase expressiveness but also training time. + - The trained model is compatible with normal LoRA models in sd-scripts and can be used in environments like ComfyUI. + - Converting to AI-toolkit (Diffusers) format with `convert_flux_lora.py` will reduce the size. + +4. T5 Attention Mask Application: + - T5 attention mask is applied when `--apply_t5_attn_mask` is specified. + - Now applies mask when encoding T5 and in the attention of Double and Single Blocks + - Affects fine-tuning, LoRA training, and inference in `flux_minimal_inference.py`. + +5. Multi-resolution Training Support: + - FLUX.1 now supports multi-resolution training, even with caching latents to disk. + + +Technical details of Q/K/V split: + +In the implementation of Black Forest Labs' model, the projection layers of q/k/v (and txt in single blocks) are concatenated into one. If LoRA is added there as it is, the LoRA module is only one, and the dimension is large. In contrast, in the implementation of Diffusers, the projection layers of q/k/v/txt are separated. Therefore, the LoRA module is applied to q/k/v/txt separately, and the dimension is smaller. This option is for training LoRA similar to the latter. + +The compatibility of the saved model (state dict) is ensured by concatenating the weights of multiple LoRAs. However, since there are zero weights in some parts, the model size will be large. + +### Inference for FLUX.1 with LoRA model The inference script is also available. The script is `flux_minimal_inference.py`. See `--help` for options. @@ -185,6 +143,8 @@ python flux_minimal_inference.py --ckpt flux1-dev.sft --clip_l sd3/clip_l.safete ### FLUX.1 fine-tuning +The memory-efficient training with block swap is based on 2kpr's implementation. Thanks to 2kpr! + Sample command for FLUX.1 fine-tuning is below. This will work with 24GB VRAM GPUs, and 64GB main memory is recommended. ``` @@ -195,15 +155,13 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_t --dataset_config dataset_1024_bs1.toml --output_dir path/to/output/dir --output_name output-name --learning_rate 5e-5 --max_train_epochs 4 --sdpa --highvram --cache_text_encoder_outputs_to_disk --cache_latents_to_disk --save_every_n_epochs 1 --optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" ---timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0 +--lr_scheduler constant_with_warmup --max_grad_norm 0.0 +--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 --fused_backward_pass --double_blocks_to_swap 6 --cpu_offload_checkpointing --full_bf16 ``` +(The command is multi-line for readability. Please combine it into one line.) -(Combine the command into one line.) - -Sample image generation during training is not tested yet. - -Options are almost the same as LoRA training. The difference is `--blockwise_fused_optimizers`, `--double_blocks_to_swap` and `--cpu_offload_checkpointing`. `--single_blocks_to_swap` is also available. 
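As a rough picture of what `--double_blocks_to_swap` and `--single_blocks_to_swap` do, the sketch below parks the selected blocks in CPU RAM and pulls each one onto the GPU only for its own computation. It is a forward-only toy under simplified assumptions, not the repository's implementation; the real training loop also has to coordinate the swaps with the backward pass, where gradients stay on the GPU.

```
# Conceptual, forward-only sketch of block swapping (illustrative, not flux_train.py).
import torch
import torch.nn as nn

class BlockSwapModel(nn.Module):
    def __init__(self, blocks: nn.ModuleList, blocks_to_swap: int, device: torch.device):
        super().__init__()
        self.blocks = blocks
        self.device = device
        self.swap_from = len(blocks) - blocks_to_swap  # first blocks stay resident on the GPU
        for i, block in enumerate(self.blocks):
            block.to(device if i < self.swap_from else "cpu")

    @torch.no_grad()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for i, block in enumerate(self.blocks):
            if i >= self.swap_from:
                block.to(self.device)  # bring the swapped block in just before it is needed
            x = block(x)
            if i >= self.swap_from:
                block.to("cpu")  # evict it again so only one swapped block occupies VRAM
        return x
```

In this toy only one swapped block occupies VRAM at any moment, which is where the memory saving comes from; the price is the CPU-GPU transfer time added to every step.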
+Options are almost the same as LoRA training. The difference is `--full_bf16`, `--blockwise_fused_optimizers`, `--double_blocks_to_swap` and `--cpu_offload_checkpointing`. `--single_blocks_to_swap` is also available. `--full_bf16` enables the training with bf16 (weights and gradients). @@ -223,6 +181,53 @@ Swap 6 double blocks and use cpu offload checkpointing may be a good starting po The learning rate and the number of epochs are not optimized yet. Please adjust them according to the training results. +#### Key Features for FLUX.1 fine-tuning + +1. Sample Image Generation: + - Sample image generation during training is now supported. + - The prompts are cached and used for generation if `--cache_latents` is specified. So changing the prompts during training will not affect the generated images. + - Specify options such as `--sample_prompts` and `--sample_every_n_epochs`. + - Note: It will be very slow when `--split_mode` is specified. + +2. Experimental Memory-Efficient Saving: + - `--mem_eff_save` option can further reduce memory consumption during model saving (about 22GB). + - This is a custom implementation and may cause unexpected issues. Use with caution. + +3. T5XXL Token Length Control: + - Added `--t5xxl_max_token_length` option to specify the maximum token length of T5XXL. + - Default is 512 in dev and 256 in schnell models. + +4. Multi-GPU Training Support: + - Note: `--double_blocks_to_swap` and `--single_blocks_to_swap` cannot be used in multi-GPU training. + +5. Disable mmap Load for Safetensors: + - `--disable_mmap_load_safetensors` option now works in `flux_train.py`. + - Speeds up model loading during training in WSL2. + - Effective in reducing memory usage when loading models during multi-GPU training. + + +### Extract LoRA from FLUX.1 Models + +Script: `networks/flux_extract_lora.py` + +Extracts LoRA from the difference between two FLUX.1 models. + +Offers memory-efficient option with `--mem_eff_safe_open`. + +CLIP-L LoRA is not supported. + +### Convert FLUX LoRA + +Script: `convert_flux_lora.py` + +Converts LoRA between sd-scripts format (BFL-based) and AI-toolkit format (Diffusers-based). + +If you use LoRA in the inference environment, converting it to AI-toolkit format may reduce temporary memory usage. + +Note that re-conversion will increase the size of LoRA. + +CLIP-L LoRA is not supported. + ### Merge LoRA to FLUX.1 checkpoint `networks/flux_merge_lora.py` merges LoRA to FLUX.1 checkpoint. __The script is experimental.__ From daa6ad516581872aa6acaa15c0d24aad4f998838 Mon Sep 17 00:00:00 2001 From: "Kohya S." <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 29 Aug 2024 21:25:30 +0900 Subject: [PATCH 113/582] Update README.md --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a73eead0b..6e2ae3376 100644 --- a/README.md +++ b/README.md @@ -77,9 +77,9 @@ There are many unknown points in FLUX.1 training, so some settings can be specif The existing `--loss_type` option may be useful for FLUX.1 training. The default is `l2`. -~~In our experiments, `--timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0` with `--loss_type l2` seems to work better than the default (SD3) settings. The multiplier of LoRA should be adjusted. ~~ +~~In our experiments, `--timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0` with `--loss_type l2` seems to work better than the default (SD3) settings. 
The multiplier of LoRA should be adjusted.~~ -In our experiments, `--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0` with `--loss_type l2` seems to work better than other settings. +In our experiments, `--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0` (with the default `l2` loss_type) seems to work better. The settings in [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit) seems to be equivalent to `--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0` (with the default `l2` loss_type). @@ -92,10 +92,13 @@ Other options are described below. `--timestep_sampling` and `--sigmoid_scale`, `--discrete_flow_shift` adjust the distribution of timesteps. The distribution is shown in the figures below. The effect of `--discrete_flow_shift` with `--timestep_sampling shift` (when `--sigmoid_scale` is not specified, the default is 1.0): +![Figure_2](https://github.com/user-attachments/assets/d9de42f9-f17d-40da-b88d-d964402569c6) -The difference between `--timestep_sampling uniform` and `--timestep_sampling sigma`: +The difference between `--timestep_sampling sigmoid` and `--timestep_sampling uniform` (when `--timestep_sampling sigmoid` or `uniform` is specified, `--discrete_flow_shift` is ignored): +![Figure_3](https://github.com/user-attachments/assets/27029009-1f5d-4dc0-bb24-13d02ac4fdad) The effect of `--timestep_sampling sigmoid` and `--sigmoid_scale` (when `--timestep_sampling sigmoid` is specified, `--discrete_flow_shift` is ignored): +![Figure_4](https://github.com/user-attachments/assets/08a2267c-e47e-48b7-826e-f9a080787cdc) #### Key Features for FLUX.1 LoRA training From 8ecf0fc4bfd1b03cfc6fd4055af0b3363f5d1f38 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 29 Aug 2024 22:10:57 +0900 Subject: [PATCH 114/582] Refactor code to ensure args.guidance_scale is always a float #1525 --- flux_train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flux_train.py b/flux_train.py index 410728d44..32a36f036 100644 --- a/flux_train.py +++ b/flux_train.py @@ -688,8 +688,8 @@ def optimizer_hook(parameter: torch.Tensor): packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2 img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device) - # get guidance - guidance_vec = torch.full((bsz,), args.guidance_scale, device=accelerator.device) + # get guidance: ensure args.guidance_scale is float + guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) # call model l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds From 8fdfd8c857a88aaa78ac9c2488432ef8115982f2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 29 Aug 2024 22:26:29 +0900 Subject: [PATCH 115/582] Update safetensors to version 0.4.4 in requirements.txt #1524 --- README.md | 7 +++++++ requirements.txt | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6e2ae3376..30264e738 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,13 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +### Recent Updates + +Aug 29, 2024: +Please update `safetensors` to `0.4.4` to fix the error when using `--resume`. `requirements.txt` is updated. 
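A quick, illustrative pre-flight check (not something the scripts themselves run) to confirm the installed version before resuming:

```
# Naive version parse; fine for the pinned x.y.z releases in requirements.txt.
from importlib.metadata import version

installed = tuple(int(x) for x in version("safetensors").split(".")[:3])
if installed < (0, 4, 4):
    raise RuntimeError("safetensors >= 0.4.4 is required for --resume; run: pip install -U safetensors")
```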
+ +### Contents + - [FLUX.1 LoRA training](#flux1-lora-training) - [Key Options for FLUX.1 LoRA training](#key-options-for-flux1-lora-training) - [Inference for FLUX.1 LoRA model](#inference-for-flux1-lora-model) diff --git a/requirements.txt b/requirements.txt index 4ee19b3ee..4c1bc3922 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ bitsandbytes==0.43.3 prodigyopt==1.0 lion-pytorch==0.0.6 tensorboard -safetensors==0.4.2 +safetensors==0.4.4 # gradio==3.16.2 altair==4.2.2 easygui==0.98.3 From 34f2315047f8d5b89b7a8a6093bb56679bff13c3 Mon Sep 17 00:00:00 2001 From: Akegarasu Date: Thu, 29 Aug 2024 22:33:37 +0800 Subject: [PATCH 116/582] fix: text_encoder_conds referenced before assignment --- train_network.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_network.py b/train_network.py index 048c7e7bd..628c421cb 100644 --- a/train_network.py +++ b/train_network.py @@ -1081,12 +1081,12 @@ def remove_model(old_ckpt_name): # print(f"set multiplier: {multipliers}") accelerator.unwrap_model(network).set_multiplier(multipliers) + text_encoder_conds = [] text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: text_encoder_conds = text_encoder_outputs_list # List of text encoder outputs if ( - text_encoder_conds is None - or len(text_encoder_conds) == 0 + len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder ): From 35882f8d5bbd076a97622cf6193c988621481803 Mon Sep 17 00:00:00 2001 From: Akegarasu Date: Thu, 29 Aug 2024 23:03:43 +0800 Subject: [PATCH 117/582] fix --- train_network.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/train_network.py b/train_network.py index 628c421cb..4204bce34 100644 --- a/train_network.py +++ b/train_network.py @@ -1112,10 +1112,14 @@ def remove_model(old_ckpt_name): if args.full_fp16: encoded_text_encoder_conds = [c.to(weight_dtype) for c in encoded_text_encoder_conds] - # if encoded_text_encoder_conds is not None, update cached text_encoder_conds - for i in range(len(encoded_text_encoder_conds)): - if encoded_text_encoder_conds[i] is not None: - text_encoder_conds[i] = encoded_text_encoder_conds[i] + # if text_encoder_conds is not cached, use encoded_text_encoder_conds + if len(text_encoder_conds) == 0: + text_encoder_conds = encoded_text_encoder_conds + else: + # if encoded_text_encoder_conds is not None, update cached text_encoder_conds + for i in range(len(encoded_text_encoder_conds)): + if encoded_text_encoder_conds[i] is not None: + text_encoder_conds[i] = encoded_text_encoder_conds[i] # sample noise, call unet, get target noise_pred, target, timesteps, huber_c, weighting = self.get_noise_pred_and_target( From 25c9040f4fbbcbddc0297895369337846152fea4 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Sat, 31 Aug 2024 03:05:19 +0800 Subject: [PATCH 118/582] Update flux_train_utils.py --- library/flux_train_utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index a8e94ac00..735bcced7 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -371,7 +371,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): def get_noisy_model_input_and_timesteps( args, noise_scheduler, latents, noise, device, dtype ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - bsz = latents.shape[0] + bsz, _, H, W = latents.shape sigmas = None if args.timestep_sampling 
== "uniform" or args.timestep_sampling == "sigmoid": @@ -392,6 +392,16 @@ def get_noisy_model_input_and_timesteps( timesteps = logits_norm.sigmoid() timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps) + t = timesteps.view(-1, 1, 1, 1) + timesteps = timesteps * 1000.0 + noisy_model_input = (1 - t) * latents + t * noise + elif args.timestep_sampling == "flux_shift": + logits_norm = torch.randn(bsz, device=device) + logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling + timesteps = logits_norm.sigmoid() + mu=get_lin_function(y1=0.5, y2=1.15)((H//2) * (W//2)) + timesteps = time_shift(mu, 1.0, timesteps) + t = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 noisy_model_input = (1 - t) * latents + t * noise @@ -571,7 +581,7 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--timestep_sampling", - choices=["sigma", "uniform", "sigmoid", "shift"], + choices=["sigma", "uniform", "sigmoid", "shift", "flux_shift"], default="sigma", help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal and shift of sigmoid." " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト。", From ef510b3cb94427d72df681389e1214251813b1a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= <865105819@qq.com> Date: Sun, 1 Sep 2024 17:41:01 +0800 Subject: [PATCH 119/582] Sd3 freeze x_block (#1417) * Update sd3_train.py * add freeze block lr * Update train_util.py * update --- library/train_util.py | 21 +++++++++++++++++++++ sd3_train.py | 9 ++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 989758ad5..74aae0a79 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3246,6 +3246,12 @@ def add_sd_models_arguments(parser: argparse.ArgumentParser): default=None, help="directory for caching Tokenizer (for offline training) / Tokenizerをキャッシュするディレクトリ(ネット接続なしでの学習のため)", ) + parser.add_argument( + "--num_last_block_to_freeze", + type=int, + default=None, + help="num_last_block_to_freeze", + ) def add_optimizer_arguments(parser: argparse.ArgumentParser): @@ -5758,6 +5764,21 @@ def sample_image_inference( pass +def freeze_blocks(model, num_last_block_to_freeze, block_name="x_block"): + + filtered_blocks = [(name, param) for name, param in model.named_parameters() if block_name in name] + print(f"filtered_blocks: {len(filtered_blocks)}") + + num_blocks_to_freeze = min(len(filtered_blocks), num_last_block_to_freeze) + + print(f"freeze_blocks: {num_blocks_to_freeze}") + + start_freezing_from = max(0, len(filtered_blocks) - num_blocks_to_freeze) + + for i in range(start_freezing_from, len(filtered_blocks)): + _, param = filtered_blocks[i] + param.requires_grad = False + # endregion diff --git a/sd3_train.py b/sd3_train.py index 3b6c8a118..ce9500b0b 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -368,12 +368,19 @@ def train(args): vae.eval() vae.to(accelerator.device, dtype=vae_dtype) + mmdit.requires_grad_(train_mmdit) + if not train_mmdit: + mmdit.to(accelerator.device, dtype=weight_dtype) # because of unet is not prepared + + if args.num_last_block_to_freeze: + train_util.freeze_blocks(mmdit,num_last_block_to_freeze=args.num_last_block_to_freeze) + training_models = [] params_to_optimize = [] # if train_unet: training_models.append(mmdit) # if block_lrs is None: - params_to_optimize.append({"params": list(mmdit.parameters()), "lr": args.learning_rate}) + 
params_to_optimize.append({"params": list(filter(lambda p: p.requires_grad, mmdit.parameters())), "lr": args.learning_rate}) # else: # params_to_optimize.extend(get_block_params_to_optimize(mmdit, block_lrs)) From 92e7600cc2fea604321004f260e7db76c764f388 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Sep 2024 18:57:07 +0900 Subject: [PATCH 120/582] Move freeze_blocks to sd3_train because it's only for sd3 --- README.md | 3 +++ library/train_util.py | 21 --------------------- sd3_train.py | 22 ++++++++++++++++++++-- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 30264e738..d96367194 100644 --- a/README.md +++ b/README.md @@ -309,6 +309,9 @@ resolution = [512, 512] SD3 training is done with `sd3_train.py`. +__Sep 1, 2024__: +- `--num_last_block_to_freeze` is added to `sd3_train.py`. This option is to freeze the last n blocks of the MMDiT. See [#1417](https://github.com/kohya-ss/sd-scripts/pull/1417) for details. Thanks to sdbds! + __Jul 27, 2024__: - Latents and text encoder outputs caching mechanism is refactored significantly. - Existing cache files for SD3 need to be recreated. Please delete the previous cache files. diff --git a/library/train_util.py b/library/train_util.py index 74aae0a79..989758ad5 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3246,12 +3246,6 @@ def add_sd_models_arguments(parser: argparse.ArgumentParser): default=None, help="directory for caching Tokenizer (for offline training) / Tokenizerをキャッシュするディレクトリ(ネット接続なしでの学習のため)", ) - parser.add_argument( - "--num_last_block_to_freeze", - type=int, - default=None, - help="num_last_block_to_freeze", - ) def add_optimizer_arguments(parser: argparse.ArgumentParser): @@ -5764,21 +5758,6 @@ def sample_image_inference( pass -def freeze_blocks(model, num_last_block_to_freeze, block_name="x_block"): - - filtered_blocks = [(name, param) for name, param in model.named_parameters() if block_name in name] - print(f"filtered_blocks: {len(filtered_blocks)}") - - num_blocks_to_freeze = min(len(filtered_blocks), num_last_block_to_freeze) - - print(f"freeze_blocks: {num_blocks_to_freeze}") - - start_freezing_from = max(0, len(filtered_blocks) - num_blocks_to_freeze) - - for i in range(start_freezing_from, len(filtered_blocks)): - _, param = filtered_blocks[i] - param.requires_grad = False - # endregion diff --git a/sd3_train.py b/sd3_train.py index ce9500b0b..87011b215 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -373,7 +373,20 @@ def train(args): mmdit.to(accelerator.device, dtype=weight_dtype) # because of unet is not prepared if args.num_last_block_to_freeze: - train_util.freeze_blocks(mmdit,num_last_block_to_freeze=args.num_last_block_to_freeze) + # freeze last n blocks of MM-DIT + block_name = "x_block" + filtered_blocks = [(name, param) for name, param in mmdit.named_parameters() if block_name in name] + accelerator.print(f"filtered_blocks: {len(filtered_blocks)}") + + num_blocks_to_freeze = min(len(filtered_blocks), args.num_last_block_to_freeze) + + accelerator.print(f"freeze_blocks: {num_blocks_to_freeze}") + + start_freezing_from = max(0, len(filtered_blocks) - num_blocks_to_freeze) + + for i in range(start_freezing_from, len(filtered_blocks)): + _, param = filtered_blocks[i] + param.requires_grad = False training_models = [] params_to_optimize = [] @@ -1033,12 +1046,17 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="number of optimizers for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizer数", ) - 
parser.add_argument( "--skip_latents_validity_check", action="store_true", help="skip latents validity check / latentsの正当性チェックをスキップする", ) + parser.add_argument( + "--num_last_block_to_freeze", + type=int, + default=None, + help="freeze last n blocks of MM-DIT / MM-DITの最後のnブロックを凍結する", + ) return parser From 4f6d915d15262447b1049a78a55678b2825784a3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Sep 2024 19:12:29 +0900 Subject: [PATCH 121/582] update help and README --- README.md | 5 +++++ library/flux_train_utils.py | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d96367194..331951ef4 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 1, 2024: +- `--timestamp_sampling` has `flux_shift` option. Thanks to sdbds! + - This is the same shift as FLUX.1 dev inference, adjusting the timestep sampling depending on the resolution. `--discrete_flow_shift` is ignored when `flux_shift` is specified. It is not verified which is better, `shift` or `flux_shift`. + Aug 29, 2024: Please update `safetensors` to `0.4.4` to fix the error when using `--resume`. `requirements.txt` is updated. @@ -73,6 +77,7 @@ There are many unknown points in FLUX.1 training, so some settings can be specif - `uniform`: uniform random - `sigmoid`: sigmoid of random normal, same as x-flux, AI-toolkit etc. - `shift`: shifts the value of sigmoid of normal distribution random number + - `flux_shift`: shifts the value of sigmoid of normal distribution random number, depending on the resolution (same as FLUX.1 dev inference). `--discrete_flow_shift` is ignored when `flux_shift` is specified. - `--sigmoid_scale` is the scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). The default is 1.0. Larger values will make the sampling more uniform. - This option is effective even when`--timestep_sampling shift` is specified. - Normally, leave it at 1.0. Larger values make the value before shift closer to a uniform distribution. diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 735bcced7..9dad4baa2 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -371,7 +371,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): def get_noisy_model_input_and_timesteps( args, noise_scheduler, latents, noise, device, dtype ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - bsz, _, H, W = latents.shape + bsz, _, h, w = latents.shape sigmas = None if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": @@ -399,7 +399,7 @@ def get_noisy_model_input_and_timesteps( logits_norm = torch.randn(bsz, device=device) logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling timesteps = logits_norm.sigmoid() - mu=get_lin_function(y1=0.5, y2=1.15)((H//2) * (W//2)) + mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2)) timesteps = time_shift(mu, 1.0, timesteps) t = timesteps.view(-1, 1, 1, 1) @@ -583,8 +583,8 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): "--timestep_sampling", choices=["sigma", "uniform", "sigmoid", "shift", "flux_shift"], default="sigma", - help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal and shift of sigmoid." 
- " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト。", + help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal, shift of sigmoid and FLUX.1 shifting." + " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト、FLUX.1のシフト。", ) parser.add_argument( "--sigmoid_scale", From 6abacf04da756808ffca567f6660445ecdf478bd Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 2 Sep 2024 13:05:26 +0900 Subject: [PATCH 122/582] update README --- README.md | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 331951ef4..5dd916aa0 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ Options are almost the same as LoRA training. The difference is `--full_bf16`, ` `--blockwise_fused_optimizers` enables the fusing of the optimizer step into the backward pass for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency. `--blockwise_fused_optimizers` cannot be used with `--fused_backward_pass`. Stochastic rounding is not supported for now. -`--double_blocks_to_swap` and `--single_blocks_to_swap` are the number of double blocks and single blocks to swap. The default is None (no swap). These options must be combined with `--fused_backward_pass` or `--blockwise_fused_optimizers`. `--double_blocks_to_swap` can be specified with `--single_blocks_to_swap`. The recommended maximum number of blocks to swap is 9 for double blocks and 18 for single blocks. +`--double_blocks_to_swap` and `--single_blocks_to_swap` are the number of double blocks and single blocks to swap. The default is None (no swap). These options must be combined with `--fused_backward_pass` or `--blockwise_fused_optimizers`. `--double_blocks_to_swap` can be specified with `--single_blocks_to_swap`. The recommended maximum number of blocks to swap is 9 for double blocks and 18 for single blocks. Please see the next chapter for details. `--cpu_offload_checkpointing` is to offload the gradient checkpointing to CPU. This reduces about 2GB of VRAM usage. @@ -198,24 +198,32 @@ The learning rate and the number of epochs are not optimized yet. Please adjust #### Key Features for FLUX.1 fine-tuning -1. Sample Image Generation: +1. Technical details of double/single block swap: + - Reduce memory usage by transferring double and single blocks of FLUX.1 from GPU to CPU when they are not needed. + - During forward pass, the weights of the blocks that have finished calculation are transferred to CPU, and the weights of the blocks to be calculated are transferred to GPU. + - The same is true for the backward pass, but the order is reversed. The gradients remain on the GPU. + - Since the transfer between CPU and GPU takes time, the training will be slower. + - `--double_blocks_to_swap` and `--single_blocks_to_swap` specify the number of blocks to swap. For example, `--double_blocks_to_swap 6` swaps 6 blocks at each step of training, but the remaining 13 blocks are always on the GPU. + - About 640MB of memory can be saved per double block, and about 320MB of memory can be saved per single block. + +2. Sample Image Generation: - Sample image generation during training is now supported. - The prompts are cached and used for generation if `--cache_latents` is specified. So changing the prompts during training will not affect the generated images. - Specify options such as `--sample_prompts` and `--sample_every_n_epochs`. 
- Note: It will be very slow when `--split_mode` is specified. -2. Experimental Memory-Efficient Saving: +3. Experimental Memory-Efficient Saving: - `--mem_eff_save` option can further reduce memory consumption during model saving (about 22GB). - This is a custom implementation and may cause unexpected issues. Use with caution. -3. T5XXL Token Length Control: +4. T5XXL Token Length Control: - Added `--t5xxl_max_token_length` option to specify the maximum token length of T5XXL. - Default is 512 in dev and 256 in schnell models. -4. Multi-GPU Training Support: +5. Multi-GPU Training Support: - Note: `--double_blocks_to_swap` and `--single_blocks_to_swap` cannot be used in multi-GPU training. -5. Disable mmap Load for Safetensors: +6. Disable mmap Load for Safetensors: - `--disable_mmap_load_safetensors` option now works in `flux_train.py`. - Speeds up model loading during training in WSL2. - Effective in reducing memory usage when loading models during multi-GPU training. From b65ae9b439e4324359014d6d720aa01def3a19fc Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 4 Sep 2024 21:33:17 +0900 Subject: [PATCH 123/582] T5XXL LoRA training, fp8 T5XXL support --- README.md | 45 +++++++++++---- flux_train_network.py | 112 +++++++++++++++++++++++++++++------- library/flux_train_utils.py | 23 ++++++-- library/flux_utils.py | 9 ++- library/strategy_flux.py | 13 ++++- networks/lora_flux.py | 39 ++++++++++--- train_network.py | 48 ++++++++++------ 7 files changed, 222 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 5dd916aa0..840655705 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,11 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 4, 2024: +- T5XXL LoRA is supported in LoRA training. Remove `--network_train_unet_only` and add `train_t5xxl=True` to `--network_args`. CLIP-L is also trained at the same time (T5XXL only cannot be trained). The trained model can be used with ComfyUI. +- In LoRA training, when `--fp8_base` is specified, you can specify `t5xxl_fp8_e4m3fn.safetensors` as the T5XXL weights. However, it is recommended to use fp16 weights for caching. +- Fixed an issue where the training CLIP-L LoRA was not used in sample image generation during LoRA training. + Sep 1, 2024: - `--timestamp_sampling` has `flux_shift` option. Thanks to sdbds! - This is the same shift as FLUX.1 dev inference, adjusting the timestep sampling depending on the resolution. `--discrete_flow_shift` is ignored when `flux_shift` is specified. It is not verified which is better, `shift` or `flux_shift`. @@ -41,8 +46,8 @@ Sample command is below. It will work with 24GB VRAM GPUs. ``` accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py ---pretrained_model_name_or_path flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors ---ae ae.sft --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers +--pretrained_model_name_or_path flux1-dev.safetensors --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors +--ae ae.safetensors --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base @@ -72,6 +77,11 @@ The trained LoRA model can be used with ComfyUI. 
There are many unknown points in FLUX.1 training, so some settings can be specified by arguments. Here are the arguments. The arguments and sample settings are still experimental and may change in the future. Feedback on the settings is welcome. +- `--pretrained_model_name_or_path` is the path to the pretrained model (FLUX.1). bf16 (original BFL model) is recommended (`flux1-dev.safetensors` or `flux1-dev.sft`). If you specify `--fp8_base`, you can use fp8 models for FLUX.1. The fp8 model is only compatible with `float8_e4m3fn` format. +- `--clip_l` is the path to the CLIP-L model. +- `--t5xxl` is the path to the T5XXL model. If you specify `--fp8_base`, you can use fp8 (float8_e4m3fn) models for T5XXL. However, it is recommended to use fp16 models for caching. +- `--ae` is the path to the autoencoder model (`ae.safetensors` or `ae.sft`). + - `--timestep_sampling` is the method to sample timesteps (0-1): - `sigma`: sigma-based, same as SD3 - `uniform`: uniform random @@ -114,16 +124,29 @@ The effect of `--timestep_sampling sigmoid` and `--sigmoid_scale` (when `--times #### Key Features for FLUX.1 LoRA training -1. CLIP-L LoRA Support: - - FLUX.1 LoRA training now supports CLIP-L LoRA. +1. CLIP-L and T5XXL LoRA Support: + - FLUX.1 LoRA training now supports CLIP-L and T5XXL LoRA training. - Remove `--network_train_unet_only` from your command. - - T5XXL is not trained. Its output is still cached, so `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is still required. + - Add `train_t5xxl=True` to `--network_args` to train T5XXL LoRA. CLIP-L is also trained at the same time. + - T5XXL output can be cached for CLIP-L LoRA training. So, `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. - The trained LoRA can be used with ComfyUI. - - Note: `flux_extract_lora.py` and `convert_flux_lora.py` do not support CLIP-L LoRA. + - Note: `flux_extract_lora.py`, `convert_flux_lora.py`and `merge_flux_lora.py` do not support CLIP-L and T5XXL LoRA yet. + + | trained LoRA|option|network_args|cache_text_encoder_outputs (*1)| + |---|---|---|---| + |FLUX.1|`--network_train_unet_only`|-|o| + |FLUX.1 + CLIP-L|-|-|o (*2)| + |FLUX.1 + CLIP-L + T5XXL|-|`train_t5xxl=True`|-| + |CLIP-L (*3)|`--network_train_text_encoder_only`|-|o (*2)| + |CLIP-L + T5XXL (*3)|`--network_train_text_encoder_only`|`train_t5xxl=True`|-| + + - *1: `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. + - *2: T5XXL output can be cached for CLIP-L LoRA training. + - *3: Not tested yet. 2. Experimental FP8/FP16 mixed training: - - `--fp8_base_unet` enables training with fp8 for FLUX and bf16/fp16 for CLIP-L. - - FLUX can be trained with fp8, and CLIP-L can be trained with bf16/fp16. + - `--fp8_base_unet` enables training with fp8 for FLUX and bf16/fp16 for CLIP-L/T5XXL. + - FLUX can be trained with fp8, and CLIP-L/T5XXL can be trained with bf16/fp16. - When specifying this option, the `--fp8_base` option is automatically enabled. 3. Split Q/K/V Projection Layers (Experimental): @@ -153,7 +176,7 @@ The compatibility of the saved model (state dict) is ensured by concatenating th The inference script is also available. The script is `flux_minimal_inference.py`. See `--help` for options. 
``` -python flux_minimal_inference.py --ckpt flux1-dev.sft --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.sft --dtype bf16 --prompt "a cat holding a sign that says hello world" --out path/to/output/dir --seed 1 --flux_dtype fp8 --offload --lora lora-flux-name.safetensors;1.0 +python flux_minimal_inference.py --ckpt flux1-dev.safetensors --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.safetensors --dtype bf16 --prompt "a cat holding a sign that says hello world" --out path/to/output/dir --seed 1 --flux_dtype fp8 --offload --lora lora-flux-name.safetensors;1.0 ``` ### FLUX.1 fine-tuning @@ -164,7 +187,7 @@ Sample command for FLUX.1 fine-tuning is below. This will work with 24GB VRAM GP ``` accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train.py ---pretrained_model_name_or_path flux1-dev.sft --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --ae ae_dev.sft +--pretrained_model_name_or_path flux1-dev.safetensors --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --ae ae_dev.safetensors --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 --dataset_config dataset_1024_bs1.toml --output_dir path/to/output/dir --output_name output-name @@ -256,7 +279,7 @@ CLIP-L LoRA is not supported. `networks/flux_merge_lora.py` merges LoRA to FLUX.1 checkpoint. __The script is experimental.__ ``` -python networks/flux_merge_lora.py --flux_model flux1-dev.sft --save_to output.safetensors --models lora1.safetensors --ratios 2.0 --save_precision fp16 --loading_device cuda --working_device cpu +python networks/flux_merge_lora.py --flux_model flux1-dev.safetensors --save_to output.safetensors --models lora1.safetensors --ratios 2.0 --save_precision fp16 --loading_device cuda --working_device cpu ``` You can also merge multiple LoRA models into a FLUX.1 model. Specify multiple LoRA models in `--models`. Specify the same number of ratios in `--ratios`. diff --git a/flux_train_network.py b/flux_train_network.py index 354a8c6f3..2fc0f3234 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -43,13 +43,9 @@ def assert_extra_args(self, args, train_dataset_group): train_dataset_group.is_text_encoder_output_cacheable() ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" - # assert ( - # args.network_train_unet_only or not args.cache_text_encoder_outputs - # ), "network for Text Encoder cannot be trained with caching Text Encoder outputs / Text Encoderの出力をキャッシュしながらText Encoderのネットワークを学習することはできません" - if not args.network_train_unet_only: - logger.info( - "network for CLIP-L only will be trained. 
T5XXL will not be trained / CLIP-Lのネットワークのみが学習されます。T5XXLは学習されません" - ) + # prepare CLIP-L/T5XXL training flags + self.train_clip_l = not args.network_train_unet_only + self.train_t5xxl = False # default is False even if args.network_train_unet_only is False if args.max_token_length is not None: logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません") @@ -63,12 +59,10 @@ def load_target_model(self, args, weight_dtype, accelerator): # currently offload to cpu for some models name = self.get_flux_model_name(args) - # if we load to cpu, flux.to(fp8) takes a long time - if args.fp8_base: - loading_dtype = None # as is - else: - loading_dtype = weight_dtype + # if the file is fp8 and we are using fp8_base, we can load it as is (fp8) + loading_dtype = None if args.fp8_base else weight_dtype + # if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future model = flux_utils.load_flow_model( name, args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors ) @@ -85,9 +79,21 @@ def load_target_model(self, args, weight_dtype, accelerator): clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) clip_l.eval() + # if the file is fp8 and we are using fp8_base (not unet), we can load it as is (fp8) + if args.fp8_base and not args.fp8_base_unet: + loading_dtype = None # as is + else: + loading_dtype = weight_dtype + # loading t5xxl to cpu takes a long time, so we should load to gpu in future - t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + t5xxl = flux_utils.load_t5xxl(args.t5xxl, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) t5xxl.eval() + if args.fp8_base and not args.fp8_base_unet: + # check dtype of model + if t5xxl.dtype == torch.float8_e4m3fnuz or t5xxl.dtype == torch.float8_e5m2 or t5xxl.dtype == torch.float8_e5m2fnuz: + raise ValueError(f"Unsupported fp8 model dtype: {t5xxl.dtype}") + elif t5xxl.dtype == torch.float8_e4m3fn: + logger.info("Loaded fp8 T5XXL model") ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) @@ -154,25 +160,35 @@ def get_latents_caching_strategy(self, args): def get_text_encoding_strategy(self, args): return strategy_flux.FluxTextEncodingStrategy(apply_t5_attn_mask=args.apply_t5_attn_mask) + def post_process_network(self, args, accelerator, network, text_encoders, unet): + # check t5xxl is trained or not + self.train_t5xxl = network.train_t5xxl + + if self.train_t5xxl and args.cache_text_encoder_outputs: + raise ValueError( + "T5XXL is trained, so cache_text_encoder_outputs cannot be used / T5XXL学習時はcache_text_encoder_outputsは使用できません" + ) + def get_models_for_text_encoding(self, args, accelerator, text_encoders): if args.cache_text_encoder_outputs: - if self.is_train_text_encoder(args): + if self.train_clip_l and not self.train_t5xxl: return text_encoders[0:1] # only CLIP-L is needed for encoding because T5XXL is cached else: - return text_encoders # ignored + return None # no text encoders are needed for encoding because both are cached else: return text_encoders # both CLIP-L and T5XXL are needed for encoding def get_text_encoders_train_flags(self, args, text_encoders): - return [True, False] if self.is_train_text_encoder(args) else [False, False] + return [self.train_clip_l, self.train_t5xxl] def get_text_encoder_outputs_caching_strategy(self, args): 
if args.cache_text_encoder_outputs: + # if the text encoders is trained, we need tokenization, so is_partial is True return strategy_flux.FluxTextEncoderOutputsCachingStrategy( args.cache_text_encoder_outputs_to_disk, None, False, - is_partial=self.is_train_text_encoder(args), + is_partial=self.train_clip_l or self.train_t5xxl, apply_t5_attn_mask=args.apply_t5_attn_mask, ) else: @@ -193,8 +209,16 @@ def cache_text_encoder_outputs_if_needed( # When TE is not be trained, it will not be prepared so we need to use explicit autocast logger.info("move text encoders to gpu") - text_encoders[0].to(accelerator.device, dtype=weight_dtype) - text_encoders[1].to(accelerator.device, dtype=weight_dtype) + text_encoders[0].to(accelerator.device, dtype=weight_dtype) # always not fp8 + text_encoders[1].to(accelerator.device) + + if text_encoders[1].dtype == torch.float8_e4m3fn: + # if we load fp8 weights, the model is already fp8, so we use it as is + self.prepare_text_encoder_fp8(1, text_encoders[1], text_encoders[1].dtype, weight_dtype) + else: + # otherwise, we need to convert it to target dtype + text_encoders[1].to(weight_dtype) + with accelerator.autocast(): dataset.new_cache_text_encoder_outputs(text_encoders, accelerator.is_main_process) @@ -235,7 +259,7 @@ def cache_text_encoder_outputs_if_needed( else: # Text Encoderから毎回出力を取得するので、GPUに乗せておく text_encoders[0].to(accelerator.device, dtype=weight_dtype) - text_encoders[1].to(accelerator.device, dtype=weight_dtype) + text_encoders[1].to(accelerator.device) # def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): # noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype @@ -255,9 +279,12 @@ def cache_text_encoder_outputs_if_needed( # return noise_pred def sample_images(self, accelerator, args, epoch, global_step, device, ae, tokenizer, text_encoder, flux): + text_encoders = text_encoder # for compatibility + text_encoders = self.get_models_for_text_encoding(args, accelerator, text_encoders) + if not args.split_mode: flux_train_utils.sample_images( - accelerator, args, epoch, global_step, flux, ae, text_encoder, self.sample_prompts_te_outputs + accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs ) return @@ -281,7 +308,7 @@ def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None, txt_a wrapper = FluxUpperLowerWrapper(self.flux_upper, flux, accelerator.device) clean_memory_on_device(accelerator.device) flux_train_utils.sample_images( - accelerator, args, epoch, global_step, wrapper, ae, text_encoder, self.sample_prompts_te_outputs + accelerator, args, epoch, global_step, wrapper, ae, text_encoders, self.sample_prompts_te_outputs ) clean_memory_on_device(accelerator.device) @@ -421,6 +448,47 @@ def update_metadata(self, metadata, args): def is_text_encoder_not_needed_for_training(self, args): return args.cache_text_encoder_outputs and not self.is_train_text_encoder(args) + def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): + if index == 0: # CLIP-L + return super().prepare_text_encoder_grad_ckpt_workaround(index, text_encoder) + else: # T5XXL + text_encoder.encoder.embed_tokens.requires_grad_(True) + + def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): + if index == 0: # CLIP-L + logger.info(f"prepare CLIP-L for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}") + text_encoder.to(te_weight_dtype) # fp8 + 
text_encoder.text_model.embeddings.to(dtype=weight_dtype) + else: # T5XXL + + def prepare_fp8(text_encoder, target_dtype): + def forward_hook(module): + def forward(hidden_states): + hidden_gelu = module.act(module.wi_0(hidden_states)) + hidden_linear = module.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = module.dropout(hidden_states) + + hidden_states = module.wo(hidden_states) + return hidden_states + + return forward + + for module in text_encoder.modules(): + if module.__class__.__name__ in ["T5LayerNorm", "Embedding"]: + # print("set", module.__class__.__name__, "to", target_dtype) + module.to(target_dtype) + if module.__class__.__name__ in ["T5DenseGatedActDense"]: + # print("set", module.__class__.__name__, "hooks") + module.forward = forward_hook(module) + + if flux_utils.get_t5xxl_actual_dtype(text_encoder) == torch.float8_e4m3fn and text_encoder.dtype == weight_dtype: + logger.info(f"T5XXL already prepared for fp8") + else: + logger.info(f"prepare T5XXL for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}, add hooks") + text_encoder.to(te_weight_dtype) # fp8 + prepare_fp8(text_encoder, weight_dtype) + def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 9dad4baa2..0b5d4d90e 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -85,7 +85,7 @@ def sample_images( if distributed_state.num_processes <= 1: # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts. - with torch.no_grad(): + with torch.no_grad(), accelerator.autocast(): for prompt_dict in prompts: sample_image_inference( accelerator, @@ -187,14 +187,27 @@ def sample_image_inference( tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() + text_encoder_conds = [] if sample_prompts_te_outputs and prompt in sample_prompts_te_outputs: - te_outputs = sample_prompts_te_outputs[prompt] - else: + text_encoder_conds = sample_prompts_te_outputs[prompt] + print(f"Using cached text encoder outputs for prompt: {prompt}") + if text_encoders is not None: + print(f"Encoding prompt: {prompt}") tokens_and_masks = tokenize_strategy.tokenize(prompt) # strategy has apply_t5_attn_mask option - te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) + encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) + print([x.shape if x is not None else None for x in encoded_text_encoder_conds]) + + # if text_encoder_conds is not cached, use encoded_text_encoder_conds + if len(text_encoder_conds) == 0: + text_encoder_conds = encoded_text_encoder_conds + else: + # if encoded_text_encoder_conds is not None, update cached text_encoder_conds + for i in range(len(encoded_text_encoder_conds)): + if encoded_text_encoder_conds[i] is not None: + text_encoder_conds[i] = encoded_text_encoder_conds[i] - l_pooled, t5_out, txt_ids, t5_attn_mask = te_outputs + l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds # sample image weight_dtype = ae.dtype # TOFO give dtype as argument diff --git a/library/flux_utils.py b/library/flux_utils.py index 680836168..7b0a41a8a 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -171,7 +171,9 @@ def load_clip_l(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.dev return 
clip -def load_t5xxl(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False) -> T5EncoderModel: +def load_t5xxl( + ckpt_path: str, dtype: Optional[torch.dtype], device: Union[str, torch.device], disable_mmap: bool = False +) -> T5EncoderModel: T5_CONFIG_JSON = """ { "architectures": [ @@ -217,6 +219,11 @@ def load_t5xxl(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.devi return t5xxl +def get_t5xxl_actual_dtype(t5xxl: T5EncoderModel) -> torch.dtype: + # nn.Embedding is the first layer, but it could be casted to bfloat16 or float32 + return t5xxl.encoder.block[0].layer[0].SelfAttention.q.weight.dtype + + def prepare_img_ids(batch_size: int, packed_latent_height: int, packed_latent_width: int): img_ids = torch.zeros(packed_latent_height, packed_latent_width, 3) img_ids[..., 1] = img_ids[..., 1] + torch.arange(packed_latent_height)[:, None] diff --git a/library/strategy_flux.py b/library/strategy_flux.py index 5d0839132..6c9ef5e4a 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -5,8 +5,7 @@ import numpy as np from transformers import CLIPTokenizer, T5TokenizerFast -from library import sd3_utils, train_util -from library import sd3_models +from library import flux_utils, train_util from library.strategy_base import LatentsCachingStrategy, TextEncodingStrategy, TokenizeStrategy, TextEncoderOutputsCachingStrategy from library.utils import setup_logging @@ -100,6 +99,8 @@ def __init__( super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check, is_partial) self.apply_t5_attn_mask = apply_t5_attn_mask + self.warn_fp8_weights = False + def get_outputs_npz_path(self, image_abs_path: str) -> str: return os.path.splitext(image_abs_path)[0] + FluxTextEncoderOutputsCachingStrategy.FLUX_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX @@ -144,6 +145,14 @@ def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: def cache_batch_outputs( self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, infos: List ): + if not self.warn_fp8_weights: + if flux_utils.get_t5xxl_actual_dtype(models[1]) == torch.float8_e4m3fn: + logger.warning( + "T5 model is using fp8 weights for caching. This may affect the quality of the cached outputs." + " / T5モデルはfp8の重みを使用しています。これはキャッシュの品質に影響を与える可能性があります。" + ) + self.warn_fp8_weights = True + flux_text_encoding_strategy: FluxTextEncodingStrategy = text_encoding_strategy captions = [info.caption for info in infos] diff --git a/networks/lora_flux.py b/networks/lora_flux.py index fcb56a467..295267beb 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -330,6 +330,11 @@ def create_network( if split_qkv is not None: split_qkv = True if split_qkv == "True" else False + # train T5XXL + train_t5xxl = kwargs.get("train_t5xxl", False) + if train_t5xxl is not None: + train_t5xxl = True if train_t5xxl == "True" else False + # すごく引数が多いな ( ^ω^)・・・ network = LoRANetwork( text_encoders, @@ -344,6 +349,7 @@ def create_network( conv_alpha=conv_alpha, train_blocks=train_blocks, split_qkv=split_qkv, + train_t5xxl=train_t5xxl, varbose=True, ) @@ -370,9 +376,10 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh else: weights_sd = torch.load(file, map_location="cpu") - # get dim/alpha mapping + # get dim/alpha mapping, and train t5xxl modules_dim = {} modules_alpha = {} + train_t5xxl = None for key, value in weights_sd.items(): if "." 
not in key: continue @@ -385,6 +392,12 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh modules_dim[lora_name] = dim # logger.info(lora_name, value.size(), dim) + if train_t5xxl is None: + train_t5xxl = "lora_te3" in lora_name + + if train_t5xxl is None: + train_t5xxl = False + # # split qkv # double_qkv_rank = None # single_qkv_rank = None @@ -413,6 +426,7 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh modules_alpha=modules_alpha, module_class=module_class, split_qkv=split_qkv, + train_t5xxl=train_t5xxl, ) return network, weights_sd @@ -421,10 +435,10 @@ class LoRANetwork(torch.nn.Module): # FLUX_TARGET_REPLACE_MODULE = ["DoubleStreamBlock", "SingleStreamBlock"] FLUX_TARGET_REPLACE_MODULE_DOUBLE = ["DoubleStreamBlock"] FLUX_TARGET_REPLACE_MODULE_SINGLE = ["SingleStreamBlock"] - TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP"] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP", "T5Attention", "T5DenseGatedActDense"] LORA_PREFIX_FLUX = "lora_unet" # make ComfyUI compatible LORA_PREFIX_TEXT_ENCODER_CLIP = "lora_te1" - LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te2" + LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te3" # make ComfyUI compatible def __init__( self, @@ -443,6 +457,7 @@ def __init__( modules_alpha: Optional[Dict[str, int]] = None, train_blocks: Optional[str] = None, split_qkv: bool = False, + train_t5xxl: bool = False, varbose: Optional[bool] = False, ) -> None: super().__init__() @@ -457,6 +472,7 @@ def __init__( self.module_dropout = module_dropout self.train_blocks = train_blocks if train_blocks is not None else "all" self.split_qkv = split_qkv + self.train_t5xxl = train_t5xxl self.loraplus_lr_ratio = None self.loraplus_unet_lr_ratio = None @@ -469,12 +485,16 @@ def __init__( logger.info( f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" ) - if self.conv_lora_dim is not None: - logger.info( - f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" - ) + # if self.conv_lora_dim is not None: + # logger.info( + # f"apply LoRA to Conv2d with kernel size (3,3). 
dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" + # ) if self.split_qkv: logger.info(f"split qkv for LoRA") + if self.train_blocks is not None: + logger.info(f"train {self.train_blocks} blocks only") + if train_t5xxl: + logger.info(f"train T5XXL as well") # create module instances def create_modules( @@ -550,12 +570,15 @@ def create_modules( skipped_te = [] for i, text_encoder in enumerate(text_encoders): index = i + if not train_t5xxl and index > 0: # 0: CLIP, 1: T5XXL, so we skip T5XXL if train_t5xxl is False + break + logger.info(f"create LoRA for Text Encoder {index+1}:") text_encoder_loras, skipped = create_modules(False, index, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) + logger.info(f"create LoRA for Text Encoder {index+1}: {len(text_encoder_loras)} modules.") self.text_encoder_loras.extend(text_encoder_loras) skipped_te += skipped - logger.info(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.") # create LoRA for U-Net if self.train_blocks == "all": diff --git a/train_network.py b/train_network.py index 4204bce34..a68ccfcc4 100644 --- a/train_network.py +++ b/train_network.py @@ -157,6 +157,9 @@ def sample_images(self, accelerator, args, epoch, global_step, device, vae, toke # region SD/SDXL + def post_process_network(self, args, accelerator, network, text_encoders, unet): + pass + def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: noise_scheduler = DDPMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False @@ -237,6 +240,13 @@ def update_metadata(self, metadata, args): def is_text_encoder_not_needed_for_training(self, args): return False # use for sample images + def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): + # set top parameter requires_grad = True for gradient checkpointing works + text_encoder.text_model.embeddings.requires_grad_(True) + + def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): + text_encoder.text_model.embeddings.to(dtype=weight_dtype) + # endregion def train(self, args): @@ -329,7 +339,7 @@ def train(self, args): train_dataset_group.is_latent_cacheable() ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" - self.assert_extra_args(args, train_dataset_group) + self.assert_extra_args(args, train_dataset_group) # may change some args # acceleratorを準備する logger.info("preparing accelerator") @@ -428,12 +438,15 @@ def train(self, args): ) args.scale_weight_norms = False + self.post_process_network(args, accelerator, network, text_encoders, unet) + + # apply network to unet and text_encoder train_unet = not args.network_train_text_encoder_only train_text_encoder = self.is_train_text_encoder(args) network.apply_to(text_encoder, unet, train_text_encoder, train_unet) if args.network_weights is not None: - # FIXME consider alpha of weights + # FIXME consider alpha of weights: this assumes that the alpha is not changed info = network.load_weights(args.network_weights) accelerator.print(f"load network weights from {args.network_weights}: {info}") @@ -533,7 +546,7 @@ def train(self, args): ), "fp8_base requires mixed precision='fp16' or 'bf16' / fp8を使う場合はmixed_precision='fp16'または'bf16'が必要です。" accelerator.print("enable fp8 training for U-Net.") unet_weight_dtype = torch.float8_e4m3fn - + if not args.fp8_base_unet: accelerator.print("enable fp8 training for Text Encoder.") te_weight_dtype = 
weight_dtype if args.fp8_base_unet else torch.float8_e4m3fn @@ -545,17 +558,16 @@ def train(self, args): unet.requires_grad_(False) unet.to(dtype=unet_weight_dtype) - for t_enc in text_encoders: + for i, t_enc in enumerate(text_encoders): t_enc.requires_grad_(False) # in case of cpu, dtype is already set to fp32 because cpu does not support fp8/fp16/bf16 if t_enc.device.type != "cpu": t_enc.to(dtype=te_weight_dtype) - if hasattr(t_enc, "text_model") and hasattr(t_enc.text_model, "embeddings"): - # nn.Embedding not support FP8 - t_enc.text_model.embeddings.to(dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) - elif hasattr(t_enc, "encoder") and hasattr(t_enc.encoder, "embeddings"): - t_enc.encoder.embeddings.to(dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) + + # nn.Embedding not support FP8 + if te_weight_dtype != weight_dtype: + self.prepare_text_encoder_fp8(i, t_enc, te_weight_dtype, weight_dtype) # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: @@ -596,12 +608,12 @@ def train(self, args): if args.gradient_checkpointing: # according to TI example in Diffusers, train is required unet.train() - for t_enc, frag in zip(text_encoders, self.get_text_encoders_train_flags(args, text_encoders)): + for i, (t_enc, frag) in enumerate(zip(text_encoders, self.get_text_encoders_train_flags(args, text_encoders))): t_enc.train() # set top parameter requires_grad = True for gradient checkpointing works if frag: - t_enc.text_model.embeddings.requires_grad_(True) + self.prepare_text_encoder_grad_ckpt_workaround(i, t_enc) else: unet.eval() @@ -1028,8 +1040,12 @@ def remove_model(old_ckpt_name): # log device and dtype for each model logger.info(f"unet dtype: {unet_weight_dtype}, device: {unet.device}") - for t_enc in text_encoders: - logger.info(f"text_encoder dtype: {t_enc.dtype}, device: {t_enc.device}") + for i, t_enc in enumerate(text_encoders): + params_itr = t_enc.parameters() + params_itr.__next__() # skip the first parameter + params_itr.__next__() # skip the second parameter. because CLIP first two parameters are embeddings + param_3rd = params_itr.__next__() + logger.info(f"text_encoder [{i}] dtype: {param_3rd.dtype}, device: {t_enc.device}") clean_memory_on_device(accelerator.device) @@ -1085,11 +1101,7 @@ def remove_model(old_ckpt_name): text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: text_encoder_conds = text_encoder_outputs_list # List of text encoder outputs - if ( - len(text_encoder_conds) == 0 - or text_encoder_conds[0] is None - or train_text_encoder - ): + if len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder: with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: From b7cff0a7548e5e33f735f06293ba24119fdaa585 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 4 Sep 2024 21:35:47 +0900 Subject: [PATCH 124/582] update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 840655705..c0acfa1d2 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ The command to install PyTorch is as follows: ### Recent Updates Sep 4, 2024: -- T5XXL LoRA is supported in LoRA training. Remove `--network_train_unet_only` and add `train_t5xxl=True` to `--network_args`. CLIP-L is also trained at the same time (T5XXL only cannot be trained). 
The trained model can be used with ComfyUI. +- T5XXL LoRA is supported in LoRA training. Remove `--network_train_unet_only` and add `train_t5xxl=True` to `--network_args`. CLIP-L is also trained at the same time (T5XXL only cannot be trained). The trained model can be used with ComfyUI. See [Key Features for FLUX.1 LoRA training](#key-features-for-flux1-lora-training) for details. - In LoRA training, when `--fp8_base` is specified, you can specify `t5xxl_fp8_e4m3fn.safetensors` as the T5XXL weights. However, it is recommended to use fp16 weights for caching. - Fixed an issue where the training CLIP-L LoRA was not used in sample image generation during LoRA training. From 56cb2fc885d818e9c4493fb2843870d7a141db1c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 4 Sep 2024 23:15:27 +0900 Subject: [PATCH 125/582] support T5XXL LoRA, reduce peak memory usage #1560 --- flux_minimal_inference.py | 73 +++++++++++++++++++++++++++++++-------- networks/lora_flux.py | 2 +- 2 files changed, 59 insertions(+), 16 deletions(-) diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index 56c1b1982..1c194e7c1 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -5,7 +5,7 @@ import math import os import random -from typing import Callable, List, Optional, Tuple +from typing import Callable, List, Optional import einops import numpy as np @@ -13,6 +13,7 @@ from tqdm import tqdm from PIL import Image import accelerate +from transformers import CLIPTextModel from library import device_utils from library.device_utils import init_ipex, get_preferred_device @@ -125,7 +126,7 @@ def do_sample( def generate_image( model, - clip_l, + clip_l: CLIPTextModel, t5xxl, ae, prompt: str, @@ -141,12 +142,13 @@ def generate_image( # make first noise with packed shape # original: b,16,2*h//16,2*w//16, packed: b,h//16*w//16,16*2*2 packed_latent_height, packed_latent_width = math.ceil(image_height / 16), math.ceil(image_width / 16) + noise_dtype = torch.float32 if is_fp8(dtype) else dtype noise = torch.randn( 1, packed_latent_height * packed_latent_width, 16 * 2 * 2, device=device, - dtype=dtype, + dtype=noise_dtype, generator=torch.Generator(device=device).manual_seed(seed), ) @@ -166,9 +168,48 @@ def generate_image( clip_l = clip_l.to(device) t5xxl = t5xxl.to(device) with torch.no_grad(): - if is_fp8(clip_l_dtype) or is_fp8(t5xxl_dtype): - clip_l.to(clip_l_dtype) - t5xxl.to(t5xxl_dtype) + if is_fp8(clip_l_dtype): + param_itr = clip_l.parameters() + param_itr.__next__() # skip first + param_2nd = param_itr.__next__() + if param_2nd.dtype != clip_l_dtype: + logger.info(f"prepare CLIP-L for fp8: set to {clip_l_dtype}, set embeddings to {torch.bfloat16}") + clip_l.to(clip_l_dtype) # fp8 + clip_l.text_model.embeddings.to(dtype=torch.bfloat16) + + with accelerator.autocast(): + l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) + + if is_fp8(t5xxl_dtype): + if flux_utils.get_t5xxl_actual_dtype(t5xxl) != t5xxl_dtype or not hasattr(t5xxl, "fp8_prepared"): + logger.info(f"prepare T5xxl for fp8: set to {t5xxl_dtype}") + + def prepare_fp8(text_encoder, target_dtype): + def forward_hook(module): + def forward(hidden_states): + hidden_gelu = module.act(module.wi_0(hidden_states)) + hidden_linear = module.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = module.dropout(hidden_states) + + hidden_states = module.wo(hidden_states) + return hidden_states + + return forward + + for module in text_encoder.modules(): + if 
module.__class__.__name__ in ["T5LayerNorm", "Embedding"]: + # print("set", module.__class__.__name__, "to", target_dtype) + module.to(target_dtype) + if module.__class__.__name__ in ["T5DenseGatedActDense"]: + # print("set", module.__class__.__name__, "hooks") + module.forward = forward_hook(module) + + text_encoder.fp8_prepared = True + + t5xxl.to(t5xxl_dtype) + prepare_fp8(t5xxl.encoder, torch.bfloat16) + with accelerator.autocast(): _, t5_out, txt_ids, t5_attn_mask = encoding_strategy.encode_tokens( tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask @@ -315,10 +356,10 @@ def is_fp8(dt): t5xxl = flux_utils.load_t5xxl(args.t5xxl, t5xxl_dtype, loading_device) t5xxl.eval() - if is_fp8(clip_l_dtype): - clip_l = accelerator.prepare(clip_l) - if is_fp8(t5xxl_dtype): - t5xxl = accelerator.prepare(t5xxl) + # if is_fp8(clip_l_dtype): + # clip_l = accelerator.prepare(clip_l) + # if is_fp8(t5xxl_dtype): + # t5xxl = accelerator.prepare(t5xxl) t5xxl_max_length = 256 if is_schnell else 512 tokenize_strategy = strategy_flux.FluxTokenizeStrategy(t5xxl_max_length) @@ -329,14 +370,16 @@ def is_fp8(dt): model.eval() logger.info(f"Casting model to {flux_dtype}") model.to(flux_dtype) # make sure model is dtype - if is_fp8(flux_dtype): - model = accelerator.prepare(model) + # if is_fp8(flux_dtype): + # model = accelerator.prepare(model) + # if args.offload: + # model = model.to("cpu") # AE ae = flux_utils.load_ae(name, args.ae, ae_dtype, loading_device) ae.eval() - if is_fp8(ae_dtype): - ae = accelerator.prepare(ae) + # if is_fp8(ae_dtype): + # ae = accelerator.prepare(ae) # LoRA lora_models: List[lora_flux.LoRANetwork] = [] @@ -360,7 +403,7 @@ def is_fp8(dt): lora_model.to(device) lora_models.append(lora_model) - + if not args.interactive: generate_image(model, clip_l, t5xxl, ae, args.prompt, args.seed, args.width, args.height, args.steps, args.guidance) else: diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 295267beb..ab9ccc4d8 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -392,7 +392,7 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh modules_dim[lora_name] = dim # logger.info(lora_name, value.size(), dim) - if train_t5xxl is None: + if train_t5xxl is None or train_t5xxl is False: train_t5xxl = "lora_te3" in lora_name if train_t5xxl is None: From 90ed2dfb526168b2e77b8d367e928d8cc44b4278 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 5 Sep 2024 08:39:29 +0900 Subject: [PATCH 126/582] feat: Add support for merging CLIP-L and T5XXL LoRA models --- README.md | 22 ++++- networks/flux_merge_lora.py | 182 ++++++++++++++++++++++++++++-------- 2 files changed, 163 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index c0acfa1d2..fa81f6c0f 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 5, 2024: +The LoRA merge script now supports CLIP-L and T5XXL LoRA. Please specify `--clip_l` and `--t5xxl`. `--clip_l_save_to` and `--t5xxl_save_to` specify the save destination for CLIP-L and T5XXL. See [Merge LoRA to FLUX.1 checkpoint](#merge-lora-to-flux1-checkpoint) for details. + Sep 4, 2024: - T5XXL LoRA is supported in LoRA training. Remove `--network_train_unet_only` and add `train_t5xxl=True` to `--network_args`. CLIP-L is also trained at the same time (T5XXL only cannot be trained). The trained model can be used with ComfyUI. See [Key Features for FLUX.1 LoRA training](#key-features-for-flux1-lora-training) for details. 
- In LoRA training, when `--fp8_base` is specified, you can specify `t5xxl_fp8_e4m3fn.safetensors` as the T5XXL weights. However, it is recommended to use fp16 weights for caching. @@ -276,7 +279,7 @@ CLIP-L LoRA is not supported. ### Merge LoRA to FLUX.1 checkpoint -`networks/flux_merge_lora.py` merges LoRA to FLUX.1 checkpoint. __The script is experimental.__ +`networks/flux_merge_lora.py` merges LoRA to FLUX.1 checkpoint, CLIP-L or T5XXL models. __The script is experimental.__ ``` python networks/flux_merge_lora.py --flux_model flux1-dev.safetensors --save_to output.safetensors --models lora1.safetensors --ratios 2.0 --save_precision fp16 --loading_device cuda --working_device cpu @@ -284,13 +287,24 @@ python networks/flux_merge_lora.py --flux_model flux1-dev.safetensors --save_to You can also merge multiple LoRA models into a FLUX.1 model. Specify multiple LoRA models in `--models`. Specify the same number of ratios in `--ratios`. -`--loading_device` is the device to load the LoRA models. `--working_device` is the device to merge (calculate) the models. Default is `cpu` for both. Loading / working device examples are below (in the case of `--save_precision fp16` or `--save_precision bf16`): +CLIP-L and T5XXL LoRA are supported. `--clip_l` and `--clip_l_save_to` are for CLIP-L, `--t5xxl` and `--t5xxl_save_to` are for T5XXL. Sample command is below. + +``` +--clip_l clip_l.safetensors --clip_l_save_to merged_clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --t5xxl_save_to merged_t5xxl.safetensors +``` + +FLUX.1, CLIP-L, and T5XXL can be merged together or separately for memory efficiency. + +An experimental option `--mem_eff_load_save` is available. This option is for memory-efficient loading and saving. It may also speed up loading and saving. + +`--loading_device` is the device to load the LoRA models. `--working_device` is the device to merge (calculate) the models. Default is `cpu` for both. Loading / working device examples are below (in the case of `--save_precision fp16` or `--save_precision bf16`, `float32` will consume more memory): - 'cpu' / 'cpu': Uses >50GB of RAM, but works on any machine. - 'cuda' / 'cpu': Uses 24GB of VRAM, but requires 30GB of RAM. -- 'cuda' / 'cuda': Uses 30GB of VRAM, but requires 30GB of RAM, faster than 'cuda' / 'cpu'. +- 'cpu' / 'cuda': Uses 4GB of VRAM, but requires 50GB of RAM, faster than 'cpu' / 'cpu' or 'cuda' / 'cpu'. +- 'cuda' / 'cuda': Uses 30GB of VRAM, but requires 30GB of RAM, faster than 'cpu' / 'cpu' or 'cuda' / 'cpu'. -In the case of LoRA models are trained with `bf16`, we are not sure which is better, `fp16` or `bf16` for `--save_precision`. +`--save_precision` is the precision to save the merged model. In the case of LoRA models are trained with `bf16`, we are not sure which is better, `fp16` or `bf16` for `--save_precision`. The script can merge multiple LoRA models. If you want to merge multiple LoRA models, specify `--concat` option to work the merged LoRA model properly. 
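Putting the new options together, a single invocation can merge a LoRA that contains FLUX.1, CLIP-L and T5XXL modules into all three models at once. The command below is only a sketch: the input and output file names and the ratio are placeholders to adapt to your own files.

```
python networks/flux_merge_lora.py --flux_model flux1-dev.safetensors --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --save_to merged_flux1.safetensors --clip_l_save_to merged_clip_l.safetensors --t5xxl_save_to merged_t5xxl.safetensors --models lora1.safetensors --ratios 1.0 --save_precision fp16 --mem_eff_load_save --loading_device cuda --working_device cpu
```

Any LoRA key that has no matching module in the loaded models is skipped with a warning, so a FLUX.1-only LoRA can still be merged even when `--clip_l` and `--t5xxl` are given.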
diff --git a/networks/flux_merge_lora.py b/networks/flux_merge_lora.py index 2e0d4c297..5e100a3ba 100644 --- a/networks/flux_merge_lora.py +++ b/networks/flux_merge_lora.py @@ -2,6 +2,7 @@ import math import os import time +from typing import Any, Dict, Union import torch from safetensors import safe_open @@ -34,11 +35,11 @@ def load_state_dict(file_name, dtype): return sd, metadata -def save_to_file(file_name, state_dict, dtype, metadata, mem_eff_save=False): +def save_to_file(file_name, state_dict: Dict[str, Union[Any, torch.Tensor]], dtype, metadata, mem_eff_save=False): if dtype is not None: logger.info(f"converting to {dtype}...") for key in tqdm(list(state_dict.keys())): - if type(state_dict[key]) == torch.Tensor: + if type(state_dict[key]) == torch.Tensor and state_dict[key].dtype.is_floating_point: state_dict[key] = state_dict[key].to(dtype) logger.info(f"saving to: {file_name}") @@ -49,26 +50,76 @@ def save_to_file(file_name, state_dict, dtype, metadata, mem_eff_save=False): def merge_to_flux_model( - loading_device, working_device, flux_model, models, ratios, merge_dtype, save_dtype, mem_eff_load_save=False + loading_device, + working_device, + flux_path: str, + clip_l_path: str, + t5xxl_path: str, + models, + ratios, + merge_dtype, + save_dtype, + mem_eff_load_save=False, ): # create module map without loading state_dict - logger.info(f"loading keys from FLUX.1 model: {flux_model}") lora_name_to_module_key = {} - with safe_open(flux_model, framework="pt", device=loading_device) as flux_file: - keys = list(flux_file.keys()) - for key in keys: - if key.endswith(".weight"): - module_name = ".".join(key.split(".")[:-1]) - lora_name = lora_flux.LoRANetwork.LORA_PREFIX_FLUX + "_" + module_name.replace(".", "_") - lora_name_to_module_key[lora_name] = key - + if flux_path is not None: + logger.info(f"loading keys from FLUX.1 model: {flux_path}") + with safe_open(flux_path, framework="pt", device=loading_device) as flux_file: + keys = list(flux_file.keys()) + for key in keys: + if key.endswith(".weight"): + module_name = ".".join(key.split(".")[:-1]) + lora_name = lora_flux.LoRANetwork.LORA_PREFIX_FLUX + "_" + module_name.replace(".", "_") + lora_name_to_module_key[lora_name] = key + + lora_name_to_clip_l_key = {} + if clip_l_path is not None: + logger.info(f"loading keys from clip_l model: {clip_l_path}") + with safe_open(clip_l_path, framework="pt", device=loading_device) as clip_l_file: + keys = list(clip_l_file.keys()) + for key in keys: + if key.endswith(".weight"): + module_name = ".".join(key.split(".")[:-1]) + lora_name = lora_flux.LoRANetwork.LORA_PREFIX_TEXT_ENCODER_CLIP + "_" + module_name.replace(".", "_") + lora_name_to_clip_l_key[lora_name] = key + + lora_name_to_t5xxl_key = {} + if t5xxl_path is not None: + logger.info(f"loading keys from t5xxl model: {t5xxl_path}") + with safe_open(t5xxl_path, framework="pt", device=loading_device) as t5xxl_file: + keys = list(t5xxl_file.keys()) + for key in keys: + if key.endswith(".weight"): + module_name = ".".join(key.split(".")[:-1]) + lora_name = lora_flux.LoRANetwork.LORA_PREFIX_TEXT_ENCODER_T5 + "_" + module_name.replace(".", "_") + lora_name_to_t5xxl_key[lora_name] = key + + flux_state_dict = {} + clip_l_state_dict = {} + t5xxl_state_dict = {} if mem_eff_load_save: - flux_state_dict = {} - with MemoryEfficientSafeOpen(flux_model) as flux_file: - for key in tqdm(flux_file.keys()): - flux_state_dict[key] = flux_file.get_tensor(key).to(loading_device) # dtype is not changed + if flux_path is not None: + with 
MemoryEfficientSafeOpen(flux_path) as flux_file: + for key in tqdm(flux_file.keys()): + flux_state_dict[key] = flux_file.get_tensor(key).to(loading_device) # dtype is not changed + + if clip_l_path is not None: + with MemoryEfficientSafeOpen(clip_l_path) as clip_l_file: + for key in tqdm(clip_l_file.keys()): + clip_l_state_dict[key] = clip_l_file.get_tensor(key).to(loading_device) + + if t5xxl_path is not None: + with MemoryEfficientSafeOpen(t5xxl_path) as t5xxl_file: + for key in tqdm(t5xxl_file.keys()): + t5xxl_state_dict[key] = t5xxl_file.get_tensor(key).to(loading_device) else: - flux_state_dict = load_file(flux_model, device=loading_device) + if flux_path is not None: + flux_state_dict = load_file(flux_path, device=loading_device) + if clip_l_path is not None: + clip_l_state_dict = load_file(clip_l_path, device=loading_device) + if t5xxl_path is not None: + t5xxl_state_dict = load_file(t5xxl_path, device=loading_device) for model, ratio in zip(models, ratios): logger.info(f"loading: {model}") @@ -81,8 +132,20 @@ def merge_to_flux_model( up_key = key.replace("lora_down", "lora_up") alpha_key = key[: key.index("lora_down")] + "alpha" - if lora_name not in lora_name_to_module_key: - logger.warning(f"no module found for LoRA weight: {key}. LoRA for Text Encoder is not supported yet.") + if lora_name in lora_name_to_module_key: + module_weight_key = lora_name_to_module_key[lora_name] + state_dict = flux_state_dict + elif lora_name in lora_name_to_clip_l_key: + module_weight_key = lora_name_to_clip_l_key[lora_name] + state_dict = clip_l_state_dict + elif lora_name in lora_name_to_t5xxl_key: + module_weight_key = lora_name_to_t5xxl_key[lora_name] + state_dict = t5xxl_state_dict + else: + logger.warning( + f"no module found for LoRA weight: {key}. Skipping..." 
+ f"LoRAの重みに対応するモジュールが見つかりませんでした。スキップします。" + ) continue down_weight = lora_sd.pop(key) @@ -93,11 +156,7 @@ def merge_to_flux_model( scale = alpha / dim # W <- W + U * D - module_weight_key = lora_name_to_module_key[lora_name] - if module_weight_key not in flux_state_dict: - weight = flux_file.get_tensor(module_weight_key) - else: - weight = flux_state_dict[module_weight_key] + weight = state_dict[module_weight_key] weight = weight.to(working_device, merge_dtype) up_weight = up_weight.to(working_device, merge_dtype) @@ -121,7 +180,7 @@ def merge_to_flux_model( # logger.info(conved.size(), weight.size(), module.stride, module.padding) weight = weight + ratio * conved * scale - flux_state_dict[module_weight_key] = weight.to(loading_device, save_dtype) + state_dict[module_weight_key] = weight.to(loading_device, save_dtype) del up_weight del down_weight del weight @@ -129,7 +188,7 @@ def merge_to_flux_model( if len(lora_sd) > 0: logger.warning(f"Unused keys in LoRA model: {list(lora_sd.keys())}") - return flux_state_dict + return flux_state_dict, clip_l_state_dict, t5xxl_state_dict def merge_to_flux_model_diffusers( @@ -508,17 +567,28 @@ def merge(args): if save_dtype is None: save_dtype = merge_dtype - dest_dir = os.path.dirname(args.save_to) + assert ( + args.save_to or args.clip_l_save_to or args.t5xxl_save_to + ), "save_to or clip_l_save_to or t5xxl_save_to must be specified / save_toまたはclip_l_save_toまたはt5xxl_save_toを指定してください" + dest_dir = os.path.dirname(args.save_to or args.clip_l_save_to or args.t5xxl_save_to) if not os.path.exists(dest_dir): logger.info(f"creating directory: {dest_dir}") os.makedirs(dest_dir) - if args.flux_model is not None: + if args.flux_model is not None or args.clip_l is not None or args.t5xxl is not None: if not args.diffusers: - state_dict = merge_to_flux_model( + assert (args.clip_l is None and args.clip_l_save_to is None) or ( + args.clip_l is not None and args.clip_l_save_to is not None + ), "clip_l_save_to must be specified if clip_l is specified / clip_lが指定されている場合はclip_l_save_toも指定してください" + assert (args.t5xxl is None and args.t5xxl_save_to is None) or ( + args.t5xxl is not None and args.t5xxl_save_to is not None + ), "t5xxl_save_to must be specified if t5xxl is specified / t5xxlが指定されている場合はt5xxl_save_toも指定してください" + flux_state_dict, clip_l_state_dict, t5xxl_state_dict = merge_to_flux_model( args.loading_device, args.working_device, args.flux_model, + args.clip_l, + args.t5xxl, args.models, args.ratios, merge_dtype, @@ -526,7 +596,10 @@ def merge(args): args.mem_eff_load_save, ) else: - state_dict = merge_to_flux_model_diffusers( + assert ( + args.clip_l is None and args.t5xxl is None + ), "clip_l and t5xxl are not supported with --diffusers / clip_l、t5xxlはDiffusersではサポートされていません" + flux_state_dict = merge_to_flux_model_diffusers( args.loading_device, args.working_device, args.flux_model, @@ -536,8 +609,10 @@ def merge(args): save_dtype, args.mem_eff_load_save, ) + clip_l_state_dict = None + t5xxl_state_dict = None - if args.no_metadata: + if args.no_metadata or (flux_state_dict is None or len(flux_state_dict) == 0): sai_metadata = None else: merged_from = sai_model_spec.build_merged_from([args.flux_model] + args.models) @@ -546,15 +621,24 @@ def merge(args): None, False, False, False, False, False, time.time(), title=title, merged_from=merged_from, flux="dev" ) - logger.info(f"saving FLUX model to: {args.save_to}") - save_to_file(args.save_to, state_dict, save_dtype, sai_metadata, args.mem_eff_load_save) + if flux_state_dict is not None and len(flux_state_dict) 
> 0: + logger.info(f"saving FLUX model to: {args.save_to}") + save_to_file(args.save_to, flux_state_dict, save_dtype, sai_metadata, args.mem_eff_load_save) + + if clip_l_state_dict is not None and len(clip_l_state_dict) > 0: + logger.info(f"saving clip_l model to: {args.clip_l_save_to}") + save_to_file(args.clip_l_save_to, clip_l_state_dict, save_dtype, None, args.mem_eff_load_save) + + if t5xxl_state_dict is not None and len(t5xxl_state_dict) > 0: + logger.info(f"saving t5xxl model to: {args.t5xxl_save_to}") + save_to_file(args.t5xxl_save_to, t5xxl_state_dict, save_dtype, None, args.mem_eff_load_save) else: - state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype, args.concat, args.shuffle) + flux_state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype, args.concat, args.shuffle) logger.info("calculating hashes and creating metadata...") - model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(flux_state_dict, metadata) metadata["sshs_model_hash"] = model_hash metadata["sshs_legacy_hash"] = legacy_hash @@ -562,12 +646,12 @@ def merge(args): merged_from = sai_model_spec.build_merged_from(args.models) title = os.path.splitext(os.path.basename(args.save_to))[0] sai_metadata = sai_model_spec.build_metadata( - state_dict, False, False, False, True, False, time.time(), title=title, merged_from=merged_from, flux="dev" + flux_state_dict, False, False, False, True, False, time.time(), title=title, merged_from=merged_from, flux="dev" ) metadata.update(sai_metadata) logger.info(f"saving model to: {args.save_to}") - save_to_file(args.save_to, state_dict, save_dtype, metadata) + save_to_file(args.save_to, flux_state_dict, save_dtype, metadata) def setup_parser() -> argparse.ArgumentParser: @@ -592,6 +676,18 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="FLUX.1 model to load, merge LoRA models if omitted / 読み込むモデル、指定しない場合はLoRAモデルをマージする", ) + parser.add_argument( + "--clip_l", + type=str, + default=None, + help="path to clip_l (*.sft or *.safetensors), should be float16 / clip_lのパス(*.sftまたは*.safetensors)", + ) + parser.add_argument( + "--t5xxl", + type=str, + default=None, + help="path to t5xxl (*.sft or *.safetensors), should be float16 / t5xxlのパス(*.sftまたは*.safetensors)", + ) parser.add_argument( "--mem_eff_load_save", action="store_true", @@ -617,6 +713,18 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="destination file name: safetensors file / 保存先のファイル名、safetensorsファイル", ) + parser.add_argument( + "--clip_l_save_to", + type=str, + default=None, + help="destination file name for clip_l: safetensors file / clip_lの保存先のファイル名、safetensorsファイル", + ) + parser.add_argument( + "--t5xxl_save_to", + type=str, + default=None, + help="destination file name for t5xxl: safetensors file / t5xxlの保存先のファイル名、safetensorsファイル", + ) parser.add_argument( "--models", type=str, From d9129522a6effea7077f18cdea0ee733a5ac7cb0 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 5 Sep 2024 12:20:07 +0900 Subject: [PATCH 127/582] set dtype before calling ae closes #1562 --- flux_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flux_train.py b/flux_train.py index 32a36f036..0293b7be3 100644 --- a/flux_train.py +++ b/flux_train.py @@ -651,7 +651,7 @@ def optimizer_hook(parameter: torch.Tensor): else: with torch.no_grad(): # encode images to latents. 
images are [-1, 1] - latents = ae.encode(batch["images"]) + latents = ae.encode(batch["images"].to(ae.dtype)).to(accelerator.device, dtype=weight_dtype) # NaNが含まれていれば警告を表示し0に置き換える if torch.any(torch.isnan(latents)): From 2889108d858880589d362e06e98eeadf4682476a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 5 Sep 2024 20:58:33 +0900 Subject: [PATCH 128/582] feat: Add --cpu_offload_checkpointing option to LoRA training --- README.md | 7 +++++++ flux_train.py | 2 +- flux_train_network.py | 5 +++++ train_network.py | 12 +++++++++++- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa81f6c0f..e8a12089f 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,12 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 5, 2024 (update 1): + +Added `--cpu_offload_checkpointing` option to LoRA training script. Offloads gradient checkpointing to CPU. This reduces up to 1GB of VRAM usage but slows down the training by about 15%. Cannot be used with `--split_mode`. + Sep 5, 2024: + The LoRA merge script now supports CLIP-L and T5XXL LoRA. Please specify `--clip_l` and `--t5xxl`. `--clip_l_save_to` and `--t5xxl_save_to` specify the save destination for CLIP-L and T5XXL. See [Merge LoRA to FLUX.1 checkpoint](#merge-lora-to-flux1-checkpoint) for details. Sep 4, 2024: @@ -72,6 +77,8 @@ The training can be done with 12GB VRAM GPUs with Adafactor optimizer, `--split_ --optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --split_mode --network_args "train_blocks=single" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 ``` +`--cpu_offload_checkpointing` offloads gradient checkpointing to CPU. This reduces up to 1GB of VRAM usage but slows down the training by about 15%. Cannot be used with `--split_mode`. + We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. The trained LoRA model can be used with ComfyUI. 
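A sketch of how `--cpu_offload_checkpointing` slots into an existing `flux_train_network.py` LoRA command is shown below; the model paths, dataset config, output names and network settings are placeholders standing in for your current command, and only the last two arguments are the point here. The option takes effect only together with `--gradient_checkpointing`, and `--split_mode` must be removed because the two options cannot be combined.

```
accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py
--pretrained_model_name_or_path flux1-dev.safetensors --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --ae ae.safetensors
--network_module networks.lora_flux --dataset_config dataset_1024_bs1.toml --output_dir path/to/output/dir --output_name output-name
--mixed_precision bf16 --save_precision bf16 --sdpa --gradient_checkpointing --cpu_offload_checkpointing
```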
diff --git a/flux_train.py b/flux_train.py index 0293b7be3..0edc83a9f 100644 --- a/flux_train.py +++ b/flux_train.py @@ -261,7 +261,7 @@ def train(args): ) if args.gradient_checkpointing: - flux.enable_gradient_checkpointing(args.cpu_offload_checkpointing) + flux.enable_gradient_checkpointing(cpu_offload=args.cpu_offload_checkpointing) flux.requires_grad_(True) diff --git a/flux_train_network.py b/flux_train_network.py index 2fc0f3234..a6e57eede 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -50,6 +50,11 @@ def assert_extra_args(self, args, train_dataset_group): if args.max_token_length is not None: logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません") + assert not args.split_mode or not args.cpu_offload_checkpointing, ( + "split_mode and cpu_offload_checkpointing cannot be used together" + " / split_modeとcpu_offload_checkpointingは同時に使用できません" + ) + train_dataset_group.verify_bucket_reso_steps(32) # TODO check this def get_flux_model_name(self, args): diff --git a/train_network.py b/train_network.py index a68ccfcc4..ad97491df 100644 --- a/train_network.py +++ b/train_network.py @@ -451,7 +451,11 @@ def train(self, args): accelerator.print(f"load network weights from {args.network_weights}: {info}") if args.gradient_checkpointing: - unet.enable_gradient_checkpointing() + if args.cpu_offload_checkpointing: + unet.enable_gradient_checkpointing(cpu_offload=True) + else: + unet.enable_gradient_checkpointing() + for t_enc, flag in zip(text_encoders, self.get_text_encoders_train_flags(args, text_encoders)): if flag: if t_enc.supports_gradient_checkpointing: @@ -1281,6 +1285,12 @@ def setup_parser() -> argparse.ArgumentParser: config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) + parser.add_argument( + "--cpu_offload_checkpointing", + action="store_true", + help="[EXPERIMENTAL] enable offloading of tensors to CPU during checkpointing for U-Net or DiT, if supported" + " / 勾配チェックポイント時にテンソルをCPUにオフロードする(U-NetまたはDiTのみ、サポートされている場合)", + ) parser.add_argument( "--no_metadata", action="store_true", help="do not save metadata in output model / メタデータを出力先モデルに保存しない" ) From d29af146b8d4c4d028f8752657bd1349c8cd3509 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 9 Sep 2024 23:01:15 +0900 Subject: [PATCH 129/582] add negative prompt for flux inference script --- README.md | 3 + flux_minimal_inference.py | 289 ++++++++++++++++++++++++++------------ 2 files changed, 206 insertions(+), 86 deletions(-) diff --git a/README.md b/README.md index 2f010f499..126516f95 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 9, 2024: +Added `--negative_prompt` and `--cfg_scale` to `flux_minimal_inference.py`. Negative prompts can be used. + Sep 5, 2024 (update 1): Added `--cpu_offload_checkpointing` option to LoRA training script. Offloads gradient checkpointing to CPU. This reduces up to 1GB of VRAM usage but slows down the training by about 15%. Cannot be used with `--split_mode`. 
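For reference, a sketch of the new inference options in use; the checkpoint paths mirror the inference example earlier in this README, and the negative prompt and scale values are arbitrary. Supplying `--negative_prompt` runs the model on a doubled batch (negative and positive conditioning) at each sampling step, and `--cfg_scale` controls how far the prediction is pushed away from the negative one; at the default of 1.0 the result matches the positive-only prediction.

```
python flux_minimal_inference.py --ckpt flux1-dev.safetensors --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.safetensors --dtype bf16 --prompt "a cat holding a sign that says hello world" --negative_prompt "blurry, low quality" --cfg_scale 2.0 --guidance 3.5 --seed 1 --out path/to/output/dir
```

In interactive mode, the same settings can be changed per prompt with `--n` (negative prompt, `-` for empty) and `--c` (cfg scale).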
diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index 1c194e7c1..de607c52a 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -71,22 +71,57 @@ def denoise( timesteps: list[float], guidance: float = 4.0, t5_attn_mask: Optional[torch.Tensor] = None, + neg_txt: Optional[torch.Tensor] = None, + neg_vec: Optional[torch.Tensor] = None, + neg_t5_attn_mask: Optional[torch.Tensor] = None, + cfg_scale: Optional[float] = None, ): # this is ignored for schnell + logger.info(f"guidance: {guidance}, cfg_scale: {cfg_scale}") guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) + + # prepare classifier free guidance + if neg_txt is not None and neg_vec is not None: + b_img_ids = torch.cat([img_ids, img_ids], dim=0) + b_txt_ids = torch.cat([txt_ids, txt_ids], dim=0) + b_txt = torch.cat([neg_txt, txt], dim=0) + b_vec = torch.cat([neg_vec, vec], dim=0) + if t5_attn_mask is not None and neg_t5_attn_mask is not None: + b_t5_attn_mask = torch.cat([neg_t5_attn_mask, t5_attn_mask], dim=0) + else: + b_t5_attn_mask = None + else: + b_img_ids = img_ids + b_txt_ids = txt_ids + b_txt = txt + b_vec = vec + b_t5_attn_mask = t5_attn_mask + for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): - t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) + t_vec = torch.full((b_img_ids.shape[0],), t_curr, dtype=img.dtype, device=img.device) + + # classifier free guidance + if neg_txt is not None and neg_vec is not None: + b_img = torch.cat([img, img], dim=0) + else: + b_img = img + pred = model( - img=img, - img_ids=img_ids, - txt=txt, - txt_ids=txt_ids, - y=vec, + img=b_img, + img_ids=b_img_ids, + txt=b_txt, + txt_ids=b_txt_ids, + y=b_vec, timesteps=t_vec, guidance=guidance_vec, - txt_attention_mask=t5_attn_mask, + txt_attention_mask=b_t5_attn_mask, ) + # classifier free guidance + if neg_txt is not None and neg_vec is not None: + pred_uncond, pred = torch.chunk(pred, 2, dim=0) + pred = pred_uncond + cfg_scale * (pred - pred_uncond) + img = img + (t_prev - t_curr) * pred return img @@ -106,19 +141,48 @@ def do_sample( is_schnell: bool, device: torch.device, flux_dtype: torch.dtype, + neg_l_pooled: Optional[torch.Tensor] = None, + neg_t5_out: Optional[torch.Tensor] = None, + neg_t5_attn_mask: Optional[torch.Tensor] = None, + cfg_scale: Optional[float] = None, ): + logger.info(f"num_steps: {num_steps}") timesteps = get_schedule(num_steps, img.shape[1], shift=not is_schnell) # denoise initial noise if accelerator: with accelerator.autocast(), torch.no_grad(): x = denoise( - model, img, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=guidance, t5_attn_mask=t5_attn_mask + model, + img, + img_ids, + t5_out, + txt_ids, + l_pooled, + timesteps, + guidance, + t5_attn_mask, + neg_t5_out, + neg_l_pooled, + neg_t5_attn_mask, + cfg_scale, ) else: with torch.autocast(device_type=device.type, dtype=flux_dtype), torch.no_grad(): x = denoise( - model, img, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=guidance, t5_attn_mask=t5_attn_mask + model, + img, + img_ids, + t5_out, + txt_ids, + l_pooled, + timesteps, + guidance, + t5_attn_mask, + neg_t5_out, + neg_l_pooled, + neg_t5_attn_mask, + cfg_scale, ) return x @@ -135,6 +199,8 @@ def generate_image( image_height: int, steps: Optional[int], guidance: float, + negative_prompt: Optional[str], + cfg_scale: float, ): seed = seed if seed is not None else random.randint(0, 2**32 - 1) logger.info(f"Seed: {seed}") @@ -162,65 +228,73 @@ def generate_image( 
# txt2img only needs img_ids img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width) + # prepare fp8 models + if is_fp8(clip_l_dtype) and (not hasattr(clip_l, "fp8_prepared") or not clip_l.fp8_prepared): + logger.info(f"prepare CLIP-L for fp8: set to {clip_l_dtype}, set embeddings to {torch.bfloat16}") + clip_l.to(clip_l_dtype) # fp8 + clip_l.text_model.embeddings.to(dtype=torch.bfloat16) + clip_l.fp8_prepared = True + + if is_fp8(t5xxl_dtype) and (not hasattr(t5xxl, "fp8_prepared") or not t5xxl.fp8_prepared): + logger.info(f"prepare T5xxl for fp8: set to {t5xxl_dtype}") + + def prepare_fp8(text_encoder, target_dtype): + def forward_hook(module): + def forward(hidden_states): + hidden_gelu = module.act(module.wi_0(hidden_states)) + hidden_linear = module.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = module.dropout(hidden_states) + + hidden_states = module.wo(hidden_states) + return hidden_states + + return forward + + for module in text_encoder.modules(): + if module.__class__.__name__ in ["T5LayerNorm", "Embedding"]: + # print("set", module.__class__.__name__, "to", target_dtype) + module.to(target_dtype) + if module.__class__.__name__ in ["T5DenseGatedActDense"]: + # print("set", module.__class__.__name__, "hooks") + module.forward = forward_hook(module) + + t5xxl.to(t5xxl_dtype) + prepare_fp8(t5xxl.encoder, torch.bfloat16) + t5xxl.fp8_prepared = True + # prepare embeddings logger.info("Encoding prompts...") - tokens_and_masks = tokenize_strategy.tokenize(prompt) clip_l = clip_l.to(device) t5xxl = t5xxl.to(device) - with torch.no_grad(): - if is_fp8(clip_l_dtype): - param_itr = clip_l.parameters() - param_itr.__next__() # skip first - param_2nd = param_itr.__next__() - if param_2nd.dtype != clip_l_dtype: - logger.info(f"prepare CLIP-L for fp8: set to {clip_l_dtype}, set embeddings to {torch.bfloat16}") - clip_l.to(clip_l_dtype) # fp8 - clip_l.text_model.embeddings.to(dtype=torch.bfloat16) - - with accelerator.autocast(): - l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) - if is_fp8(t5xxl_dtype): - if flux_utils.get_t5xxl_actual_dtype(t5xxl) != t5xxl_dtype or not hasattr(t5xxl, "fp8_prepared"): - logger.info(f"prepare T5xxl for fp8: set to {t5xxl_dtype}") - - def prepare_fp8(text_encoder, target_dtype): - def forward_hook(module): - def forward(hidden_states): - hidden_gelu = module.act(module.wi_0(hidden_states)) - hidden_linear = module.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = module.dropout(hidden_states) - - hidden_states = module.wo(hidden_states) - return hidden_states - - return forward - - for module in text_encoder.modules(): - if module.__class__.__name__ in ["T5LayerNorm", "Embedding"]: - # print("set", module.__class__.__name__, "to", target_dtype) - module.to(target_dtype) - if module.__class__.__name__ in ["T5DenseGatedActDense"]: - # print("set", module.__class__.__name__, "hooks") - module.forward = forward_hook(module) - - text_encoder.fp8_prepared = True - - t5xxl.to(t5xxl_dtype) - prepare_fp8(t5xxl.encoder, torch.bfloat16) - - with accelerator.autocast(): - _, t5_out, txt_ids, t5_attn_mask = encoding_strategy.encode_tokens( - tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask - ) - else: - with torch.autocast(device_type=device.type, dtype=clip_l_dtype): - l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) - with 
torch.autocast(device_type=device.type, dtype=t5xxl_dtype): - _, t5_out, txt_ids, t5_attn_mask = encoding_strategy.encode_tokens( - tokenize_strategy, [None, t5xxl], tokens_and_masks, args.apply_t5_attn_mask - ) + def encode(prpt: str): + tokens_and_masks = tokenize_strategy.tokenize(prpt) + with torch.no_grad(): + if is_fp8(clip_l_dtype): + with accelerator.autocast(): + l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) + else: + with torch.autocast(device_type=device.type, dtype=clip_l_dtype): + l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) + + if is_fp8(t5xxl_dtype): + with accelerator.autocast(): + _, t5_out, txt_ids, t5_attn_mask = encoding_strategy.encode_tokens( + tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask + ) + else: + with torch.autocast(device_type=device.type, dtype=t5xxl_dtype): + _, t5_out, txt_ids, t5_attn_mask = encoding_strategy.encode_tokens( + tokenize_strategy, [None, t5xxl], tokens_and_masks, args.apply_t5_attn_mask + ) + return l_pooled, t5_out, txt_ids, t5_attn_mask + + l_pooled, t5_out, txt_ids, t5_attn_mask = encode(prompt) + if negative_prompt: + neg_l_pooled, neg_t5_out, _, neg_t5_attn_mask = encode(negative_prompt) + else: + neg_l_pooled, neg_t5_out, neg_t5_attn_mask = None, None, None # NaN check if torch.isnan(l_pooled).any(): @@ -244,7 +318,23 @@ def forward(hidden_states): t5_attn_mask = t5_attn_mask.to(device) if args.apply_t5_attn_mask else None x = do_sample( - accelerator, model, noise, img_ids, l_pooled, t5_out, txt_ids, steps, guidance, t5_attn_mask, is_schnell, device, flux_dtype + accelerator, + model, + noise, + img_ids, + l_pooled, + t5_out, + txt_ids, + steps, + guidance, + t5_attn_mask, + is_schnell, + device, + flux_dtype, + neg_l_pooled, + neg_t5_out, + neg_t5_attn_mask, + cfg_scale, ) if args.offload: model = model.cpu() @@ -307,6 +397,8 @@ def forward(hidden_states): parser.add_argument("--seed", type=int, default=None) parser.add_argument("--steps", type=int, default=None, help="Number of steps. Default is 4 for schnell, 50 for dev") parser.add_argument("--guidance", type=float, default=3.5) + parser.add_argument("--negative_prompt", type=str, default=None) + parser.add_argument("--cfg_scale", type=float, default=1.0) parser.add_argument("--offload", action="store_true", help="Offload to CPU") parser.add_argument( "--lora_weights", @@ -403,19 +495,34 @@ def is_fp8(dt): lora_model.to(device) lora_models.append(lora_model) - + if not args.interactive: - generate_image(model, clip_l, t5xxl, ae, args.prompt, args.seed, args.width, args.height, args.steps, args.guidance) + generate_image( + model, + clip_l, + t5xxl, + ae, + args.prompt, + args.seed, + args.width, + args.height, + args.steps, + args.guidance, + args.negative_prompt, + args.cfg_scale, + ) else: # loop for interactive width = target_width height = target_height steps = None guidance = args.guidance + cfg_scale = args.cfg_scale while True: print( "Enter prompt (empty to exit). 
Options: --w --h --s --d --g --m " + " --n , `-` for empty negative prompt --c " ) prompt = input() if prompt == "": @@ -425,26 +532,36 @@ def is_fp8(dt): options = prompt.split("--") prompt = options[0].strip() seed = None + negative_prompt = None for opt in options[1:]: - opt = opt.strip() - if opt.startswith("w"): - width = int(opt[1:].strip()) - elif opt.startswith("h"): - height = int(opt[1:].strip()) - elif opt.startswith("s"): - steps = int(opt[1:].strip()) - elif opt.startswith("d"): - seed = int(opt[1:].strip()) - elif opt.startswith("g"): - guidance = float(opt[1:].strip()) - elif opt.startswith("m"): - mutipliers = opt[1:].strip().split(",") - if len(mutipliers) != len(lora_models): - logger.error(f"Invalid number of multipliers, expected {len(lora_models)}") - continue - for i, lora_model in enumerate(lora_models): - lora_model.set_multiplier(float(mutipliers[i])) - - generate_image(model, clip_l, t5xxl, ae, prompt, seed, width, height, steps, guidance) + try: + opt = opt.strip() + if opt.startswith("w"): + width = int(opt[1:].strip()) + elif opt.startswith("h"): + height = int(opt[1:].strip()) + elif opt.startswith("s"): + steps = int(opt[1:].strip()) + elif opt.startswith("d"): + seed = int(opt[1:].strip()) + elif opt.startswith("g"): + guidance = float(opt[1:].strip()) + elif opt.startswith("m"): + mutipliers = opt[1:].strip().split(",") + if len(mutipliers) != len(lora_models): + logger.error(f"Invalid number of multipliers, expected {len(lora_models)}") + continue + for i, lora_model in enumerate(lora_models): + lora_model.set_multiplier(float(mutipliers[i])) + elif opt.startswith("n"): + negative_prompt = opt[1:].strip() + if negative_prompt == "-": + negative_prompt = "" + elif opt.startswith("c"): + cfg_scale = float(opt[1:].strip()) + except ValueError as e: + logger.error(f"Invalid option: {opt}, {e}") + + generate_image(model, clip_l, t5xxl, ae, prompt, seed, width, height, steps, guidance, negative_prompt, cfg_scale) logger.info("Done!") From d10ff62a78b15d0bb55f443cc2849c460300131b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 10 Sep 2024 20:32:09 +0900 Subject: [PATCH 130/582] support individual LR for CLIP-L/T5XXL --- README.md | 4 +++ networks/lora_flux.py | 71 +++++++++++++++---------------------------- train_network.py | 32 ++++++++++++------- 3 files changed, 49 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 126516f95..b5799dd6f 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 10, 2024: +In FLUX.1 LoRA training, individual learning rates can be specified for CLIP-L and T5XXL. By specifying multiple numbers in `--text_encoder_lr`, you can set the learning rates for CLIP-L and T5XXL separately. Specify like `--text_encoder_lr 1e-4 1e-5`. If you specify only one, the learning rates for CLIP-L and T5XXL will be the same. + Sep 9, 2024: Added `--negative_prompt` and `--cfg_scale` to `flux_minimal_inference.py`. Negative prompts can be used. @@ -142,6 +145,7 @@ The effect of `--timestep_sampling sigmoid` and `--sigmoid_scale` (when `--times - Remove `--network_train_unet_only` from your command. - Add `train_t5xxl=True` to `--network_args` to train T5XXL LoRA. CLIP-L is also trained at the same time. - T5XXL output can be cached for CLIP-L LoRA training. So, `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. + - The learning rates for CLIP-L and T5XXL can be specified separately. 
Multiple numbers can be specified in `--text_encoder_lr`. For example, `--text_encoder_lr 1e-4 1e-5`. If you specify only one, the learning rates for CLIP-L and T5XXL will be the same. If `--text_encoder_lr` is not specified, the default learning rate `--learning_rate` is used for both CLIP-L and T5XXL. - The trained LoRA can be used with ComfyUI. - Note: `flux_extract_lora.py`, `convert_flux_lora.py`and `merge_flux_lora.py` do not support CLIP-L and T5XXL LoRA yet. diff --git a/networks/lora_flux.py b/networks/lora_flux.py index ab9ccc4d8..d540c2215 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -786,28 +786,23 @@ def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, lorap logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}") logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}") - # 二つのText Encoderに別々の学習率を設定できるようにするといいかも - def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): - # TODO warn if optimizer is not compatible with LoRA+ (but it will cause error so we don't need to check it here?) - # if ( - # self.loraplus_lr_ratio is not None - # or self.loraplus_text_encoder_lr_ratio is not None - # or self.loraplus_unet_lr_ratio is not None - # ): - # assert ( - # optimizer_type.lower() != "prodigy" and "dadapt" not in optimizer_type.lower() - # ), "LoRA+ and Prodigy/DAdaptation is not supported / LoRA+とProdigy/DAdaptationの組み合わせはサポートされていません" + def prepare_optimizer_params_with_multiple_te_lrs(self, text_encoder_lr, unet_lr, default_lr): + # make sure text_encoder_lr as list of two elements + if text_encoder_lr is None or len(text_encoder_lr) == 0: + text_encoder_lr = [default_lr, default_lr] + elif len(text_encoder_lr) == 1: + text_encoder_lr = [text_encoder_lr[0], text_encoder_lr[0]] self.requires_grad_(True) all_params = [] lr_descriptions = [] - def assemble_params(loras, lr, ratio): + def assemble_params(loras, lr, loraplus_ratio): param_groups = {"lora": {}, "plus": {}} for lora in loras: for name, param in lora.named_parameters(): - if ratio is not None and "lora_up" in name: + if loraplus_ratio is not None and "lora_up" in name: param_groups["plus"][f"{lora.lora_name}.{name}"] = param else: param_groups["lora"][f"{lora.lora_name}.{name}"] = param @@ -822,7 +817,7 @@ def assemble_params(loras, lr, ratio): if lr is not None: if key == "plus": - param_data["lr"] = lr * ratio + param_data["lr"] = lr * loraplus_ratio else: param_data["lr"] = lr @@ -836,41 +831,23 @@ def assemble_params(loras, lr, ratio): return params, descriptions if self.text_encoder_loras: - params, descriptions = assemble_params( - self.text_encoder_loras, - text_encoder_lr if text_encoder_lr is not None else default_lr, - self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio, - ) - all_params.extend(params) - lr_descriptions.extend(["textencoder" + (" " + d if d else "") for d in descriptions]) + loraplus_lr_ratio = self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio + + # split text encoder loras for te1 and te3 + te1_loras = [lora for lora in self.text_encoder_loras if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER_CLIP)] + te3_loras = [lora for lora in self.text_encoder_loras if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER_T5)] + if len(te1_loras) > 0: + logger.info(f"Text Encoder 1 (CLIP-L): {len(te1_loras)} modules, LR {text_encoder_lr[0]}") + params, descriptions = assemble_params(te1_loras, text_encoder_lr[0], 
loraplus_lr_ratio) + all_params.extend(params) + lr_descriptions.extend(["textencoder 1 " + (" " + d if d else "") for d in descriptions]) + if len(te3_loras) > 0: + logger.info(f"Text Encoder 2 (T5XXL): {len(te3_loras)} modules, LR {text_encoder_lr[1]}") + params, descriptions = assemble_params(te3_loras, text_encoder_lr[1], loraplus_lr_ratio) + all_params.extend(params) + lr_descriptions.extend(["textencoder 2 " + (" " + d if d else "") for d in descriptions]) if self.unet_loras: - # if self.block_lr: - # is_sdxl = False - # for lora in self.unet_loras: - # if "input_blocks" in lora.lora_name or "output_blocks" in lora.lora_name: - # is_sdxl = True - # break - - # # 学習率のグラフをblockごとにしたいので、blockごとにloraを分類 - # block_idx_to_lora = {} - # for lora in self.unet_loras: - # idx = get_block_index(lora.lora_name, is_sdxl) - # if idx not in block_idx_to_lora: - # block_idx_to_lora[idx] = [] - # block_idx_to_lora[idx].append(lora) - - # # blockごとにパラメータを設定する - # for idx, block_loras in block_idx_to_lora.items(): - # params, descriptions = assemble_params( - # block_loras, - # (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(idx), - # self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio, - # ) - # all_params.extend(params) - # lr_descriptions.extend([f"unet_block{idx}" + (" " + d if d else "") for d in descriptions]) - - # else: params, descriptions = assemble_params( self.unet_loras, unet_lr if unet_lr is not None else default_lr, diff --git a/train_network.py b/train_network.py index ad97491df..e45db0525 100644 --- a/train_network.py +++ b/train_network.py @@ -466,9 +466,17 @@ def train(self, args): # 学習に必要なクラスを準備する accelerator.print("prepare optimizer, data loader etc.") - # 後方互換性を確保するよ + # make backward compatibility for text_encoder_lr + support_multiple_lrs = hasattr(network, "prepare_optimizer_params_with_multiple_te_lrs") + if support_multiple_lrs: + text_encoder_lr = args.text_encoder_lr + else: + text_encoder_lr = None if args.text_encoder_lr is None or len(args.text_encoder_lr) == 0 else args.text_encoder_lr[0] try: - results = network.prepare_optimizer_params(args.text_encoder_lr, args.unet_lr, args.learning_rate) + if support_multiple_lrs: + results = network.prepare_optimizer_params_with_multiple_te_lrs(text_encoder_lr, args.unet_lr, args.learning_rate) + else: + results = network.prepare_optimizer_params(text_encoder_lr, args.unet_lr, args.learning_rate) if type(results) is tuple: trainable_params = results[0] lr_descriptions = results[1] @@ -476,11 +484,7 @@ def train(self, args): trainable_params = results lr_descriptions = None except TypeError as e: - # logger.warning(f"{e}") - # accelerator.print( - # "Deprecated: use prepare_optimizer_params(text_encoder_lr, unet_lr, learning_rate) instead of prepare_optimizer_params(text_encoder_lr, unet_lr)" - # ) - trainable_params = network.prepare_optimizer_params(args.text_encoder_lr, args.unet_lr) + trainable_params = network.prepare_optimizer_params(text_encoder_lr, args.unet_lr) lr_descriptions = None # if len(trainable_params) == 0: @@ -713,7 +717,7 @@ def load_model_hook(models, input_dir): "ss_training_started_at": training_started_at, # unix timestamp "ss_output_name": args.output_name, "ss_learning_rate": args.learning_rate, - "ss_text_encoder_lr": args.text_encoder_lr, + "ss_text_encoder_lr": text_encoder_lr, "ss_unet_lr": args.unet_lr, "ss_num_train_images": train_dataset_group.num_train_images, "ss_num_reg_images": train_dataset_group.num_reg_images, @@ -760,8 +764,8 @@ def load_model_hook(models, input_dir): 
"ss_loss_type": args.loss_type, "ss_huber_schedule": args.huber_schedule, "ss_huber_c": args.huber_c, - "ss_fp8_base": args.fp8_base, - "ss_fp8_base_unet": args.fp8_base_unet, + "ss_fp8_base": bool(args.fp8_base), + "ss_fp8_base_unet": bool(args.fp8_base_unet), } self.update_metadata(metadata, args) # architecture specific metadata @@ -1303,7 +1307,13 @@ def setup_parser() -> argparse.ArgumentParser: ) parser.add_argument("--unet_lr", type=float, default=None, help="learning rate for U-Net / U-Netの学習率") - parser.add_argument("--text_encoder_lr", type=float, default=None, help="learning rate for Text Encoder / Text Encoderの学習率") + parser.add_argument( + "--text_encoder_lr", + type=float, + default=None, + nargs="*", + help="learning rate for Text Encoder, can be multiple / Text Encoderの学習率、複数指定可能", + ) parser.add_argument( "--fp8_base_unet", action="store_true", From 65b8a064f6bb9a403374d4b08f4003037df42f8d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 10 Sep 2024 21:20:38 +0900 Subject: [PATCH 131/582] update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b5799dd6f..caea59b7e 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ The command to install PyTorch is as follows: ### Recent Updates Sep 10, 2024: -In FLUX.1 LoRA training, individual learning rates can be specified for CLIP-L and T5XXL. By specifying multiple numbers in `--text_encoder_lr`, you can set the learning rates for CLIP-L and T5XXL separately. Specify like `--text_encoder_lr 1e-4 1e-5`. If you specify only one, the learning rates for CLIP-L and T5XXL will be the same. +In FLUX.1 LoRA training, individual learning rates can be specified for CLIP-L and T5XXL. By specifying multiple numbers in `--text_encoder_lr`, you can set the learning rates for CLIP-L and T5XXL separately. Specify like `--text_encoder_lr 1e-4 1e-5`. The first value is the learning rate for CLIP-L, and the second value is for T5XXL. If you specify only one, the learning rates for CLIP-L and T5XXL will be the same. Sep 9, 2024: Added `--negative_prompt` and `--cfg_scale` to `flux_minimal_inference.py`. Negative prompts can be used. @@ -145,7 +145,7 @@ The effect of `--timestep_sampling sigmoid` and `--sigmoid_scale` (when `--times - Remove `--network_train_unet_only` from your command. - Add `train_t5xxl=True` to `--network_args` to train T5XXL LoRA. CLIP-L is also trained at the same time. - T5XXL output can be cached for CLIP-L LoRA training. So, `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. - - The learning rates for CLIP-L and T5XXL can be specified separately. Multiple numbers can be specified in `--text_encoder_lr`. For example, `--text_encoder_lr 1e-4 1e-5`. If you specify only one, the learning rates for CLIP-L and T5XXL will be the same. If `--text_encoder_lr` is not specified, the default learning rate `--learning_rate` is used for both CLIP-L and T5XXL. + - The learning rates for CLIP-L and T5XXL can be specified separately. Multiple numbers can be specified in `--text_encoder_lr`. For example, `--text_encoder_lr 1e-4 1e-5`. The first value is the learning rate for CLIP-L, and the second value is for T5XXL. If you specify only one, the learning rates for CLIP-L and T5XXL will be the same. If `--text_encoder_lr` is not specified, the default learning rate `--learning_rate` is used for both CLIP-L and T5XXL. - The trained LoRA can be used with ComfyUI. 
- Note: `flux_extract_lora.py`, `convert_flux_lora.py`and `merge_flux_lora.py` do not support CLIP-L and T5XXL LoRA yet. From 8311e88225fef377591e5be19eb1f50fe7a2941f Mon Sep 17 00:00:00 2001 From: cocktailpeanut Date: Wed, 11 Sep 2024 09:02:29 -0400 Subject: [PATCH 132/582] typo fix --- library/train_util.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index c38864fe6..f682dcbfb 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3355,15 +3355,14 @@ def int_or_float(value): type=int, default=None, help="Inverse sqrt timescale for inverse sqrt scheduler,defaults to `num_warmup_steps`" - " / 逆平方根スケジューラのタイムスケール、デフォルトは`num_warmup_steps`", - , + + " / 逆平方根スケジューラのタイムスケール、デフォルトは`num_warmup_steps`", ) parser.add_argument( "--lr_scheduler_min_lr_ratio", type=float, default=None, help="The minimum learning rate as a ratio of the initial learning rate for cosine with min lr scheduler and warmup decay scheduler" - " / 初期学習率の比率としての最小学習率を指定する、cosine with min lr と warmup decay スケジューラ で有効", + + " / 初期学習率の比率としての最小学習率を指定する、cosine with min lr と warmup decay スケジューラ で有効", ) From a823fd9fb8d219b5b4c57df12eed41ae34fdf843 Mon Sep 17 00:00:00 2001 From: Plat <60182057+p1atdev@users.noreply.github.com> Date: Wed, 11 Sep 2024 22:21:16 +0900 Subject: [PATCH 133/582] Improve wandb logging (#1576) * fix: wrong training steps were recorded to wandb, and no log was sent when logging_dir was not specified * fix: checking of whether wandb is enabled * feat: log images to wandb with their positive prompt as captions * feat: logging sample images' caption for sd3 and flux * fix: import wandb before use --- fine_tune.py | 7 +++++-- flux_train.py | 7 +++++-- library/flux_train_utils.py | 20 +++++++++++--------- library/sd3_train_utils.py | 20 +++++++++++--------- library/train_util.py | 20 +++++++++++--------- sd3_train.py | 7 +++++-- sdxl_train.py | 7 +++++-- sdxl_train_control_net_lllite.py | 4 ++-- sdxl_train_control_net_lllite_old.py | 4 ++-- train_controlnet.py | 7 +++++-- train_db.py | 7 +++++-- train_network.py | 7 +++++-- train_textual_inversion.py | 8 ++++++-- train_textual_inversion_XTI.py | 4 ++-- 14 files changed, 80 insertions(+), 49 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index c9102f6c0..fb6b3ed69 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -337,6 +337,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): train_util.sample_images( accelerator, args, 0, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet ) + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) loss_recorder = train_util.LossRecorder() for epoch in range(num_train_epochs): @@ -456,7 +459,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): ) current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss": current_loss} train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True) accelerator.log(logs, step=global_step) @@ -469,7 +472,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) diff --git a/flux_train.py b/flux_train.py index 0edc83a9f..33481df8f 100644 --- a/flux_train.py +++ b/flux_train.py @@ -629,6 
+629,9 @@ def optimizer_hook(parameter: torch.Tensor): # For --sample_at_first flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs) + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) loss_recorder = train_util.LossRecorder() epoch = 0 # avoid error when max_train_steps is 0 @@ -777,7 +780,7 @@ def optimizer_hook(parameter: torch.Tensor): ) current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss": current_loss} train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True) @@ -791,7 +794,7 @@ def optimizer_hook(parameter: torch.Tensor): if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 0b5d4d90e..f77d4b585 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -254,17 +254,19 @@ def sample_image_inference( img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png" image.save(os.path.join(save_dir, img_filename)) - # wandb有効時のみログを送信 - try: + # send images to wandb if enabled + if "wandb" in [tracker.name for tracker in accelerator.trackers]: wandb_tracker = accelerator.get_tracker("wandb") - try: - import wandb - except ImportError: # 事前に一度確認するのでここはエラー出ないはず - raise ImportError("No wandb / wandb がインストールされていないようです") - wandb_tracker.log({f"sample_{i}": wandb.Image(image)}) - except: # wandb 無効時 - pass + import wandb + # not to commit images to avoid inconsistency between training and logging steps + wandb_tracker.log( + {f"sample_{i}": wandb.Image( + image, + caption=prompt # positive prompt as a caption + )}, + commit=False + ) def time_shift(mu: float, sigma: float, t: torch.Tensor): diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index da0729506..e819d440c 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -604,17 +604,19 @@ def sample_image_inference( img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png" image.save(os.path.join(save_dir, img_filename)) - # wandb有効時のみログを送信 - try: + # send images to wandb if enabled + if "wandb" in [tracker.name for tracker in accelerator.trackers]: wandb_tracker = accelerator.get_tracker("wandb") - try: - import wandb - except ImportError: # 事前に一度確認するのでここはエラー出ないはず - raise ImportError("No wandb / wandb がインストールされていないようです") - wandb_tracker.log({f"sample_{i}": wandb.Image(image)}) - except: # wandb 無効時 - pass + import wandb + # not to commit images to avoid inconsistency between training and logging steps + wandb_tracker.log( + {f"sample_{i}": wandb.Image( + image, + caption=prompt # positive prompt as a caption + )}, + commit=False + ) # region Diffusers diff --git a/library/train_util.py b/library/train_util.py index f682dcbfb..742d057e0 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5832,17 +5832,19 @@ def sample_image_inference( img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png" image.save(os.path.join(save_dir, img_filename)) - # wandb有効時のみログを送信 - try: + # send images to wandb if 
enabled + if "wandb" in [tracker.name for tracker in accelerator.trackers]: wandb_tracker = accelerator.get_tracker("wandb") - try: - import wandb - except ImportError: # 事前に一度確認するのでここはエラー出ないはず - raise ImportError("No wandb / wandb がインストールされていないようです") - wandb_tracker.log({f"sample_{i}": wandb.Image(image)}) - except: # wandb 無効時 - pass + import wandb + # not to commit images to avoid inconsistency between training and logging steps + wandb_tracker.log( + {f"sample_{i}": wandb.Image( + image, + caption=prompt # positive prompt as a caption + )}, + commit=False + ) # endregion diff --git a/sd3_train.py b/sd3_train.py index 87011b215..5120105f2 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -682,6 +682,9 @@ def optimizer_hook(parameter: torch.Tensor): # For --sample_at_first sd3_train_utils.sample_images(accelerator, args, 0, global_step, mmdit, vae, [clip_l, clip_g, t5xxl], sample_prompts_te_outputs) + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) # following function will be moved to sd3_train_utils @@ -901,7 +904,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): ) current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss": current_loss} train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=train_mmdit) @@ -915,7 +918,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) diff --git a/sdxl_train.py b/sdxl_train.py index b2c62dd11..7291ddd2f 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -617,6 +617,9 @@ def optimizer_hook(parameter: torch.Tensor): sdxl_train_util.sample_images( accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, [text_encoder1, text_encoder2], unet ) + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) loss_recorder = train_util.LossRecorder() for epoch in range(num_train_epochs): @@ -797,7 +800,7 @@ def optimizer_hook(parameter: torch.Tensor): ) current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss": current_loss} if block_lrs is None: train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=train_unet) @@ -814,7 +817,7 @@ def optimizer_hook(parameter: torch.Tensor): if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 0eaec29b8..9d1cfc63e 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -541,14 +541,14 @@ def remove_model(old_ckpt_name): logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = generate_step_logs(args, current_loss, avr_loss, lr_scheduler) accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = 
{"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py index 292a0463a..6fa1d6096 100644 --- a/sdxl_train_control_net_lllite_old.py +++ b/sdxl_train_control_net_lllite_old.py @@ -480,14 +480,14 @@ def remove_model(old_ckpt_name): logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = generate_step_logs(args, current_loss, avr_loss, lr_scheduler) accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) diff --git a/train_controlnet.py b/train_controlnet.py index c9ac6c5a8..57f0d263f 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -409,6 +409,9 @@ def remove_model(old_ckpt_name): train_util.sample_images( accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet, controlnet=controlnet ) + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) # training loop for epoch in range(num_train_epochs): @@ -542,14 +545,14 @@ def remove_model(old_ckpt_name): logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = generate_step_logs(args, current_loss, avr_loss, lr_scheduler) accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) diff --git a/train_db.py b/train_db.py index 7caee6647..d42afd89a 100644 --- a/train_db.py +++ b/train_db.py @@ -315,6 +315,9 @@ def train(args): train_util.sample_images( accelerator, args, 0, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet ) + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) loss_recorder = train_util.LossRecorder() for epoch in range(num_train_epochs): @@ -445,7 +448,7 @@ def train(args): ) current_loss = loss.detach().item() - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss": current_loss} train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True) accelerator.log(logs, step=global_step) @@ -458,7 +461,7 @@ def train(args): if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) diff --git a/train_network.py b/train_network.py index e45db0525..34385ae08 100644 --- a/train_network.py +++ b/train_network.py @@ -1038,6 +1038,9 @@ def remove_model(old_ckpt_name): # For --sample_at_first self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) # training loop if initial_step > 0: # only if skip_until_initial_step is specified @@ -1224,7 +1227,7 @@ def remove_model(old_ckpt_name): if args.scale_weight_norms: 
progress_bar.set_postfix(**{**max_mean_logs, **logs}) - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = self.generate_step_logs( args, current_loss, avr_loss, lr_scheduler, lr_descriptions, keys_scaled, mean_norm, maximum_norm ) @@ -1233,7 +1236,7 @@ def remove_model(old_ckpt_name): if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 9044f50df..956c78603 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -550,6 +550,9 @@ def remove_model(old_ckpt_name): unet, prompt_replacement, ) + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) # training loop for epoch in range(num_train_epochs): @@ -684,7 +687,7 @@ def remove_model(old_ckpt_name): remove_model(remove_ckpt_name) current_loss = loss.detach().item() - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss": current_loss, "lr": float(lr_scheduler.get_last_lr()[0])} if ( args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower() @@ -702,7 +705,7 @@ def remove_model(old_ckpt_name): if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_total / len(train_dataloader)} accelerator.log(logs, step=epoch + 1) @@ -739,6 +742,7 @@ def remove_model(old_ckpt_name): unet, prompt_replacement, ) + accelerator.log({}) # end of epoch diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index efb59137b..ca0b603fb 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -538,7 +538,7 @@ def remove_model(old_ckpt_name): remove_model(remove_ckpt_name) current_loss = loss.detach().item() - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss": current_loss, "lr": float(lr_scheduler.get_last_lr()[0])} if ( args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower() @@ -556,7 +556,7 @@ def remove_model(old_ckpt_name): if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_total / len(train_dataloader)} accelerator.log(logs, step=epoch + 1) From 237317fffd060bcfb078b770ccd2df18bc4dd3a6 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 11 Sep 2024 22:23:43 +0900 Subject: [PATCH 134/582] update README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 2b3d0d5a8..d3481b6ae 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 11, 2024: +Logging to wandb is improved. See PR [#1576](https://github.com/kohya-ss/sd-scripts/pull/1576) for details. Thanks to p1atdev! + Sep 10, 2024: In FLUX.1 LoRA training, individual learning rates can be specified for CLIP-L and T5XXL. By specifying multiple numbers in `--text_encoder_lr`, you can set the learning rates for CLIP-L and T5XXL separately. Specify like `--text_encoder_lr 1e-4 1e-5`. The first value is the learning rate for CLIP-L, and the second value is for T5XXL. If you specify only one, the learning rates for CLIP-L and T5XXL will be the same. 
From cefe52629e1901dd8192b0487afd5e9f089e3519 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 12 Sep 2024 12:36:07 +0900 Subject: [PATCH 135/582] fix to work old notation for TE LR in .toml --- networks/lora_flux.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index d540c2215..dd267de0f 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -788,8 +788,11 @@ def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, lorap def prepare_optimizer_params_with_multiple_te_lrs(self, text_encoder_lr, unet_lr, default_lr): # make sure text_encoder_lr as list of two elements - if text_encoder_lr is None or len(text_encoder_lr) == 0: + # if float, use the same value for both text encoders + if text_encoder_lr is None or (isinstance(text_encoder_lr, list) and len(text_encoder_lr) == 0): text_encoder_lr = [default_lr, default_lr] + elif isinstance(text_encoder_lr, float): + text_encoder_lr = [text_encoder_lr, text_encoder_lr] elif len(text_encoder_lr) == 1: text_encoder_lr = [text_encoder_lr[0], text_encoder_lr[0]] From 2d8ee3c28007393386528cfeec0a9b714dafd85b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 14 Sep 2024 15:48:16 +0900 Subject: [PATCH 136/582] OFT for FLUX.1 --- flux_minimal_inference.py | 20 +- networks/lora_flux.py | 6 +- networks/oft.py | 2 +- networks/oft_flux.py | 482 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 504 insertions(+), 6 deletions(-) create mode 100644 networks/oft_flux.py diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index de607c52a..2f1b9a377 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -14,9 +14,11 @@ from PIL import Image import accelerate from transformers import CLIPTextModel +from safetensors.torch import load_file from library import device_utils from library.device_utils import init_ipex, get_preferred_device +from networks import oft_flux init_ipex() @@ -405,7 +407,7 @@ def encode(prpt: str): type=str, nargs="*", default=[], - help="LoRA weights, only supports networks.lora_flux, each argument is a `path;multiplier` (semi-colon separated)", + help="LoRA weights, only supports networks.lora_flux and lora_oft, each argument is a `path;multiplier` (semi-colon separated)", ) parser.add_argument("--merge_lora_weights", action="store_true", help="Merge LoRA weights to model") parser.add_argument("--width", type=int, default=target_width) @@ -482,9 +484,19 @@ def is_fp8(dt): else: multiplier = 1.0 - lora_model, weights_sd = lora_flux.create_network_from_weights( - multiplier, weights_file, ae, [clip_l, t5xxl], model, None, True - ) + weights_sd = load_file(weights_file) + is_lora = is_oft = False + for key in weights_sd.keys(): + if key.startswith("lora"): + is_lora = True + if key.startswith("oft"): + is_oft = True + if is_lora or is_oft: + break + + module = lora_flux if is_lora else oft_flux + lora_model, _ = module.create_network_from_weights(multiplier, None, ae, [clip_l, t5xxl], model, weights_sd, True) + if args.merge_lora_weights: lora_model.merge_to([clip_l, t5xxl], model, weights_sd) else: diff --git a/networks/lora_flux.py b/networks/lora_flux.py index dd267de0f..ea7df8b4d 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -41,7 +41,11 @@ def __init__( module_dropout=None, split_dims: Optional[List[int]] = None, ): - """if alpha == 0 or None, alpha is rank (no scaling).""" + """ + if alpha == 0 or None, alpha is rank (no scaling). 
+ + split_dims is used to mimic the split qkv of FLUX as same as Diffusers + """ super().__init__() self.lora_name = lora_name diff --git a/networks/oft.py b/networks/oft.py index 6321def3b..0c3a5393f 100644 --- a/networks/oft.py +++ b/networks/oft.py @@ -51,7 +51,7 @@ def __init__( alpha = alpha.detach().numpy() # constraint in original paper is alpha * out_dim * out_dim, but we use alpha * out_dim for backward compatibility - # original alpha is 1e-6, so we use 1e-3 or 1e-4 for alpha + # original alpha is 1e-5, so we use 1e-2 or 1e-4 for alpha self.constraint = alpha * out_dim self.register_buffer("alpha", torch.tensor(alpha)) diff --git a/networks/oft_flux.py b/networks/oft_flux.py new file mode 100644 index 000000000..27b8b637a --- /dev/null +++ b/networks/oft_flux.py @@ -0,0 +1,482 @@ +# OFT network module + +import math +import os +from typing import Dict, List, Optional, Tuple, Type, Union +from diffusers import AutoencoderKL +import einops +from transformers import CLIPTextModel +import numpy as np +import torch +import torch.nn.functional as F +import re +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +class OFTModule(torch.nn.Module): + """ + replaces forward method of the original Linear, instead of replacing the original Linear module. + """ + + def __init__( + self, + oft_name, + org_module: torch.nn.Module, + multiplier=1.0, + dim=4, + alpha=1, + split_dims: Optional[List[int]] = None, + ): + """ + dim -> num blocks + alpha -> constraint + + split_dims is used to mimic the split qkv of FLUX as same as Diffusers + """ + super().__init__() + self.oft_name = oft_name + self.num_blocks = dim + + if type(alpha) == torch.Tensor: + alpha = alpha.detach().numpy() + self.register_buffer("alpha", torch.tensor(alpha)) + + # No conv2d in FLUX + # if "Linear" in org_module.__class__.__name__: + self.out_dim = org_module.out_features + # elif "Conv" in org_module.__class__.__name__: + # out_dim = org_module.out_channels + + if split_dims is None: + split_dims = [self.out_dim] + else: + assert sum(split_dims) == self.out_dim, "sum of split_dims must be equal to out_dim" + self.split_dims = split_dims + + # assert all dim is divisible by num_blocks + for split_dim in self.split_dims: + assert split_dim % self.num_blocks == 0, "split_dim must be divisible by num_blocks" + + self.constraint = [alpha * split_dim for split_dim in self.split_dims] + self.block_size = [split_dim // self.num_blocks for split_dim in self.split_dims] + self.oft_blocks = torch.nn.ParameterList( + [torch.nn.Parameter(torch.zeros(self.num_blocks, block_size, block_size)) for block_size in self.block_size] + ) + self.I = [torch.eye(block_size).unsqueeze(0).repeat(self.num_blocks, 1, 1) for block_size in self.block_size] + + self.shape = org_module.weight.shape + self.multiplier = multiplier + self.org_module = [org_module] # moduleにならないようにlistに入れる + + def apply_to(self): + self.org_forward = self.org_module[0].forward + self.org_module[0].forward = self.forward + + def get_weight(self, multiplier=None): + if multiplier is None: + multiplier = self.multiplier + + if self.I[0].device != self.oft_blocks[0].device: + self.I = [I.to(self.oft_blocks[0].device) for I in self.I] + + block_R_weighted_list = [] + for i in range(len(self.oft_blocks)): + block_Q = self.oft_blocks[i] - self.oft_blocks[i].transpose(1, 2) + norm_Q = torch.norm(block_Q.flatten()) + new_norm_Q = torch.clamp(norm_Q, max=self.constraint[i]) + block_Q = block_Q * ((new_norm_Q + 1e-8) / 
(norm_Q + 1e-8)) + + I = self.I[i] + block_R = torch.matmul(I + block_Q, (I - block_Q).float().inverse()) + block_R_weighted = self.multiplier * (block_R - I) + I + + block_R_weighted_list.append(block_R_weighted) + + return block_R_weighted_list + + def forward(self, x, scale=None): + if self.multiplier == 0.0: + return self.org_forward(x) + + org_module = self.org_module[0] + org_dtype = x.dtype + + R = self.get_weight() + W = org_module.weight.to(torch.float32) + B = org_module.bias.to(torch.float32) + + # split W to match R + results = [] + d2 = 0 + for i in range(len(R)): + d1 = d2 + d2 += self.split_dims[i] + + W1 = W[d1:d2] + W_reshaped = einops.rearrange(W1, "(k n) m -> k n m", k=self.num_blocks, n=self.block_size[i]) + RW_1 = torch.einsum("k n m, k n p -> k m p", R[i], W_reshaped) + RW_1 = einops.rearrange(RW_1, "k m p -> (k m) p") + + B1 = B[d1:d2] + result = F.linear(x, RW_1.to(org_dtype), B1.to(org_dtype)) + results.append(result) + + result = torch.cat(results, dim=-1) + return result + + +class OFTInfModule(OFTModule): + def __init__( + self, + oft_name, + org_module: torch.nn.Module, + multiplier=1.0, + dim=4, + alpha=1, + split_dims: Optional[List[int]] = None, + **kwargs, + ): + # no dropout for inference + super().__init__(oft_name, org_module, multiplier, dim, alpha, split_dims) + self.enabled = True + self.network: OFTNetwork = None + + def set_network(self, network): + self.network = network + + def forward(self, x, scale=None): + if not self.enabled: + return self.org_forward(x) + return super().forward(x, scale) + + def merge_to(self, multiplier=None): + # get org weight + org_sd = self.org_module[0].state_dict() + W = org_sd["weight"].to(torch.float32) + R = self.get_weight(multiplier).to(torch.float32) + + d2 = 0 + W_list = [] + for i in range(len(self.oft_blocks)): + d1 = d2 + d2 += self.split_dims[i] + + W1 = W[d1:d2] + W_reshaped = einops.rearrange(W1, "(k n) m -> k n m", k=self.num_blocks, n=self.block_size[i]) + W1 = torch.einsum("k n m, k n p -> k m p", R[i], W_reshaped) + W1 = einops.rearrange(W1, "k m p -> (k m) p") + + W_list.append(W1) + + W = torch.cat(W_list, dim=-1) + + # convert back to original dtype + W = W.to(org_sd["weight"].dtype) + + # set weight to org_module + org_sd["weight"] = W + self.org_module[0].load_state_dict(org_sd) + + +def create_network( + multiplier: float, + network_dim: Optional[int], + network_alpha: Optional[float], + vae: AutoencoderKL, + text_encoder: Union[CLIPTextModel, List[CLIPTextModel]], + unet, + neuron_dropout: Optional[float] = None, + **kwargs, +): + if network_dim is None: + network_dim = 4 # default + if network_alpha is None: # should be set + logger.info( + "network_alpha is not set, use default value 1e-3 / network_alphaが設定されていないのでデフォルト値 1e-3 を使用します" + ) + network_alpha = 1e-3 + elif network_alpha >= 1: + logger.warning( + "network_alpha is too large (>=1, maybe default value is too large), please consider to set smaller value like 1e-3" + " / network_alphaが大きすぎるようです(>=1, デフォルト値が大きすぎる可能性があります)。1e-3のような小さな値を推奨" + ) + + # attn only or all linear (FFN) layers + enable_all_linear = kwargs.get("enable_all_linear", None) + # enable_conv = kwargs.get("enable_conv", None) + if enable_all_linear is not None: + enable_all_linear = bool(enable_all_linear) + # if enable_conv is not None: + # enable_conv = bool(enable_conv) + + network = OFTNetwork( + text_encoder, + unet, + multiplier=multiplier, + dim=network_dim, + alpha=network_alpha, + enable_all_linear=enable_all_linear, + varbose=True, + ) + return network + + +# 
Create network from weights for inference, weights are not loaded here (because can be merged) +def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weights_sd=None, for_inference=False, **kwargs): + if weights_sd is None: + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file, safe_open + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + # check dim, alpha and if weights have for conv2d + dim = None + alpha = None + all_linear = None + for name, param in weights_sd.items(): + if name.endswith(".alpha"): + if alpha is None: + alpha = param.item() + elif "qkv" in name: + continue # ignore qkv + else: + if dim is None: + dim = param.size()[0] + if all_linear is None and "_mlp" in name: + all_linear = True + if dim is not None and alpha is not None and all_linear is not None: + break + if all_linear is None: + all_linear = False + + module_class = OFTInfModule if for_inference else OFTModule + network = OFTNetwork( + text_encoder, + unet, + multiplier=multiplier, + dim=dim, + alpha=alpha, + enable_all_linear=all_linear, + module_class=module_class, + ) + return network, weights_sd + + +class OFTNetwork(torch.nn.Module): + FLUX_TARGET_REPLACE_MODULE_ALL_LINEAR = ["DoubleStreamBlock", "SingleStreamBlock"] + FLUX_TARGET_REPLACE_MODULE_ATTN_ONLY = ["SelfAttention"] + OFT_PREFIX_UNET = "oft_unet" + + def __init__( + self, + text_encoder: Union[List[CLIPTextModel], CLIPTextModel], + unet, + multiplier: float = 1.0, + dim: int = 4, + alpha: float = 1, + enable_all_linear: Optional[bool] = False, + module_class: Union[Type[OFTModule], Type[OFTInfModule]] = OFTModule, + varbose: Optional[bool] = False, + ) -> None: + super().__init__() + self.train_t5xxl = False # make compatible with LoRA + self.multiplier = multiplier + + self.dim = dim + self.alpha = alpha + + logger.info( + f"create OFT network. num blocks: {self.dim}, constraint: {self.alpha}, multiplier: {self.multiplier}, enable_all_linear: {enable_all_linear}" + ) + + # create module instances + def create_modules( + root_module: torch.nn.Module, + target_replace_modules: List[torch.nn.Module], + ) -> List[OFTModule]: + prefix = self.OFT_PREFIX_UNET + ofts = [] + for name, module in root_module.named_modules(): + if module.__class__.__name__ in target_replace_modules: + for child_name, child_module in module.named_modules(): + is_linear = "Linear" in child_module.__class__.__name__ + + if is_linear: + oft_name = prefix + "." + name + "." 
+ child_name + oft_name = oft_name.replace(".", "_") + # logger.info(oft_name) + + if "double" in oft_name and "qkv" in oft_name: + split_dims = [3072] * 3 + elif "single" in oft_name and "linear1" in oft_name: + split_dims = [3072] * 3 + [12288] + else: + split_dims = None + + oft = module_class(oft_name, child_module, self.multiplier, dim, alpha, split_dims) + ofts.append(oft) + return ofts + + # extend U-Net target modules if conv2d 3x3 is enabled, or load from weights + if enable_all_linear: + target_modules = OFTNetwork.FLUX_TARGET_REPLACE_MODULE_ALL_LINEAR + else: + target_modules = OFTNetwork.FLUX_TARGET_REPLACE_MODULE_ATTN_ONLY + + self.unet_ofts: List[OFTModule] = create_modules(unet, target_modules) + logger.info(f"create OFT for Flux: {len(self.unet_ofts)} modules.") + + # assertion + names = set() + for oft in self.unet_ofts: + assert oft.oft_name not in names, f"duplicated oft name: {oft.oft_name}" + names.add(oft.oft_name) + + def set_multiplier(self, multiplier): + self.multiplier = multiplier + for oft in self.unet_ofts: + oft.multiplier = self.multiplier + + def load_weights(self, file): + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + info = self.load_state_dict(weights_sd, False) + return info + + def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True): + assert apply_unet, "apply_unet must be True" + + for oft in self.unet_ofts: + oft.apply_to() + self.add_module(oft.oft_name, oft) + + # マージできるかどうかを返す + def is_mergeable(self): + return True + + # TODO refactor to common function with apply_to + def merge_to(self, text_encoder, unet, weights_sd, dtype, device): + logger.info("enable OFT for U-Net") + + for oft in self.unet_ofts: + sd_for_lora = {} + for key in weights_sd.keys(): + if key.startswith(oft.oft_name): + sd_for_lora[key[len(oft.oft_name) + 1 :]] = weights_sd[key] + oft.load_state_dict(sd_for_lora, False) + oft.merge_to() + + logger.info(f"weights are merged") + + # 二つのText Encoderに別々の学習率を設定できるようにするといいかも + def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): + self.requires_grad_(True) + all_params = [] + + def enumerate_params(ofts): + params = [] + for oft in ofts: + params.extend(oft.parameters()) + + # logger.info num of params + num_params = 0 + for p in params: + num_params += p.numel() + logger.info(f"OFT params: {num_params}") + return params + + param_data = {"params": enumerate_params(self.unet_ofts)} + if unet_lr is not None: + param_data["lr"] = unet_lr + all_params.append(param_data) + + return all_params + + def enable_gradient_checkpointing(self): + # not supported + pass + + def prepare_grad_etc(self, text_encoder, unet): + self.requires_grad_(True) + + def on_epoch_start(self, text_encoder, unet): + self.train() + + def get_trainable_params(self): + return self.parameters() + + def save_weights(self, file, dtype, metadata): + if metadata is not None and len(metadata) == 0: + metadata = None + + state_dict = self.state_dict() + + if dtype is not None: + for key in list(state_dict.keys()): + v = state_dict[key] + v = v.detach().clone().to("cpu").to(dtype) + state_dict[key] = v + + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import save_file + from library import train_util + + # Precalculate model hashes to save time on indexing + if metadata is None: + metadata = {} + model_hash, legacy_hash = 
train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + save_file(state_dict, file, metadata) + else: + torch.save(state_dict, file) + + def backup_weights(self): + # 重みのバックアップを行う + ofts: List[OFTInfModule] = self.unet_ofts + for oft in ofts: + org_module = oft.org_module[0] + if not hasattr(org_module, "_lora_org_weight"): + sd = org_module.state_dict() + org_module._lora_org_weight = sd["weight"].detach().clone() + org_module._lora_restored = True + + def restore_weights(self): + # 重みのリストアを行う + ofts: List[OFTInfModule] = self.unet_ofts + for oft in ofts: + org_module = oft.org_module[0] + if not org_module._lora_restored: + sd = org_module.state_dict() + sd["weight"] = org_module._lora_org_weight + org_module.load_state_dict(sd) + org_module._lora_restored = True + + def pre_calculation(self): + # 事前計算を行う + ofts: List[OFTInfModule] = self.unet_ofts + for oft in ofts: + org_module = oft.org_module[0] + oft.merge_to() + # sd = org_module.state_dict() + # org_weight = sd["weight"] + # lora_weight = oft.get_weight().to(org_weight.device, dtype=org_weight.dtype) + # sd["weight"] = org_weight + lora_weight + # assert sd["weight"].shape == org_weight.shape + # org_module.load_state_dict(sd) + + org_module._lora_restored = False + oft.enabled = False From c9ff4de90597e933b441502d45c175fe46b99714 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 14 Sep 2024 22:17:52 +0900 Subject: [PATCH 137/582] Add support for specifying rank for each layer in FLUX.1 --- README.md | 61 ++++++++++++++++++++++++ networks/lora_flux.py | 107 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 161 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 6e32fa31d..9a9794796 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 14, 2024: +- You can now specify the rank for each layer in FLUX.1. See [Specify rank for each layer in FLUX.1](#specify-rank-for-each-layer-in-flux1) for details. +- OFT is now supported with FLUX.1. See [FLUX.1 OFT training](#flux1-oft-training) for details. + Sep 11, 2024: Logging to wandb is improved. See PR [#1576](https://github.com/kohya-ss/sd-scripts/pull/1576) for details. Thanks to p1atdev! @@ -46,6 +50,7 @@ Please update `safetensors` to `0.4.4` to fix the error when using `--resume`. ` - [Key Options for FLUX.1 LoRA training](#key-options-for-flux1-lora-training) - [Inference for FLUX.1 LoRA model](#inference-for-flux1-lora-model) - [Key Features for FLUX.1 LoRA training](#key-features-for-flux1-lora-training) +- [FLUX.1 OFT training](#flux1-oft-training) - [FLUX.1 fine-tuning](#flux1-fine-tuning) - [Key Features for FLUX.1 fine-tuning](#key-features-for-flux1-fine-tuning) - [Extract LoRA from FLUX.1 Models](#extract-lora-from-flux1-models) @@ -191,6 +196,62 @@ In the implementation of Black Forest Labs' model, the projection layers of q/k/ The compatibility of the saved model (state dict) is ensured by concatenating the weights of multiple LoRAs. However, since there are zero weights in some parts, the model size will be large. +#### Specify rank for each layer in FLUX.1 + +You can specify the rank for each layer in FLUX.1 by specifying the following network_args. If you specify `0`, LoRA will not be applied to that layer. + +When network_args is not specified, the default value (`network_dim`) is applied, same as before. 
+ +|network_args|target layer| +|---|---| +|img_attn_dim|img_attn in DoubleStreamBlock| +|txt_attn_dim|txt_attn in DoubleStreamBlock| +|img_mlp_dim|img_mlp in DoubleStreamBlock| +|txt_mlp_dim|txt_mlp in DoubleStreamBlock| +|img_mod_dim|img_mod in DoubleStreamBlock| +|txt_mod_dim|txt_mod in DoubleStreamBlock| +|single_dim|linear1 and linear2 in SingleStreamBlock| +|single_mod_dim|modulation in SingleStreamBlock| + +example: +``` +--network_args "img_attn_dim=4" "img_mlp_dim=8" "txt_attn_dim=2" "txt_mlp_dim=2" +"img_mod_dim=2" "txt_mod_dim=2" "single_dim=4" "single_mod_dim=2" +``` + +You can apply LoRA to the conditioning layers of Flux by specifying `in_dims` in network_args. When specifying, be sure to specify 5 numbers in `[]` as a comma-separated list. + +example: +``` +--network_args "in_dims=[4,2,2,2,4]" +``` + +Each number corresponds to `img_in`, `time_in`, `vector_in`, `guidance_in`, `txt_in`. The above example applies LoRA to all conditioning layers, with rank 4 for `img_in`, 2 for `time_in`, `vector_in`, `guidance_in`, and 4 for `txt_in`. + +If you specify `0`, LoRA will not be applied to that layer. For example, `[4,0,0,0,4]` applies LoRA only to `img_in` and `txt_in`. + +### FLUX.1 OFT training + +You can train OFT with almost the same options as LoRA, such as `--timestamp_sampling`. The following points are different. + +- Change `--network_module` from `networks.lora_flux` to `networks.oft_flux`. +- `--network_dim` is the number of OFT blocks. Unlike LoRA rank, the smaller the dim, the larger the model. We recommend about 64 or 128. Please make the output dimension of the target layer of OFT divisible by the value of `--network_dim` (an error will occur if it is not divisible). Valid values are 64, 128, 256, 512, 1024, etc. +- `--network_alpha` is treated as a constraint for OFT. We recommend about 1e-2 to 1e-4. The default value when omitted is 1, which is too large, so be sure to specify it. +- CLIP/T5XXL is not supported. Specify `--network_train_unet_only`. +- `--network_args` specifies the hyperparameters of OFT. The following are valid: + - Specify `enable_all_linear=True` to target all linear connections in the MLP layer. The default is False, which targets only attention. + +Currently, there is no environment to infer FLUX.1 OFT. Inference is only possible with `flux_minimal_inference.py` (specify OFT model with `--lora`). + +Sample command is below. It will work with 24GB VRAM GPUs with the batch size of 1. + +``` +--network_module networks.oft_flux --network_dim 128 --network_alpha 1e-3 +--network_args "enable_all_linear=True" --learning_rate 1e-5 +``` + +The training can be done with 16GB VRAM GPUs without `--enable_all_linear` option and with Adafactor optimizer. + ### Inference for FLUX.1 with LoRA model The inference script is also available. The script is `flux_minimal_inference.py`. See `--help` for options. diff --git a/networks/lora_flux.py b/networks/lora_flux.py index ea7df8b4d..a34cde1a8 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -316,6 +316,44 @@ def create_network( else: conv_alpha = float(conv_alpha) + # attn dim, mlp dim: only for DoubleStreamBlock. 
SingleStreamBlock is not supported because of combined qkv + img_attn_dim = kwargs.get("img_attn_dim", None) + txt_attn_dim = kwargs.get("txt_attn_dim", None) + img_mlp_dim = kwargs.get("img_mlp_dim", None) + txt_mlp_dim = kwargs.get("txt_mlp_dim", None) + img_mod_dim = kwargs.get("img_mod_dim", None) + txt_mod_dim = kwargs.get("txt_mod_dim", None) + single_dim = kwargs.get("single_dim", None) # SingleStreamBlock + single_mod_dim = kwargs.get("single_mod_dim", None) # SingleStreamBlock + if img_attn_dim is not None: + img_attn_dim = int(img_attn_dim) + if txt_attn_dim is not None: + txt_attn_dim = int(txt_attn_dim) + if img_mlp_dim is not None: + img_mlp_dim = int(img_mlp_dim) + if txt_mlp_dim is not None: + txt_mlp_dim = int(txt_mlp_dim) + if img_mod_dim is not None: + img_mod_dim = int(img_mod_dim) + if txt_mod_dim is not None: + txt_mod_dim = int(txt_mod_dim) + if single_dim is not None: + single_dim = int(single_dim) + if single_mod_dim is not None: + single_mod_dim = int(single_mod_dim) + type_dims = [img_attn_dim, txt_attn_dim, img_mlp_dim, txt_mlp_dim, img_mod_dim, txt_mod_dim, single_dim, single_mod_dim] + if all([d is None for d in type_dims]): + type_dims = None + + # in_dims [img, time, vector, guidance, txt] + in_dims = kwargs.get("in_dims", None) + if in_dims is not None: + in_dims = in_dims.strip() + if in_dims.startswith("[") and in_dims.endswith("]"): + in_dims = in_dims[1:-1] + in_dims = [int(d) for d in in_dims.split(",")] # is it better to use ast.literal_eval? + assert len(in_dims) == 5, f"invalid in_dims: {in_dims}, must be 5 dimensions (img, time, vector, guidance, txt)" + # rank/module dropout rank_dropout = kwargs.get("rank_dropout", None) if rank_dropout is not None: @@ -339,6 +377,11 @@ def create_network( if train_t5xxl is not None: train_t5xxl = True if train_t5xxl == "True" else False + # verbose + verbose = kwargs.get("verbose", False) + if verbose is not None: + verbose = True if verbose == "True" else False + # すごく引数が多いな ( ^ω^)・・・ network = LoRANetwork( text_encoders, @@ -354,7 +397,9 @@ def create_network( train_blocks=train_blocks, split_qkv=split_qkv, train_t5xxl=train_t5xxl, - varbose=True, + type_dims=type_dims, + in_dims=in_dims, + verbose=verbose, ) loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) @@ -462,7 +507,9 @@ def __init__( train_blocks: Optional[str] = None, split_qkv: bool = False, train_t5xxl: bool = False, - varbose: Optional[bool] = False, + type_dims: Optional[List[int]] = None, + in_dims: Optional[List[int]] = None, + verbose: Optional[bool] = False, ) -> None: super().__init__() self.multiplier = multiplier @@ -478,12 +525,17 @@ def __init__( self.split_qkv = split_qkv self.train_t5xxl = train_t5xxl + self.type_dims = type_dims + self.in_dims = in_dims + self.loraplus_lr_ratio = None self.loraplus_unet_lr_ratio = None self.loraplus_text_encoder_lr_ratio = None if modules_dim is not None: logger.info(f"create LoRA network from weights") + self.in_dims = [0] * 5 # create in_dims + # verbose = True else: logger.info(f"create LoRA network. 
base dim (rank): {lora_dim}, alpha: {alpha}") logger.info( @@ -502,7 +554,12 @@ def __init__( # create module instances def create_modules( - is_flux: bool, text_encoder_idx: Optional[int], root_module: torch.nn.Module, target_replace_modules: List[str] + is_flux: bool, + text_encoder_idx: Optional[int], + root_module: torch.nn.Module, + target_replace_modules: List[str], + filter: Optional[str] = None, + default_dim: Optional[int] = None, ) -> List[LoRAModule]: prefix = ( self.LORA_PREFIX_FLUX @@ -513,16 +570,22 @@ def create_modules( loras = [] skipped = [] for name, module in root_module.named_modules(): - if module.__class__.__name__ in target_replace_modules: + if target_replace_modules is None or module.__class__.__name__ in target_replace_modules: + if target_replace_modules is None: # dirty hack for all modules + module = root_module # search all modules + for child_name, child_module in module.named_modules(): is_linear = child_module.__class__.__name__ == "Linear" is_conv2d = child_module.__class__.__name__ == "Conv2d" is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) if is_linear or is_conv2d: - lora_name = prefix + "." + name + "." + child_name + lora_name = prefix + "." + (name + "." if name else "") + child_name lora_name = lora_name.replace(".", "_") + if filter is not None and not filter in lora_name: + continue + dim = None alpha = None @@ -534,8 +597,25 @@ def create_modules( else: # 通常、すべて対象とする if is_linear or is_conv2d_1x1: - dim = self.lora_dim + dim = default_dim if default_dim is not None else self.lora_dim alpha = self.alpha + + if type_dims is not None: + identifier = [ + ("img_attn",), + ("txt_attn",), + ("img_mlp",), + ("txt_mlp",), + ("img_mod",), + ("txt_mod",), + ("single_blocks", "linear"), + ("modulation",), + ] + for i, d in enumerate(type_dims): + if d is not None and all([id in lora_name for id in identifier[i]]): + dim = d + break + elif self.conv_lora_dim is not None: dim = self.conv_lora_dim alpha = self.conv_alpha @@ -566,6 +646,9 @@ def create_modules( split_dims=split_dims, ) loras.append(lora) + + if target_replace_modules is None: + break # all modules are searched return loras, skipped # create LoRA for text encoder @@ -594,10 +677,20 @@ def create_modules( self.unet_loras: List[Union[LoRAModule, LoRAInfModule]] self.unet_loras, skipped_un = create_modules(True, None, unet, target_replace_modules) + + # img, time, vector, guidance, txt + if self.in_dims: + for filter, in_dim in zip(["_img_in", "_time_in", "_vector_in", "_guidance_in", "_txt_in"], self.in_dims): + loras, _ = create_modules(True, None, unet, None, filter=filter, default_dim=in_dim) + self.unet_loras.extend(loras) + logger.info(f"create LoRA for FLUX {self.train_blocks} blocks: {len(self.unet_loras)} modules.") + if verbose: + for lora in self.unet_loras: + logger.info(f"\t{lora.lora_name:50} {lora.lora_dim}, {lora.alpha}") skipped = skipped_te + skipped_un - if varbose and len(skipped) > 0: + if verbose and len(skipped) > 0: logger.warning( f"because dim (rank) is 0, {len(skipped)} LoRA modules are skipped / dim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:" ) From 6445bb2bc974cec51256ae38c1be0900e90e6f87 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 14 Sep 2024 22:37:26 +0900 Subject: [PATCH 138/582] update README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9a9794796..c94ea3598 100644 --- a/README.md +++ b/README.md @@ -213,10 +213,12 @@ When network_args is not specified, the default value 
(`network_dim`) is applied |single_dim|linear1 and linear2 in SingleStreamBlock| |single_mod_dim|modulation in SingleStreamBlock| +`"verbose=True"` is also available for debugging. It shows the rank of each layer. + example: ``` --network_args "img_attn_dim=4" "img_mlp_dim=8" "txt_attn_dim=2" "txt_mlp_dim=2" -"img_mod_dim=2" "txt_mod_dim=2" "single_dim=4" "single_mod_dim=2" +"img_mod_dim=2" "txt_mod_dim=2" "single_dim=4" "single_mod_dim=2" "verbose=True" ``` You can apply LoRA to the conditioning layers of Flux by specifying `in_dims` in network_args. When specifying, be sure to specify 5 numbers in `[]` as a comma-separated list. From 9f44ef133083c530874c6cf022a4de8fda3edae2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 15 Sep 2024 13:52:23 +0900 Subject: [PATCH 139/582] add diffusers to FLUX.1 conversion script --- README.md | 19 ++- tools/convert_diffusers_to_flux.py | 223 +++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 tools/convert_diffusers_to_flux.py diff --git a/README.md b/README.md index c94ea3598..7d6c336e6 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,12 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 15, 2024: + +Added a script `convert_diffusers_to_flux.py` to convert Diffusers format FLUX.1 models (checkpoints) to BFL format. See `--help` for usage. Only Flux models are supported. AE/CLIP/T5XXL are not supported. + +The implementation is based on 2kpr's code. Thanks to 2kpr! + Sep 14, 2024: - You can now specify the rank for each layer in FLUX.1. See [Specify rank for each layer in FLUX.1](#specify-rank-for-each-layer-in-flux1) for details. - OFT is now supported with FLUX.1. See [FLUX.1 OFT training](#flux1-oft-training) for details. @@ -57,6 +63,7 @@ Please update `safetensors` to `0.4.4` to fix the error when using `--resume`. ` - [Convert FLUX LoRA](#convert-flux-lora) - [Merge LoRA to FLUX.1 checkpoint](#merge-lora-to-flux1-checkpoint) - [FLUX.1 Multi-resolution training](#flux1-multi-resolution-training) +- [Convert Diffusers to FLUX.1](#convert-diffusers-to-flux1) ### FLUX.1 LoRA training @@ -355,7 +362,7 @@ If you use LoRA in the inference environment, converting it to AI-toolkit format Note that re-conversion will increase the size of LoRA. -CLIP-L LoRA is not supported. +CLIP-L/T5XXL LoRA is not supported. ### Merge LoRA to FLUX.1 checkpoint @@ -435,6 +442,16 @@ resolution = [512, 512] num_repeats = 1 ``` +### Convert Diffusers to FLUX.1 + +Script: `convert_diffusers_to_flux1.py` + +Converts Diffusers models to FLUX.1 models. The script is experimental. See `--help` for options. schnell and dev models are supported. AE/CLIP/T5XXL are not supported. The diffusers folder is a parent folder of `transfomer` folder. + +``` +python tools/convert_diffusers_to_flux.py --diffusers_path path/to/diffusers_folder_or_00001_safetensors --save_to path/to/flux1.safetensors --mem_eff_load_save --save_precision bf16 +``` + ## SD3 training SD3 training is done with `sd3_train.py`. diff --git a/tools/convert_diffusers_to_flux.py b/tools/convert_diffusers_to_flux.py new file mode 100644 index 000000000..9d8f7c74b --- /dev/null +++ b/tools/convert_diffusers_to_flux.py @@ -0,0 +1,223 @@ +# This script converts the diffusers of a Flux model to a safetensors file of a Flux.1 model. +# It is based on the implementation by 2kpr. Thanks to 2kpr! +# Major changes: +# - Iterates over three safetensors files to reduce memory usage, not loading all tensors at once. 
+# - Makes reverse map from diffusers map to avoid loading all tensors. +# - Removes dependency on .json file for weights mapping. +# - Adds support for custom memory efficient load and save functions. +# - Supports saving with different precision. +# - Supports .safetensors file as input. + +# Copyright 2024 2kpr. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import os +from pathlib import Path +import safetensors +from safetensors.torch import safe_open +import torch +from tqdm import tqdm + +from library.utils import setup_logging, str_to_dtype, MemoryEfficientSafeOpen, mem_eff_save_file + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +NUM_DOUBLE_BLOCKS = 19 +NUM_SINGLE_BLOCKS = 38 + +BFL_TO_DIFFUSERS_MAP = { + "time_in.in_layer.weight": ["time_text_embed.timestep_embedder.linear_1.weight"], + "time_in.in_layer.bias": ["time_text_embed.timestep_embedder.linear_1.bias"], + "time_in.out_layer.weight": ["time_text_embed.timestep_embedder.linear_2.weight"], + "time_in.out_layer.bias": ["time_text_embed.timestep_embedder.linear_2.bias"], + "vector_in.in_layer.weight": ["time_text_embed.text_embedder.linear_1.weight"], + "vector_in.in_layer.bias": ["time_text_embed.text_embedder.linear_1.bias"], + "vector_in.out_layer.weight": ["time_text_embed.text_embedder.linear_2.weight"], + "vector_in.out_layer.bias": ["time_text_embed.text_embedder.linear_2.bias"], + "guidance_in.in_layer.weight": ["time_text_embed.guidance_embedder.linear_1.weight"], + "guidance_in.in_layer.bias": ["time_text_embed.guidance_embedder.linear_1.bias"], + "guidance_in.out_layer.weight": ["time_text_embed.guidance_embedder.linear_2.weight"], + "guidance_in.out_layer.bias": ["time_text_embed.guidance_embedder.linear_2.bias"], + "txt_in.weight": ["context_embedder.weight"], + "txt_in.bias": ["context_embedder.bias"], + "img_in.weight": ["x_embedder.weight"], + "img_in.bias": ["x_embedder.bias"], + "double_blocks.().img_mod.lin.weight": ["norm1.linear.weight"], + "double_blocks.().img_mod.lin.bias": ["norm1.linear.bias"], + "double_blocks.().txt_mod.lin.weight": ["norm1_context.linear.weight"], + "double_blocks.().txt_mod.lin.bias": ["norm1_context.linear.bias"], + "double_blocks.().img_attn.qkv.weight": ["attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight"], + "double_blocks.().img_attn.qkv.bias": ["attn.to_q.bias", "attn.to_k.bias", "attn.to_v.bias"], + "double_blocks.().txt_attn.qkv.weight": ["attn.add_q_proj.weight", "attn.add_k_proj.weight", "attn.add_v_proj.weight"], + "double_blocks.().txt_attn.qkv.bias": ["attn.add_q_proj.bias", "attn.add_k_proj.bias", "attn.add_v_proj.bias"], + "double_blocks.().img_attn.norm.query_norm.scale": ["attn.norm_q.weight"], + "double_blocks.().img_attn.norm.key_norm.scale": ["attn.norm_k.weight"], + "double_blocks.().txt_attn.norm.query_norm.scale": ["attn.norm_added_q.weight"], + "double_blocks.().txt_attn.norm.key_norm.scale": ["attn.norm_added_k.weight"], + "double_blocks.().img_mlp.0.weight": ["ff.net.0.proj.weight"], + 
"double_blocks.().img_mlp.0.bias": ["ff.net.0.proj.bias"], + "double_blocks.().img_mlp.2.weight": ["ff.net.2.weight"], + "double_blocks.().img_mlp.2.bias": ["ff.net.2.bias"], + "double_blocks.().txt_mlp.0.weight": ["ff_context.net.0.proj.weight"], + "double_blocks.().txt_mlp.0.bias": ["ff_context.net.0.proj.bias"], + "double_blocks.().txt_mlp.2.weight": ["ff_context.net.2.weight"], + "double_blocks.().txt_mlp.2.bias": ["ff_context.net.2.bias"], + "double_blocks.().img_attn.proj.weight": ["attn.to_out.0.weight"], + "double_blocks.().img_attn.proj.bias": ["attn.to_out.0.bias"], + "double_blocks.().txt_attn.proj.weight": ["attn.to_add_out.weight"], + "double_blocks.().txt_attn.proj.bias": ["attn.to_add_out.bias"], + "single_blocks.().modulation.lin.weight": ["norm.linear.weight"], + "single_blocks.().modulation.lin.bias": ["norm.linear.bias"], + "single_blocks.().linear1.weight": ["attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight", "proj_mlp.weight"], + "single_blocks.().linear1.bias": ["attn.to_q.bias", "attn.to_k.bias", "attn.to_v.bias", "proj_mlp.bias"], + "single_blocks.().linear2.weight": ["proj_out.weight"], + "single_blocks.().norm.query_norm.scale": ["attn.norm_q.weight"], + "single_blocks.().norm.key_norm.scale": ["attn.norm_k.weight"], + "single_blocks.().linear2.weight": ["proj_out.weight"], + "single_blocks.().linear2.bias": ["proj_out.bias"], + "final_layer.linear.weight": ["proj_out.weight"], + "final_layer.linear.bias": ["proj_out.bias"], + "final_layer.adaLN_modulation.1.weight": ["norm_out.linear.weight"], + "final_layer.adaLN_modulation.1.bias": ["norm_out.linear.bias"], +} + + +def convert(args): + # if diffusers_path is folder, get safetensors file + diffusers_path = Path(args.diffusers_path) + if diffusers_path.is_dir(): + diffusers_path = Path.joinpath(diffusers_path, "transformer", "diffusion_pytorch_model-00001-of-00003.safetensors") + + flux_path = Path(args.save_to) + if not os.path.exists(flux_path.parent): + os.makedirs(flux_path.parent) + + if not diffusers_path.exists(): + logger.error(f"Error: Missing transformer safetensors file: {diffusers_path}") + return + + mem_eff_flag = args.mem_eff_load_save + save_dtype = str_to_dtype(args.save_precision) if args.save_precision is not None else None + + # make reverse map from diffusers map + diffusers_to_bfl_map = {} # key: diffusers_key, value: (index, bfl_key) + for b in range(NUM_DOUBLE_BLOCKS): + for key, weights in BFL_TO_DIFFUSERS_MAP.items(): + if key.startswith("double_blocks."): + block_prefix = f"transformer_blocks.{b}." + for i, weight in enumerate(weights): + diffusers_to_bfl_map[f"{block_prefix}{weight}"] = (i, key.replace("()", f"{b}")) + for b in range(NUM_SINGLE_BLOCKS): + for key, weights in BFL_TO_DIFFUSERS_MAP.items(): + if key.startswith("single_blocks."): + block_prefix = f"single_transformer_blocks.{b}." 
+ for i, weight in enumerate(weights): + diffusers_to_bfl_map[f"{block_prefix}{weight}"] = (i, key.replace("()", f"{b}")) + for key, weights in BFL_TO_DIFFUSERS_MAP.items(): + if not (key.startswith("double_blocks.") or key.startswith("single_blocks.")): + for i, weight in enumerate(weights): + diffusers_to_bfl_map[weight] = (i, key) + + # iterate over three safetensors files to reduce memory usage + flux_sd = {} + for i in range(3): + # replace 00001 with 0000i + current_diffusers_path = Path(str(diffusers_path).replace("00001", f"0000{i+1}")) + logger.info(f"Loading diffusers file: {current_diffusers_path}") + + open_func = MemoryEfficientSafeOpen if mem_eff_flag else (lambda x: safe_open(x, framework="pt")) + with open_func(current_diffusers_path) as f: + for diffusers_key in tqdm(f.keys()): + if diffusers_key in diffusers_to_bfl_map: + tensor = f.get_tensor(diffusers_key).to("cpu") + if save_dtype is not None: + tensor = tensor.to(save_dtype) + + index, bfl_key = diffusers_to_bfl_map[diffusers_key] + if bfl_key not in flux_sd: + flux_sd[bfl_key] = [] + flux_sd[bfl_key].append((index, tensor)) + else: + logger.error(f"Error: Key not found in diffusers_to_bfl_map: {diffusers_key}") + return + + # concat tensors if multiple tensors are mapped to a single key, sort by index + for key, values in flux_sd.items(): + if len(values) == 1: + flux_sd[key] = values[0][1] + else: + flux_sd[key] = torch.cat([value[1] for value in sorted(values, key=lambda x: x[0])]) + + # special case for final_layer.adaLN_modulation.1.weight and final_layer.adaLN_modulation.1.bias + def swap_scale_shift(weight): + shift, scale = weight.chunk(2, dim=0) + new_weight = torch.cat([scale, shift], dim=0) + return new_weight + + if "final_layer.adaLN_modulation.1.weight" in flux_sd: + flux_sd["final_layer.adaLN_modulation.1.weight"] = swap_scale_shift(flux_sd["final_layer.adaLN_modulation.1.weight"]) + if "final_layer.adaLN_modulation.1.bias" in flux_sd: + flux_sd["final_layer.adaLN_modulation.1.bias"] = swap_scale_shift(flux_sd["final_layer.adaLN_modulation.1.bias"]) + + # save flux_sd to safetensors file + logger.info(f"Saving Flux safetensors file: {flux_path}") + if mem_eff_flag: + mem_eff_save_file(flux_sd, flux_path) + else: + safetensors.torch.save_file(flux_sd, flux_path) + + logger.info("Conversion completed.") + + +def setup_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--diffusers_path", + default=None, + type=str, + required=True, + help="Path to the original Flux diffusers folder or *-00001-of-00003.safetensors file." + " / 元のFlux diffusersフォルダーまたは*-00001-of-00003.safetensorsファイルへのパス", + ) + parser.add_argument( + "--save_to", + default=None, + type=str, + required=True, + help="Output path for the Flux safetensors file. 
/ Flux safetensorsファイルの出力先", + ) + parser.add_argument( + "--mem_eff_load_save", + action="store_true", + help="use custom memory efficient load and save functions for FLUX.1 model" + " / カスタムのメモリ効率の良い読み込みと保存関数をFLUX.1モデルに使用する", + ) + parser.add_argument( + "--save_precision", + type=str, + default=None, + help="precision in saving, default is same as loading precision" + "float32, fp16, bf16, fp8 (same as fp8_e4m3fn), fp8_e4m3fn, fp8_e4m3fnuz, fp8_e5m2, fp8_e5m2fnuz" + " / 保存時に精度を変更して保存する、デフォルトは読み込み時と同じ精度", + ) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + args = parser.parse_args() + convert(args) From be078bdaca41084a20edb952b98a82f3e05d2dad Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 15 Sep 2024 13:59:17 +0900 Subject: [PATCH 140/582] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7d6c336e6..f79fe21af 100644 --- a/README.md +++ b/README.md @@ -446,7 +446,7 @@ resolution = [512, 512] Script: `convert_diffusers_to_flux1.py` -Converts Diffusers models to FLUX.1 models. The script is experimental. See `--help` for options. schnell and dev models are supported. AE/CLIP/T5XXL are not supported. The diffusers folder is a parent folder of `transfomer` folder. +Converts Diffusers models to FLUX.1 models. The script is experimental. See `--help` for options. schnell and dev models are supported. AE/CLIP/T5XXL are not supported. The diffusers folder is a parent folder of `rmer` folder. ``` python tools/convert_diffusers_to_flux.py --diffusers_path path/to/diffusers_folder_or_00001_safetensors --save_to path/to/flux1.safetensors --mem_eff_load_save --save_precision bf16 From 96c677b4594ed6f28f3ef896f6deca7c3aced25d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 16 Sep 2024 10:42:09 +0900 Subject: [PATCH 141/582] fix to work lienar/cosine lr scheduler closes #1602 ref #1393 --- library/train_util.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 742d057e0..60afd4219 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4707,6 +4707,15 @@ def wrap_check_needless_num_warmup_steps(return_vals): **lr_scheduler_kwargs, ) + # these schedulers do not require `num_decay_steps` + if name == SchedulerType.LINEAR or name == SchedulerType.COSINE: + return schedule_func( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + **lr_scheduler_kwargs, + ) + # All other schedulers require `num_decay_steps` if num_decay_steps is None: raise ValueError(f"{name} requires `num_decay_steps`, please provide that argument.") @@ -5837,14 +5846,9 @@ def sample_image_inference( wandb_tracker = accelerator.get_tracker("wandb") import wandb + # not to commit images to avoid inconsistency between training and logging steps - wandb_tracker.log( - {f"sample_{i}": wandb.Image( - image, - caption=prompt # positive prompt as a caption - )}, - commit=False - ) + wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption # endregion From d8d15f1a7e09ca217930288b41bd239881126b93 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 16 Sep 2024 23:14:09 +0900 Subject: [PATCH 142/582] add support for specifying blocks in FLUX.1 LoRA training --- README.md | 24 ++++++++++++- networks/lora_flux.py | 82 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 103 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 
f79fe21af..24217d8b7 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 16, 2024: + + Added `train_double_block_indices` and `train_double_block_indices` to the LoRA training script to specify the indices of the blocks to train. See [Specify blocks to train in FLUX.1 LoRA training](#specify-blocks-to-train-in-flux1-lora-training) for details. + Sep 15, 2024: Added a script `convert_diffusers_to_flux.py` to convert Diffusers format FLUX.1 models (checkpoints) to BFL format. See `--help` for usage. Only Flux models are supported. AE/CLIP/T5XXL are not supported. @@ -54,9 +58,12 @@ Please update `safetensors` to `0.4.4` to fix the error when using `--resume`. ` - [FLUX.1 LoRA training](#flux1-lora-training) - [Key Options for FLUX.1 LoRA training](#key-options-for-flux1-lora-training) - - [Inference for FLUX.1 LoRA model](#inference-for-flux1-lora-model) + - [Distribution of timesteps](#distribution-of-timesteps) - [Key Features for FLUX.1 LoRA training](#key-features-for-flux1-lora-training) + - [Specify rank for each layer in FLUX.1](#specify-rank-for-each-layer-in-flux1) + - [Specify blocks to train in FLUX.1 LoRA training](#specify-blocks-to-train-in-flux1-lora-training) - [FLUX.1 OFT training](#flux1-oft-training) +- [Inference for FLUX.1 with LoRA model](#inference-for-flux1-with-lora-model) - [FLUX.1 fine-tuning](#flux1-fine-tuning) - [Key Features for FLUX.1 fine-tuning](#key-features-for-flux1-fine-tuning) - [Extract LoRA from FLUX.1 Models](#extract-lora-from-flux1-models) @@ -239,6 +246,21 @@ Each number corresponds to `img_in`, `time_in`, `vector_in`, `guidance_in`, `txt If you specify `0`, LoRA will not be applied to that layer. For example, `[4,0,0,0,4]` applies LoRA only to `img_in` and `txt_in`. +#### Specify blocks to train in FLUX.1 LoRA training + +You can specify the blocks to train in FLUX.1 LoRA training by specifying `train_double_block_indices` and `train_single_block_indices` in network_args. The indices are 0-based. The default (when omitted) is to train all blocks. The indices are specified as a list of integers or a range of integers, like `0,1,5,8` or `0,1,4-5,7`. The number of double blocks is 19, and the number of single blocks is 38, so the valid range is 0-18 and 0-37, respectively. `all` is also available to train all blocks, `none` is also available to train no blocks. + +example: +``` +--network_args "train_double_block_indices=0,1,8-12,18" "train_single_block_indices=3,10,20-25,37" +``` + +``` +--network_args "train_double_block_indices=none" "train_single_block_indices=10-15" +``` + +If you specify one of `train_double_block_indices` or `train_single_block_indices`, the other will be trained as usual. + ### FLUX.1 OFT training You can train OFT with almost the same options as LoRA, such as `--timestamp_sampling`. The following points are different. diff --git a/networks/lora_flux.py b/networks/lora_flux.py index a34cde1a8..f549ac18f 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -24,6 +24,10 @@ logger = logging.getLogger(__name__) +NUM_DOUBLE_BLOCKS = 19 +NUM_SINGLE_BLOCKS = 38 + + class LoRAModule(torch.nn.Module): """ replaces forward method of the original Linear, instead of replacing the original Linear module. @@ -354,6 +358,50 @@ def create_network( in_dims = [int(d) for d in in_dims.split(",")] # is it better to use ast.literal_eval? 
assert len(in_dims) == 5, f"invalid in_dims: {in_dims}, must be 5 dimensions (img, time, vector, guidance, txt)" + # double/single train blocks + def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: + """ + Parse a block selection string and return a list of booleans. + + Args: + selection (str): A string specifying which blocks to select. + total_blocks (int): The total number of blocks available. + + Returns: + List[bool]: A list of booleans indicating which blocks are selected. + """ + if selection == "all": + return [True] * total_blocks + if selection == "none" or selection == "": + return [False] * total_blocks + + selected = [False] * total_blocks + ranges = selection.split(",") + + for r in ranges: + if "-" in r: + start, end = map(str.strip, r.split("-")) + start = int(start) + end = int(end) + assert 0 <= start < total_blocks, f"invalid start index: {start}" + assert 0 <= end < total_blocks, f"invalid end index: {end}" + assert start <= end, f"invalid range: {start}-{end}" + for i in range(start, end + 1): + selected[i] = True + else: + index = int(r) + assert 0 <= index < total_blocks, f"invalid index: {index}" + selected[index] = True + + return selected + + train_double_block_indices = kwargs.get("train_double_block_indices", None) + train_single_block_indices = kwargs.get("train_single_block_indices", None) + if train_double_block_indices is not None: + train_double_block_indices = parse_block_selection(train_double_block_indices, NUM_DOUBLE_BLOCKS) + if train_single_block_indices is not None: + train_single_block_indices = parse_block_selection(train_single_block_indices, NUM_SINGLE_BLOCKS) + # rank/module dropout rank_dropout = kwargs.get("rank_dropout", None) if rank_dropout is not None: @@ -399,6 +447,8 @@ def create_network( train_t5xxl=train_t5xxl, type_dims=type_dims, in_dims=in_dims, + train_double_block_indices=train_double_block_indices, + train_single_block_indices=train_single_block_indices, verbose=verbose, ) @@ -509,6 +559,8 @@ def __init__( train_t5xxl: bool = False, type_dims: Optional[List[int]] = None, in_dims: Optional[List[int]] = None, + train_double_block_indices: Optional[List[bool]] = None, + train_single_block_indices: Optional[List[bool]] = None, verbose: Optional[bool] = False, ) -> None: super().__init__() @@ -527,6 +579,8 @@ def __init__( self.type_dims = type_dims self.in_dims = in_dims + self.train_double_block_indices = train_double_block_indices + self.train_single_block_indices = train_single_block_indices self.loraplus_lr_ratio = None self.loraplus_unet_lr_ratio = None @@ -600,7 +654,7 @@ def create_modules( dim = default_dim if default_dim is not None else self.lora_dim alpha = self.alpha - if type_dims is not None: + if is_flux and type_dims is not None: identifier = [ ("img_attn",), ("txt_attn",), @@ -613,9 +667,33 @@ def create_modules( ] for i, d in enumerate(type_dims): if d is not None and all([id in lora_name for id in identifier[i]]): - dim = d + dim = d # may be 0 for skip break + if ( + is_flux + and dim + and ( + self.train_double_block_indices is not None + or self.train_single_block_indices is not None + ) + and ("double" in lora_name or "single" in lora_name) + ): + # "lora_unet_double_blocks_0_..." or "lora_unet_single_blocks_0_..." 
+ block_index = int(lora_name.split("_")[4]) # bit dirty + if ( + "double" in lora_name + and self.train_double_block_indices is not None + and not self.train_double_block_indices[block_index] + ): + dim = 0 + elif ( + "single" in lora_name + and self.train_single_block_indices is not None + and not self.train_single_block_indices[block_index] + ): + dim = 0 + elif self.conv_lora_dim is not None: dim = self.conv_lora_dim alpha = self.conv_alpha From 0cbe95bcc7e88f518802f29fe2b99da806963267 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 17 Sep 2024 21:21:28 +0900 Subject: [PATCH 143/582] fix text_encoder_lr to work with int closes #1608 --- networks/lora_flux.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index f549ac18f..91e9cd77f 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -966,8 +966,8 @@ def prepare_optimizer_params_with_multiple_te_lrs(self, text_encoder_lr, unet_lr # if float, use the same value for both text encoders if text_encoder_lr is None or (isinstance(text_encoder_lr, list) and len(text_encoder_lr) == 0): text_encoder_lr = [default_lr, default_lr] - elif isinstance(text_encoder_lr, float): - text_encoder_lr = [text_encoder_lr, text_encoder_lr] + elif isinstance(text_encoder_lr, float) or isinstance(text_encoder_lr, int): + text_encoder_lr = [float(text_encoder_lr), float(text_encoder_lr)] elif len(text_encoder_lr) == 1: text_encoder_lr = [text_encoder_lr[0], text_encoder_lr[0]] From a2ad7e5644f08141fe053a2b63446d70d777bdcf Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 17 Sep 2024 21:42:14 +0900 Subject: [PATCH 144/582] blocks_to_swap=0 means no swap --- flux_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flux_train.py b/flux_train.py index 33481df8f..5d8326b1d 100644 --- a/flux_train.py +++ b/flux_train.py @@ -265,7 +265,7 @@ def train(args): flux.requires_grad_(True) - is_swapping_blocks = args.double_blocks_to_swap is not None or args.single_blocks_to_swap is not None + is_swapping_blocks = args.double_blocks_to_swap or args.single_blocks_to_swap if is_swapping_blocks: # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. # This idea is based on 2kpr's great work. Thank you! From bbd160b4ca9293881c222f9b9e1d832af69699db Mon Sep 17 00:00:00 2001 From: "Kohya S." <52813779+kohya-ss@users.noreply.github.com> Date: Wed, 18 Sep 2024 07:55:04 +0900 Subject: [PATCH 145/582] sd3 schedule free opt (#1605) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * New ScheduleFree support for Flux (#1600) * init * use no schedule * fix typo * update for eval() * fix typo * update * Update train_util.py * Update requirements.txt * update sfwrapper WIP * no need to check schedulefree optimizer * remove debug print * comment out schedulefree wrapper * update readme --------- Co-authored-by: 青龍聖者@bdsqlsz <865105819@qq.com> --- README.md | 8 +++ library/train_util.py | 152 ++++++++++++++++++++++++++++++++++++++++-- requirements.txt | 1 + 3 files changed, 154 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 24217d8b7..dc9862927 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,14 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 18, 2024: + +- Schedule-free optimizer is added. Thanks to sdbds! See PR [#1600](https://github.com/kohya-ss/sd-scripts/pull/1600) for details. + - `schedulefree` is added to the dependencies. 
Please update the library if necessary. + - AdamWScheduleFree or SGDScheduleFree can be used. Specify `adamwschedulefree` or `sgdschedulefree` in `--optimizer_type`. + - Wrapper classes are not available for now. + - These can be used not only for FLUX.1 training but also for other training scripts after merging to the dev/main branch. + Sep 16, 2024: Added `train_double_block_indices` and `train_double_block_indices` to the LoRA training script to specify the indices of the blocks to train. See [Specify blocks to train in FLUX.1 LoRA training](#specify-blocks-to-train-in-flux1-lora-training) for details. diff --git a/library/train_util.py b/library/train_util.py index 60afd4219..a54f23ff6 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3303,6 +3303,20 @@ def int_or_float(value): help='additional arguments for optimizer (like "weight_decay=0.01 betas=0.9,0.999 ...") / オプティマイザの追加引数(例: "weight_decay=0.01 betas=0.9,0.999 ...")', ) + # parser.add_argument( + # "--optimizer_schedulefree_wrapper", + # action="store_true", + # help="use schedulefree_wrapper any optimizer / 任意のオプティマイザにschedulefree_wrapperを使用", + # ) + + # parser.add_argument( + # "--schedulefree_wrapper_args", + # type=str, + # default=None, + # nargs="*", + # help='additional arguments for schedulefree_wrapper (like "momentum=0.9 weight_decay_at_y=0.1 ...") / オプティマイザの追加引数(例: "momentum=0.9 weight_decay_at_y=0.1 ...")', + # ) + parser.add_argument("--lr_scheduler_type", type=str, default="", help="custom scheduler module / 使用するスケジューラ") parser.add_argument( "--lr_scheduler_args", @@ -4582,26 +4596,146 @@ def get_optimizer(args, trainable_params): optimizer_class = torch.optim.AdamW optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) + elif optimizer_type.endswith("schedulefree".lower()): + try: + import schedulefree as sf + except ImportError: + raise ImportError("No schedulefree / schedulefreeがインストールされていないようです") + if optimizer_type == "AdamWScheduleFree".lower(): + optimizer_class = sf.AdamWScheduleFree + logger.info(f"use AdamWScheduleFree optimizer | {optimizer_kwargs}") + elif optimizer_type == "SGDScheduleFree".lower(): + optimizer_class = sf.SGDScheduleFree + logger.info(f"use SGDScheduleFree optimizer | {optimizer_kwargs}") + else: + raise ValueError(f"Unknown optimizer type: {optimizer_type}") + optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) + # make optimizer as train mode: we don't need to call train again, because eval will not be called in training loop + optimizer.train() + if optimizer is None: # 任意のoptimizerを使う - optimizer_type = args.optimizer_type # lowerでないやつ(微妙) - logger.info(f"use {optimizer_type} | {optimizer_kwargs}") - if "." not in optimizer_type: + case_sensitive_optimizer_type = args.optimizer_type # not lower + logger.info(f"use {case_sensitive_optimizer_type} | {optimizer_kwargs}") + + if "." 
not in case_sensitive_optimizer_type: # from torch.optim optimizer_module = torch.optim - else: - values = optimizer_type.split(".") + else: # from other library + values = case_sensitive_optimizer_type.split(".") optimizer_module = importlib.import_module(".".join(values[:-1])) - optimizer_type = values[-1] + case_sensitive_optimizer_type = values[-1] - optimizer_class = getattr(optimizer_module, optimizer_type) + optimizer_class = getattr(optimizer_module, case_sensitive_optimizer_type) optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) + """ + # wrap any of above optimizer with schedulefree, if optimizer is not schedulefree + if args.optimizer_schedulefree_wrapper and not optimizer_type.endswith("schedulefree".lower()): + try: + import schedulefree as sf + except ImportError: + raise ImportError("No schedulefree / schedulefreeがインストールされていないようです") + + schedulefree_wrapper_kwargs = {} + if args.schedulefree_wrapper_args is not None and len(args.schedulefree_wrapper_args) > 0: + for arg in args.schedulefree_wrapper_args: + key, value = arg.split("=") + value = ast.literal_eval(value) + schedulefree_wrapper_kwargs[key] = value + + sf_wrapper = sf.ScheduleFreeWrapper(optimizer, **schedulefree_wrapper_kwargs) + sf_wrapper.train() # make optimizer as train mode + + # we need to make optimizer as a subclass of torch.optim.Optimizer, we make another Proxy class over SFWrapper + class OptimizerProxy(torch.optim.Optimizer): + def __init__(self, sf_wrapper): + self._sf_wrapper = sf_wrapper + + def __getattr__(self, name): + return getattr(self._sf_wrapper, name) + + # override properties + @property + def state(self): + return self._sf_wrapper.state + + @state.setter + def state(self, state): + self._sf_wrapper.state = state + + @property + def param_groups(self): + return self._sf_wrapper.param_groups + + @param_groups.setter + def param_groups(self, param_groups): + self._sf_wrapper.param_groups = param_groups + + @property + def defaults(self): + return self._sf_wrapper.defaults + + @defaults.setter + def defaults(self, defaults): + self._sf_wrapper.defaults = defaults + + def add_param_group(self, param_group): + self._sf_wrapper.add_param_group(param_group) + + def load_state_dict(self, state_dict): + self._sf_wrapper.load_state_dict(state_dict) + + def state_dict(self): + return self._sf_wrapper.state_dict() + + def zero_grad(self): + self._sf_wrapper.zero_grad() + + def step(self, closure=None): + self._sf_wrapper.step(closure) + + def train(self): + self._sf_wrapper.train() + + def eval(self): + self._sf_wrapper.eval() + + # isinstance チェックをパスするためのメソッド + def __instancecheck__(self, instance): + return isinstance(instance, (type(self), Optimizer)) + + optimizer = OptimizerProxy(sf_wrapper) + + logger.info(f"wrap optimizer with ScheduleFreeWrapper | {schedulefree_wrapper_kwargs}") + """ + optimizer_name = optimizer_class.__module__ + "." + optimizer_class.__name__ optimizer_args = ",".join([f"{k}={v}" for k, v in optimizer_kwargs.items()]) return optimizer_name, optimizer_args, optimizer +def is_schedulefree_optimizer(optimizer: Optimizer, args: argparse.Namespace) -> bool: + return args.optimizer_type.lower().endswith("schedulefree".lower()) # or args.optimizer_schedulefree_wrapper + + +def get_dummy_scheduler(optimizer: Optimizer) -> Any: + # dummy scheduler for schedulefree optimizer. supports only empty step(), get_last_lr() and optimizers. + # this scheduler is used for logging only. 
+ # this isn't be wrapped by accelerator because of this class is not a subclass of torch.optim.lr_scheduler._LRScheduler + class DummyScheduler: + def __init__(self, optimizer: Optimizer): + self.optimizer = optimizer + + def step(self): + pass + + def get_last_lr(self): + return [group["lr"] for group in self.optimizer.param_groups] + + return DummyScheduler(optimizer) + + # Modified version of get_scheduler() function from diffusers.optimizer.get_scheduler # Add some checking and features to the original function. @@ -4610,6 +4744,10 @@ def get_scheduler_fix(args, optimizer: Optimizer, num_processes: int): """ Unified API to get any scheduler from its name. """ + # if schedulefree optimizer, return dummy scheduler + if is_schedulefree_optimizer(optimizer, args): + return get_dummy_scheduler(optimizer) + name = args.lr_scheduler num_training_steps = args.max_train_steps * num_processes # * args.gradient_accumulation_steps num_warmup_steps: Optional[int] = ( diff --git a/requirements.txt b/requirements.txt index 9a4fa0c15..bab53f20f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ pytorch-lightning==1.9.0 bitsandbytes==0.43.3 prodigyopt==1.0 lion-pytorch==0.0.6 +schedulefree==1.2.7 tensorboard safetensors==0.4.4 # gradio==3.16.2 From e74502117bcf161ef5698fb0adba4f9fa0171b8d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 18 Sep 2024 08:04:32 +0900 Subject: [PATCH 146/582] update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index dc9862927..034a260ff 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ The command to install PyTorch is as follows: Sep 18, 2024: - Schedule-free optimizer is added. Thanks to sdbds! See PR [#1600](https://github.com/kohya-ss/sd-scripts/pull/1600) for details. + - Details of the schedule-free optimizer can be found in [facebookresearch/schedule_free](https://github.com/facebookresearch/schedule_free). - `schedulefree` is added to the dependencies. Please update the library if necessary. - AdamWScheduleFree or SGDScheduleFree can be used. Specify `adamwschedulefree` or `sgdschedulefree` in `--optimizer_type`. - Wrapper classes are not available for now. From 1286e00bb0fc34c296f24b7057777f1c37cf8e11 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 18 Sep 2024 21:31:54 +0900 Subject: [PATCH 147/582] fix to call train/eval in schedulefree #1605 --- README.md | 3 +++ flux_train.py | 10 ++++++++++ library/train_util.py | 15 ++++++++++++++- train_network.py | 6 ++++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 034a260ff..843ae181b 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 18, 2024 (update 1): +Fixed an issue where train()/eval() was not called properly with the schedule-free optimizer. The schedule-free optimizer can be used in FLUX.1 LoRA training and fine-tuning for now. + Sep 18, 2024: - Schedule-free optimizer is added. Thanks to sdbds! See PR [#1600](https://github.com/kohya-ss/sd-scripts/pull/1600) for details. 
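For reference, a minimal sketch of selecting the schedule-free optimizer on the command line is shown below; the learning rate and weight decay values are placeholders, not recommendations. Because a dummy scheduler is used internally for schedule-free optimizers, the `--lr_scheduler` setting has no effect in this case:

```
--optimizer_type adamwschedulefree --learning_rate 1e-3 --optimizer_args "weight_decay=0.01"
```
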
diff --git a/flux_train.py b/flux_train.py index 5d8326b1d..bc4e62793 100644 --- a/flux_train.py +++ b/flux_train.py @@ -347,8 +347,13 @@ def train(args): logger.info(f"using {len(optimizers)} optimizers for blockwise fused optimizers") + if train_util.is_schedulefree_optimizer(optimizers[0], args): + raise ValueError("Schedule-free optimizer is not supported with blockwise fused optimizers") + optimizer_train_fn = lambda: None # dummy function + optimizer_eval_fn = lambda: None # dummy function else: _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) + optimizer_train_fn, optimizer_eval_fn = train_util.get_optimizer_train_eval_fn(optimizer, args) # prepare dataloader # strategies are set here because they cannot be referenced in another process. Copy them with the dataset @@ -760,6 +765,7 @@ def optimizer_hook(parameter: torch.Tensor): progress_bar.update(1) global_step += 1 + optimizer_eval_fn() flux_train_utils.sample_images( accelerator, args, None, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs ) @@ -778,6 +784,7 @@ def optimizer_hook(parameter: torch.Tensor): global_step, accelerator.unwrap_model(flux), ) + optimizer_train_fn() current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず if len(accelerator.trackers) > 0: @@ -800,6 +807,7 @@ def optimizer_hook(parameter: torch.Tensor): accelerator.wait_for_everyone() + optimizer_eval_fn() if args.save_every_n_epochs is not None: if accelerator.is_main_process: flux_train_utils.save_flux_model_on_epoch_end_or_stepwise( @@ -816,12 +824,14 @@ def optimizer_hook(parameter: torch.Tensor): flux_train_utils.sample_images( accelerator, args, epoch + 1, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs ) + optimizer_train_fn() is_main_process = accelerator.is_main_process # if is_main_process: flux = accelerator.unwrap_model(flux) accelerator.end_training() + optimizer_eval_fn() if args.save_state or args.save_state_on_train_end: train_util.save_state_on_train_end(args, accelerator) diff --git a/library/train_util.py b/library/train_util.py index a54f23ff6..fe9deb940 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -13,6 +13,7 @@ import time from typing import ( Any, + Callable, Dict, List, NamedTuple, @@ -4715,8 +4716,20 @@ def __instancecheck__(self, instance): return optimizer_name, optimizer_args, optimizer +def get_optimizer_train_eval_fn(optimizer: Optimizer, args: argparse.Namespace) -> Tuple[Callable, Callable]: + if not is_schedulefree_optimizer(optimizer, args): + # return dummy func + return lambda: None, lambda: None + + # get train and eval functions from optimizer + train_fn = optimizer.train + eval_fn = optimizer.eval + + return train_fn, eval_fn + + def is_schedulefree_optimizer(optimizer: Optimizer, args: argparse.Namespace) -> bool: - return args.optimizer_type.lower().endswith("schedulefree".lower()) # or args.optimizer_schedulefree_wrapper + return args.optimizer_type.lower().endswith("schedulefree".lower()) # or args.optimizer_schedulefree_wrapper def get_dummy_scheduler(optimizer: Optimizer) -> Any: diff --git a/train_network.py b/train_network.py index 34385ae08..55faa143e 100644 --- a/train_network.py +++ b/train_network.py @@ -498,6 +498,7 @@ def train(self, args): # accelerator.print(f"trainable_params: {k} = {v}") optimizer_name, optimizer_args, optimizer = train_util.get_optimizer(args, trainable_params) + optimizer_train_fn, optimizer_eval_fn = train_util.get_optimizer_train_eval_fn(optimizer, args) # prepare dataloader # 
strategies are set here because they cannot be referenced in another process. Copy them with the dataset @@ -1199,6 +1200,7 @@ def remove_model(old_ckpt_name): progress_bar.update(1) global_step += 1 + optimizer_eval_fn() self.sample_images( accelerator, args, None, global_step, accelerator.device, vae, tokenizers, text_encoder, unet ) @@ -1217,6 +1219,7 @@ def remove_model(old_ckpt_name): if remove_step_no is not None: remove_ckpt_name = train_util.get_step_ckpt_name(args, "." + args.save_model_as, remove_step_no) remove_model(remove_ckpt_name) + optimizer_train_fn() current_loss = loss.detach().item() loss_recorder.add(epoch=epoch, step=step, loss=current_loss) @@ -1243,6 +1246,7 @@ def remove_model(old_ckpt_name): accelerator.wait_for_everyone() # 指定エポックごとにモデルを保存 + optimizer_eval_fn() if args.save_every_n_epochs is not None: saving = (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs if is_main_process and saving: @@ -1258,6 +1262,7 @@ def remove_model(old_ckpt_name): train_util.save_and_remove_state_on_epoch_end(args, accelerator, epoch + 1) self.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) + optimizer_train_fn() # end of epoch @@ -1268,6 +1273,7 @@ def remove_model(old_ckpt_name): network = accelerator.unwrap_model(network) accelerator.end_training() + optimizer_eval_fn() if is_main_process and (args.save_state or args.save_state_on_train_end): train_util.save_state_on_train_end(args, accelerator) From 3957372ded6fda20553acaf169993a422b829bdc Mon Sep 17 00:00:00 2001 From: Ed McManus Date: Thu, 19 Sep 2024 14:30:03 -0700 Subject: [PATCH 148/582] Retain alpha in `pil_resize` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the alpha channel is dropped by `pil_resize()` when `--alpha_mask` is supplied and the image width does not exceed the bucket. 
This codepath is entered on the last line, here: ``` def trim_and_resize_if_required( random_crop: bool, image: np.ndarray, reso, resized_size: Tuple[int, int] ) -> Tuple[np.ndarray, Tuple[int, int], Tuple[int, int, int, int]]: image_height, image_width = image.shape[0:2] original_size = (image_width, image_height) # size before resize if image_width != resized_size[0] or image_height != resized_size[1]: # リサイズする if image_width > resized_size[0] and image_height > resized_size[1]: image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ else: image = pil_resize(image, resized_size) ``` --- library/utils.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/library/utils.py b/library/utils.py index a0bb19650..2171c7190 100644 --- a/library/utils.py +++ b/library/utils.py @@ -305,13 +305,26 @@ def _convert_float8(byte_tensor, dtype_str, shape): raise ValueError(f"Unsupported float8 type: {dtype_str} (upgrade PyTorch to support float8 types)") def pil_resize(image, size, interpolation=Image.LANCZOS): - pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + # Check if the image has an alpha channel + has_alpha = image.shape[2] == 4 if len(image.shape) == 3 else False - # use Pillow resize + if has_alpha: + # Convert BGRA to RGBA + pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)) + else: + # Convert BGR to RGB + pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + # Resize the image resized_pil = pil_image.resize(size, interpolation) - # return cv2 image - resized_cv2 = cv2.cvtColor(np.array(resized_pil), cv2.COLOR_RGB2BGR) + # Convert back to cv2 format + if has_alpha: + # Convert RGBA to BGRA + resized_cv2 = cv2.cvtColor(np.array(resized_pil), cv2.COLOR_RGBA2BGRA) + else: + # Convert RGB to BGR + resized_cv2 = cv2.cvtColor(np.array(resized_pil), cv2.COLOR_RGB2BGR) return resized_cv2 From de4bb657b089cc28f4127e891b927895892e20b5 Mon Sep 17 00:00:00 2001 From: Ed McManus Date: Thu, 19 Sep 2024 14:38:32 -0700 Subject: [PATCH 149/582] Update utils.py Cleanup --- library/utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/library/utils.py b/library/utils.py index 2171c7190..8a0c782c0 100644 --- a/library/utils.py +++ b/library/utils.py @@ -305,25 +305,19 @@ def _convert_float8(byte_tensor, dtype_str, shape): raise ValueError(f"Unsupported float8 type: {dtype_str} (upgrade PyTorch to support float8 types)") def pil_resize(image, size, interpolation=Image.LANCZOS): - # Check if the image has an alpha channel has_alpha = image.shape[2] == 4 if len(image.shape) == 3 else False if has_alpha: - # Convert BGRA to RGBA pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)) else: - # Convert BGR to RGB pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) - # Resize the image resized_pil = pil_image.resize(size, interpolation) # Convert back to cv2 format if has_alpha: - # Convert RGBA to BGRA resized_cv2 = cv2.cvtColor(np.array(resized_pil), cv2.COLOR_RGBA2BGRA) else: - # Convert RGB to BGR resized_cv2 = cv2.cvtColor(np.array(resized_pil), cv2.COLOR_RGB2BGR) return resized_cv2 From 0535cd29b926530255d5400374813432ec52c3df Mon Sep 17 00:00:00 2001 From: Akegarasu Date: Fri, 20 Sep 2024 10:05:22 +0800 Subject: [PATCH 150/582] fix: backward compatibility for text_encoder_lr --- train_network.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 55faa143e..dfa51a9c8 100644 --- 
a/train_network.py +++ b/train_network.py @@ -471,7 +471,11 @@ def train(self, args): if support_multiple_lrs: text_encoder_lr = args.text_encoder_lr else: - text_encoder_lr = None if args.text_encoder_lr is None or len(args.text_encoder_lr) == 0 else args.text_encoder_lr[0] + # toml backward compatibility + if args.text_encoder_lr is None or isinstance(args.text_encoder_lr, float): + text_encoder_lr = args.text_encoder_lr + else: + text_encoder_lr = None if len(args.text_encoder_lr) == 0 else args.text_encoder_lr[0] try: if support_multiple_lrs: results = network.prepare_optimizer_params_with_multiple_te_lrs(text_encoder_lr, args.unet_lr, args.learning_rate) From 583d4a436c1cef57fce405d0167fb7ce575fc768 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 20 Sep 2024 22:22:24 +0900 Subject: [PATCH 151/582] add compatibility for int LR (D-Adaptation etc.) #1620 --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index dfa51a9c8..b24f89b1e 100644 --- a/train_network.py +++ b/train_network.py @@ -472,7 +472,7 @@ def train(self, args): text_encoder_lr = args.text_encoder_lr else: # toml backward compatibility - if args.text_encoder_lr is None or isinstance(args.text_encoder_lr, float): + if args.text_encoder_lr is None or isinstance(args.text_encoder_lr, float) or isinstance(args.text_encoder_lr, int): text_encoder_lr = args.text_encoder_lr else: text_encoder_lr = None if len(args.text_encoder_lr) == 0 else args.text_encoder_lr[0] From 56a7bc171d48089fb50f8638537e42d07c579db3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 26 Sep 2024 08:26:31 +0900 Subject: [PATCH 152/582] new block swap for FLUX.1 fine tuning --- README.md | 47 ++++++-- flux_train.py | 251 ++++++++++++++++++++++++++--------------- library/flux_models.py | 168 +++++++++++++++------------ 3 files changed, 297 insertions(+), 169 deletions(-) diff --git a/README.md b/README.md index ef691e918..7d623f900 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Sep 26, 2024: +The implementation of block swap during FLUX.1 fine-tuning has been changed to improve speed about 10% (depends on the environment). A new `--blocks_to_swap` option has been added, and `--double_blocks_to_swap` and `--single_blocks_to_swap` are deprecated. `--double_blocks_to_swap` and `--single_blocks_to_swap` are working as before, but they will be removed in the future. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. + + Sep 18, 2024 (update 1): Fixed an issue where train()/eval() was not called properly with the schedule-free optimizer. The schedule-free optimizer can be used in FLUX.1 LoRA training and fine-tuning for now. @@ -307,6 +311,8 @@ python flux_minimal_inference.py --ckpt flux1-dev.safetensors --clip_l sd3/clip_ The memory-efficient training with block swap is based on 2kpr's implementation. Thanks to 2kpr! +__`--double_blocks_to_swap` and `--single_blocks_to_swap` are deprecated. These options is still available, but they will be removed in the future. Please use `--blocks_to_swap` instead. These options are equivalent to specifying `double_blocks_to_swap + single_blocks_to_swap // 2` in `--blocks_to_swap`.__ + Sample command for FLUX.1 fine-tuning is below. This will work with 24GB VRAM GPUs, and 64GB main memory is recommended. 
``` @@ -319,39 +325,62 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_t --optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 --timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 ---fused_backward_pass --double_blocks_to_swap 6 --cpu_offload_checkpointing --full_bf16 +--fused_backward_pass --blocks_to_swap 8 --full_bf16 ``` (The command is multi-line for readability. Please combine it into one line.) -Options are almost the same as LoRA training. The difference is `--full_bf16`, `--blockwise_fused_optimizers`, `--double_blocks_to_swap` and `--cpu_offload_checkpointing`. `--single_blocks_to_swap` is also available. +Options are almost the same as LoRA training. The difference is `--full_bf16`, `--fused_backward_pass` and `--blocks_to_swap`. `--cpu_offload_checkpointing` is also available. `--full_bf16` enables the training with bf16 (weights and gradients). `--fused_backward_pass` enables the fusing of the optimizer step into the backward pass for each parameter. This reduces the memory usage during training. Only Adafactor optimizer is supported for now. Stochastic rounding is also enabled when `--fused_backward_pass` and `--full_bf16` are specified. -`--blockwise_fused_optimizers` enables the fusing of the optimizer step into the backward pass for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency. `--blockwise_fused_optimizers` cannot be used with `--fused_backward_pass`. Stochastic rounding is not supported for now. +`--blockwise_fused_optimizers` enables the fusing of the optimizer step into the backward pass for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency and stochastic rounding. `--blockwise_fused_optimizers` cannot be used with `--fused_backward_pass`. Stochastic rounding is not supported for now. -`--double_blocks_to_swap` and `--single_blocks_to_swap` are the number of double blocks and single blocks to swap. The default is None (no swap). These options must be combined with `--fused_backward_pass` or `--blockwise_fused_optimizers`. `--double_blocks_to_swap` can be specified with `--single_blocks_to_swap`. The recommended maximum number of blocks to swap is 9 for double blocks and 18 for single blocks. Please see the next chapter for details. +`--blocks_to_swap` is the number of blocks to swap. The default is None (no swap). These options must be combined with `--fused_backward_pass` or `--blockwise_fused_optimizers`. The recommended maximum value is 36. -`--cpu_offload_checkpointing` is to offload the gradient checkpointing to CPU. This reduces about 2GB of VRAM usage. +`--cpu_offload_checkpointing` is to offload the gradient checkpointing to CPU. This reduces about 2GB of VRAM usage. All these options are experimental and may change in the future. The increasing the number of blocks to swap may reduce the memory usage, but the training speed will be slower. `--cpu_offload_checkpointing` also slows down the training. -Swap 6 double blocks and use cpu offload checkpointing may be a good starting point. Please try different settings according to VRAM usage and training speed. +Swap 8 blocks without cpu offload checkpointing may be a good starting point for 24GB VRAM GPUs. 
Please try different settings according to VRAM usage and training speed.

The learning rate and the number of epochs are not optimized yet. Please adjust them according to the training results.

+#### How to use block swap
+
+There are two possible ways to use block swap. It is unknown which is better. (Illustrative option fragments for both patterns are sketched below.)
+
+1. Swap only the minimum number of blocks needed to fit in VRAM, with batch size 1, which keeps the time for each step short.
+
+   The above command example is for this usage.
+
+2. Swap many blocks so that the batch size can be increased, which shortens the training time per sample.
+
+   For example, swapping 20 blocks seems to allow a batch size of about 6. In this case, the training time per sample will be relatively shorter than with usage 1.
+
+#### Training with <24GB VRAM GPUs
+
+Swapping 28 blocks without cpu offload checkpointing may work with 12GB VRAM GPUs. Please try different settings according to the VRAM size of your GPU.
+
+T5XXL requires about 10GB of VRAM, so 10GB of VRAM is the minimum requirement for FLUX.1 fine-tuning.
+
 #### Key Features for FLUX.1 fine-tuning

-1. Technical details of double/single block swap:
+1. Technical details of block swap:
   - Reduce memory usage by transferring double and single blocks of FLUX.1 from GPU to CPU when they are not needed.
   - During forward pass, the weights of the blocks that have finished calculation are transferred to CPU, and the weights of the blocks to be calculated are transferred to GPU.
   - The same is true for the backward pass, but the order is reversed. The gradients remain on the GPU.
   - Since the transfer between CPU and GPU takes time, the training will be slower.
-   - `--double_blocks_to_swap` and `--single_blocks_to_swap` specify the number of blocks to swap. For example, `--double_blocks_to_swap 6` swaps 6 blocks at each step of training, but the remaining 13 blocks are always on the GPU.
-   - About 640MB of memory can be saved per double block, and about 320MB of memory can be saved per single block.
+   - `--blocks_to_swap` specifies the number of blocks to swap.
+   - About 640MB of memory can be saved per block.
+   - Since the memory usage of one double block and two single blocks is almost the same, the transfer of single blocks is done in units of two. For example, consider the case of `--blocks_to_swap 6`.
+     - Before the forward pass, all double blocks and 26 (=38-12) single blocks are on the GPU. The last 12 single blocks are on the CPU.
+     - In the forward pass, the 6 double blocks that have finished calculation (the first 6 blocks) are transferred to the CPU, and the 12 single blocks to be calculated (the last 12 blocks) are transferred to the GPU.
+     - The same is true for the backward pass, but in reverse order. The 12 single blocks that have finished calculation are transferred to the CPU, and the 6 double blocks to be calculated are transferred to the GPU.
+     - After the backward pass, the blocks are back to their original locations.

 2. Sample Image Generation:
   - Sample image generation during training is now supported. 
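As a rough illustration of the two usage patterns described under "How to use block swap" above, the corresponding option fragments might look like the following. The numbers are taken from the examples in this section rather than tuned recommendations, and this sketch assumes the batch size is given with `--train_batch_size` instead of in the dataset config file:

```
# pattern 1: swap as few blocks as possible and keep batch size 1
--fused_backward_pass --blocks_to_swap 8 --train_batch_size 1

# pattern 2: swap many blocks and raise the batch size instead
--fused_backward_pass --blocks_to_swap 20 --train_batch_size 6
```
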
diff --git a/flux_train.py b/flux_train.py index bc4e62793..bf34208f1 100644 --- a/flux_train.py +++ b/flux_train.py @@ -11,10 +11,12 @@ # - Per-block fused optimizer instances import argparse +from concurrent.futures import ThreadPoolExecutor import copy import math import os from multiprocessing import Value +import time from typing import List import toml @@ -265,14 +267,30 @@ def train(args): flux.requires_grad_(True) - is_swapping_blocks = args.double_blocks_to_swap or args.single_blocks_to_swap + # block swap + + # backward compatibility + if args.blocks_to_swap is None: + blocks_to_swap = args.double_blocks_to_swap or 0 + if args.single_blocks_to_swap is not None: + blocks_to_swap += args.single_blocks_to_swap // 2 + if blocks_to_swap > 0: + logger.warning( + "double_blocks_to_swap and single_blocks_to_swap are deprecated. Use blocks_to_swap instead." + " / double_blocks_to_swapとsingle_blocks_to_swapは非推奨です。blocks_to_swapを使ってください。" + ) + logger.info( + f"double_blocks_to_swap={args.double_blocks_to_swap} and single_blocks_to_swap={args.single_blocks_to_swap} are converted to blocks_to_swap={blocks_to_swap}." + ) + args.blocks_to_swap = blocks_to_swap + del blocks_to_swap + + is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 if is_swapping_blocks: # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. # This idea is based on 2kpr's great work. Thank you! - logger.info( - f"enable block swap: double_blocks_to_swap={args.double_blocks_to_swap}, single_blocks_to_swap={args.single_blocks_to_swap}" - ) - flux.enable_block_swap(args.double_blocks_to_swap, args.single_blocks_to_swap) + logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") + flux.enable_block_swap(args.blocks_to_swap) if not cache_latents: # load VAE here if not cached @@ -443,82 +461,120 @@ def train(args): # resumeする train_util.resume_from_local_or_hf_if_specified(accelerator, args) + # memory efficient block swapping + + def get_block_unit(dbl_blocks, sgl_blocks, index: int): + if index < len(dbl_blocks): + return (dbl_blocks[index],) + else: + index -= len(dbl_blocks) + index *= 2 + return (sgl_blocks[index], sgl_blocks[index + 1]) + + def submit_move_blocks(futures, thread_pool, block_idx_to_cpu, block_idx_to_cuda, dbl_blocks, sgl_blocks, device): + def move_blocks(bidx_to_cpu, blocks_to_cpu, bidx_to_cuda, blocks_to_cuda, dvc): + # print(f"Backward: Move block {bidx_to_cpu} to CPU") + for block in blocks_to_cpu: + block = block.to("cpu", non_blocking=True) + torch.cuda.empty_cache() + + # print(f"Backward: Move block {bidx_to_cuda} to CUDA") + for block in blocks_to_cuda: + block = block.to(dvc, non_blocking=True) + + torch.cuda.synchronize() + # print(f"Backward: Moved blocks {bidx_to_cpu} and {bidx_to_cuda}") + return bidx_to_cpu, bidx_to_cuda + + blocks_to_cpu = get_block_unit(dbl_blocks, sgl_blocks, block_idx_to_cpu) + blocks_to_cuda = get_block_unit(dbl_blocks, sgl_blocks, block_idx_to_cuda) + + futures[block_idx_to_cuda] = thread_pool.submit( + move_blocks, block_idx_to_cpu, blocks_to_cpu, block_idx_to_cuda, blocks_to_cuda, device + ) + + def wait_blocks_move(block_idx, futures): + if block_idx not in futures: + return + # print(f"Backward: Wait for block {block_idx}") + # start_time = time.perf_counter() + future = futures.pop(block_idx) + future.result() + # print(f"Backward: Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s") + # torch.cuda.synchronize() + # print(f"Backward: Synchronized: 
{time.perf_counter()-start_time:.2f}s") + if args.fused_backward_pass: # use fused optimizer for backward pass: other optimizers will be supported in the future import library.adafactor_fused library.adafactor_fused.patch_adafactor_fused(optimizer) - double_blocks_to_swap = args.double_blocks_to_swap - single_blocks_to_swap = args.single_blocks_to_swap + blocks_to_swap = args.blocks_to_swap num_double_blocks = 19 # len(flux.double_blocks) num_single_blocks = 38 # len(flux.single_blocks) - handled_double_block_indices = set() - handled_single_block_indices = set() + num_block_units = num_double_blocks + num_single_blocks // 2 + handled_unit_indices = set() + + n = 1 # only asyncronous purpose, no need to increase this number + # n = 2 + # n = max(1, os.cpu_count() // 2) + thread_pool = ThreadPoolExecutor(max_workers=n) + futures = {} for param_group, param_name_group in zip(optimizer.param_groups, param_names): for parameter, param_name in zip(param_group["params"], param_name_group): if parameter.requires_grad: grad_hook = None - if double_blocks_to_swap: - if param_name.startswith("double_blocks"): - block_idx = int(param_name.split(".")[1]) - if ( - block_idx not in handled_double_block_indices - and block_idx >= (num_double_blocks - double_blocks_to_swap) - 1 - and block_idx < num_double_blocks - 1 - ): - # swap next (already backpropagated) block - handled_double_block_indices.add(block_idx) - block_idx_cpu = block_idx + 1 - block_idx_cuda = double_blocks_to_swap - (num_double_blocks - block_idx_cpu) - - # create swap hook - def create_double_swap_grad_hook(bidx, bidx_cuda): - def __grad_hook(tensor: torch.Tensor): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None - - # swap blocks if necessary - flux.double_blocks[bidx].to("cpu") - flux.double_blocks[bidx_cuda].to(accelerator.device) - # print(f"Move double block {bidx} to cpu and {bidx_cuda} to device") - - return __grad_hook - - grad_hook = create_double_swap_grad_hook(block_idx_cpu, block_idx_cuda) - if single_blocks_to_swap: - if param_name.startswith("single_blocks"): + if blocks_to_swap: + is_double = param_name.startswith("double_blocks") + is_single = param_name.startswith("single_blocks") + if is_double or is_single: block_idx = int(param_name.split(".")[1]) - if ( - block_idx not in handled_single_block_indices - and block_idx >= (num_single_blocks - single_blocks_to_swap) - 1 - and block_idx < num_single_blocks - 1 - ): - handled_single_block_indices.add(block_idx) - block_idx_cpu = block_idx + 1 - block_idx_cuda = single_blocks_to_swap - (num_single_blocks - block_idx_cpu) - # print(param_name, block_idx_cpu, block_idx_cuda) - - # create swap hook - def create_single_swap_grad_hook(bidx, bidx_cuda): - def __grad_hook(tensor: torch.Tensor): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None - - # swap blocks if necessary - flux.single_blocks[bidx].to("cpu") - flux.single_blocks[bidx_cuda].to(accelerator.device) - # print(f"Move single block {bidx} to cpu and {bidx_cuda} to device") - - return __grad_hook - - grad_hook = create_single_swap_grad_hook(block_idx_cpu, block_idx_cuda) + unit_idx = block_idx if is_double else num_double_blocks + block_idx // 2 + if unit_idx not in handled_unit_indices: + # swap following (already backpropagated) block + 
handled_unit_indices.add(unit_idx) + + # if n blocks were already backpropagated + num_blocks_propagated = num_block_units - unit_idx - 1 + swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap + waiting = unit_idx > 0 and unit_idx <= blocks_to_swap + if swapping or waiting: + block_idx_to_cpu = num_block_units - num_blocks_propagated + block_idx_to_cuda = blocks_to_swap - num_blocks_propagated + block_idx_to_wait = unit_idx - 1 + + # create swap hook + def create_swap_grad_hook( + bidx_to_cpu, bidx_to_cuda, bidx_to_wait, uidx: int, swpng: bool, wtng: bool + ): + def __grad_hook(tensor: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + # print(f"Backward: {uidx}, {swpng}, {wtng}") + if swpng: + submit_move_blocks( + futures, + thread_pool, + bidx_to_cpu, + bidx_to_cuda, + flux.double_blocks, + flux.single_blocks, + accelerator.device, + ) + if wtng: + wait_blocks_move(bidx_to_wait, futures) + + return __grad_hook + + grad_hook = create_swap_grad_hook( + block_idx_to_cpu, block_idx_to_cuda, block_idx_to_wait, unit_idx, swapping, waiting + ) if grad_hook is None: @@ -547,10 +603,15 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): num_parameters_per_group = [0] * len(optimizers) parameter_optimizer_map = {} - double_blocks_to_swap = args.double_blocks_to_swap - single_blocks_to_swap = args.single_blocks_to_swap + blocks_to_swap = args.blocks_to_swap num_double_blocks = 19 # len(flux.double_blocks) num_single_blocks = 38 # len(flux.single_blocks) + num_block_units = num_double_blocks + num_single_blocks // 2 + + n = 1 # only asyncronous purpose, no need to increase this number + # n = max(1, os.cpu_count() // 2) + thread_pool = ThreadPoolExecutor(max_workers=n) + futures = {} for opt_idx, optimizer in enumerate(optimizers): for param_group in optimizer.param_groups: @@ -571,18 +632,30 @@ def optimizer_hook(parameter: torch.Tensor): optimizers[i].zero_grad(set_to_none=True) # swap blocks if necessary - if btype == "double" and double_blocks_to_swap: - if bidx >= num_double_blocks - double_blocks_to_swap: - bidx_cuda = double_blocks_to_swap - (num_double_blocks - bidx) - flux.double_blocks[bidx].to("cpu") - flux.double_blocks[bidx_cuda].to(accelerator.device) - # print(f"Move double block {bidx} to cpu and {bidx_cuda} to device") - elif btype == "single" and single_blocks_to_swap: - if bidx >= num_single_blocks - single_blocks_to_swap: - bidx_cuda = single_blocks_to_swap - (num_single_blocks - bidx) - flux.single_blocks[bidx].to("cpu") - flux.single_blocks[bidx_cuda].to(accelerator.device) - # print(f"Move single block {bidx} to cpu and {bidx_cuda} to device") + if blocks_to_swap and (btype == "double" or (btype == "single" and bidx % 2 == 0)): + unit_idx = bidx if btype == "double" else num_double_blocks + bidx // 2 + num_blocks_propagated = num_block_units - unit_idx + + swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap + waiting = unit_idx > 0 and unit_idx <= blocks_to_swap + + if swapping: + block_idx_to_cpu = num_block_units - num_blocks_propagated + block_idx_to_cuda = blocks_to_swap - num_blocks_propagated + # print(f"Backward: Swap blocks {block_idx_to_cpu} and {block_idx_to_cuda}") + submit_move_blocks( + futures, + thread_pool, + block_idx_to_cpu, + block_idx_to_cuda, + flux.double_blocks, + flux.single_blocks, + accelerator.device, + ) + + if waiting: + 
block_idx_to_wait = unit_idx - 1 + wait_blocks_move(block_idx_to_wait, futures) return optimizer_hook @@ -881,24 +954,26 @@ def setup_parser() -> argparse.ArgumentParser: help="skip latents validity check / latentsの正当性チェックをスキップする", ) parser.add_argument( - "--double_blocks_to_swap", + "--blocks_to_swap", type=int, default=None, help="[EXPERIMENTAL] " - "Sets the number of 'double_blocks' (~640MB) to swap during the forward and backward passes." + "Sets the number of blocks (~640MB) to swap during the forward and backward passes." "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." - " / 順伝播および逆伝播中にスワップする'変換ブロック'(約640MB)の数を設定します。" + " / 順伝播および逆伝播中にスワップするブロック(約640MB)の数を設定します。" "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", ) + parser.add_argument( + "--double_blocks_to_swap", + type=int, + default=None, + help="[Deprecated] use 'blocks_to_swap' instead / 代わりに 'blocks_to_swap' を使用してください", + ) parser.add_argument( "--single_blocks_to_swap", type=int, default=None, - help="[EXPERIMENTAL] " - "Sets the number of 'single_blocks' (~320MB) to swap during the forward and backward passes." - "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." - " / 順伝播および逆伝播中にスワップする'変換ブロック'(約320MB)の数を設定します。" - "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", + help="[Deprecated] use 'blocks_to_swap' instead / 代わりに 'blocks_to_swap' を使用してください", ) parser.add_argument( "--cpu_offload_checkpointing", diff --git a/library/flux_models.py b/library/flux_models.py index b5726c298..a35dbc106 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -2,9 +2,12 @@ # license: Apache-2.0 License +from concurrent.futures import Future, ThreadPoolExecutor from dataclasses import dataclass import math -from typing import Optional +import os +import time +from typing import Dict, List, Optional from library.device_utils import init_ipex, clean_memory_on_device @@ -917,8 +920,10 @@ def __init__(self, params: FluxParams): self.gradient_checkpointing = False self.cpu_offload_checkpointing = False - self.double_blocks_to_swap = None - self.single_blocks_to_swap = None + self.blocks_to_swap = None + + self.thread_pool: Optional[ThreadPoolExecutor] = None + self.num_block_units = len(self.double_blocks) + len(self.single_blocks) // 2 @property def device(self): @@ -956,38 +961,52 @@ def disable_gradient_checkpointing(self): print("FLUX: Gradient checkpointing disabled.") - def enable_block_swap(self, double_blocks: Optional[int], single_blocks: Optional[int]): - self.double_blocks_to_swap = double_blocks - self.single_blocks_to_swap = single_blocks + def enable_block_swap(self, num_blocks: int): + self.blocks_to_swap = num_blocks + + n = 1 # async block swap. 
1 is enough + # n = 2 + # n = max(1, os.cpu_count() // 2) + self.thread_pool = ThreadPoolExecutor(max_workers=n) def move_to_device_except_swap_blocks(self, device: torch.device): # assume model is on cpu - if self.double_blocks_to_swap: + if self.blocks_to_swap: save_double_blocks = self.double_blocks - self.double_blocks = None - if self.single_blocks_to_swap: save_single_blocks = self.single_blocks + self.double_blocks = None self.single_blocks = None self.to(device) - if self.double_blocks_to_swap: + if self.blocks_to_swap: self.double_blocks = save_double_blocks - if self.single_blocks_to_swap: self.single_blocks = save_single_blocks + def get_block_unit(self, index: int): + if index < len(self.double_blocks): + return (self.double_blocks[index],) + else: + index -= len(self.double_blocks) + index *= 2 + return self.single_blocks[index], self.single_blocks[index + 1] + + def get_unit_index(self, is_double: bool, index: int): + if is_double: + return index + else: + return len(self.double_blocks) + index // 2 + def prepare_block_swap_before_forward(self): - # move last n blocks to cpu: they are on cuda - if self.double_blocks_to_swap: - for i in range(len(self.double_blocks) - self.double_blocks_to_swap): - self.double_blocks[i].to(self.device) - for i in range(len(self.double_blocks) - self.double_blocks_to_swap, len(self.double_blocks)): - self.double_blocks[i].to("cpu") # , non_blocking=True) - if self.single_blocks_to_swap: - for i in range(len(self.single_blocks) - self.single_blocks_to_swap): - self.single_blocks[i].to(self.device) - for i in range(len(self.single_blocks) - self.single_blocks_to_swap, len(self.single_blocks)): - self.single_blocks[i].to("cpu") # , non_blocking=True) + # make: first n blocks are on cuda, and last n blocks are on cpu + if self.blocks_to_swap is None: + raise ValueError("Block swap is not enabled.") + for i in range(self.num_block_units - self.blocks_to_swap): + for b in self.get_block_unit(i): + b.to(self.device) + for i in range(self.num_block_units - self.blocks_to_swap, self.num_block_units): + for b in self.get_block_unit(i): + b.to("cpu") clean_memory_on_device(self.device) def forward( @@ -1017,69 +1036,73 @@ def forward( ids = torch.cat((txt_ids, img_ids), dim=1) pe = self.pe_embedder(ids) - if not self.double_blocks_to_swap: + if not self.blocks_to_swap: for block in self.double_blocks: img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) + img = torch.cat((txt, img), 1) + for block in self.single_blocks: + img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) else: - # make sure first n blocks are on cuda, and last n blocks are on cpu at beginning - for block_idx in range(self.double_blocks_to_swap): - block = self.double_blocks[len(self.double_blocks) - self.double_blocks_to_swap + block_idx] - if block.parameters().__next__().device.type != "cpu": - block.to("cpu") # , non_blocking=True) - # print(f"Moved double block {len(self.double_blocks) - self.double_blocks_to_swap + block_idx} to cpu.") - - block = self.double_blocks[block_idx] - if block.parameters().__next__().device.type == "cpu": - block.to(self.device) - # print(f"Moved double block {block_idx} to cuda.") - - to_cpu_block_index = 0 + futures = {} + + def submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda): + def move_blocks(bidx_to_cpu, blocks_to_cpu, bidx_to_cuda, blocks_to_cuda): + # print(f"Moving {bidx_to_cpu} to cpu.") + for block in blocks_to_cpu: + block.to("cpu", non_blocking=True) + torch.cuda.empty_cache() + 
+ # print(f"Moving {bidx_to_cuda} to cuda.") + for block in blocks_to_cuda: + block.to(self.device, non_blocking=True) + + torch.cuda.synchronize() + # print(f"Block move done. {bidx_to_cpu} to cpu, {bidx_to_cuda} to cuda.") + return block_idx_to_cpu, block_idx_to_cuda + + blocks_to_cpu = self.get_block_unit(block_idx_to_cpu) + blocks_to_cuda = self.get_block_unit(block_idx_to_cuda) + # print(f"Submit move blocks. {block_idx_to_cpu} to cpu, {block_idx_to_cuda} to cuda.") + return self.thread_pool.submit(move_blocks, block_idx_to_cpu, blocks_to_cpu, block_idx_to_cuda, blocks_to_cuda) + + def wait_for_blocks_move(block_idx, ftrs): + if block_idx not in ftrs: + return + # print(f"Waiting for move blocks: {block_idx}") + # start_time = time.perf_counter() + ftr = ftrs.pop(block_idx) + ftr.result() + # torch.cuda.synchronize() + # print(f"Move blocks took {time.perf_counter() - start_time:.2f} seconds") + for block_idx, block in enumerate(self.double_blocks): - # move last n blocks to cuda: they are on cpu, and move first n blocks to cpu: they are on cuda - moving = block_idx >= len(self.double_blocks) - self.double_blocks_to_swap - if moving: - block.to(self.device) # move to cuda - # print(f"Moved double block {block_idx} to cuda.") + # print(f"Double block {block_idx}") + unit_idx = self.get_unit_index(is_double=True, index=block_idx) + wait_for_blocks_move(unit_idx, futures) img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if moving: - self.double_blocks[to_cpu_block_index].to("cpu") # , non_blocking=True) - # print(f"Moved double block {to_cpu_block_index} to cpu.") - to_cpu_block_index += 1 + if unit_idx < self.blocks_to_swap: + block_idx_to_cpu = unit_idx + block_idx_to_cuda = self.num_block_units - self.blocks_to_swap + unit_idx + future = submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda) + futures[block_idx_to_cuda] = future - img = torch.cat((txt, img), 1) + img = torch.cat((txt, img), 1) - if not self.single_blocks_to_swap: - for block in self.single_blocks: - img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - else: - # make sure first n blocks are on cuda, and last n blocks are on cpu at beginning - for block_idx in range(self.single_blocks_to_swap): - block = self.single_blocks[len(self.single_blocks) - self.single_blocks_to_swap + block_idx] - if block.parameters().__next__().device.type != "cpu": - block.to("cpu") # , non_blocking=True) - # print(f"Moved single block {len(self.single_blocks) - self.single_blocks_to_swap + block_idx} to cpu.") - - block = self.single_blocks[block_idx] - if block.parameters().__next__().device.type == "cpu": - block.to(self.device) - # print(f"Moved single block {block_idx} to cuda.") - - to_cpu_block_index = 0 for block_idx, block in enumerate(self.single_blocks): - # move last n blocks to cuda: they are on cpu, and move first n blocks to cpu: they are on cuda - moving = block_idx >= len(self.single_blocks) - self.single_blocks_to_swap - if moving: - block.to(self.device) # move to cuda - # print(f"Moved single block {block_idx} to cuda.") + # print(f"Single block {block_idx}") + unit_idx = self.get_unit_index(is_double=False, index=block_idx) + if block_idx % 2 == 0: + wait_for_blocks_move(unit_idx, futures) img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if moving: - self.single_blocks[to_cpu_block_index].to("cpu") # , non_blocking=True) - # print(f"Moved single block {to_cpu_block_index} to cpu.") - to_cpu_block_index += 1 + if block_idx % 2 == 1 
and unit_idx < self.blocks_to_swap: + block_idx_to_cpu = unit_idx + block_idx_to_cuda = self.num_block_units - self.blocks_to_swap + unit_idx + future = submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda) + futures[block_idx_to_cuda] = future img = img[:, txt.shape[1] :, ...] @@ -1088,6 +1111,7 @@ def forward( vec = vec.to(self.device) img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) + return img From da94fd934eb4951d1cb132abc9d2a355e44d7abf Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 26 Sep 2024 08:27:48 +0900 Subject: [PATCH 153/582] fix typos --- flux_train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flux_train.py b/flux_train.py index bf34208f1..022467ea7 100644 --- a/flux_train.py +++ b/flux_train.py @@ -516,7 +516,7 @@ def wait_blocks_move(block_idx, futures): num_block_units = num_double_blocks + num_single_blocks // 2 handled_unit_indices = set() - n = 1 # only asyncronous purpose, no need to increase this number + n = 1 # only asynchronous purpose, no need to increase this number # n = 2 # n = max(1, os.cpu_count() // 2) thread_pool = ThreadPoolExecutor(max_workers=n) @@ -608,7 +608,7 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): num_single_blocks = 38 # len(flux.single_blocks) num_block_units = num_double_blocks + num_single_blocks // 2 - n = 1 # only asyncronous purpose, no need to increase this number + n = 1 # only asynchronous purpose, no need to increase this number # n = max(1, os.cpu_count() // 2) thread_pool = ThreadPoolExecutor(max_workers=n) futures = {} From 392e8dedd84e469b125e2935e3ecf02e6270a5b2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 26 Sep 2024 21:14:11 +0900 Subject: [PATCH 154/582] fix flip_aug, alpha_mask, random_crop issue in caching in caching strategy --- library/train_util.py | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 319337a47..17dd447eb 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -993,9 +993,26 @@ def new_cache_latents(self, model: Any, is_main_process: bool): # sort by resolution image_infos.sort(key=lambda info: info.bucket_reso[0] * info.bucket_reso[1]) - # split by resolution - batches = [] - batch = [] + # split by resolution and some conditions + class Condition: + def __init__(self, reso, flip_aug, alpha_mask, random_crop): + self.reso = reso + self.flip_aug = flip_aug + self.alpha_mask = alpha_mask + self.random_crop = random_crop + + def __eq__(self, other): + return ( + self.reso == other.reso + and self.flip_aug == other.flip_aug + and self.alpha_mask == other.alpha_mask + and self.random_crop == other.random_crop + ) + + batches: List[Tuple[Condition, List[ImageInfo]]] = [] + batch: List[ImageInfo] = [] + current_condition = None + logger.info("checking cache validity...") for info in tqdm(image_infos): subset = self.image_to_subset[info.image_key] @@ -1016,20 +1033,23 @@ def new_cache_latents(self, model: Any, is_main_process: bool): if cache_available: # do not add to batch continue - # if last member of batch has different resolution, flush the batch - if len(batch) > 0 and batch[-1].bucket_reso != info.bucket_reso: - batches.append(batch) + # if batch is not empty and condition is changed, flush the batch. 
Note that current_condition is not None if batch is not empty + condition = Condition(info.bucket_reso, subset.flip_aug, subset.alpha_mask, subset.random_crop) + if len(batch) > 0 and current_condition != condition: + batches.append((current_condition, batch)) batch = [] batch.append(info) + current_condition = condition # if number of data in batch is enough, flush the batch if len(batch) >= caching_strategy.batch_size: - batches.append(batch) + batches.append((current_condition, batch)) batch = [] + current_condition = None if len(batch) > 0: - batches.append(batch) + batches.append((current_condition, batch)) # if cache to disk, don't cache latents in non-main process, set to info only if caching_strategy.cache_to_disk and not is_main_process: @@ -1041,9 +1061,8 @@ def new_cache_latents(self, model: Any, is_main_process: bool): # iterate batches: batch doesn't have image here. image will be loaded in cache_batch_latents and discarded logger.info("caching latents...") - for batch in tqdm(batches, smoothing=1, total=len(batches)): - # cache_batch_latents(vae, cache_to_disk, batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) - caching_strategy.cache_batch_latents(model, batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) + for condition, batch in tqdm(batches, smoothing=1, total=len(batches)): + caching_strategy.cache_batch_latents(model, batch, condition.flip_aug, condition.alpha_mask, condition.random_crop) def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True, file_suffix=".npz"): # マルチGPUには対応していないので、そちらはtools/cache_latents.pyを使うこと From 9249d00311002c84b189c2f6792cbe7aa344a1d5 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Thu, 26 Sep 2024 22:19:56 +0900 Subject: [PATCH 155/582] experimental support for multi-gpus latents caching --- library/train_util.py | 27 ++++++++++++++++----------- train_network.py | 2 +- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 3768b6051..2ca662dcb 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -981,7 +981,7 @@ def is_text_encoder_output_cacheable(self): ] ) - def new_cache_latents(self, model: Any, is_main_process: bool): + def new_cache_latents(self, model: Any, accelerator: Accelerator): r""" a brand new method to cache latents. This method caches latents with caching strategy. normal cache_latents method is used by default, but this method is used when caching strategy is specified. 
@@ -1013,8 +1013,12 @@ def __eq__(self, other): batch: List[ImageInfo] = [] current_condition = None + # support multiple-gpus + num_processes = accelerator.num_processes + process_index = accelerator.process_index + logger.info("checking cache validity...") - for info in tqdm(image_infos): + for i, info in enumerate(tqdm(image_infos)): subset = self.image_to_subset[info.image_key] if info.latents_npz is not None: # fine tuning dataset @@ -1024,9 +1028,14 @@ def __eq__(self, other): if caching_strategy.cache_to_disk: # info.latents_npz = os.path.splitext(info.absolute_path)[0] + file_suffix info.latents_npz = caching_strategy.get_latents_npz_path(info.absolute_path, info.image_size) - if not is_main_process: # prepare for multi-gpu, only store to info + + # if the modulo of num_processes is not equal to process_index, skip caching + # this makes each process cache different latents + if i % num_processes != process_index: continue + print(f"{process_index}/{num_processes} {i}/{len(image_infos)} {info.latents_npz}") + cache_available = caching_strategy.is_disk_cached_latents_expected( info.bucket_reso, info.latents_npz, subset.flip_aug, subset.alpha_mask ) @@ -1051,10 +1060,6 @@ def __eq__(self, other): if len(batch) > 0: batches.append((current_condition, batch)) - # if cache to disk, don't cache latents in non-main process, set to info only - if caching_strategy.cache_to_disk and not is_main_process: - return - if len(batches) == 0: logger.info("no latents to cache") return @@ -2258,8 +2263,8 @@ def make_buckets(self): def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True): return self.dreambooth_dataset_delegate.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process) - def new_cache_latents(self, model: Any, is_main_process: bool): - return self.dreambooth_dataset_delegate.new_cache_latents(model, is_main_process) + def new_cache_latents(self, model: Any, accelerator: Accelerator): + return self.dreambooth_dataset_delegate.new_cache_latents(model, accelerator) def new_cache_text_encoder_outputs(self, models: List[Any], is_main_process: bool): return self.dreambooth_dataset_delegate.new_cache_text_encoder_outputs(models, is_main_process) @@ -2363,10 +2368,10 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc logger.info(f"[Dataset {i}]") dataset.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process, file_suffix) - def new_cache_latents(self, model: Any, is_main_process: bool): + def new_cache_latents(self, model: Any, accelerator: Accelerator): for i, dataset in enumerate(self.datasets): logger.info(f"[Dataset {i}]") - dataset.new_cache_latents(model, is_main_process) + dataset.new_cache_latents(model, accelerator) def cache_text_encoder_outputs( self, tokenizers, text_encoders, device, weight_dtype, cache_to_disk=False, is_main_process=True diff --git a/train_network.py b/train_network.py index b24f89b1e..7eb7aa49c 100644 --- a/train_network.py +++ b/train_network.py @@ -384,7 +384,7 @@ def train(self, args): vae.requires_grad_(False) vae.eval() - train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + train_dataset_group.new_cache_latents(vae, accelerator) vae.to("cpu") clean_memory_on_device(accelerator.device) From 24b1fdb66485af70b3c79feaf8ff1a348b66668e Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Thu, 26 Sep 2024 22:22:06 +0900 Subject: [PATCH 156/582] remove debug print --- library/train_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/library/train_util.py b/library/train_util.py index 2ca662dcb..8d6164b1b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1031,10 +1031,10 @@ def __eq__(self, other): # if the modulo of num_processes is not equal to process_index, skip caching # this makes each process cache different latents - if i % num_processes != process_index: + if i % num_processes != process_index: continue - print(f"{process_index}/{num_processes} {i}/{len(image_infos)} {info.latents_npz}") + # print(f"{process_index}/{num_processes} {i}/{len(image_infos)} {info.latents_npz}") cache_available = caching_strategy.is_disk_cached_latents_expected( info.bucket_reso, info.latents_npz, subset.flip_aug, subset.alpha_mask From a9aa52658a0d9ba7910a1d1983b650bc9de7153e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 28 Sep 2024 17:12:56 +0900 Subject: [PATCH 157/582] fix sample generation is not working in FLUX1 fine tuning #1647 --- library/flux_models.py | 5 +++-- library/flux_train_utils.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/library/flux_models.py b/library/flux_models.py index a35dbc106..0bc1c02b9 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -999,8 +999,9 @@ def get_unit_index(self, is_double: bool, index: int): def prepare_block_swap_before_forward(self): # make: first n blocks are on cuda, and last n blocks are on cpu - if self.blocks_to_swap is None: - raise ValueError("Block swap is not enabled.") + if self.blocks_to_swap is None or self.blocks_to_swap == 0: + # raise ValueError("Block swap is not enabled.") + return for i in range(self.num_block_units - self.blocks_to_swap): for b in self.get_block_unit(i): b.to(self.device) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index f77d4b585..1d1eb9d24 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -313,6 +313,7 @@ def denoise( guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) + model.prepare_block_swap_before_forward() pred = model( img=img, img_ids=img_ids, @@ -325,7 +326,8 @@ def denoise( ) img = img + (t_prev - t_curr) * pred - + + model.prepare_block_swap_before_forward() return img From 822fe578591e44ac949830e03a8841e222483052 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 28 Sep 2024 20:57:27 +0900 Subject: [PATCH 158/582] add workaround for 'Some tensors share memory' error #1614 --- networks/convert_flux_lora.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/networks/convert_flux_lora.py b/networks/convert_flux_lora.py index bd4c1cf78..fe6466ebc 100644 --- a/networks/convert_flux_lora.py +++ b/networks/convert_flux_lora.py @@ -412,6 +412,10 @@ def main(args): state_dict = convert_ai_toolkit_to_sd_scripts(state_dict) elif args.src == "sd-scripts" and args.dst == "ai-toolkit": state_dict = convert_sd_scripts_to_ai_toolkit(state_dict) + + # eliminate 'shared tensors' + for k in list(state_dict.keys()): + state_dict[k] = state_dict[k].detach().clone() else: raise NotImplementedError(f"Conversion from {args.src} to {args.dst} is not supported") From 1a0f5b0c389f4e9fab5edb06b36f203e8894d581 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 29 Sep 2024 00:35:29 +0900 Subject: [PATCH 159/582] re-fix sample generation is not working in FLUX1 split mode #1647 --- flux_train_network.py | 3 +++ library/flux_train_utils.py | 1 - 2 files changed, 3 
insertions(+), 1 deletion(-) diff --git a/flux_train_network.py b/flux_train_network.py index a6e57eede..65b121e7c 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -300,6 +300,9 @@ def __init__(self, flux_upper: flux_models.FluxUpper, flux_lower: flux_models.Fl self.flux_lower = flux_lower self.target_device = device + def prepare_block_swap_before_forward(self): + pass + def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None, txt_attention_mask=None): self.flux_lower.to("cpu") clean_memory_on_device(self.target_device) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 1d1eb9d24..b3c9184f2 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -196,7 +196,6 @@ def sample_image_inference( tokens_and_masks = tokenize_strategy.tokenize(prompt) # strategy has apply_t5_attn_mask option encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) - print([x.shape if x is not None else None for x in encoded_text_encoder_conds]) # if text_encoder_conds is not cached, use encoded_text_encoder_conds if len(text_encoder_conds) == 0: From e0c3630203776dc568c32d67806a0a9d443f5721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= <865105819@qq.com> Date: Sun, 29 Sep 2024 09:11:15 +0800 Subject: [PATCH 160/582] Support Sdxl Controlnet (#1648) * Create sdxl_train_controlnet.py * add fuse_background_pass * Update sdxl_train_controlnet.py * add fuse and fix error * update * Update sdxl_train_controlnet.py * Update sdxl_train_controlnet.py * Update sdxl_train_controlnet.py * update * Update sdxl_train_controlnet.py --- library/train_util.py | 2 +- sdxl_train_controlnet.py | 752 +++++++++++++++++++++++++++++++++++++++ train_controlnet.py | 33 +- 3 files changed, 779 insertions(+), 8 deletions(-) create mode 100644 sdxl_train_controlnet.py diff --git a/library/train_util.py b/library/train_util.py index e023f63a2..293fc05ad 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3581,7 +3581,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: # available backends: # https://github.com/huggingface/accelerate/blob/d1abd59114ada8ba673e1214218cb2878c13b82d/src/accelerate/utils/dataclasses.py#L376-L388C5 # https://pytorch.org/docs/stable/torch.compiler.html - choices=["eager", "aot_eager", "inductor", "aot_ts_nvfuser", "nvprims_nvfuser", "cudagraphs", "ofi", "fx2trt", "onnxrt"], + choices=["eager", "aot_eager", "inductor", "aot_ts_nvfuser", "nvprims_nvfuser", "cudagraphs", "ofi", "fx2trt", "onnxrt", "tensort", "ipex", "tvm"], help="dynamo backend type (default is inductor) / dynamoのbackendの種類(デフォルトは inductor)", ) parser.add_argument("--xformers", action="store_true", help="use xformers for CrossAttention / CrossAttentionにxformersを使う") diff --git a/sdxl_train_controlnet.py b/sdxl_train_controlnet.py new file mode 100644 index 000000000..00026d2cc --- /dev/null +++ b/sdxl_train_controlnet.py @@ -0,0 +1,752 @@ +import argparse +import math +import os +import random +from multiprocessing import Value +import toml + +from tqdm import tqdm + +import torch +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() + +from torch.nn.parallel import DistributedDataParallel as DDP +from accelerate.utils import set_seed +from diffusers import DDPMScheduler, ControlNetModel +from diffusers.utils.torch_utils import is_compiled_module +from safetensors.torch import load_file +from library 
import ( + deepspeed_utils, + sai_model_spec, + sdxl_model_util, + sdxl_original_unet, + sdxl_train_util, +) + +import library.model_util as model_util +import library.train_util as train_util +import library.config_util as config_util +from library.config_util import ( + ConfigSanitizer, + BlueprintGenerator, +) +import library.huggingface_util as huggingface_util +import library.custom_train_functions as custom_train_functions +from library.custom_train_functions import ( + add_v_prediction_like_loss, + apply_snr_weight, + prepare_scheduler_for_custom_training, + scale_v_prediction_loss_like_noise_prediction, + apply_debiased_estimation, +) +from library.utils import setup_logging, add_logging_arguments + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +# TODO 他のスクリプトと共通化する +def generate_step_logs(args: argparse.Namespace, current_loss, avr_loss, lr_scheduler): + logs = { + "loss/current": current_loss, + "loss/average": avr_loss, + "lr": lr_scheduler.get_last_lr()[0], + } + + if args.optimizer_type.lower().startswith("DAdapt".lower()): + logs["lr/d*lr"] = ( + lr_scheduler.optimizers[-1].param_groups[0]["d"] + * lr_scheduler.optimizers[-1].param_groups[0]["lr"] + ) + + return logs + + +def train(args): + train_util.verify_training_args(args) + train_util.prepare_dataset_args(args, True) + sdxl_train_util.verify_sdxl_training_args(args) + setup_logging(args, reset=True) + + cache_latents = args.cache_latents + use_user_config = args.dataset_config is not None + + if args.seed is None: + args.seed = random.randint(0, 2**32) + set_seed(args.seed) + + tokenizer1, tokenizer2 = sdxl_train_util.load_tokenizers(args) + + # データセットを準備する + blueprint_generator = BlueprintGenerator(ConfigSanitizer(False, False, True, True)) + if use_user_config: + logger.info(f"Load dataset config from {args.dataset_config}") + user_config = config_util.load_user_config(args.dataset_config) + ignored = ["train_data_dir", "conditioning_data_dir"] + if any(getattr(args, attr) is not None for attr in ignored): + logger.warning( + "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + ", ".join(ignored) + ) + ) + else: + user_config = { + "datasets": [ + { + "subsets": config_util.generate_controlnet_subsets_config_by_subdirs( + args.train_data_dir, + args.conditioning_data_dir, + args.caption_extension, + ) + } + ] + } + + blueprint = blueprint_generator.generate(user_config, args, tokenizer=[tokenizer1, tokenizer2]) + train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + + current_epoch = Value("i", 0) + current_step = Value("i", 0) + ds_for_collator = (train_dataset_group if args.max_data_loader_n_workers == 0 else None) + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) + + train_dataset_group.verify_bucket_reso_steps(32) + + if args.debug_dataset: + train_util.debug_dataset(train_dataset_group) + return + if len(train_dataset_group) == 0: + logger.error( + "No data found. 
Please verify arguments (train_data_dir must be the parent of folders with images) / 画像がありません。引数指定を確認してください(train_data_dirには画像があるフォルダではなく、画像があるフォルダの親フォルダを指定する必要があります)" + ) + return + + if cache_latents: + assert ( + train_dataset_group.is_latent_cacheable() + ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + else: + logger.warning( + "WARNING: random_crop is not supported yet for ControlNet training / ControlNetの学習ではrandom_cropはまだサポートされていません" + ) + + if args.cache_text_encoder_outputs: + assert ( + train_dataset_group.is_text_encoder_output_cacheable() + ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" + + # acceleratorを準備する + logger.info("prepare accelerator") + accelerator = train_util.prepare_accelerator(args) + is_main_process = accelerator.is_main_process + + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + + # mixed precisionに対応した型を用意しておき適宜castする + weight_dtype, save_dtype = train_util.prepare_dtype(args) + vae_dtype = torch.float32 if args.no_half_vae else weight_dtype + + # モデルを読み込む + ( + load_stable_diffusion_format, + text_encoder1, + text_encoder2, + vae, + unet, + logit_scale, + ckpt_info, + ) = sdxl_train_util.load_target_model( + args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, weight_dtype + ) + + # convert U-Net + with torch.no_grad(): + du_unet_sd = sdxl_model_util.convert_sdxl_unet_state_dict_to_diffusers(unet.state_dict()) + unet.to("cpu") + clean_memory_on_device(accelerator.device) + del unet + unet = sdxl_model_util.UNet2DConditionModel(**sdxl_model_util.DIFFUSERS_SDXL_UNET_CONFIG) + unet.load_state_dict(du_unet_sd) + + controlnet = ControlNetModel.from_unet(unet) + + if args.controlnet_model_name_or_path: + filename = args.controlnet_model_name_or_path + if os.path.isfile(filename): + if os.path.splitext(filename)[1] == ".safetensors": + state_dict = load_file(filename) + else: + state_dict = torch.load(filename) + state_dict = model_util.convert_controlnet_state_dict_to_diffusers(state_dict) + controlnet.load_state_dict(state_dict) + elif os.path.isdir(filename): + controlnet = ControlNetModel.from_pretrained(filename) + + # 学習を準備する + if cache_latents: + vae.to(accelerator.device, dtype=vae_dtype) + vae.requires_grad_(False) + vae.eval() + with torch.no_grad(): + train_dataset_group.cache_latents( + vae, + args.vae_batch_size, + args.cache_latents_to_disk, + accelerator.is_main_process, + ) + vae.to("cpu") + clean_memory_on_device(accelerator.device) + + accelerator.wait_for_everyone() + + # TextEncoderの出力をキャッシュする + if args.cache_text_encoder_outputs: + # Text Encodes are eval and no grad + with torch.no_grad(): + train_dataset_group.cache_text_encoder_outputs( + (tokenizer1, tokenizer2), + (text_encoder1, text_encoder2), + accelerator.device, + None, + args.cache_text_encoder_outputs_to_disk, + accelerator.is_main_process, + ) + accelerator.wait_for_everyone() + + # モデルに xformers とか memory efficient attention を組み込む + # train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) + if args.xformers: + unet.enable_xformers_memory_efficient_attention() + controlnet.enable_xformers_memory_efficient_attention() + + if args.gradient_checkpointing: + 
unet.enable_gradient_checkpointing() + controlnet.enable_gradient_checkpointing() + + # 学習に必要なクラスを準備する + accelerator.print("prepare optimizer, data loader etc.") + + trainable_params = list(filter(lambda p: p.requires_grad, controlnet.parameters())) + logger.info(f"trainable params count: {len(trainable_params)}") + logger.info( + f"number of trainable parameters: {sum(p.numel() for p in trainable_params if p.requires_grad)}" + ) + + _, _, optimizer = train_util.get_optimizer(args, trainable_params) + + # dataloaderを準備する + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 + n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers + + train_dataloader = torch.utils.data.DataLoader( + train_dataset_group, + batch_size=1, + shuffle=True, + collate_fn=collator, + num_workers=n_workers, + persistent_workers=args.persistent_data_loader_workers, + ) + + # 学習ステップ数を計算する + if args.max_train_epochs is not None: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader)/ accelerator.num_processes/ args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + + # データセット側にも学習ステップを送信 + train_dataset_group.set_max_train_steps(args.max_train_steps) + + # lr schedulerを用意する + lr_scheduler = train_util.get_scheduler_fix( + args, optimizer, accelerator.num_processes + ) + + # 実験的機能:勾配も含めたfp16/bf16学習を行う モデル全体をfp16/bf16にする + if args.full_fp16: + assert ( + args.mixed_precision == "fp16" + ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" + accelerator.print("enable full fp16 training.") + controlnet.to(weight_dtype) + unet.to(weight_dtype) + elif args.full_bf16: + assert ( + args.mixed_precision == "bf16" + ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" + accelerator.print("enable full bf16 training.") + controlnet.to(weight_dtype) + unet.to(weight_dtype) + + # acceleratorがなんかよろしくやってくれるらしい + controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + controlnet, optimizer, train_dataloader, lr_scheduler + ) + + if args.fused_backward_pass: + # use fused optimizer for backward pass: other optimizers will be supported in the future + import library.adafactor_fused + + library.adafactor_fused.patch_adafactor_fused(optimizer) + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + + def __grad_hook(tensor: torch.Tensor, param_group=param_group): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + parameter.register_post_accumulate_grad_hook(__grad_hook) + + unet.requires_grad_(False) + text_encoder1.requires_grad_(False) + text_encoder2.requires_grad_(False) + unet.to(accelerator.device, dtype=weight_dtype) + + # transform DDP after prepare + controlnet = controlnet.module if isinstance(controlnet, DDP) else controlnet + + controlnet.train() + + # TextEncoderの出力をキャッシュするときにはCPUへ移動する + if args.cache_text_encoder_outputs: + # move Text Encoders for sampling images. 
Text Encoder doesn't work on CPU with fp16 + text_encoder1.to("cpu", dtype=torch.float32) + text_encoder2.to("cpu", dtype=torch.float32) + clean_memory_on_device(accelerator.device) + else: + # make sure Text Encoders are on GPU + text_encoder1.to(accelerator.device) + text_encoder2.to(accelerator.device) + + if not cache_latents: + vae.requires_grad_(False) + vae.eval() + vae.to(accelerator.device, dtype=vae_dtype) + + # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする + if args.full_fp16: + train_util.patch_accelerator_for_fp16_training(accelerator) + + # resumeする + train_util.resume_from_local_or_hf_if_specified(accelerator, args) + + # epoch数を計算する + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0): + args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1 + + # 学習する + # TODO: find a way to handle total batch size when there are multiple datasets + accelerator.print("running training / 学習開始") + accelerator.print(f" num train images * repeats / 学習画像の数×繰り返し回数: {train_dataset_group.num_train_images}") + accelerator.print(f" num reg images / 正則化画像の数: {train_dataset_group.num_reg_images}") + accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}") + accelerator.print(f" num epochs / epoch数: {num_train_epochs}") + accelerator.print( + f" batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}" + ) + # logger.info(f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}") + accelerator.print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") + accelerator.print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}") + + progress_bar = tqdm( + range(args.max_train_steps), + smoothing=0, + disable=not accelerator.is_local_main_process, + desc="steps", + ) + global_step = 0 + + noise_scheduler = DDPMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + clip_sample=False, + ) + prepare_scheduler_for_custom_training(noise_scheduler, accelerator.device) + if args.zero_terminal_snr: + custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr( + noise_scheduler + ) + + if accelerator.is_main_process: + init_kwargs = {} + if args.wandb_run_name: + init_kwargs["wandb"] = {"name": args.wandb_run_name} + if args.log_tracker_config is not None: + init_kwargs = toml.load(args.log_tracker_config) + accelerator.init_trackers( + ( + "controlnet_train" + if args.log_tracker_name is None + else args.log_tracker_name + ), + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, + ) + + loss_recorder = train_util.LossRecorder() + del train_dataset_group + + # function for saving/removing + def save_model(ckpt_name, model, force_sync_upload=False): + os.makedirs(args.output_dir, exist_ok=True) + ckpt_file = os.path.join(args.output_dir, ckpt_name) + + accelerator.print(f"\nsaving checkpoint: {ckpt_file}") + sai_metadata = train_util.get_sai_model_spec(None, args, True, True, False) + sai_metadata["modelspec.architecture"] = ( + sai_model_spec.ARCH_SD_XL_V1_BASE + "/controlnet" + ) + state_dict = model_util.convert_controlnet_state_dict_to_sd(model.state_dict()) + + if save_dtype is not None: + for key in 
list(state_dict.keys()): + v = state_dict[key] + v = v.detach().clone().to("cpu").to(save_dtype) + state_dict[key] = v + + if os.path.splitext(ckpt_file)[1] == ".safetensors": + from safetensors.torch import save_file + + save_file(state_dict, ckpt_file, sai_metadata) + else: + torch.save(state_dict, ckpt_file) + + if args.huggingface_repo_id is not None: + huggingface_util.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload) + + def remove_model(old_ckpt_name): + old_ckpt_file = os.path.join(args.output_dir, old_ckpt_name) + if os.path.exists(old_ckpt_file): + accelerator.print(f"removing old checkpoint: {old_ckpt_file}") + os.remove(old_ckpt_file) + + # For --sample_at_first + sdxl_train_util.sample_images( + accelerator, + args, + 0, + global_step, + accelerator.device, + vae, + [tokenizer1, tokenizer2], + [text_encoder1, text_encoder2], + unet, + controlnet=controlnet, + ) + + # training loop + for epoch in range(num_train_epochs): + accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") + current_epoch.value = epoch + 1 + + for step, batch in enumerate(train_dataloader): + current_step.value = global_step + with accelerator.accumulate(controlnet): + with torch.no_grad(): + if "latents" in batch and batch["latents"] is not None: + latents = ( + batch["latents"] + .to(accelerator.device) + .to(dtype=weight_dtype) + ) + else: + # latentに変換 + latents = ( + vae.encode(batch["images"].to(dtype=vae_dtype)) + .latent_dist.sample() + .to(dtype=weight_dtype) + ) + + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(latents)): + accelerator.print( + "NaN found in latents, replacing with zeros" + ) + latents = torch.nan_to_num(latents, 0, out=latents) + latents = latents * sdxl_model_util.VAE_SCALE_FACTOR + + if ( + "text_encoder_outputs1_list" not in batch + or batch["text_encoder_outputs1_list"] is None + ): + input_ids1 = batch["input_ids"] + input_ids2 = batch["input_ids2"] + with torch.no_grad(): + # Get the text embedding for conditioning + input_ids1 = input_ids1.to(accelerator.device) + input_ids2 = input_ids2.to(accelerator.device) + encoder_hidden_states1, encoder_hidden_states2, pool2 = ( + train_util.get_hidden_states_sdxl( + args.max_token_length, + input_ids1, + input_ids2, + tokenizer1, + tokenizer2, + text_encoder1, + text_encoder2, + None if not args.full_fp16 else weight_dtype, + ) + ) + else: + encoder_hidden_states1 = ( + batch["text_encoder_outputs1_list"] + .to(accelerator.device) + .to(weight_dtype) + ) + encoder_hidden_states2 = ( + batch["text_encoder_outputs2_list"] + .to(accelerator.device) + .to(weight_dtype) + ) + pool2 = ( + batch["text_encoder_pool2_list"] + .to(accelerator.device) + .to(weight_dtype) + ) + + # get size embeddings + orig_size = batch["original_sizes_hw"] + crop_size = batch["crop_top_lefts"] + target_size = batch["target_sizes_hw"] + # embs = sdxl_train_util.get_size_embeddings( + # orig_size, crop_size, target_size, accelerator.device + # ).to(weight_dtype) + + embs = torch.cat([orig_size, crop_size, target_size]).to(accelerator.device).to(weight_dtype) #B,6 + # concat embeddings + #vector_embedding = torch.cat([pool2, embs], dim=1).to(weight_dtype) + vector_embedding_dict = { + "text_embeds": pool2, + "time_ids": embs + } + text_embedding = torch.cat( + [encoder_hidden_states1, encoder_hidden_states2], dim=2 + ).to(weight_dtype) + + # Sample noise, sample a random timestep for each image, and add noise to the latents, + # with noise offset and/or multires noise if specified + noise, noisy_latents, timesteps, 
huber_c = ( + train_util.get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents + ) + ) + + controlnet_image = batch["conditioning_images"].to(dtype=weight_dtype) + + + with accelerator.autocast(): + down_block_res_samples, mid_block_res_sample = controlnet( + noisy_latents, + timesteps, + encoder_hidden_states=text_embedding, + added_cond_kwargs=vector_embedding_dict, + controlnet_cond=controlnet_image, + return_dict=False, + ) + + # Predict the noise residual + noise_pred = unet( + noisy_latents, + timesteps, + encoder_hidden_states=text_embedding, + added_cond_kwargs=vector_embedding_dict, + down_block_additional_residuals=[ + sample.to(dtype=weight_dtype) for sample in down_block_res_samples + ], + mid_block_additional_residual=mid_block_res_sample.to(dtype=weight_dtype), + return_dict=False, + )[0] + + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise + + loss = train_util.conditional_loss( + noise_pred.float(),target.float(),reduction="none",loss_type=args.loss_type,huber_c=huber_c, + ) + loss = loss.mean([1, 2, 3]) + + loss_weights = batch["loss_weights"] # 各sampleごとのweight + loss = loss * loss_weights + + if args.min_snr_gamma: + loss = apply_snr_weight(loss,timesteps,noise_scheduler,args.min_snr_gamma,args.v_parameterization) + if args.scale_v_pred_loss_like_noise_pred: + loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) + if args.v_pred_like_loss: + loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss) + if args.debiased_estimation_loss: + loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) + + loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + + accelerator.backward(loss) + if not args.fused_backward_pass: + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + params_to_clip = controlnet.parameters() + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=True) + else: + # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook + lr_scheduler.step() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + sdxl_train_util.sample_images( + accelerator, + args, + None, + global_step, + accelerator.device, + vae, + [tokenizer1, tokenizer2], + [text_encoder1, text_encoder2], + unet, + controlnet=controlnet, + ) + + # 指定ステップごとにモデルを保存 + if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: + accelerator.wait_for_everyone() + if accelerator.is_main_process: + ckpt_name = train_util.get_step_ckpt_name(args, "." + args.save_model_as, global_step) + save_model(ckpt_name,unwrap_model(controlnet)) + + if args.save_state: + train_util.save_and_remove_state_stepwise(args, accelerator, global_step) + + remove_step_no = train_util.get_remove_step_no(args, global_step) + if remove_step_no is not None: + remove_ckpt_name = train_util.get_step_ckpt_name(args, "." 
+ args.save_model_as, remove_step_no) + remove_model(remove_ckpt_name) + + current_loss = loss.detach().item() + loss_recorder.add(epoch=epoch, step=step, loss=current_loss) + avr_loss: float = loss_recorder.moving_average + logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + + if args.logging_dir is not None: + logs = generate_step_logs(args, current_loss, avr_loss, lr_scheduler) + accelerator.log(logs, step=global_step) + + if global_step >= args.max_train_steps: + break + + if args.logging_dir is not None: + logs = {"loss/epoch": loss_recorder.moving_average} + accelerator.log(logs, step=epoch + 1) + + accelerator.wait_for_everyone() + + # 指定エポックごとにモデルを保存 + if args.save_every_n_epochs is not None: + saving = (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs + if is_main_process and saving: + ckpt_name = train_util.get_epoch_ckpt_name(args, "." + args.save_model_as, epoch + 1) + save_model(ckpt_name,unwrap_model(controlnet)) + + remove_epoch_no = train_util.get_remove_epoch_no(args, epoch + 1) + if remove_epoch_no is not None: + remove_ckpt_name = train_util.get_epoch_ckpt_name(args, "." + args.save_model_as, remove_epoch_no) + remove_model(remove_ckpt_name) + + if args.save_state: + train_util.save_and_remove_state_on_epoch_end(args, accelerator, epoch + 1) + + sdxl_train_util.sample_images( + accelerator, + args, + epoch + 1, + global_step, + accelerator.device, + vae, + [tokenizer1, tokenizer2], + [text_encoder1, text_encoder2], + unet, + controlnet=controlnet, + ) + + # end of epoch + + if is_main_process: + controlnet = unwrap_model(controlnet) + + accelerator.end_training() + + if is_main_process and (args.save_state or args.save_state_on_train_end): + train_util.save_state_on_train_end(args, accelerator) + + if is_main_process: + ckpt_name = train_util.get_last_ckpt_name(args, "." 
+ args.save_model_as) + save_model( + ckpt_name, controlnet, force_sync_upload=True + ) + + logger.info("model saved.") + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + add_logging_arguments(parser) + train_util.add_sd_models_arguments(parser) + train_util.add_dataset_arguments(parser, False, True, True) + train_util.add_training_arguments(parser, False) + train_util.add_masked_loss_arguments(parser) + deepspeed_utils.add_deepspeed_arguments(parser) + train_util.add_sd_saving_arguments(parser) + train_util.add_optimizer_arguments(parser) + config_util.add_config_arguments(parser) + custom_train_functions.add_custom_train_arguments(parser) + sdxl_train_util.add_sdxl_training_arguments(parser) + + parser.add_argument( + "--controlnet_model_name_or_path", + type=str, + default=None, + help="controlnet model name or path / controlnetのモデル名またはパス", + ) + parser.add_argument( + "--no_half_vae", + action="store_true", + help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", + ) + + return parser + + +if __name__ == "__main__": + # sdxl_original_unet.USE_REENTRANT = False + + parser = setup_parser() + + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + train(args) diff --git a/train_controlnet.py b/train_controlnet.py index c2945b083..8c7882c8f 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -254,6 +254,7 @@ def __contains__(self, name): accelerator.wait_for_everyone() if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() controlnet.enable_gradient_checkpointing() # 学習に必要なクラスを準備する @@ -304,6 +305,20 @@ def __contains__(self, name): controlnet, optimizer, train_dataloader, lr_scheduler ) + if args.fused_backward_pass: + import library.adafactor_fused + library.adafactor_fused.patch_adafactor_fused(optimizer) + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + def __grad_hook(tensor: torch.Tensor, param_group=param_group): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + parameter.register_post_accumulate_grad_hook(__grad_hook) + unet.requires_grad_(False) text_encoder.requires_grad_(False) unet.to(accelerator.device) @@ -497,13 +512,17 @@ def remove_model(old_ckpt_name): loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし accelerator.backward(loss) - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - params_to_clip = controlnet.parameters() - accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad(set_to_none=True) + if not args.fused_backward_pass: + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + params_to_clip = controlnet.parameters() + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=True) + else: + # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook + lr_scheduler.step() # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: From 8919b31145d38a2a790fae6e8e1c34c205c6794e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 29 Sep 2024 23:07:34 +0900 Subject: [PATCH 161/582] use original ControlNet instead of 
Diffusers --- gen_img.py | 89 +++- library/sdxl_model_util.py | 2 +- library/sdxl_original_control_net.py | 272 ++++++++++++ library/sdxl_original_unet.py | 14 +- ...controlnet.py => sdxl_train_control_net.py | 390 ++++++++---------- 5 files changed, 528 insertions(+), 239 deletions(-) create mode 100644 library/sdxl_original_control_net.py rename sdxl_train_controlnet.py => sdxl_train_control_net.py (69%) diff --git a/gen_img.py b/gen_img.py index 59bcd5b09..70b3c81ff 100644 --- a/gen_img.py +++ b/gen_img.py @@ -43,8 +43,8 @@ ) from einops import rearrange from tqdm import tqdm -from torchvision import transforms from transformers import CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPImageProcessor +from accelerate import init_empty_weights import PIL from PIL import Image from PIL.PngImagePlugin import PngInfo @@ -58,6 +58,7 @@ from tools.original_control_net import ControlNetInfo from library.original_unet import UNet2DConditionModel, InferUNet2DConditionModel from library.sdxl_original_unet import InferSdxlUNet2DConditionModel +from library.sdxl_original_control_net import SdxlControlNet from library.original_unet import FlashAttentionFunction from networks.control_net_lllite import ControlNetLLLite from library.utils import GradualLatent, EulerAncestralDiscreteSchedulerGL @@ -352,8 +353,8 @@ def __init__( self.token_replacements_list.append({}) # ControlNet - self.control_nets: List[ControlNetInfo] = [] # only for SD 1.5 - self.control_net_lllites: List[ControlNetLLLite] = [] + self.control_nets: List[Union[ControlNetInfo, Tuple[SdxlControlNet, float]]] = [] + self.control_net_lllites: List[Tuple[ControlNetLLLite, float]] = [] self.control_net_enabled = True # control_netsが空ならTrueでもFalseでもControlNetは動作しない self.gradual_latent: GradualLatent = None @@ -542,7 +543,7 @@ def __call__( else: text_embeddings = torch.cat([uncond_embeddings, text_embeddings, real_uncond_embeddings]) - if self.control_net_lllites: + if self.control_net_lllites or (self.control_nets and self.is_sdxl): # ControlNetのhintにguide imageを流用する。ControlNetの場合はControlNet側で行う if isinstance(clip_guide_images, PIL.Image.Image): clip_guide_images = [clip_guide_images] @@ -731,7 +732,12 @@ def __call__( num_latent_input = (3 if negative_scale is not None else 2) if do_classifier_free_guidance else 1 if self.control_nets: - guided_hints = original_control_net.get_guided_hints(self.control_nets, num_latent_input, batch_size, clip_guide_images) + if not self.is_sdxl: + guided_hints = original_control_net.get_guided_hints( + self.control_nets, num_latent_input, batch_size, clip_guide_images + ) + else: + clip_guide_images = clip_guide_images * 0.5 + 0.5 # [-1, 1] => [0, 1] each_control_net_enabled = [self.control_net_enabled] * len(self.control_nets) if self.control_net_lllites: @@ -793,7 +799,7 @@ def __call__( latent_model_input = latents.repeat((num_latent_input, 1, 1, 1)) latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # disable ControlNet-LLLite if ratio is set. ControlNet is disabled in ControlNetInfo + # disable ControlNet-LLLite or SDXL ControlNet if ratio is set. 
ControlNet is disabled in ControlNetInfo if self.control_net_lllites: for j, ((control_net, ratio), enabled) in enumerate(zip(self.control_net_lllites, each_control_net_enabled)): if not enabled or ratio >= 1.0: @@ -802,9 +808,16 @@ def __call__( logger.info(f"ControlNetLLLite {j} is disabled (ratio={ratio} at {i} / {len(timesteps)})") control_net.set_cond_image(None) each_control_net_enabled[j] = False + if self.control_nets and self.is_sdxl: + for j, ((control_net, ratio), enabled) in enumerate(zip(self.control_nets, each_control_net_enabled)): + if not enabled or ratio >= 1.0: + continue + if ratio < i / len(timesteps): + logger.info(f"ControlNet {j} is disabled (ratio={ratio} at {i} / {len(timesteps)})") + each_control_net_enabled[j] = False # predict the noise residual - if self.control_nets and self.control_net_enabled: + if self.control_nets and self.control_net_enabled and not self.is_sdxl: if regional_network: num_sub_and_neg_prompts = len(text_embeddings) // batch_size text_emb_last = text_embeddings[num_sub_and_neg_prompts - 2 :: num_sub_and_neg_prompts] # last subprompt @@ -823,6 +836,31 @@ def __call__( text_embeddings, text_emb_last, ).sample + elif self.control_nets: + input_resi_add_list = [] + mid_add_list = [] + for (control_net, _), enbld in zip(self.control_nets, each_control_net_enabled): + if not enbld: + continue + input_resi_add, mid_add = control_net( + latent_model_input, t, text_embeddings, vector_embeddings, clip_guide_images + ) + input_resi_add_list.append(input_resi_add) + mid_add_list.append(mid_add) + if len(input_resi_add_list) == 0: + noise_pred = self.unet(latent_model_input, t, text_embeddings, vector_embeddings) + else: + if len(input_resi_add_list) > 1: + # get mean of input_resi_add_list and mid_add_list + input_resi_add_mean = [] + for i in range(len(input_resi_add_list[0])): + input_resi_add_mean.append( + torch.mean(torch.stack([input_resi_add_list[j][i] for j in range(len(input_resi_add_list))], dim=0)) + ) + input_resi_add = input_resi_add_mean + mid_add = torch.mean(torch.stack(mid_add_list), dim=0) + + noise_pred = self.unet(latent_model_input, t, text_embeddings, vector_embeddings, input_resi_add, mid_add) elif self.is_sdxl: noise_pred = self.unet(latent_model_input, t, text_embeddings, vector_embeddings) else: @@ -1827,16 +1865,37 @@ def __getattr__(self, item): upscaler.to(dtype).to(device) # ControlNetの処理 - control_nets: List[ControlNetInfo] = [] + control_nets: List[Union[ControlNetInfo, Tuple[SdxlControlNet, float]]] = [] if args.control_net_models: - for i, model in enumerate(args.control_net_models): - prep_type = None if not args.control_net_preps or len(args.control_net_preps) <= i else args.control_net_preps[i] - weight = 1.0 if not args.control_net_weights or len(args.control_net_weights) <= i else args.control_net_weights[i] - ratio = 1.0 if not args.control_net_ratios or len(args.control_net_ratios) <= i else args.control_net_ratios[i] + if not is_sdxl: + for i, model in enumerate(args.control_net_models): + prep_type = None if not args.control_net_preps or len(args.control_net_preps) <= i else args.control_net_preps[i] + weight = 1.0 if not args.control_net_weights or len(args.control_net_weights) <= i else args.control_net_weights[i] + ratio = 1.0 if not args.control_net_ratios or len(args.control_net_ratios) <= i else args.control_net_ratios[i] + + ctrl_unet, ctrl_net = original_control_net.load_control_net(args.v2, unet, model) + prep = original_control_net.load_preprocess(prep_type) + 
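# --- Editor's note: illustrative sketch, not part of the patch ----------------------
# When several SDXL ControlNets are enabled at once, the denoising loop above reduces
# their per-block residuals to a single set by taking the element-wise mean before
# handing them to the U-Net. A standalone sketch of that reduction, assuming every
# ControlNet returns the same number of down-block residuals:
import torch

def average_residuals(per_controlnet: "list[list[torch.Tensor]]") -> "list[torch.Tensor]":
    # mean over ControlNets for each down-block position
    return [torch.mean(torch.stack(tensors, dim=0), dim=0) for tensors in zip(*per_controlnet)]
# The --control_net_ratios option disables a ControlNet once i / len(timesteps) exceeds
# its ratio, so it only guides the early, structure-defining denoising steps.
# -------------------------------------------------------------------------------------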
control_nets.append(ControlNetInfo(ctrl_unet, ctrl_net, prep, weight, ratio)) + else: + for i, model_file in enumerate(args.control_net_models): + multiplier = ( + 1.0 + if not args.control_net_multipliers or len(args.control_net_multipliers) <= i + else args.control_net_multipliers[i] + ) + ratio = 1.0 if not args.control_net_ratios or len(args.control_net_ratios) <= i else args.control_net_ratios[i] + + logger.info(f"loading SDXL ControlNet: {model_file}") + from safetensors.torch import load_file + + state_dict = load_file(model_file) - ctrl_unet, ctrl_net = original_control_net.load_control_net(args.v2, unet, model) - prep = original_control_net.load_preprocess(prep_type) - control_nets.append(ControlNetInfo(ctrl_unet, ctrl_net, prep, weight, ratio)) + logger.info(f"Initalizing SDXL ControlNet with multiplier: {multiplier}") + with init_empty_weights(): + control_net = SdxlControlNet(multiplier=multiplier) + control_net.load_state_dict(state_dict) + control_net.to(dtype).to(device) + control_nets.append((control_net, ratio)) control_net_lllites: List[Tuple[ControlNetLLLite, float]] = [] if args.control_net_lllite_models: diff --git a/library/sdxl_model_util.py b/library/sdxl_model_util.py index 4fad78a1c..0466c1fa5 100644 --- a/library/sdxl_model_util.py +++ b/library/sdxl_model_util.py @@ -8,7 +8,7 @@ from diffusers import AutoencoderKL, EulerDiscreteScheduler, UNet2DConditionModel from library import model_util from library import sdxl_original_unet -from .utils import setup_logging +from library.utils import setup_logging setup_logging() import logging diff --git a/library/sdxl_original_control_net.py b/library/sdxl_original_control_net.py new file mode 100644 index 000000000..3af45f4db --- /dev/null +++ b/library/sdxl_original_control_net.py @@ -0,0 +1,272 @@ +# some parts are modified from Diffusers library (Apache License 2.0) + +import math +from types import SimpleNamespace +from typing import Any, Optional +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import functional as F +from einops import rearrange +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +from library import sdxl_original_unet +from library.sdxl_model_util import convert_sdxl_unet_state_dict_to_diffusers, convert_diffusers_unet_state_dict_to_sdxl + + +class ControlNetConditioningEmbedding(nn.Module): + def __init__(self): + super().__init__() + + dims = [16, 32, 96, 256] + + self.conv_in = nn.Conv2d(3, dims[0], kernel_size=3, padding=1) + self.blocks = nn.ModuleList([]) + + for i in range(len(dims) - 1): + channel_in = dims[i] + channel_out = dims[i + 1] + self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) + + self.conv_out = nn.Conv2d(dims[-1], 320, kernel_size=3, padding=1) + nn.init.zeros_(self.conv_out.weight) # zero module weight + nn.init.zeros_(self.conv_out.bias) # zero module bias + + def forward(self, x): + x = self.conv_in(x) + x = F.silu(x) + for block in self.blocks: + x = block(x) + x = F.silu(x) + x = self.conv_out(x) + return x + + +class SdxlControlNet(sdxl_original_unet.SdxlUNet2DConditionModel): + def __init__(self, multiplier: Optional[float] = None, **kwargs): + super().__init__(**kwargs) + self.multiplier = multiplier + + # remove unet layers + self.output_blocks = nn.ModuleList([]) + del self.out + + self.controlnet_cond_embedding = ControlNetConditioningEmbedding() + 
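# --- Editor's note: illustrative sketch, not part of the patch ----------------------
# The projection layers created just below follow ControlNet's "zero module" recipe:
# 1x1 convolutions whose weight and bias start at zero, so every residual the ControlNet
# emits is exactly zero at initialization and training starts from the unmodified base
# model. A minimal standalone version of such a layer:
import torch.nn as nn

def zero_conv(channels: int) -> nn.Conv2d:
    conv = nn.Conv2d(channels, channels, kernel_size=1)
    nn.init.zeros_(conv.weight)
    nn.init.zeros_(conv.bias)
    return conv
# -------------------------------------------------------------------------------------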
+ dims = [320, 320, 320, 320, 640, 640, 640, 1280, 1280] + self.controlnet_down_blocks = nn.ModuleList([]) + for dim in dims: + self.controlnet_down_blocks.append(nn.Conv2d(dim, dim, kernel_size=1)) + nn.init.zeros_(self.controlnet_down_blocks[-1].weight) # zero module weight + nn.init.zeros_(self.controlnet_down_blocks[-1].bias) # zero module bias + + self.controlnet_mid_block = nn.Conv2d(1280, 1280, kernel_size=1) + nn.init.zeros_(self.controlnet_mid_block.weight) # zero module weight + nn.init.zeros_(self.controlnet_mid_block.bias) # zero module bias + + def init_from_unet(self, unet: sdxl_original_unet.SdxlUNet2DConditionModel): + unet_sd = unet.state_dict() + unet_sd = {k: v for k, v in unet_sd.items() if not k.startswith("out")} + sd = super().state_dict() + sd.update(unet_sd) + info = super().load_state_dict(sd, strict=True, assign=True) + return info + + def load_state_dict(self, state_dict: dict, strict: bool = True, assign: bool = True) -> Any: + # convert state_dict to SAI format + unet_sd = {} + for k in list(state_dict.keys()): + if not k.startswith("controlnet_"): + unet_sd[k] = state_dict.pop(k) + unet_sd = convert_diffusers_unet_state_dict_to_sdxl(unet_sd) + state_dict.update(unet_sd) + super().load_state_dict(state_dict, strict=strict, assign=assign) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + # convert state_dict to Diffusers format + state_dict = super().state_dict(destination, prefix, keep_vars) + control_net_sd = {} + for k in list(state_dict.keys()): + if k.startswith("controlnet_"): + control_net_sd[k] = state_dict.pop(k) + state_dict = convert_sdxl_unet_state_dict_to_diffusers(state_dict) + state_dict.update(control_net_sd) + return state_dict + + def forward( + self, + x: torch.Tensor, + timesteps: Optional[torch.Tensor] = None, + context: Optional[torch.Tensor] = None, + y: Optional[torch.Tensor] = None, + cond_image: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + # broadcast timesteps to batch dimension + timesteps = timesteps.expand(x.shape[0]) + + t_emb = sdxl_original_unet.get_timestep_embedding(timesteps, self.model_channels, downscale_freq_shift=0) + t_emb = t_emb.to(x.dtype) + emb = self.time_embed(t_emb) + + assert x.shape[0] == y.shape[0], f"batch size mismatch: {x.shape[0]} != {y.shape[0]}" + assert x.dtype == y.dtype, f"dtype mismatch: {x.dtype} != {y.dtype}" + emb = emb + self.label_emb(y) + + def call_module(module, h, emb, context): + x = h + for layer in module: + if isinstance(layer, sdxl_original_unet.ResnetBlock2D): + x = layer(x, emb) + elif isinstance(layer, sdxl_original_unet.Transformer2DModel): + x = layer(x, context) + else: + x = layer(x) + return x + + h = x + multiplier = self.multiplier if self.multiplier is not None else 1.0 + hs = [] + for i, module in enumerate(self.input_blocks): + h = call_module(module, h, emb, context) + if i == 0: + h = self.controlnet_cond_embedding(cond_image) + h + hs.append(self.controlnet_down_blocks[i](h) * multiplier) + + h = call_module(self.middle_block, h, emb, context) + h = self.controlnet_mid_block(h) * multiplier + + return hs, h + + +class SdxlControlledUNet(sdxl_original_unet.SdxlUNet2DConditionModel): + """ + This class is for training purpose only. 
+ """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def forward(self, x, timesteps=None, context=None, y=None, input_resi_add=None, mid_add=None, **kwargs): + # broadcast timesteps to batch dimension + timesteps = timesteps.expand(x.shape[0]) + + hs = [] + t_emb = sdxl_original_unet.get_timestep_embedding(timesteps, self.model_channels, downscale_freq_shift=0) + t_emb = t_emb.to(x.dtype) + emb = self.time_embed(t_emb) + + assert x.shape[0] == y.shape[0], f"batch size mismatch: {x.shape[0]} != {y.shape[0]}" + assert x.dtype == y.dtype, f"dtype mismatch: {x.dtype} != {y.dtype}" + emb = emb + self.label_emb(y) + + def call_module(module, h, emb, context): + x = h + for layer in module: + if isinstance(layer, sdxl_original_unet.ResnetBlock2D): + x = layer(x, emb) + elif isinstance(layer, sdxl_original_unet.Transformer2DModel): + x = layer(x, context) + else: + x = layer(x) + return x + + h = x + for module in self.input_blocks: + h = call_module(module, h, emb, context) + hs.append(h) + + h = call_module(self.middle_block, h, emb, context) + h = h + mid_add + + for module in self.output_blocks: + resi = hs.pop() + input_resi_add.pop() + h = torch.cat([h, resi], dim=1) + h = call_module(module, h, emb, context) + + h = h.type(x.dtype) + h = call_module(self.out, h, emb, context) + + return h + + +if __name__ == "__main__": + import time + + logger.info("create unet") + unet = SdxlControlledUNet() + unet.to("cuda", torch.bfloat16) + unet.set_use_sdpa(True) + unet.set_gradient_checkpointing(True) + unet.train() + + logger.info("create control_net") + control_net = SdxlControlNet() + control_net.to("cuda") + control_net.set_use_sdpa(True) + control_net.set_gradient_checkpointing(True) + control_net.train() + + logger.info("Initialize control_net from unet") + control_net.init_from_unet(unet) + + unet.requires_grad_(False) + control_net.requires_grad_(True) + + # 使用メモリ量確認用の疑似学習ループ + logger.info("preparing optimizer") + + # optimizer = torch.optim.SGD(unet.parameters(), lr=1e-3, nesterov=True, momentum=0.9) # not working + + import bitsandbytes + + optimizer = bitsandbytes.adam.Adam8bit(control_net.parameters(), lr=1e-3) # not working + # optimizer = bitsandbytes.optim.RMSprop8bit(unet.parameters(), lr=1e-3) # working at 23.5 GB with torch2 + # optimizer=bitsandbytes.optim.Adagrad8bit(unet.parameters(), lr=1e-3) # working at 23.5 GB with torch2 + + # import transformers + # optimizer = transformers.optimization.Adafactor(unet.parameters(), relative_step=True) # working at 22.2GB with torch2 + + scaler = torch.cuda.amp.GradScaler(enabled=True) + + logger.info("start training") + steps = 10 + batch_size = 1 + + for step in range(steps): + logger.info(f"step {step}") + if step == 1: + time_start = time.perf_counter() + + x = torch.randn(batch_size, 4, 128, 128).cuda() # 1024x1024 + t = torch.randint(low=0, high=1000, size=(batch_size,), device="cuda") + txt = torch.randn(batch_size, 77, 2048).cuda() + vector = torch.randn(batch_size, sdxl_original_unet.ADM_IN_CHANNELS).cuda() + cond_img = torch.rand(batch_size, 3, 1024, 1024).cuda() + + with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16): + input_resi_add, mid_add = control_net(x, t, txt, vector, cond_img) + output = unet(x, t, txt, vector, input_resi_add, mid_add) + target = torch.randn_like(output) + loss = torch.nn.functional.mse_loss(output, target) + + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad(set_to_none=True) + + time_end = time.perf_counter() + 
logger.info(f"elapsed time: {time_end - time_start} [sec] for last {steps - 1} steps") + + logger.info("finish training") + sd = control_net.state_dict() + + from safetensors.torch import save_file + + save_file(sd, r"E:\Work\SD\Tmp\sdxl\ctrl\control_net.safetensors") diff --git a/library/sdxl_original_unet.py b/library/sdxl_original_unet.py index 17c345a89..0aa07d0d6 100644 --- a/library/sdxl_original_unet.py +++ b/library/sdxl_original_unet.py @@ -30,7 +30,7 @@ from torch import nn from torch.nn import functional as F from einops import rearrange -from .utils import setup_logging +from library.utils import setup_logging setup_logging() import logging @@ -1156,9 +1156,9 @@ def set_deep_shrink(self, ds_depth_1, ds_timesteps_1=650, ds_depth_2=None, ds_ti self.ds_timesteps_2 = ds_timesteps_2 if ds_timesteps_2 is not None else 1000 self.ds_ratio = ds_ratio - def forward(self, x, timesteps=None, context=None, y=None, **kwargs): + def forward(self, x, timesteps=None, context=None, y=None, input_resi_add=None, mid_add=None, **kwargs): r""" - current implementation is a copy of `SdxlUNet2DConditionModel.forward()` with Deep Shrink. + current implementation is a copy of `SdxlUNet2DConditionModel.forward()` with Deep Shrink and ControlNet. """ _self = self.delegate @@ -1209,6 +1209,8 @@ def call_module(module, h, emb, context): hs.append(h) h = call_module(_self.middle_block, h, emb, context) + if mid_add is not None: + h = h + mid_add for module in _self.output_blocks: # Deep Shrink @@ -1217,7 +1219,11 @@ def call_module(module, h, emb, context): # print("upsample", h.shape, hs[-1].shape) h = resize_like(h, hs[-1]) - h = torch.cat([h, hs.pop()], dim=1) + resi = hs.pop() + if input_resi_add is not None: + resi = resi + input_resi_add.pop() + + h = torch.cat([h, resi], dim=1) h = call_module(module, h, emb, context) # Deep Shrink: in case of depth 0 diff --git a/sdxl_train_controlnet.py b/sdxl_train_control_net.py similarity index 69% rename from sdxl_train_controlnet.py rename to sdxl_train_control_net.py index 00026d2cc..74dcff2af 100644 --- a/sdxl_train_controlnet.py +++ b/sdxl_train_control_net.py @@ -14,6 +14,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed +from accelerate import init_empty_weights from diffusers import DDPMScheduler, ControlNetModel from diffusers.utils.torch_utils import is_compiled_module from safetensors.torch import load_file @@ -23,6 +24,9 @@ sdxl_model_util, sdxl_original_unet, sdxl_train_util, + strategy_base, + strategy_sd, + strategy_sdxl, ) import library.model_util as model_util @@ -41,6 +45,7 @@ scale_v_prediction_loss_like_noise_prediction, apply_debiased_estimation, ) +from library.sdxl_original_control_net import SdxlControlNet, SdxlControlledUNet from library.utils import setup_logging, add_logging_arguments setup_logging() @@ -58,10 +63,7 @@ def generate_step_logs(args: argparse.Namespace, current_loss, avr_loss, lr_sche } if args.optimizer_type.lower().startswith("DAdapt".lower()): - logs["lr/d*lr"] = ( - lr_scheduler.optimizers[-1].param_groups[0]["d"] - * lr_scheduler.optimizers[-1].param_groups[0]["lr"] - ) + logs["lr/d*lr"] = lr_scheduler.optimizers[-1].param_groups[0]["d"] * lr_scheduler.optimizers[-1].param_groups[0]["lr"] return logs @@ -79,7 +81,14 @@ def train(args): args.seed = random.randint(0, 2**32) set_seed(args.seed) - tokenizer1, tokenizer2 = sdxl_train_util.load_tokenizers(args) + tokenize_strategy = strategy_sdxl.SdxlTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) + 
strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + False, args.cache_latents_to_disk, args.vae_batch_size, False + ) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) # データセットを準備する blueprint_generator = BlueprintGenerator(ConfigSanitizer(False, False, True, True)) @@ -106,17 +115,18 @@ def train(args): ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=[tokenizer1, tokenizer2]) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collator = (train_dataset_group if args.max_data_loader_n_workers == 0 else None) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) train_dataset_group.verify_bucket_reso_steps(32) if args.debug_dataset: + train_dataset_group.set_current_strategies() # dasaset needs to know the strategies explicitly train_util.debug_dataset(train_dataset_group) return if len(train_dataset_group) == 0: @@ -162,86 +172,99 @@ def unwrap_model(model): unet, logit_scale, ckpt_info, - ) = sdxl_train_util.load_target_model( - args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, weight_dtype - ) + ) = sdxl_train_util.load_target_model(args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, weight_dtype) + + unet.to(accelerator.device) # reduce main memory usage + + # convert U-Net to Controlled U-Net + logger.info("convert U-Net to Controlled U-Net") + unet_sd = unet.state_dict() + with init_empty_weights(): + unet = SdxlControlledUNet() + unet.load_state_dict(unet_sd, strict=True, assign=True) + del unet_sd + + # make control net + logger.info("make ControlNet") + if args.controlnet_model_path: + with init_empty_weights(): + control_net = SdxlControlNet() + + logger.info(f"load ControlNet from {args.controlnet_model_path}") + filename = args.controlnet_model_path + if os.path.splitext(filename)[1] == ".safetensors": + state_dict = load_file(filename) + else: + state_dict = torch.load(filename) + info = control_net.load_state_dict(state_dict, strict=True, assign=True) + logger.info(f"ControlNet loaded from {filename}: {info}") + else: + control_net = SdxlControlNet() - # convert U-Net - with torch.no_grad(): - du_unet_sd = sdxl_model_util.convert_sdxl_unet_state_dict_to_diffusers(unet.state_dict()) - unet.to("cpu") - clean_memory_on_device(accelerator.device) - del unet - unet = sdxl_model_util.UNet2DConditionModel(**sdxl_model_util.DIFFUSERS_SDXL_UNET_CONFIG) - unet.load_state_dict(du_unet_sd) - - controlnet = ControlNetModel.from_unet(unet) - - if args.controlnet_model_name_or_path: - filename = args.controlnet_model_name_or_path - if os.path.isfile(filename): - if os.path.splitext(filename)[1] == ".safetensors": - state_dict = load_file(filename) - else: - state_dict = torch.load(filename) - state_dict = model_util.convert_controlnet_state_dict_to_diffusers(state_dict) - controlnet.load_state_dict(state_dict) - elif os.path.isdir(filename): - controlnet = ControlNetModel.from_pretrained(filename) + logger.info("initialize ControlNet from U-Net") + info = control_net.init_from_unet(unet) + 
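# Editor's note: init_from_unet (defined in library/sdxl_original_control_net.py above)
# copies every base U-Net weight except the output_blocks and final out layers into the
# ControlNet, so its encoder starts as an exact copy of the frozen U-Net while the
# controlnet_* projections stay zero-initialized; training therefore begins with zero
# residuals. The key filtering step is essentially:
#     unet_sd = {k: v for k, v in unet.state_dict().items() if not k.startswith("out")}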
logger.info(f"ControlNet initialized from U-Net: {info}") # 学習を準備する if cache_latents: vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() - with torch.no_grad(): - train_dataset_group.cache_latents( - vae, - args.vae_batch_size, - args.cache_latents_to_disk, - accelerator.is_main_process, - ) + + train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + vae.to("cpu") clean_memory_on_device(accelerator.device) accelerator.wait_for_everyone() + text_encoding_strategy = strategy_sdxl.SdxlTextEncodingStrategy() + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + # TextEncoderの出力をキャッシュする if args.cache_text_encoder_outputs: # Text Encodes are eval and no grad - with torch.no_grad(): - train_dataset_group.cache_text_encoder_outputs( - (tokenizer1, tokenizer2), - (text_encoder1, text_encoder2), - accelerator.device, - None, - args.cache_text_encoder_outputs_to_disk, - accelerator.is_main_process, - ) + text_encoder_output_caching_strategy = strategy_sdxl.SdxlTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, None, False + ) + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_output_caching_strategy) + + text_encoder1.to(accelerator.device) + text_encoder2.to(accelerator.device) + with accelerator.autocast(): + train_dataset_group.new_cache_text_encoder_outputs([text_encoder1, text_encoder2], accelerator.is_main_process) + accelerator.wait_for_everyone() # モデルに xformers とか memory efficient attention を組み込む # train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) if args.xformers: - unet.enable_xformers_memory_efficient_attention() - controlnet.enable_xformers_memory_efficient_attention() + unet.set_use_memory_efficient_attention(True, False) + control_net.set_use_memory_efficient_attention(True, False) + elif args.sdpa: + unet.set_use_sdpa(True) + control_net.set_use_sdpa(True) if args.gradient_checkpointing: unet.enable_gradient_checkpointing() - controlnet.enable_gradient_checkpointing() + control_net.enable_gradient_checkpointing() # 学習に必要なクラスを準備する accelerator.print("prepare optimizer, data loader etc.") - trainable_params = list(filter(lambda p: p.requires_grad, controlnet.parameters())) + trainable_params = list(control_net.parameters()) + # for p in trainable_params: + # p.requires_grad = True logger.info(f"trainable params count: {len(trainable_params)}") - logger.info( - f"number of trainable parameters: {sum(p.numel() for p in trainable_params if p.requires_grad)}" - ) + logger.info(f"number of trainable parameters: {sum(p.numel() for p in trainable_params if p.requires_grad)}") _, _, optimizer = train_util.get_optimizer(args, trainable_params) - # dataloaderを準備する + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers @@ -257,7 +280,7 @@ def unwrap_model(model): # 学習ステップ数を計算する if args.max_train_epochs is not None: args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader)/ accelerator.num_processes/ args.gradient_accumulation_steps + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps ) accelerator.print( f"override steps. 
steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" @@ -267,9 +290,7 @@ def unwrap_model(model): train_dataset_group.set_max_train_steps(args.max_train_steps) # lr schedulerを用意する - lr_scheduler = train_util.get_scheduler_fix( - args, optimizer, accelerator.num_processes - ) + lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) # 実験的機能:勾配も含めたfp16/bf16学習を行う モデル全体をfp16/bf16にする if args.full_fp16: @@ -277,19 +298,17 @@ def unwrap_model(model): args.mixed_precision == "fp16" ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" accelerator.print("enable full fp16 training.") - controlnet.to(weight_dtype) - unet.to(weight_dtype) + control_net.to(weight_dtype) elif args.full_bf16: assert ( args.mixed_precision == "bf16" ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" accelerator.print("enable full bf16 training.") - controlnet.to(weight_dtype) - unet.to(weight_dtype) + control_net.to(weight_dtype) # acceleratorがなんかよろしくやってくれるらしい - controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - controlnet, optimizer, train_dataloader, lr_scheduler + control_net, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + control_net, optimizer, train_dataloader, lr_scheduler ) if args.fused_backward_pass: @@ -314,10 +333,8 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): text_encoder2.requires_grad_(False) unet.to(accelerator.device, dtype=weight_dtype) - # transform DDP after prepare - controlnet = controlnet.module if isinstance(controlnet, DDP) else controlnet - - controlnet.train() + unet.eval() + control_net.train() # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: @@ -362,26 +379,15 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): accelerator.print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") accelerator.print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}") - progress_bar = tqdm( - range(args.max_train_steps), - smoothing=0, - disable=not accelerator.is_local_main_process, - desc="steps", - ) + progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") global_step = 0 noise_scheduler = DDPMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - clip_sample=False, + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False ) prepare_scheduler_for_custom_training(noise_scheduler, accelerator.device) if args.zero_terminal_snr: - custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr( - noise_scheduler - ) + custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler) if accelerator.is_main_process: init_kwargs = {} @@ -390,11 +396,7 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - ( - "controlnet_train" - if args.log_tracker_name is None - else args.log_tracker_name - ), + ("sdxl_control_net_train" if args.log_tracker_name is None else args.log_tracker_name), config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs, ) @@ -409,10 +411,8 @@ def save_model(ckpt_name, model, force_sync_upload=False): accelerator.print(f"\nsaving 
checkpoint: {ckpt_file}") sai_metadata = train_util.get_sai_model_spec(None, args, True, True, False) - sai_metadata["modelspec.architecture"] = ( - sai_model_spec.ARCH_SD_XL_V1_BASE + "/controlnet" - ) - state_dict = model_util.convert_controlnet_state_dict_to_sd(model.state_dict()) + sai_metadata["modelspec.architecture"] = sai_model_spec.ARCH_SD_XL_V1_BASE + "/controlnet" + state_dict = model.state_dict() if save_dtype is not None: for key in list(state_dict.keys()): @@ -436,19 +436,19 @@ def remove_model(old_ckpt_name): accelerator.print(f"removing old checkpoint: {old_ckpt_file}") os.remove(old_ckpt_file) - # For --sample_at_first - sdxl_train_util.sample_images( - accelerator, - args, - 0, - global_step, - accelerator.device, - vae, - [tokenizer1, tokenizer2], - [text_encoder1, text_encoder2], - unet, - controlnet=controlnet, - ) + # # For --sample_at_first + # sdxl_train_util.sample_images( + # accelerator, + # args, + # 0, + # global_step, + # accelerator.device, + # vae, + # [tokenizer1, tokenizer2], + # [text_encoder1, text_encoder2], + # unet, + # controlnet=control_net, + # ) # training loop for epoch in range(num_train_epochs): @@ -457,121 +457,63 @@ def remove_model(old_ckpt_name): for step, batch in enumerate(train_dataloader): current_step.value = global_step - with accelerator.accumulate(controlnet): + with accelerator.accumulate(control_net): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: - latents = ( - batch["latents"] - .to(accelerator.device) - .to(dtype=weight_dtype) - ) + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: # latentに変換 - latents = ( - vae.encode(batch["images"].to(dtype=vae_dtype)) - .latent_dist.sample() - .to(dtype=weight_dtype) - ) + latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(dtype=weight_dtype) # NaNが含まれていれば警告を表示し0に置き換える if torch.any(torch.isnan(latents)): - accelerator.print( - "NaN found in latents, replacing with zeros" - ) + accelerator.print("NaN found in latents, replacing with zeros") latents = torch.nan_to_num(latents, 0, out=latents) latents = latents * sdxl_model_util.VAE_SCALE_FACTOR - if ( - "text_encoder_outputs1_list" not in batch - or batch["text_encoder_outputs1_list"] is None - ): - input_ids1 = batch["input_ids"] - input_ids2 = batch["input_ids2"] + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + # Text Encoder outputs are cached + encoder_hidden_states1, encoder_hidden_states2, pool2 = text_encoder_outputs_list + encoder_hidden_states1 = encoder_hidden_states1.to(accelerator.device, dtype=weight_dtype) + encoder_hidden_states2 = encoder_hidden_states2.to(accelerator.device, dtype=weight_dtype) + pool2 = pool2.to(accelerator.device, dtype=weight_dtype) + else: + input_ids1, input_ids2 = batch["input_ids_list"] with torch.no_grad(): - # Get the text embedding for conditioning input_ids1 = input_ids1.to(accelerator.device) input_ids2 = input_ids2.to(accelerator.device) - encoder_hidden_states1, encoder_hidden_states2, pool2 = ( - train_util.get_hidden_states_sdxl( - args.max_token_length, - input_ids1, - input_ids2, - tokenizer1, - tokenizer2, - text_encoder1, - text_encoder2, - None if not args.full_fp16 else weight_dtype, - ) + encoder_hidden_states1, encoder_hidden_states2, pool2 = text_encoding_strategy.encode_tokens( + tokenize_strategy, [text_encoder1, text_encoder2], [input_ids1, input_ids2] ) - else: - encoder_hidden_states1 = ( - 
batch["text_encoder_outputs1_list"] - .to(accelerator.device) - .to(weight_dtype) - ) - encoder_hidden_states2 = ( - batch["text_encoder_outputs2_list"] - .to(accelerator.device) - .to(weight_dtype) - ) - pool2 = ( - batch["text_encoder_pool2_list"] - .to(accelerator.device) - .to(weight_dtype) - ) + if args.full_fp16: + encoder_hidden_states1 = encoder_hidden_states1.to(weight_dtype) + encoder_hidden_states2 = encoder_hidden_states2.to(weight_dtype) + pool2 = pool2.to(weight_dtype) # get size embeddings orig_size = batch["original_sizes_hw"] crop_size = batch["crop_top_lefts"] target_size = batch["target_sizes_hw"] - # embs = sdxl_train_util.get_size_embeddings( - # orig_size, crop_size, target_size, accelerator.device - # ).to(weight_dtype) + embs = sdxl_train_util.get_size_embeddings(orig_size, crop_size, target_size, accelerator.device).to(weight_dtype) - embs = torch.cat([orig_size, crop_size, target_size]).to(accelerator.device).to(weight_dtype) #B,6 # concat embeddings - #vector_embedding = torch.cat([pool2, embs], dim=1).to(weight_dtype) - vector_embedding_dict = { - "text_embeds": pool2, - "time_ids": embs - } - text_embedding = torch.cat( - [encoder_hidden_states1, encoder_hidden_states2], dim=2 - ).to(weight_dtype) + vector_embedding = torch.cat([pool2, embs], dim=1).to(weight_dtype) + text_embedding = torch.cat([encoder_hidden_states1, encoder_hidden_states2], dim=2).to(weight_dtype) # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = ( - train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents - ) + noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents ) controlnet_image = batch["conditioning_images"].to(dtype=weight_dtype) - with accelerator.autocast(): - down_block_res_samples, mid_block_res_sample = controlnet( - noisy_latents, - timesteps, - encoder_hidden_states=text_embedding, - added_cond_kwargs=vector_embedding_dict, - controlnet_cond=controlnet_image, - return_dict=False, + input_resi_add, mid_add = control_net( + noisy_latents, timesteps, text_embedding, vector_embedding, controlnet_image ) - - # Predict the noise residual - noise_pred = unet( - noisy_latents, - timesteps, - encoder_hidden_states=text_embedding, - added_cond_kwargs=vector_embedding_dict, - down_block_additional_residuals=[ - sample.to(dtype=weight_dtype) for sample in down_block_res_samples - ], - mid_block_additional_residual=mid_block_res_sample.to(dtype=weight_dtype), - return_dict=False, - )[0] + noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding, input_resi_add, mid_add) if args.v_parameterization: # v-parameterization training @@ -580,7 +522,7 @@ def remove_model(old_ckpt_name): target = noise loss = train_util.conditional_loss( - noise_pred.float(),target.float(),reduction="none",loss_type=args.loss_type,huber_c=huber_c, + noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c ) loss = loss.mean([1, 2, 3]) @@ -588,7 +530,7 @@ def remove_model(old_ckpt_name): loss = loss * loss_weights if args.min_snr_gamma: - loss = apply_snr_weight(loss,timesteps,noise_scheduler,args.min_snr_gamma,args.v_parameterization) + loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma, args.v_parameterization) if args.scale_v_pred_loss_like_noise_pred: loss = 
scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) if args.v_pred_like_loss: @@ -601,7 +543,7 @@ def remove_model(old_ckpt_name): accelerator.backward(loss) if not args.fused_backward_pass: if accelerator.sync_gradients and args.max_grad_norm != 0.0: - params_to_clip = controlnet.parameters() + params_to_clip = control_net.parameters() accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() @@ -616,25 +558,25 @@ def remove_model(old_ckpt_name): progress_bar.update(1) global_step += 1 - sdxl_train_util.sample_images( - accelerator, - args, - None, - global_step, - accelerator.device, - vae, - [tokenizer1, tokenizer2], - [text_encoder1, text_encoder2], - unet, - controlnet=controlnet, - ) + # sdxl_train_util.sample_images( + # accelerator, + # args, + # None, + # global_step, + # accelerator.device, + # vae, + # [tokenizer1, tokenizer2], + # [text_encoder1, text_encoder2], + # unet, + # controlnet=control_net, + # ) # 指定ステップごとにモデルを保存 if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: accelerator.wait_for_everyone() if accelerator.is_main_process: ckpt_name = train_util.get_step_ckpt_name(args, "." + args.save_model_as, global_step) - save_model(ckpt_name,unwrap_model(controlnet)) + save_model(ckpt_name, unwrap_model(control_net)) if args.save_state: train_util.save_and_remove_state_stepwise(args, accelerator, global_step) @@ -650,14 +592,14 @@ def remove_model(old_ckpt_name): logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = generate_step_logs(args, current_loss, avr_loss, lr_scheduler) accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break - if args.logging_dir is not None: + if len(accelerator.trackers) > 0: logs = {"loss/epoch": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) @@ -668,7 +610,7 @@ def remove_model(old_ckpt_name): saving = (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs if is_main_process and saving: ckpt_name = train_util.get_epoch_ckpt_name(args, "." + args.save_model_as, epoch + 1) - save_model(ckpt_name,unwrap_model(controlnet)) + save_model(ckpt_name, unwrap_model(control_net)) remove_epoch_no = train_util.get_remove_epoch_no(args, epoch + 1) if remove_epoch_no is not None: @@ -688,13 +630,13 @@ def remove_model(old_ckpt_name): [tokenizer1, tokenizer2], [text_encoder1, text_encoder2], unet, - controlnet=controlnet, + controlnet=control_net, ) # end of epoch if is_main_process: - controlnet = unwrap_model(controlnet) + control_net = unwrap_model(control_net) accelerator.end_training() @@ -703,9 +645,7 @@ def remove_model(old_ckpt_name): if is_main_process: ckpt_name = train_util.get_last_ckpt_name(args, "." 
+ args.save_model_as) - save_model( - ckpt_name, controlnet, force_sync_upload=True - ) + save_model(ckpt_name, control_net, force_sync_upload=True) logger.info("model saved.") @@ -717,26 +657,38 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) - train_util.add_masked_loss_arguments(parser) + # train_util.add_masked_loss_arguments(parser) deepspeed_utils.add_deepspeed_arguments(parser) - train_util.add_sd_saving_arguments(parser) + # train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) sdxl_train_util.add_sdxl_training_arguments(parser) parser.add_argument( - "--controlnet_model_name_or_path", + "--controlnet_model_path", type=str, default=None, help="controlnet model name or path / controlnetのモデル名またはパス", ) + parser.add_argument( + "--conditioning_data_dir", + type=str, + default=None, + help="conditioning data directory / 条件付けデータのディレクトリ", + ) + parser.add_argument( + "--save_model_as", + type=str, + default="safetensors", + choices=[None, "ckpt", "pt", "safetensors"], + help="format to save the model (default is .safetensors) / モデル保存時の形式(デフォルトはsafetensors)", + ) parser.add_argument( "--no_half_vae", action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) - return parser From 0243c65877a7700ffab1e782690f26080a0deadc Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 29 Sep 2024 23:09:56 +0900 Subject: [PATCH 162/582] fix typo --- gen_img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gen_img.py b/gen_img.py index 70b3c81ff..421d5c0b9 100644 --- a/gen_img.py +++ b/gen_img.py @@ -1890,7 +1890,7 @@ def __getattr__(self, item): state_dict = load_file(model_file) - logger.info(f"Initalizing SDXL ControlNet with multiplier: {multiplier}") + logger.info(f"Initializing SDXL ControlNet with multiplier: {multiplier}") with init_empty_weights(): control_net = SdxlControlNet(multiplier=multiplier) control_net.load_state_dict(state_dict) From 793999d116638548fc16579b712f44456ee3034e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 30 Sep 2024 23:39:32 +0900 Subject: [PATCH 163/582] sample generation in SDXL ControlNet training --- library/sdxl_lpw_stable_diffusion.py | 168 +++++++---------------- library/strategy_base.py | 192 ++++++++++++++++++++++++++- library/strategy_sdxl.py | 39 +++++- library/train_util.py | 35 +++-- sdxl_train_control_net.py | 55 ++++---- 5 files changed, 323 insertions(+), 166 deletions(-) diff --git a/library/sdxl_lpw_stable_diffusion.py b/library/sdxl_lpw_stable_diffusion.py index 03b182566..9196eb0f2 100644 --- a/library/sdxl_lpw_stable_diffusion.py +++ b/library/sdxl_lpw_stable_diffusion.py @@ -13,12 +13,20 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from diffusers import SchedulerMixin, StableDiffusionPipeline -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker +from diffusers.models import AutoencoderKL +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.utils import logging from PIL import Image -from library import sdxl_model_util, sdxl_train_util, train_util +from library import ( + 
sdxl_model_util, + sdxl_train_util, + strategy_base, + strategy_sdxl, + train_util, + sdxl_original_unet, + sdxl_original_control_net, +) try: @@ -537,7 +545,7 @@ def __init__( vae: AutoencoderKL, text_encoder: List[CLIPTextModel], tokenizer: List[CLIPTokenizer], - unet: UNet2DConditionModel, + unet: Union[sdxl_original_unet.SdxlUNet2DConditionModel, sdxl_original_control_net.SdxlControlledUNet], scheduler: SchedulerMixin, # clip_skip: int, safety_checker: StableDiffusionSafetyChecker, @@ -594,74 +602,6 @@ def _execution_device(self): return torch.device(module._hf_hook.execution_device) return self.device - def _encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - max_embeddings_multiples, - is_sdxl_text_encoder2, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `list(int)`): - prompt to be encoded - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - if negative_prompt is None: - negative_prompt = [""] * batch_size - elif isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] * batch_size - if batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - - text_embeddings, text_pool, uncond_embeddings, uncond_pool = get_weighted_text_embeddings( - pipe=self, - prompt=prompt, - uncond_prompt=negative_prompt if do_classifier_free_guidance else None, - max_embeddings_multiples=max_embeddings_multiples, - clip_skip=self.clip_skip, - is_sdxl_text_encoder2=is_sdxl_text_encoder2, - ) - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1) # ?? 
- text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1) - if text_pool is not None: - text_pool = text_pool.repeat(1, num_images_per_prompt) - text_pool = text_pool.view(bs_embed * num_images_per_prompt, -1) - - if do_classifier_free_guidance: - bs_embed, seq_len, _ = uncond_embeddings.shape - uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1) - uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1) - if uncond_pool is not None: - uncond_pool = uncond_pool.repeat(1, num_images_per_prompt) - uncond_pool = uncond_pool.view(bs_embed * num_images_per_prompt, -1) - - return text_embeddings, text_pool, uncond_embeddings, uncond_pool - - return text_embeddings, text_pool, None, None - def check_inputs(self, prompt, height, width, strength, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") @@ -792,7 +732,7 @@ def __call__( max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - controlnet=None, + controlnet: sdxl_original_control_net.SdxlControlNet = None, controlnet_image=None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, @@ -896,32 +836,24 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt - # 実装を簡単にするためにtokenzer/text encoderを切り替えて二回呼び出す - # To simplify the implementation, switch the tokenzer/text encoder and call it twice - text_embeddings_list = [] - text_pool = None - uncond_embeddings_list = [] - uncond_pool = None - for i in range(len(self.tokenizers)): - self.tokenizer = self.tokenizers[i] - self.text_encoder = self.text_encoders[i] - - text_embeddings, tp1, uncond_embeddings, up1 = self._encode_prompt( - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - max_embeddings_multiples, - is_sdxl_text_encoder2=i == 1, - ) - text_embeddings_list.append(text_embeddings) - uncond_embeddings_list.append(uncond_embeddings) + tokenize_strategy: strategy_sdxl.SdxlTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() + encoding_strategy: strategy_sdxl.SdxlTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() - if tp1 is not None: - text_pool = tp1 - if up1 is not None: - uncond_pool = up1 + text_input_ids, text_weights = tokenize_strategy.tokenize_with_weights(prompt) + hidden_states_1, hidden_states_2, text_pool = encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, self.text_encoders, text_input_ids, text_weights + ) + text_embeddings = torch.cat([hidden_states_1, hidden_states_2], dim=-1) + + if do_classifier_free_guidance: + input_ids, weights = tokenize_strategy.tokenize_with_weights(negative_prompt or "") + hidden_states_1, hidden_states_2, uncond_pool = encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, self.text_encoders, input_ids, weights + ) + uncond_embeddings = torch.cat([hidden_states_1, hidden_states_2], dim=-1) + else: + uncond_embeddings = None + uncond_pool = None unet_dtype = self.unet.dtype dtype = unet_dtype @@ -970,23 +902,23 @@ def __call__( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # create size embs and concat embeddings for SDXL - orig_size = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1).to(dtype) + orig_size = torch.tensor([height, 
width]).repeat(batch_size * num_images_per_prompt, 1).to(device, dtype) crop_size = torch.zeros_like(orig_size) target_size = orig_size - embs = sdxl_train_util.get_size_embeddings(orig_size, crop_size, target_size, device).to(dtype) + embs = sdxl_train_util.get_size_embeddings(orig_size, crop_size, target_size, device).to(device, dtype) # make conditionings + text_pool = text_pool.to(device, dtype) if do_classifier_free_guidance: - text_embeddings = torch.cat(text_embeddings_list, dim=2) - uncond_embeddings = torch.cat(uncond_embeddings_list, dim=2) - text_embedding = torch.cat([uncond_embeddings, text_embeddings]).to(dtype) + text_embedding = torch.cat([uncond_embeddings, text_embeddings]).to(device, dtype) - cond_vector = torch.cat([text_pool, embs], dim=1) - uncond_vector = torch.cat([uncond_pool, embs], dim=1) - vector_embedding = torch.cat([uncond_vector, cond_vector]).to(dtype) + uncond_pool = uncond_pool.to(device, dtype) + cond_vector = torch.cat([text_pool, embs], dim=1).to(dtype) + uncond_vector = torch.cat([uncond_pool, embs], dim=1).to(dtype) + vector_embedding = torch.cat([uncond_vector, cond_vector]) else: - text_embedding = torch.cat(text_embeddings_list, dim=2).to(dtype) - vector_embedding = torch.cat([text_pool, embs], dim=1).to(dtype) + text_embedding = text_embeddings.to(device, dtype) + vector_embedding = torch.cat([text_pool, embs], dim=1) # 8. Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): @@ -994,22 +926,14 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - unet_additional_args = {} - if controlnet is not None: - down_block_res_samples, mid_block_res_sample = controlnet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings, - controlnet_cond=controlnet_image, - conditioning_scale=1.0, - guess_mode=False, - return_dict=False, - ) - unet_additional_args["down_block_additional_residuals"] = down_block_res_samples - unet_additional_args["mid_block_additional_residual"] = mid_block_res_sample + # FIXME SD1 ControlNet is not working # predict the noise residual - noise_pred = self.unet(latent_model_input, t, text_embedding, vector_embedding) + if controlnet is not None: + input_resi_add, mid_add = controlnet(latent_model_input, t, text_embedding, vector_embedding, controlnet_image) + noise_pred = self.unet(latent_model_input, t, text_embedding, vector_embedding, input_resi_add, mid_add) + else: + noise_pred = self.unet(latent_model_input, t, text_embedding, vector_embedding) noise_pred = noise_pred.to(dtype) # U-Net changes dtype in LoRA training # perform guidance diff --git a/library/strategy_base.py b/library/strategy_base.py index e7d3a97ef..10820afa1 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -1,6 +1,7 @@ # base class for platform strategies. 
this file defines the interface for strategies import os +import re from typing import Any, List, Optional, Tuple, Union import numpy as np @@ -22,6 +23,24 @@ class TokenizeStrategy: _strategy = None # strategy instance: actual strategy class + _re_attention = re.compile( + r"""\\\(| +\\\)| +\\\[| +\\]| +\\\\| +\\| +\(| +\[| +:([+-]?[.\d]+)\)| +\)| +]| +[^\\()\[\]:]+| +: +""", + re.X, + ) + @classmethod def set_strategy(cls, strategy): if cls._strategy is not None: @@ -54,7 +73,151 @@ def _load_tokenizer( def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: raise NotImplementedError - def _get_input_ids(self, tokenizer: CLIPTokenizer, text: str, max_length: Optional[int] = None) -> torch.Tensor: + def tokenize_with_weights(self, text: Union[str, List[str]]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + raise NotImplementedError + + def _get_weighted_input_ids( + self, tokenizer: CLIPTokenizer, text: str, max_length: Optional[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + max_length includes starting and ending tokens. + """ + + def parse_prompt_attention(text): + """ + Parses a string with attention tokens and returns a list of pairs: text and its associated weight. + Accepted tokens are: + (abc) - increases attention to abc by a multiplier of 1.1 + (abc:3.12) - increases attention to abc by a multiplier of 3.12 + [abc] - decreases attention to abc by a multiplier of 1.1 + \( - literal character '(' + \[ - literal character '[' + \) - literal character ')' + \] - literal character ']' + \\ - literal character '\' + anything else - just text + >>> parse_prompt_attention('normal text') + [['normal text', 1.0]] + >>> parse_prompt_attention('an (important) word') + [['an ', 1.0], ['important', 1.1], [' word', 1.0]] + >>> parse_prompt_attention('(unbalanced') + [['unbalanced', 1.1]] + >>> parse_prompt_attention('\(literal\]') + [['(literal]', 1.0]] + >>> parse_prompt_attention('(unnecessary)(parens)') + [['unnecessaryparens', 1.1]] + >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') + [['a ', 1.0], + ['house', 1.5730000000000004], + [' ', 1.1], + ['on', 1.0], + [' a ', 1.1], + ['hill', 0.55], + [', sun, ', 1.1], + ['sky', 1.4641000000000006], + ['.', 1.1]] + """ + + res = [] + round_brackets = [] + square_brackets = [] + + round_bracket_multiplier = 1.1 + square_bracket_multiplier = 1 / 1.1 + + def multiply_range(start_position, multiplier): + for p in range(start_position, len(res)): + res[p][1] *= multiplier + + for m in TokenizeStrategy._re_attention.finditer(text): + text = m.group(0) + weight = m.group(1) + + if text.startswith("\\"): + res.append([text[1:], 1.0]) + elif text == "(": + round_brackets.append(len(res)) + elif text == "[": + square_brackets.append(len(res)) + elif weight is not None and len(round_brackets) > 0: + multiply_range(round_brackets.pop(), float(weight)) + elif text == ")" and len(round_brackets) > 0: + multiply_range(round_brackets.pop(), round_bracket_multiplier) + elif text == "]" and len(square_brackets) > 0: + multiply_range(square_brackets.pop(), square_bracket_multiplier) + else: + res.append([text, 1.0]) + + for pos in round_brackets: + multiply_range(pos, round_bracket_multiplier) + + for pos in square_brackets: + multiply_range(pos, square_bracket_multiplier) + + if len(res) == 0: + res = [["", 1.0]] + + # merge runs of identical weights + i = 0 + while i + 1 < len(res): + if res[i][1] == res[i + 1][1]: + res[i][0] += res[i + 1][0] + res.pop(i + 1) + else: + i += 1 + + return res + + 
def get_prompts_with_weights(text: str, max_length: int): + r""" + Tokenize a list of prompts and return its tokens with weights of each token. max_length does not include starting and ending token. + + No padding, starting or ending token is included. + """ + truncated = False + + texts_and_weights = parse_prompt_attention(text) + tokens = [] + weights = [] + for word, weight in texts_and_weights: + # tokenize and discard the starting and the ending token + token = tokenizer(word).input_ids[1:-1] + tokens += token + # copy the weight by length of token + weights += [weight] * len(token) + # stop if the text is too long (longer than truncation limit) + if len(tokens) > max_length: + truncated = True + break + # truncate + if len(tokens) > max_length: + truncated = True + tokens = tokens[:max_length] + weights = weights[:max_length] + if truncated: + logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") + return tokens, weights + + def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad): + r""" + Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. + """ + tokens = [bos] + tokens + [eos] + [pad] * (max_length - 2 - len(tokens)) + weights = [1.0] + weights + [1.0] * (max_length - 1 - len(weights)) + return tokens, weights + + if max_length is None: + max_length = tokenizer.model_max_length + + tokens, weights = get_prompts_with_weights(text, max_length - 2) + tokens, weights = pad_tokens_and_weights( + tokens, weights, max_length, tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id + ) + return torch.tensor(tokens).unsqueeze(0), torch.tensor(weights).unsqueeze(0) + + def _get_input_ids( + self, tokenizer: CLIPTokenizer, text: str, max_length: Optional[int] = None, weighted: bool = False + ) -> torch.Tensor: """ for SD1.5/2.0/SDXL TODO support batch input @@ -62,7 +225,10 @@ def _get_input_ids(self, tokenizer: CLIPTokenizer, text: str, max_length: Option if max_length is None: max_length = tokenizer.model_max_length - 2 - input_ids = tokenizer(text, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt").input_ids + if weighted: + input_ids, weights = self._get_weighted_input_ids(tokenizer, text, max_length) + else: + input_ids = tokenizer(text, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt").input_ids if max_length > tokenizer.model_max_length: input_ids = input_ids.squeeze(0) @@ -101,6 +267,17 @@ def _get_input_ids(self, tokenizer: CLIPTokenizer, text: str, max_length: Option iids_list.append(ids_chunk) input_ids = torch.stack(iids_list) # 3,77 + + if weighted: + weights = weights.squeeze(0) + new_weights = torch.ones(input_ids.shape) + for i in range(1, max_length - tokenizer.model_max_length + 2, tokenizer.model_max_length - 2): + b = i // (tokenizer.model_max_length - 2) + new_weights[b, 1 : 1 + tokenizer.model_max_length - 2] = weights[i : i + tokenizer.model_max_length - 2] + weights = new_weights + + if weighted: + return input_ids, weights return input_ids @@ -126,6 +303,17 @@ def encode_tokens( :return: list of output embeddings for each architecture """ raise NotImplementedError + + def encode_tokens_with_weights( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor], weights: List[torch.Tensor] + ) -> List[torch.Tensor]: + """ + Encode tokens into embeddings and outputs. 
+ :param tokens: list of token tensors for each TextModel + :param weights: list of weight tensors for each TextModel + :return: list of output embeddings for each architecture + """ + raise NotImplementedError class TextEncoderOutputsCachingStrategy: diff --git a/library/strategy_sdxl.py b/library/strategy_sdxl.py index 3eb0ab6f6..b48e6d55a 100644 --- a/library/strategy_sdxl.py +++ b/library/strategy_sdxl.py @@ -37,6 +37,22 @@ def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: torch.stack([self._get_input_ids(self.tokenizer2, t, self.max_length) for t in text], dim=0), ) + def tokenize_with_weights(self, text: str | List[str]) -> Tuple[List[torch.Tensor]]: + text = [text] if isinstance(text, str) else text + tokens1_list, tokens2_list = [], [] + weights1_list, weights2_list = [], [] + for t in text: + tokens1, weights1 = self._get_weighted_input_ids(self.tokenizer1, t, self.max_length) + tokens2, weights2 = self._get_weighted_input_ids(self.tokenizer2, t, self.max_length) + tokens1_list.append(tokens1) + tokens2_list.append(tokens2) + weights1_list.append(weights1) + weights2_list.append(weights2) + return (torch.stack(tokens1_list, dim=0), torch.stack(tokens2_list, dim=0)), ( + torch.stack(weights1_list, dim=0), + torch.stack(weights2_list, dim=0), + ) + class SdxlTextEncodingStrategy(TextEncodingStrategy): def __init__(self) -> None: @@ -98,7 +114,10 @@ def _get_hidden_states_sdxl( ): # input_ids: b,n,77 -> b*n, 77 b_size = input_ids1.size()[0] - max_token_length = input_ids1.size()[1] * input_ids1.size()[2] + if input_ids1.size()[1] == 1: + max_token_length = None + else: + max_token_length = input_ids1.size()[1] * input_ids1.size()[2] input_ids1 = input_ids1.reshape((-1, tokenizer1.model_max_length)) # batch_size*n, 77 input_ids2 = input_ids2.reshape((-1, tokenizer2.model_max_length)) # batch_size*n, 77 input_ids1 = input_ids1.to(text_encoder1.device) @@ -172,6 +191,24 @@ def encode_tokens( ) return [hidden_states1, hidden_states2, pool2] + def encode_tokens_with_weights( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor], weights: List[torch.Tensor] + ) -> List[torch.Tensor]: + hidden_states1, hidden_states2, pool2 = self.encode_tokens(tokenize_strategy, models, tokens) + + # apply weights + if weights[0].shape[1] == 1: # no max_token_length + # weights: ((b, 1, 77), (b, 1, 77)), hidden_states: (b, 77, 768), (b, 77, 768) + hidden_states1 = hidden_states1 * weights[0].squeeze(1).unsqueeze(2) + hidden_states2 = hidden_states2 * weights[1].squeeze(1).unsqueeze(2) + else: + # weights: ((b, n, 77), (b, n, 77)), hidden_states: (b, n*75+2, 768), (b, n*75+2, 768) + for weight, hidden_states in zip(weights, [hidden_states1, hidden_states2]): + for i in range(weight.shape[1]): + hidden_states[:, i * 75 + 1 : i * 75 + 76] = hidden_states[:, i * 75 + 1 : i * 75 + 76] * weight[:, i, 1:-1] + + return [hidden_states1, hidden_states2, pool2] + class SdxlTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): SDXL_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX = "_te_outputs.npz" diff --git a/library/train_util.py b/library/train_util.py index 293fc05ad..b559616f2 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -74,6 +74,7 @@ import cv2 import safetensors.torch from library.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline +from library.sdxl_lpw_stable_diffusion import SdxlStableDiffusionLongPromptWeightingPipeline import library.model_util as model_util import library.huggingface_util as 
huggingface_util import library.sai_model_spec as sai_model_spec @@ -3581,7 +3582,20 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: # available backends: # https://github.com/huggingface/accelerate/blob/d1abd59114ada8ba673e1214218cb2878c13b82d/src/accelerate/utils/dataclasses.py#L376-L388C5 # https://pytorch.org/docs/stable/torch.compiler.html - choices=["eager", "aot_eager", "inductor", "aot_ts_nvfuser", "nvprims_nvfuser", "cudagraphs", "ofi", "fx2trt", "onnxrt", "tensort", "ipex", "tvm"], + choices=[ + "eager", + "aot_eager", + "inductor", + "aot_ts_nvfuser", + "nvprims_nvfuser", + "cudagraphs", + "ofi", + "fx2trt", + "onnxrt", + "tensort", + "ipex", + "tvm", + ], help="dynamo backend type (default is inductor) / dynamoのbackendの種類(デフォルトは inductor)", ) parser.add_argument("--xformers", action="store_true", help="use xformers for CrossAttention / CrossAttentionにxformersを使う") @@ -5850,8 +5864,8 @@ def sample_images_common( pipe_class, accelerator: Accelerator, args: argparse.Namespace, - epoch, - steps, + epoch: int, + steps: int, device, vae, tokenizer, @@ -5910,11 +5924,7 @@ def sample_images_common( with open(args.sample_prompts, "r", encoding="utf-8") as f: prompts = json.load(f) - # schedulers: dict = {} cannot find where this is used - default_scheduler = get_my_scheduler( - sample_sampler=args.sample_sampler, - v_parameterization=args.v_parameterization, - ) + default_scheduler = get_my_scheduler(sample_sampler=args.sample_sampler, v_parameterization=args.v_parameterization) pipeline = pipe_class( text_encoder=text_encoder, @@ -5975,21 +5985,18 @@ def sample_images_common( # clear pipeline and cache to reduce vram usage del pipeline - # I'm not sure which of these is the correct way to clear the memory, but accelerator's device is used in the pipeline, so I'm using it here. - # with torch.cuda.device(torch.cuda.current_device()): - # torch.cuda.empty_cache() - clean_memory_on_device(accelerator.device) - torch.set_rng_state(rng_state) if torch.cuda.is_available() and cuda_rng_state is not None: torch.cuda.set_rng_state(cuda_rng_state) vae.to(org_vae_device) + clean_memory_on_device(accelerator.device) + def sample_image_inference( accelerator: Accelerator, args: argparse.Namespace, - pipeline, + pipeline: Union[StableDiffusionLongPromptWeightingPipeline, SdxlStableDiffusionLongPromptWeightingPipeline], save_dir, prompt_dict, epoch, diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index 74dcff2af..583a27dcc 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -83,6 +83,7 @@ def train(args): tokenize_strategy = strategy_sdxl.SdxlTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + tokenizer1, tokenizer2 = tokenize_strategy.tokenizer1, tokenize_strategy.tokenizer2 # this is used for sampling images # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( @@ -436,19 +437,19 @@ def remove_model(old_ckpt_name): accelerator.print(f"removing old checkpoint: {old_ckpt_file}") os.remove(old_ckpt_file) - # # For --sample_at_first - # sdxl_train_util.sample_images( - # accelerator, - # args, - # 0, - # global_step, - # accelerator.device, - # vae, - # [tokenizer1, tokenizer2], - # [text_encoder1, text_encoder2], - # unet, - # controlnet=control_net, - # ) + # For --sample_at_first + sdxl_train_util.sample_images( + accelerator, + args, + 0, + global_step, + accelerator.device, + vae, + [tokenizer1, tokenizer2], + [text_encoder1, text_encoder2, unwrap_model(text_encoder2)], + unet, + controlnet=control_net, + ) # training loop for epoch in range(num_train_epochs): @@ -484,7 +485,7 @@ def remove_model(old_ckpt_name): input_ids1 = input_ids1.to(accelerator.device) input_ids2 = input_ids2.to(accelerator.device) encoder_hidden_states1, encoder_hidden_states2, pool2 = text_encoding_strategy.encode_tokens( - tokenize_strategy, [text_encoder1, text_encoder2], [input_ids1, input_ids2] + tokenize_strategy, [text_encoder1, text_encoder2, unwrap_model(text_encoder2)], [input_ids1, input_ids2] ) if args.full_fp16: encoder_hidden_states1 = encoder_hidden_states1.to(weight_dtype) @@ -558,18 +559,18 @@ def remove_model(old_ckpt_name): progress_bar.update(1) global_step += 1 - # sdxl_train_util.sample_images( - # accelerator, - # args, - # None, - # global_step, - # accelerator.device, - # vae, - # [tokenizer1, tokenizer2], - # [text_encoder1, text_encoder2], - # unet, - # controlnet=control_net, - # ) + sdxl_train_util.sample_images( + accelerator, + args, + None, + global_step, + accelerator.device, + vae, + [tokenizer1, tokenizer2], + [text_encoder1, text_encoder2, unwrap_model(text_encoder2)], + unet, + controlnet=control_net, + ) # 指定ステップごとにモデルを保存 if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: @@ -628,7 +629,7 @@ def remove_model(old_ckpt_name): accelerator.device, vae, [tokenizer1, tokenizer2], - [text_encoder1, text_encoder2], + [text_encoder1, text_encoder2, unwrap_model(text_encoder2)], unet, controlnet=control_net, ) From c2440f9e53239e7e5dee426f611800d3e38a7f0e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 3 Oct 2024 21:32:21 +0900 Subject: [PATCH 164/582] fix cond image normlization, add independent LR for control --- library/sdxl_train_util.py | 3 ++- library/train_util.py | 20 +++++++++++++++++++- sdxl_train_control_net.py | 30 +++++++++++++++++++++++++----- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/library/sdxl_train_util.py b/library/sdxl_train_util.py index f009b5779..aaf77b8dd 100644 --- a/library/sdxl_train_util.py +++ b/library/sdxl_train_util.py @@ -12,7 +12,6 @@ from tqdm import tqdm from transformers import CLIPTokenizer from library import model_util, sdxl_model_util, train_util, sdxl_original_unet -from library.sdxl_lpw_stable_diffusion import SdxlStableDiffusionLongPromptWeightingPipeline from .utils import setup_logging setup_logging() @@ -378,4 +377,6 @@ def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCachin def sample_images(*args, **kwargs): + from library.sdxl_lpw_stable_diffusion import SdxlStableDiffusionLongPromptWeightingPipeline + return train_util.sample_images_common(SdxlStableDiffusionLongPromptWeightingPipeline, *args, **kwargs) diff --git a/library/train_util.py b/library/train_util.py index b559616f2..07c253a0e 100644 --- a/library/train_util.py +++ 
b/library/train_util.py @@ -31,6 +31,7 @@ import subprocess from io import BytesIO import toml +# from concurrent.futures import ThreadPoolExecutor, as_completed from tqdm import tqdm @@ -912,6 +913,23 @@ def make_buckets(self): if info.image_size is None: info.image_size = self.get_image_size(info.absolute_path) + # # run in parallel + # max_workers = min(os.cpu_count(), len(self.image_data)) # TODO consider multi-gpu (processes) + # with ThreadPoolExecutor(max_workers) as executor: + # futures = [] + # for info in tqdm(self.image_data.values(), desc="loading image sizes"): + # if info.image_size is None: + # def get_and_set_image_size(info): + # info.image_size = self.get_image_size(info.absolute_path) + # futures.append(executor.submit(get_and_set_image_size, info)) + # # consume futures to reduce memory usage and prevent Ctrl-C hang + # if len(futures) >= max_workers: + # for future in futures: + # future.result() + # futures = [] + # for future in futures: + # future.result() + if self.enable_bucket: logger.info("make buckets") else: @@ -1826,7 +1844,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う captions = [] missing_captions = [] - for img_path in img_paths: + for img_path in tqdm(img_paths, desc="read caption"): cap_for_img = read_caption(img_path, subset.caption_extension, subset.enable_wildcard) if cap_for_img is None and subset.class_tokens is None: logger.warning( diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index 583a27dcc..b902cda69 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -253,11 +253,20 @@ def unwrap_model(model): # 学習に必要なクラスを準備する accelerator.print("prepare optimizer, data loader etc.") - trainable_params = list(control_net.parameters()) - # for p in trainable_params: - # p.requires_grad = True - logger.info(f"trainable params count: {len(trainable_params)}") - logger.info(f"number of trainable parameters: {sum(p.numel() for p in trainable_params if p.requires_grad)}") + trainable_params = [] + ctrlnet_params = [] + unet_params = [] + for name, param in control_net.named_parameters(): + if name.startswith("controlnet_"): + ctrlnet_params.append(param) + else: + unet_params.append(param) + trainable_params.append({"params": ctrlnet_params, "lr": args.control_net_lr}) + trainable_params.append({"params": unet_params, "lr": args.learning_rate}) + all_params = ctrlnet_params + unet_params + + logger.info(f"trainable params count: {len(all_params)}") + logger.info(f"number of trainable parameters: {sum(p.numel() for p in all_params)}") _, _, optimizer = train_util.get_optimizer(args, trainable_params) @@ -456,6 +465,8 @@ def remove_model(old_ckpt_name): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 + control_net.train() + for step, batch in enumerate(train_dataloader): current_step.value = global_step with accelerator.accumulate(control_net): @@ -510,6 +521,9 @@ def remove_model(old_ckpt_name): controlnet_image = batch["conditioning_images"].to(dtype=weight_dtype) + # '-1 to +1' to '0 to 1' + controlnet_image = (controlnet_image + 1) / 2 + with accelerator.autocast(): input_resi_add, mid_add = control_net( noisy_latents, timesteps, text_embedding, vector_embedding, controlnet_image @@ -690,6 +704,12 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) + parser.add_argument( + "--control_net_lr", + 
type=float, + default=1e-4, + help="learning rate for controlnet / controlnetの学習率", + ) return parser From 3028027e074c891f33d45fff27068b490a408329 Mon Sep 17 00:00:00 2001 From: gesen2egee Date: Fri, 4 Oct 2024 16:41:41 +0800 Subject: [PATCH 165/582] Update train_network.py --- train_network.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/train_network.py b/train_network.py index e10c17c0c..c0239a6da 100644 --- a/train_network.py +++ b/train_network.py @@ -1034,26 +1034,26 @@ def remove_model(old_ckpt_name): logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) accelerator.log(logs, step=global_step) - if len(val_dataloader) > 0: - if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: - accelerator.print("Validating バリデーション処理...") - total_loss = 0.0 - with torch.no_grad(): - validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - total_loss += loss.detach().item() - current_loss = total_loss / validation_steps - val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) - - if args.logging_dir is not None: - logs = {"loss/current_val_loss": current_loss} - accelerator.log(logs, step=global_step) - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/average_val_loss": avr_loss} - accelerator.log(logs, step=global_step) - + if len(val_dataloader) > 0: + if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: + accelerator.print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / validation_steps + val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + logs = {"loss/current_val_loss": current_loss} + accelerator.log(logs, step=global_step) + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/average_val_loss": avr_loss} + accelerator.log(logs, step=global_step) + if global_step >= args.max_train_steps: break From dece2c388f1c39e7baca201b4bf4e61d9f67a219 Mon Sep 17 00:00:00 2001 From: gesen2egee Date: Fri, 4 Oct 2024 16:43:07 +0800 Subject: [PATCH 166/582] Update train_db.py --- train_db.py | 164 ++++++++++++++++++++++++++-------------------------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/train_db.py b/train_db.py index 800a157bf..2c17e521f 100644 --- a/train_db.py +++ b/train_db.py @@ -46,67 +46,67 @@ # perlin_noise, def 
process_val_batch(*training_models, batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args): - total_loss = 0.0 - timesteps_list = [10, 350, 500, 650, 990] - - with accelerator.accumulate(*training_models): - with torch.no_grad(): - # latentに変換 - if cache_latents: - latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) - else: - latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() - latents = latents * 0.18215 - b_size = latents.shape[0] - - with torch.set_grad_enabled(False), accelerator.autocast(): - if args.weighted_captions: - encoder_hidden_states = get_weighted_text_embeddings( - tokenizer, - text_encoder, - batch["captions"], - accelerator.device, - args.max_token_length // 75 if args.max_token_length else 1, - clip_skip=args.clip_skip, - ) - else: - input_ids = batch["input_ids"].to(accelerator.device) - encoder_hidden_states = train_util.get_hidden_states( - args, input_ids, tokenizer, text_encoder, None if not args.full_fp16 else weight_dtype - ) - - # Sample noise, sample a random timestep for each image, and add noise to the latents, - # with noise offset and/or multires noise if specified - - for fixed_timesteps in timesteps_list: - with torch.set_grad_enabled(False), accelerator.autocast(): - # Sample noise, sample a random timestep for each image, and add noise to the latents, - # with noise offset and/or multires noise if specified - noise = torch.randn_like(latents, device=latents.device) - b_size = latents.shape[0] - timesteps = torch.full((b_size,), fixed_timesteps, dtype=torch.long, device=latents.device) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - # Predict the noise residual - with accelerator.autocast(): - noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - target = noise - - loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") - if args.masked_loss: - loss = apply_masked_loss(loss, batch) - loss = loss.mean([1, 2, 3]) - loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) - loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし - total_loss += loss - - average_loss = total_loss / len(timesteps_list) - return average_loss + total_loss = 0.0 + timesteps_list = [10, 350, 500, 650, 990] + + with accelerator.accumulate(*training_models): + with torch.no_grad(): + # latentに変換 + if cache_latents: + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) + else: + latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * 0.18215 + b_size = latents.shape[0] + + with torch.set_grad_enabled(False), accelerator.autocast(): + if args.weighted_captions: + encoder_hidden_states = get_weighted_text_embeddings( + tokenizer, + text_encoder, + batch["captions"], + accelerator.device, + args.max_token_length // 75 if args.max_token_length else 1, + clip_skip=args.clip_skip, + ) + else: + input_ids = batch["input_ids"].to(accelerator.device) + encoder_hidden_states = train_util.get_hidden_states( + args, input_ids, tokenizer, text_encoder, None if not args.full_fp16 else weight_dtype + ) + + # Sample noise, sample a random timestep for each image, and add noise to the latents, + # with noise offset and/or multires noise if specified + + for fixed_timesteps in timesteps_list: + with 
torch.set_grad_enabled(False), accelerator.autocast(): + # Sample noise, sample a random timestep for each image, and add noise to the latents, + # with noise offset and/or multires noise if specified + noise = torch.randn_like(latents, device=latents.device) + b_size = latents.shape[0] + timesteps = torch.full((b_size,), fixed_timesteps, dtype=torch.long, device=latents.device) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Predict the noise residual + with accelerator.autocast(): + noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise + + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + if args.masked_loss: + loss = apply_masked_loss(loss, batch) + loss = loss.mean([1, 2, 3]) + loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) + loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + total_loss += loss + + average_loss = total_loss / len(timesteps_list) + return average_loss def train(args): train_util.verify_training_args(args) @@ -210,8 +210,8 @@ def train(args): with torch.no_grad(): train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) if val_dataset_group is not None: - print("Cache validation latents...") - val_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) + print("Cache validation latents...") + val_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process) vae.to("cpu") clean_memory_on_device(accelerator.device) @@ -503,25 +503,25 @@ def train(args): avr_loss: float = loss_recorder.moving_average logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) - if len(val_dataloader) > 0: - if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: - accelerator.print("Validating バリデーション処理...") - total_loss = 0.0 - with torch.no_grad(): - validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - total_loss += loss.detach().item() - current_loss = total_loss / validation_steps - val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) - - if args.logging_dir is not None: - logs = {"loss/current_val_loss": current_loss} - accelerator.log(logs, step=global_step) - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/average_val_loss": avr_loss} - accelerator.log(logs, step=global_step) + if len(val_dataloader) > 0: + if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: + accelerator.print("Validating バリデーション処理...") + total_loss = 0.0 + with torch.no_grad(): + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if 
args.max_validation_steps is not None else len(val_dataloader) + for val_step in tqdm(range(validation_steps), desc='Validation Steps'): + batch = next(cyclic_val_dataloader) + loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + total_loss += loss.detach().item() + current_loss = total_loss / validation_steps + val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + logs = {"loss/current_val_loss": current_loss} + accelerator.log(logs, step=global_step) + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/average_val_loss": avr_loss} + accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break From ba08a898940c80a6551111fdd77b53c6d3a019ac Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 4 Oct 2024 20:35:16 +0900 Subject: [PATCH 167/582] call optimizer eval/train for sample_at_first, also set train after resuming closes #1667 --- flux_train.py | 2 ++ train_network.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/flux_train.py b/flux_train.py index 022467ea7..81c13e4cc 100644 --- a/flux_train.py +++ b/flux_train.py @@ -706,7 +706,9 @@ def optimizer_hook(parameter: torch.Tensor): accelerator.unwrap_model(flux).prepare_block_swap_before_forward() # For --sample_at_first + optimizer_eval_fn() flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs) + optimizer_train_fn() if len(accelerator.trackers) > 0: # log empty object to commit the sample images to wandb accelerator.log({}, step=0) diff --git a/train_network.py b/train_network.py index 7b2b76a1b..f0d397b9e 100644 --- a/train_network.py +++ b/train_network.py @@ -1042,7 +1042,9 @@ def remove_model(old_ckpt_name): text_encoder = None # For --sample_at_first + optimizer_eval_fn() self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) + optimizer_train_fn() if len(accelerator.trackers) > 0: # log empty object to commit the sample images to wandb accelerator.log({}, step=0) From 83e3048cb089bf6726751609da26da751b8383ae Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 6 Oct 2024 21:32:21 +0900 Subject: [PATCH 168/582] load Diffusers format, check schnell/dev --- README.md | 4 + flux_minimal_inference.py | 15 +-- flux_train.py | 15 ++- flux_train_network.py | 17 ++- library/flux_utils.py | 178 +++++++++++++++++++++++++++-- tools/convert_diffusers_to_flux.py | 78 +------------ 6 files changed, 196 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index 789fe514a..c567758a5 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Oct 6, 2024: +- In FLUX.1 LoRA training and fine-tuning, the specified weight file (*.safetensors) is automatically determined to be dev or schnell. This allows schnell models to be loaded correctly. Note that LoRA training with schnell models and fine-tuning with schnell models are unverified. +- FLUX.1 LoRA training and fine-tuning can now load weights in Diffusers format in addition to BFL format (a single *.safetensors file). Please specify the parent directory of `transformer` or `diffusion_pytorch_model-00001-of-00003.safetensors` with the full path. However, Diffusers format CLIP/T5XXL is not supported. Saving is supported only in BFL format. 
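An illustrative sketch (editorial note, not part of this patch) of the loading behavior described in the note above, using the helpers this commit adds to `library/flux_utils.py`; the model paths below are placeholders:

```python
import torch
from library import flux_utils

# Either a single BFL-format file or the parent directory of "transformer"
# in a Diffusers-format checkout (both paths are placeholders).
bfl_path = "/path/to/flux1-dev.safetensors"
diffusers_dir = "/path/to/FLUX.1-dev"

for path in (bfl_path, diffusers_dir):
    # returns (is_diffusers, is_schnell, resolved checkpoint file list)
    is_diffusers, is_schnell, ckpt_paths = flux_utils.check_flux_state_dict_diffusers_schnell(path)
    print(f"{path}: diffusers={is_diffusers}, schnell={is_schnell}, files={len(ckpt_paths)}")

# load_flow_model now detects the format itself and returns (is_schnell, model);
# Diffusers keys are converted to BFL keys internally before loading.
is_schnell, flux = flux_utils.load_flow_model(diffusers_dir, torch.bfloat16, "cpu")
```

The same path (single `*.safetensors` file or Diffusers directory) is what `--pretrained_model_name_or_path` receives in `flux_train.py` and `flux_train_network.py`, as the accompanying diffs show.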
+ Sep 26, 2024: The implementation of block swap during FLUX.1 fine-tuning has been changed to improve speed about 10% (depends on the environment). A new `--blocks_to_swap` option has been added, and `--double_blocks_to_swap` and `--single_blocks_to_swap` are deprecated. `--double_blocks_to_swap` and `--single_blocks_to_swap` are working as before, but they will be removed in the future. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index 2f1b9a377..7ab224f1b 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -419,9 +419,6 @@ def encode(prpt: str): steps = args.steps guidance_scale = args.guidance - name = "schnell" if "schnell" in args.ckpt_path else "dev" # TODO change this to a more robust way - is_schnell = name == "schnell" - def is_fp8(dt): return dt in [torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz] @@ -455,12 +452,8 @@ def is_fp8(dt): # if is_fp8(t5xxl_dtype): # t5xxl = accelerator.prepare(t5xxl) - t5xxl_max_length = 256 if is_schnell else 512 - tokenize_strategy = strategy_flux.FluxTokenizeStrategy(t5xxl_max_length) - encoding_strategy = strategy_flux.FluxTextEncodingStrategy() - # DiT - model = flux_utils.load_flow_model(name, args.ckpt_path, None, loading_device) + is_schnell, model = flux_utils.load_flow_model(args.ckpt_path, None, loading_device) model.eval() logger.info(f"Casting model to {flux_dtype}") model.to(flux_dtype) # make sure model is dtype @@ -469,8 +462,12 @@ def is_fp8(dt): # if args.offload: # model = model.to("cpu") + t5xxl_max_length = 256 if is_schnell else 512 + tokenize_strategy = strategy_flux.FluxTokenizeStrategy(t5xxl_max_length) + encoding_strategy = strategy_flux.FluxTextEncodingStrategy() + # AE - ae = flux_utils.load_ae(name, args.ae, ae_dtype, loading_device) + ae = flux_utils.load_ae(args.ae, ae_dtype, loading_device) ae.eval() # if is_fp8(ae_dtype): # ae = accelerator.prepare(ae) diff --git a/flux_train.py b/flux_train.py index 81c13e4cc..ecc87c0a8 100644 --- a/flux_train.py +++ b/flux_train.py @@ -137,6 +137,7 @@ def train(args): train_dataset_group.verify_bucket_reso_steps(16) # TODO これでいいか確認 + _, is_schnell, _ = flux_utils.check_flux_state_dict_diffusers_schnell(args.pretrained_model_name_or_path) if args.debug_dataset: if args.cache_text_encoder_outputs: strategy_base.TextEncoderOutputsCachingStrategy.set_strategy( @@ -144,9 +145,8 @@ def train(args): args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, False, False ) ) - name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" t5xxl_max_token_length = ( - args.t5xxl_max_token_length if args.t5xxl_max_token_length is not None else (256 if name == "schnell" else 512) + args.t5xxl_max_token_length if args.t5xxl_max_token_length is not None else (256 if is_schnell else 512) ) strategy_base.TokenizeStrategy.set_strategy(strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length)) @@ -177,12 +177,11 @@ def train(args): weight_dtype, save_dtype = train_util.prepare_dtype(args) # モデルを読み込む - name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" # load VAE for caching latents ae = None if cache_latents: - ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors) + ae = flux_utils.load_ae( args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors) ae.to(accelerator.device, dtype=weight_dtype) ae.requires_grad_(False) ae.eval() @@ -196,7 +195,7 @@ def 
train(args): # prepare tokenize strategy if args.t5xxl_max_token_length is None: - if name == "schnell": + if is_schnell: t5xxl_max_token_length = 256 else: t5xxl_max_token_length = 512 @@ -258,8 +257,8 @@ def train(args): clean_memory_on_device(accelerator.device) # load FLUX - flux = flux_utils.load_flow_model( - name, args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors + _, flux = flux_utils.load_flow_model( + args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors ) if args.gradient_checkpointing: @@ -294,7 +293,7 @@ def train(args): if not cache_latents: # load VAE here if not cached - ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu") + ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu") ae.requires_grad_(False) ae.eval() ae.to(accelerator.device, dtype=weight_dtype) diff --git a/flux_train_network.py b/flux_train_network.py index 65b121e7c..5d14bd28e 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -2,7 +2,7 @@ import copy import math import random -from typing import Any +from typing import Any, Optional import torch from accelerate import Accelerator @@ -24,6 +24,7 @@ class FluxNetworkTrainer(train_network.NetworkTrainer): def __init__(self): super().__init__() self.sample_prompts_te_outputs = None + self.is_schnell: Optional[bool] = None def assert_extra_args(self, args, train_dataset_group): super().assert_extra_args(args, train_dataset_group) @@ -57,19 +58,15 @@ def assert_extra_args(self, args, train_dataset_group): train_dataset_group.verify_bucket_reso_steps(32) # TODO check this - def get_flux_model_name(self, args): - return "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev" - def load_target_model(self, args, weight_dtype, accelerator): # currently offload to cpu for some models - name = self.get_flux_model_name(args) # if the file is fp8 and we are using fp8_base, we can load it as is (fp8) loading_dtype = None if args.fp8_base else weight_dtype # if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future - model = flux_utils.load_flow_model( - name, args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors + self.is_schnell, model = flux_utils.load_flow_model( + args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors ) if args.fp8_base: # check dtype of model @@ -100,7 +97,7 @@ def load_target_model(self, args, weight_dtype, accelerator): elif t5xxl.dtype == torch.float8_e4m3fn: logger.info("Loaded fp8 T5XXL model") - ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model @@ -142,10 +139,10 @@ def prepare_split_model(self, model, weight_dtype, accelerator): return flux_lower def get_tokenize_strategy(self, args): - name = self.get_flux_model_name(args) + _, is_schnell, _ = flux_utils.check_flux_state_dict_diffusers_schnell(args.pretrained_model_name_or_path) if args.t5xxl_max_token_length is None: - if name == "schnell": + if is_schnell: t5xxl_max_token_length = 256 else: t5xxl_max_token_length = 512 diff --git a/library/flux_utils.py b/library/flux_utils.py index 7b0a41a8a..713814e28 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -1,9 +1,11 @@ import json -from typing import 
Optional, Union +import os +from typing import List, Optional, Tuple, Union import einops import torch from safetensors.torch import load_file +from safetensors import safe_open from accelerate import init_empty_weights from transformers import CLIPTextModel, CLIPConfig, T5EncoderModel, T5Config @@ -17,6 +19,8 @@ logger = logging.getLogger(__name__) MODEL_VERSION_FLUX_V1 = "flux1" +MODEL_NAME_DEV = "dev" +MODEL_NAME_SCHNELL = "schnell" # temporary copy from sd3_utils TODO refactor @@ -39,10 +43,35 @@ def load_safetensors( return load_file(path) # prevent device invalid Error +def check_flux_state_dict_diffusers_schnell(ckpt_path: str) -> Tuple[bool, bool, List[str]]: + # check the state dict: Diffusers or BFL, dev or schnell + logger.info(f"Checking the state dict: Diffusers or BFL, dev or schnell") + + if os.path.isdir(ckpt_path): # if ckpt_path is a directory, it is Diffusers + ckpt_path = os.path.join(ckpt_path, "transformer", "diffusion_pytorch_model-00001-of-00003.safetensors") + if "00001-of-00003" in ckpt_path: + ckpt_paths = [ckpt_path.replace("00001-of-00003", f"0000{i}-of-00003") for i in range(1, 4)] + else: + ckpt_paths = [ckpt_path] + + keys = [] + for ckpt_path in ckpt_paths: + with safe_open(ckpt_path, framework="pt") as f: + keys.extend(f.keys()) + + is_diffusers = "transformer_blocks.0.attn.add_k_proj.bias" in keys + is_schnell = not ("guidance_in.in_layer.bias" in keys or "time_text_embed.guidance_embedder.linear_1.bias" in keys) + return is_diffusers, is_schnell, ckpt_paths + + def load_flow_model( - name: str, ckpt_path: str, dtype: Optional[torch.dtype], device: Union[str, torch.device], disable_mmap: bool = False -) -> flux_models.Flux: - logger.info(f"Building Flux model {name}") + ckpt_path: str, dtype: Optional[torch.dtype], device: Union[str, torch.device], disable_mmap: bool = False +) -> Tuple[bool, flux_models.Flux]: + is_diffusers, is_schnell, ckpt_paths = check_flux_state_dict_diffusers_schnell(ckpt_path) + name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL + + # build model + logger.info(f"Building Flux model {name} from {'Diffusers' if is_diffusers else 'BFL'} checkpoint") with torch.device("meta"): model = flux_models.Flux(flux_models.configs[name].params) if dtype is not None: @@ -50,18 +79,28 @@ def load_flow_model( # load_sft doesn't support torch.device logger.info(f"Loading state dict from {ckpt_path}") - sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) + sd = {} + for ckpt_path in ckpt_paths: + sd.update(load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)) + + # convert Diffusers to BFL + if is_diffusers: + logger.info("Converting Diffusers to BFL") + sd = convert_diffusers_sd_to_bfl(sd) + logger.info("Converted Diffusers to BFL") + info = model.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded Flux: {info}") - return model + return is_schnell, model def load_ae( - name: str, ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False + ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False ) -> flux_models.AutoEncoder: logger.info("Building AutoEncoder") with torch.device("meta"): - ae = flux_models.AutoEncoder(flux_models.configs[name].ae_params).to(dtype) + # dev and schnell have the same AE params + ae = flux_models.AutoEncoder(flux_models.configs[MODEL_NAME_DEV].ae_params).to(dtype) logger.info(f"Loading state dict from {ckpt_path}") sd = 
load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) @@ -246,3 +285,126 @@ def pack_latents(x: torch.Tensor) -> torch.Tensor: """ x = einops.rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) return x + + +# region Diffusers + +NUM_DOUBLE_BLOCKS = 19 +NUM_SINGLE_BLOCKS = 38 + +BFL_TO_DIFFUSERS_MAP = { + "time_in.in_layer.weight": ["time_text_embed.timestep_embedder.linear_1.weight"], + "time_in.in_layer.bias": ["time_text_embed.timestep_embedder.linear_1.bias"], + "time_in.out_layer.weight": ["time_text_embed.timestep_embedder.linear_2.weight"], + "time_in.out_layer.bias": ["time_text_embed.timestep_embedder.linear_2.bias"], + "vector_in.in_layer.weight": ["time_text_embed.text_embedder.linear_1.weight"], + "vector_in.in_layer.bias": ["time_text_embed.text_embedder.linear_1.bias"], + "vector_in.out_layer.weight": ["time_text_embed.text_embedder.linear_2.weight"], + "vector_in.out_layer.bias": ["time_text_embed.text_embedder.linear_2.bias"], + "guidance_in.in_layer.weight": ["time_text_embed.guidance_embedder.linear_1.weight"], + "guidance_in.in_layer.bias": ["time_text_embed.guidance_embedder.linear_1.bias"], + "guidance_in.out_layer.weight": ["time_text_embed.guidance_embedder.linear_2.weight"], + "guidance_in.out_layer.bias": ["time_text_embed.guidance_embedder.linear_2.bias"], + "txt_in.weight": ["context_embedder.weight"], + "txt_in.bias": ["context_embedder.bias"], + "img_in.weight": ["x_embedder.weight"], + "img_in.bias": ["x_embedder.bias"], + "double_blocks.().img_mod.lin.weight": ["norm1.linear.weight"], + "double_blocks.().img_mod.lin.bias": ["norm1.linear.bias"], + "double_blocks.().txt_mod.lin.weight": ["norm1_context.linear.weight"], + "double_blocks.().txt_mod.lin.bias": ["norm1_context.linear.bias"], + "double_blocks.().img_attn.qkv.weight": ["attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight"], + "double_blocks.().img_attn.qkv.bias": ["attn.to_q.bias", "attn.to_k.bias", "attn.to_v.bias"], + "double_blocks.().txt_attn.qkv.weight": ["attn.add_q_proj.weight", "attn.add_k_proj.weight", "attn.add_v_proj.weight"], + "double_blocks.().txt_attn.qkv.bias": ["attn.add_q_proj.bias", "attn.add_k_proj.bias", "attn.add_v_proj.bias"], + "double_blocks.().img_attn.norm.query_norm.scale": ["attn.norm_q.weight"], + "double_blocks.().img_attn.norm.key_norm.scale": ["attn.norm_k.weight"], + "double_blocks.().txt_attn.norm.query_norm.scale": ["attn.norm_added_q.weight"], + "double_blocks.().txt_attn.norm.key_norm.scale": ["attn.norm_added_k.weight"], + "double_blocks.().img_mlp.0.weight": ["ff.net.0.proj.weight"], + "double_blocks.().img_mlp.0.bias": ["ff.net.0.proj.bias"], + "double_blocks.().img_mlp.2.weight": ["ff.net.2.weight"], + "double_blocks.().img_mlp.2.bias": ["ff.net.2.bias"], + "double_blocks.().txt_mlp.0.weight": ["ff_context.net.0.proj.weight"], + "double_blocks.().txt_mlp.0.bias": ["ff_context.net.0.proj.bias"], + "double_blocks.().txt_mlp.2.weight": ["ff_context.net.2.weight"], + "double_blocks.().txt_mlp.2.bias": ["ff_context.net.2.bias"], + "double_blocks.().img_attn.proj.weight": ["attn.to_out.0.weight"], + "double_blocks.().img_attn.proj.bias": ["attn.to_out.0.bias"], + "double_blocks.().txt_attn.proj.weight": ["attn.to_add_out.weight"], + "double_blocks.().txt_attn.proj.bias": ["attn.to_add_out.bias"], + "single_blocks.().modulation.lin.weight": ["norm.linear.weight"], + "single_blocks.().modulation.lin.bias": ["norm.linear.bias"], + "single_blocks.().linear1.weight": ["attn.to_q.weight", "attn.to_k.weight", 
"attn.to_v.weight", "proj_mlp.weight"], + "single_blocks.().linear1.bias": ["attn.to_q.bias", "attn.to_k.bias", "attn.to_v.bias", "proj_mlp.bias"], + "single_blocks.().linear2.weight": ["proj_out.weight"], + "single_blocks.().norm.query_norm.scale": ["attn.norm_q.weight"], + "single_blocks.().norm.key_norm.scale": ["attn.norm_k.weight"], + "single_blocks.().linear2.weight": ["proj_out.weight"], + "single_blocks.().linear2.bias": ["proj_out.bias"], + "final_layer.linear.weight": ["proj_out.weight"], + "final_layer.linear.bias": ["proj_out.bias"], + "final_layer.adaLN_modulation.1.weight": ["norm_out.linear.weight"], + "final_layer.adaLN_modulation.1.bias": ["norm_out.linear.bias"], +} + + +def make_diffusers_to_bfl_map() -> dict[str, tuple[int, str]]: + # make reverse map from diffusers map + diffusers_to_bfl_map = {} # key: diffusers_key, value: (index, bfl_key) + for b in range(NUM_DOUBLE_BLOCKS): + for key, weights in BFL_TO_DIFFUSERS_MAP.items(): + if key.startswith("double_blocks."): + block_prefix = f"transformer_blocks.{b}." + for i, weight in enumerate(weights): + diffusers_to_bfl_map[f"{block_prefix}{weight}"] = (i, key.replace("()", f"{b}")) + for b in range(NUM_SINGLE_BLOCKS): + for key, weights in BFL_TO_DIFFUSERS_MAP.items(): + if key.startswith("single_blocks."): + block_prefix = f"single_transformer_blocks.{b}." + for i, weight in enumerate(weights): + diffusers_to_bfl_map[f"{block_prefix}{weight}"] = (i, key.replace("()", f"{b}")) + for key, weights in BFL_TO_DIFFUSERS_MAP.items(): + if not (key.startswith("double_blocks.") or key.startswith("single_blocks.")): + for i, weight in enumerate(weights): + diffusers_to_bfl_map[weight] = (i, key) + return diffusers_to_bfl_map + + +def convert_diffusers_sd_to_bfl(diffusers_sd: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + diffusers_to_bfl_map = make_diffusers_to_bfl_map() + + # iterate over three safetensors files to reduce memory usage + flux_sd = {} + for diffusers_key, tensor in diffusers_sd.items(): + if diffusers_key in diffusers_to_bfl_map: + index, bfl_key = diffusers_to_bfl_map[diffusers_key] + if bfl_key not in flux_sd: + flux_sd[bfl_key] = [] + flux_sd[bfl_key].append((index, tensor)) + else: + logger.error(f"Error: Key not found in diffusers_to_bfl_map: {diffusers_key}") + raise KeyError(f"Key not found in diffusers_to_bfl_map: {diffusers_key}") + + # concat tensors if multiple tensors are mapped to a single key, sort by index + for key, values in flux_sd.items(): + if len(values) == 1: + flux_sd[key] = values[0][1] + else: + flux_sd[key] = torch.cat([value[1] for value in sorted(values, key=lambda x: x[0])]) + + # special case for final_layer.adaLN_modulation.1.weight and final_layer.adaLN_modulation.1.bias + def swap_scale_shift(weight): + shift, scale = weight.chunk(2, dim=0) + new_weight = torch.cat([scale, shift], dim=0) + return new_weight + + if "final_layer.adaLN_modulation.1.weight" in flux_sd: + flux_sd["final_layer.adaLN_modulation.1.weight"] = swap_scale_shift(flux_sd["final_layer.adaLN_modulation.1.weight"]) + if "final_layer.adaLN_modulation.1.bias" in flux_sd: + flux_sd["final_layer.adaLN_modulation.1.bias"] = swap_scale_shift(flux_sd["final_layer.adaLN_modulation.1.bias"]) + + return flux_sd + + +# endregion diff --git a/tools/convert_diffusers_to_flux.py b/tools/convert_diffusers_to_flux.py index 9d8f7c74b..65ba7321a 100644 --- a/tools/convert_diffusers_to_flux.py +++ b/tools/convert_diffusers_to_flux.py @@ -29,6 +29,7 @@ import torch from tqdm import tqdm +from library import flux_utils from 
library.utils import setup_logging, str_to_dtype, MemoryEfficientSafeOpen, mem_eff_save_file setup_logging() @@ -36,65 +37,6 @@ logger = logging.getLogger(__name__) -NUM_DOUBLE_BLOCKS = 19 -NUM_SINGLE_BLOCKS = 38 - -BFL_TO_DIFFUSERS_MAP = { - "time_in.in_layer.weight": ["time_text_embed.timestep_embedder.linear_1.weight"], - "time_in.in_layer.bias": ["time_text_embed.timestep_embedder.linear_1.bias"], - "time_in.out_layer.weight": ["time_text_embed.timestep_embedder.linear_2.weight"], - "time_in.out_layer.bias": ["time_text_embed.timestep_embedder.linear_2.bias"], - "vector_in.in_layer.weight": ["time_text_embed.text_embedder.linear_1.weight"], - "vector_in.in_layer.bias": ["time_text_embed.text_embedder.linear_1.bias"], - "vector_in.out_layer.weight": ["time_text_embed.text_embedder.linear_2.weight"], - "vector_in.out_layer.bias": ["time_text_embed.text_embedder.linear_2.bias"], - "guidance_in.in_layer.weight": ["time_text_embed.guidance_embedder.linear_1.weight"], - "guidance_in.in_layer.bias": ["time_text_embed.guidance_embedder.linear_1.bias"], - "guidance_in.out_layer.weight": ["time_text_embed.guidance_embedder.linear_2.weight"], - "guidance_in.out_layer.bias": ["time_text_embed.guidance_embedder.linear_2.bias"], - "txt_in.weight": ["context_embedder.weight"], - "txt_in.bias": ["context_embedder.bias"], - "img_in.weight": ["x_embedder.weight"], - "img_in.bias": ["x_embedder.bias"], - "double_blocks.().img_mod.lin.weight": ["norm1.linear.weight"], - "double_blocks.().img_mod.lin.bias": ["norm1.linear.bias"], - "double_blocks.().txt_mod.lin.weight": ["norm1_context.linear.weight"], - "double_blocks.().txt_mod.lin.bias": ["norm1_context.linear.bias"], - "double_blocks.().img_attn.qkv.weight": ["attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight"], - "double_blocks.().img_attn.qkv.bias": ["attn.to_q.bias", "attn.to_k.bias", "attn.to_v.bias"], - "double_blocks.().txt_attn.qkv.weight": ["attn.add_q_proj.weight", "attn.add_k_proj.weight", "attn.add_v_proj.weight"], - "double_blocks.().txt_attn.qkv.bias": ["attn.add_q_proj.bias", "attn.add_k_proj.bias", "attn.add_v_proj.bias"], - "double_blocks.().img_attn.norm.query_norm.scale": ["attn.norm_q.weight"], - "double_blocks.().img_attn.norm.key_norm.scale": ["attn.norm_k.weight"], - "double_blocks.().txt_attn.norm.query_norm.scale": ["attn.norm_added_q.weight"], - "double_blocks.().txt_attn.norm.key_norm.scale": ["attn.norm_added_k.weight"], - "double_blocks.().img_mlp.0.weight": ["ff.net.0.proj.weight"], - "double_blocks.().img_mlp.0.bias": ["ff.net.0.proj.bias"], - "double_blocks.().img_mlp.2.weight": ["ff.net.2.weight"], - "double_blocks.().img_mlp.2.bias": ["ff.net.2.bias"], - "double_blocks.().txt_mlp.0.weight": ["ff_context.net.0.proj.weight"], - "double_blocks.().txt_mlp.0.bias": ["ff_context.net.0.proj.bias"], - "double_blocks.().txt_mlp.2.weight": ["ff_context.net.2.weight"], - "double_blocks.().txt_mlp.2.bias": ["ff_context.net.2.bias"], - "double_blocks.().img_attn.proj.weight": ["attn.to_out.0.weight"], - "double_blocks.().img_attn.proj.bias": ["attn.to_out.0.bias"], - "double_blocks.().txt_attn.proj.weight": ["attn.to_add_out.weight"], - "double_blocks.().txt_attn.proj.bias": ["attn.to_add_out.bias"], - "single_blocks.().modulation.lin.weight": ["norm.linear.weight"], - "single_blocks.().modulation.lin.bias": ["norm.linear.bias"], - "single_blocks.().linear1.weight": ["attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight", "proj_mlp.weight"], - "single_blocks.().linear1.bias": ["attn.to_q.bias", "attn.to_k.bias", 
"attn.to_v.bias", "proj_mlp.bias"], - "single_blocks.().linear2.weight": ["proj_out.weight"], - "single_blocks.().norm.query_norm.scale": ["attn.norm_q.weight"], - "single_blocks.().norm.key_norm.scale": ["attn.norm_k.weight"], - "single_blocks.().linear2.weight": ["proj_out.weight"], - "single_blocks.().linear2.bias": ["proj_out.bias"], - "final_layer.linear.weight": ["proj_out.weight"], - "final_layer.linear.bias": ["proj_out.bias"], - "final_layer.adaLN_modulation.1.weight": ["norm_out.linear.weight"], - "final_layer.adaLN_modulation.1.bias": ["norm_out.linear.bias"], -} - def convert(args): # if diffusers_path is folder, get safetensors file @@ -114,23 +56,7 @@ def convert(args): save_dtype = str_to_dtype(args.save_precision) if args.save_precision is not None else None # make reverse map from diffusers map - diffusers_to_bfl_map = {} # key: diffusers_key, value: (index, bfl_key) - for b in range(NUM_DOUBLE_BLOCKS): - for key, weights in BFL_TO_DIFFUSERS_MAP.items(): - if key.startswith("double_blocks."): - block_prefix = f"transformer_blocks.{b}." - for i, weight in enumerate(weights): - diffusers_to_bfl_map[f"{block_prefix}{weight}"] = (i, key.replace("()", f"{b}")) - for b in range(NUM_SINGLE_BLOCKS): - for key, weights in BFL_TO_DIFFUSERS_MAP.items(): - if key.startswith("single_blocks."): - block_prefix = f"single_transformer_blocks.{b}." - for i, weight in enumerate(weights): - diffusers_to_bfl_map[f"{block_prefix}{weight}"] = (i, key.replace("()", f"{b}")) - for key, weights in BFL_TO_DIFFUSERS_MAP.items(): - if not (key.startswith("double_blocks.") or key.startswith("single_blocks.")): - for i, weight in enumerate(weights): - diffusers_to_bfl_map[weight] = (i, key) + diffusers_to_bfl_map = flux_utils.make_diffusers_to_bfl_map() # iterate over three safetensors files to reduce memory usage flux_sd = {} From 886f75345c95cddec8752ffdd4e60a471ee75403 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 10 Oct 2024 08:27:15 +0900 Subject: [PATCH 169/582] support weighted captions for sdxl LoRA and fine tuning --- library/strategy_base.py | 5 ++++- library/strategy_sdxl.py | 3 ++- sdxl_train.py | 38 ++++++++++++++++++++------------------ sdxl_train_control_net.py | 7 ++----- train_network.py | 27 +++++++++++++++++---------- 5 files changed, 45 insertions(+), 35 deletions(-) diff --git a/library/strategy_base.py b/library/strategy_base.py index 10820afa1..7981bd0b9 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -74,6 +74,9 @@ def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: raise NotImplementedError def tokenize_with_weights(self, text: Union[str, List[str]]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + """ + returns: [tokens1, tokens2, ...], [weights1, weights2, ...] 
+ """ raise NotImplementedError def _get_weighted_input_ids( @@ -303,7 +306,7 @@ def encode_tokens( :return: list of output embeddings for each architecture """ raise NotImplementedError - + def encode_tokens_with_weights( self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor], weights: List[torch.Tensor] ) -> List[torch.Tensor]: diff --git a/library/strategy_sdxl.py b/library/strategy_sdxl.py index b48e6d55a..6650e2b43 100644 --- a/library/strategy_sdxl.py +++ b/library/strategy_sdxl.py @@ -174,7 +174,8 @@ def encode_tokens( """ Args: tokenize_strategy: TokenizeStrategy - models: List of models, [text_encoder1, text_encoder2, unwrapped text_encoder2 (optional)] + models: List of models, [text_encoder1, text_encoder2, unwrapped text_encoder2 (optional)]. + If text_encoder2 is wrapped by accelerate, unwrapped_text_encoder2 is required tokens: List of tokens, for text_encoder1 and text_encoder2 """ if len(models) == 2: diff --git a/sdxl_train.py b/sdxl_train.py index 7291ddd2f..320169d77 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -104,8 +104,8 @@ def train(args): setup_logging(args, reset=True) assert ( - not args.weighted_captions - ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" + not args.weighted_captions or not args.cache_text_encoder_outputs + ), "weighted_captions is not supported when caching text encoder outputs / cache_text_encoder_outputsを使うときはweighted_captionsはサポートされていません" assert ( not args.train_text_encoder or not args.cache_text_encoder_outputs ), "cache_text_encoder_outputs is not supported when training text encoder / text encoderを学習するときはcache_text_encoder_outputsはサポートされていません" @@ -660,22 +660,24 @@ def optimizer_hook(parameter: torch.Tensor): input_ids1, input_ids2 = batch["input_ids_list"] with torch.set_grad_enabled(args.train_text_encoder): # Get the text embedding for conditioning - # TODO support weighted captions - # if args.weighted_captions: - # encoder_hidden_states = get_weighted_text_embeddings( - # tokenizer, - # text_encoder, - # batch["captions"], - # accelerator.device, - # args.max_token_length // 75 if args.max_token_length else 1, - # clip_skip=args.clip_skip, - # ) - # else: - input_ids1 = input_ids1.to(accelerator.device) - input_ids2 = input_ids2.to(accelerator.device) - encoder_hidden_states1, encoder_hidden_states2, pool2 = text_encoding_strategy.encode_tokens( - tokenize_strategy, [text_encoder1, text_encoder2], [input_ids1, input_ids2] - ) + if args.weighted_captions: + input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch["captions"]) + encoder_hidden_states1, encoder_hidden_states2, pool2 = ( + text_encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, + [text_encoder1, text_encoder2, accelerator.unwrap_model(text_encoder2)], + input_ids_list, + weights_list, + ) + ) + else: + input_ids1 = input_ids1.to(accelerator.device) + input_ids2 = input_ids2.to(accelerator.device) + encoder_hidden_states1, encoder_hidden_states2, pool2 = text_encoding_strategy.encode_tokens( + tokenize_strategy, + [text_encoder1, text_encoder2, accelerator.unwrap_model(text_encoder2)], + [input_ids1, input_ids2], + ) if args.full_fp16: encoder_hidden_states1 = encoder_hidden_states1.to(weight_dtype) encoder_hidden_states2 = encoder_hidden_states2.to(weight_dtype) diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index b902cda69..f6cc5a4f9 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -12,24 +12,21 @@ init_ipex() -from 
torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed from accelerate import init_empty_weights -from diffusers import DDPMScheduler, ControlNetModel +from diffusers import DDPMScheduler from diffusers.utils.torch_utils import is_compiled_module from safetensors.torch import load_file from library import ( deepspeed_utils, sai_model_spec, sdxl_model_util, - sdxl_original_unet, sdxl_train_util, strategy_base, strategy_sd, strategy_sdxl, ) -import library.model_util as model_util import library.train_util as train_util import library.config_util as config_util from library.config_util import ( @@ -264,7 +261,7 @@ def unwrap_model(model): trainable_params.append({"params": ctrlnet_params, "lr": args.control_net_lr}) trainable_params.append({"params": unet_params, "lr": args.learning_rate}) all_params = ctrlnet_params + unet_params - + logger.info(f"trainable params count: {len(all_params)}") logger.info(f"number of trainable parameters: {sum(p.numel() for p in all_params)}") diff --git a/train_network.py b/train_network.py index f0d397b9e..e48e6a070 100644 --- a/train_network.py +++ b/train_network.py @@ -1123,14 +1123,21 @@ def remove_model(old_ckpt_name): with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: - # SD only - encoded_text_encoder_conds = get_weighted_text_embeddings( - tokenizers[0], - text_encoder, - batch["captions"], - accelerator.device, - args.max_token_length // 75 if args.max_token_length else 1, - clip_skip=args.clip_skip, + # # SD only + # encoded_text_encoder_conds = get_weighted_text_embeddings( + # tokenizers[0], + # text_encoder, + # batch["captions"], + # accelerator.device, + # args.max_token_length // 75 if args.max_token_length else 1, + # clip_skip=args.clip_skip, + # ) + input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch["captions"]) + encoded_text_encoder_conds = text_encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, + self.get_models_for_text_encoding(args, accelerator, text_encoders), + input_ids_list, + weights_list, ) else: input_ids = [ids.to(accelerator.device) for ids in batch["input_ids_list"]] @@ -1139,8 +1146,8 @@ def remove_model(old_ckpt_name): self.get_models_for_text_encoding(args, accelerator, text_encoders), input_ids, ) - if args.full_fp16: - encoded_text_encoder_conds = [c.to(weight_dtype) for c in encoded_text_encoder_conds] + if args.full_fp16: + encoded_text_encoder_conds = [c.to(weight_dtype) for c in encoded_text_encoder_conds] # if text_encoder_conds is not cached, use encoded_text_encoder_conds if len(text_encoder_conds) == 0: From 3de42b6edb151b172f483aec99fe380b1406a84a Mon Sep 17 00:00:00 2001 From: Akegarasu Date: Thu, 10 Oct 2024 14:03:59 +0800 Subject: [PATCH 170/582] fix: distributed training in windows --- library/train_util.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index e023f63a2..3dabf9e26 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5045,17 +5045,18 @@ def prepare_accelerator(args: argparse.Namespace): if args.torch_compile: dynamo_backend = args.dynamo_backend - kwargs_handlers = ( - InitProcessGroupKwargs(timeout=datetime.timedelta(minutes=args.ddp_timeout)) if args.ddp_timeout else None, - ( - DistributedDataParallelKwargs( - gradient_as_bucket_view=args.ddp_gradient_as_bucket_view, static_graph=args.ddp_static_graph - ) - if 
args.ddp_gradient_as_bucket_view or args.ddp_static_graph - else None - ), - ) - kwargs_handlers = list(filter(lambda x: x is not None, kwargs_handlers)) + kwargs_handlers = [ + InitProcessGroupKwargs( + backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + init_method="env://?use_libuv=False" if os.name == "nt" else None, + timeout=datetime.timedelta(minutes=args.ddp_timeout) if args.ddp_timeout else None + ) if torch.cuda.device_count() > 1 else None, + DistributedDataParallelKwargs( + gradient_as_bucket_view=args.ddp_gradient_as_bucket_view, + static_graph=args.ddp_static_graph + ) if args.ddp_gradient_as_bucket_view or args.ddp_static_graph else None + ] + kwargs_handlers = [i for i in kwargs_handlers if i is not None] deepspeed_plugin = deepspeed_utils.prepare_deepspeed_plugin(args) accelerator = Accelerator( From 9f4dac5731fe2299c75b7671c6132febd57a4117 Mon Sep 17 00:00:00 2001 From: Akegarasu Date: Thu, 10 Oct 2024 14:08:55 +0800 Subject: [PATCH 171/582] torch 2.4 --- library/train_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 3dabf9e26..2c20a9244 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -33,6 +33,7 @@ import toml from tqdm import tqdm +from packaging.version import Version import torch from library.device_utils import init_ipex, clean_memory_on_device @@ -5048,7 +5049,7 @@ def prepare_accelerator(args: argparse.Namespace): kwargs_handlers = [ InitProcessGroupKwargs( backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", - init_method="env://?use_libuv=False" if os.name == "nt" else None, + init_method="env://?use_libuv=False" if os.name == "nt" and Version(torch.__version__) >= Version("2.4.0") else None, timeout=datetime.timedelta(minutes=args.ddp_timeout) if args.ddp_timeout else None ) if torch.cuda.device_count() > 1 else None, DistributedDataParallelKwargs( From f2bc8201330d1370c182c57047a5c23e9c6bee71 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 11 Oct 2024 08:48:55 +0900 Subject: [PATCH 172/582] support weighted captions for SD/SDXL --- fine_tune.py | 17 ++++-------- library/sdxl_train_util.py | 6 ++-- library/strategy_base.py | 12 +++++++- library/strategy_sd.py | 36 ++++++++++++++++++++++++ library/strategy_sdxl.py | 57 ++++++++++++++++++++++++++------------ sdxl_train.py | 2 +- sdxl_train_network.py | 4 ++- train_db.py | 16 ++++------- 8 files changed, 105 insertions(+), 45 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 62a545a13..fd63385b3 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -366,22 +366,17 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): with torch.set_grad_enabled(args.train_text_encoder): # Get the text embedding for conditioning if args.weighted_captions: - # TODO move to strategy_sd.py - encoder_hidden_states = get_weighted_text_embeddings( - tokenize_strategy.tokenizer, - text_encoder, - batch["captions"], - accelerator.device, - args.max_token_length // 75 if args.max_token_length else 1, - clip_skip=args.clip_skip, - ) + input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch["captions"]) + encoder_hidden_states = text_encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, [text_encoder], input_ids_list, weights_list + )[0] else: input_ids = batch["input_ids_list"][0].to(accelerator.device) encoder_hidden_states = text_encoding_strategy.encode_tokens( tokenize_strategy, [text_encoder], [input_ids] )[0] - if args.full_fp16: - 
encoder_hidden_states = encoder_hidden_states.to(weight_dtype) + if args.full_fp16: + encoder_hidden_states = encoder_hidden_states.to(weight_dtype) # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified diff --git a/library/sdxl_train_util.py b/library/sdxl_train_util.py index aaf77b8dd..dc3887c34 100644 --- a/library/sdxl_train_util.py +++ b/library/sdxl_train_util.py @@ -363,9 +363,9 @@ def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCachin # ) # logger.info(f"noise_offset is set to {args.noise_offset} / noise_offsetが{args.noise_offset}に設定されました") - assert ( - not hasattr(args, "weighted_captions") or not args.weighted_captions - ), "weighted_captions cannot be enabled in SDXL training currently / SDXL学習では今のところweighted_captionsを有効にすることはできません" + # assert ( + # not hasattr(args, "weighted_captions") or not args.weighted_captions + # ), "weighted_captions cannot be enabled in SDXL training currently / SDXL学習では今のところweighted_captionsを有効にすることはできません" if supportTextEncoderCaching: if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: diff --git a/library/strategy_base.py b/library/strategy_base.py index 7981bd0b9..2bff4178a 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -323,12 +323,18 @@ class TextEncoderOutputsCachingStrategy: _strategy = None # strategy instance: actual strategy class def __init__( - self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool, is_partial: bool = False + self, + cache_to_disk: bool, + batch_size: int, + skip_disk_cache_validity_check: bool, + is_partial: bool = False, + is_weighted: bool = False, ) -> None: self._cache_to_disk = cache_to_disk self._batch_size = batch_size self.skip_disk_cache_validity_check = skip_disk_cache_validity_check self._is_partial = is_partial + self._is_weighted = is_weighted @classmethod def set_strategy(cls, strategy): @@ -352,6 +358,10 @@ def batch_size(self): def is_partial(self): return self._is_partial + @property + def is_weighted(self): + return self._is_weighted + def get_outputs_npz_path(self, image_abs_path: str) -> str: raise NotImplementedError diff --git a/library/strategy_sd.py b/library/strategy_sd.py index 83ffaa31b..4e7931fdb 100644 --- a/library/strategy_sd.py +++ b/library/strategy_sd.py @@ -40,6 +40,16 @@ def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: text = [text] if isinstance(text, str) else text return [torch.stack([self._get_input_ids(self.tokenizer, t, self.max_length) for t in text], dim=0)] + def tokenize_with_weights(self, text: str | List[str]) -> Tuple[List[torch.Tensor]]: + text = [text] if isinstance(text, str) else text + tokens_list = [] + weights_list = [] + for t in text: + tokens, weights = self._get_input_ids(self.tokenizer, t, self.max_length, weighted=True) + tokens_list.append(tokens) + weights_list.append(weights) + return [torch.stack(tokens_list, dim=0)], [torch.stack(weights_list, dim=0)] + class SdTextEncodingStrategy(TextEncodingStrategy): def __init__(self, clip_skip: Optional[int] = None) -> None: @@ -58,6 +68,8 @@ def encode_tokens( model_max_length = sd_tokenize_strategy.tokenizer.model_max_length tokens = tokens.reshape((-1, model_max_length)) # batch_size*3, 77 + tokens = tokens.to(text_encoder.device) + if self.clip_skip is None: encoder_hidden_states = text_encoder(tokens)[0] else: @@ -93,6 +105,30 @@ def encode_tokens( return [encoder_hidden_states] + def 
encode_tokens_with_weights( + self, + tokenize_strategy: TokenizeStrategy, + models: List[Any], + tokens_list: List[torch.Tensor], + weights_list: List[torch.Tensor], + ) -> List[torch.Tensor]: + encoder_hidden_states = self.encode_tokens(tokenize_strategy, models, tokens_list)[0] + + weights = weights_list[0].to(encoder_hidden_states.device) + + # apply weights + if weights.shape[1] == 1: # no max_token_length + # weights: ((b, 1, 77), (b, 1, 77)), hidden_states: (b, 77, 768), (b, 77, 768) + encoder_hidden_states = encoder_hidden_states * weights.squeeze(1).unsqueeze(2) + else: + # weights: ((b, n, 77), (b, n, 77)), hidden_states: (b, n*75+2, 768), (b, n*75+2, 768) + for i in range(weights.shape[1]): + encoder_hidden_states[:, i * 75 + 1 : i * 75 + 76] = encoder_hidden_states[:, i * 75 + 1 : i * 75 + 76] * weights[ + :, i, 1:-1 + ].unsqueeze(-1) + + return [encoder_hidden_states] + class SdSdxlLatentsCachingStrategy(LatentsCachingStrategy): # sd and sdxl share the same strategy. we can make them separate, but the difference is only the suffix. diff --git a/library/strategy_sdxl.py b/library/strategy_sdxl.py index 6650e2b43..6b3e2afa6 100644 --- a/library/strategy_sdxl.py +++ b/library/strategy_sdxl.py @@ -42,16 +42,16 @@ def tokenize_with_weights(self, text: str | List[str]) -> Tuple[List[torch.Tenso tokens1_list, tokens2_list = [], [] weights1_list, weights2_list = [], [] for t in text: - tokens1, weights1 = self._get_weighted_input_ids(self.tokenizer1, t, self.max_length) - tokens2, weights2 = self._get_weighted_input_ids(self.tokenizer2, t, self.max_length) + tokens1, weights1 = self._get_input_ids(self.tokenizer1, t, self.max_length, weighted=True) + tokens2, weights2 = self._get_input_ids(self.tokenizer2, t, self.max_length, weighted=True) tokens1_list.append(tokens1) tokens2_list.append(tokens2) weights1_list.append(weights1) weights2_list.append(weights2) - return (torch.stack(tokens1_list, dim=0), torch.stack(tokens2_list, dim=0)), ( + return [torch.stack(tokens1_list, dim=0), torch.stack(tokens2_list, dim=0)], [ torch.stack(weights1_list, dim=0), torch.stack(weights2_list, dim=0), - ) + ] class SdxlTextEncodingStrategy(TextEncodingStrategy): @@ -193,20 +193,28 @@ def encode_tokens( return [hidden_states1, hidden_states2, pool2] def encode_tokens_with_weights( - self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor], weights: List[torch.Tensor] + self, + tokenize_strategy: TokenizeStrategy, + models: List[Any], + tokens_list: List[torch.Tensor], + weights_list: List[torch.Tensor], ) -> List[torch.Tensor]: - hidden_states1, hidden_states2, pool2 = self.encode_tokens(tokenize_strategy, models, tokens) + hidden_states1, hidden_states2, pool2 = self.encode_tokens(tokenize_strategy, models, tokens_list) + + weights_list = [weights.to(hidden_states1.device) for weights in weights_list] # apply weights - if weights[0].shape[1] == 1: # no max_token_length + if weights_list[0].shape[1] == 1: # no max_token_length # weights: ((b, 1, 77), (b, 1, 77)), hidden_states: (b, 77, 768), (b, 77, 768) - hidden_states1 = hidden_states1 * weights[0].squeeze(1).unsqueeze(2) - hidden_states2 = hidden_states2 * weights[1].squeeze(1).unsqueeze(2) + hidden_states1 = hidden_states1 * weights_list[0].squeeze(1).unsqueeze(2) + hidden_states2 = hidden_states2 * weights_list[1].squeeze(1).unsqueeze(2) else: # weights: ((b, n, 77), (b, n, 77)), hidden_states: (b, n*75+2, 768), (b, n*75+2, 768) - for weight, hidden_states in zip(weights, [hidden_states1, hidden_states2]): + for 
weight, hidden_states in zip(weights_list, [hidden_states1, hidden_states2]): for i in range(weight.shape[1]): - hidden_states[:, i * 75 + 1 : i * 75 + 76] = hidden_states[:, i * 75 + 1 : i * 75 + 76] * weight[:, i, 1:-1] + hidden_states[:, i * 75 + 1 : i * 75 + 76] = hidden_states[:, i * 75 + 1 : i * 75 + 76] * weight[ + :, i, 1:-1 + ].unsqueeze(-1) return [hidden_states1, hidden_states2, pool2] @@ -215,9 +223,14 @@ class SdxlTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): SDXL_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX = "_te_outputs.npz" def __init__( - self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool, is_partial: bool = False + self, + cache_to_disk: bool, + batch_size: int, + skip_disk_cache_validity_check: bool, + is_partial: bool = False, + is_weighted: bool = False, ) -> None: - super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check, is_partial) + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check, is_partial, is_weighted) def get_outputs_npz_path(self, image_abs_path: str) -> str: return os.path.splitext(image_abs_path)[0] + SdxlTextEncoderOutputsCachingStrategy.SDXL_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX @@ -253,11 +266,19 @@ def cache_batch_outputs( sdxl_text_encoding_strategy = text_encoding_strategy # type: SdxlTextEncodingStrategy captions = [info.caption for info in infos] - tokens1, tokens2 = tokenize_strategy.tokenize(captions) - with torch.no_grad(): - hidden_state1, hidden_state2, pool2 = sdxl_text_encoding_strategy.encode_tokens( - tokenize_strategy, models, [tokens1, tokens2] - ) + if self.is_weighted: + tokens_list, weights_list = tokenize_strategy.tokenize_with_weights(captions) + with torch.no_grad(): + hidden_state1, hidden_state2, pool2 = sdxl_text_encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, models, tokens_list, weights_list + ) + else: + tokens1, tokens2 = tokenize_strategy.tokenize(captions) + with torch.no_grad(): + hidden_state1, hidden_state2, pool2 = sdxl_text_encoding_strategy.encode_tokens( + tokenize_strategy, models, [tokens1, tokens2] + ) + if hidden_state1.dtype == torch.bfloat16: hidden_state1 = hidden_state1.float() if hidden_state2.dtype == torch.bfloat16: diff --git a/sdxl_train.py b/sdxl_train.py index 320169d77..aeff9c469 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -321,7 +321,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.cache_text_encoder_outputs: # Text Encodes are eval and no grad text_encoder_output_caching_strategy = strategy_sdxl.SdxlTextEncoderOutputsCachingStrategy( - args.cache_text_encoder_outputs_to_disk, None, False + args.cache_text_encoder_outputs_to_disk, None, False, is_weighted=args.weighted_captions ) strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_output_caching_strategy) diff --git a/sdxl_train_network.py b/sdxl_train_network.py index 4d6e3f184..20e32155c 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -79,7 +79,9 @@ def get_models_for_text_encoding(self, args, accelerator, text_encoders): def get_text_encoder_outputs_caching_strategy(self, args): if args.cache_text_encoder_outputs: - return strategy_sdxl.SdxlTextEncoderOutputsCachingStrategy(args.cache_text_encoder_outputs_to_disk, None, False) + return strategy_sdxl.SdxlTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, None, False, is_weighted=args.weighted_captions + ) else: return None diff --git a/train_db.py b/train_db.py index a5d520b12..e49a7e70f 100644 --- 
a/train_db.py +++ b/train_db.py @@ -356,21 +356,17 @@ def train(args): # Get the text embedding for conditioning with torch.set_grad_enabled(global_step < args.stop_text_encoder_training): if args.weighted_captions: - encoder_hidden_states = get_weighted_text_embeddings( - tokenize_strategy.tokenizer, - text_encoder, - batch["captions"], - accelerator.device, - args.max_token_length // 75 if args.max_token_length else 1, - clip_skip=args.clip_skip, - ) + input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch["captions"]) + encoder_hidden_states = text_encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, [text_encoder], input_ids_list, weights_list + )[0] else: input_ids = batch["input_ids_list"][0].to(accelerator.device) encoder_hidden_states = text_encoding_strategy.encode_tokens( tokenize_strategy, [text_encoder], [input_ids] )[0] - if args.full_fp16: - encoder_hidden_states = encoder_hidden_states.to(weight_dtype) + if args.full_fp16: + encoder_hidden_states = encoder_hidden_states.to(weight_dtype) # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified From 035c4a8552bf6214ad4d39657d3eb1204cdecdfd Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 11 Oct 2024 22:23:15 +0900 Subject: [PATCH 173/582] update docs and help text --- README.md | 10 ++++++++++ docs/train_lllite_README.md | 2 +- sdxl_train_control_net.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c567758a5..d3f49c994 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,16 @@ The command to install PyTorch is as follows: ### Recent Updates +Oct 11, 2024: +- ControlNet training for SDXL has been implemented in this branch. Please use `sdxl_train_control_net.py`. + - For details on defining the dataset, see [here](docs/train_lllite_README.md#creating-a-dataset-configuration-file). + - The learning rate for the copy part of the U-Net is specified by `--learning_rate`. The learning rate for the added modules in ControlNet is specified by `--control_net_lr`. The optimal value is still unknown, but try around U-Net `1e-5` and ControlNet `1e-4`. + - If you want to generate sample images, specify the control image as `--cn path/to/control/image`. + - The trained weights are automatically converted and saved in Diffusers format. It should be available in ComfyUI. +- Weighting of prompts (captions) during training in SDXL is now supported (e.g., `(some text)`, `[some text]`, `(some text:1.4)`, etc.). The function is enabled by specifying `--weighted_captions`. + - The default is `False`. It is same as before, and the parentheses are used as normal text. + - If `--weighted_captions` is specified, please use `\` to escape the parentheses in the prompt. For example, `\(some text:1.4\)`. + Oct 6, 2024: - In FLUX.1 LoRA training and fine-tuning, the specified weight file (*.safetensors) is automatically determined to be dev or schnell. This allows schnell models to be loaded correctly. Note that LoRA training with schnell models and fine-tuning with schnell models are unverified. - FLUX.1 LoRA training and fine-tuning can now load weights in Diffusers format in addition to BFL format (a single *.safetensors file). Please specify the parent directory of `transformer` or `diffusion_pytorch_model-00001-of-00003.safetensors` with the full path. However, Diffusers format CLIP/T5XXL is not supported. Saving is supported only in BFL format. 
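The weighted-caption behavior described in the README hunk above is implemented by the `encode_tokens_with_weights` additions earlier in this series (strategy_sd.py and strategy_sdxl.py). The sketch below illustrates the core idea under simplified assumptions: per-token weights parsed from `(text:1.4)`-style syntax are multiplied into the text encoder hidden states, chunk by chunk when `max_token_length` splits the prompt into 75-token chunks. The standalone function name and the toy tensors are made up for illustration; the real code operates inside the encoding strategy classes.

```python
import torch

def apply_caption_weights(hidden_states: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
    # hidden_states: (batch, n*75+2, dim) or (batch, 77, dim); weights: (batch, n, 77),
    # including the BOS/EOS positions, as produced by tokenize_with_weights().
    if weights.shape[1] == 1:  # prompt fits in a single 77-token chunk
        return hidden_states * weights.squeeze(1).unsqueeze(2)
    out = hidden_states.clone()
    for i in range(weights.shape[1]):  # scale each 75-token chunk, skipping BOS/EOS
        out[:, i * 75 + 1 : i * 75 + 76] *= weights[:, i, 1:-1].unsqueeze(-1)
    return out

# toy check: all-ones weights leave the embeddings unchanged
hs = torch.randn(2, 77, 768)
w = torch.ones(2, 1, 77)
assert torch.allclose(apply_caption_weights(hs, w), hs)
```

Escaping parentheses (`\(...\)`) keeps them as literal prompt text rather than weighting syntax, which is why the README asks for escaping when `--weighted_captions` is enabled.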
diff --git a/docs/train_lllite_README.md b/docs/train_lllite_README.md index a05f87f5f..1bd8e4ae1 100644 --- a/docs/train_lllite_README.md +++ b/docs/train_lllite_README.md @@ -185,7 +185,7 @@ for img_file in img_files: ### Creating a dataset configuration file -You can use the command line arguments of `sdxl_train_control_net_lllite.py` to specify the conditioning image directory. However, if you want to use a `.toml` file, specify the conditioning image directory in `conditioning_data_dir`. +You can use the command line argument `--conditioning_data_dir` of `sdxl_train_control_net_lllite.py` to specify the conditioning image directory. However, if you want to use a `.toml` file, specify the conditioning image directory in `conditioning_data_dir`. ```toml [general] diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index f6cc5a4f9..67c8d52c8 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -705,7 +705,7 @@ def setup_parser() -> argparse.ArgumentParser: "--control_net_lr", type=float, default=1e-4, - help="learning rate for controlnet / controlnetの学習率", + help="learning rate for controlnet modules / controlnetモジュールの学習率", ) return parser From 0d3058b65ab7cd827e44f16f84c68a4bb73f701e Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sat, 12 Oct 2024 14:46:35 +0900 Subject: [PATCH 174/582] update README --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index d3f49c994..37fc911f6 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,17 @@ The command to install PyTorch is as follows: ### Recent Updates +Oct 12, 2024: + +- Multi-GPU training now works on Windows. Thanks to Akegarasu for PR [#1686](https://github.com/kohya-ss/sd-scripts/pull/1686)! + - It should work with all training scripts, but it is unverified. + - Set up multi-GPU training with `accelerate config`. + - Specify `--rdzv_backend=c10d` when launching `accelerate launch`. You can also edit `config.yaml` directly. + ``` + accelerate launch --rdzv_backend=c10d sdxl_train_network.py ... + ``` + - In multi-GPU training, the memory of multiple GPUs is not integrated. In other words, even if you have two 12GB VRAM GPUs, you cannot train the model that requires 24GB VRAM. Training that can be done with 12GB VRAM is executed at (up to) twice the speed. + Oct 11, 2024: - ControlNet training for SDXL has been implemented in this branch. Please use `sdxl_train_control_net.py`. - For details on defining the dataset, see [here](docs/train_lllite_README.md#creating-a-dataset-configuration-file). From c80c304779775f4d00fd8f4856bfc8e6599e2de0 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sat, 12 Oct 2024 20:18:41 +0900 Subject: [PATCH 175/582] Refactor caching in train scripts --- README.md | 10 +++++ fine_tune.py | 2 +- flux_train.py | 14 ++++--- flux_train_network.py | 6 +-- library/train_util.py | 64 +++++++++++++++++++++++--------- sd3_train.py | 17 +++++++-- sdxl_train.py | 4 +- sdxl_train_control_net.py | 4 +- sdxl_train_control_net_lllite.py | 5 +-- sdxl_train_network.py | 8 ++-- sdxl_train_textual_inversion.py | 2 +- train_db.py | 2 +- train_network.py | 2 +- train_textual_inversion.py | 2 +- 14 files changed, 95 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 37fc911f6..2b2562831 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,16 @@ The command to install PyTorch is as follows: ### Recent Updates +Oct 12, 2024 (update 1): + +- During multi-GPU training, caching of latents and Text Encoder outputs is now done in multi-GPU. 
+- `--text_encoder_batch_size` option is enabled for FLUX.1 LoRA training and fine tuning. This option specifies the batch size for caching Text Encoder outputs (not for training). The default is same as the dataset batch size. If you have enough VRAM, you can increase the batch size to speed up the caching. +- `--skip_cache_check` option is added to each training script. + - When specified, the consistency check of the cache file `*.npz` contents (e.g., image size and flip for latents, mask for Text Encoder outputs) is skipped. + - Specify this option if you have a large number of cache files and the consistency check takes time. + - Even if this option is specified, the cache will be created if the file does not exist. + - `--skip_latents_validity_check` in SD3/FLUX.1 is deprecated. Please use `--skip_cache_check` instead. + Oct 12, 2024: - Multi-GPU training now works on Windows. Thanks to Akegarasu for PR [#1686](https://github.com/kohya-ss/sd-scripts/pull/1686)! diff --git a/fine_tune.py b/fine_tune.py index fd63385b3..cdc005d9a 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -59,7 +59,7 @@ def train(args): # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. if cache_latents: latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( - False, args.cache_latents_to_disk, args.vae_batch_size, False + False, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) diff --git a/flux_train.py b/flux_train.py index ecc87c0a8..e18a92443 100644 --- a/flux_train.py +++ b/flux_train.py @@ -57,6 +57,10 @@ def train(args): deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) + # temporary: backward compatibility for deprecated options. remove in the future + if not args.skip_cache_check: + args.skip_cache_check = args.skip_latents_validity_check + # assert ( # not args.weighted_captions # ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" @@ -81,7 +85,7 @@ def train(args): # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
if args.cache_latents: latents_caching_strategy = strategy_flux.FluxLatentsCachingStrategy( - args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check + args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) @@ -142,7 +146,7 @@ def train(args): if args.cache_text_encoder_outputs: strategy_base.TextEncoderOutputsCachingStrategy.set_strategy( strategy_flux.FluxTextEncoderOutputsCachingStrategy( - args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, False, False + args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, args.skip_cache_check, False ) ) t5xxl_max_token_length = ( @@ -181,7 +185,7 @@ def train(args): # load VAE for caching latents ae = None if cache_latents: - ae = flux_utils.load_ae( args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors) + ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors) ae.to(accelerator.device, dtype=weight_dtype) ae.requires_grad_(False) ae.eval() @@ -229,7 +233,7 @@ def train(args): strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy) with accelerator.autocast(): - train_dataset_group.new_cache_text_encoder_outputs([clip_l, t5xxl], accelerator.is_main_process) + train_dataset_group.new_cache_text_encoder_outputs([clip_l, t5xxl], accelerator) # cache sample prompt's embeddings to free text encoder's memory if args.sample_prompts is not None: @@ -952,7 +956,7 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--skip_latents_validity_check", action="store_true", - help="skip latents validity check / latentsの正当性チェックをスキップする", + help="[Deprecated] use 'skip_cache_check' instead / 代わりに 'skip_cache_check' を使用してください", ) parser.add_argument( "--blocks_to_swap", diff --git a/flux_train_network.py b/flux_train_network.py index 5d14bd28e..3bd8316d4 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -188,8 +188,8 @@ def get_text_encoder_outputs_caching_strategy(self, args): # if the text encoders is trained, we need tokenization, so is_partial is True return strategy_flux.FluxTextEncoderOutputsCachingStrategy( args.cache_text_encoder_outputs_to_disk, - None, - False, + args.text_encoder_batch_size, + args.skip_cache_check, is_partial=self.train_clip_l or self.train_t5xxl, apply_t5_attn_mask=args.apply_t5_attn_mask, ) @@ -222,7 +222,7 @@ def cache_text_encoder_outputs_if_needed( text_encoders[1].to(weight_dtype) with accelerator.autocast(): - dataset.new_cache_text_encoder_outputs(text_encoders, accelerator.is_main_process) + dataset.new_cache_text_encoder_outputs(text_encoders, accelerator) # cache sample prompts if args.sample_prompts is not None: diff --git a/library/train_util.py b/library/train_util.py index 67eaae41b..4e6b3408d 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -31,6 +31,7 @@ import subprocess from io import BytesIO import toml + # from concurrent.futures import ThreadPoolExecutor, as_completed from tqdm import tqdm @@ -1192,7 +1193,7 @@ def __eq__(self, other): for condition, batch in tqdm(batches, smoothing=1, total=len(batches)): cache_batch_latents(vae, cache_to_disk, batch, condition.flip_aug, condition.alpha_mask, condition.random_crop) - def new_cache_text_encoder_outputs(self, models: List[Any], is_main_process: bool): + def new_cache_text_encoder_outputs(self, models: List[Any], accelerator: Accelerator): r""" a brand new method to cache text encoder 
outputs. This method caches text encoder outputs with caching strategy. """ @@ -1207,15 +1208,25 @@ def new_cache_text_encoder_outputs(self, models: List[Any], is_main_process: boo # split by resolution batches = [] batch = [] - logger.info("checking cache validity...") - for info in tqdm(image_infos): - te_out_npz = caching_strategy.get_outputs_npz_path(info.absolute_path) - # check disk cache exists and size of latents + # support multiple-gpus + num_processes = accelerator.num_processes + process_index = accelerator.process_index + + logger.info("checking cache validity...") + for i, info in enumerate(tqdm(image_infos)): + # check disk cache exists and size of text encoder outputs if caching_strategy.cache_to_disk: - info.text_encoder_outputs_npz = te_out_npz # set npz filename regardless of cache availability/main process + te_out_npz = caching_strategy.get_outputs_npz_path(info.absolute_path) + info.text_encoder_outputs_npz = te_out_npz # set npz filename regardless of cache availability + + # if the modulo of num_processes is not equal to process_index, skip caching + # this makes each process cache different text encoder outputs + if i % num_processes != process_index: + continue + cache_available = caching_strategy.is_disk_cached_outputs_expected(te_out_npz) - if cache_available or not is_main_process: # do not add to batch + if cache_available: # do not add to batch continue batch.append(info) @@ -2420,6 +2431,7 @@ def new_cache_latents(self, model: Any, accelerator: Accelerator): for i, dataset in enumerate(self.datasets): logger.info(f"[Dataset {i}]") dataset.new_cache_latents(model, accelerator) + accelerator.wait_for_everyone() def cache_text_encoder_outputs( self, tokenizers, text_encoders, device, weight_dtype, cache_to_disk=False, is_main_process=True @@ -2437,10 +2449,11 @@ def cache_text_encoder_outputs_sd3( tokenizer, text_encoders, device, output_dtype, te_dtypes, cache_to_disk, is_main_process, batch_size ) - def new_cache_text_encoder_outputs(self, models: List[Any], is_main_process: bool): + def new_cache_text_encoder_outputs(self, models: List[Any], accelerator: Accelerator): for i, dataset in enumerate(self.datasets): logger.info(f"[Dataset {i}]") - dataset.new_cache_text_encoder_outputs(models, is_main_process) + dataset.new_cache_text_encoder_outputs(models, accelerator) + accelerator.wait_for_everyone() def set_caching_mode(self, caching_mode): for dataset in self.datasets: @@ -4210,6 +4223,12 @@ def add_dataset_arguments( action="store_true", help="cache latents to disk to reduce VRAM usage (augmentations must be disabled) / VRAM削減のためにlatentをディスクにcacheする(augmentationは使用不可)", ) + parser.add_argument( + "--skip_cache_check", + action="store_true", + help="skip the content validation of cache (latent and text encoder output). 
Cache file existence check is always performed, and cache processing is performed if the file does not exist" + " / cacheの内容の検証をスキップする(latentとテキストエンコーダの出力)。キャッシュファイルの存在確認は常に行われ、ファイルがなければキャッシュ処理が行われる", + ) parser.add_argument( "--enable_bucket", action="store_true", @@ -5084,15 +5103,24 @@ def prepare_accelerator(args: argparse.Namespace): dynamo_backend = args.dynamo_backend kwargs_handlers = [ - InitProcessGroupKwargs( - backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", - init_method="env://?use_libuv=False" if os.name == "nt" and Version(torch.__version__) >= Version("2.4.0") else None, - timeout=datetime.timedelta(minutes=args.ddp_timeout) if args.ddp_timeout else None - ) if torch.cuda.device_count() > 1 else None, - DistributedDataParallelKwargs( - gradient_as_bucket_view=args.ddp_gradient_as_bucket_view, - static_graph=args.ddp_static_graph - ) if args.ddp_gradient_as_bucket_view or args.ddp_static_graph else None + ( + InitProcessGroupKwargs( + backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", + init_method=( + "env://?use_libuv=False" if os.name == "nt" and Version(torch.__version__) >= Version("2.4.0") else None + ), + timeout=datetime.timedelta(minutes=args.ddp_timeout) if args.ddp_timeout else None, + ) + if torch.cuda.device_count() > 1 + else None + ), + ( + DistributedDataParallelKwargs( + gradient_as_bucket_view=args.ddp_gradient_as_bucket_view, static_graph=args.ddp_static_graph + ) + if args.ddp_gradient_as_bucket_view or args.ddp_static_graph + else None + ), ] kwargs_handlers = [i for i in kwargs_handlers if i is not None] deepspeed_plugin = deepspeed_utils.prepare_deepspeed_plugin(args) diff --git a/sd3_train.py b/sd3_train.py index 5120105f2..7290956ad 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -57,6 +57,10 @@ def train(args): deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) + # temporary: backward compatibility for deprecated options. remove in the future + if not args.skip_cache_check: + args.skip_cache_check = args.skip_latents_validity_check + assert ( not args.weighted_captions ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" @@ -103,7 +107,7 @@ def train(args): # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
if args.cache_latents: latents_caching_strategy = strategy_sd3.Sd3LatentsCachingStrategy( - args.cache_latents_to_disk, args.vae_batch_size, args.skip_latents_validity_check + args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) @@ -312,7 +316,7 @@ def train(args): text_encoder_caching_strategy = strategy_sd3.Sd3TextEncoderOutputsCachingStrategy( args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, - False, + args.skip_cache_check, train_clip_g or train_clip_l or args.use_t5xxl_cache_only, args.apply_lg_attn_mask, args.apply_t5_attn_mask, @@ -325,7 +329,7 @@ def train(args): t5xxl.to(t5xxl_device, dtype=t5xxl_dtype) with accelerator.autocast(): - train_dataset_group.new_cache_text_encoder_outputs([clip_l, clip_g, t5xxl], accelerator.is_main_process) + train_dataset_group.new_cache_text_encoder_outputs([clip_l, clip_g, t5xxl], accelerator) # cache sample prompt's embeddings to free text encoder's memory if args.sample_prompts is not None: @@ -1052,7 +1056,12 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--skip_latents_validity_check", action="store_true", - help="skip latents validity check / latentsの正当性チェックをスキップする", + help="[Deprecated] use 'skip_cache_check' instead / 代わりに 'skip_cache_check' を使用してください", + ) + parser.add_argument( + "--skip_cache_check", + action="store_true", + help="skip cache (latents and text encoder outputs) check / キャッシュ(latentsとtext encoder outputs)のチェックをスキップする", ) parser.add_argument( "--num_last_block_to_freeze", diff --git a/sdxl_train.py b/sdxl_train.py index aeff9c469..9b2d19165 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -131,7 +131,7 @@ def train(args): # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. if args.cache_latents: latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( - False, args.cache_latents_to_disk, args.vae_batch_size, False + False, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) @@ -328,7 +328,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder1.to(accelerator.device) text_encoder2.to(accelerator.device) with accelerator.autocast(): - train_dataset_group.new_cache_text_encoder_outputs([text_encoder1, text_encoder2], accelerator.is_main_process) + train_dataset_group.new_cache_text_encoder_outputs([text_encoder1, text_encoder2], accelerator) accelerator.wait_for_everyone() diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index 67c8d52c8..74b3a64a4 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -84,7 +84,7 @@ def train(args): # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( - False, args.cache_latents_to_disk, args.vae_batch_size, False + False, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) @@ -230,7 +230,7 @@ def unwrap_model(model): text_encoder1.to(accelerator.device) text_encoder2.to(accelerator.device) with accelerator.autocast(): - train_dataset_group.new_cache_text_encoder_outputs([text_encoder1, text_encoder2], accelerator.is_main_process) + train_dataset_group.new_cache_text_encoder_outputs([text_encoder1, text_encoder2], accelerator) accelerator.wait_for_everyone() diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 9d1cfc63e..14ff7c240 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -93,7 +93,7 @@ def train(args): # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( - False, args.cache_latents_to_disk, args.vae_batch_size, False + False, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) @@ -202,7 +202,7 @@ def train(args): text_encoder1.to(accelerator.device) text_encoder2.to(accelerator.device) with accelerator.autocast(): - train_dataset_group.new_cache_text_encoder_outputs([text_encoder1, text_encoder2], accelerator.is_main_process) + train_dataset_group.new_cache_text_encoder_outputs([text_encoder1, text_encoder2], accelerator) accelerator.wait_for_everyone() @@ -431,7 +431,6 @@ def remove_model(old_ckpt_name): latents = torch.nan_to_num(latents, 0, out=latents) latents = latents * sdxl_model_util.VAE_SCALE_FACTOR - text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: # Text Encoder outputs are cached diff --git a/sdxl_train_network.py b/sdxl_train_network.py index 20e32155c..4a16a4891 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -67,7 +67,7 @@ def get_tokenizers(self, tokenize_strategy: strategy_sdxl.SdxlTokenizeStrategy): def get_latents_caching_strategy(self, args): latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( - False, args.cache_latents_to_disk, args.vae_batch_size, False + False, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) return latents_caching_strategy @@ -80,7 +80,7 @@ def get_models_for_text_encoding(self, args, accelerator, text_encoders): def get_text_encoder_outputs_caching_strategy(self, args): if args.cache_text_encoder_outputs: return strategy_sdxl.SdxlTextEncoderOutputsCachingStrategy( - args.cache_text_encoder_outputs_to_disk, None, False, is_weighted=args.weighted_captions + args.cache_text_encoder_outputs_to_disk, None, args.skip_cache_check, is_weighted=args.weighted_captions ) else: return None @@ -102,9 +102,7 @@ def cache_text_encoder_outputs_if_needed( text_encoders[0].to(accelerator.device, dtype=weight_dtype) text_encoders[1].to(accelerator.device, dtype=weight_dtype) with accelerator.autocast(): - dataset.new_cache_text_encoder_outputs( - text_encoders + [accelerator.unwrap_model(text_encoders[-1])], accelerator.is_main_process - ) + dataset.new_cache_text_encoder_outputs(text_encoders + [accelerator.unwrap_model(text_encoders[-1])], accelerator) accelerator.wait_for_everyone() 
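The caching calls above now receive the `accelerator` instead of `is_main_process`, because the train_util.py hunk earlier in this commit shards the caching work across processes rather than running it only on the main process. A rough sketch of that sharding, using an illustrative helper name and a plain list in place of the real dataset structures:

```python
def shard_for_process(items, num_processes: int, process_index: int):
    # Round-robin split: item i is cached by process (i % num_processes),
    # mirroring the `if i % num_processes != process_index: continue` check above.
    return [item for i, item in enumerate(items) if i % num_processes == process_index]

items = list(range(10))
assert shard_for_process(items, 3, 0) == [0, 3, 6, 9]
assert shard_for_process(items, 3, 1) == [1, 4, 7]
assert shard_for_process(items, 3, 2) == [2, 5, 8]
```

After each process finishes its shard, `accelerator.wait_for_everyone()` synchronizes before training continues.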
text_encoders[0].to("cpu", dtype=torch.float32) # Text Encoder doesn't work with fp16 on CPU diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index cbfcef554..821a69558 100644 --- a/sdxl_train_textual_inversion.py +++ b/sdxl_train_textual_inversion.py @@ -49,7 +49,7 @@ def get_tokenizers(self, tokenize_strategy: strategy_sdxl.SdxlTokenizeStrategy): def get_latents_caching_strategy(self, args): latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( - False, args.cache_latents_to_disk, args.vae_batch_size, False + False, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) return latents_caching_strategy diff --git a/train_db.py b/train_db.py index e49a7e70f..683b42332 100644 --- a/train_db.py +++ b/train_db.py @@ -64,7 +64,7 @@ def train(args): # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( - False, args.cache_latents_to_disk, args.vae_batch_size, False + False, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) diff --git a/train_network.py b/train_network.py index 7437157b9..d5330aef4 100644 --- a/train_network.py +++ b/train_network.py @@ -116,7 +116,7 @@ def get_tokenizers(self, tokenize_strategy: strategy_sd.SdTokenizeStrategy) -> L def get_latents_caching_strategy(self, args): latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( - True, args.cache_latents_to_disk, args.vae_batch_size, False + True, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) return latents_caching_strategy diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 3b3d3393f..4d8a3abbf 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -114,7 +114,7 @@ def get_tokenizers(self, tokenize_strategy: strategy_sd.SdTokenizeStrategy) -> L def get_latents_caching_strategy(self, args): latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( - True, args.cache_latents_to_disk, args.vae_batch_size, False + True, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check ) return latents_caching_strategy From ecaea909b10fa8b3eb94a1cf57b26d5daba1683e Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sat, 12 Oct 2024 20:26:57 +0900 Subject: [PATCH 176/582] update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 37fc911f6..9128bf8da 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The command to install PyTorch is as follows: Oct 12, 2024: - Multi-GPU training now works on Windows. Thanks to Akegarasu for PR [#1686](https://github.com/kohya-ss/sd-scripts/pull/1686)! - - It should work with all training scripts, but it is unverified. + - In simple tests, SDXL and FLUX.1 LoRA training worked. FLUX.1 fine-tuning did not work, probably due to a PyTorch-related error. Other scripts are unverified. - Set up multi-GPU training with `accelerate config`. - Specify `--rdzv_backend=c10d` when launching `accelerate launch`. You can also edit `config.yaml` directly. 
``` From e277b5789e791539b5e51187530f11bd94e24871 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 12 Oct 2024 21:49:07 +0900 Subject: [PATCH 177/582] Update FLUX.1 support for compact models --- README.md | 10 ++++++ flux_train.py | 12 +++---- flux_train_network.py | 2 +- library/flux_utils.py | 76 ++++++++++++++++++++++++++++++++++++------- 4 files changed, 82 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 9128bf8da..b64515a19 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,16 @@ The command to install PyTorch is as follows: ### Recent Updates +Oct 12, 2024 (update 1): + +- [Experimental] FLUX.1 fine-tuning and LoRA training now support "FLUX.1 __compact__" models. + - A compact model is a model that retains the FLUX.1 architecture but reduces the number of double/single blocks from the default 19/38. + - The model is automatically determined based on the keys in *.safetensors. + - Specifications for compact model safetensors: + - Please specify the block indices as consecutive numbers. An error will occur if there are missing numbers. For example, if you reduce the double blocks to 15, the maximum key will be `double_blocks.14.*`. The same applies to single blocks. + - LoRA training is unverified. + - The trained model can be used for inference with `flux_minimal_inference.py`. Other inference environments are unverified. + Oct 12, 2024: - Multi-GPU training now works on Windows. Thanks to Akegarasu for PR [#1686](https://github.com/kohya-ss/sd-scripts/pull/1686)! diff --git a/flux_train.py b/flux_train.py index ecc87c0a8..2fc13068e 100644 --- a/flux_train.py +++ b/flux_train.py @@ -137,7 +137,7 @@ def train(args): train_dataset_group.verify_bucket_reso_steps(16) # TODO これでいいか確認 - _, is_schnell, _ = flux_utils.check_flux_state_dict_diffusers_schnell(args.pretrained_model_name_or_path) + _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) if args.debug_dataset: if args.cache_text_encoder_outputs: strategy_base.TextEncoderOutputsCachingStrategy.set_strategy( @@ -181,7 +181,7 @@ def train(args): # load VAE for caching latents ae = None if cache_latents: - ae = flux_utils.load_ae( args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors) + ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors) ae.to(accelerator.device, dtype=weight_dtype) ae.requires_grad_(False) ae.eval() @@ -510,8 +510,8 @@ def wait_blocks_move(block_idx, futures): library.adafactor_fused.patch_adafactor_fused(optimizer) blocks_to_swap = args.blocks_to_swap - num_double_blocks = 19 # len(flux.double_blocks) - num_single_blocks = 38 # len(flux.single_blocks) + num_double_blocks = len(accelerator.unwrap_model(flux).double_blocks) + num_single_blocks = len(accelerator.unwrap_model(flux).single_blocks) num_block_units = num_double_blocks + num_single_blocks // 2 handled_unit_indices = set() @@ -603,8 +603,8 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): parameter_optimizer_map = {} blocks_to_swap = args.blocks_to_swap - num_double_blocks = 19 # len(flux.double_blocks) - num_single_blocks = 38 # len(flux.single_blocks) + num_double_blocks = len(accelerator.unwrap_model(flux).double_blocks) + num_single_blocks = len(accelerator.unwrap_model(flux).single_blocks) num_block_units = num_double_blocks + num_single_blocks // 2 n = 1 # only asynchronous purpose, no need to increase this number diff --git a/flux_train_network.py b/flux_train_network.py index 5d14bd28e..a24c1905b 100644 --- 
a/flux_train_network.py +++ b/flux_train_network.py @@ -139,7 +139,7 @@ def prepare_split_model(self, model, weight_dtype, accelerator): return flux_lower def get_tokenize_strategy(self, args): - _, is_schnell, _ = flux_utils.check_flux_state_dict_diffusers_schnell(args.pretrained_model_name_or_path) + _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) if args.t5xxl_max_token_length is None: if is_schnell: diff --git a/library/flux_utils.py b/library/flux_utils.py index 713814e28..7a1ec37b8 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -1,3 +1,4 @@ +from dataclasses import replace import json import os from typing import List, Optional, Tuple, Union @@ -43,8 +44,21 @@ def load_safetensors( return load_file(path) # prevent device invalid Error -def check_flux_state_dict_diffusers_schnell(ckpt_path: str) -> Tuple[bool, bool, List[str]]: - # check the state dict: Diffusers or BFL, dev or schnell +def analyze_checkpoint_state(ckpt_path: str) -> Tuple[bool, bool, Tuple[int, int], List[str]]: + """ + チェックポイントの状態を分析し、DiffusersかBFLか、devかschnellか、ブロック数を計算して返す。 + + Args: + ckpt_path (str): チェックポイントファイルまたはディレクトリのパス。 + + Returns: + Tuple[bool, bool, Tuple[int, int], List[str]]: + - bool: Diffusersかどうかを示すフラグ。 + - bool: Schnellかどうかを示すフラグ。 + - Tuple[int, int]: ダブルブロックとシングルブロックの数。 + - List[str]: チェックポイントに含まれるキーのリスト。 + """ + # check the state dict: Diffusers or BFL, dev or schnell, number of blocks logger.info(f"Checking the state dict: Diffusers or BFL, dev or schnell") if os.path.isdir(ckpt_path): # if ckpt_path is a directory, it is Diffusers @@ -61,19 +75,57 @@ def check_flux_state_dict_diffusers_schnell(ckpt_path: str) -> Tuple[bool, bool, is_diffusers = "transformer_blocks.0.attn.add_k_proj.bias" in keys is_schnell = not ("guidance_in.in_layer.bias" in keys or "time_text_embed.guidance_embedder.linear_1.bias" in keys) - return is_diffusers, is_schnell, ckpt_paths + + # check number of double and single blocks + if not is_diffusers: + max_double_block_index = max( + [int(key.split(".")[1]) for key in keys if key.startswith("double_blocks.") and key.endswith(".img_attn.proj.bias")] + ) + max_single_block_index = max( + [int(key.split(".")[1]) for key in keys if key.startswith("single_blocks.") and key.endswith(".modulation.lin.bias")] + ) + else: + max_double_block_index = max( + [ + int(key.split(".")[1]) + for key in keys + if key.startswith("transformer_blocks.") and key.endswith(".attn.add_k_proj.bias") + ] + ) + max_single_block_index = max( + [ + int(key.split(".")[1]) + for key in keys + if key.startswith("single_transformer_blocks.") and key.endswith(".attn.to_k.bias") + ] + ) + + num_double_blocks = max_double_block_index + 1 + num_single_blocks = max_single_block_index + 1 + + return is_diffusers, is_schnell, (num_double_blocks, num_single_blocks), ckpt_paths def load_flow_model( ckpt_path: str, dtype: Optional[torch.dtype], device: Union[str, torch.device], disable_mmap: bool = False ) -> Tuple[bool, flux_models.Flux]: - is_diffusers, is_schnell, ckpt_paths = check_flux_state_dict_diffusers_schnell(ckpt_path) + is_diffusers, is_schnell, (num_double_blocks, num_single_blocks), ckpt_paths = analyze_checkpoint_state(ckpt_path) name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL # build model logger.info(f"Building Flux model {name} from {'Diffusers' if is_diffusers else 'BFL'} checkpoint") with torch.device("meta"): - model = flux_models.Flux(flux_models.configs[name].params) + params = flux_models.configs[name].params + 
+ # set the number of blocks + if params.depth != num_double_blocks: + logger.info(f"Setting the number of double blocks from {params.depth} to {num_double_blocks}") + params = replace(params, depth=num_double_blocks) + if params.depth_single_blocks != num_single_blocks: + logger.info(f"Setting the number of single blocks from {params.depth_single_blocks} to {num_single_blocks}") + params = replace(params, depth_single_blocks=num_single_blocks) + + model = flux_models.Flux(params) if dtype is not None: model = model.to(dtype) @@ -86,7 +138,7 @@ def load_flow_model( # convert Diffusers to BFL if is_diffusers: logger.info("Converting Diffusers to BFL") - sd = convert_diffusers_sd_to_bfl(sd) + sd = convert_diffusers_sd_to_bfl(sd, num_double_blocks, num_single_blocks) logger.info("Converted Diffusers to BFL") info = model.load_state_dict(sd, strict=False, assign=True) @@ -349,16 +401,16 @@ def pack_latents(x: torch.Tensor) -> torch.Tensor: } -def make_diffusers_to_bfl_map() -> dict[str, tuple[int, str]]: +def make_diffusers_to_bfl_map(num_double_blocks: int, num_single_blocks: int) -> dict[str, tuple[int, str]]: # make reverse map from diffusers map diffusers_to_bfl_map = {} # key: diffusers_key, value: (index, bfl_key) - for b in range(NUM_DOUBLE_BLOCKS): + for b in range(num_double_blocks): for key, weights in BFL_TO_DIFFUSERS_MAP.items(): if key.startswith("double_blocks."): block_prefix = f"transformer_blocks.{b}." for i, weight in enumerate(weights): diffusers_to_bfl_map[f"{block_prefix}{weight}"] = (i, key.replace("()", f"{b}")) - for b in range(NUM_SINGLE_BLOCKS): + for b in range(num_single_blocks): for key, weights in BFL_TO_DIFFUSERS_MAP.items(): if key.startswith("single_blocks."): block_prefix = f"single_transformer_blocks.{b}." @@ -371,8 +423,10 @@ def make_diffusers_to_bfl_map() -> dict[str, tuple[int, str]]: return diffusers_to_bfl_map -def convert_diffusers_sd_to_bfl(diffusers_sd: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - diffusers_to_bfl_map = make_diffusers_to_bfl_map() +def convert_diffusers_sd_to_bfl( + diffusers_sd: dict[str, torch.Tensor], num_double_blocks: int = NUM_DOUBLE_BLOCKS, num_single_blocks: int = NUM_SINGLE_BLOCKS +) -> dict[str, torch.Tensor]: + diffusers_to_bfl_map = make_diffusers_to_bfl_map(num_double_blocks, num_single_blocks) # iterate over three safetensors files to reduce memory usage flux_sd = {} From 74228c9953b4ba0f8b0d68e8f6c8a8a6a469c2f5 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sun, 13 Oct 2024 16:27:22 +0900 Subject: [PATCH 178/582] update cache_latents/text_encoder_outputs --- library/strategy_base.py | 2 +- tools/cache_latents.py | 147 +++++++++++------------ tools/cache_text_encoder_outputs.py | 178 ++++++++++++++++------------ 3 files changed, 166 insertions(+), 161 deletions(-) diff --git a/library/strategy_base.py b/library/strategy_base.py index 2bff4178a..363996cec 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -325,7 +325,7 @@ class TextEncoderOutputsCachingStrategy: def __init__( self, cache_to_disk: bool, - batch_size: int, + batch_size: Optional[int], skip_disk_cache_validity_check: bool, is_partial: bool = False, is_weighted: bool = False, diff --git a/tools/cache_latents.py b/tools/cache_latents.py index 2f0098b42..d8154ec31 100644 --- a/tools/cache_latents.py +++ b/tools/cache_latents.py @@ -9,7 +9,7 @@ import torch from tqdm import tqdm -from library import config_util +from library import config_util, flux_utils, strategy_base, strategy_flux, strategy_sd, strategy_sdxl from library import 
train_util from library import sdxl_train_util from library.config_util import ( @@ -17,42 +17,73 @@ BlueprintGenerator, ) from library.utils import setup_logging, add_logging_arguments + setup_logging() import logging logger = logging.getLogger(__name__) +def set_tokenize_strategy(is_sd: bool, is_sdxl: bool, is_flux: bool, args: argparse.Namespace) -> None: + if is_flux: + _, is_schnell, _ = flux_utils.check_flux_state_dict_diffusers_schnell(args.pretrained_model_name_or_path) + else: + is_schnell = False + + if is_sd or is_sdxl: + tokenize_strategy = strategy_sd.SdTokenizeStrategy(args.v2, args.max_token_length, args.tokenizer_cache_dir) + elif is_sdxl: + tokenize_strategy = strategy_sdxl.SdxlTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) + else: + if args.t5xxl_max_token_length is None: + if is_schnell: + t5xxl_max_token_length = 256 + else: + t5xxl_max_token_length = 512 + else: + t5xxl_max_token_length = args.t5xxl_max_token_length + + logger.info(f"t5xxl_max_token_length: {t5xxl_max_token_length}") + tokenize_strategy = strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length, args.tokenizer_cache_dir) + strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + + def cache_to_disk(args: argparse.Namespace) -> None: setup_logging(args, reset=True) train_util.prepare_dataset_args(args, True) - # check cache latents arg - assert args.cache_latents_to_disk, "cache_latents_to_disk must be True / cache_latents_to_diskはTrueである必要があります" + # assert args.cache_latents_to_disk, "cache_latents_to_disk must be True / cache_latents_to_diskはTrueである必要があります" + args.cache_latents = True + args.cache_latents_to_disk = True use_dreambooth_method = args.in_json is None if args.seed is not None: set_seed(args.seed) # 乱数系列を初期化する - # tokenizerを準備する:datasetを動かすために必要 - if args.sdxl: - tokenizer1, tokenizer2 = sdxl_train_util.load_tokenizers(args) - tokenizers = [tokenizer1, tokenizer2] + is_sd = not args.sdxl and not args.flux + is_sdxl = args.sdxl + is_flux = args.flux + + set_tokenize_strategy(is_sd, is_sdxl, is_flux, args) + + if is_sd or is_sdxl: + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy(is_sd, True, args.vae_batch_size, args.skip_cache_check) else: - tokenizer = train_util.load_tokenizer(args) - tokenizers = [tokenizer] + latents_caching_strategy = strategy_flux.FluxLatentsCachingStrategy(True, args.vae_batch_size, args.skip_cache_check) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) # データセットを準備する + use_user_config = args.dataset_config is not None if args.dataset_class is None: - blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, False, True)) - if args.dataset_config is not None: - logger.info(f"Load dataset config from {args.dataset_config}") + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) + if use_user_config: + logger.info(f"Loading dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) - ignored = ["train_data_dir", "in_json"] + ignored = ["train_data_dir", "reg_data_dir", "in_json"] if any(getattr(args, attr) is not None for attr in ignored): logger.warning( - "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + "ignoring the following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( ", ".join(ignored) ) ) @@ -83,17 +114,11 @@ def cache_to_disk(args: argparse.Namespace) -> None: ] } - blueprint = 
blueprint_generator.generate(user_config, args, tokenizer=tokenizers) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizers) - - # datasetのcache_latentsを呼ばなければ、生の画像が返る - - current_epoch = Value("i", 0) - current_step = Value("i", 0) - ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) + # use arbitrary dataset class + train_dataset_group = train_util.load_arbitrary_dataset(args) # acceleratorを準備する logger.info("prepare accelerator") @@ -106,72 +131,27 @@ def cache_to_disk(args: argparse.Namespace) -> None: # モデルを読み込む logger.info("load model") - if args.sdxl: + if is_sd: + _, vae, _, _ = train_util.load_target_model(args, weight_dtype, accelerator) + elif is_sdxl: (_, _, _, vae, _, _, _) = sdxl_train_util.load_target_model(args, accelerator, "sdxl", weight_dtype) else: - _, vae, _, _ = train_util.load_target_model(args, weight_dtype, accelerator) + vae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + + if is_sd or is_sdxl: + if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える + vae.set_use_memory_efficient_attention_xformers(args.xformers) - if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える - vae.set_use_memory_efficient_attention_xformers(args.xformers) vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() - # dataloaderを準備する - train_dataset_group.set_caching_mode("latents") - - # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 - n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers - - train_dataloader = torch.utils.data.DataLoader( - train_dataset_group, - batch_size=1, - shuffle=True, - collate_fn=collator, - num_workers=n_workers, - persistent_workers=args.persistent_data_loader_workers, - ) - - # acceleratorを使ってモデルを準備する:マルチGPUで使えるようになるはず - train_dataloader = accelerator.prepare(train_dataloader) - - # データ取得のためのループ - for batch in tqdm(train_dataloader): - b_size = len(batch["images"]) - vae_batch_size = b_size if args.vae_batch_size is None else args.vae_batch_size - flip_aug = batch["flip_aug"] - alpha_mask = batch["alpha_mask"] - random_crop = batch["random_crop"] - bucket_reso = batch["bucket_reso"] - - # バッチを分割して処理する - for i in range(0, b_size, vae_batch_size): - images = batch["images"][i : i + vae_batch_size] - absolute_paths = batch["absolute_paths"][i : i + vae_batch_size] - resized_sizes = batch["resized_sizes"][i : i + vae_batch_size] - - image_infos = [] - for i, (image, absolute_path, resized_size) in enumerate(zip(images, absolute_paths, resized_sizes)): - image_info = train_util.ImageInfo(absolute_path, 1, "dummy", False, absolute_path) - image_info.image = image - image_info.bucket_reso = bucket_reso - image_info.resized_size = resized_size - image_info.latents_npz = os.path.splitext(absolute_path)[0] + ".npz" - - if args.skip_existing: - if train_util.is_disk_cached_latents_is_expected( - image_info.bucket_reso, image_info.latents_npz, flip_aug, alpha_mask - ): - logger.warning(f"Skipping {image_info.latents_npz} because it already exists.") - continue - - image_infos.append(image_info) - - if len(image_infos) > 0: - train_util.cache_batch_latents(vae, True, image_infos, flip_aug, alpha_mask, 
random_crop) + # cache latents with dataset + # TODO use DataLoader to speed up + train_dataset_group.new_cache_latents(vae, accelerator) accelerator.wait_for_everyone() - accelerator.print(f"Finished caching latents for {len(train_dataset_group)} batches.") + accelerator.print(f"Finished caching latents to disk.") def setup_parser() -> argparse.ArgumentParser: @@ -182,7 +162,11 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_training_arguments(parser, True) train_util.add_dataset_arguments(parser, True, True, True) config_util.add_config_arguments(parser) + parser.add_argument( + "--ae", type=str, default=None, help="Autoencoder model of FLUX to use / 使用するFLUXのオートエンコーダモデル" + ) parser.add_argument("--sdxl", action="store_true", help="Use SDXL model / SDXLモデルを使用する") + parser.add_argument("--flux", action="store_true", help="Use FLUX model / FLUXモデルを使用する") parser.add_argument( "--no_half_vae", action="store_true", @@ -191,7 +175,8 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--skip_existing", action="store_true", - help="skip images if npz already exists (both normal and flipped exists if flip_aug is enabled) / npzが既に存在する画像をスキップする(flip_aug有効時は通常、反転の両方が存在する画像をスキップ)", + help="[Deprecated] This option does not work. Existing .npz files are always checked. Use `--skip_cache_check` to skip the check." + " / [非推奨] このオプションは機能しません。既存の .npz は常に検証されます。`--skip_cache_check` で検証をスキップできます。", ) return parser diff --git a/tools/cache_text_encoder_outputs.py b/tools/cache_text_encoder_outputs.py index a75d9da74..d294d46c4 100644 --- a/tools/cache_text_encoder_outputs.py +++ b/tools/cache_text_encoder_outputs.py @@ -9,55 +9,68 @@ import torch from tqdm import tqdm -from library import config_util +from library import ( + config_util, + flux_train_utils, + flux_utils, + sdxl_model_util, + strategy_base, + strategy_flux, + strategy_sd, + strategy_sdxl, +) from library import train_util from library import sdxl_train_util +from library import utils from library.config_util import ( ConfigSanitizer, BlueprintGenerator, ) from library.utils import setup_logging, add_logging_arguments +from tools import cache_latents + setup_logging() import logging + logger = logging.getLogger(__name__) + def cache_to_disk(args: argparse.Namespace) -> None: setup_logging(args, reset=True) train_util.prepare_dataset_args(args, True) - # check cache arg - assert ( - args.cache_text_encoder_outputs_to_disk - ), "cache_text_encoder_outputs_to_disk must be True / cache_text_encoder_outputs_to_diskはTrueである必要があります" - - # できるだけ準備はしておくが今のところSDXLのみしか動かない - assert ( - args.sdxl - ), "cache_text_encoder_outputs_to_disk is only available for SDXL / cache_text_encoder_outputs_to_diskはSDXLのみ利用可能です" + args.cache_text_encoder_outputs = True + args.cache_text_encoder_outputs_to_disk = True use_dreambooth_method = args.in_json is None if args.seed is not None: set_seed(args.seed) # 乱数系列を初期化する - # tokenizerを準備する:datasetを動かすために必要 - if args.sdxl: - tokenizer1, tokenizer2 = sdxl_train_util.load_tokenizers(args) - tokenizers = [tokenizer1, tokenizer2] - else: - tokenizer = train_util.load_tokenizer(args) - tokenizers = [tokenizer] + is_sd = not args.sdxl and not args.flux + is_sdxl = args.sdxl + is_flux = args.flux + + assert ( + is_sdxl or is_flux + ), "Cache text encoder outputs to disk is only supported for SDXL and FLUX models / テキストエンコーダ出力のディスクキャッシュはSDXLまたはFLUXでのみ有効です" + assert ( + is_sdxl or args.weighted_captions is None + ), "Weighted captions are only supported for SDXL models / 
重み付きキャプションはSDXLモデルでのみ有効です" + + cache_latents.set_tokenize_strategy(is_sd, is_sdxl, is_flux, args) # データセットを準備する + use_user_config = args.dataset_config is not None if args.dataset_class is None: - blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, False, True)) - if args.dataset_config is not None: - logger.info(f"Load dataset config from {args.dataset_config}") + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) + if use_user_config: + logger.info(f"Loading dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) - ignored = ["train_data_dir", "in_json"] + ignored = ["train_data_dir", "reg_data_dir", "in_json"] if any(getattr(args, attr) is not None for attr in ignored): logger.warning( - "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + "ignoring the following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( ", ".join(ignored) ) ) @@ -88,15 +101,11 @@ def cache_to_disk(args: argparse.Namespace) -> None: ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizers) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizers) - - current_epoch = Value("i", 0) - current_step = Value("i", 0) - ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) + # use arbitrary dataset class + train_dataset_group = train_util.load_arbitrary_dataset(args) # acceleratorを準備する logger.info("prepare accelerator") @@ -105,66 +114,68 @@ def cache_to_disk(args: argparse.Namespace) -> None: # mixed precisionに対応した型を用意しておき適宜castする weight_dtype, _ = train_util.prepare_dtype(args) + t5xxl_dtype = utils.str_to_dtype(args.t5xxl_dtype, weight_dtype) # モデルを読み込む logger.info("load model") - if args.sdxl: - (_, text_encoder1, text_encoder2, _, _, _, _) = sdxl_train_util.load_target_model(args, accelerator, "sdxl", weight_dtype) + if is_sdxl: + _, text_encoder1, text_encoder2, _, _, _, _ = sdxl_train_util.load_target_model( + args, accelerator, sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, weight_dtype + ) + text_encoder1.to(accelerator.device, weight_dtype) + text_encoder2.to(accelerator.device, weight_dtype) text_encoders = [text_encoder1, text_encoder2] else: - text_encoder1, _, _, _ = train_util.load_target_model(args, weight_dtype, accelerator) - text_encoders = [text_encoder1] + clip_l = flux_utils.load_clip_l( + args.clip_l, weight_dtype, accelerator.device, disable_mmap=args.disable_mmap_load_safetensors + ) + + t5xxl = flux_utils.load_t5xxl(args.t5xxl, None, accelerator.device, disable_mmap=args.disable_mmap_load_safetensors) + + if t5xxl.dtype == torch.float8_e4m3fnuz or t5xxl.dtype == torch.float8_e5m2 or t5xxl.dtype == torch.float8_e5m2fnuz: + raise ValueError(f"Unsupported fp8 model dtype: {t5xxl.dtype}") + elif t5xxl.dtype == torch.float8_e4m3fn: + logger.info("Loaded fp8 T5XXL model") + + if t5xxl_dtype != t5xxl_dtype: + if t5xxl.dtype == torch.float8_e4m3fn and t5xxl_dtype.itemsize() >= 2: + logger.warning( + "The loaded model is fp8, but the specified T5XXL dtype is larger than fp8. This may cause a performance drop." 
+ " / ロードされたモデルはfp8ですが、指定されたT5XXLのdtypeがfp8より高精度です。精度低下が発生する可能性があります。" + ) + logger.info(f"Casting T5XXL model to {t5xxl_dtype}") + t5xxl.to(t5xxl_dtype) + + text_encoders = [clip_l, t5xxl] for text_encoder in text_encoders: - text_encoder.to(accelerator.device, dtype=weight_dtype) text_encoder.requires_grad_(False) text_encoder.eval() - # dataloaderを準備する - train_dataset_group.set_caching_mode("text") - - # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 - n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers - - train_dataloader = torch.utils.data.DataLoader( - train_dataset_group, - batch_size=1, - shuffle=True, - collate_fn=collator, - num_workers=n_workers, - persistent_workers=args.persistent_data_loader_workers, - ) + # build text encoder outputs caching strategy + if is_sdxl: + text_encoder_outputs_caching_strategy = strategy_sdxl.SdxlTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, None, args.skip_cache_check, is_weighted=args.weighted_captions + ) + else: + text_encoder_outputs_caching_strategy = strategy_flux.FluxTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, + args.text_encoder_batch_size, + args.skip_cache_check, + is_partial=False, + apply_t5_attn_mask=args.apply_t5_attn_mask, + ) + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_outputs_caching_strategy) + + # build text encoding strategy + if is_sdxl: + text_encoding_strategy = strategy_sdxl.SdxlTextEncodingStrategy() + else: + text_encoding_strategy = strategy_flux.FluxTextEncodingStrategy(args.apply_t5_attn_mask) + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) - # acceleratorを使ってモデルを準備する:マルチGPUで使えるようになるはず - train_dataloader = accelerator.prepare(train_dataloader) - - # データ取得のためのループ - for batch in tqdm(train_dataloader): - absolute_paths = batch["absolute_paths"] - input_ids1_list = batch["input_ids1_list"] - input_ids2_list = batch["input_ids2_list"] - - image_infos = [] - for absolute_path, input_ids1, input_ids2 in zip(absolute_paths, input_ids1_list, input_ids2_list): - image_info = train_util.ImageInfo(absolute_path, 1, "dummy", False, absolute_path) - image_info.text_encoder_outputs_npz = os.path.splitext(absolute_path)[0] + train_util.TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX - image_info - - if args.skip_existing: - if os.path.exists(image_info.text_encoder_outputs_npz): - logger.warning(f"Skipping {image_info.text_encoder_outputs_npz} because it already exists.") - continue - - image_info.input_ids1 = input_ids1 - image_info.input_ids2 = input_ids2 - image_infos.append(image_info) - - if len(image_infos) > 0: - b_input_ids1 = torch.stack([image_info.input_ids1 for image_info in image_infos]) - b_input_ids2 = torch.stack([image_info.input_ids2 for image_info in image_infos]) - train_util.cache_batch_text_encoder_outputs( - image_infos, tokenizers, text_encoders, args.max_token_length, True, b_input_ids1, b_input_ids2, weight_dtype - ) + # cache text encoder outputs + train_dataset_group.new_cache_text_encoder_outputs(text_encoders, accelerator) accelerator.wait_for_everyone() accelerator.print(f"Finished caching latents for {len(train_dataset_group)} batches.") @@ -179,11 +190,20 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_dataset_arguments(parser, True, True, True) config_util.add_config_arguments(parser) sdxl_train_util.add_sdxl_training_arguments(parser) + flux_train_utils.add_flux_train_arguments(parser) parser.add_argument("--sdxl", 
action="store_true", help="Use SDXL model / SDXLモデルを使用する") + parser.add_argument("--flux", action="store_true", help="Use FLUX model / FLUXモデルを使用する") + parser.add_argument( + "--t5xxl_dtype", + type=str, + default=None, + help="T5XXL model dtype, default: None (use mixed precision dtype) / T5XXLモデルのdtype, デフォルト: None (mixed precisionのdtypeを使用)", + ) parser.add_argument( "--skip_existing", action="store_true", - help="skip images if npz already exists (both normal and flipped exists if flip_aug is enabled) / npzが既に存在する画像をスキップする(flip_aug有効時は通常、反転の両方が存在する画像をスキップ)", + help="[Deprecated] This option does not work. Existing .npz files are always checked. Use `--skip_cache_check` to skip the check." + " / [非推奨] このオプションは機能しません。既存の .npz は常に検証されます。`--skip_cache_check` で検証をスキップできます。", ) return parser From 2244cf5b835cc35179f29b1babb4a2d19f54bfae Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sun, 13 Oct 2024 18:22:19 +0900 Subject: [PATCH 179/582] load images in parallel when caching latents --- library/train_util.py | 93 ++++++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 40 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 4e6b3408d..1db470d63 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3,6 +3,7 @@ import argparse import ast import asyncio +from concurrent.futures import Future, ThreadPoolExecutor import datetime import importlib import json @@ -1058,7 +1059,6 @@ def __eq__(self, other): and self.random_crop == other.random_crop ) - batches: List[Tuple[Condition, List[ImageInfo]]] = [] batch: List[ImageInfo] = [] current_condition = None @@ -1066,57 +1066,70 @@ def __eq__(self, other): num_processes = accelerator.num_processes process_index = accelerator.process_index - logger.info("checking cache validity...") - for i, info in enumerate(tqdm(image_infos)): - subset = self.image_to_subset[info.image_key] + # define a function to submit a batch to cache + def submit_batch(batch, cond): + for info in batch: + if info.image is not None and isinstance(info.image, Future): + info.image = info.image.result() # future to image + caching_strategy.cache_batch_latents(model, batch, cond.flip_aug, cond.alpha_mask, cond.random_crop) - if info.latents_npz is not None: # fine tuning dataset - continue + # define ThreadPoolExecutor to load images in parallel + max_workers = min(os.cpu_count(), len(image_infos)) + max_workers = max(1, max_workers // num_processes) # consider multi-gpu + max_workers = min(max_workers, caching_strategy.batch_size) # max_workers should be less than batch_size + executor = ThreadPoolExecutor(max_workers) - # check disk cache exists and size of latents - if caching_strategy.cache_to_disk: - # info.latents_npz = os.path.splitext(info.absolute_path)[0] + file_suffix - info.latents_npz = caching_strategy.get_latents_npz_path(info.absolute_path, info.image_size) + try: + # iterate images + logger.info("caching latents...") + for i, info in enumerate(tqdm(image_infos)): + subset = self.image_to_subset[info.image_key] - # if the modulo of num_processes is not equal to process_index, skip caching - # this makes each process cache different latents - if i % num_processes != process_index: + if info.latents_npz is not None: # fine tuning dataset continue - # print(f"{process_index}/{num_processes} {i}/{len(image_infos)} {info.latents_npz}") + # check disk cache exists and size of latents + if caching_strategy.cache_to_disk: + # info.latents_npz = os.path.splitext(info.absolute_path)[0] + file_suffix + info.latents_npz = 
caching_strategy.get_latents_npz_path(info.absolute_path, info.image_size) - cache_available = caching_strategy.is_disk_cached_latents_expected( - info.bucket_reso, info.latents_npz, subset.flip_aug, subset.alpha_mask - ) - if cache_available: # do not add to batch - continue + # if the modulo of num_processes is not equal to process_index, skip caching + # this makes each process cache different latents + if i % num_processes != process_index: + continue - # if batch is not empty and condition is changed, flush the batch. Note that current_condition is not None if batch is not empty - condition = Condition(info.bucket_reso, subset.flip_aug, subset.alpha_mask, subset.random_crop) - if len(batch) > 0 and current_condition != condition: - batches.append((current_condition, batch)) - batch = [] + # print(f"{process_index}/{num_processes} {i}/{len(image_infos)} {info.latents_npz}") - batch.append(info) - current_condition = condition + cache_available = caching_strategy.is_disk_cached_latents_expected( + info.bucket_reso, info.latents_npz, subset.flip_aug, subset.alpha_mask + ) + if cache_available: # do not add to batch + continue - # if number of data in batch is enough, flush the batch - if len(batch) >= caching_strategy.batch_size: - batches.append((current_condition, batch)) - batch = [] - current_condition = None + # if batch is not empty and condition is changed, flush the batch. Note that current_condition is not None if batch is not empty + condition = Condition(info.bucket_reso, subset.flip_aug, subset.alpha_mask, subset.random_crop) + if len(batch) > 0 and current_condition != condition: + submit_batch(batch, current_condition) + batch = [] - if len(batch) > 0: - batches.append((current_condition, batch)) + if info.image is None: + # load image in parallel + info.image = executor.submit(load_image, info.absolute_path, condition.alpha_mask) - if len(batches) == 0: - logger.info("no latents to cache") - return + batch.append(info) + current_condition = condition - # iterate batches: batch doesn't have image here. 
image will be loaded in cache_batch_latents and discarded - logger.info("caching latents...") - for condition, batch in tqdm(batches, smoothing=1, total=len(batches)): - caching_strategy.cache_batch_latents(model, batch, condition.flip_aug, condition.alpha_mask, condition.random_crop) + # if number of data in batch is enough, flush the batch + if len(batch) >= caching_strategy.batch_size: + submit_batch(batch, current_condition) + batch = [] + current_condition = None + + if len(batch) > 0: + submit_batch(batch, current_condition) + + finally: + executor.shutdown() def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True, file_suffix=".npz"): # マルチGPUには対応していないので、そちらはtools/cache_latents.pyを使うこと From bfc3a65acda7f90abef9c16db279d2952f73fb77 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sun, 13 Oct 2024 19:08:16 +0900 Subject: [PATCH 180/582] fix to work cache latents/text encoder outputs --- library/train_util.py | 11 +++++++---- tools/cache_latents.py | 11 ++++++----- tools/cache_text_encoder_outputs.py | 18 +++++++++++++----- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 1db470d63..926609267 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4064,15 +4064,18 @@ def verify_command_line_training_args(args: argparse.Namespace): ) +def enable_high_vram(args: argparse.Namespace): + if args.highvram: + logger.info("highvram is enabled / highvramが有効です") + global HIGH_VRAM + HIGH_VRAM = True + def verify_training_args(args: argparse.Namespace): r""" Verify training arguments. Also reflect highvram option to global variable 学習用引数を検証する。あわせて highvram オプションの指定をグローバル変数に反映する """ - if args.highvram: - print("highvram is enabled / highvramが有効です") - global HIGH_VRAM - HIGH_VRAM = True + enable_high_vram(args) if args.v_parameterization and not args.v2: logger.warning( diff --git a/tools/cache_latents.py b/tools/cache_latents.py index d8154ec31..e2faa58a7 100644 --- a/tools/cache_latents.py +++ b/tools/cache_latents.py @@ -9,7 +9,7 @@ import torch from tqdm import tqdm -from library import config_util, flux_utils, strategy_base, strategy_flux, strategy_sd, strategy_sdxl +from library import config_util, flux_train_utils, flux_utils, strategy_base, strategy_flux, strategy_sd, strategy_sdxl from library import train_util from library import sdxl_train_util from library.config_util import ( @@ -30,7 +30,7 @@ def set_tokenize_strategy(is_sd: bool, is_sdxl: bool, is_flux: bool, args: argpa else: is_schnell = False - if is_sd or is_sdxl: + if is_sd: tokenize_strategy = strategy_sd.SdTokenizeStrategy(args.v2, args.max_token_length, args.tokenizer_cache_dir) elif is_sdxl: tokenize_strategy = strategy_sdxl.SdxlTokenizeStrategy(args.max_token_length, args.tokenizer_cache_dir) @@ -51,6 +51,7 @@ def set_tokenize_strategy(is_sd: bool, is_sdxl: bool, is_flux: bool, args: argpa def cache_to_disk(args: argparse.Namespace) -> None: setup_logging(args, reset=True) train_util.prepare_dataset_args(args, True) + train_util.enable_high_vram(args) # assert args.cache_latents_to_disk, "cache_latents_to_disk must be True / cache_latents_to_diskはTrueである必要があります" args.cache_latents = True @@ -161,10 +162,10 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_training_arguments(parser, True) train_util.add_dataset_arguments(parser, True, True, True) + train_util.add_masked_loss_arguments(parser) config_util.add_config_arguments(parser) - parser.add_argument( - 
"--ae", type=str, default=None, help="Autoencoder model of FLUX to use / 使用するFLUXのオートエンコーダモデル" - ) + flux_train_utils.add_flux_train_arguments(parser) + parser.add_argument("--sdxl", action="store_true", help="Use SDXL model / SDXLモデルを使用する") parser.add_argument("--flux", action="store_true", help="Use FLUX model / FLUXモデルを使用する") parser.add_argument( diff --git a/tools/cache_text_encoder_outputs.py b/tools/cache_text_encoder_outputs.py index d294d46c4..7be9ad781 100644 --- a/tools/cache_text_encoder_outputs.py +++ b/tools/cache_text_encoder_outputs.py @@ -27,7 +27,7 @@ BlueprintGenerator, ) from library.utils import setup_logging, add_logging_arguments -from tools import cache_latents +from cache_latents import set_tokenize_strategy setup_logging() import logging @@ -38,6 +38,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: setup_logging(args, reset=True) train_util.prepare_dataset_args(args, True) + train_util.enable_high_vram(args) args.cache_text_encoder_outputs = True args.cache_text_encoder_outputs_to_disk = True @@ -57,8 +58,8 @@ def cache_to_disk(args: argparse.Namespace) -> None: assert ( is_sdxl or args.weighted_captions is None ), "Weighted captions are only supported for SDXL models / 重み付きキャプションはSDXLモデルでのみ有効です" - - cache_latents.set_tokenize_strategy(is_sd, is_sdxl, is_flux, args) + + set_tokenize_strategy(is_sd, is_sdxl, is_flux, args) # データセットを準備する use_user_config = args.dataset_config is not None @@ -178,7 +179,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: train_dataset_group.new_cache_text_encoder_outputs(text_encoders, accelerator) accelerator.wait_for_everyone() - accelerator.print(f"Finished caching latents for {len(train_dataset_group)} batches.") + accelerator.print(f"Finished caching text encoder outputs to disk.") def setup_parser() -> argparse.ArgumentParser: @@ -188,9 +189,10 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_training_arguments(parser, True) train_util.add_dataset_arguments(parser, True, True, True) + train_util.add_masked_loss_arguments(parser) config_util.add_config_arguments(parser) - sdxl_train_util.add_sdxl_training_arguments(parser) flux_train_utils.add_flux_train_arguments(parser) + parser.add_argument("--sdxl", action="store_true", help="Use SDXL model / SDXLモデルを使用する") parser.add_argument("--flux", action="store_true", help="Use FLUX model / FLUXモデルを使用する") parser.add_argument( @@ -205,6 +207,12 @@ def setup_parser() -> argparse.ArgumentParser: help="[Deprecated] This option does not work. Existing .npz files are always checked. Use `--skip_cache_check` to skip the check." " / [非推奨] このオプションは機能しません。既存の .npz は常に検証されます。`--skip_cache_check` で検証をスキップできます。", ) + parser.add_argument( + "--weighted_captions", + action="store_true", + default=False, + help="Enable weighted captions in the standard style (token:1.3). No commas inside parens, or shuffle/dropout may break the decoder. 
/ 「[token]」、「(token)」「(token:1.3)」のような重み付きキャプションを有効にする。カンマを括弧内に入れるとシャッフルやdropoutで重みづけがおかしくなるので注意", + ) return parser From 2d5f7fa709c31d07a1bb44b5be391c29b77d3cfc Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sun, 13 Oct 2024 19:23:21 +0900 Subject: [PATCH 181/582] update README --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 544c665de..7fae50d1a 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,15 @@ The command to install PyTorch is as follows: ### Recent Updates -Oct 12, 2024 (update 1): +Oct 13, 2024: + +- Fixed an issue where it took a long time to load the image size when initializing the dataset, especially when the number of images in the dataset was large. - During multi-GPU training, caching of latents and Text Encoder outputs is now done in multi-GPU. -- `--text_encoder_batch_size` option is enabled for FLUX.1 LoRA training and fine tuning. This option specifies the batch size for caching Text Encoder outputs (not for training). The default is same as the dataset batch size. If you have enough VRAM, you can increase the batch size to speed up the caching. + - Please make sure that `--highvram` and `--vae_batch_size` are specified correctly. If you have enough VRAM, you can increase the batch size to speed up the caching. + - `--text_encoder_batch_size` option is enabled for FLUX.1 LoRA training and fine tuning. This option specifies the batch size for caching Text Encoder outputs (not for training). The default is same as the dataset batch size. If you have enough VRAM, you can increase the batch size to speed up the caching. + - Multi-threading is also implemented for caching of latents. This may speed up the caching process about 5% (depends on the environment). + - `tools/cache_latents.py` and `tools/cache_text_encoder_outputs.py` also have been updated to support multi-GPU caching. - `--skip_cache_check` option is added to each training script. - When specified, the consistency check of the cache file `*.npz` contents (e.g., image size and flip for latents, mask for Text Encoder outputs) is skipped. - Specify this option if you have a large number of cache files and the consistency check takes time. 
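The multi-GPU and multi-threaded caching described in the notes above comes down to a small pattern: each process claims every `num_processes`-th item, images are loaded in a thread pool while earlier batches are encoded, and all processes synchronize afterwards (`accelerator.wait_for_everyone()` in the tools above). The sketch below only illustrates that idea — `cache_latents_sharded`, `load_image`, and `cache_batch` are placeholder names, and the bucket/condition grouping, the `.npz` validity checks controlled by `--skip_cache_check`, and the VAE batching of the real `new_cache_latents` are omitted.

```python
import os
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List, Sequence, Tuple


def cache_latents_sharded(
    image_paths: Sequence[str],
    load_image: Callable[[str], object],
    cache_batch: Callable[[List[Tuple[str, object]]], None],
    num_processes: int = 1,
    process_index: int = 0,
    batch_size: int = 4,
) -> None:
    # each process caches every num_processes-th image, so multi-GPU runs work on disjoint shards
    my_items = [p for i, p in enumerate(image_paths) if i % num_processes == process_index]

    # load images in background threads; worker count is bounded by CPUs per process and by batch_size
    max_workers = max(1, min((os.cpu_count() or 1) // max(num_processes, 1), batch_size))
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        batch: List[Tuple[str, object]] = []
        for path in my_items:
            batch.append((path, executor.submit(load_image, path)))  # keep a Future, resolve when flushing
            if len(batch) >= batch_size:
                cache_batch([(p, f.result()) for p, f in batch])
                batch = []
        if batch:  # flush the remainder
            cache_batch([(p, f.result()) for p, f in batch])


if __name__ == "__main__":
    # smoke test with dummy callables, single process
    cache_latents_sharded(
        ["a.png", "b.png", "c.png", "d.png", "e.png"],
        load_image=lambda p: f"pixels of {p}",
        cache_batch=lambda items: print("encode + save:", [p for p, _ in items]),
        batch_size=2,
    )
```

In an `accelerate` launch, `num_processes` and `process_index` would come from the `Accelerator`, which is what `train_dataset_group.new_cache_latents` and the updated `tools/cache_latents.py` / `tools/cache_text_encoder_outputs.py` rely on.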
From 2500f5a79806fdbe74c43db24a95ee19329a8fcc Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 15 Oct 2024 07:16:34 +0900 Subject: [PATCH 182/582] fix latents caching not working closes #1696 --- fine_tune.py | 2 +- flux_train.py | 2 +- sd3_train.py | 2 +- sdxl_train.py | 2 +- sdxl_train_control_net.py | 2 +- train_db.py | 2 +- train_textual_inversion.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index cdc005d9a..0b7cc5100 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -177,7 +177,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): vae.requires_grad_(False) vae.eval() - train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + train_dataset_group.new_cache_latents(vae, accelerator) vae.to("cpu") clean_memory_on_device(accelerator.device) diff --git a/flux_train.py b/flux_train.py index 46a8babdb..91ae3af57 100644 --- a/flux_train.py +++ b/flux_train.py @@ -190,7 +190,7 @@ def train(args): ae.requires_grad_(False) ae.eval() - train_dataset_group.new_cache_latents(ae, accelerator.is_main_process) + train_dataset_group.new_cache_latents(ae, accelerator) ae.to("cpu") # if no sampling, vae can be deleted clean_memory_on_device(accelerator.device) diff --git a/sd3_train.py b/sd3_train.py index 7290956ad..ef18c32c4 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -243,7 +243,7 @@ def train(args): vae.requires_grad_(False) vae.eval() - train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + train_dataset_group.new_cache_latents(vae, accelerator) vae.to("cpu") # if no sampling, vae can be deleted clean_memory_on_device(accelerator.device) diff --git a/sdxl_train.py b/sdxl_train.py index 9b2d19165..79a2fbb6e 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -272,7 +272,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): vae.requires_grad_(False) vae.eval() - train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + train_dataset_group.new_cache_latents(vae, accelerator) vae.to("cpu") clean_memory_on_device(accelerator.device) diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index 74b3a64a4..24080afbd 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -209,7 +209,7 @@ def unwrap_model(model): vae.requires_grad_(False) vae.eval() - train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + train_dataset_group.new_cache_latents(vae, accelerator) vae.to("cpu") clean_memory_on_device(accelerator.device) diff --git a/train_db.py b/train_db.py index 683b42332..4a58e27b0 100644 --- a/train_db.py +++ b/train_db.py @@ -156,7 +156,7 @@ def train(args): vae.requires_grad_(False) vae.eval() - train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + train_dataset_group.new_cache_latents(vae, accelerator) vae.to("cpu") clean_memory_on_device(accelerator.device) diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 4d8a3abbf..77b5d717a 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -378,7 +378,7 @@ def train(self, args): vae.requires_grad_(False) vae.eval() - train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + train_dataset_group.new_cache_latents(vae, accelerator) clean_memory_on_device(accelerator.device) accelerator.wait_for_everyone() From 3cc5b8db99c66b9e205c4fd4a5f969090c51ef58 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 18 Oct 2024 20:57:13 +0900 Subject: [PATCH 183/582] Diff Output Preserv loss for SDXL --- library/config_util.py | 17 
+++++++---------- library/train_util.py | 17 ++++++++++++++++- sdxl_train_network.py | 20 +++++++++++++++++++- train_network.py | 35 +++++++++++++++++++++++++---------- 4 files changed, 67 insertions(+), 22 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index f8cdfe60a..fc1fbf46d 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -10,13 +10,7 @@ from pathlib import Path # from toolz import curry -from typing import ( - List, - Optional, - Sequence, - Tuple, - Union, -) +from typing import Dict, List, Optional, Sequence, Tuple, Union import toml import voluptuous @@ -78,6 +72,7 @@ class BaseSubsetParams: caption_tag_dropout_rate: float = 0.0 token_warmup_min: int = 1 token_warmup_step: float = 0 + custom_attributes: Optional[Dict[str, Any]] = None @dataclass @@ -197,6 +192,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "token_warmup_step": Any(float, int), "caption_prefix": str, "caption_suffix": str, + "custom_attributes": dict, } # DO means DropOut DO_SUBSET_ASCENDABLE_SCHEMA = { @@ -538,9 +534,10 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu flip_aug: {subset.flip_aug} face_crop_aug_range: {subset.face_crop_aug_range} random_crop: {subset.random_crop} - token_warmup_min: {subset.token_warmup_min}, - token_warmup_step: {subset.token_warmup_step}, - alpha_mask: {subset.alpha_mask}, + token_warmup_min: {subset.token_warmup_min} + token_warmup_step: {subset.token_warmup_step} + alpha_mask: {subset.alpha_mask} + custom_attributes: {subset.custom_attributes} """ ), " ", diff --git a/library/train_util.py b/library/train_util.py index 4a446e81c..7d3fce5b2 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -396,6 +396,7 @@ def __init__( caption_suffix: Optional[str], token_warmup_min: int, token_warmup_step: Union[float, int], + custom_attributes: Optional[Dict[str, Any]] = None, ) -> None: self.image_dir = image_dir self.alpha_mask = alpha_mask if alpha_mask is not None else False @@ -419,6 +420,8 @@ def __init__( self.token_warmup_min = token_warmup_min # step=0におけるタグの数 self.token_warmup_step = token_warmup_step # N(N<1ならN*max_train_steps)ステップ目でタグの数が最大になる + self.custom_attributes = custom_attributes if custom_attributes is not None else {} + self.img_count = 0 @@ -449,6 +452,7 @@ def __init__( caption_suffix, token_warmup_min, token_warmup_step, + custom_attributes: Optional[Dict[str, Any]] = None, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -473,6 +477,7 @@ def __init__( caption_suffix, token_warmup_min, token_warmup_step, + custom_attributes=custom_attributes, ) self.is_reg = is_reg @@ -512,6 +517,7 @@ def __init__( caption_suffix, token_warmup_min, token_warmup_step, + custom_attributes: Optional[Dict[str, Any]] = None, ) -> None: assert metadata_file is not None, "metadata_file must be specified / metadata_fileは指定が必須です" @@ -536,6 +542,7 @@ def __init__( caption_suffix, token_warmup_min, token_warmup_step, + custom_attributes=custom_attributes, ) self.metadata_file = metadata_file @@ -1474,11 +1481,14 @@ def __getitem__(self, index): target_sizes_hw = [] flippeds = [] # 変数名が微妙 text_encoder_outputs_list = [] + custom_attributes = [] for image_key in bucket[image_index : image_index + bucket_batch_size]: image_info = self.image_data[image_key] subset = self.image_to_subset[image_key] + custom_attributes.append(subset.custom_attributes) + # in case of fine tuning, is_reg is always False 
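# for subsets flagged with custom_attributes.diff_output_preservation (defined with is_reg = true),
# this prior_loss_weight is also what weights the differential output preservation loss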
loss_weights.append(self.prior_loss_weight if image_info.is_reg else 1.0) @@ -1646,7 +1656,9 @@ def none_or_stack_elements(tensors_list, converter): return None return [torch.stack([converter(x[i]) for x in tensors_list]) for i in range(len(tensors_list[0]))] + # set example example = {} + example["custom_attributes"] = custom_attributes # may be list of empty dict example["loss_weights"] = torch.FloatTensor(loss_weights) example["text_encoder_outputs_list"] = none_or_stack_elements(text_encoder_outputs_list, torch.FloatTensor) example["input_ids_list"] = none_or_stack_elements(input_ids_list, lambda x: x) @@ -2630,7 +2642,9 @@ def debug_dataset(train_dataset, show_input_ids=False): f'{ik}, size: {train_dataset.image_data[ik].image_size}, loss weight: {lw}, caption: "{cap}", original size: {orgsz}, crop top left: {crptl}, target size: {trgsz}, flipped: {flpdz}' ) if "network_multipliers" in example: - print(f"network multiplier: {example['network_multipliers'][j]}") + logger.info(f"network multiplier: {example['network_multipliers'][j]}") + if "custom_attributes" in example: + logger.info(f"custom attributes: {example['custom_attributes'][j]}") # if show_input_ids: # logger.info(f"input ids: {iid}") @@ -4091,6 +4105,7 @@ def enable_high_vram(args: argparse.Namespace): global HIGH_VRAM HIGH_VRAM = True + def verify_training_args(args: argparse.Namespace): r""" Verify training arguments. Also reflect highvram option to global variable diff --git a/sdxl_train_network.py b/sdxl_train_network.py index 4a16a4891..d45df6e05 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -1,4 +1,5 @@ import argparse +from typing import List, Optional import torch from accelerate import Accelerator @@ -172,7 +173,18 @@ def get_text_cond(self, args, accelerator, batch, tokenizers, text_encoders, wei return encoder_hidden_states1, encoder_hidden_states2, pool2 - def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): + def call_unet( + self, + args, + accelerator, + unet, + noisy_latents, + timesteps, + text_conds, + batch, + weight_dtype, + indices: Optional[List[int]] = None, + ): noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype # get size embeddings @@ -186,6 +198,12 @@ def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_cond vector_embedding = torch.cat([pool2, embs], dim=1).to(weight_dtype) text_embedding = torch.cat([encoder_hidden_states1, encoder_hidden_states2], dim=2).to(weight_dtype) + if indices is not None and len(indices) > 0: + noisy_latents = noisy_latents[indices] + timesteps = timesteps[indices] + text_embedding = text_embedding[indices] + vector_embedding = vector_embedding[indices] + noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) return noise_pred diff --git a/train_network.py b/train_network.py index d5330aef4..ef766737d 100644 --- a/train_network.py +++ b/train_network.py @@ -143,7 +143,7 @@ def cache_text_encoder_outputs_if_needed(self, args, accelerator, unet, vae, tex for t_enc in text_encoders: t_enc.to(accelerator.device, dtype=weight_dtype) - def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): + def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype, **kwargs): noise_pred = unet(noisy_latents, timesteps, text_conds[0]).sample return noise_pred @@ -218,6 +218,30 @@ def get_noise_pred_and_target( else: target = noise + # 
differential output preservation + if "custom_attributes" in batch: + diff_output_pr_indices = [] + for i, custom_attributes in enumerate(batch["custom_attributes"]): + if "diff_output_preservation" in custom_attributes and custom_attributes["diff_output_preservation"]: + diff_output_pr_indices.append(i) + + if len(diff_output_pr_indices) > 0: + network.set_multiplier(0.0) + with torch.no_grad(), accelerator.autocast(): + noise_pred_prior = self.call_unet( + args, + accelerator, + unet, + noisy_latents, + timesteps, + text_encoder_conds, + batch, + weight_dtype, + indices=diff_output_pr_indices, + ) + network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step + target[diff_output_pr_indices] = noise_pred_prior.to(target.dtype) + return noise_pred, target, timesteps, huber_c, None def post_process_loss(self, loss, args, timesteps, noise_scheduler): @@ -1123,15 +1147,6 @@ def remove_model(old_ckpt_name): with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: - # # SD only - # encoded_text_encoder_conds = get_weighted_text_embeddings( - # tokenizers[0], - # text_encoder, - # batch["captions"], - # accelerator.device, - # args.max_token_length // 75 if args.max_token_length else 1, - # clip_skip=args.clip_skip, - # ) input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch["captions"]) encoded_text_encoder_conds = text_encoding_strategy.encode_tokens_with_weights( tokenize_strategy, From d8d7142665a8f6b2d43827c9b3a6a2de009c09cb Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 18 Oct 2024 23:16:30 +0900 Subject: [PATCH 184/582] fix to work caching latents #1696 --- sdxl_train_control_net_lllite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 14ff7c240..913b1d435 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -181,7 +181,7 @@ def train(args): vae.requires_grad_(False) vae.eval() - train_dataset_group.new_cache_latents(vae, accelerator.is_main_process) + train_dataset_group.new_cache_latents(vae, accelerator) vae.to("cpu") clean_memory_on_device(accelerator.device) From ef70aa7b42b5c923cc1a8594b2f30487a2b4f700 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Fri, 18 Oct 2024 23:39:48 +0900 Subject: [PATCH 185/582] add FLUX.1 support --- README.md | 19 +++++++ flux_train_network.py | 123 ++++++++++++++++++++++++++++-------------- 2 files changed, 103 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 7fae50d1a..59f70ebcd 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,25 @@ The command to install PyTorch is as follows: ### Recent Updates +Oct 19, 2024: + +- Added an implementation of Differential Output Preservation (temporary name) for SDXL/FLUX.1 LoRA training. + - A method to make the output of LoRA closer to the output when LoRA is not applied, with captions that do not contain trigger words. + - Define a Dataset subset for the regularization image (`is_reg = true`) with `.toml`. Add `custom_attributes.diff_output_preservation = true`. + - See [dataset configuration](docs/config_README-en.md) for the regularization dataset. + - Specify "number of training images x number of epochs >= number of regularization images x number of epochs". + - Specify a large value for `--prior_loss_weight` option (not dataset config). We recommend 10-1000. 
+ - Set the loss in the training without using the regularization image to be close to the loss in the training using DOP. +``` +[[datasets.subsets]] +image_dir = "path/to/image/dir" +num_repeats = 1 +is_reg = true +custom_attributes.diff_output_preservation = true # Add this +``` + + + Oct 13, 2024: - Fixed an issue where it took a long time to load the image size when initializing the dataset, especially when the number of images in the dataset was large. diff --git a/flux_train_network.py b/flux_train_network.py index aa92fe3ae..8431a6dc9 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -373,33 +373,13 @@ def get_noise_pred_and_target( if not args.apply_t5_attn_mask: t5_attn_mask = None - if not args.split_mode: - # normal forward - with accelerator.autocast(): - # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) - model_pred = unet( - img=packed_noisy_model_input, - img_ids=img_ids, - txt=t5_out, - txt_ids=txt_ids, - y=l_pooled, - timesteps=timesteps / 1000, - guidance=guidance_vec, - txt_attention_mask=t5_attn_mask, - ) - else: - # split forward to reduce memory usage - assert network.train_blocks == "single", "train_blocks must be single for split mode" - with accelerator.autocast(): - # move flux lower to cpu, and then move flux upper to gpu - unet.to("cpu") - clean_memory_on_device(accelerator.device) - self.flux_upper.to(accelerator.device) - - # upper model does not require grad - with torch.no_grad(): - intermediate_img, intermediate_txt, vec, pe = self.flux_upper( - img=packed_noisy_model_input, + def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask): + if not args.split_mode: + # normal forward + with accelerator.autocast(): + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) + model_pred = unet( + img=img, img_ids=img_ids, txt=t5_out, txt_ids=txt_ids, @@ -408,18 +388,52 @@ def get_noise_pred_and_target( guidance=guidance_vec, txt_attention_mask=t5_attn_mask, ) - - # move flux upper back to cpu, and then move flux lower to gpu - self.flux_upper.to("cpu") - clean_memory_on_device(accelerator.device) - unet.to(accelerator.device) - - # lower model requires grad - intermediate_img.requires_grad_(True) - intermediate_txt.requires_grad_(True) - vec.requires_grad_(True) - pe.requires_grad_(True) - model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask) + else: + # split forward to reduce memory usage + assert network.train_blocks == "single", "train_blocks must be single for split mode" + with accelerator.autocast(): + # move flux lower to cpu, and then move flux upper to gpu + unet.to("cpu") + clean_memory_on_device(accelerator.device) + self.flux_upper.to(accelerator.device) + + # upper model does not require grad + with torch.no_grad(): + intermediate_img, intermediate_txt, vec, pe = self.flux_upper( + img=packed_noisy_model_input, + img_ids=img_ids, + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + timesteps=timesteps / 1000, + guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, + ) + + # move flux upper back to cpu, and then move flux lower to gpu + self.flux_upper.to("cpu") + clean_memory_on_device(accelerator.device) + unet.to(accelerator.device) + + # lower model requires grad + 
intermediate_img.requires_grad_(True) + intermediate_txt.requires_grad_(True) + vec.requires_grad_(True) + pe.requires_grad_(True) + model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask) + + return model_pred + + model_pred = call_dit( + img=packed_noisy_model_input, + img_ids=img_ids, + t5_out=t5_out, + txt_ids=txt_ids, + l_pooled=l_pooled, + timesteps=timesteps, + guidance_vec=guidance_vec, + t5_attn_mask=t5_attn_mask, + ) # unpack latents model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) @@ -430,6 +444,37 @@ def get_noise_pred_and_target( # flow matching loss: this is different from SD3 target = noise - latents + # differential output preservation + if "custom_attributes" in batch: + diff_output_pr_indices = [] + for i, custom_attributes in enumerate(batch["custom_attributes"]): + if "diff_output_preservation" in custom_attributes and custom_attributes["diff_output_preservation"]: + diff_output_pr_indices.append(i) + + if len(diff_output_pr_indices) > 0: + network.set_multiplier(0.0) + with torch.no_grad(), accelerator.autocast(): + model_pred_prior = call_dit( + img=packed_noisy_model_input[diff_output_pr_indices], + img_ids=img_ids[diff_output_pr_indices], + t5_out=t5_out[diff_output_pr_indices], + txt_ids=txt_ids[diff_output_pr_indices], + l_pooled=l_pooled[diff_output_pr_indices], + timesteps=timesteps[diff_output_pr_indices], + guidance_vec=guidance_vec[diff_output_pr_indices] if guidance_vec is not None else None, + t5_attn_mask=t5_attn_mask[diff_output_pr_indices] if t5_attn_mask is not None else None, + ) + network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step + + model_pred_prior = flux_utils.unpack_latents(model_pred_prior, packed_latent_height, packed_latent_width) + model_pred_prior, _ = flux_train_utils.apply_model_prediction_type( + args, + model_pred_prior, + noisy_model_input[diff_output_pr_indices], + sigmas[diff_output_pr_indices] if sigmas is not None else None, + ) + target[diff_output_pr_indices] = model_pred_prior.to(target.dtype) + return model_pred, target, timesteps, None, weighting def post_process_loss(self, loss, args, timesteps, noise_scheduler): From 2c45d979e696fd4412ae1336feaee3bc9b967af4 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sat, 19 Oct 2024 19:21:12 +0900 Subject: [PATCH 186/582] update README, remove unnecessary autocast --- README.md | 10 ++++------ flux_train_network.py | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 59f70ebcd..32ee38573 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,13 @@ The command to install PyTorch is as follows: Oct 19, 2024: -- Added an implementation of Differential Output Preservation (temporary name) for SDXL/FLUX.1 LoRA training. +- Added an implementation of Differential Output Preservation (temporary name) for SDXL/FLUX.1 LoRA training. SD1/2 is not tested yet. This is an experimental feature. - A method to make the output of LoRA closer to the output when LoRA is not applied, with captions that do not contain trigger words. - Define a Dataset subset for the regularization image (`is_reg = true`) with `.toml`. Add `custom_attributes.diff_output_preservation = true`. - See [dataset configuration](docs/config_README-en.md) for the regularization dataset. - - Specify "number of training images x number of epochs >= number of regularization images x number of epochs". 
- - Specify a large value for `--prior_loss_weight` option (not dataset config). We recommend 10-1000. - - Set the loss in the training without using the regularization image to be close to the loss in the training using DOP. + - Specify "number of training images x number of repeats >= number of regularization images x number of repeats". + - Specify a large value for `--prior_loss_weight` option (not dataset config). The appropriate value is unknown, but try around 10-100. Note that the default is 1.0. + - You may want to start with 2/3 to 3/4 of the loss value when DOP is not applied. If it is 1/2, DOP may not be working. ``` [[datasets.subsets]] image_dir = "path/to/image/dir" @@ -28,8 +28,6 @@ is_reg = true custom_attributes.diff_output_preservation = true # Add this ``` - - Oct 13, 2024: - Fixed an issue where it took a long time to load the image size when initializing the dataset, especially when the number of images in the dataset was large. diff --git a/flux_train_network.py b/flux_train_network.py index 8431a6dc9..9cc8811b5 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -453,7 +453,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t if len(diff_output_pr_indices) > 0: network.set_multiplier(0.0) - with torch.no_grad(), accelerator.autocast(): + with torch.no_grad(): model_pred_prior = call_dit( img=packed_noisy_model_input[diff_output_pr_indices], img_ids=img_ids[diff_output_pr_indices], From 7fe8e162cb54ccf259eead1cca0ebdcc4e2b77fe Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 20 Oct 2024 08:45:27 +0900 Subject: [PATCH 187/582] fix to work ControlNetSubset with custom_attributes --- library/train_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/library/train_util.py b/library/train_util.py index 7d3fce5b2..462c7a9a2 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -578,6 +578,7 @@ def __init__( caption_suffix, token_warmup_min, token_warmup_step, + custom_attributes: Optional[Dict[str, Any]] = None, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -602,6 +603,7 @@ def __init__( caption_suffix, token_warmup_min, token_warmup_step, + custom_attributes=custom_attributes, ) self.conditioning_data_dir = conditioning_data_dir From 138dac4aea57716e2f23580305f6e40836a87228 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 20 Oct 2024 09:22:38 +0900 Subject: [PATCH 188/582] update README --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 32ee38573..532c3368f 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,9 @@ Oct 19, 2024: - Define a Dataset subset for the regularization image (`is_reg = true`) with `.toml`. Add `custom_attributes.diff_output_preservation = true`. - See [dataset configuration](docs/config_README-en.md) for the regularization dataset. - Specify "number of training images x number of repeats >= number of regularization images x number of repeats". - - Specify a large value for `--prior_loss_weight` option (not dataset config). The appropriate value is unknown, but try around 10-100. Note that the default is 1.0. - - You may want to start with 2/3 to 3/4 of the loss value when DOP is not applied. If it is 1/2, DOP may not be working. + - The weights of DOP is specified by `--prior_loss_weight` option (not dataset config). + - The appropriate value is still unknown. 
For FLUX, according to the comments in the [PR](https://github.com/kohya-ss/sd-scripts/pull/1710), the value may be 1 (thanks to dxqbYD!). For SDXL, a larger value may be needed (10-100 may be good starting points). + - It may be good to adjust the value so that the loss is about half to three-quarters of the loss when DOP is not applied. ``` [[datasets.subsets]] image_dir = "path/to/image/dir" @@ -28,6 +29,7 @@ is_reg = true custom_attributes.diff_output_preservation = true # Add this ``` + Oct 13, 2024: - Fixed an issue where it took a long time to load the image size when initializing the dataset, especially when the number of images in the dataset was large. From 623017f71695bcee18f36f5a1f57514974d9350d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 24 Oct 2024 19:49:28 +0900 Subject: [PATCH 189/582] refactor SD3 CLIP to transformers etc. --- flux_train.py | 4 +- flux_train_network.py | 2 +- library/flux_train_utils.py | 3 +- library/flux_utils.py | 59 +-- library/sai_model_spec.py | 9 +- library/sd3_models.py | 1000 ++--------------------------------- library/sd3_train_utils.py | 244 +++++---- library/sd3_utils.py | 503 +++++------------- library/strategy_sd3.py | 184 ++++--- library/train_util.py | 31 ++ library/utils.py | 42 +- sd3_minimal_inference.py | 390 +++++++------- sd3_train.py | 738 ++++++++++++++------------ 13 files changed, 1130 insertions(+), 2079 deletions(-) diff --git a/flux_train.py b/flux_train.py index 91ae3af57..79c44d7b4 100644 --- a/flux_train.py +++ b/flux_train.py @@ -29,7 +29,7 @@ from accelerate.utils import set_seed from library import deepspeed_utils, flux_train_utils, flux_utils, strategy_base, strategy_flux -from library.sd3_train_utils import load_prompts, FlowMatchEulerDiscreteScheduler +from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler import library.train_util as train_util @@ -241,7 +241,7 @@ def train(args): text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() - prompts = load_prompts(args.sample_prompts) + prompts = train_util.load_prompts(args.sample_prompts) sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): for prompt_dict in prompts: diff --git a/flux_train_network.py b/flux_train_network.py index 9cc8811b5..cffeb3b19 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -231,7 +231,7 @@ def cache_text_encoder_outputs_if_needed( tokenize_strategy: strategy_flux.FluxTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() - prompts = sd3_train_utils.load_prompts(args.sample_prompts) + prompts = train_util.load_prompts(args.sample_prompts) sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): for prompt_dict in prompts: diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index b3c9184f2..fa673a2f0 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -15,7 +15,6 @@ from safetensors.torch import save_file from library import flux_models, flux_utils, strategy_base, train_util -from library.sd3_train_utils import load_prompts from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -70,7 +69,7 @@ def sample_images( text_encoders = [accelerator.unwrap_model(te) for te in text_encoders] # print([(te.parameters().__next__().device if te 
is not None else None) for te in text_encoders]) - prompts = load_prompts(args.sample_prompts) + prompts = train_util.load_prompts(args.sample_prompts) save_dir = args.output_dir + "/sample" os.makedirs(save_dir, exist_ok=True) diff --git a/library/flux_utils.py b/library/flux_utils.py index 7a1ec37b8..86a2ec600 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -10,40 +10,21 @@ from accelerate import init_empty_weights from transformers import CLIPTextModel, CLIPConfig, T5EncoderModel, T5Config -from library import flux_models - -from library.utils import setup_logging, MemoryEfficientSafeOpen +from library.utils import setup_logging setup_logging() import logging logger = logging.getLogger(__name__) +from library import flux_models +from library.utils import load_safetensors + MODEL_VERSION_FLUX_V1 = "flux1" MODEL_NAME_DEV = "dev" MODEL_NAME_SCHNELL = "schnell" -# temporary copy from sd3_utils TODO refactor -def load_safetensors( - path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = torch.float32 -): - if disable_mmap: - # return safetensors.torch.load(open(path, "rb").read()) - # use experimental loader - logger.info(f"Loading without mmap (experimental)") - state_dict = {} - with MemoryEfficientSafeOpen(path) as f: - for key in f.keys(): - state_dict[key] = f.get_tensor(key).to(device, dtype=dtype) - return state_dict - else: - try: - return load_file(path, device=device) - except: - return load_file(path) # prevent device invalid Error - - def analyze_checkpoint_state(ckpt_path: str) -> Tuple[bool, bool, Tuple[int, int], List[str]]: """ チェックポイントの状態を分析し、DiffusersかBFLか、devかschnellか、ブロック数を計算して返す。 @@ -161,8 +142,14 @@ def load_ae( return ae -def load_clip_l(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False) -> CLIPTextModel: - logger.info("Building CLIP") +def load_clip_l( + ckpt_path: Optional[str], + dtype: torch.dtype, + device: Union[str, torch.device], + disable_mmap: bool = False, + state_dict: Optional[dict] = None, +) -> CLIPTextModel: + logger.info("Building CLIP-L") CLIPL_CONFIG = { "_name_or_path": "clip-vit-large-patch14/", "architectures": ["CLIPModel"], @@ -255,15 +242,22 @@ def load_clip_l(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.dev with init_empty_weights(): clip = CLIPTextModel._from_config(config) - logger.info(f"Loading state dict from {ckpt_path}") - sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) + if state_dict is not None: + sd = state_dict + else: + logger.info(f"Loading state dict from {ckpt_path}") + sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) info = clip.load_state_dict(sd, strict=False, assign=True) - logger.info(f"Loaded CLIP: {info}") + logger.info(f"Loaded CLIP-L: {info}") return clip def load_t5xxl( - ckpt_path: str, dtype: Optional[torch.dtype], device: Union[str, torch.device], disable_mmap: bool = False + ckpt_path: str, + dtype: Optional[torch.dtype], + device: Union[str, torch.device], + disable_mmap: bool = False, + state_dict: Optional[dict] = None, ) -> T5EncoderModel: T5_CONFIG_JSON = """ { @@ -303,8 +297,11 @@ def load_t5xxl( with init_empty_weights(): t5xxl = T5EncoderModel._from_config(config) - logger.info(f"Loading state dict from {ckpt_path}") - sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) + if state_dict is not None: + sd = state_dict + else: + logger.info(f"Loading 
state dict from {ckpt_path}") + sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) info = t5xxl.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded T5xxl: {info}") return t5xxl diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index ad72ec00d..8896c047e 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -57,8 +57,8 @@ ARCH_SD_V2_512 = "stable-diffusion-v2-512" ARCH_SD_V2_768_V = "stable-diffusion-v2-768-v" ARCH_SD_XL_V1_BASE = "stable-diffusion-xl-v1-base" -ARCH_SD3_M = "stable-diffusion-3-medium" -ARCH_SD3_UNKNOWN = "stable-diffusion-3" +ARCH_SD3_M = "stable-diffusion-3" # may be followed by "-m" or "-5-large" etc. +# ARCH_SD3_UNKNOWN = "stable-diffusion-3" ARCH_FLUX_1_DEV = "flux-1-dev" ARCH_FLUX_1_UNKNOWN = "flux-1" @@ -140,10 +140,7 @@ def build_metadata( if sdxl: arch = ARCH_SD_XL_V1_BASE elif sd3 is not None: - if sd3 == "m": - arch = ARCH_SD3_M - else: - arch = ARCH_SD3_UNKNOWN + arch = ARCH_SD3_M + "-" + sd3 elif flux is not None: if flux == "dev": arch = ARCH_FLUX_1_DEV diff --git a/library/sd3_models.py b/library/sd3_models.py index ec704dcba..c81aa4794 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -4,6 +4,7 @@ # and some module/classes are contributed from KohakuBlueleaf. Thanks for the contribution! from ast import Tuple +from dataclasses import dataclass from functools import partial import math from types import SimpleNamespace @@ -15,6 +16,7 @@ import torch.nn.functional as F from torch.utils.checkpoint import checkpoint from transformers import CLIPTokenizer, T5TokenizerFast + from .utils import setup_logging setup_logging() @@ -35,139 +37,21 @@ memory_efficient_attention = None -# region tokenizer -class SDTokenizer: - def __init__( - self, max_length=77, pad_with_end=True, tokenizer=None, has_start_token=True, pad_to_max_length=True, min_length=None - ): - """ - サブクラスで各種の設定を行ってる。このクラスはその設定に基づき重み付きのトークン化を行うようだ。 - Some settings are done in subclasses. This class seems to perform tokenization with weights based on those settings. - """ - self.tokenizer: CLIPTokenizer = tokenizer - self.max_length = max_length - self.min_length = min_length - empty = self.tokenizer("")["input_ids"] - if has_start_token: - self.tokens_start = 1 - self.start_token = empty[0] - self.end_token = empty[1] - else: - self.tokens_start = 0 - self.start_token = None - self.end_token = empty[0] - self.pad_with_end = pad_with_end - self.pad_to_max_length = pad_to_max_length - vocab = self.tokenizer.get_vocab() - self.inv_vocab = {v: k for k, v in vocab.items()} - self.max_word_length = 8 - - def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: - """ - Tokenize the text without weights. - """ - if type(text) == str: - text = [text] - batch_tokens = self.tokenizer(text, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt") - # return tokens["input_ids"] - - pad_token = self.end_token if self.pad_with_end else 0 - for tokens in batch_tokens["input_ids"]: - assert tokens[0] == self.start_token, f"tokens[0]: {tokens[0]}, start_token: {self.start_token}" - - def tokenize_with_weights(self, text: str, truncate_to_max_length=True, truncate_length=None): - """Tokenize the text, with weight values - presume 1.0 for all and ignore other features here. 
- The details aren't relevant for a reference impl, and weights themselves has weak effect on SD3.""" - """ - ja: テキストをトークン化し、重み値を持ちます - すべての値に1.0を仮定し、他の機能を無視します。 - 詳細は参考実装には関係なく、重み自体はSD3に対して弱い影響しかありません。へぇ~ - """ - if self.pad_with_end: - pad_token = self.end_token - else: - pad_token = 0 - batch = [] - if self.start_token is not None: - batch.append((self.start_token, 1.0)) - to_tokenize = text.replace("\n", " ").split(" ") - to_tokenize = [x for x in to_tokenize if x != ""] - for word in to_tokenize: - batch.extend([(t, 1) for t in self.tokenizer(word)["input_ids"][self.tokens_start : -1]]) - batch.append((self.end_token, 1.0)) - print(len(batch), self.max_length, self.min_length) - if self.pad_to_max_length: - batch.extend([(pad_token, 1.0)] * (self.max_length - len(batch))) - if self.min_length is not None and len(batch) < self.min_length: - batch.extend([(pad_token, 1.0)] * (self.min_length - len(batch))) - - # truncate to max_length - print( - f"batch: {batch}, max_length: {self.max_length}, truncate: {truncate_to_max_length}, truncate_length: {truncate_length}" - ) - if truncate_to_max_length and len(batch) > self.max_length: - batch = batch[: self.max_length] - if truncate_length is not None and len(batch) > truncate_length: - batch = batch[:truncate_length] - - return [batch] - - -class T5XXLTokenizer(SDTokenizer): - """Wraps the T5 Tokenizer from HF into the SDTokenizer interface""" - - def __init__(self): - super().__init__( - pad_with_end=False, - tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"), - has_start_token=False, - pad_to_max_length=False, - max_length=99999999, - min_length=77, - ) - - -class SDXLClipGTokenizer(SDTokenizer): - def __init__(self, tokenizer): - super().__init__(pad_with_end=False, tokenizer=tokenizer) - - -class SD3Tokenizer: - def __init__(self, t5xxl=True, t5xxl_max_length: Optional[int] = 256): - if t5xxl_max_length is None: - t5xxl_max_length = 256 - - # TODO cache tokenizer settings locally or hold them in the repo like ComfyUI - clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - self.clip_l = SDTokenizer(tokenizer=clip_tokenizer) - self.clip_g = SDXLClipGTokenizer(clip_tokenizer) - # self.clip_l = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - # self.clip_g = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") - self.t5xxl = T5XXLTokenizer() if t5xxl else None - # t5xxl has 99999999 max length, clip has 77 - self.t5xxl_max_length = t5xxl_max_length - - def tokenize_with_weights(self, text: str): - return ( - self.clip_l.tokenize_with_weights(text), - self.clip_g.tokenize_with_weights(text), - ( - self.t5xxl.tokenize_with_weights(text, truncate_to_max_length=False, truncate_length=self.t5xxl_max_length) - if self.t5xxl is not None - else None - ), - ) - - def tokenize(self, text: str): - return ( - self.clip_l.tokenize(text), - self.clip_g.tokenize(text), - (self.t5xxl.tokenize(text) if self.t5xxl is not None else None), - ) - +# region mmdit -# endregion -# region mmdit +@dataclass +class SD3Params: + patch_size: int + depth: int + num_patches: int + pos_embed_max_size: int + adm_in_channels: int + qk_norm: Optional[str] + x_block_self_attn_layers: List[int] + context_embedder_in_features: int + context_embedder_out_features: int + model_type: str def get_2d_sincos_pos_embed( @@ -286,10 +170,6 @@ def timestep_embedding(t, dim, max_period=10000): return embedding -def rmsnorm(x, eps=1e-6): - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) - - class 
PatchEmbed(nn.Module): def __init__( self, @@ -301,8 +181,9 @@ def __init__( flatten=True, bias=True, strict_img_size=True, - dynamic_img_pad=True, + dynamic_img_pad=False, ): + # dynamic_img_pad and norm is omitted in SD3.5 super().__init__() self.patch_size = patch_size self.flatten = flatten @@ -432,6 +313,10 @@ def forward(self, x): return self.mlp(x) +def rmsnorm(x, eps=1e-6): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) + + class RMSNorm(torch.nn.Module): def __init__( self, @@ -604,53 +489,6 @@ def attention(q, k, v, head_dim, mask=None, scale=None, mode="xformers"): return scores -class SelfAttention(AttentionLinears): - def __init__(self, dim, num_heads=8, mode="xformers"): - super().__init__(dim, num_heads, qkv_bias=True, pre_only=False) - assert mode in MEMORY_LAYOUTS - self.head_dim = dim // num_heads - self.attn_mode = mode - - def set_attn_mode(self, mode): - self.attn_mode = mode - - def forward(self, x): - q, k, v = self.pre_attention(x) - attn_score = attention(q, k, v, self.head_dim, mode=self.attn_mode) - return self.post_attention(attn_score) - - -class TransformerBlock(nn.Module): - def __init__(self, context_size, mode="xformers"): - super().__init__() - self.context_size = context_size - self.norm1 = nn.LayerNorm(context_size, elementwise_affine=False, eps=1e-6) - self.attn = SelfAttention(context_size, mode=mode) - self.norm2 = nn.LayerNorm(context_size, elementwise_affine=False, eps=1e-6) - self.mlp = MLP( - in_features=context_size, - hidden_features=context_size * 4, - act_layer=lambda: nn.GELU(approximate="tanh"), - ) - - def forward(self, x): - x = x + self.attn(self.norm1(x)) - x = x + self.mlp(self.norm2(x)) - return x - - -class Transformer(nn.Module): - def __init__(self, context_size, num_layers, mode="xformers"): - super().__init__() - self.layers = nn.ModuleList([TransformerBlock(context_size, mode) for _ in range(num_layers)]) - self.norm = nn.LayerNorm(context_size, elementwise_affine=False, eps=1e-6) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - return self.norm(x) - - # DismantledBlock in mmdit.py class SingleDiTBlock(nn.Module): """ @@ -823,7 +661,8 @@ def __init__( mlp_ratio: float = 4.0, learn_sigma: bool = False, adm_in_channels: Optional[int] = None, - context_embedder_config: Optional[Dict] = None, + context_embedder_in_features: Optional[int] = None, + context_embedder_out_features: Optional[int] = None, use_checkpoint: bool = False, register_length: int = 0, attn_mode: str = "torch", @@ -837,10 +676,10 @@ def __init__( num_patches=None, qk_norm: Optional[str] = None, qkv_bias: bool = True, - context_processor_layers=None, - context_size=4096, + model_type: str = "sd3m", ): super().__init__() + self._model_type = model_type self.learn_sigma = learn_sigma self.in_channels = in_channels default_out_channels = in_channels * 2 if learn_sigma else in_channels @@ -875,12 +714,11 @@ def __init__( assert isinstance(adm_in_channels, int) self.y_embedder = Embedder(adm_in_channels, self.hidden_size) - if context_processor_layers is not None: - self.context_processor = Transformer(context_size, context_processor_layers, attn_mode) + if context_embedder_in_features is not None: + self.context_embedder = nn.Linear(context_embedder_in_features, context_embedder_out_features) else: - self.context_processor = None + self.context_embedder = nn.Identity() - self.context_embedder = nn.Linear(context_size, self.hidden_size) self.register_length = register_length if self.register_length > 0: self.register = 
nn.Parameter(torch.randn(1, register_length, self.hidden_size)) @@ -922,7 +760,7 @@ def __init__( @property def model_type(self): - return "m" # only support medium + return self._model_type @property def device(self): @@ -1024,9 +862,6 @@ def forward( y: (N, D) tensor of class labels """ - if self.context_processor is not None: - context = self.context_processor(context) - B, C, H, W = x.shape x = self.x_embedder(x) + self.cropped_pos_embed(H, W, device=x.device).to(dtype=x.dtype) c = self.t_embedder(t, dtype=x.dtype) # (N, D) @@ -1052,22 +887,21 @@ def forward( return x[:, :, :H, :W] -def create_mmdit_sd3_medium_configs(attn_mode: str): - # {'patch_size': 2, 'depth': 24, 'num_patches': 36864, - # 'pos_embed_max_size': 192, 'adm_in_channels': 2048, 'context_embedder': - # {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}} +def create_sd3_mmdit(params: SD3Params, attn_mode: str = "torch") -> MMDiT: mmdit = MMDiT( input_size=None, - pos_embed_max_size=192, - patch_size=2, + pos_embed_max_size=params.pos_embed_max_size, + patch_size=params.patch_size, in_channels=16, - adm_in_channels=2048, - depth=24, + adm_in_channels=params.adm_in_channels, + context_embedder_in_features=params.context_embedder_in_features, + context_embedder_out_features=params.context_embedder_out_features, + depth=params.depth, mlp_ratio=4, - qk_norm=None, - num_patches=36864, - context_size=4096, + qk_norm=params.qk_norm, + num_patches=params.num_patches, attn_mode=attn_mode, + model_type=params.model_type, ) return mmdit @@ -1075,7 +909,6 @@ def create_mmdit_sd3_medium_configs(attn_mode: str): # endregion # region VAE -# TODO support xformers VAE_SCALE_FACTOR = 1.5305 VAE_SHIFT_FACTOR = 0.0609 @@ -1322,759 +1155,4 @@ def process_out(latent): return (latent / VAE_SCALE_FACTOR) + VAE_SHIFT_FACTOR -class VAEOutput: - def __init__(self, latent): - self.latent = latent - - @property - def latent_dist(self): - return self - - def sample(self): - return self.latent - - -class VAEWrapper: - def __init__(self, vae): - self.vae = vae - - @property - def device(self): - return self.vae.device - - @property - def dtype(self): - return self.vae.dtype - - # latents = vae.encode(img_tensors).latent_dist.sample().to("cpu") - def encode(self, image): - return VAEOutput(self.vae.encode(image)) - - -# endregion - - -# region Text Encoder -class CLIPAttention(torch.nn.Module): - def __init__(self, embed_dim, heads, dtype, device, mode="xformers"): - super().__init__() - self.heads = heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device) - self.attn_mode = mode - - def set_attn_mode(self, mode): - self.attn_mode = mode - - def forward(self, x, mask=None): - q = self.q_proj(x) - k = self.k_proj(x) - v = self.v_proj(x) - out = attention(q, k, v, self.heads, mask, mode=self.attn_mode) - return self.out_proj(out) - - -ACTIVATIONS = { - "quick_gelu": lambda: (lambda a: a * torch.sigmoid(1.702 * a)), - # "gelu": torch.nn.functional.gelu, - "gelu": lambda: nn.GELU(), -} - - -class CLIPLayer(torch.nn.Module): - def __init__(self, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device): - super().__init__() - self.layer_norm1 = nn.LayerNorm(embed_dim, dtype=dtype, device=device) - self.self_attn 
= CLIPAttention(embed_dim, heads, dtype, device) - self.layer_norm2 = nn.LayerNorm(embed_dim, dtype=dtype, device=device) - # # self.mlp = CLIPMLP(embed_dim, intermediate_size, intermediate_activation, dtype, device) - # self.mlp = Mlp( - # embed_dim, intermediate_size, embed_dim, act_layer=ACTIVATIONS[intermediate_activation], dtype=dtype, device=device - # ) - self.mlp = MLP(embed_dim, intermediate_size, embed_dim, act_layer=ACTIVATIONS[intermediate_activation]) - self.mlp.to(device=device, dtype=dtype) - - def forward(self, x, mask=None): - x += self.self_attn(self.layer_norm1(x), mask) - x += self.mlp(self.layer_norm2(x)) - return x - - -class CLIPEncoder(torch.nn.Module): - def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device): - super().__init__() - self.layers = torch.nn.ModuleList( - [CLIPLayer(embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) for i in range(num_layers)] - ) - - def forward(self, x, mask=None, intermediate_output=None): - if intermediate_output is not None: - if intermediate_output < 0: - intermediate_output = len(self.layers) + intermediate_output - intermediate = None - for i, l in enumerate(self.layers): - x = l(x, mask) - if i == intermediate_output: - intermediate = x.clone() - return x, intermediate - - -class CLIPEmbeddings(torch.nn.Module): - def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None): - super().__init__() - self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device) - self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device) - - def forward(self, input_tokens): - return self.token_embedding(input_tokens) + self.position_embedding.weight - - -class CLIPTextModel_(torch.nn.Module): - def __init__(self, config_dict, dtype, device): - num_layers = config_dict["num_hidden_layers"] - embed_dim = config_dict["hidden_size"] - heads = config_dict["num_attention_heads"] - intermediate_size = config_dict["intermediate_size"] - intermediate_activation = config_dict["hidden_act"] - super().__init__() - self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device) - self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) - self.final_layer_norm = nn.LayerNorm(embed_dim, dtype=dtype, device=device) - - def forward(self, input_tokens, intermediate_output=None, final_layer_norm_intermediate=True): - x = self.embeddings(input_tokens) - - if x.dtype == torch.bfloat16: - causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=torch.float32, device=x.device).fill_(float("-inf")).triu_(1) - causal_mask = causal_mask.to(dtype=x.dtype) - else: - causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1) - - x, i = self.encoder(x, mask=causal_mask, intermediate_output=intermediate_output) - x = self.final_layer_norm(x) - if i is not None and final_layer_norm_intermediate: - i = self.final_layer_norm(i) - pooled_output = x[ - torch.arange(x.shape[0], device=x.device), - input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1), - ] - return x, i, pooled_output - - -class CLIPTextModel(torch.nn.Module): - def __init__(self, config_dict, dtype, device): - super().__init__() - self.num_layers = config_dict["num_hidden_layers"] - self.text_model = CLIPTextModel_(config_dict, dtype, device) - embed_dim = config_dict["hidden_size"] - 
self.text_projection = nn.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device) - self.text_projection.weight.copy_(torch.eye(embed_dim)) - self.dtype = dtype - - def get_input_embeddings(self): - return self.text_model.embeddings.token_embedding - - def set_input_embeddings(self, embeddings): - self.text_model.embeddings.token_embedding = embeddings - - def forward(self, *args, **kwargs): - x = self.text_model(*args, **kwargs) - out = self.text_projection(x[2]) - return (x[0], x[1], out, x[2]) - - -class ClipTokenWeightEncoder: - # def encode_token_weights(self, token_weight_pairs): - # tokens = list(map(lambda a: a[0], token_weight_pairs[0])) - # out, pooled = self([tokens]) - # if pooled is not None: - # first_pooled = pooled[0:1] - # else: - # first_pooled = pooled - # output = [out[0:1]] - # return torch.cat(output, dim=-2), first_pooled - - # fix to support batched inputs - # : Union[List[Tuple[torch.Tensor, torch.Tensor]], List[List[Tuple[torch.Tensor, torch.Tensor]]]] - def encode_token_weights(self, list_of_token_weight_pairs): - has_batch = isinstance(list_of_token_weight_pairs[0][0], list) - - if has_batch: - list_of_tokens = [] - for pairs in list_of_token_weight_pairs: - tokens = [a[0] for a in pairs[0]] # I'm not sure why this is [0] - list_of_tokens.append(tokens) - else: - if isinstance(list_of_token_weight_pairs[0], torch.Tensor): - list_of_tokens = [list(list_of_token_weight_pairs[0])] - else: - list_of_tokens = [[a[0] for a in list_of_token_weight_pairs[0]]] - - out, pooled = self(list_of_tokens) - if has_batch: - return out, pooled - else: - if pooled is not None: - first_pooled = pooled[0:1] - else: - first_pooled = pooled - output = [out[0:1]] - return torch.cat(output, dim=-2), first_pooled - - -class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): - """Uses the CLIP transformer encoder for text (from huggingface)""" - - LAYERS = ["last", "pooled", "hidden"] - - def __init__( - self, - device="cpu", - max_length=77, - layer="last", - layer_idx=None, - textmodel_json_config=None, - dtype=None, - model_class=CLIPTextModel, - special_tokens={"start": 49406, "end": 49407, "pad": 49407}, - layer_norm_hidden_state=True, - return_projected_pooled=True, - ): - super().__init__() - assert layer in self.LAYERS - self.transformer = model_class(textmodel_json_config, dtype, device) - self.num_layers = self.transformer.num_layers - self.max_length = max_length - self.transformer = self.transformer.eval() - for param in self.parameters(): - param.requires_grad = False - self.layer = layer - self.layer_idx = None - self.special_tokens = special_tokens - self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055)) - self.layer_norm_hidden_state = layer_norm_hidden_state - self.return_projected_pooled = return_projected_pooled - if layer == "hidden": - assert layer_idx is not None - assert abs(layer_idx) < self.num_layers - self.set_clip_options({"layer": layer_idx}) - self.options_default = (self.layer, self.layer_idx, self.return_projected_pooled) - - @property - def device(self): - return next(self.parameters()).device - - @property - def dtype(self): - return next(self.parameters()).dtype - - def gradient_checkpointing_enable(self): - logger.warning("Gradient checkpointing is not supported for this model") - - def set_attn_mode(self, mode): - raise NotImplementedError("This model does not support setting the attention mode") - - def set_clip_options(self, options): - layer_idx = options.get("layer", self.layer_idx) - self.return_projected_pooled = 
options.get("projected_pooled", self.return_projected_pooled) - if layer_idx is None or abs(layer_idx) > self.num_layers: - self.layer = "last" - else: - self.layer = "hidden" - self.layer_idx = layer_idx - - def forward(self, tokens): - backup_embeds = self.transformer.get_input_embeddings() - device = backup_embeds.weight.device - tokens = torch.LongTensor(tokens).to(device) - outputs = self.transformer( - tokens, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state - ) - self.transformer.set_input_embeddings(backup_embeds) - if self.layer == "last": - z = outputs[0] - else: - z = outputs[1] - pooled_output = None - if len(outputs) >= 3: - if not self.return_projected_pooled and len(outputs) >= 4 and outputs[3] is not None: - pooled_output = outputs[3].float() - elif outputs[2] is not None: - pooled_output = outputs[2].float() - return z.float(), pooled_output - - def set_attn_mode(self, mode): - clip_text_model = self.transformer.text_model - for layer in clip_text_model.encoder.layers: - layer.self_attn.set_attn_mode(mode) - - -class SDXLClipG(SDClipModel): - """Wraps the CLIP-G model into the SD-CLIP-Model interface""" - - def __init__(self, config, device="cpu", layer="penultimate", layer_idx=None, dtype=None): - if layer == "penultimate": - layer = "hidden" - layer_idx = -2 - super().__init__( - device=device, - layer=layer, - layer_idx=layer_idx, - textmodel_json_config=config, - dtype=dtype, - special_tokens={"start": 49406, "end": 49407, "pad": 0}, - layer_norm_hidden_state=False, - ) - - def set_attn_mode(self, mode): - clip_text_model = self.transformer.text_model - for layer in clip_text_model.encoder.layers: - layer.self_attn.set_attn_mode(mode) - - -class T5XXLModel(SDClipModel): - """Wraps the T5-XXL model into the SD-CLIP-Model interface for convenience""" - - def __init__(self, config, device="cpu", layer="last", layer_idx=None, dtype=None): - super().__init__( - device=device, - layer=layer, - layer_idx=layer_idx, - textmodel_json_config=config, - dtype=dtype, - special_tokens={"end": 1, "pad": 0}, - model_class=T5, - ) - - def set_attn_mode(self, mode): - t5: T5 = self.transformer - for t5block in t5.encoder.block: - t5block: T5Block - t5layer: T5LayerSelfAttention = t5block.layer[0] - t5SaSa: T5Attention = t5layer.SelfAttention - t5SaSa.set_attn_mode(mode) - - -################################################################################################# -### T5 implementation, for the T5-XXL text encoder portion, largely pulled from upstream impl -################################################################################################# - -""" -class T5XXLTokenizer(SDTokenizer): - ""Wraps the T5 Tokenizer from HF into the SDTokenizer interface"" - - def __init__(self): - super().__init__( - pad_with_end=False, - tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"), - has_start_token=False, - pad_to_max_length=False, - max_length=99999999, - min_length=77, - ) -""" - - -class T5LayerNorm(torch.nn.Module): - def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None): - super().__init__() - self.weight = torch.nn.Parameter(torch.ones(hidden_size, dtype=dtype, device=device)) - self.variance_epsilon = eps - - # def forward(self, x): - # variance = x.pow(2).mean(-1, keepdim=True) - # x = x * torch.rsqrt(variance + self.variance_epsilon) - # return self.weight.to(device=x.device, dtype=x.dtype) * x - - # copy from transformers' T5LayerNorm - def forward(self, hidden_states): - # T5 uses a layer_norm 
which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated - # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for - # half-precision inputs is done in fp32 - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -class T5DenseGatedActDense(torch.nn.Module): - def __init__(self, model_dim, ff_dim, dtype, device): - super().__init__() - self.wi_0 = nn.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device) - self.wi_1 = nn.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device) - self.wo = nn.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device) - - def forward(self, x): - hidden_gelu = torch.nn.functional.gelu(self.wi_0(x), approximate="tanh") - hidden_linear = self.wi_1(x) - x = hidden_gelu * hidden_linear - x = self.wo(x) - return x - - -class T5LayerFF(torch.nn.Module): - def __init__(self, model_dim, ff_dim, dtype, device): - super().__init__() - self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, dtype, device) - self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) - - def forward(self, x): - forwarded_states = self.layer_norm(x) - forwarded_states = self.DenseReluDense(forwarded_states) - x += forwarded_states - return x - - -class T5Attention(torch.nn.Module): - def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device): - super().__init__() - # Mesh TensorFlow initialization to avoid scaling before softmax - self.q = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) - self.k = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) - self.v = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device) - self.o = nn.Linear(inner_dim, model_dim, bias=False, dtype=dtype, device=device) - self.num_heads = num_heads - self.relative_attention_bias = None - if relative_attention_bias: - self.relative_attention_num_buckets = 32 - self.relative_attention_max_distance = 128 - self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device) - - self.attn_mode = "xformers" # TODO 何とかする - - def set_attn_mode(self, mode): - self.attn_mode = mode - - @staticmethod - def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): - """ - Adapted from Mesh Tensorflow: - https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 - - Translate relative position to a bucket number for relative attention. The relative position is defined as - memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to - position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for - small absolute relative_position and larger buckets for larger absolute relative_positions. All relative - positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
- This should allow for more graceful generalization to longer sequences than the model has been trained on - - Args: - relative_position: an int32 Tensor - bidirectional: a boolean - whether the attention is bidirectional - num_buckets: an integer - max_distance: an integer - - Returns: - a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) - """ - relative_buckets = 0 - if bidirectional: - num_buckets //= 2 - relative_buckets += (relative_position > 0).to(torch.long) * num_buckets - relative_position = torch.abs(relative_position) - else: - relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) - # now relative_position is in the range [0, inf) - # half of the buckets are for exact increments in positions - max_exact = num_buckets // 2 - is_small = relative_position < max_exact - # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance - relative_position_if_large = max_exact + ( - torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) - ).to(torch.long) - relative_position_if_large = torch.min( - relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) - ) - relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) - return relative_buckets - - def compute_bias(self, query_length, key_length, device): - """Compute binned relative position bias""" - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] - memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] - relative_position = memory_position - context_position # shape (query_length, key_length) - relative_position_bucket = self._relative_position_bucket( - relative_position, # shape (query_length, key_length) - bidirectional=True, - num_buckets=self.relative_attention_num_buckets, - max_distance=self.relative_attention_max_distance, - ) - values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) - values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) - return values - - def forward(self, x, past_bias=None): - q = self.q(x) - k = self.k(x) - v = self.v(x) - if self.relative_attention_bias is not None: - past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device) - if past_bias is not None: - mask = past_bias - out = attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask, mode=self.attn_mode) - return self.o(out), past_bias - - -class T5LayerSelfAttention(torch.nn.Module): - def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device): - super().__init__() - self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device) - self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) - - def forward(self, x, past_bias=None): - output, past_bias = self.SelfAttention(self.layer_norm(x), past_bias=past_bias) - x += output - return x, past_bias - - -class T5Block(torch.nn.Module): - def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device): - super().__init__() - self.layer = torch.nn.ModuleList() - self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device)) - self.layer.append(T5LayerFF(model_dim, ff_dim, dtype, device)) 
- - def forward(self, x, past_bias=None): - x, past_bias = self.layer[0](x, past_bias) - - # copy from transformers' T5Block - # clamp inf values to enable fp16 training - if x.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(x).any(), - torch.finfo(x.dtype).max - 1000, - torch.finfo(x.dtype).max, - ) - x = torch.clamp(x, min=-clamp_value, max=clamp_value) - - x = self.layer[-1](x) - # clamp inf values to enable fp16 training - if x.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(x).any(), - torch.finfo(x.dtype).max - 1000, - torch.finfo(x.dtype).max, - ) - x = torch.clamp(x, min=-clamp_value, max=clamp_value) - - return x, past_bias - - -class T5Stack(torch.nn.Module): - def __init__(self, num_layers, model_dim, inner_dim, ff_dim, num_heads, vocab_size, dtype, device): - super().__init__() - self.embed_tokens = torch.nn.Embedding(vocab_size, model_dim, device=device) - self.block = torch.nn.ModuleList( - [ - T5Block(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias=(i == 0), dtype=dtype, device=device) - for i in range(num_layers) - ] - ) - self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device) - - def forward(self, input_ids, intermediate_output=None, final_layer_norm_intermediate=True): - intermediate = None - x = self.embed_tokens(input_ids) - past_bias = None - for i, l in enumerate(self.block): - # uncomment to debug layerwise output: fp16 may cause issues - # print(i, x.mean(), x.std()) - x, past_bias = l(x, past_bias) - if i == intermediate_output: - intermediate = x.clone() - # print(x.mean(), x.std()) - x = self.final_layer_norm(x) - if intermediate is not None and final_layer_norm_intermediate: - intermediate = self.final_layer_norm(intermediate) - # print(x.mean(), x.std()) - return x, intermediate - - -class T5(torch.nn.Module): - def __init__(self, config_dict, dtype, device): - super().__init__() - self.num_layers = config_dict["num_layers"] - self.encoder = T5Stack( - self.num_layers, - config_dict["d_model"], - config_dict["d_model"], - config_dict["d_ff"], - config_dict["num_heads"], - config_dict["vocab_size"], - dtype, - device, - ) - self.dtype = dtype - - def get_input_embeddings(self): - return self.encoder.embed_tokens - - def set_input_embeddings(self, embeddings): - self.encoder.embed_tokens = embeddings - - def forward(self, *args, **kwargs): - return self.encoder(*args, **kwargs) - - -def create_clip_l(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[str, torch.Tensor]] = None): - r""" - state_dict is not loaded, but updated with missing keys - """ - CLIPL_CONFIG = { - "hidden_act": "quick_gelu", - "hidden_size": 768, - "intermediate_size": 3072, - "num_attention_heads": 12, - "num_hidden_layers": 12, - } - with torch.no_grad(): - clip_l = SDClipModel( - layer="hidden", - layer_idx=-2, - device=device, - dtype=dtype, - layer_norm_hidden_state=False, - return_projected_pooled=False, - textmodel_json_config=CLIPL_CONFIG, - ) - clip_l.gradient_checkpointing_enable() - if state_dict is not None: - # update state_dict if provided to include logit_scale and text_projection.weight avoid errors - if "logit_scale" not in state_dict: - state_dict["logit_scale"] = clip_l.logit_scale - if "transformer.text_projection.weight" not in state_dict: - state_dict["transformer.text_projection.weight"] = clip_l.transformer.text_projection.weight - return clip_l - - -def create_clip_g(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[str, torch.Tensor]] = None): - r""" - state_dict is not 
loaded, but updated with missing keys - """ - CLIPG_CONFIG = { - "hidden_act": "gelu", - "hidden_size": 1280, - "intermediate_size": 5120, - "num_attention_heads": 20, - "num_hidden_layers": 32, - } - with torch.no_grad(): - clip_g = SDXLClipG(CLIPG_CONFIG, device=device, dtype=dtype) - if state_dict is not None: - if "logit_scale" not in state_dict: - state_dict["logit_scale"] = clip_g.logit_scale - return clip_g - - -def create_t5xxl(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[str, torch.Tensor]] = None) -> T5XXLModel: - T5_CONFIG = {"d_ff": 10240, "d_model": 4096, "num_heads": 64, "num_layers": 24, "vocab_size": 32128} - with torch.no_grad(): - t5 = T5XXLModel(T5_CONFIG, dtype=dtype, device=device) - if state_dict is not None: - if "logit_scale" not in state_dict: - state_dict["logit_scale"] = t5.logit_scale - if "transformer.shared.weight" in state_dict: - state_dict.pop("transformer.shared.weight") - return t5 - - -""" - # snippet for using the T5 model from transformers - - from transformers import T5EncoderModel, T5Config - import accelerate - import json - - T5_CONFIG_JSON = "" -{ - "architectures": [ - "T5EncoderModel" - ], - "classifier_dropout": 0.0, - "d_ff": 10240, - "d_kv": 64, - "d_model": 4096, - "decoder_start_token_id": 0, - "dense_act_fn": "gelu_new", - "dropout_rate": 0.1, - "eos_token_id": 1, - "feed_forward_proj": "gated-gelu", - "initializer_factor": 1.0, - "is_encoder_decoder": true, - "is_gated_act": true, - "layer_norm_epsilon": 1e-06, - "model_type": "t5", - "num_decoder_layers": 24, - "num_heads": 64, - "num_layers": 24, - "output_past": true, - "pad_token_id": 0, - "relative_attention_max_distance": 128, - "relative_attention_num_buckets": 32, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.41.2", - "use_cache": true, - "vocab_size": 32128 -} -"" - config = json.loads(T5_CONFIG_JSON) - config = T5Config(**config) - - # model = T5EncoderModel.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", subfolder="text_encoder_3") - # print(model.config) - # # model(**load_model.config) - - # with accelerate.init_empty_weights(): - model = T5EncoderModel._from_config(config) # , torch_dtype=dtype) - for key in list(state_dict.keys()): - if key.startswith("transformer."): - new_key = key[len("transformer.") :] - state_dict[new_key] = state_dict.pop(key) - - info = model.load_state_dict(state_dict) - print(info) - model.set_attn_mode = lambda x: None - # model.to("cpu") - - _self = model - - def enc(list_of_token_weight_pairs): - has_batch = isinstance(list_of_token_weight_pairs[0][0], list) - - if has_batch: - list_of_tokens = [] - for pairs in list_of_token_weight_pairs: - tokens = [a[0] for a in pairs[0]] # I'm not sure why this is [0] - list_of_tokens.append(tokens) - else: - list_of_tokens = [[a[0] for a in list_of_token_weight_pairs[0]]] - - list_of_tokens = np.array(list_of_tokens) - list_of_tokens = torch.from_numpy(list_of_tokens).to("cuda", dtype=torch.long) - out = _self(list_of_tokens) - pooled = None - if has_batch: - return out, pooled - else: - if pooled is not None: - first_pooled = pooled[0:1] - else: - first_pooled = pooled - return out[0], first_pooled - # output = [out[0:1]] - # return torch.cat(output, dim=-2), first_pooled - - model.encode_token_weights = enc - - return model -""" - # endregion diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index e819d440c..9282482d9 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -11,8 +11,8 @@ 
from accelerate import Accelerator, PartialState from tqdm import tqdm from PIL import Image +from transformers import CLIPTextModelWithProjection, T5EncoderModel -from library import sd3_models, sd3_utils, strategy_base, train_util from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -28,60 +28,16 @@ logger = logging.getLogger(__name__) -from .sdxl_train_util import match_mixed_precision - - -def load_target_model( - model_type: str, - args: argparse.Namespace, - state_dict: dict, - accelerator: Accelerator, - attn_mode: str, - model_dtype: Optional[torch.dtype], - device: Optional[torch.device], -) -> Union[ - sd3_models.MMDiT, - Optional[sd3_models.SDClipModel], - Optional[sd3_models.SDXLClipG], - Optional[sd3_models.T5XXLModel], - sd3_models.SDVAE, -]: - loading_device = device if device is not None else (accelerator.device if args.lowram else "cpu") - - for pi in range(accelerator.state.num_processes): - if pi == accelerator.state.local_process_index: - logger.info(f"loading model for process {accelerator.state.local_process_index}/{accelerator.state.num_processes}") - - if model_type == "mmdit": - model = sd3_utils.load_mmdit(state_dict, attn_mode, model_dtype, loading_device) - elif model_type == "clip_l": - model = sd3_utils.load_clip_l(state_dict, args.clip_l, attn_mode, model_dtype, loading_device) - elif model_type == "clip_g": - model = sd3_utils.load_clip_g(state_dict, args.clip_g, attn_mode, model_dtype, loading_device) - elif model_type == "t5xxl": - model = sd3_utils.load_t5xxl(state_dict, args.t5xxl, attn_mode, model_dtype, loading_device) - elif model_type == "vae": - model = sd3_utils.load_vae(state_dict, args.vae, model_dtype, loading_device) - else: - raise ValueError(f"Unknown model type: {model_type}") - - # work on low-ram device: models are already loaded on accelerator.device, but we ensure they are on device - if args.lowram: - model = model.to(accelerator.device) - - clean_memory_on_device(accelerator.device) - accelerator.wait_for_everyone() - - return model +from library import sd3_models, sd3_utils, strategy_base, train_util def save_models( ckpt_path: str, - mmdit: sd3_models.MMDiT, - vae: sd3_models.SDVAE, - clip_l: sd3_models.SDClipModel, - clip_g: sd3_models.SDXLClipG, - t5xxl: Optional[sd3_models.T5XXLModel], + mmdit: Optional[sd3_models.MMDiT], + vae: Optional[sd3_models.SDVAE], + clip_l: Optional[CLIPTextModelWithProjection], + clip_g: Optional[CLIPTextModelWithProjection], + t5xxl: Optional[T5EncoderModel], sai_metadata: Optional[dict], save_dtype: Optional[torch.dtype] = None, ): @@ -101,14 +57,25 @@ def update_sd(prefix, sd): update_sd("model.diffusion_model.", mmdit.state_dict()) update_sd("first_stage_model.", vae.state_dict()) + # do not support unified checkpoint format for now + # if clip_l is not None: + # update_sd("text_encoders.clip_l.", clip_l.state_dict()) + # if clip_g is not None: + # update_sd("text_encoders.clip_g.", clip_g.state_dict()) + # if t5xxl is not None: + # update_sd("text_encoders.t5xxl.", t5xxl.state_dict()) + + save_file(state_dict, ckpt_path, metadata=sai_metadata) + if clip_l is not None: - update_sd("text_encoders.clip_l.", clip_l.state_dict()) + clip_l_path = ckpt_path.replace(".safetensors", "_clip_l.safetensors") + save_file(clip_l.state_dict(), clip_l_path) if clip_g is not None: - update_sd("text_encoders.clip_g.", clip_g.state_dict()) + clip_g_path = ckpt_path.replace(".safetensors", "_clip_g.safetensors") + save_file(clip_g.state_dict(), clip_g_path) if t5xxl is not None: - 
update_sd("text_encoders.t5xxl.", t5xxl.state_dict()) - - save_file(state_dict, ckpt_path, metadata=sai_metadata) + t5xxl_path = ckpt_path.replace(".safetensors", "_t5xxl.safetensors") + save_file(t5xxl.state_dict(), t5xxl_path) def save_sd3_model_on_train_end( @@ -116,9 +83,9 @@ def save_sd3_model_on_train_end( save_dtype: torch.dtype, epoch: int, global_step: int, - clip_l: sd3_models.SDClipModel, - clip_g: sd3_models.SDXLClipG, - t5xxl: Optional[sd3_models.T5XXLModel], + clip_l: Optional[CLIPTextModelWithProjection], + clip_g: Optional[CLIPTextModelWithProjection], + t5xxl: Optional[T5EncoderModel], mmdit: sd3_models.MMDiT, vae: sd3_models.SDVAE, ): @@ -141,9 +108,9 @@ def save_sd3_model_on_epoch_end_or_stepwise( epoch: int, num_train_epochs: int, global_step: int, - clip_l: sd3_models.SDClipModel, - clip_g: sd3_models.SDXLClipG, - t5xxl: Optional[sd3_models.T5XXLModel], + clip_l: Optional[CLIPTextModelWithProjection], + clip_g: Optional[CLIPTextModelWithProjection], + t5xxl: Optional[T5EncoderModel], mmdit: sd3_models.MMDiT, vae: sd3_models.SDVAE, ): @@ -208,23 +175,27 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): help="T5-XXL model path. if not specified, use ckpt's state_dict / T5-XXLモデルのパス。指定しない場合はckptのstate_dictを使用", ) parser.add_argument( - "--save_clip", action="store_true", help="save CLIP models to checkpoint / CLIPモデルをチェックポイントに保存する" + "--save_clip", + action="store_true", + help="[DOES NOT WORK] unified checkpoint is not supported / 統合チェックポイントはまだサポートされていません", ) parser.add_argument( - "--save_t5xxl", action="store_true", help="save T5-XXL model to checkpoint / T5-XXLモデルをチェックポイントに保存する" + "--save_t5xxl", + action="store_true", + help="[DOES NOT WORK] unified checkpoint is not supported / 統合チェックポイントはまだサポートされていません", ) parser.add_argument( "--t5xxl_device", type=str, default=None, - help="T5-XXL device. if not specified, use accelerator's device / T5-XXLデバイス。指定しない場合はacceleratorのデバイスを使用", + help="[DOES NOT WORK] not supported yet. T5-XXL device. if not specified, use accelerator's device / T5-XXLデバイス。指定しない場合はacceleratorのデバイスを使用", ) parser.add_argument( "--t5xxl_dtype", type=str, default=None, - help="T5-XXL dtype. if not specified, use default dtype (from mixed precision) / T5-XXL dtype。指定しない場合はデフォルトのdtype(mixed precisionから)を使用", + help="[DOES NOT WORK] not supported yet. T5-XXL dtype. if not specified, use default dtype (from mixed precision) / T5-XXL dtype。指定しない場合はデフォルトのdtype(mixed precisionから)を使用", ) # copy from Diffusers @@ -233,16 +204,25 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): type=str, default="logit_normal", choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"], + help="weighting scheme for timestep distribution and loss / タイムステップ分布と損失のための重み付けスキーム", ) parser.add_argument( - "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme." + "--logit_mean", + type=float, + default=0.0, + help="mean to use when using the `'logit_normal'` weighting scheme for timestep distribution. / タイムステップ分布のために`'logit_normal'`重み付けスキームを使用する場合の平均", + ) + parser.add_argument( + "--logit_std", + type=float, + default=1.0, + help="std to use when using the `'logit_normal'` weighting scheme for timestep distribution. / タイムステップ分布のために`'logit_normal'`重み付けスキームを使用する場合のstd", ) - parser.add_argument("--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme.") parser.add_argument( "--mode_scale", type=float, default=1.29, - help="Scale of mode weighting scheme. 
Only effective when using the `'mode'` as the `weighting_scheme`.", + help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`. / モード重み付けスキームのスケール。`'mode'`を`weighting_scheme`として使用する場合のみ有効", ) @@ -283,7 +263,7 @@ def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCachin # temporary copied from sd3_minimal_inferece.py -def get_sigmas(sampling: sd3_utils.ModelSamplingDiscreteFlow, steps): +def get_all_sigmas(sampling: sd3_utils.ModelSamplingDiscreteFlow, steps): start = sampling.timestep(sampling.sigma_max) end = sampling.timestep(sampling.sigma_min) timesteps = torch.linspace(start, end, steps) @@ -327,7 +307,7 @@ def do_sample( model_sampling = sd3_utils.ModelSamplingDiscreteFlow(shift=3.0) # 3.0 is for SD3 - sigmas = get_sigmas(model_sampling, steps).to(device) + sigmas = get_all_sigmas(model_sampling, steps).to(device) noise_scaled = model_sampling.noise_scaling(sigmas[0], noise, latent, max_denoise(model_sampling, sigmas)) @@ -371,37 +351,6 @@ def do_sample( return x -def load_prompts(prompt_file: str) -> List[Dict]: - # read prompts - if prompt_file.endswith(".txt"): - with open(prompt_file, "r", encoding="utf-8") as f: - lines = f.readlines() - prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"] - elif prompt_file.endswith(".toml"): - with open(prompt_file, "r", encoding="utf-8") as f: - data = toml.load(f) - prompts = [dict(**data["prompt"], **subset) for subset in data["prompt"]["subset"]] - elif prompt_file.endswith(".json"): - with open(prompt_file, "r", encoding="utf-8") as f: - prompts = json.load(f) - - # preprocess prompts - for i in range(len(prompts)): - prompt_dict = prompts[i] - if isinstance(prompt_dict, str): - from library.train_util import line_to_prompt_dict - - prompt_dict = line_to_prompt_dict(prompt_dict) - prompts[i] = prompt_dict - assert isinstance(prompt_dict, dict) - - # Adds an enumerator to the dict based on prompt position. Used later to name image files. Also cleanup of extra data in original prompt dict. 
- prompt_dict["enum"] = i - prompt_dict.pop("subset", None) - - return prompts - - def sample_images( accelerator: Accelerator, args: argparse.Namespace, @@ -440,7 +389,7 @@ def sample_images( text_encoders = [accelerator.unwrap_model(te) for te in text_encoders] # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) - prompts = load_prompts(args.sample_prompts) + prompts = train_util.load_prompts(args.sample_prompts) save_dir = args.output_dir + "/sample" os.makedirs(save_dir, exist_ok=True) @@ -510,7 +459,7 @@ def sample_image_inference( accelerator: Accelerator, args: argparse.Namespace, mmdit: sd3_models.MMDiT, - text_encoders: List[Union[sd3_models.SDClipModel, sd3_models.SDXLClipG, sd3_models.T5XXLModel]], + text_encoders: List[Union[CLIPTextModelWithProjection, T5EncoderModel]], vae: sd3_models.SDVAE, save_dir, prompt_dict, @@ -568,7 +517,7 @@ def sample_image_inference( l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(prompt) te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, [l_tokens, g_tokens, t5_tokens]) - lg_out, t5_out, pooled = te_outputs + lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = te_outputs cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) # encode negative prompts @@ -578,7 +527,7 @@ def sample_image_inference( l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(negative_prompt) neg_te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, [l_tokens, g_tokens, t5_tokens]) - lg_out, t5_out, pooled = neg_te_outputs + lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = neg_te_outputs neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) # sample image @@ -609,14 +558,9 @@ def sample_image_inference( wandb_tracker = accelerator.get_tracker("wandb") import wandb + # not to commit images to avoid inconsistency between training and logging steps - wandb_tracker.log( - {f"sample_{i}": wandb.Image( - image, - caption=prompt # positive prompt as a caption - )}, - commit=False - ) + wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption # region Diffusers @@ -886,4 +830,78 @@ def __len__(self): return self.config.num_train_timesteps +def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32): + sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype) + schedule_timesteps = noise_scheduler.timesteps.to(device) + timesteps = timesteps.to(device) + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < n_dim: + sigma = sigma.unsqueeze(-1) + return sigma + + +def compute_density_for_timestep_sampling( + weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None +): + """Compute the density for sampling the timesteps when doing SD3 training. + + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. + """ + if weighting_scheme == "logit_normal": + # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$). 
+ u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu") + u = torch.nn.functional.sigmoid(u) + elif weighting_scheme == "mode": + u = torch.rand(size=(batch_size,), device="cpu") + u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u) + else: + u = torch.rand(size=(batch_size,), device="cpu") + return u + + +def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): + """Computes loss weighting scheme for SD3 training. + + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. + """ + if weighting_scheme == "sigma_sqrt": + weighting = (sigmas**-2.0).float() + elif weighting_scheme == "cosmap": + bot = 1 - 2 * sigmas + 2 * sigmas**2 + weighting = 2 / (math.pi * bot) + else: + weighting = torch.ones_like(sigmas) + return weighting + + +def get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, device, dtype +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + bsz = latents.shape[0] + + # Sample a random timestep for each image + # for weighting schemes where we sample timesteps non-uniformly + u = compute_density_for_timestep_sampling( + weighting_scheme=args.weighting_scheme, + batch_size=bsz, + logit_mean=args.logit_mean, + logit_std=args.logit_std, + mode_scale=args.mode_scale, + ) + indices = (u * noise_scheduler.config.num_train_timesteps).long() + timesteps = noise_scheduler.timesteps[indices].to(device=device) + + # Add noise according to flow matching. + sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) + noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + + return noisy_model_input, timesteps, sigmas + + # endregion diff --git a/library/sd3_utils.py b/library/sd3_utils.py index 5849518fb..9ad995d81 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -1,9 +1,12 @@ +from dataclasses import dataclass import math -from typing import Dict, Optional, Union +import re +from typing import Dict, List, Optional, Union import torch import safetensors from safetensors.torch import load_file from accelerate import init_empty_weights +from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPConfig, CLIPTextConfig from .utils import setup_logging @@ -19,18 +22,61 @@ # region models +# TODO remove dependency on flux_utils +from library.utils import load_safetensors +from library.flux_utils import load_t5xxl as flux_utils_load_t5xxl -def load_safetensors(path: str, dvc: Union[str, torch.device], disable_mmap: bool = False): - if disable_mmap: - return safetensors.torch.load(open(path, "rb").read()) + +def analyze_state_dict_state(state_dict: Dict, prefix: str = ""): + logger.info(f"Analyzing state dict state...") + + # analyze configs + patch_size = state_dict[f"{prefix}x_embedder.proj.weight"].shape[2] + depth = state_dict[f"{prefix}x_embedder.proj.weight"].shape[0] // 64 + num_patches = state_dict[f"{prefix}pos_embed"].shape[1] + pos_embed_max_size = round(math.sqrt(num_patches)) + adm_in_channels = state_dict[f"{prefix}y_embedder.mlp.0.weight"].shape[1] + context_shape = state_dict[f"{prefix}context_embedder.weight"].shape + qk_norm = "rms" if f"{prefix}joint_blocks.0.context_block.attn.ln_k.weight" in state_dict.keys() else None + + # x_block_self_attn_layers.append(int(key.split(".x_block.attn2.ln_k.weight")[0].split(".")[-1])) + x_block_self_attn_layers = [] + re_attn = re.compile(r".(\d+).x_block.attn2.ln_k.weight") + for key in 
list(state_dict.keys()): + m = re_attn.match(key) + if m: + x_block_self_attn_layers.append(int(m.group(1))) + + assert len(x_block_self_attn_layers) == 0, "x_block_self_attn_layers is not supported" + + context_embedder_in_features = context_shape[1] + context_embedder_out_features = context_shape[0] + + # only supports 3-5-large and 3-medium + if qk_norm is not None: + model_type = "3-5-large" else: - try: - return load_file(path, device=dvc) - except: - return load_file(path) # prevent device invalid Error + model_type = "3-medium" + + params = sd3_models.SD3Params( + patch_size=patch_size, + depth=depth, + num_patches=num_patches, + pos_embed_max_size=pos_embed_max_size, + adm_in_channels=adm_in_channels, + qk_norm=qk_norm, + x_block_self_attn_layers=x_block_self_attn_layers, + context_embedder_in_features=context_embedder_in_features, + context_embedder_out_features=context_embedder_out_features, + model_type=model_type, + ) + logger.info(f"Analyzed state dict state: {params}") + return params -def load_mmdit(state_dict: Dict, attn_mode: str, dtype: Optional[Union[str, torch.dtype]], device: Union[str, torch.device]): +def load_mmdit( + state_dict: Dict, dtype: Optional[Union[str, torch.dtype]], device: Union[str, torch.device], attn_mode: str = "torch" +) -> sd3_models.MMDiT: mmdit_sd = {} mmdit_prefix = "model.diffusion_model." @@ -40,8 +86,9 @@ def load_mmdit(state_dict: Dict, attn_mode: str, dtype: Optional[Union[str, torc # load MMDiT logger.info("Building MMDit") + params = analyze_state_dict_state(mmdit_sd) with init_empty_weights(): - mmdit = sd3_models.create_mmdit_sd3_medium_configs(attn_mode) + mmdit = sd3_models.create_sd3_mmdit(params, attn_mode) logger.info("Loading state dict...") info = sdxl_model_util._load_state_dict_on_device(mmdit, mmdit_sd, device, dtype) @@ -50,20 +97,14 @@ def load_mmdit(state_dict: Dict, attn_mode: str, dtype: Optional[Union[str, torc def load_clip_l( - state_dict: Dict, clip_l_path: Optional[str], - attn_mode: str, - clip_dtype: Optional[Union[str, torch.dtype]], + dtype: Optional[Union[str, torch.dtype]], device: Union[str, torch.device], disable_mmap: bool = False, + state_dict: Optional[Dict] = None, ): clip_l_sd = None - if clip_l_path: - logger.info(f"Loading clip_l from {clip_l_path}...") - clip_l_sd = load_safetensors(clip_l_path, device, disable_mmap) - for key in list(clip_l_sd.keys()): - clip_l_sd["transformer." + key] = clip_l_sd.pop(key) - else: + if clip_l_path is None: if "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight" in state_dict: # found clip_l: remove prefix "text_encoders.clip_l." 
logger.info("clip_l is included in the checkpoint") @@ -72,34 +113,58 @@ def load_clip_l( for k in list(state_dict.keys()): if k.startswith(prefix): clip_l_sd[k[len(prefix) :]] = state_dict.pop(k) + elif clip_l_path is None: + logger.info("clip_l is not included in the checkpoint and clip_l_path is not provided") + return None + + # load clip_l + logger.info("Building CLIP-L") + config = CLIPTextConfig( + vocab_size=49408, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=1e-05, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + model_type="clip_text_model", + projection_dim=768, + # torch_dtype="float32", + # transformers_version="4.25.0.dev0", + ) + with init_empty_weights(): + clip = CLIPTextModelWithProjection(config) if clip_l_sd is None: - clip_l = None - else: - logger.info("Building ClipL") - clip_l = sd3_models.create_clip_l(device, clip_dtype, clip_l_sd) - logger.info("Loading state dict...") - info = clip_l.load_state_dict(clip_l_sd) - logger.info(f"Loaded ClipL: {info}") - clip_l.set_attn_mode(attn_mode) - return clip_l + logger.info(f"Loading state dict from {clip_l_path}") + clip_l_sd = load_safetensors(clip_l_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) + + if "text_projection.weight" not in clip_l_sd: + logger.info("Adding text_projection.weight to clip_l_sd") + clip_l_sd["text_projection.weight"] = torch.eye(768, dtype=dtype, device=device) + + info = clip.load_state_dict(clip_l_sd, strict=False, assign=True) + logger.info(f"Loaded CLIP-L: {info}") + return clip def load_clip_g( - state_dict: Dict, clip_g_path: Optional[str], - attn_mode: str, - clip_dtype: Optional[Union[str, torch.dtype]], + dtype: Optional[Union[str, torch.dtype]], device: Union[str, torch.device], disable_mmap: bool = False, + state_dict: Optional[Dict] = None, ): clip_g_sd = None - if clip_g_path: - logger.info(f"Loading clip_g from {clip_g_path}...") - clip_g_sd = load_safetensors(clip_g_path, device, disable_mmap) - for key in list(clip_g_sd.keys()): - clip_g_sd["transformer." + key] = clip_g_sd.pop(key) - else: + if state_dict is not None: if "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight" in state_dict: # found clip_g: remove prefix "text_encoders.clip_g." 
logger.info("clip_g is included in the checkpoint") @@ -108,34 +173,53 @@ def load_clip_g( for k in list(state_dict.keys()): if k.startswith(prefix): clip_g_sd[k[len(prefix) :]] = state_dict.pop(k) + elif clip_g_path is None: + logger.info("clip_g is not included in the checkpoint and clip_g_path is not provided") + return None + + # load clip_g + logger.info("Building CLIP-G") + config = CLIPTextConfig( + vocab_size=49408, + hidden_size=1280, + intermediate_size=5120, + num_hidden_layers=32, + num_attention_heads=20, + max_position_embeddings=77, + hidden_act="gelu", + layer_norm_eps=1e-05, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + model_type="clip_text_model", + projection_dim=1280, + # torch_dtype="float32", + # transformers_version="4.25.0.dev0", + ) + with init_empty_weights(): + clip = CLIPTextModelWithProjection(config) if clip_g_sd is None: - clip_g = None - else: - logger.info("Building ClipG") - clip_g = sd3_models.create_clip_g(device, clip_dtype, clip_g_sd) - logger.info("Loading state dict...") - info = clip_g.load_state_dict(clip_g_sd) - logger.info(f"Loaded ClipG: {info}") - clip_g.set_attn_mode(attn_mode) - return clip_g + logger.info(f"Loading state dict from {clip_g_path}") + clip_g_sd = load_safetensors(clip_g_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) + info = clip.load_state_dict(clip_g_sd, strict=False, assign=True) + logger.info(f"Loaded CLIP-G: {info}") + return clip def load_t5xxl( - state_dict: Dict, t5xxl_path: Optional[str], - attn_mode: str, dtype: Optional[Union[str, torch.dtype]], device: Union[str, torch.device], disable_mmap: bool = False, + state_dict: Optional[Dict] = None, ): t5xxl_sd = None - if t5xxl_path: - logger.info(f"Loading t5xxl from {t5xxl_path}...") - t5xxl_sd = load_safetensors(t5xxl_path, device, disable_mmap) - for key in list(t5xxl_sd.keys()): - t5xxl_sd["transformer." + key] = t5xxl_sd.pop(key) - else: + if state_dict is not None: if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight" in state_dict: # found t5xxl: remove prefix "text_encoders.t5xxl." 
logger.info("t5xxl is included in the checkpoint") @@ -144,29 +228,19 @@ def load_t5xxl( for k in list(state_dict.keys()): if k.startswith(prefix): t5xxl_sd[k[len(prefix) :]] = state_dict.pop(k) + elif t5xxl_path is None: + logger.info("t5xxl is not included in the checkpoint and t5xxl_path is not provided") + return None - if t5xxl_sd is None: - t5xxl = None - else: - logger.info("Building T5XXL") - - # workaround for T5XXL model creation: create with fp16 takes too long TODO support virtual device - t5xxl = sd3_models.create_t5xxl(device, torch.float32, t5xxl_sd) - t5xxl.to(dtype=dtype) - - logger.info("Loading state dict...") - info = t5xxl.load_state_dict(t5xxl_sd) - logger.info(f"Loaded T5XXL: {info}") - t5xxl.set_attn_mode(attn_mode) - return t5xxl + return flux_utils_load_t5xxl(t5xxl_path, dtype, device, disable_mmap, state_dict=t5xxl_sd) def load_vae( - state_dict: Dict, vae_path: Optional[str], vae_dtype: Optional[Union[str, torch.dtype]], device: Optional[Union[str, torch.device]], disable_mmap: bool = False, + state_dict: Optional[Dict] = None, ): vae_sd = {} if vae_path: @@ -181,299 +255,15 @@ def load_vae( vae_sd[k[len(vae_prefix) :]] = state_dict.pop(k) logger.info("Building VAE") - vae = sd3_models.SDVAE() + vae = sd3_models.SDVAE(vae_dtype, device) logger.info("Loading state dict...") info = vae.load_state_dict(vae_sd) logger.info(f"Loaded VAE: {info}") - vae.to(device=device, dtype=vae_dtype) + vae.to(device=device, dtype=vae_dtype) # make sure it's in the right device and dtype return vae -def load_models( - ckpt_path: str, - clip_l_path: str, - clip_g_path: str, - t5xxl_path: str, - vae_path: str, - attn_mode: str, - device: Union[str, torch.device], - weight_dtype: Optional[Union[str, torch.dtype]] = None, - disable_mmap: bool = False, - clip_dtype: Optional[Union[str, torch.dtype]] = None, - t5xxl_device: Optional[Union[str, torch.device]] = None, - t5xxl_dtype: Optional[Union[str, torch.dtype]] = None, - vae_dtype: Optional[Union[str, torch.dtype]] = None, -): - """ - Load SD3 models from checkpoint files. - - Args: - ckpt_path: Path to the SD3 checkpoint file. - clip_l_path: Path to the clip_l checkpoint file. - clip_g_path: Path to the clip_g checkpoint file. - t5xxl_path: Path to the t5xxl checkpoint file. - vae_path: Path to the VAE checkpoint file. - attn_mode: Attention mode for MMDiT model. - device: Device for MMDiT model. - weight_dtype: Default dtype of weights for all models. This is weight dtype, so the model dtype may be different. - disable_mmap: Disable memory mapping when loading state dict. - clip_dtype: Dtype for Clip models, or None to use default dtype. - t5xxl_device: Device for T5XXL model to load T5XXL in another device (eg. gpu). Default is None to use device. - t5xxl_dtype: Dtype for T5XXL model, or None to use default dtype. - vae_dtype: Dtype for VAE model, or None to use default dtype. - - Returns: - Tuple of MMDiT, ClipL, ClipG, T5XXL, and VAE models. - """ - - # In SD1/2 and SDXL, the model is created with empty weights and then loaded with state dict. - # However, in SD3, Clip and T5XXL models are created with dtype, so we need to set dtype before loading state dict. - # Therefore, we need clip_dtype and t5xxl_dtype. 
- - def load_state_dict(path: str, dvc: Union[str, torch.device] = device): - if disable_mmap: - return safetensors.torch.load(open(path, "rb").read()) - else: - try: - return load_file(path, device=dvc) - except: - return load_file(path) # prevent device invalid Error - - t5xxl_device = t5xxl_device or device - clip_dtype = clip_dtype or weight_dtype or torch.float32 - t5xxl_dtype = t5xxl_dtype or weight_dtype or torch.float32 - vae_dtype = vae_dtype or weight_dtype or torch.float32 - - logger.info(f"Loading SD3 models from {ckpt_path}...") - state_dict = load_state_dict(ckpt_path) - - # load clip_l - clip_l_sd = None - if clip_l_path: - logger.info(f"Loading clip_l from {clip_l_path}...") - clip_l_sd = load_state_dict(clip_l_path) - for key in list(clip_l_sd.keys()): - clip_l_sd["transformer." + key] = clip_l_sd.pop(key) - else: - if "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight" in state_dict: - # found clip_l: remove prefix "text_encoders.clip_l." - logger.info("clip_l is included in the checkpoint") - clip_l_sd = {} - prefix = "text_encoders.clip_l." - for k in list(state_dict.keys()): - if k.startswith(prefix): - clip_l_sd[k[len(prefix) :]] = state_dict.pop(k) - - # load clip_g - clip_g_sd = None - if clip_g_path: - logger.info(f"Loading clip_g from {clip_g_path}...") - clip_g_sd = load_state_dict(clip_g_path) - for key in list(clip_g_sd.keys()): - clip_g_sd["transformer." + key] = clip_g_sd.pop(key) - else: - if "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight" in state_dict: - # found clip_g: remove prefix "text_encoders.clip_g." - logger.info("clip_g is included in the checkpoint") - clip_g_sd = {} - prefix = "text_encoders.clip_g." - for k in list(state_dict.keys()): - if k.startswith(prefix): - clip_g_sd[k[len(prefix) :]] = state_dict.pop(k) - - # load t5xxl - t5xxl_sd = None - if t5xxl_path: - logger.info(f"Loading t5xxl from {t5xxl_path}...") - t5xxl_sd = load_state_dict(t5xxl_path, t5xxl_device) - for key in list(t5xxl_sd.keys()): - t5xxl_sd["transformer." + key] = t5xxl_sd.pop(key) - else: - if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight" in state_dict: - # found t5xxl: remove prefix "text_encoders.t5xxl." - logger.info("t5xxl is included in the checkpoint") - t5xxl_sd = {} - prefix = "text_encoders.t5xxl." - for k in list(state_dict.keys()): - if k.startswith(prefix): - t5xxl_sd[k[len(prefix) :]] = state_dict.pop(k) - - # MMDiT and VAE - vae_sd = {} - if vae_path: - logger.info(f"Loading VAE from {vae_path}...") - vae_sd = load_state_dict(vae_path) - else: - # remove prefix "first_stage_model." - vae_sd = {} - vae_prefix = "first_stage_model." - for k in list(state_dict.keys()): - if k.startswith(vae_prefix): - vae_sd[k[len(vae_prefix) :]] = state_dict.pop(k) - - mmdit_prefix = "model.diffusion_model." 
- for k in list(state_dict.keys()): - if k.startswith(mmdit_prefix): - state_dict[k[len(mmdit_prefix) :]] = state_dict.pop(k) - else: - state_dict.pop(k) # remove other keys - - # load MMDiT - logger.info("Building MMDit") - with init_empty_weights(): - mmdit = sd3_models.create_mmdit_sd3_medium_configs(attn_mode) - - logger.info("Loading state dict...") - info = sdxl_model_util._load_state_dict_on_device(mmdit, state_dict, device, weight_dtype) - logger.info(f"Loaded MMDiT: {info}") - - # load ClipG and ClipL - if clip_l_sd is None: - clip_l = None - else: - logger.info("Building ClipL") - clip_l = sd3_models.create_clip_l(device, clip_dtype, clip_l_sd) - logger.info("Loading state dict...") - info = clip_l.load_state_dict(clip_l_sd) - logger.info(f"Loaded ClipL: {info}") - clip_l.set_attn_mode(attn_mode) - - if clip_g_sd is None: - clip_g = None - else: - logger.info("Building ClipG") - clip_g = sd3_models.create_clip_g(device, clip_dtype, clip_g_sd) - logger.info("Loading state dict...") - info = clip_g.load_state_dict(clip_g_sd) - logger.info(f"Loaded ClipG: {info}") - clip_g.set_attn_mode(attn_mode) - - # load T5XXL - if t5xxl_sd is None: - t5xxl = None - else: - logger.info("Building T5XXL") - t5xxl = sd3_models.create_t5xxl(t5xxl_device, t5xxl_dtype, t5xxl_sd) - logger.info("Loading state dict...") - info = t5xxl.load_state_dict(t5xxl_sd) - logger.info(f"Loaded T5XXL: {info}") - t5xxl.set_attn_mode(attn_mode) - - # load VAE - logger.info("Building VAE") - vae = sd3_models.SDVAE() - logger.info("Loading state dict...") - info = vae.load_state_dict(vae_sd) - logger.info(f"Loaded VAE: {info}") - vae.to(device=device, dtype=vae_dtype) - - return mmdit, clip_l, clip_g, t5xxl, vae - - # endregion -# region utils - - -def get_cond( - prompt: str, - tokenizer: sd3_models.SD3Tokenizer, - clip_l: sd3_models.SDClipModel, - clip_g: sd3_models.SDXLClipG, - t5xxl: Optional[sd3_models.T5XXLModel] = None, - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, -): - l_tokens, g_tokens, t5_tokens = tokenizer.tokenize_with_weights(prompt) - print(t5_tokens) - return get_cond_from_tokens(l_tokens, g_tokens, t5_tokens, clip_l, clip_g, t5xxl, device=device, dtype=dtype) - - -def get_cond_from_tokens( - l_tokens, - g_tokens, - t5_tokens, - clip_l: sd3_models.SDClipModel, - clip_g: sd3_models.SDXLClipG, - t5xxl: Optional[sd3_models.T5XXLModel] = None, - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, -): - l_out, l_pooled = clip_l.encode_token_weights(l_tokens) - g_out, g_pooled = clip_g.encode_token_weights(g_tokens) - lg_out = torch.cat([l_out, g_out], dim=-1) - lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1])) - if device is not None: - lg_out = lg_out.to(device=device) - l_pooled = l_pooled.to(device=device) - g_pooled = g_pooled.to(device=device) - if dtype is not None: - lg_out = lg_out.to(dtype=dtype) - l_pooled = l_pooled.to(dtype=dtype) - g_pooled = g_pooled.to(dtype=dtype) - - # t5xxl may be in another device (eg. 
cpu) - if t5_tokens is None: - t5_out = torch.zeros((lg_out.shape[0], 77, 4096), device=lg_out.device, dtype=lg_out.dtype) - else: - t5_out, _ = t5xxl.encode_token_weights(t5_tokens) # t5_out is [1, 77, 4096], t5_pooled is None - if device is not None: - t5_out = t5_out.to(device=device) - if dtype is not None: - t5_out = t5_out.to(dtype=dtype) - - # return torch.cat([lg_out, t5_out], dim=-2), torch.cat((l_pooled, g_pooled), dim=-1) - return lg_out, t5_out, torch.cat((l_pooled, g_pooled), dim=-1) - - -# used if other sd3 models is available -r""" -def get_sd3_configs(state_dict: Dict): - # Important configuration values can be quickly determined by checking shapes in the source file - # Some of these will vary between models (eg 2B vs 8B primarily differ in their depth, but also other details change) - # prefix = "model.diffusion_model." - prefix = "" - - patch_size = state_dict[prefix + "x_embedder.proj.weight"].shape[2] - depth = state_dict[prefix + "x_embedder.proj.weight"].shape[0] // 64 - num_patches = state_dict[prefix + "pos_embed"].shape[1] - pos_embed_max_size = round(math.sqrt(num_patches)) - adm_in_channels = state_dict[prefix + "y_embedder.mlp.0.weight"].shape[1] - context_shape = state_dict[prefix + "context_embedder.weight"].shape - context_embedder_config = { - "target": "torch.nn.Linear", - "params": {"in_features": context_shape[1], "out_features": context_shape[0]}, - } - return { - "patch_size": patch_size, - "depth": depth, - "num_patches": num_patches, - "pos_embed_max_size": pos_embed_max_size, - "adm_in_channels": adm_in_channels, - "context_embedder": context_embedder_config, - } - - -def create_mmdit_from_sd3_checkpoint(state_dict: Dict, attn_mode: str = "xformers"): - "" - Doesn't load state dict. - "" - sd3_configs = get_sd3_configs(state_dict) - - mmdit = sd3_models.MMDiT( - input_size=None, - pos_embed_max_size=sd3_configs["pos_embed_max_size"], - patch_size=sd3_configs["patch_size"], - in_channels=16, - adm_in_channels=sd3_configs["adm_in_channels"], - depth=sd3_configs["depth"], - mlp_ratio=4, - qk_norm=None, - num_patches=sd3_configs["num_patches"], - context_size=4096, - attn_mode=attn_mode, - ) - return mmdit -""" class ModelSamplingDiscreteFlow: @@ -509,6 +299,3 @@ def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): # assert max_denoise is False, "max_denoise not implemented" # max_denoise is always True, I'm not sure why it's there return sigma * noise + (1.0 - sigma) * latent_image - - -# endregion diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py index 9fde02084..dd08cf004 100644 --- a/library/strategy_sd3.py +++ b/library/strategy_sd3.py @@ -3,7 +3,7 @@ from typing import Any, List, Optional, Tuple, Union import torch import numpy as np -from transformers import CLIPTokenizer, T5TokenizerFast +from transformers import CLIPTokenizer, T5TokenizerFast, CLIPTextModel, CLIPTextModelWithProjection, T5EncoderModel from library import sd3_utils, train_util from library import sd3_models @@ -48,45 +48,79 @@ def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: class Sd3TextEncodingStrategy(TextEncodingStrategy): - def __init__(self) -> None: - pass + def __init__(self, apply_lg_attn_mask: Optional[bool] = None, apply_t5_attn_mask: Optional[bool] = None) -> None: + """ + Args: + apply_t5_attn_mask: Default value for apply_t5_attn_mask. 
+ """ + self.apply_lg_attn_mask = apply_lg_attn_mask + self.apply_t5_attn_mask = apply_t5_attn_mask def encode_tokens( self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor], - apply_lg_attn_mask: bool = False, - apply_t5_attn_mask: bool = False, + apply_lg_attn_mask: Optional[bool] = False, + apply_t5_attn_mask: Optional[bool] = False, ) -> List[torch.Tensor]: """ returned embeddings are not masked """ clip_l, clip_g, t5xxl = models + clip_l: CLIPTextModel + clip_g: CLIPTextModelWithProjection + t5xxl: T5EncoderModel + + if apply_lg_attn_mask is None: + apply_lg_attn_mask = self.apply_lg_attn_mask + if apply_t5_attn_mask is None: + apply_t5_attn_mask = self.apply_t5_attn_mask l_tokens, g_tokens, t5_tokens = tokens[:3] - l_attn_mask, g_attn_mask, t5_attn_mask = tokens[3:] if len(tokens) > 3 else [None, None, None] + + if len(tokens) > 3: + l_attn_mask, g_attn_mask, t5_attn_mask = tokens[3:] + if not apply_lg_attn_mask: + l_attn_mask = None + g_attn_mask = None + else: + l_attn_mask = l_attn_mask.to(clip_l.device) + g_attn_mask = g_attn_mask.to(clip_g.device) + if not apply_t5_attn_mask: + t5_attn_mask = None + else: + t5_attn_mask = t5_attn_mask.to(t5xxl.device) + else: + l_attn_mask = None + g_attn_mask = None + t5_attn_mask = None + if l_tokens is None: assert g_tokens is None, "g_tokens must be None if l_tokens is None" lg_out = None + lg_pooled = None else: - assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None" - l_out, l_pooled = clip_l(l_tokens) - g_out, g_pooled = clip_g(g_tokens) - if apply_lg_attn_mask: - l_out = l_out * l_attn_mask.to(l_out.device).unsqueeze(-1) - g_out = g_out * g_attn_mask.to(g_out.device).unsqueeze(-1) - lg_out = torch.cat([l_out, g_out], dim=-1) + with torch.no_grad(): + assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None" + prompt_embeds = clip_l(l_tokens.to(clip_l.device), l_attn_mask, output_hidden_states=True) + l_pooled = prompt_embeds[0] + l_out = prompt_embeds.hidden_states[-2] + + prompt_embeds = clip_g(g_tokens.to(clip_g.device), g_attn_mask, output_hidden_states=True) + g_pooled = prompt_embeds[0] + g_out = prompt_embeds.hidden_states[-2] + + lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) if l_tokens is not None else None + lg_out = torch.cat([l_out, g_out], dim=-1) if t5xxl is not None and t5_tokens is not None: - t5_out, _ = t5xxl(t5_tokens) # t5_out is [1, max length, 4096] - if apply_t5_attn_mask: - t5_out = t5_out * t5_attn_mask.to(t5_out.device).unsqueeze(-1) + with torch.no_grad(): + t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), t5_attn_mask, return_dict=False, output_hidden_states=True) else: t5_out = None - lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) if l_tokens is not None else None - return [lg_out, t5_out, lg_pooled] + return [lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask] # masks are used for attention masking in transformer def concat_encodings( self, lg_out: torch.Tensor, t5_out: Optional[torch.Tensor], lg_pooled: torch.Tensor @@ -132,39 +166,38 @@ def is_disk_cached_outputs_expected(self, npz_path: str): return False if "clip_l_attn_mask" not in npz or "clip_g_attn_mask" not in npz: # necessary even if not used return False - # t5xxl is optional + if "apply_lg_attn_mask" not in npz: + return False + if "t5_out" not in npz: + return False + if "t5_attn_mask" not in npz: + return False + npz_apply_lg_attn_mask = npz["apply_lg_attn_mask"] + if npz_apply_lg_attn_mask != self.apply_lg_attn_mask: + return False + if 
"apply_t5_attn_mask" not in npz: + return False + npz_apply_t5_attn_mask = npz["apply_t5_attn_mask"] + if npz_apply_t5_attn_mask != self.apply_t5_attn_mask: + return False except Exception as e: logger.error(f"Error loading file: {npz_path}") raise e return True - def mask_lg_attn(self, lg_out: np.ndarray, l_attn_mask: np.ndarray, g_attn_mask: np.ndarray) -> np.ndarray: - l_out = lg_out[..., :768] - g_out = lg_out[..., 768:] # 1280 - l_out = l_out * np.expand_dims(l_attn_mask, -1) # l_out = l_out * l_attn_mask. - g_out = g_out * np.expand_dims(g_attn_mask, -1) # g_out = g_out * g_attn_mask. - return np.concatenate([l_out, g_out], axis=-1) - - def mask_t5_attn(self, t5_out: np.ndarray, t5_attn_mask: np.ndarray) -> np.ndarray: - return t5_out * np.expand_dims(t5_attn_mask, -1) - def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: data = np.load(npz_path) lg_out = data["lg_out"] lg_pooled = data["lg_pooled"] - t5_out = data["t5_out"] if "t5_out" in data else None - - if self.apply_lg_attn_mask: - l_attn_mask = data["clip_l_attn_mask"] - g_attn_mask = data["clip_g_attn_mask"] - lg_out = self.mask_lg_attn(lg_out, l_attn_mask, g_attn_mask) + t5_out = data["t5_out"] - if self.apply_t5_attn_mask and t5_out is not None: - t5_attn_mask = data["t5_attn_mask"] - t5_out = self.mask_t5_attn(t5_out, t5_attn_mask) + l_attn_mask = data["clip_l_attn_mask"] + g_attn_mask = data["clip_g_attn_mask"] + t5_attn_mask = data["t5_attn_mask"] - return [lg_out, t5_out, lg_pooled] + # apply_t5_attn_mask and apply_lg_attn_mask are same as self.apply_t5_attn_mask and self.apply_lg_attn_mask + return [lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask] def cache_batch_outputs( self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, infos: List @@ -174,7 +207,7 @@ def cache_batch_outputs( tokens_and_masks = tokenize_strategy.tokenize(captions) with torch.no_grad(): - lg_out, t5_out, lg_pooled = sd3_text_encoding_strategy.encode_tokens( + lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = sd3_text_encoding_strategy.encode_tokens( tokenize_strategy, models, tokens_and_masks, self.apply_lg_attn_mask, self.apply_t5_attn_mask ) @@ -182,38 +215,41 @@ def cache_batch_outputs( lg_out = lg_out.float() if lg_pooled.dtype == torch.bfloat16: lg_pooled = lg_pooled.float() - if t5_out is not None and t5_out.dtype == torch.bfloat16: + if t5_out.dtype == torch.bfloat16: t5_out = t5_out.float() lg_out = lg_out.cpu().numpy() lg_pooled = lg_pooled.cpu().numpy() - if t5_out is not None: - t5_out = t5_out.cpu().numpy() + t5_out = t5_out.cpu().numpy() + + l_attn_mask = tokens_and_masks[3].cpu().numpy() + g_attn_mask = tokens_and_masks[4].cpu().numpy() + t5_attn_mask = tokens_and_masks[5].cpu().numpy() for i, info in enumerate(infos): lg_out_i = lg_out[i] - t5_out_i = t5_out[i] if t5_out is not None else None + t5_out_i = t5_out[i] lg_pooled_i = lg_pooled[i] + l_attn_mask_i = l_attn_mask[i] + g_attn_mask_i = g_attn_mask[i] + t5_attn_mask_i = t5_attn_mask[i] + apply_lg_attn_mask = self.apply_lg_attn_mask + apply_t5_attn_mask = self.apply_t5_attn_mask if self.cache_to_disk: - clip_l_attn_mask, clip_g_attn_mask, t5_attn_mask = tokens_and_masks[3:6] - clip_l_attn_mask_i = clip_l_attn_mask[i].cpu().numpy() - clip_g_attn_mask_i = clip_g_attn_mask[i].cpu().numpy() - t5_attn_mask_i = t5_attn_mask[i].cpu().numpy() if t5_attn_mask is not None else None # shouldn't be None - kwargs = {} - if t5_out is not None: - kwargs["t5_out"] = t5_out_i np.savez( 
info.text_encoder_outputs_npz, lg_out=lg_out_i, lg_pooled=lg_pooled_i, - clip_l_attn_mask=clip_l_attn_mask_i, - clip_g_attn_mask=clip_g_attn_mask_i, + t5_out=t5_out_i, + clip_l_attn_mask=l_attn_mask_i, + clip_g_attn_mask=g_attn_mask_i, t5_attn_mask=t5_attn_mask_i, - **kwargs, + apply_lg_attn_mask=apply_lg_attn_mask, + apply_t5_attn_mask=apply_t5_attn_mask, ) else: - info.text_encoder_outputs = (lg_out_i, t5_out_i, lg_pooled_i) + info.text_encoder_outputs = (lg_out_i, t5_out_i, lg_pooled_i, l_attn_mask_i, g_attn_mask_i, t5_attn_mask_i) class Sd3LatentsCachingStrategy(LatentsCachingStrategy): @@ -246,41 +282,3 @@ def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask if not train_util.HIGH_VRAM: train_util.clean_memory_on_device(vae.device) - - -if __name__ == "__main__": - # test code for Sd3TokenizeStrategy - # tokenizer = sd3_models.SD3Tokenizer() - strategy = Sd3TokenizeStrategy(256) - text = "hello world" - - l_tokens, g_tokens, t5_tokens = strategy.tokenize(text) - # print(l_tokens.shape) - print(l_tokens) - print(g_tokens) - print(t5_tokens) - - texts = ["hello world", "the quick brown fox jumps over the lazy dog"] - l_tokens_2 = strategy.clip_l(texts, max_length=77, padding="max_length", truncation=True, return_tensors="pt") - g_tokens_2 = strategy.clip_g(texts, max_length=77, padding="max_length", truncation=True, return_tensors="pt") - t5_tokens_2 = strategy.t5xxl( - texts, max_length=strategy.t5xxl_max_length, padding="max_length", truncation=True, return_tensors="pt" - ) - print(l_tokens_2) - print(g_tokens_2) - print(t5_tokens_2) - - # compare - print(torch.allclose(l_tokens, l_tokens_2["input_ids"][0])) - print(torch.allclose(g_tokens, g_tokens_2["input_ids"][0])) - print(torch.allclose(t5_tokens, t5_tokens_2["input_ids"][0])) - - text = ",".join(["hello world! this is long text"] * 50) - l_tokens, g_tokens, t5_tokens = strategy.tokenize(text) - print(l_tokens) - print(g_tokens) - print(t5_tokens) - - print(f"model max length l: {strategy.clip_l.model_max_length}") - print(f"model max length g: {strategy.clip_g.model_max_length}") - print(f"model max length t5: {strategy.t5xxl.model_max_length}") diff --git a/library/train_util.py b/library/train_util.py index 462c7a9a2..9ea1eec0e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5967,6 +5967,37 @@ def line_to_prompt_dict(line: str) -> dict: return prompt_dict +def load_prompts(prompt_file: str) -> List[Dict]: + # read prompts + if prompt_file.endswith(".txt"): + with open(prompt_file, "r", encoding="utf-8") as f: + lines = f.readlines() + prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"] + elif prompt_file.endswith(".toml"): + with open(prompt_file, "r", encoding="utf-8") as f: + data = toml.load(f) + prompts = [dict(**data["prompt"], **subset) for subset in data["prompt"]["subset"]] + elif prompt_file.endswith(".json"): + with open(prompt_file, "r", encoding="utf-8") as f: + prompts = json.load(f) + + # preprocess prompts + for i in range(len(prompts)): + prompt_dict = prompts[i] + if isinstance(prompt_dict, str): + from library.train_util import line_to_prompt_dict + + prompt_dict = line_to_prompt_dict(prompt_dict) + prompts[i] = prompt_dict + assert isinstance(prompt_dict, dict) + + # Adds an enumerator to the dict based on prompt position. Used later to name image files. Also cleanup of extra data in original prompt dict. 
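A usage sketch for the load_prompts helper being added here, assuming a plain-text prompt file with one prompt per line (the file name below is hypothetical; lines starting with "#" are skipped, and .toml/.json files are also accepted):

from library import train_util

prompts = train_util.load_prompts("sample_prompts.txt")
for prompt_dict in prompts:
    # each entry is a dict; "enum" records the prompt's position and is later used to name sample images
    print(prompt_dict["enum"], prompt_dict.get("prompt", ""))
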
+ prompt_dict["enum"] = i + prompt_dict.pop("subset", None) + + return prompts + + def sample_images_common( pipe_class, accelerator: Accelerator, diff --git a/library/utils.py b/library/utils.py index 8a0c782c0..ca0f904d2 100644 --- a/library/utils.py +++ b/library/utils.py @@ -13,12 +13,16 @@ import cv2 from PIL import Image import numpy as np +from safetensors.torch import load_file def fire_in_thread(f, *args, **kwargs): threading.Thread(target=f, args=args, kwargs=kwargs).start() +# region Logging + + def add_logging_arguments(parser): parser.add_argument( "--console_log_level", @@ -85,6 +89,11 @@ def setup_logging(args=None, log_level=None, reset=False): logger.info(msg_init) +# endregion + +# region PyTorch utils + + def str_to_dtype(s: Optional[str], default_dtype: Optional[torch.dtype] = None) -> torch.dtype: """ Convert a string to a torch.dtype @@ -304,6 +313,35 @@ def _convert_float8(byte_tensor, dtype_str, shape): # return byte_tensor.view(torch.uint8).to(torch.float16).reshape(shape) raise ValueError(f"Unsupported float8 type: {dtype_str} (upgrade PyTorch to support float8 types)") +def load_safetensors( + path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = torch.float32 +) -> dict[str, torch.Tensor]: + if disable_mmap: + # return safetensors.torch.load(open(path, "rb").read()) + # use experimental loader + # logger.info(f"Loading without mmap (experimental)") + state_dict = {} + with MemoryEfficientSafeOpen(path) as f: + for key in f.keys(): + state_dict[key] = f.get_tensor(key).to(device, dtype=dtype) + return state_dict + else: + try: + state_dict = load_file(path, device=device) + except: + state_dict = load_file(path) # prevent device invalid Error + if dtype is not None: + for key in state_dict.keys(): + state_dict[key] = state_dict[key].to(dtype=dtype) + return state_dict + + + +# endregion + +# region Image utils + + def pil_resize(image, size, interpolation=Image.LANCZOS): has_alpha = image.shape[2] == 4 if len(image.shape) == 3 else False @@ -323,9 +361,9 @@ def pil_resize(image, size, interpolation=Image.LANCZOS): return resized_cv2 -# TODO make inf_utils.py - +# endregion +# TODO make inf_utils.py # region Gradual Latent hires fix diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py index 630da7e08..d099fe18d 100644 --- a/sd3_minimal_inference.py +++ b/sd3_minimal_inference.py @@ -12,6 +12,7 @@ from safetensors.torch import safe_open, load_file from tqdm import tqdm from PIL import Image +from transformers import CLIPTextModelWithProjection, T5EncoderModel from library.device_utils import init_ipex, get_preferred_device @@ -25,11 +26,14 @@ logger = logging.getLogger(__name__) from library import sd3_models, sd3_utils, strategy_sd3 +from library.utils import load_safetensors -def get_noise(seed, latent): - generator = torch.manual_seed(seed) - return torch.randn(latent.size(), dtype=torch.float32, layout=latent.layout, generator=generator, device="cpu").to(latent.dtype) +def get_noise(seed, latent, device="cpu"): + # generator = torch.manual_seed(seed) + generator = torch.Generator(device) + generator.manual_seed(seed) + return torch.randn(latent.size(), dtype=latent.dtype, layout=latent.layout, generator=generator, device=device) def get_sigmas(sampling: sd3_utils.ModelSamplingDiscreteFlow, steps): @@ -59,7 +63,7 @@ def do_sample( neg_cond: Tuple[torch.Tensor, torch.Tensor], mmdit: sd3_models.MMDiT, steps: int, - guidance_scale: float, + cfg_scale: float, dtype: torch.dtype, device: str, ): @@ -71,7 
+75,7 @@ def do_sample( latent = latent.to(dtype).to(device) - noise = get_noise(seed, latent).to(device) + noise = get_noise(seed, latent, device) model_sampling = sd3_utils.ModelSamplingDiscreteFlow(shift=3.0) # 3.0 is for SD3 @@ -105,7 +109,7 @@ def do_sample( batched = model_sampling.calculate_denoised(sigma_hat, model_output, x) pos_out, neg_out = batched.chunk(2) - denoised = neg_out + (pos_out - neg_out) * guidance_scale + denoised = neg_out + (pos_out - neg_out) * cfg_scale # print(denoised.shape) # d = to_d(x, sigma_hat, denoised) @@ -122,20 +126,89 @@ def do_sample( x = x.to(dtype) latent = x - scale_factor = 1.5305 - shift_factor = 0.0609 - # def process_out(self, latent): - # return (latent / self.scale_factor) + self.shift_factor - latent = (latent / scale_factor) + shift_factor + latent = vae.process_out(latent) return latent +def generate_image( + mmdit: sd3_models.MMDiT, + vae: sd3_models.SDVAE, + clip_l: CLIPTextModelWithProjection, + clip_g: CLIPTextModelWithProjection, + t5xxl: T5EncoderModel, + steps: int, + prompt: str, + seed: int, + target_width: int, + target_height: int, + device: str, + negative_prompt: str, + cfg_scale: float, +): + # prepare embeddings + logger.info("Encoding prompts...") + + # TODO support one-by-one offloading + clip_l.to(device) + clip_g.to(device) + t5xxl.to(device) + + with torch.no_grad(): + tokens_and_masks = tokenize_strategy.tokenize(prompt) + lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = encoding_strategy.encode_tokens( + tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask + ) + cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) + + tokens_and_masks = tokenize_strategy.tokenize(negative_prompt) + lg_out, t5_out, pooled, neg_l_attn_mask, neg_g_attn_mask, neg_t5_attn_mask = encoding_strategy.encode_tokens( + tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask + ) + neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) + + # attn masks are not used currently + + if args.offload: + clip_l.to("cpu") + clip_g.to("cpu") + t5xxl.to("cpu") + + # generate image + logger.info("Generating image...") + mmdit.to(device) + latent_sampled = do_sample(target_height, target_width, None, seed, cond, neg_cond, mmdit, steps, cfg_scale, sd3_dtype, device) + if args.offload: + mmdit.to("cpu") + + # latent to image + vae.to(device) + with torch.no_grad(): + image = vae.decode(latent_sampled) + + if args.offload: + vae.to("cpu") + + image = image.float() + image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0] + decoded_np = 255.0 * np.moveaxis(image.cpu().numpy(), 0, 2) + decoded_np = decoded_np.astype(np.uint8) + out_image = Image.fromarray(decoded_np) + + # save image + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, f"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.png") + out_image.save(output_path) + + logger.info(f"Saved image to {output_path}") + + if __name__ == "__main__": target_height = 1024 target_width = 1024 # steps = 50 # 28 # 50 - guidance_scale = 5 + # cfg_scale = 5 # seed = 1 # None # 1 device = get_preferred_device() @@ -145,15 +218,17 @@ def do_sample( parser.add_argument("--clip_g", type=str, required=False) parser.add_argument("--clip_l", type=str, required=False) parser.add_argument("--t5xxl", type=str, required=False) - parser.add_argument("--t5xxl_token_length", type=int, default=77, help="t5xxl 
token length, default: 77") + parser.add_argument("--t5xxl_token_length", type=int, default=256, help="t5xxl token length, default: 256") parser.add_argument("--apply_lg_attn_mask", action="store_true") parser.add_argument("--apply_t5_attn_mask", action="store_true") parser.add_argument("--prompt", type=str, default="A photo of a cat") # parser.add_argument("--prompt2", type=str, default=None) # do not support different prompts for text encoders parser.add_argument("--negative_prompt", type=str, default="") + parser.add_argument("--cfg_scale", type=float, default=5.0) + parser.add_argument("--offload", action="store_true", help="Offload to CPU") parser.add_argument("--output_dir", type=str, default=".") - parser.add_argument("--do_not_use_t5xxl", action="store_true") - parser.add_argument("--attn_mode", type=str, default="torch", help="torch (SDPA) or xformers. default: torch") + # parser.add_argument("--do_not_use_t5xxl", action="store_true") + # parser.add_argument("--attn_mode", type=str, default="torch", help="torch (SDPA) or xformers. default: torch") parser.add_argument("--fp16", action="store_true") parser.add_argument("--bf16", action="store_true") parser.add_argument("--seed", type=int, default=1) @@ -165,7 +240,9 @@ def do_sample( # default=[], # help="LoRA weights, only supports networks.lora, each argument is a `path;multiplier` (semi-colon separated)", # ) - # parser.add_argument("--interactive", action="store_true") + parser.add_argument("--width", type=int, default=target_width) + parser.add_argument("--height", type=int, default=target_height) + parser.add_argument("--interactive", action="store_true") args = parser.parse_args() seed = args.seed @@ -177,185 +254,126 @@ def do_sample( elif args.bf16: sd3_dtype = torch.bfloat16 - # TODO test with separated safetenors files for each model + loading_device = "cpu" if args.offload else device # load state dict logger.info(f"Loading SD3 models from {args.ckpt_path}...") - state_dict = load_file(args.ckpt_path) - - if "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight" in state_dict: - # found clip_g: remove prefix "text_encoders.clip_g." - logger.info("clip_g is included in the checkpoint") - clip_g_sd = {} - prefix = "text_encoders.clip_g." - for k, v in list(state_dict.items()): - if k.startswith(prefix): - clip_g_sd[k[len(prefix) :]] = state_dict.pop(k) - else: - logger.info(f"Lodaing clip_g from {args.clip_g}...") - clip_g_sd = load_file(args.clip_g) - for key in list(clip_g_sd.keys()): - clip_g_sd["transformer." + key] = clip_g_sd.pop(key) - - if "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight" in state_dict: - # found clip_l: remove prefix "text_encoders.clip_l." - logger.info("clip_l is included in the checkpoint") - clip_l_sd = {} - prefix = "text_encoders.clip_l." - for k, v in list(state_dict.items()): - if k.startswith(prefix): - clip_l_sd[k[len(prefix) :]] = state_dict.pop(k) - else: - logger.info(f"Lodaing clip_l from {args.clip_l}...") - clip_l_sd = load_file(args.clip_l) - for key in list(clip_l_sd.keys()): - clip_l_sd["transformer." + key] = clip_l_sd.pop(key) - - if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight" in state_dict: - # found t5xxl: remove prefix "text_encoders.t5xxl." - logger.info("t5xxl is included in the checkpoint") - if not args.do_not_use_t5xxl: - t5xxl_sd = {} - prefix = "text_encoders.t5xxl." 
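For reference, the generate_image helper added earlier in this file drives the new strategy classes roughly as follows. This is a sketch only, assuming the three text encoders have already been loaded (for example with the sd3_utils loaders) and moved to the target device:

import torch
from library import strategy_sd3

def encode_prompt(clip_l, clip_g, t5xxl, prompt: str):
    tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(256)  # 256 is the new default T5-XXL token length
    encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy()
    with torch.no_grad():
        tokens_and_masks = tokenize_strategy.tokenize(prompt)
        # encode_tokens now returns the attention masks alongside the embeddings
        lg_out, t5_out, lg_pooled, _, _, _ = encoding_strategy.encode_tokens(
            tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, False, False
        )
    # the concatenated CLIP-L/G + T5 context and the pooled CLIP embeddings form the conditioning
    return encoding_strategy.concat_encodings(lg_out, t5_out, lg_pooled)
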
- for k, v in list(state_dict.items()): - if k.startswith(prefix): - t5xxl_sd[k[len(prefix) :]] = state_dict.pop(k) - else: - logger.info("but not used") - for key in list(state_dict.keys()): - if key.startswith("text_encoders.t5xxl."): - state_dict.pop(key) - t5xxl_sd = None - elif args.t5xxl: - assert not args.do_not_use_t5xxl, "t5xxl is not used but specified" - logger.info(f"Lodaing t5xxl from {args.t5xxl}...") - t5xxl_sd = load_file(args.t5xxl) - for key in list(t5xxl_sd.keys()): - t5xxl_sd["transformer." + key] = t5xxl_sd.pop(key) - else: - logger.info("t5xxl is not used") - t5xxl_sd = None - - use_t5xxl = t5xxl_sd is not None - - # MMDiT and VAE - vae_sd = {} - vae_prefix = "first_stage_model." - mmdit_prefix = "model.diffusion_model." - for k, v in list(state_dict.items()): - if k.startswith(vae_prefix): - vae_sd[k[len(vae_prefix) :]] = state_dict.pop(k) - elif k.startswith(mmdit_prefix): - state_dict[k[len(mmdit_prefix) :]] = state_dict.pop(k) - - # load tokenizers - logger.info("Loading tokenizers...") - tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_token_length) - - # load models - # logger.info("Create MMDiT from SD3 checkpoint...") - # mmdit = sd3_utils.create_mmdit_from_sd3_checkpoint(state_dict) - logger.info("Create MMDiT") - mmdit = sd3_models.create_mmdit_sd3_medium_configs(args.attn_mode) - - logger.info("Loading state dict...") - info = mmdit.load_state_dict(state_dict) - logger.info(f"Loaded MMDiT: {info}") - - logger.info(f"Move MMDiT to {device} and {sd3_dtype}...") - mmdit.to(device, dtype=sd3_dtype) - mmdit.eval() - - # load VAE - logger.info("Create VAE") - vae = sd3_models.SDVAE() - logger.info("Loading state dict...") - info = vae.load_state_dict(vae_sd) - logger.info(f"Loaded VAE: {info}") - - logger.info(f"Move VAE to {device} and {sd3_dtype}...") - vae.to(device, dtype=sd3_dtype) - vae.eval() + # state_dict = load_file(args.ckpt_path) + state_dict = load_safetensors(args.ckpt_path, loading_device, disable_mmap=True, dtype=sd3_dtype) # load text encoders - logger.info("Create clip_l") - clip_l = sd3_models.create_clip_l(device, sd3_dtype, clip_l_sd) + clip_l = sd3_utils.load_clip_l(args.clip_l, sd3_dtype, loading_device, state_dict=state_dict) + clip_g = sd3_utils.load_clip_g(args.clip_g, sd3_dtype, loading_device, state_dict=state_dict) + t5xxl = sd3_utils.load_t5xxl(args.t5xxl, sd3_dtype, loading_device, state_dict=state_dict) - logger.info("Loading state dict...") - info = clip_l.load_state_dict(clip_l_sd) - logger.info(f"Loaded clip_l: {info}") + # MMDiT and VAE + vae = sd3_utils.load_vae(None, sd3_dtype, loading_device, state_dict=state_dict) + mmdit = sd3_utils.load_mmdit(state_dict, sd3_dtype, loading_device) + + clip_l.to(sd3_dtype) + clip_g.to(sd3_dtype) + t5xxl.to(sd3_dtype) + vae.to(sd3_dtype) + mmdit.to(sd3_dtype) + if not args.offload: + # make sure to move to the device: some tensors are created in the constructor on the CPU + clip_l.to(device) + clip_g.to(device) + t5xxl.to(device) + vae.to(device) + mmdit.to(device) - logger.info(f"Move clip_l to {device} and {sd3_dtype}...") - clip_l.to(device, dtype=sd3_dtype) clip_l.eval() - logger.info(f"Set attn_mode to {args.attn_mode}...") - clip_l.set_attn_mode(args.attn_mode) - - logger.info("Create clip_g") - clip_g = sd3_models.create_clip_g(device, sd3_dtype, clip_g_sd) - - logger.info("Loading state dict...") - info = clip_g.load_state_dict(clip_g_sd) - logger.info(f"Loaded clip_g: {info}") - - logger.info(f"Move clip_g to {device} and {sd3_dtype}...") - clip_g.to(device, 
dtype=sd3_dtype) clip_g.eval() - logger.info(f"Set attn_mode to {args.attn_mode}...") - clip_g.set_attn_mode(args.attn_mode) - - if use_t5xxl: - logger.info("Create t5xxl") - t5xxl = sd3_models.create_t5xxl(device, sd3_dtype, t5xxl_sd) - - logger.info("Loading state dict...") - info = t5xxl.load_state_dict(t5xxl_sd) - logger.info(f"Loaded t5xxl: {info}") - - logger.info(f"Move t5xxl to {device} and {sd3_dtype}...") - t5xxl.to(device, dtype=sd3_dtype) - # t5xxl.to("cpu", dtype=torch.float32) # run on CPU - t5xxl.eval() - logger.info(f"Set attn_mode to {args.attn_mode}...") - t5xxl.set_attn_mode(args.attn_mode) - else: - t5xxl = None + t5xxl.eval() + mmdit.eval() + vae.eval() - # prepare embeddings - logger.info("Encoding prompts...") + # load tokenizers + logger.info("Loading tokenizers...") + tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_token_length) encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy() - tokens_and_masks = tokenize_strategy.tokenize(args.prompt) - lg_out, t5_out, pooled = encoding_strategy.encode_tokens( - tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask - ) - cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) - - tokens_and_masks = tokenize_strategy.tokenize(args.negative_prompt) - lg_out, t5_out, pooled = encoding_strategy.encode_tokens( - tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask - ) - neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) - - # generate image - logger.info("Generating image...") - latent_sampled = do_sample( - target_height, target_width, None, seed, cond, neg_cond, mmdit, steps, guidance_scale, sd3_dtype, device - ) - - # latent to image - with torch.no_grad(): - image = vae.decode(latent_sampled) - image = image.float() - image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0] - decoded_np = 255.0 * np.moveaxis(image.cpu().numpy(), 0, 2) - decoded_np = decoded_np.astype(np.uint8) - out_image = Image.fromarray(decoded_np) - - # save image - output_dir = args.output_dir - os.makedirs(output_dir, exist_ok=True) - output_path = os.path.join(output_dir, f"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.png") - out_image.save(output_path) - - logger.info(f"Saved image to {output_path}") + if not args.interactive: + generate_image( + mmdit, + vae, + clip_l, + clip_g, + t5xxl, + args.steps, + args.prompt, + args.seed, + args.width, + args.height, + device, + args.negative_prompt, + args.cfg_scale, + ) + else: + # loop for interactive + width = args.width + height = args.height + steps = None + cfg_scale = args.cfg_scale + + while True: + print( + "Enter prompt (empty to exit). Options: --w --h --s --d " + " --n , `--n -` for empty negative prompt" + "Options are kept for the next prompt. 
Current options:" + f" width={width}, height={height}, steps={steps}, seed={seed}, cfg_scale={cfg_scale}" + ) + prompt = input() + if prompt == "": + break + + # parse options + options = prompt.split("--") + prompt = options[0].strip() + seed = None + negative_prompt = None + for opt in options[1:]: + try: + opt = opt.strip() + if opt.startswith("w"): + width = int(opt[1:].strip()) + elif opt.startswith("h"): + height = int(opt[1:].strip()) + elif opt.startswith("s"): + steps = int(opt[1:].strip()) + elif opt.startswith("d"): + seed = int(opt[1:].strip()) + # elif opt.startswith("m"): + # mutipliers = opt[1:].strip().split(",") + # if len(mutipliers) != len(lora_models): + # logger.error(f"Invalid number of multipliers, expected {len(lora_models)}") + # continue + # for i, lora_model in enumerate(lora_models): + # lora_model.set_multiplier(float(mutipliers[i])) + elif opt.startswith("n"): + negative_prompt = opt[1:].strip() + if negative_prompt == "-": + negative_prompt = "" + elif opt.startswith("c"): + cfg_scale = float(opt[1:].strip()) + except ValueError as e: + logger.error(f"Invalid option: {opt}, {e}") + + generate_image( + mmdit, + vae, + clip_l, + clip_g, + t5xxl, + steps if steps is not None else args.steps, + prompt, + seed if seed is not None else args.seed, + width, + height, + device, + negative_prompt if negative_prompt is not None else args.negative_prompt, + cfg_scale, + ) + + logger.info("Done!") diff --git a/sd3_train.py b/sd3_train.py index ef18c32c4..6336b4cf9 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -1,6 +1,7 @@ # training with captions import argparse +from concurrent.futures import ThreadPoolExecutor import copy import math import os @@ -11,6 +12,7 @@ from tqdm import tqdm import torch +from library import utils from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -38,7 +40,7 @@ ConfigSanitizer, BlueprintGenerator, ) -import library.custom_train_functions as custom_train_functions +from library.custom_train_functions import apply_masked_loss, add_custom_train_arguments # from library.custom_train_functions import ( # apply_snr_weight, @@ -61,23 +63,13 @@ def train(args): if not args.skip_cache_check: args.skip_cache_check = args.skip_latents_validity_check - assert ( - not args.weighted_captions - ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" + # assert ( + # not args.weighted_captions + # ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" # assert ( # not args.train_text_encoder or not args.cache_text_encoder_outputs # ), "cache_text_encoder_outputs is not supported when training text encoder / text encoderを学習するときはcache_text_encoder_outputsはサポートされていません" - # # training text encoder is not supported - # assert ( - # not args.train_text_encoder - # ), "training text encoder is not supported currently / text encoderの学習は現在サポートされていません" - - # # training without text encoder cache is not supported: because T5XXL must be cached - # assert ( - # args.cache_text_encoder_outputs - # ), "training without text encoder cache is not supported currently / text encoderのキャッシュなしの学習は現在サポートされていません" - assert not args.train_text_encoder or (args.use_t5xxl_cache_only or not args.cache_text_encoder_outputs), ( "when training text encoder, text encoder outputs must not be cached (except for T5XXL)" + " / text encoderの学習時はtext encoderの出力はキャッシュできません(t5xxlのみキャッシュすることは可能です)" @@ -90,13 +82,13 @@ def train(args): ) args.cache_text_encoder_outputs = True - # if args.block_lr: - # block_lrs = 
[float(lr) for lr in args.block_lr.split(",")] - # assert ( - # len(block_lrs) == UNET_NUM_BLOCKS_FOR_BLOCK_LR - # ), f"block_lr must have {UNET_NUM_BLOCKS_FOR_BLOCK_LR} values / block_lrは{UNET_NUM_BLOCKS_FOR_BLOCK_LR}個の値を指定してください" - # else: - # block_lrs = None + if args.train_t5xxl: + assert ( + args.train_text_encoder + ), "when training T5XXL, text encoder (CLIP-L/G) must be trained / T5XXLを学習するときはtext encoder (CLIP-L/G)も学習する必要があります" + assert ( + not args.cache_text_encoder_outputs + ), "when training T5XXL, t5xxl output must not be cached / T5XXLを学習するときはt5xxlの出力をキャッシュできません" cache_latents = args.cache_latents use_dreambooth_method = args.in_json is None @@ -111,11 +103,6 @@ def train(args): ) strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) - # load tokenizer and prepare tokenize strategy - sd3_tokenizer = sd3_models.SD3Tokenizer(t5xxl_max_length=args.t5xxl_max_token_length) - sd3_tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_max_token_length) - strategy_base.TokenizeStrategy.set_strategy(sd3_tokenize_strategy) - # データセットを準備する if args.dataset_class is None: blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) @@ -156,10 +143,10 @@ def train(args): ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=[sd3_tokenizer]) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group = train_util.load_arbitrary_dataset(args, [sd3_tokenizer]) + train_dataset_group = train_util.load_arbitrary_dataset(args) current_epoch = Value("i", 0) current_step = Value("i", 0) @@ -205,72 +192,56 @@ def train(args): # mixed precisionに対応した型を用意しておき適宜castする weight_dtype, save_dtype = train_util.prepare_dtype(args) - vae_dtype = weight_dtype # torch.float32 if args.no_half_vae else weight_dtype # SD3 VAE works with fp16 - - t5xxl_dtype = weight_dtype - if args.t5xxl_dtype is not None: - if args.t5xxl_dtype == "fp16": - t5xxl_dtype = torch.float16 - elif args.t5xxl_dtype == "bf16": - t5xxl_dtype = torch.bfloat16 - elif args.t5xxl_dtype == "fp32" or args.t5xxl_dtype == "float": - t5xxl_dtype = torch.float32 - else: - raise ValueError(f"unexpected t5xxl_dtype: {args.t5xxl_dtype}") - t5xxl_device = accelerator.device if args.t5xxl_device is None else args.t5xxl_device - - clip_dtype = weight_dtype # if not args.train_text_encoder else None # モデルを読み込む - attn_mode = "xformers" if args.xformers else "torch" - - assert ( - attn_mode == "torch" - ), f"attn_mode {attn_mode} is not supported yet. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。" - - # SD3 state dict may contain multiple models, so we need to load it and extract one by one. annoying. 
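As background for the SD3 training changes in this file, the flow-matching helpers added at the top of this section boil down to two formulas: the noisy input sigmas * noise + (1 - sigmas) * latents, and the optional "cosmap" loss weighting 2 / (pi * (1 - 2*sigma + 2*sigma^2)). A self-contained toy illustration of just those formulas (tensor shapes and values are arbitrary):

import math
import torch

latents = torch.randn(2, 16, 32, 32)      # a toy batch of latents
noise = torch.randn_like(latents)
sigmas = torch.rand(2).view(-1, 1, 1, 1)  # one sigma per sample, broadcast over C/H/W

# flow matching: interpolate between clean latents (sigma=0) and pure noise (sigma=1)
noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents

# "cosmap" weighting as in compute_loss_weighting_for_sd3
weighting = 2 / (math.pi * (1 - 2 * sigmas + 2 * sigmas**2))
print(noisy_model_input.shape, weighting.flatten())
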
- logger.info(f"Loading SD3 models from {args.pretrained_model_name_or_path}") - device_to_load = accelerator.device if args.lowram else "cpu" - sd3_state_dict = sd3_utils.load_safetensors( - args.pretrained_model_name_or_path, device_to_load, args.disable_mmap_load_safetensors - ) - # load VAE for caching latents - vae: sd3_models.SDVAE = None - if cache_latents: - vae = sd3_train_utils.load_target_model("vae", args, sd3_state_dict, accelerator, attn_mode, vae_dtype, device_to_load) - vae.to(accelerator.device, dtype=vae_dtype) - vae.requires_grad_(False) - vae.eval() - - train_dataset_group.new_cache_latents(vae, accelerator) + # t5xxl_dtype = weight_dtype + # if args.t5xxl_dtype is not None: + # if args.t5xxl_dtype == "fp16": + # t5xxl_dtype = torch.float16 + # elif args.t5xxl_dtype == "bf16": + # t5xxl_dtype = torch.bfloat16 + # elif args.t5xxl_dtype == "fp32" or args.t5xxl_dtype == "float": + # t5xxl_dtype = torch.float32 + # else: + # raise ValueError(f"unexpected t5xxl_dtype: {args.t5xxl_dtype}") + # t5xxl_device = accelerator.device if args.t5xxl_device is None else args.t5xxl_device + # clip_dtype = weight_dtype # if not args.train_text_encoder else None + + # if clip_l is not specified, the checkpoint must contain clip_l, so we load state dict here + # if full_fp16/bf16, model_dtype is casted to fp16/bf16. If not, model_dtype is None (float32). + # by loading with model_dtype, we can reduce memory usage. + model_dtype = match_mixed_precision(args, weight_dtype) # None (default) or fp16/bf16 (full_xxxx) + if args.clip_l is None: + sd3_state_dict = utils.load_safetensors( + args.pretrained_model_name_or_path, "cpu", args.disable_mmap_load_safetensors, model_dtype + ) + else: + sd3_state_dict = None - vae.to("cpu") # if no sampling, vae can be deleted - clean_memory_on_device(accelerator.device) + # load tokenizer and prepare tokenize strategy + if args.t5xxl_max_token_length is None: + t5xxl_max_token_length = 256 # default value for T5XXL + else: + t5xxl_max_token_length = args.t5xxl_max_token_length - accelerator.wait_for_everyone() + sd3_tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(t5xxl_max_token_length) + strategy_base.TokenizeStrategy.set_strategy(sd3_tokenize_strategy) # load clip_l, clip_g, t5xxl for caching text encoder outputs - # # models are usually loaded on CPU and moved to GPU later. This is to avoid OOM on GPU0. 
- # mmdit, clip_l, clip_g, t5xxl, vae = sd3_train_utils.load_target_model( - # args, accelerator, attn_mode, weight_dtype, clip_dtype, t5xxl_device, t5xxl_dtype, vae_dtype - # ) - clip_l = sd3_train_utils.load_target_model("clip_l", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load) - clip_g = sd3_train_utils.load_target_model("clip_g", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load) - assert clip_l is not None, "clip_l is required / clip_lは必須です" - assert clip_g is not None, "clip_g is required / clip_gは必須です" - - t5xxl = sd3_train_utils.load_target_model("t5xxl", args, sd3_state_dict, accelerator, attn_mode, t5xxl_dtype, device_to_load) - # logit_scale = logit_scale.to(accelerator.device, dtype=weight_dtype) - - # should be deleted after caching text encoder outputs when not training text encoder - # this strategy should not be used other than this process - text_encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy() + # clip_l = sd3_train_utils.load_target_model("clip_l", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load) + # clip_g = sd3_train_utils.load_target_model("clip_g", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load) + clip_l = sd3_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", args.disable_mmap_load_safetensors, state_dict=sd3_state_dict) + clip_g = sd3_utils.load_clip_g(args.clip_g, weight_dtype, "cpu", args.disable_mmap_load_safetensors, state_dict=sd3_state_dict) + t5xxl = sd3_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu", args.disable_mmap_load_safetensors, state_dict=sd3_state_dict) + assert clip_l is not None and clip_g is not None and t5xxl is not None, "clip_l, clip_g, t5xxl must be specified" + + # prepare text encoding strategy + text_encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy(args.apply_lg_attn_mask, args.apply_t5_attn_mask) strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) # 学習を準備する:モデルを適切な状態にする - train_clip_l = False - train_clip_g = False + train_clip = False train_t5xxl = False if args.train_text_encoder: @@ -278,99 +249,135 @@ def train(args): if args.gradient_checkpointing: clip_l.gradient_checkpointing_enable() clip_g.gradient_checkpointing_enable() + if args.train_t5xxl: + t5xxl.gradient_checkpointing_enable() + lr_te1 = args.learning_rate_te1 if args.learning_rate_te1 is not None else args.learning_rate # 0 means not train lr_te2 = args.learning_rate_te2 if args.learning_rate_te2 is not None else args.learning_rate # 0 means not train - train_clip_l = lr_te1 != 0 - train_clip_g = lr_te2 != 0 + lr_t5xxl = args.learning_rate_te3 if args.learning_rate_te3 is not None else args.learning_rate # 0 means not train + train_clip = lr_te1 != 0 or lr_te2 != 0 + train_t5xxl = lr_t5xxl != 0 and args.train_t5xxl - if not train_clip_l: - clip_l.to(weight_dtype) - if not train_clip_g: - clip_g.to(weight_dtype) - clip_l.requires_grad_(train_clip_l) - clip_g.requires_grad_(train_clip_g) - clip_l.train(train_clip_l) - clip_g.train(train_clip_g) + clip_l.to(weight_dtype) + clip_g.to(weight_dtype) + t5xxl.to(weight_dtype) + clip_l.requires_grad_(train_clip) + clip_g.requires_grad_(train_clip) + t5xxl.requires_grad_(train_t5xxl) else: + print("disable text encoder training") clip_l.to(weight_dtype) clip_g.to(weight_dtype) + t5xxl.to(weight_dtype) clip_l.requires_grad_(False) clip_g.requires_grad_(False) - clip_l.eval() - clip_g.eval() - - if t5xxl is not None: - t5xxl.to(t5xxl_dtype) t5xxl.requires_grad_(False) - t5xxl.eval() + lr_te1 = 0 
+ lr_te2 = 0 + lr_t5xxl = 0 # cache text encoder outputs sample_prompts_te_outputs = None if args.cache_text_encoder_outputs: - # Text Encodes are eval and no grad here clip_l.to(accelerator.device) clip_g.to(accelerator.device) - if t5xxl is not None: - t5xxl.to(t5xxl_device) + t5xxl.to(accelerator.device) + clip_l.eval() + clip_g.eval() + t5xxl.eval() text_encoder_caching_strategy = strategy_sd3.Sd3TextEncoderOutputsCachingStrategy( args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, args.skip_cache_check, - train_clip_g or train_clip_l or args.use_t5xxl_cache_only, + train_clip or args.use_t5xxl_cache_only, # if clip is trained or t5xxl is cached, caching is partial args.apply_lg_attn_mask, args.apply_t5_attn_mask, ) strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy) - clip_l.to(accelerator.device, dtype=weight_dtype) - clip_g.to(accelerator.device, dtype=weight_dtype) - if t5xxl is not None: - t5xxl.to(t5xxl_device, dtype=t5xxl_dtype) - with accelerator.autocast(): train_dataset_group.new_cache_text_encoder_outputs([clip_l, clip_g, t5xxl], accelerator) # cache sample prompt's embeddings to free text encoder's memory if args.sample_prompts is not None: logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") - prompts = sd3_train_utils.load_prompts(args.sample_prompts) + prompts = train_util.load_prompts(args.sample_prompts) sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): for prompt_dict in prompts: for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: if p not in sample_prompts_te_outputs: logger.info(f"cache Text Encoder outputs for prompt: {p}") - tokens_list = sd3_tokenize_strategy.tokenize(p) + tokens_and_masks = sd3_tokenize_strategy.tokenize(p) sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( sd3_tokenize_strategy, [clip_l, clip_g, t5xxl], - tokens_list, + tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask, ) accelerator.wait_for_everyone() + # now we can delete Text Encoders to free memory + if args.use_t5xxl_cache_only: + clip_l = None + clip_g = None + t5xxl = None + + clean_memory_on_device(accelerator.device) + + # load VAE for caching latents + if sd3_state_dict is None: + sd3_state_dict = utils.load_safetensors( + args.pretrained_model_name_or_path, "cpu", args.disable_mmap_load_safetensors, model_dtype + ) + + vae = sd3_utils.load_vae(args.vae, weight_dtype, "cpu", args.disable_mmap_load_safetensors, state_dict=sd3_state_dict) + if cache_latents: + # vae = sd3_train_utils.load_target_model("vae", args, sd3_state_dict, accelerator, attn_mode, vae_dtype, device_to_load) + vae.to(accelerator.device, dtype=weight_dtype) + vae.requires_grad_(False) + vae.eval() + + train_dataset_group.new_cache_latents(vae, accelerator) + + vae.to("cpu") # if no sampling, vae can be deleted + clean_memory_on_device(accelerator.device) + + accelerator.wait_for_everyone() + # load MMDIT - # if full_fp16/bf16, model_dtype is casted to fp16/bf16. If not, model_dtype is None (float32). - # by loading with model_dtype, we can reduce memory usage. 
- model_dtype = match_mixed_precision(args, weight_dtype) # None (default) or fp16/bf16 (full_xxxx) - mmdit = sd3_train_utils.load_target_model("mmdit", args, sd3_state_dict, accelerator, attn_mode, model_dtype, device_to_load) + mmdit = sd3_utils.load_mmdit( + sd3_state_dict, + model_dtype, + "cpu", + ) + + # attn_mode = "xformers" if args.xformers else "torch" + # assert ( + # attn_mode == "torch" + # ), f"attn_mode {attn_mode} is not supported yet. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。" + + # SD3 state dict may contain multiple models, so we need to load it and extract one by one. annoying. + logger.info(f"Loading SD3 models from {args.pretrained_model_name_or_path}") + device_to_load = accelerator.device if args.lowram else "cpu" + sd3_state_dict = utils.load_safetensors(args.pretrained_model_name_or_path, device_to_load, args.disable_mmap_load_safetensors) + if args.gradient_checkpointing: mmdit.enable_gradient_checkpointing() train_mmdit = args.learning_rate != 0 mmdit.requires_grad_(train_mmdit) if not train_mmdit: - mmdit.to(accelerator.device, dtype=weight_dtype) # because of mmdie will not be prepared + mmdit.to(accelerator.device, dtype=weight_dtype) # because of mmdit will not be prepared if not cache_latents: - # load VAE here if not cached - vae = sd3_train_utils.load_target_model("vae", args, sd3_state_dict, accelerator, attn_mode, vae_dtype, device_to_load) + # move to accelerator device vae.requires_grad_(False) vae.eval() - vae.to(accelerator.device, dtype=vae_dtype) + vae.to(accelerator.device, dtype=weight_dtype) mmdit.requires_grad_(train_mmdit) if not train_mmdit: @@ -394,19 +401,24 @@ def train(args): training_models = [] params_to_optimize = [] - # if train_unet: + param_names = [] training_models.append(mmdit) - # if block_lrs is None: params_to_optimize.append({"params": list(filter(lambda p: p.requires_grad, mmdit.parameters())), "lr": args.learning_rate}) - # else: - # params_to_optimize.extend(get_block_params_to_optimize(mmdit, block_lrs)) - - # if train_clip_l: - # training_models.append(clip_l) - # params_to_optimize.append({"params": list(clip_l.parameters()), "lr": args.learning_rate_te1 or args.learning_rate}) - # if train_clip_g: - # training_models.append(clip_g) - # params_to_optimize.append({"params": list(clip_g.parameters()), "lr": args.learning_rate_te2 or args.learning_rate}) + param_names.append([n for n, _ in mmdit.named_parameters()]) + + if train_clip: + if lr_te1 > 0: + training_models.append(clip_l) + params_to_optimize.append({"params": list(clip_l.parameters()), "lr": args.learning_rate_te1 or args.learning_rate}) + param_names.append([n for n, _ in clip_l.named_parameters()]) + if lr_te2 > 0: + training_models.append(clip_g) + params_to_optimize.append({"params": list(clip_g.parameters()), "lr": args.learning_rate_te2 or args.learning_rate}) + param_names.append([n for n, _ in clip_g.named_parameters()]) + if train_t5xxl: + training_models.append(t5xxl) + params_to_optimize.append({"params": list(t5xxl.parameters()), "lr": args.learning_rate_te3 or args.learning_rate}) + param_names.append([n for n, _ in t5xxl.named_parameters()]) # calculate number of trainable parameters n_params = 0 @@ -414,47 +426,49 @@ def train(args): for p in group["params"]: n_params += p.numel() - accelerator.print(f"train mmdit: {train_mmdit}") # , clip_l: {train_clip_l}, clip_g: {train_clip_g}") + accelerator.print(f"train mmdit: {train_mmdit} , clip:{train_clip}, 
t5xxl:{train_t5xxl}") accelerator.print(f"number of models: {len(training_models)}") accelerator.print(f"number of trainable parameters: {n_params}") # 学習に必要なクラスを準備する accelerator.print("prepare optimizer, data loader etc.") - if args.fused_optimizer_groups: + if args.blockwise_fused_optimizers: # fused backward pass: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html - # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each group of parameters. + # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each block of parameters. # This balances memory usage and management complexity. - # calculate total number of parameters - n_total_params = sum(len(params["params"]) for params in params_to_optimize) - params_per_group = math.ceil(n_total_params / args.fused_optimizer_groups) - - # split params into groups, keeping the learning rate the same for all params in a group - # this will increase the number of groups if the learning rate is different for different params (e.g. U-Net and text encoders) + # split params into groups for mmdit. clip_l, clip_g, t5xxl are in each group grouped_params = [] - param_group = [] - param_group_lr = -1 - for group in params_to_optimize: - lr = group["lr"] - for p in group["params"]: - # if the learning rate is different for different params, start a new group - if lr != param_group_lr: - if param_group: - grouped_params.append({"params": param_group, "lr": param_group_lr}) - param_group = [] - param_group_lr = lr - - param_group.append(p) - - # if the group has enough parameters, start a new group - if len(param_group) == params_per_group: - grouped_params.append({"params": param_group, "lr": param_group_lr}) - param_group = [] - param_group_lr = -1 - - if param_group: - grouped_params.append({"params": param_group, "lr": param_group_lr}) + param_group = {} + group = params_to_optimize[0] + named_parameters = list(mmdit.named_parameters()) + assert len(named_parameters) == len(group["params"]), "number of parameters does not match" + for p, np in zip(group["params"], named_parameters): + # determine target layer and block index for each parameter + block_type = "other" # joint or other + if np[0].startswith("joint_blocks"): + block_idx = int(np[0].split(".")[1]) + block_type = "joint" + else: + block_idx = -1 + + param_group_key = (block_type, block_idx) + if param_group_key not in param_group: + param_group[param_group_key] = [] + param_group[param_group_key].append(p) + + block_types_and_indices = [] + for param_group_key, param_group in param_group.items(): + block_types_and_indices.append(param_group_key) + grouped_params.append({"params": param_group, "lr": args.learning_rate}) + + num_params = 0 + for p in param_group: + num_params += p.numel() + accelerator.print(f"block {param_group_key}: {num_params} parameters") + + grouped_params.extend(params_to_optimize[1:]) # add clip_l, clip_g, t5xxl if they are trained # prepare optimizers for each group optimizers = [] @@ -463,10 +477,15 @@ def train(args): optimizers.append(optimizer) optimizer = optimizers[0] # avoid error in the following code - logger.info(f"using {len(optimizers)} optimizers for fused optimizer groups") + logger.info(f"using {len(optimizers)} optimizers for blockwise fused optimizers") + if train_util.is_schedulefree_optimizer(optimizers[0], args): + raise ValueError("Schedule-free optimizer is not supported with blockwise fused optimizers") + optimizer_train_fn 
= lambda: None # dummy function + optimizer_eval_fn = lambda: None # dummy function else: _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) + optimizer_train_fn, optimizer_eval_fn = train_util.get_optimizer_train_eval_fn(optimizer, args) # prepare dataloader # strategies are set here because they cannot be referenced in another process. Copy them with the dataset @@ -497,7 +516,7 @@ def train(args): train_dataset_group.set_max_train_steps(args.max_train_steps) # lr schedulerを用意する - if args.fused_optimizer_groups: + if args.blockwise_fused_optimizers: # prepare lr schedulers for each optimizer lr_schedulers = [train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) for optimizer in optimizers] lr_scheduler = lr_schedulers[0] # avoid error in the following code @@ -511,18 +530,22 @@ def train(args): ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" accelerator.print("enable full fp16 training.") mmdit.to(weight_dtype) - clip_l.to(weight_dtype) - clip_g.to(weight_dtype) + if clip_l is not None: + clip_l.to(weight_dtype) + if clip_g is not None: + clip_g.to(weight_dtype) if t5xxl is not None: - t5xxl.to(weight_dtype) # TODO check works with fp16 or not + t5xxl.to(weight_dtype) elif args.full_bf16: assert ( args.mixed_precision == "bf16" ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" accelerator.print("enable full bf16 training.") mmdit.to(weight_dtype) - clip_l.to(weight_dtype) - clip_g.to(weight_dtype) + if clip_l is not None: + clip_l.to(weight_dtype) + if clip_g is not None: + clip_g.to(weight_dtype) if t5xxl is not None: t5xxl.to(weight_dtype) @@ -533,14 +556,7 @@ def train(args): # clip_l.text_model.final_layer_norm.requires_grad_(False) # TextEncoderの出力をキャッシュするときには、すでに出力を取得済みなのでCPUへ移動する - if args.cache_text_encoder_outputs: - # move Text Encoders for sampling images. Text Encoder doesn't work on CPU with fp16 - clip_l.to("cpu", dtype=torch.float32) - clip_g.to("cpu", dtype=torch.float32) - if t5xxl is not None: - t5xxl.to("cpu", dtype=torch.float32) - clean_memory_on_device(accelerator.device) - else: + if not args.cache_text_encoder_outputs: # make sure Text Encoders are on GPU # TODO support CPU for text encoders clip_l.to(accelerator.device) @@ -548,18 +564,11 @@ def train(args): if t5xxl is not None: t5xxl.to(accelerator.device) - # TODO cache sample prompt's embeddings to free text encoder's memory - if args.cache_text_encoder_outputs: - if not args.save_t5xxl: - t5xxl = None # free memory clean_memory_on_device(accelerator.device) if args.deepspeed: ds_model = deepspeed_utils.prepare_deepspeed_model( - args, - mmdit=mmdit, - clip_l=clip_l if train_clip_l else None, - clip_g=clip_g if train_clip_g else None, + args, mmdit=mmdit, clip_l=clip_l if train_clip else None, clip_g=clip_g if train_clip else None ) # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. 
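A note on the blockwise fused optimizer / fused backward pass path in this patch: instead of a single global optimizer.step(), each block of parameters gets its own optimizer, and that optimizer is stepped from a post-accumulate-grad hook (registered in the hunks below), so gradients can be released block by block while backward is still running. The following is a minimal, self-contained sketch of that general pattern only; the module layout, SGD optimizer, and helper names here are placeholders, not the code added by this patch, and torch >= 2.1 is assumed for register_post_accumulate_grad_hook.

import torch

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8))

# group parameters blockwise (here: one group per submodule, analogous to per-joint-block groups)
optimizers = [torch.optim.SGD(m.parameters(), lr=1e-3) for m in model]
param_to_optimizer = {p: opt for m, opt in zip(model, optimizers) for p in m.parameters()}
hooked_counts = {opt: 0 for opt in optimizers}
group_sizes = {opt: sum(1 for _ in m.parameters()) for m, opt in zip(model, optimizers)}

def make_hook(opt):
    def hook(param: torch.Tensor):
        # called once per parameter after its gradient is fully accumulated
        hooked_counts[opt] += 1
        if hooked_counts[opt] == group_sizes[opt]:
            opt.step()
            opt.zero_grad(set_to_none=True)  # free this block's gradients immediately
            hooked_counts[opt] = 0
    return hook

for p in model.parameters():
    p.register_post_accumulate_grad_hook(make_hook(param_to_optimizer[p]))

x = torch.randn(4, 8)
loss = model(x).pow(2).mean()
loss.backward()  # optimizer steps run inside the hooks; no explicit optimizer.step() afterwards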
# pull/1139#issuecomment-1986790007 ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( @@ -571,10 +580,11 @@ def train(args): # acceleratorがなんかよろしくやってくれるらしい if train_mmdit: mmdit = accelerator.prepare(mmdit) - if train_clip_l: + if train_clip: clip_l = accelerator.prepare(clip_l) - if train_clip_g: clip_g = accelerator.prepare(clip_g) + if train_t5xxl: + t5xxl = accelerator.prepare(t5xxl) optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする @@ -586,24 +596,110 @@ def train(args): # resumeする train_util.resume_from_local_or_hf_if_specified(accelerator, args) + # memory efficient block swapping + + def submit_move_blocks(futures, thread_pool, block_idx_to_cpu, block_idx_to_cuda, blocks, device): + def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda, dvc): + # print(f"Backward: Move block {bidx_to_cpu} to CPU") + block_to_cpu = block_to_cpu.to("cpu", non_blocking=True) + torch.cuda.empty_cache() + + block_to_cuda = block_to_cuda.to(dvc, non_blocking=True) + torch.cuda.synchronize() + return bidx_to_cpu, bidx_to_cuda + + block_to_cpu = blocks[block_idx_to_cpu] + block_to_cuda = blocks[block_idx_to_cuda] + + futures[block_idx_to_cuda] = thread_pool.submit( + move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda, device + ) + + def wait_blocks_move(block_idx, futures): + if block_idx not in futures: + return + future = futures.pop(block_idx) + future.result() + if args.fused_backward_pass: # use fused optimizer for backward pass: other optimizers will be supported in the future import library.adafactor_fused library.adafactor_fused.patch_adafactor_fused(optimizer) - for param_group in optimizer.param_groups: - for parameter in param_group["params"]: + + blocks_to_swap = args.blocks_to_swap + num_blocks = len(accelerator.unwrap_model(mmdit).joint_blocks) + handled_block_indices = set() + + n = 1 # only asynchronous purpose, no need to increase this number + # n = 2 + # n = max(1, os.cpu_count() // 2) + thread_pool = ThreadPoolExecutor(max_workers=n) + futures = {} + + for param_group, param_name_group in zip(optimizer.param_groups, param_names): + for parameter, param_name in zip(param_group["params"], param_name_group): if parameter.requires_grad: + grad_hook = None + + if blocks_to_swap: + is_block = param_name.startswith("double_blocks") + if is_block: + block_idx = int(param_name.split(".")[1]) + if block_idx not in handled_block_indices: + # swap following (already backpropagated) block + handled_block_indices.add(block_idx) + + # if n blocks were already backpropagated + num_blocks_propagated = num_blocks - block_idx - 1 + swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap + waiting = block_idx > 0 and block_idx <= blocks_to_swap + if swapping or waiting: + block_idx_to_cpu = num_blocks - num_blocks_propagated + block_idx_to_cuda = blocks_to_swap - num_blocks_propagated + block_idx_to_wait = block_idx - 1 + + # create swap hook + def create_swap_grad_hook( + bidx_to_cpu, bidx_to_cuda, bidx_to_wait, bidx: int, swpng: bool, wtng: bool + ): + def __grad_hook(tensor: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + if swpng: + submit_move_blocks( + futures, + thread_pool, + bidx_to_cpu, + bidx_to_cuda, + mmdit.joint_blocks, + 
accelerator.device, + ) + if wtng: + wait_blocks_move(bidx_to_wait, futures) + + return __grad_hook + + grad_hook = create_swap_grad_hook( + block_idx_to_cpu, block_idx_to_cuda, block_idx_to_wait, block_idx, swapping, waiting + ) + + if grad_hook is None: + + def __grad_hook(tensor: torch.Tensor, param_group=param_group): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None - def __grad_hook(tensor: torch.Tensor, param_group=param_group): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None + grad_hook = __grad_hook - parameter.register_post_accumulate_grad_hook(__grad_hook) + parameter.register_post_accumulate_grad_hook(grad_hook) - elif args.fused_optimizer_groups: + elif args.blockwise_fused_optimizers: # prepare for additional optimizers and lr schedulers for i in range(1, len(optimizers)): optimizers[i] = accelerator.prepare(optimizers[i]) @@ -618,22 +714,59 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): num_parameters_per_group = [0] * len(optimizers) parameter_optimizer_map = {} + blocks_to_swap = args.blocks_to_swap + num_blocks = len(accelerator.unwrap_model(mmdit).joint_blocks) + + n = 1 # only asynchronous purpose, no need to increase this number + # n = max(1, os.cpu_count() // 2) + thread_pool = ThreadPoolExecutor(max_workers=n) + futures = {} + for opt_idx, optimizer in enumerate(optimizers): for param_group in optimizer.param_groups: for parameter in param_group["params"]: if parameter.requires_grad: - - def optimizer_hook(parameter: torch.Tensor): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(parameter, args.max_grad_norm) - - i = parameter_optimizer_map[parameter] - optimizer_hooked_count[i] += 1 - if optimizer_hooked_count[i] == num_parameters_per_group[i]: - optimizers[i].step() - optimizers[i].zero_grad(set_to_none=True) - - parameter.register_post_accumulate_grad_hook(optimizer_hook) + block_type, block_idx = block_types_and_indices[opt_idx] + + def create_optimizer_hook(btype, bidx): + def optimizer_hook(parameter: torch.Tensor): + # print(f"optimizer_hook: {btype}, {bidx}") + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(parameter, args.max_grad_norm) + + i = parameter_optimizer_map[parameter] + optimizer_hooked_count[i] += 1 + if optimizer_hooked_count[i] == num_parameters_per_group[i]: + optimizers[i].step() + optimizers[i].zero_grad(set_to_none=True) + + # swap blocks if necessary + if blocks_to_swap and btype == "joint": + num_blocks_propagated = num_blocks - bidx + + swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap + waiting = bidx > 0 and bidx <= blocks_to_swap + + if swapping: + block_idx_to_cpu = num_blocks - num_blocks_propagated + block_idx_to_cuda = blocks_to_swap - num_blocks_propagated + # print(f"Backward: Swap blocks {block_idx_to_cpu} and {block_idx_to_cuda}") + submit_move_blocks( + futures, + thread_pool, + block_idx_to_cpu, + block_idx_to_cuda, + mmdit.joint_blocks, + accelerator.device, + ) + + if waiting: + block_idx_to_wait = bidx - 1 + wait_blocks_move(block_idx_to_wait, futures) + + return optimizer_hook + + parameter.register_post_accumulate_grad_hook(create_optimizer_hook(block_type, block_idx)) parameter_optimizer_map[parameter] = opt_idx 
num_parameters_per_group[opt_idx] += 1 @@ -661,17 +794,9 @@ def optimizer_hook(parameter: torch.Tensor): progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") global_step = 0 - # noise_scheduler = DDPMScheduler( - # beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False - # ) - noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) noise_scheduler_copy = copy.deepcopy(noise_scheduler) - # prepare_scheduler_for_custom_training(noise_scheduler, accelerator.device) - # if args.zero_terminal_snr: - # custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler) - if accelerator.is_main_process: init_kwargs = {} if args.wandb_run_name: @@ -685,60 +810,13 @@ def optimizer_hook(parameter: torch.Tensor): ) # For --sample_at_first + optimizer_eval_fn() sd3_train_utils.sample_images(accelerator, args, 0, global_step, mmdit, vae, [clip_l, clip_g, t5xxl], sample_prompts_te_outputs) + optimizer_train_fn() if len(accelerator.trackers) > 0: # log empty object to commit the sample images to wandb accelerator.log({}, step=0) - # following function will be moved to sd3_train_utils - - def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): - sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype) - schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device) - timesteps = timesteps.to(accelerator.device) - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < n_dim: - sigma = sigma.unsqueeze(-1) - return sigma - - def compute_density_for_timestep_sampling( - weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None - ): - """Compute the density for sampling the timesteps when doing SD3 training. - - Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. - - SD3 paper reference: https://arxiv.org/abs/2403.03206v1. - """ - if weighting_scheme == "logit_normal": - # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$). - u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu") - u = torch.nn.functional.sigmoid(u) - elif weighting_scheme == "mode": - u = torch.rand(size=(batch_size,), device="cpu") - u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u) - else: - u = torch.rand(size=(batch_size,), device="cpu") - return u - - def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): - """Computes loss weighting scheme for SD3 training. - - Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. - - SD3 paper reference: https://arxiv.org/abs/2403.03206v1. 
- """ - if weighting_scheme == "sigma_sqrt": - weighting = (sigmas**-2.0).float() - elif weighting_scheme == "cosmap": - bot = 1 - 2 * sigmas + 2 * sigmas**2 - weighting = 2 / (math.pi * bot) - else: - weighting = torch.ones_like(sigmas) - return weighting - loss_recorder = train_util.LossRecorder() epoch = 0 # avoid error when max_train_steps is 0 for epoch in range(num_train_epochs): @@ -751,16 +829,16 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): for step, batch in enumerate(train_dataloader): current_step.value = global_step - if args.fused_optimizer_groups: + if args.blockwise_fused_optimizers: optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} # reset counter for each step with accelerator.accumulate(*training_models): if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) + latents = batch["latents"].to(accelerator.device, dtype=weight_dtype) else: with torch.no_grad(): # encode images to latents. images are [-1, 1] - latents = vae.encode(batch["images"].to(vae_dtype)).to(weight_dtype) + latents = vae.encode(batch["images"]) # NaNが含まれていれば警告を表示し0に置き換える if torch.any(torch.isnan(latents)): @@ -772,7 +850,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: - lg_out, t5_out, lg_pooled = text_encoder_outputs_list + lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = text_encoder_outputs_list if args.use_t5xxl_cache_only: lg_out = None lg_pooled = None @@ -781,7 +859,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): t5_out = None lg_pooled = None - if lg_out is None or (train_clip_l or train_clip_g): + if lg_out is None: # not cached or training, so get from text encoders input_ids_clip_l, input_ids_clip_g, _, l_attn_mask, g_attn_mask, _ = batch["input_ids_list"] with torch.set_grad_enabled(args.train_text_encoder): @@ -811,21 +889,10 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): noise = torch.randn_like(latents) bsz = latents.shape[0] - # Sample a random timestep for each image - # for weighting schemes where we sample timesteps non-uniformly - u = compute_density_for_timestep_sampling( - weighting_scheme=args.weighting_scheme, - batch_size=bsz, - logit_mean=args.logit_mean, - logit_std=args.logit_std, - mode_scale=args.mode_scale, + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = sd3_train_utils.get_noisy_model_input_and_timesteps( + args, noise_scheduler_copy, latents, noise, accelerator.device, weight_dtype ) - indices = (u * noise_scheduler_copy.config.num_train_timesteps).long() - timesteps = noise_scheduler_copy.timesteps[indices].to(device=accelerator.device) - - # Add noise according to flow matching. - sigmas = get_sigmas(timesteps, n_dim=latents.ndim, dtype=weight_dtype) - noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents # debug: NaN check for all inputs if torch.any(torch.isnan(noisy_model_input)): @@ -840,6 +907,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): # call model with accelerator.autocast(): + # TODO support attention mask model_pred = mmdit(noisy_model_input, timesteps, context=context, y=lg_pooled) # Follow: Section 5 of https://arxiv.org/abs/2206.00364. 
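The inline helpers removed above (get_sigmas, compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3) are replaced by calls into sd3_train_utils. For reference, here is a minimal sketch of the flow-matching noising they implemented and that get_noisy_model_input_and_timesteps is assumed to consolidate; this is illustrative only, and the shipped helper may additionally handle the scheduler's shift, dtype casting, and other options.

import math
import torch

def sample_u(batch_size, weighting_scheme="logit_normal", logit_mean=0.0, logit_std=1.0, mode_scale=1.29):
    # timestep density from the SD3 paper (section 3.1); "logit_normal" is rf/lognorm(0.00, 1.00)
    if weighting_scheme == "logit_normal":
        return torch.sigmoid(torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,)))
    if weighting_scheme == "mode":
        u = torch.rand(batch_size)
        return 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
    return torch.rand(batch_size)

def get_noisy_input(latents, noise, scheduler_timesteps, scheduler_sigmas, num_train_timesteps=1000):
    # pick a timestep per sample, look up its sigma, and interpolate between data and noise
    u = sample_u(latents.shape[0])
    indices = (u * num_train_timesteps).long().clamp(0, num_train_timesteps - 1)
    timesteps = scheduler_timesteps[indices]
    sigmas = scheduler_sigmas[indices].view(-1, *([1] * (latents.ndim - 1)))
    noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents  # flow matching interpolation
    return noisy_model_input, timesteps, sigmas

# usage with a FlowMatchEulerDiscreteScheduler-like schedule:
# noisy, t, s = get_noisy_input(latents, torch.randn_like(latents), scheduler.timesteps, scheduler.sigmas)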
@@ -848,21 +916,34 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): # these weighting schemes use a uniform timestep sampling # and instead post-weight the loss - weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) + weighting = sd3_train_utils.compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) # flow matching loss target = latents - # Compute regular loss. TODO simplify this - loss = torch.mean( - (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1), - 1, + # # Compute regular loss. TODO simplify this + # loss = torch.mean( + # (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1), + # 1, + # ) + # calculate loss + loss = train_util.conditional_loss( + model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=None ) + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) + loss = loss.mean([1, 2, 3]) + + if weighting is not None: + loss = loss * weighting + + loss_weights = batch["loss_weights"] # 各sampleごとのweight + loss = loss * loss_weights loss = loss.mean() accelerator.backward(loss) - if not (args.fused_backward_pass or args.fused_optimizer_groups): + if not (args.fused_backward_pass or args.blockwise_fused_optimizers): if accelerator.sync_gradients and args.max_grad_norm != 0.0: params_to_clip = [] for m in training_models: @@ -875,7 +956,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): else: # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook lr_scheduler.step() - if args.fused_optimizer_groups: + if args.blockwise_fused_optimizers: for i in range(1, len(optimizers)): lr_schedulers[i].step() @@ -884,6 +965,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): progress_bar.update(1) global_step += 1 + optimizer_eval_fn() sd3_train_utils.sample_images( accelerator, args, None, global_step, mmdit, vae, [clip_l, clip_g, t5xxl], sample_prompts_te_outputs ) @@ -900,12 +982,13 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): epoch, num_train_epochs, global_step, - accelerator.unwrap_model(clip_l) if args.save_clip else None, - accelerator.unwrap_model(clip_g) if args.save_clip else None, - accelerator.unwrap_model(t5xxl) if args.save_t5xxl else None, - accelerator.unwrap_model(mmdit), + accelerator.unwrap_model(clip_l) if train_clip else None, + accelerator.unwrap_model(clip_g) if train_clip else None, + accelerator.unwrap_model(t5xxl) if train_t5xxl else None, + accelerator.unwrap_model(mmdit) if train_mmdit else None, vae, ) + optimizer_train_fn() current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず if len(accelerator.trackers) > 0: @@ -928,6 +1011,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): accelerator.wait_for_everyone() + optimizer_eval_fn() if args.save_every_n_epochs is not None: if accelerator.is_main_process: sd3_train_utils.save_sd3_model_on_epoch_end_or_stepwise( @@ -938,10 +1022,10 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): epoch, num_train_epochs, global_step, - accelerator.unwrap_model(clip_l) if args.save_clip else None, - accelerator.unwrap_model(clip_g) if args.save_clip else None, - accelerator.unwrap_model(t5xxl) if args.save_t5xxl else None, - accelerator.unwrap_model(mmdit), + accelerator.unwrap_model(clip_l) 
if train_clip else None, + accelerator.unwrap_model(clip_g) if train_clip else None, + accelerator.unwrap_model(t5xxl) if train_t5xxl else None, + accelerator.unwrap_model(mmdit) if train_mmdit else None, vae, ) @@ -958,6 +1042,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): t5xxl = accelerator.unwrap_model(t5xxl) accelerator.end_training() + optimizer_eval_fn() if args.save_state or args.save_state_on_train_end: train_util.save_state_on_train_end(args, accelerator) @@ -970,10 +1055,10 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): save_dtype, epoch, global_step, - clip_l if args.save_clip else None, - clip_g if args.save_clip else None, - t5xxl if args.save_t5xxl else None, - mmdit, + accelerator.unwrap_model(clip_l) if train_clip else None, + accelerator.unwrap_model(clip_g) if train_clip else None, + accelerator.unwrap_model(t5xxl) if train_t5xxl else None, + accelerator.unwrap_model(mmdit) if train_mmdit else None, vae, ) logger.info("model saved.") @@ -991,13 +1076,13 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) - custom_train_functions.add_custom_train_arguments(parser) + add_custom_train_arguments(parser) sd3_train_utils.add_sd3_training_arguments(parser) parser.add_argument( "--train_text_encoder", action="store_true", help="train text encoder (CLIP-L and G) / text encoderも学習する" ) - # parser.add_argument("--train_t5xxl", action="store_true", help="train T5-XXL / T5-XXLも学習する") + parser.add_argument("--train_t5xxl", action="store_true", help="train T5-XXL / T5-XXLも学習する") parser.add_argument( "--use_t5xxl_cache_only", action="store_true", help="cache T5-XXL outputs only / T5-XXLの出力のみキャッシュする" ) @@ -1018,19 +1103,24 @@ def setup_parser() -> argparse.ArgumentParser: help="apply attention mask (zero embs) to T5-XXL / T5-XXLにアテンションマスク(ゼロ埋め)を適用する", ) - # TE training is disabled temporarily - # parser.add_argument( - # "--learning_rate_te1", - # type=float, - # default=None, - # help="learning rate for text encoder 1 (ViT-L) / text encoder 1 (ViT-L)の学習率", - # ) - # parser.add_argument( - # "--learning_rate_te2", - # type=float, - # default=None, - # help="learning rate for text encoder 2 (BiG-G) / text encoder 2 (BiG-G)の学習率", - # ) + parser.add_argument( + "--learning_rate_te1", + type=float, + default=None, + help="learning rate for text encoder 1 (ViT-L) / text encoder 1 (ViT-L)の学習率", + ) + parser.add_argument( + "--learning_rate_te2", + type=float, + default=None, + help="learning rate for text encoder 2 (BiG-G) / text encoder 2 (BiG-G)の学習率", + ) + parser.add_argument( + "--learning_rate_te3", + type=float, + default=None, + help="learning rate for text encoder 3 (T5-XXL) / text encoder 3 (T5-XXL)の学習率", + ) # parser.add_argument( # "--diffusers_xformers", action="store_true", help="use xformers by diffusers / Diffusersでxformersを使用する" @@ -1047,22 +1137,22 @@ def setup_parser() -> argparse.ArgumentParser: # help=f"learning rates for each block of U-Net, comma-separated, {UNET_NUM_BLOCKS_FOR_BLOCK_LR} values / " # + f"U-Netの各ブロックの学習率、カンマ区切り、{UNET_NUM_BLOCKS_FOR_BLOCK_LR}個の値", # ) + parser.add_argument( + "--blockwise_fused_optimizers", + action="store_true", + help="enable blockwise optimizers for fused backward pass and optimizer step / fused backward passとoptimizer step のためブロック単位のoptimizerを有効にする", + ) parser.add_argument( "--fused_optimizer_groups", type=int, default=None, - help="number of optimizers 
for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizer数", + help="[DOES NOT WORK] number of optimizer groups for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizerグループ数", ) parser.add_argument( "--skip_latents_validity_check", action="store_true", help="[Deprecated] use 'skip_cache_check' instead / 代わりに 'skip_cache_check' を使用してください", ) - parser.add_argument( - "--skip_cache_check", - action="store_true", - help="skip cache (latents and text encoder outputs) check / キャッシュ(latentsとtext encoder outputs)のチェックをスキップする", - ) parser.add_argument( "--num_last_block_to_freeze", type=int, From e3c43bda49ec8c5a5cb784e29f8610f1ebff0a66 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 24 Oct 2024 20:35:47 +0900 Subject: [PATCH 190/582] reduce memory usage in sample image generation --- library/sd3_train_utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 9282482d9..af8ecf2c9 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -402,9 +402,6 @@ def sample_images( except Exception: pass - org_vae_device = vae.device # will be on cpu - vae.to(distributed_state.device) # distributed_state.device is same as accelerator.device - if distributed_state.num_processes <= 1: # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts. with torch.no_grad(): @@ -450,8 +447,6 @@ def sample_images( if cuda_rng_state is not None: torch.cuda.set_rng_state(cuda_rng_state) - vae.to(org_vae_device) - clean_memory_on_device(accelerator.device) @@ -531,12 +526,19 @@ def sample_image_inference( neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) # sample image - latents = do_sample(height, width, seed, cond, neg_cond, mmdit, sample_steps, scale, mmdit.dtype, accelerator.device) - latents = vae.process_out(latents.to(vae.device, dtype=vae.dtype)) + clean_memory_on_device(accelerator.device) + with accelerator.autocast(): + latents = do_sample(height, width, seed, cond, neg_cond, mmdit, sample_steps, scale, mmdit.dtype, accelerator.device) # latent to image - with torch.no_grad(): - image = vae.decode(latents) + clean_memory_on_device(accelerator.device) + org_vae_device = vae.device # will be on cpu + vae.to(accelerator.device) + latents = vae.process_out(latents.to(vae.device, dtype=vae.dtype)) + image = vae.decode(latents) + vae.to(org_vae_device) + clean_memory_on_device(accelerator.device) + image = image.float() image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0] decoded_np = 255.0 * np.moveaxis(image.cpu().numpy(), 0, 2) From 0286114bd208717510b537d9acd940db48a158f3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 24 Oct 2024 21:28:42 +0900 Subject: [PATCH 191/582] support SD3.5L, fix final saving --- sd3_train.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/sd3_train.py b/sd3_train.py index 6336b4cf9..d4ab13a34 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -321,7 +321,7 @@ def train(args): accelerator.wait_for_everyone() # now we can delete Text Encoders to free memory - if args.use_t5xxl_cache_only: + if not args.use_t5xxl_cache_only: clip_l = None clip_g = None t5xxl = None @@ -330,6 +330,7 @@ def train(args): # load VAE for caching latents if sd3_state_dict is None: + logger.info(f"load state dict for MMDiT and VAE from {args.pretrained_model_name_or_path}") 
sd3_state_dict = utils.load_safetensors( args.pretrained_model_name_or_path, "cpu", args.disable_mmap_load_safetensors, model_dtype ) @@ -360,11 +361,6 @@ def train(args): # attn_mode == "torch" # ), f"attn_mode {attn_mode} is not supported yet. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。" - # SD3 state dict may contain multiple models, so we need to load it and extract one by one. annoying. - logger.info(f"Loading SD3 models from {args.pretrained_model_name_or_path}") - device_to_load = accelerator.device if args.lowram else "cpu" - sd3_state_dict = utils.load_safetensors(args.pretrained_model_name_or_path, device_to_load, args.disable_mmap_load_safetensors) - if args.gradient_checkpointing: mmdit.enable_gradient_checkpointing() @@ -555,7 +551,7 @@ def train(args): # clip_l.text_model.encoder.layers[-1].requires_grad_(False) # clip_l.text_model.final_layer_norm.requires_grad_(False) - # TextEncoderの出力をキャッシュするときには、すでに出力を取得済みなのでCPUへ移動する + # move Text Encoders to GPU if not caching outputs if not args.cache_text_encoder_outputs: # make sure Text Encoders are on GPU # TODO support CPU for text encoders @@ -817,6 +813,13 @@ def optimizer_hook(parameter: torch.Tensor): # log empty object to commit the sample images to wandb accelerator.log({}, step=0) + # show model device and dtype + logger.info(f"mmdit device: {mmdit.device}, dtype: {mmdit.dtype}" if mmdit else "mmdit is None") + logger.info(f"clip_l device: {clip_l.device}, dtype: {clip_l.dtype}" if clip_l else "clip_l is None") + logger.info(f"clip_g device: {clip_g.device}, dtype: {clip_g.dtype}" if clip_g else "clip_g is None") + logger.info(f"t5xxl device: {t5xxl.device}, dtype: {t5xxl.dtype}" if t5xxl else "t5xxl is None") + logger.info(f"vae device: {vae.device}, dtype: {vae.dtype}" if vae is not None else "vae is None") + loss_recorder = train_util.LossRecorder() epoch = 0 # avoid error when max_train_steps is 0 for epoch in range(num_train_epochs): @@ -1055,10 +1058,10 @@ def optimizer_hook(parameter: torch.Tensor): save_dtype, epoch, global_step, - accelerator.unwrap_model(clip_l) if train_clip else None, - accelerator.unwrap_model(clip_g) if train_clip else None, - accelerator.unwrap_model(t5xxl) if train_t5xxl else None, - accelerator.unwrap_model(mmdit) if train_mmdit else None, + clip_l if train_clip else None, + clip_g if train_clip else None, + t5xxl if train_t5xxl else None, + mmdit if train_mmdit else None, vae, ) logger.info("model saved.") @@ -1153,6 +1156,16 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="[Deprecated] use 'skip_cache_check' instead / 代わりに 'skip_cache_check' を使用してください", ) + parser.add_argument( + "--blocks_to_swap", + type=int, + default=None, + help="[EXPERIMENTAL] " + "Sets the number of blocks (~640MB) to swap during the forward and backward passes." + "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." 
+ " / 順伝播および逆伝播中にスワップするブロック(約640MB)の数を設定します。" + "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", + ) parser.add_argument( "--num_last_block_to_freeze", type=int, From f8c5146d71b1c40b69d80b7ea18c21bbb66b84f3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 24 Oct 2024 22:02:05 +0900 Subject: [PATCH 192/582] support block swap with fused_optimizer_pass --- library/sd3_models.py | 79 +++++++++++++++++++++++++++++++++++++++++-- sd3_train.py | 19 +++++++++-- 2 files changed, 94 insertions(+), 4 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index c81aa4794..e5c5887a9 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -4,6 +4,7 @@ # and some module/classes are contributed from KohakuBlueleaf. Thanks for the contribution! from ast import Tuple +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from functools import partial import math @@ -17,6 +18,8 @@ from torch.utils.checkpoint import checkpoint from transformers import CLIPTokenizer, T5TokenizerFast +from library.device_utils import clean_memory_on_device + from .utils import setup_logging setup_logging() @@ -848,6 +851,35 @@ def cropped_pos_embed(self, h, w, device=None): spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1]) return spatial_pos_embed + def enable_block_swap(self, num_blocks: int): + self.blocks_to_swap = num_blocks + + n = 1 # async block swap. 1 is enough + self.thread_pool = ThreadPoolExecutor(max_workers=n) + + def move_to_device_except_swap_blocks(self, device: torch.device): + # assume model is on cpu + if self.blocks_to_swap: + save_blocks = self.joint_blocks + self.joint_blocks = None + + self.to(device) + + if self.blocks_to_swap: + self.joint_blocks = save_blocks + + def prepare_block_swap_before_forward(self): + # make: first n blocks are on cuda, and last n blocks are on cpu + if self.blocks_to_swap is None or self.blocks_to_swap == 0: + # raise ValueError("Block swap is not enabled.") + return + num_blocks = len(self.joint_blocks) + for i in range(num_blocks - self.blocks_to_swap): + self.joint_blocks[i].to(self.device) + for i in range(num_blocks - self.blocks_to_swap, num_blocks): + self.joint_blocks[i].to("cpu") + clean_memory_on_device(self.device) + def forward( self, x: torch.Tensor, @@ -881,8 +913,51 @@ def forward( 1, ) - for block in self.joint_blocks: - context, x = block(context, x, c) + if not self.blocks_to_swap: + for block in self.joint_blocks: + context, x = block(context, x, c) + else: + futures = {} + + def submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda): + def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): + # print(f"Moving {bidx_to_cpu} to cpu.") + block_to_cpu.to("cpu", non_blocking=True) + torch.cuda.empty_cache() + + # print(f"Moving {bidx_to_cuda} to cuda.") + block_to_cuda.to(self.device, non_blocking=True) + + torch.cuda.synchronize() + # print(f"Block move done. {bidx_to_cpu} to cpu, {bidx_to_cuda} to cuda.") + return block_idx_to_cpu, block_idx_to_cuda + + block_to_cpu = self.joint_blocks[block_idx_to_cpu] + block_to_cuda = self.joint_blocks[block_idx_to_cuda] + # print(f"Submit move blocks. 
{block_idx_to_cpu} to cpu, {block_idx_to_cuda} to cuda.") + return self.thread_pool.submit(move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda) + + def wait_for_blocks_move(block_idx, ftrs): + if block_idx not in ftrs: + return + # print(f"Waiting for move blocks: {block_idx}") + # start_time = time.perf_counter() + ftr = ftrs.pop(block_idx) + ftr.result() + # torch.cuda.synchronize() + # print(f"Move blocks took {time.perf_counter() - start_time:.2f} seconds") + + for block_idx, block in enumerate(self.joint_blocks): + wait_for_blocks_move(block_idx, futures) + + context, x = block(context, x, c) + + if block_idx < self.blocks_to_swap: + block_idx_to_cpu = block_idx + block_idx_to_cuda = len(self.joint_blocks) - self.blocks_to_swap + block_idx + future = submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda) + futures[block_idx_to_cuda] = future + x = self.final_layer(x, c, H, W) # Our final layer combined UnPatchify return x[:, :, :H, :W] diff --git a/sd3_train.py b/sd3_train.py index d4ab13a34..5e2efa6f8 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -369,6 +369,14 @@ def train(args): if not train_mmdit: mmdit.to(accelerator.device, dtype=weight_dtype) # because of mmdit will not be prepared + # block swap + is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 + if is_swapping_blocks: + # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. + # This idea is based on 2kpr's great work. Thank you! + logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") + mmdit.enable_block_swap(args.blocks_to_swap) + if not cache_latents: # move to accelerator device vae.requires_grad_(False) @@ -575,7 +583,9 @@ def train(args): else: # acceleratorがなんかよろしくやってくれるらしい if train_mmdit: - mmdit = accelerator.prepare(mmdit) + mmdit = accelerator.prepare(mmdit, device_placement=[not is_swapping_blocks]) + if is_swapping_blocks: + accelerator.unwrap_model(mmdit).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage if train_clip: clip_l = accelerator.prepare(clip_l) clip_g = accelerator.prepare(clip_g) @@ -600,8 +610,10 @@ def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda, dvc): block_to_cpu = block_to_cpu.to("cpu", non_blocking=True) torch.cuda.empty_cache() + # print(f"Backward: Move block {bidx_to_cuda} to CUDA") block_to_cuda = block_to_cuda.to(dvc, non_blocking=True) torch.cuda.synchronize() + # print(f"Backward: Done moving blocks {bidx_to_cpu} and {bidx_to_cuda}") return bidx_to_cpu, bidx_to_cuda block_to_cpu = blocks[block_idx_to_cpu] @@ -639,7 +651,7 @@ def wait_blocks_move(block_idx, futures): grad_hook = None if blocks_to_swap: - is_block = param_name.startswith("double_blocks") + is_block = param_name.startswith("joint_blocks") if is_block: block_idx = int(param_name.split(".")[1]) if block_idx not in handled_block_indices: @@ -805,6 +817,9 @@ def optimizer_hook(parameter: torch.Tensor): init_kwargs=init_kwargs, ) + if is_swapping_blocks: + accelerator.unwrap_model(mmdit).prepare_block_swap_before_forward() + # For --sample_at_first optimizer_eval_fn() sd3_train_utils.sample_images(accelerator, args, 0, global_step, mmdit, vae, [clip_l, clip_g, t5xxl], sample_prompts_te_outputs) From d2c549d7b2a9bb3e70b5af8539fd744b474a9607 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Fri, 25 Oct 2024 21:58:31 +0900 Subject: [PATCH 193/582] support SD3 LoRA --- library/sd3_models.py | 3 + library/sd3_train_utils.py | 113 +++-- library/sd3_utils.py | 2 +- 
networks/lora_sd3.py | 826 +++++++++++++++++++++++++++++++++++++ sd3_train.py | 30 +- sd3_train_network.py | 427 +++++++++++++++++++ train_network.py | 2 + 7 files changed, 1335 insertions(+), 68 deletions(-) create mode 100644 networks/lora_sd3.py create mode 100644 sd3_train_network.py diff --git a/library/sd3_models.py b/library/sd3_models.py index e5c5887a9..5d09f74e8 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -761,6 +761,9 @@ def __init__( self.final_layer = UnPatch(self.hidden_size, patch_size, self.out_channels) # self.initialize_weights() + self.blocks_to_swap = None + self.thread_pool: Optional[ThreadPoolExecutor] = None + @property def model_type(self): return self._model_type diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index af8ecf2c9..e3c649f73 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -198,6 +198,23 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): help="[DOES NOT WORK] not supported yet. T5-XXL dtype. if not specified, use default dtype (from mixed precision) / T5-XXL dtype。指定しない場合はデフォルトのdtype(mixed precisionから)を使用", ) + parser.add_argument( + "--t5xxl_max_token_length", + type=int, + default=256, + help="maximum token length for T5-XXL. 256 is the default value / T5-XXLの最大トークン長。デフォルトは256", + ) + parser.add_argument( + "--apply_lg_attn_mask", + action="store_true", + help="apply attention mask (zero embs) to CLIP-L and G / CLIP-LとGにアテンションマスク(ゼロ埋め)を適用する", + ) + parser.add_argument( + "--apply_t5_attn_mask", + action="store_true", + help="apply attention mask (zero embs) to T5-XXL / T5-XXLにアテンションマスク(ゼロ埋め)を適用する", + ) + # copy from Diffusers parser.add_argument( "--weighting_scheme", @@ -317,36 +334,36 @@ def do_sample( x = noise_scaled.to(device).to(dtype) # print(x.shape) - with torch.no_grad(): - for i in tqdm(range(len(sigmas) - 1)): - sigma_hat = sigmas[i] + # with torch.no_grad(): + for i in tqdm(range(len(sigmas) - 1)): + sigma_hat = sigmas[i] - timestep = model_sampling.timestep(sigma_hat).float() - timestep = torch.FloatTensor([timestep, timestep]).to(device) + timestep = model_sampling.timestep(sigma_hat).float() + timestep = torch.FloatTensor([timestep, timestep]).to(device) - x_c_nc = torch.cat([x, x], dim=0) - # print(x_c_nc.shape, timestep.shape, c_crossattn.shape, y.shape) + x_c_nc = torch.cat([x, x], dim=0) + # print(x_c_nc.shape, timestep.shape, c_crossattn.shape, y.shape) - model_output = mmdit(x_c_nc, timestep, context=c_crossattn, y=y) - model_output = model_output.float() - batched = model_sampling.calculate_denoised(sigma_hat, model_output, x) + model_output = mmdit(x_c_nc, timestep, context=c_crossattn, y=y) + model_output = model_output.float() + batched = model_sampling.calculate_denoised(sigma_hat, model_output, x) - pos_out, neg_out = batched.chunk(2) - denoised = neg_out + (pos_out - neg_out) * guidance_scale - # print(denoised.shape) + pos_out, neg_out = batched.chunk(2) + denoised = neg_out + (pos_out - neg_out) * guidance_scale + # print(denoised.shape) - # d = to_d(x, sigma_hat, denoised) - dims_to_append = x.ndim - sigma_hat.ndim - sigma_hat_dims = sigma_hat[(...,) + (None,) * dims_to_append] - # print(dims_to_append, x.shape, sigma_hat.shape, denoised.shape, sigma_hat_dims.shape) - """Converts a denoiser output to a Karras ODE derivative.""" - d = (x - denoised) / sigma_hat_dims + # d = to_d(x, sigma_hat, denoised) + dims_to_append = x.ndim - sigma_hat.ndim + sigma_hat_dims = sigma_hat[(...,) + (None,) * dims_to_append] + # print(dims_to_append, 
x.shape, sigma_hat.shape, denoised.shape, sigma_hat_dims.shape) + """Converts a denoiser output to a Karras ODE derivative.""" + d = (x - denoised) / sigma_hat_dims - dt = sigmas[i + 1] - sigma_hat + dt = sigmas[i + 1] - sigma_hat - # Euler method - x = x + d * dt - x = x.to(dtype) + # Euler method + x = x + d * dt + x = x.to(dtype) return x @@ -378,7 +395,7 @@ def sample_images( logger.info("") logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}") - if not os.path.isfile(args.sample_prompts): + if not os.path.isfile(args.sample_prompts) and sample_prompts_te_outputs is None: logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}") return @@ -386,7 +403,7 @@ def sample_images( # unwrap unet and text_encoder(s) mmdit = accelerator.unwrap_model(mmdit) - text_encoders = [accelerator.unwrap_model(te) for te in text_encoders] + text_encoders = None if text_encoders is None else [accelerator.unwrap_model(te) for te in text_encoders] # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) prompts = train_util.load_prompts(args.sample_prompts) @@ -404,7 +421,7 @@ def sample_images( if distributed_state.num_processes <= 1: # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts. - with torch.no_grad(): + with torch.no_grad(), accelerator.autocast(): for prompt_dict in prompts: sample_image_inference( accelerator, @@ -506,29 +523,39 @@ def sample_image_inference( tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() - if sample_prompts_te_outputs and prompt in sample_prompts_te_outputs: - te_outputs = sample_prompts_te_outputs[prompt] - else: - l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(prompt) - te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, [l_tokens, g_tokens, t5_tokens]) - - lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = te_outputs + def encode_prompt(prpt): + text_encoder_conds = [] + if sample_prompts_te_outputs and prpt in sample_prompts_te_outputs: + text_encoder_conds = sample_prompts_te_outputs[prpt] + print(f"Using cached text encoder outputs for prompt: {prpt}") + if text_encoders is not None: + print(f"Encoding prompt: {prpt}") + tokens_and_masks = tokenize_strategy.tokenize(prpt) + # strategy has apply_t5_attn_mask option + encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) + + # if text_encoder_conds is not cached, use encoded_text_encoder_conds + if len(text_encoder_conds) == 0: + text_encoder_conds = encoded_text_encoder_conds + else: + # if encoded_text_encoder_conds is not None, update cached text_encoder_conds + for i in range(len(encoded_text_encoder_conds)): + if encoded_text_encoder_conds[i] is not None: + text_encoder_conds[i] = encoded_text_encoder_conds[i] + return text_encoder_conds + + lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = encode_prompt(prompt) cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) # encode negative prompts - if sample_prompts_te_outputs and negative_prompt in sample_prompts_te_outputs: - neg_te_outputs = sample_prompts_te_outputs[negative_prompt] - else: - l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(negative_prompt) - neg_te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, [l_tokens, g_tokens, t5_tokens]) - - lg_out, 
t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = neg_te_outputs + lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = encode_prompt(negative_prompt) neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled) # sample image clean_memory_on_device(accelerator.device) - with accelerator.autocast(): - latents = do_sample(height, width, seed, cond, neg_cond, mmdit, sample_steps, scale, mmdit.dtype, accelerator.device) + with accelerator.autocast(), torch.no_grad(): + # mmdit may be fp8, so we need weight_dtype here. vae is always in that dtype. + latents = do_sample(height, width, seed, cond, neg_cond, mmdit, sample_steps, scale, vae.dtype, accelerator.device) # latent to image clean_memory_on_device(accelerator.device) @@ -538,7 +565,7 @@ def sample_image_inference( image = vae.decode(latents) vae.to(org_vae_device) clean_memory_on_device(accelerator.device) - + image = image.float() image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0] decoded_np = 255.0 * np.moveaxis(image.cpu().numpy(), 0, 2) diff --git a/library/sd3_utils.py b/library/sd3_utils.py index 9ad995d81..71e50de36 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -91,7 +91,7 @@ def load_mmdit( mmdit = sd3_models.create_sd3_mmdit(params, attn_mode) logger.info("Loading state dict...") - info = sdxl_model_util._load_state_dict_on_device(mmdit, mmdit_sd, device, dtype) + info = mmdit.load_state_dict(mmdit_sd, strict=False, assign=True) logger.info(f"Loaded MMDiT: {info}") return mmdit diff --git a/networks/lora_sd3.py b/networks/lora_sd3.py new file mode 100644 index 000000000..cbabf8da0 --- /dev/null +++ b/networks/lora_sd3.py @@ -0,0 +1,826 @@ +# temporary minimum implementation of LoRA +# SD3 doesn't have Conv2d, so we ignore it +# TODO commonize with the original/SD3/FLUX implementation + +# LoRA network module +# reference: +# https://github.com/microsoft/LoRA/blob/main/loralib/layers.py +# https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py + +import math +import os +from typing import Dict, List, Optional, Tuple, Type, Union +from transformers import CLIPTextModelWithProjection, T5EncoderModel +import numpy as np +import torch +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +from networks.lora_flux import LoRAModule, LoRAInfModule +from library import sd3_models + + +def create_network( + multiplier: float, + network_dim: Optional[int], + network_alpha: Optional[float], + vae: sd3_models.SDVAE, + text_encoders: List[Union[CLIPTextModelWithProjection, T5EncoderModel]], + mmdit, + neuron_dropout: Optional[float] = None, + **kwargs, +): + if network_dim is None: + network_dim = 4 # default + if network_alpha is None: + network_alpha = 1.0 + + # extract dim/alpha for conv2d, and block dim + conv_dim = kwargs.get("conv_dim", None) + conv_alpha = kwargs.get("conv_alpha", None) + if conv_dim is not None: + conv_dim = int(conv_dim) + if conv_alpha is None: + conv_alpha = 1.0 + else: + conv_alpha = float(conv_alpha) + + # attn dim, mlp dim: only for DoubleStreamBlock. 
SingleStreamBlock is not supported because of combined qkv + context_attn_dim = kwargs.get("context_attn_dim", None) + context_mlp_dim = kwargs.get("context_mlp_dim", None) + context_mod_dim = kwargs.get("context_mod_dim", None) + x_attn_dim = kwargs.get("x_attn_dim", None) + x_mlp_dim = kwargs.get("x_mlp_dim", None) + x_mod_dim = kwargs.get("x_mod_dim", None) + if context_attn_dim is not None: + context_attn_dim = int(context_attn_dim) + if context_mlp_dim is not None: + context_mlp_dim = int(context_mlp_dim) + if context_mod_dim is not None: + context_mod_dim = int(context_mod_dim) + if x_attn_dim is not None: + x_attn_dim = int(x_attn_dim) + if x_mlp_dim is not None: + x_mlp_dim = int(x_mlp_dim) + if x_mod_dim is not None: + x_mod_dim = int(x_mod_dim) + type_dims = [context_attn_dim, context_mlp_dim, context_mod_dim, x_attn_dim, x_mlp_dim, x_mod_dim] + if all([d is None for d in type_dims]): + type_dims = None + + # emb_dims [context_embedder, t_embedder, x_embedder, y_embedder, final_mod, final_linear] + emb_dims = kwargs.get("emb_dims", None) + if emb_dims is not None: + emb_dims = emb_dims.strip() + if emb_dims.startswith("[") and emb_dims.endswith("]"): + emb_dims = emb_dims[1:-1] + emb_dims = [int(d) for d in emb_dims.split(",")] # is it better to use ast.literal_eval? + assert len(emb_dims) == 6, f"invalid emb_dims: {emb_dims}, must be 6 dimensions (context, t, x, y, final_mod, final_linear)" + + # double/single train blocks + def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: + """ + Parse a block selection string and return a list of booleans. + + Args: + selection (str): A string specifying which blocks to select. + total_blocks (int): The total number of blocks available. + + Returns: + List[bool]: A list of booleans indicating which blocks are selected. 
+ """ + if selection == "all": + return [True] * total_blocks + if selection == "none" or selection == "": + return [False] * total_blocks + + selected = [False] * total_blocks + ranges = selection.split(",") + + for r in ranges: + if "-" in r: + start, end = map(str.strip, r.split("-")) + start = int(start) + end = int(end) + assert 0 <= start < total_blocks, f"invalid start index: {start}" + assert 0 <= end < total_blocks, f"invalid end index: {end}" + assert start <= end, f"invalid range: {start}-{end}" + for i in range(start, end + 1): + selected[i] = True + else: + index = int(r) + assert 0 <= index < total_blocks, f"invalid index: {index}" + selected[index] = True + + return selected + + train_block_indices = kwargs.get("train_block_indices", None) + if train_block_indices is not None: + train_block_indices = parse_block_selection(train_block_indices, 999) # 999 is a dummy number + + # rank/module dropout + rank_dropout = kwargs.get("rank_dropout", None) + if rank_dropout is not None: + rank_dropout = float(rank_dropout) + module_dropout = kwargs.get("module_dropout", None) + if module_dropout is not None: + module_dropout = float(module_dropout) + + # split qkv + split_qkv = kwargs.get("split_qkv", False) + if split_qkv is not None: + split_qkv = True if split_qkv == "True" else False + + # train T5XXL + train_t5xxl = kwargs.get("train_t5xxl", False) + if train_t5xxl is not None: + train_t5xxl = True if train_t5xxl == "True" else False + + # verbose + verbose = kwargs.get("verbose", False) + if verbose is not None: + verbose = True if verbose == "True" else False + + # すごく引数が多いな ( ^ω^)・・・ + network = LoRANetwork( + text_encoders, + mmdit, + multiplier=multiplier, + lora_dim=network_dim, + alpha=network_alpha, + dropout=neuron_dropout, + rank_dropout=rank_dropout, + module_dropout=module_dropout, + conv_lora_dim=conv_dim, + conv_alpha=conv_alpha, + split_qkv=split_qkv, + train_t5xxl=train_t5xxl, + type_dims=type_dims, + emb_dims=emb_dims, + train_block_indices=train_block_indices, + verbose=verbose, + ) + + loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) + loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None) + loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None) + loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None + loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None + loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None + if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None: + network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) + + return network + + +# Create network from weights for inference, weights are not loaded here (because can be merged) +def create_network_from_weights(multiplier, file, ae, text_encoders, mmdit, weights_sd=None, for_inference=False, **kwargs): + # if unet is an instance of SdxlUNet2DConditionModel or subclass, set is_sdxl to True + if weights_sd is None: + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file, safe_open + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + # get dim/alpha mapping, and train t5xxl + modules_dim = {} + modules_alpha = {} + train_t5xxl = None + for key, value in weights_sd.items(): + if "." 
not in key: + continue + + lora_name = key.split(".")[0] + if "alpha" in key: + modules_alpha[lora_name] = value + elif "lora_down" in key: + dim = value.size()[0] + modules_dim[lora_name] = dim + # logger.info(lora_name, value.size(), dim) + + if train_t5xxl is None or train_t5xxl is False: + train_t5xxl = "lora_te3" in lora_name + + if train_t5xxl is None: + train_t5xxl = False + + split_qkv = False # split_qkv is not needed to care, because state_dict is qkv combined + + module_class = LoRAInfModule if for_inference else LoRAModule + + network = LoRANetwork( + text_encoders, + mmdit, + multiplier=multiplier, + modules_dim=modules_dim, + modules_alpha=modules_alpha, + module_class=module_class, + split_qkv=split_qkv, + train_t5xxl=train_t5xxl, + ) + return network, weights_sd + + +class LoRANetwork(torch.nn.Module): + SD3_TARGET_REPLACE_MODULE = ["SingleDiTBlock"] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP", "T5Attention", "T5DenseGatedActDense"] + LORA_PREFIX_SD3 = "lora_unet" # make ComfyUI compatible + LORA_PREFIX_TEXT_ENCODER_CLIP_L = "lora_te1" + LORA_PREFIX_TEXT_ENCODER_CLIP_G = "lora_te2" + LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te3" # make ComfyUI compatible + + def __init__( + self, + text_encoders: List[Union[CLIPTextModelWithProjection, T5EncoderModel]], + unet: sd3_models.MMDiT, + multiplier: float = 1.0, + lora_dim: int = 4, + alpha: float = 1, + dropout: Optional[float] = None, + rank_dropout: Optional[float] = None, + module_dropout: Optional[float] = None, + conv_lora_dim: Optional[int] = None, + conv_alpha: Optional[float] = None, + module_class: Type[object] = LoRAModule, + modules_dim: Optional[Dict[str, int]] = None, + modules_alpha: Optional[Dict[str, int]] = None, + split_qkv: bool = False, + train_t5xxl: bool = False, + type_dims: Optional[List[int]] = None, + emb_dims: Optional[List[int]] = None, + train_block_indices: Optional[List[bool]] = None, + verbose: Optional[bool] = False, + ) -> None: + super().__init__() + self.multiplier = multiplier + + self.lora_dim = lora_dim + self.alpha = alpha + self.conv_lora_dim = conv_lora_dim + self.conv_alpha = conv_alpha + self.dropout = dropout + self.rank_dropout = rank_dropout + self.module_dropout = module_dropout + self.split_qkv = split_qkv + self.train_t5xxl = train_t5xxl + + self.type_dims = type_dims + self.emb_dims = emb_dims + self.train_block_indices = train_block_indices + + self.loraplus_lr_ratio = None + self.loraplus_unet_lr_ratio = None + self.loraplus_text_encoder_lr_ratio = None + + if modules_dim is not None: + logger.info(f"create LoRA network from weights") + self.emb_dims = [0] * 6 # create emb_dims + # verbose = True + else: + logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}") + logger.info( + f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" + ) + # if self.conv_lora_dim is not None: + # logger.info( + # f"apply LoRA to Conv2d with kernel size (3,3). 
dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" + # ) + + qkv_dim = 0 + if self.split_qkv: + logger.info(f"split qkv for LoRA") + qkv_dim = unet.joint_blocks[0].context_block.attn.qkv.weight.size(0) + if train_t5xxl: + logger.info(f"train T5XXL as well") + + # create module instances + def create_modules( + is_mmdit: bool, + text_encoder_idx: Optional[int], + root_module: torch.nn.Module, + target_replace_modules: List[str], + filter: Optional[str] = None, + default_dim: Optional[int] = None, + ) -> List[LoRAModule]: + prefix = ( + self.LORA_PREFIX_SD3 + if is_mmdit + else [self.LORA_PREFIX_TEXT_ENCODER_CLIP_L, self.LORA_PREFIX_TEXT_ENCODER_CLIP_G, self.LORA_PREFIX_TEXT_ENCODER_T5][ + text_encoder_idx + ] + ) + + loras = [] + skipped = [] + for name, module in root_module.named_modules(): + if target_replace_modules is None or module.__class__.__name__ in target_replace_modules: + if target_replace_modules is None: # dirty hack for all modules + module = root_module # search all modules + + for child_name, child_module in module.named_modules(): + is_linear = child_module.__class__.__name__ == "Linear" + is_conv2d = child_module.__class__.__name__ == "Conv2d" + is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) + + if is_linear or is_conv2d: + lora_name = prefix + "." + (name + "." if name else "") + child_name + lora_name = lora_name.replace(".", "_") + + if filter is not None and not filter in lora_name: + continue + + dim = None + alpha = None + + if modules_dim is not None: + # モジュール指定あり + if lora_name in modules_dim: + dim = modules_dim[lora_name] + alpha = modules_alpha[lora_name] + else: + # 通常、すべて対象とする + if is_linear or is_conv2d_1x1: + dim = default_dim if default_dim is not None else self.lora_dim + alpha = self.alpha + + if is_mmdit and type_dims is not None: + # type_dims = [context_attn_dim, context_mlp_dim, context_mod_dim, x_attn_dim, x_mlp_dim, x_mod_dim] + identifier = [ + ("context_block", "attn"), + ("context_block", "mlp"), + ("context_block", "adaLN_modulation"), + ("x_block", "attn"), + ("x_block", "mlp"), + ("x_block", "adaLN_modulation"), + ] + for i, d in enumerate(type_dims): + if d is not None and all([id in lora_name for id in identifier[i]]): + dim = d # may be 0 for skip + break + + if is_mmdit and dim and self.train_block_indices is not None and "joint_blocks" in lora_name: + # "lora_unet_joint_blocks_0_x_block_attn_proj..." 
+ block_index = int(lora_name.split("_")[4]) # bit dirty + if self.train_block_indices is not None and not self.train_block_indices[block_index]: + dim = 0 + + elif self.conv_lora_dim is not None: + dim = self.conv_lora_dim + alpha = self.conv_alpha + + if dim is None or dim == 0: + # skipした情報を出力 + if is_linear or is_conv2d_1x1 or (self.conv_lora_dim is not None): + skipped.append(lora_name) + continue + + # qkv split + split_dims = None + if is_mmdit and split_qkv: + if "joint_blocks" in lora_name and "qkv" in lora_name: + split_dims = [qkv_dim // 3] * 3 + + lora = module_class( + lora_name, + child_module, + self.multiplier, + dim, + alpha, + dropout=dropout, + rank_dropout=rank_dropout, + module_dropout=module_dropout, + split_dims=split_dims, + ) + loras.append(lora) + + if target_replace_modules is None: + break # all modules are searched + return loras, skipped + + # create LoRA for text encoder + # 毎回すべてのモジュールを作るのは無駄なので要検討 + self.text_encoder_loras: List[Union[LoRAModule, LoRAInfModule]] = [] + skipped_te = [] + for i, text_encoder in enumerate(text_encoders): + index = i + if not train_t5xxl and index >= 2: # 0: CLIP-L, 1: CLIP-G, 2: T5XXL, so we skip T5XXL if train_t5xxl is False + break + + logger.info(f"create LoRA for Text Encoder {index+1}:") + + text_encoder_loras, skipped = create_modules(False, index, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) + logger.info(f"create LoRA for Text Encoder {index+1}: {len(text_encoder_loras)} modules.") + self.text_encoder_loras.extend(text_encoder_loras) + skipped_te += skipped + + # create LoRA for U-Net + self.unet_loras: List[Union[LoRAModule, LoRAInfModule]] + self.unet_loras, skipped_un = create_modules(True, None, unet, LoRANetwork.SD3_TARGET_REPLACE_MODULE) + + # emb_dims [context_embedder, t_embedder, x_embedder, y_embedder, final_mod, final_linear] + if self.emb_dims: + for filter, in_dim in zip( + [ + "context_embedder", + "t_embedder", + "x_embedder", + "y_embedder", + "final_layer_adaLN_modulation", + "final_layer_linear", + ], + self.emb_dims, + ): + loras, _ = create_modules(True, None, unet, None, filter=filter, default_dim=in_dim) + self.unet_loras.extend(loras) + + logger.info(f"create LoRA for SD3 MMDiT: {len(self.unet_loras)} modules.") + if verbose: + for lora in self.unet_loras: + logger.info(f"\t{lora.lora_name:50} {lora.lora_dim}, {lora.alpha}") + + skipped = skipped_te + skipped_un + if verbose and len(skipped) > 0: + logger.warning( + f"because dim (rank) is 0, {len(skipped)} LoRA modules are skipped / dim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:" + ) + for name in skipped: + logger.info(f"\t{name}") + + # assertion + names = set() + for lora in self.text_encoder_loras + self.unet_loras: + assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" + names.add(lora.lora_name) + + def set_multiplier(self, multiplier): + self.multiplier = multiplier + for lora in self.text_encoder_loras + self.unet_loras: + lora.multiplier = self.multiplier + + def set_enabled(self, is_enabled): + for lora in self.text_encoder_loras + self.unet_loras: + lora.enabled = is_enabled + + def load_weights(self, file): + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + info = self.load_state_dict(weights_sd, False) + return info + + def load_state_dict(self, state_dict, strict=True): + # override to convert original weight to split qkv + if not self.split_qkv: + 
return super().load_state_dict(state_dict, strict) + + # split qkv + for key in list(state_dict.keys()): + if not ("joint_blocks" in key and "qkv" in key): + continue + + weight = state_dict[key] + lora_name = key.split(".")[0] + if "lora_down" in key and "weight" in key: + # dense weight (rank*3, in_dim) + split_weight = torch.chunk(weight, 3, dim=0) + for i, split_w in enumerate(split_weight): + state_dict[f"{lora_name}.lora_down.{i}.weight"] = split_w + + del state_dict[key] + # print(f"split {key}: {weight.shape} to {[w.shape for w in split_weight]}") + elif "lora_up" in key and "weight" in key: + # sparse weight (out_dim=sum(split_dims), rank*3) + rank = weight.size(1) // 3 + i = 0 + split_dim = weight.shape[0] // 3 + for j in range(3): + state_dict[f"{lora_name}.lora_up.{j}.weight"] = weight[i : i + split_dim, j * rank : (j + 1) * rank] + i += split_dim + del state_dict[key] + + # alpha is unchanged + + return super().load_state_dict(state_dict, strict) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + if not self.split_qkv: + return super().state_dict(destination, prefix, keep_vars) + + # merge qkv + state_dict = super().state_dict(destination, prefix, keep_vars) + new_state_dict = {} + for key in list(state_dict.keys()): + if not ("joint_blocks" in key and "qkv" in key): + new_state_dict[key] = state_dict[key] + continue + + if key not in state_dict: + continue # already merged + + lora_name = key.split(".")[0] + + # (rank, in_dim) * 3 + down_weights = [state_dict.pop(f"{lora_name}.lora_down.{i}.weight") for i in range(3)] + # (split dim, rank) * 3 + up_weights = [state_dict.pop(f"{lora_name}.lora_up.{i}.weight") for i in range(3)] + + alpha = state_dict.pop(f"{lora_name}.alpha") + + # merge down weight + down_weight = torch.cat(down_weights, dim=0) # (rank, split_dim) * 3 -> (rank*3, sum of split_dim) + + # merge up weight (sum of split_dim, rank*3) + qkv_dim, rank = up_weights[0].size() + split_dim = qkv_dim // 3 + up_weight = torch.zeros((qkv_dim, down_weight.size(0)), device=down_weight.device, dtype=down_weight.dtype) + i = 0 + for j in range(3): + up_weight[i : i + split_dim, j * rank : (j + 1) * rank] = up_weights[j] + i += split_dim + + new_state_dict[f"{lora_name}.lora_down.weight"] = down_weight + new_state_dict[f"{lora_name}.lora_up.weight"] = up_weight + new_state_dict[f"{lora_name}.alpha"] = alpha + + # print( + # f"merged {lora_name}: {lora_name}, {[w.shape for w in down_weights]}, {[w.shape for w in up_weights]} to {down_weight.shape}, {up_weight.shape}" + # ) + print(f"new key: {lora_name}.lora_down.weight, {lora_name}.lora_up.weight, {lora_name}.alpha") + + return new_state_dict + + def apply_to(self, text_encoders, mmdit, apply_text_encoder=True, apply_unet=True): + if apply_text_encoder: + logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules") + else: + self.text_encoder_loras = [] + + if apply_unet: + logger.info(f"enable LoRA for U-Net: {len(self.unet_loras)} modules") + else: + self.unet_loras = [] + + for lora in self.text_encoder_loras + self.unet_loras: + lora.apply_to() + self.add_module(lora.lora_name, lora) + + # マージできるかどうかを返す + def is_mergeable(self): + return True + + # TODO refactor to common function with apply_to + def merge_to(self, text_encoders, mmdit, weights_sd, dtype=None, device=None): + apply_text_encoder = apply_unet = False + for key in weights_sd.keys(): + if ( + key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_CLIP_L) + or key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_CLIP_G) 
+ or key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_T5) + ): + apply_text_encoder = True + elif key.startswith(LoRANetwork.LORA_PREFIX_MMDIT): + apply_unet = True + + if apply_text_encoder: + logger.info("enable LoRA for text encoder") + else: + self.text_encoder_loras = [] + + if apply_unet: + logger.info("enable LoRA for U-Net") + else: + self.unet_loras = [] + + for lora in self.text_encoder_loras + self.unet_loras: + sd_for_lora = {} + for key in weights_sd.keys(): + if key.startswith(lora.lora_name): + sd_for_lora[key[len(lora.lora_name) + 1 :]] = weights_sd[key] + lora.merge_to(sd_for_lora, dtype, device) + + logger.info(f"weights are merged") + + def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): + self.loraplus_lr_ratio = loraplus_lr_ratio + self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio + self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio + + logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}") + logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}") + + def prepare_optimizer_params_with_multiple_te_lrs(self, text_encoder_lr, unet_lr, default_lr): + # make sure text_encoder_lr as list of three elements + # if float, use the same value for all three + if text_encoder_lr is None or (isinstance(text_encoder_lr, list) and len(text_encoder_lr) == 0): + text_encoder_lr = [default_lr, default_lr, default_lr] + elif isinstance(text_encoder_lr, float) or isinstance(text_encoder_lr, int): + text_encoder_lr = [float(text_encoder_lr), float(text_encoder_lr), float(text_encoder_lr)] + elif len(text_encoder_lr) == 1: + text_encoder_lr = [text_encoder_lr[0], text_encoder_lr[0], text_encoder_lr[0]] + elif len(text_encoder_lr) == 2: + text_encoder_lr = [text_encoder_lr[0], text_encoder_lr[1], text_encoder_lr[1]] + + self.requires_grad_(True) + + all_params = [] + lr_descriptions = [] + + def assemble_params(loras, lr, loraplus_ratio): + param_groups = {"lora": {}, "plus": {}} + for lora in loras: + for name, param in lora.named_parameters(): + if loraplus_ratio is not None and "lora_up" in name: + param_groups["plus"][f"{lora.lora_name}.{name}"] = param + else: + param_groups["lora"][f"{lora.lora_name}.{name}"] = param + + params = [] + descriptions = [] + for key in param_groups.keys(): + param_data = {"params": param_groups[key].values()} + + if len(param_data["params"]) == 0: + continue + + if lr is not None: + if key == "plus": + param_data["lr"] = lr * loraplus_ratio + else: + param_data["lr"] = lr + + if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: + logger.info("NO LR skipping!") + continue + + params.append(param_data) + descriptions.append("plus" if key == "plus" else "") + + return params, descriptions + + if self.text_encoder_loras: + loraplus_lr_ratio = self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio + + # split text encoder loras for te1 and te3 + te1_loras = [ + lora for lora in self.text_encoder_loras if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER_CLIP_L) + ] + te2_loras = [ + lora for lora in self.text_encoder_loras if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER_CLIP_G) + ] + te3_loras = [lora for lora in self.text_encoder_loras if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER_T5)] + if len(te1_loras) > 0: + logger.info(f"Text Encoder 1 (CLIP-L): {len(te1_loras)} modules, LR {text_encoder_lr[0]}") + params, descriptions = 
assemble_params(te1_loras, text_encoder_lr[0], loraplus_lr_ratio) + all_params.extend(params) + lr_descriptions.extend(["textencoder 1 " + (" " + d if d else "") for d in descriptions]) + if len(te2_loras) > 0: + logger.info(f"Text Encoder 2 (CLIP-G): {len(te2_loras)} modules, LR {text_encoder_lr[1]}") + params, descriptions = assemble_params(te2_loras, text_encoder_lr[1], loraplus_lr_ratio) + all_params.extend(params) + lr_descriptions.extend(["textencoder 1 " + (" " + d if d else "") for d in descriptions]) + if len(te3_loras) > 0: + logger.info(f"Text Encoder 3 (T5XXL): {len(te3_loras)} modules, LR {text_encoder_lr[2]}") + params, descriptions = assemble_params(te3_loras, text_encoder_lr[2], loraplus_lr_ratio) + all_params.extend(params) + lr_descriptions.extend(["textencoder 3 " + (" " + d if d else "") for d in descriptions]) + + if self.unet_loras: + params, descriptions = assemble_params( + self.unet_loras, + unet_lr if unet_lr is not None else default_lr, + self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio, + ) + all_params.extend(params) + lr_descriptions.extend(["unet" + (" " + d if d else "") for d in descriptions]) + + return all_params, lr_descriptions + + def enable_gradient_checkpointing(self): + # not supported + pass + + def prepare_grad_etc(self, text_encoder, unet): + self.requires_grad_(True) + + def on_epoch_start(self, text_encoder, unet): + self.train() + + def get_trainable_params(self): + return self.parameters() + + def save_weights(self, file, dtype, metadata): + if metadata is not None and len(metadata) == 0: + metadata = None + + state_dict = self.state_dict() + + if dtype is not None: + for key in list(state_dict.keys()): + v = state_dict[key] + v = v.detach().clone().to("cpu").to(dtype) + state_dict[key] = v + + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import save_file + from library import train_util + + # Precalculate model hashes to save time on indexing + if metadata is None: + metadata = {} + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + save_file(state_dict, file, metadata) + else: + torch.save(state_dict, file) + + def backup_weights(self): + # 重みのバックアップを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + if not hasattr(org_module, "_lora_org_weight"): + sd = org_module.state_dict() + org_module._lora_org_weight = sd["weight"].detach().clone() + org_module._lora_restored = True + + def restore_weights(self): + # 重みのリストアを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + if not org_module._lora_restored: + sd = org_module.state_dict() + sd["weight"] = org_module._lora_org_weight + org_module.load_state_dict(sd) + org_module._lora_restored = True + + def pre_calculation(self): + # 事前計算を行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + sd = org_module.state_dict() + + org_weight = sd["weight"] + lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype) + sd["weight"] = org_weight + lora_weight + assert sd["weight"].shape == org_weight.shape + org_module.load_state_dict(sd) + + org_module._lora_restored = False + lora.enabled = False + + def apply_max_norm_regularization(self, max_norm_value, device): + downkeys = 
[] + upkeys = [] + alphakeys = [] + norms = [] + keys_scaled = 0 + + state_dict = self.state_dict() + for key in state_dict.keys(): + if "lora_down" in key and "weight" in key: + downkeys.append(key) + upkeys.append(key.replace("lora_down", "lora_up")) + alphakeys.append(key.replace("lora_down.weight", "alpha")) + + for i in range(len(downkeys)): + down = state_dict[downkeys[i]].to(device) + up = state_dict[upkeys[i]].to(device) + alpha = state_dict[alphakeys[i]].to(device) + dim = down.shape[0] + scale = alpha / dim + + if up.shape[2:] == (1, 1) and down.shape[2:] == (1, 1): + updown = (up.squeeze(2).squeeze(2) @ down.squeeze(2).squeeze(2)).unsqueeze(2).unsqueeze(3) + elif up.shape[2:] == (3, 3) or down.shape[2:] == (3, 3): + updown = torch.nn.functional.conv2d(down.permute(1, 0, 2, 3), up).permute(1, 0, 2, 3) + else: + updown = up @ down + + updown *= scale + + norm = updown.norm().clamp(min=max_norm_value / 2) + desired = torch.clamp(norm, max=max_norm_value) + ratio = desired.cpu() / norm.cpu() + sqrt_ratio = ratio**0.5 + if ratio != 1: + keys_scaled += 1 + state_dict[upkeys[i]] *= sqrt_ratio + state_dict[downkeys[i]] *= sqrt_ratio + scalednorm = updown.norm() * ratio + norms.append(scalednorm.item()) + + return keys_scaled, sum(norms) / len(norms), max(norms) diff --git a/sd3_train.py b/sd3_train.py index 5e2efa6f8..d12f7f56b 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -220,12 +220,7 @@ def train(args): sd3_state_dict = None # load tokenizer and prepare tokenize strategy - if args.t5xxl_max_token_length is None: - t5xxl_max_token_length = 256 # default value for T5XXL - else: - t5xxl_max_token_length = args.t5xxl_max_token_length - - sd3_tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(t5xxl_max_token_length) + sd3_tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_max_token_length) strategy_base.TokenizeStrategy.set_strategy(sd3_tokenize_strategy) # load clip_l, clip_g, t5xxl for caching text encoder outputs @@ -876,6 +871,9 @@ def optimizer_hook(parameter: torch.Tensor): lg_out = None t5_out = None lg_pooled = None + l_attn_mask = None + g_attn_mask = None + t5_attn_mask = None if lg_out is None: # not cached or training, so get from text encoders @@ -885,7 +883,7 @@ def optimizer_hook(parameter: torch.Tensor): # text models in sd3_models require "cpu" for input_ids input_ids_clip_l = input_ids_clip_l.to("cpu") input_ids_clip_g = input_ids_clip_g.to("cpu") - lg_out, _, lg_pooled = text_encoding_strategy.encode_tokens( + lg_out, _, lg_pooled, l_attn_mask, g_attn_mask, _ = text_encoding_strategy.encode_tokens( sd3_tokenize_strategy, [clip_l, clip_g, None], [input_ids_clip_l, input_ids_clip_g, None, l_attn_mask, g_attn_mask, None], @@ -895,7 +893,7 @@ def optimizer_hook(parameter: torch.Tensor): _, _, input_ids_t5xxl, _, _, t5_attn_mask = batch["input_ids_list"] with torch.no_grad(): input_ids_t5xxl = input_ids_t5xxl.to("cpu") if t5_out is None else None - _, t5_out, _ = text_encoding_strategy.encode_tokens( + _, t5_out, _, _, _, t5_attn_mask = text_encoding_strategy.encode_tokens( sd3_tokenize_strategy, [None, None, t5xxl], [None, None, input_ids_t5xxl, None, None, t5_attn_mask] ) @@ -1104,22 +1102,6 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--use_t5xxl_cache_only", action="store_true", help="cache T5-XXL outputs only / T5-XXLの出力のみキャッシュする" ) - parser.add_argument( - "--t5xxl_max_token_length", - type=int, - default=None, - help="maximum token length for T5-XXL. 
256 if omitted / T5-XXLの最大トークン数。省略時は256", - ) - parser.add_argument( - "--apply_lg_attn_mask", - action="store_true", - help="apply attention mask (zero embs) to CLIP-L and G / CLIP-LとGにアテンションマスク(ゼロ埋め)を適用する", - ) - parser.add_argument( - "--apply_t5_attn_mask", - action="store_true", - help="apply attention mask (zero embs) to T5-XXL / T5-XXLにアテンションマスク(ゼロ埋め)を適用する", - ) parser.add_argument( "--learning_rate_te1", diff --git a/sd3_train_network.py b/sd3_train_network.py new file mode 100644 index 000000000..0f4ca93ef --- /dev/null +++ b/sd3_train_network.py @@ -0,0 +1,427 @@ +import argparse +import copy +import math +import random +from typing import Any, Optional + +import torch +from accelerate import Accelerator +from library import strategy_sd3, utils +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() + +from library import flux_models, flux_train_utils, flux_utils, sd3_train_utils, sd3_utils, strategy_base, strategy_sd3, train_util +import train_network +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +class Sd3NetworkTrainer(train_network.NetworkTrainer): + def __init__(self): + super().__init__() + self.sample_prompts_te_outputs = None + self.is_schnell: Optional[bool] = None + + def assert_extra_args(self, args, train_dataset_group): + super().assert_extra_args(args, train_dataset_group) + # sdxl_train_util.verify_sdxl_training_args(args) + + if args.fp8_base_unet: + args.fp8_base = True # if fp8_base_unet is enabled, fp8_base is also enabled for SD3 + + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: + logger.warning( + "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" + ) + args.cache_text_encoder_outputs = True + + if args.cache_text_encoder_outputs: + assert ( + train_dataset_group.is_text_encoder_output_cacheable() + ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" + + # prepare CLIP-L/CLIP-G/T5XXL training flags + self.train_clip = not args.network_train_unet_only + self.train_t5xxl = False # default is False even if args.network_train_unet_only is False + + if args.max_token_length is not None: + logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません") + + train_dataset_group.verify_bucket_reso_steps(32) # TODO check this + + def load_target_model(self, args, weight_dtype, accelerator): + # currently offload to cpu for some models + + # if the file is fp8 and we are using fp8_base, we can load it as is (fp8) + loading_dtype = None if args.fp8_base else weight_dtype + + # if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future + state_dict = utils.load_safetensors( + args.pretrained_model_name_or_path, "cpu", disable_mmap=args.disable_mmap_load_safetensors, dtype=loading_dtype + ) + mmdit = sd3_utils.load_mmdit(state_dict, loading_dtype, "cpu") + self.model_type = mmdit.model_type + + if args.fp8_base: + # check dtype of model + if mmdit.dtype == torch.float8_e4m3fnuz or mmdit.dtype == torch.float8_e5m2 or mmdit.dtype == torch.float8_e5m2fnuz: + raise ValueError(f"Unsupported fp8 model dtype: {mmdit.dtype}") + elif mmdit.dtype == 
torch.float8_e4m3fn: + logger.info("Loaded fp8 SD3 model") + + clip_l = sd3_utils.load_clip_l( + args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors, state_dict=state_dict + ) + clip_l.eval() + clip_g = sd3_utils.load_clip_g( + args.clip_g, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors, state_dict=state_dict + ) + clip_g.eval() + + # if the file is fp8 and we are using fp8_base (not unet), we can load it as is (fp8) + if args.fp8_base and not args.fp8_base_unet: + loading_dtype = None # as is + else: + loading_dtype = weight_dtype + + # loading t5xxl to cpu takes a long time, so we should load to gpu in future + t5xxl = sd3_utils.load_t5xxl( + args.t5xxl, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors, state_dict=state_dict + ) + t5xxl.eval() + if args.fp8_base and not args.fp8_base_unet: + # check dtype of model + if t5xxl.dtype == torch.float8_e4m3fnuz or t5xxl.dtype == torch.float8_e5m2 or t5xxl.dtype == torch.float8_e5m2fnuz: + raise ValueError(f"Unsupported fp8 model dtype: {t5xxl.dtype}") + elif t5xxl.dtype == torch.float8_e4m3fn: + logger.info("Loaded fp8 T5XXL model") + + vae = sd3_utils.load_vae( + args.vae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors, state_dict=state_dict + ) + + return mmdit.model_type, [clip_l, clip_g, t5xxl], vae, mmdit + + def get_tokenize_strategy(self, args): + logger.info(f"t5xxl_max_token_length: {args.t5xxl_max_token_length}") + return strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_max_token_length, args.tokenizer_cache_dir) + + def get_tokenizers(self, tokenize_strategy: strategy_sd3.Sd3TokenizeStrategy): + return [tokenize_strategy.clip_l, tokenize_strategy.clip_g, tokenize_strategy.t5xxl] + + def get_latents_caching_strategy(self, args): + latents_caching_strategy = strategy_sd3.Sd3LatentsCachingStrategy( + args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check + ) + return latents_caching_strategy + + def get_text_encoding_strategy(self, args): + return strategy_sd3.Sd3TextEncodingStrategy(args.apply_lg_attn_mask, args.apply_t5_attn_mask) + + def post_process_network(self, args, accelerator, network, text_encoders, unet): + # check t5xxl is trained or not + self.train_t5xxl = network.train_t5xxl + + if self.train_t5xxl and args.cache_text_encoder_outputs: + raise ValueError( + "T5XXL is trained, so cache_text_encoder_outputs cannot be used / T5XXL学習時はcache_text_encoder_outputsは使用できません" + ) + + def get_models_for_text_encoding(self, args, accelerator, text_encoders): + if args.cache_text_encoder_outputs: + if self.train_clip and not self.train_t5xxl: + return text_encoders[0:2] # only CLIP-L/CLIP-G is needed for encoding because T5XXL is cached + else: + return None # no text encoders are needed for encoding because both are cached + else: + return text_encoders # CLIP-L, CLIP-G and T5XXL are needed for encoding + + def get_text_encoders_train_flags(self, args, text_encoders): + return [self.train_clip, self.train_clip, self.train_t5xxl] + + def get_text_encoder_outputs_caching_strategy(self, args): + if args.cache_text_encoder_outputs: + # if the text encoders is trained, we need tokenization, so is_partial is True + return strategy_sd3.Sd3TextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, + args.text_encoder_batch_size, + args.skip_cache_check, + is_partial=self.train_clip or self.train_t5xxl, + apply_lg_attn_mask=args.apply_lg_attn_mask, + apply_t5_attn_mask=args.apply_t5_attn_mask, + ) + else: + return None 
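
A minimal sketch (not part of the patch) of how the caching flags above combine with the trainable-encoder flags in Sd3NetworkTrainer; the helper name describe_te_handling and its summary strings are invented for this illustration only.

# Hypothetical helper restating the decision made by get_models_for_text_encoding /
# get_text_encoder_outputs_caching_strategy above; assumes the three flags used in the patch.
def describe_te_handling(cache_te_outputs: bool, train_clip: bool, train_t5xxl: bool) -> str:
    if not cache_te_outputs:
        # nothing is cached, so CLIP-L, CLIP-G and T5XXL all encode on every step
        return "encode with CLIP-L, CLIP-G and T5XXL on every step"
    if train_clip and not train_t5xxl:
        # partial cache: the trained CLIP branches are re-encoded, T5XXL outputs come from the cache
        return "re-encode CLIP-L/CLIP-G, reuse cached T5XXL outputs"
    # neither branch is trained, so every output can come from the cache
    # (the patch rejects cache_text_encoder_outputs together with a trained T5XXL in post_process_network)
    return "reuse cached outputs for all three text encoders"

for flags in [(False, True, False), (True, True, False), (True, False, False)]:
    print(flags, "->", describe_te_handling(*flags))

With cache_text_encoder_outputs enabled and only the CLIP encoders trained, is_partial is True, so the caching strategy keeps tokenizing and re-encoding the CLIP branch while the cached T5XXL outputs are reused.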
+ + def cache_text_encoder_outputs_if_needed( + self, args, accelerator: Accelerator, unet, vae, text_encoders, dataset: train_util.DatasetGroup, weight_dtype + ): + if args.cache_text_encoder_outputs: + if not args.lowram: + # メモリ消費を減らす + logger.info("move vae and unet to cpu to save memory") + org_vae_device = vae.device + org_unet_device = unet.device + vae.to("cpu") + unet.to("cpu") + clean_memory_on_device(accelerator.device) + + # When TE is not be trained, it will not be prepared so we need to use explicit autocast + logger.info("move text encoders to gpu") + text_encoders[0].to(accelerator.device, dtype=weight_dtype) # always not fp8 + text_encoders[1].to(accelerator.device, dtype=weight_dtype) # always not fp8 + text_encoders[2].to(accelerator.device) # may be fp8 + + if text_encoders[2].dtype == torch.float8_e4m3fn: + # if we load fp8 weights, the model is already fp8, so we use it as is + self.prepare_text_encoder_fp8(2, text_encoders[2], text_encoders[2].dtype, weight_dtype) + else: + # otherwise, we need to convert it to target dtype + text_encoders[2].to(weight_dtype) + + with accelerator.autocast(): + dataset.new_cache_text_encoder_outputs(text_encoders, accelerator) + + # cache sample prompts + if args.sample_prompts is not None: + logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") + + tokenize_strategy: strategy_sd3.Sd3TokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() + text_encoding_strategy: strategy_sd3.Sd3TextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + + prompts = train_util.load_prompts(args.sample_prompts) + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + with accelerator.autocast(), torch.no_grad(): + for prompt_dict in prompts: + for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: + if p not in sample_prompts_te_outputs: + logger.info(f"cache Text Encoder outputs for prompt: {p}") + tokens_and_masks = tokenize_strategy.tokenize(p) + sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( + tokenize_strategy, + text_encoders, + tokens_and_masks, + args.apply_lg_attn_mask, + args.apply_t5_attn_mask, + ) + self.sample_prompts_te_outputs = sample_prompts_te_outputs + + accelerator.wait_for_everyone() + + # move back to cpu + if not self.is_train_text_encoder(args): + logger.info("move CLIP-L back to cpu") + text_encoders[0].to("cpu") + logger.info("move CLIP-G back to cpu") + text_encoders[1].to("cpu") + logger.info("move t5XXL back to cpu") + text_encoders[2].to("cpu") + clean_memory_on_device(accelerator.device) + + if not args.lowram: + logger.info("move vae and unet back to original device") + vae.to(org_vae_device) + unet.to(org_unet_device) + else: + # Text Encoderから毎回出力を取得するので、GPUに乗せておく + text_encoders[0].to(accelerator.device, dtype=weight_dtype) + text_encoders[1].to(accelerator.device, dtype=weight_dtype) + text_encoders[2].to(accelerator.device) + + # def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): + # noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype + + # # get size embeddings + # orig_size = batch["original_sizes_hw"] + # crop_size = batch["crop_top_lefts"] + # target_size = batch["target_sizes_hw"] + # embs = sdxl_train_util.get_size_embeddings(orig_size, crop_size, target_size, accelerator.device).to(weight_dtype) + + # # concat embeddings + # encoder_hidden_states1, encoder_hidden_states2, pool2 = 
text_conds + # vector_embedding = torch.cat([pool2, embs], dim=1).to(weight_dtype) + # text_embedding = torch.cat([encoder_hidden_states1, encoder_hidden_states2], dim=2).to(weight_dtype) + + # noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) + # return noise_pred + + def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, mmdit): + text_encoders = text_encoder # for compatibility + text_encoders = self.get_models_for_text_encoding(args, accelerator, text_encoders) + + sd3_train_utils.sample_images( + accelerator, args, epoch, global_step, mmdit, vae, text_encoders, self.sample_prompts_te_outputs + ) + + def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: + # shift 3.0 is the default value + noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) + self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) + return noise_scheduler + + def encode_images_to_latents(self, args, accelerator, vae, images): + return vae.encode(images) + + def shift_scale_latents(self, args, latents): + return latents + + def get_noise_pred_and_target( + self, + args, + accelerator, + noise_scheduler, + latents, + batch, + text_encoder_conds, + unet: flux_models.Flux, + network, + weight_dtype, + train_unet, + ): + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = sd3_train_utils.get_noisy_model_input_and_timesteps( + args, self.noise_scheduler_copy, latents, noise, accelerator.device, weight_dtype + ) + + # ensure the hidden state will require grad + if args.gradient_checkpointing: + noisy_model_input.requires_grad_(True) + for t in text_encoder_conds: + if t.dtype.is_floating_point: + t.requires_grad_(True) + + # Predict the noise residual + lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = text_encoder_conds + text_encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() + context, lg_pooled = text_encoding_strategy.concat_encodings(lg_out, t5_out, lg_pooled) + if not args.apply_lg_attn_mask: + l_attn_mask = None + g_attn_mask = None + if not args.apply_t5_attn_mask: + t5_attn_mask = None + + # call model + with accelerator.autocast(): + # TODO support attention mask + model_pred = unet(noisy_model_input, timesteps, context=context, y=lg_pooled) + + # Follow: Section 5 of https://arxiv.org/abs/2206.00364. + # Preconditioning of the model outputs. 
+ model_pred = model_pred * (-sigmas) + noisy_model_input + + # these weighting schemes use a uniform timestep sampling + # and instead post-weight the loss + weighting = sd3_train_utils.compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) + + # flow matching loss + target = latents + + # differential output preservation + if "custom_attributes" in batch: + diff_output_pr_indices = [] + for i, custom_attributes in enumerate(batch["custom_attributes"]): + if "diff_output_preservation" in custom_attributes and custom_attributes["diff_output_preservation"]: + diff_output_pr_indices.append(i) + + if len(diff_output_pr_indices) > 0: + network.set_multiplier(0.0) + with torch.no_grad(), accelerator.autocast(): + model_pred_prior = unet( + noisy_model_input[diff_output_pr_indices], + timesteps[diff_output_pr_indices], + context=context[diff_output_pr_indices], + y=lg_pooled[diff_output_pr_indices], + ) + network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step + + model_pred_prior = model_pred_prior * (-sigmas[diff_output_pr_indices]) + noisy_model_input[diff_output_pr_indices] + + # weighting for differential output preservation is not needed because it is already applied + + target[diff_output_pr_indices] = model_pred_prior.to(target.dtype) + + return model_pred, target, timesteps, None, weighting + + def post_process_loss(self, loss, args, timesteps, noise_scheduler): + return loss + + def get_sai_model_spec(self, args): + return train_util.get_sai_model_spec(None, args, False, True, False, sd3=self.model_type) + + def update_metadata(self, metadata, args): + metadata["ss_apply_lg_attn_mask"] = args.apply_lg_attn_mask + metadata["ss_apply_t5_attn_mask"] = args.apply_t5_attn_mask + metadata["ss_weighting_scheme"] = args.weighting_scheme + metadata["ss_logit_mean"] = args.logit_mean + metadata["ss_logit_std"] = args.logit_std + metadata["ss_mode_scale"] = args.mode_scale + + def is_text_encoder_not_needed_for_training(self, args): + return args.cache_text_encoder_outputs and not self.is_train_text_encoder(args) + + def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): + if index == 0 or index == 1: # CLIP-L/CLIP-G + return super().prepare_text_encoder_grad_ckpt_workaround(index, text_encoder) + else: # T5XXL + text_encoder.encoder.embed_tokens.requires_grad_(True) + + def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): + if index == 0 or index == 1: # CLIP-L/CLIP-G + clip_type = "CLIP-L" if index == 0 else "CLIP-G" + logger.info(f"prepare CLIP-{clip_type} for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}") + text_encoder.to(te_weight_dtype) # fp8 + text_encoder.text_model.embeddings.to(dtype=weight_dtype) + else: # T5XXL + + def prepare_fp8(text_encoder, target_dtype): + def forward_hook(module): + def forward(hidden_states): + hidden_gelu = module.act(module.wi_0(hidden_states)) + hidden_linear = module.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = module.dropout(hidden_states) + + hidden_states = module.wo(hidden_states) + return hidden_states + + return forward + + for module in text_encoder.modules(): + if module.__class__.__name__ in ["T5LayerNorm", "Embedding"]: + # print("set", module.__class__.__name__, "to", target_dtype) + module.to(target_dtype) + if module.__class__.__name__ in ["T5DenseGatedActDense"]: + # print("set", module.__class__.__name__, "hooks") + module.forward = forward_hook(module) + + if 
flux_utils.get_t5xxl_actual_dtype(text_encoder) == torch.float8_e4m3fn and text_encoder.dtype == weight_dtype: + logger.info(f"T5XXL already prepared for fp8") + else: + logger.info(f"prepare T5XXL for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}, add hooks") + text_encoder.to(te_weight_dtype) # fp8 + prepare_fp8(text_encoder, weight_dtype) + + +def setup_parser() -> argparse.ArgumentParser: + parser = train_network.setup_parser() + sd3_train_utils.add_sd3_training_arguments(parser) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + trainer = Sd3NetworkTrainer() + trainer.train(args) diff --git a/train_network.py b/train_network.py index 9943b60bd..aab1d84be 100644 --- a/train_network.py +++ b/train_network.py @@ -129,6 +129,7 @@ def get_text_encoder_outputs_caching_strategy(self, args): def get_models_for_text_encoding(self, args, accelerator, text_encoders): """ Returns a list of models that will be used for text encoding. SDXL uses wrapped and unwrapped models. + FLUX.1 and SD3 may cache some outputs of the text encoder, so return the models that will be used for encoding (not cached). """ return text_encoders @@ -591,6 +592,7 @@ def train(self, args): # unet.to(accelerator.device) # this makes faster `to(dtype)` below, but consumes 23 GB VRAM # unet.to(dtype=unet_weight_dtype) # without moving to gpu, this takes a lot of time and main memory + logger.info(f"set U-Net weight dtype to {unet_weight_dtype}, device to {accelerator.device}") unet.to(accelerator.device, dtype=unet_weight_dtype) # this seems to be safer than above unet.requires_grad_(False) From 0031d916f0fa035d5d48a25fcabadc149bfbb639 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Fri, 25 Oct 2024 23:20:38 +0900 Subject: [PATCH 194/582] add latent scaling/shifting --- sd3_train_network.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sd3_train_network.py b/sd3_train_network.py index 0f4ca93ef..ecacf16cc 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -6,7 +6,7 @@ import torch from accelerate import Accelerator -from library import strategy_sd3, utils +from library import sd3_models, strategy_sd3, utils from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -25,7 +25,6 @@ class Sd3NetworkTrainer(train_network.NetworkTrainer): def __init__(self): super().__init__() self.sample_prompts_te_outputs = None - self.is_schnell: Optional[bool] = None def assert_extra_args(self, args, train_dataset_group): super().assert_extra_args(args, train_dataset_group) @@ -268,7 +267,7 @@ def encode_images_to_latents(self, args, accelerator, vae, images): return vae.encode(images) def shift_scale_latents(self, args, latents): - return latents + return sd3_models.SDVAE.process_in(latents) def get_noise_pred_and_target( self, From 56bf7611644402996072bd8f909cf828ec7b27cc Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sat, 26 Oct 2024 17:29:24 +0900 Subject: [PATCH 195/582] fix errors in SD3 LoRA training with Text Encoders close #1724 --- library/strategy_sd3.py | 26 +++++++++++++------------- sd3_train_network.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py index dd08cf004..a27e99e63 100644 --- a/library/strategy_sd3.py +++ b/library/strategy_sd3.py @@ -68,9 +68,9 @@ def encode_tokens( returned embeddings are not masked """ 
clip_l, clip_g, t5xxl = models - clip_l: CLIPTextModel - clip_g: CLIPTextModelWithProjection - t5xxl: T5EncoderModel + clip_l: Optional[CLIPTextModel] + clip_g: Optional[CLIPTextModelWithProjection] + t5xxl: Optional[T5EncoderModel] if apply_lg_attn_mask is None: apply_lg_attn_mask = self.apply_lg_attn_mask @@ -84,25 +84,23 @@ def encode_tokens( if not apply_lg_attn_mask: l_attn_mask = None g_attn_mask = None - else: - l_attn_mask = l_attn_mask.to(clip_l.device) - g_attn_mask = g_attn_mask.to(clip_g.device) if not apply_t5_attn_mask: t5_attn_mask = None - else: - t5_attn_mask = t5_attn_mask.to(t5xxl.device) else: l_attn_mask = None g_attn_mask = None t5_attn_mask = None - if l_tokens is None: + if l_tokens is None or clip_l is None: assert g_tokens is None, "g_tokens must be None if l_tokens is None" lg_out = None lg_pooled = None else: with torch.no_grad(): assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None" + l_attn_mask = l_attn_mask.to(clip_l.device) if l_attn_mask is not None else None + g_attn_mask = g_attn_mask.to(clip_g.device) if g_attn_mask is not None else None + prompt_embeds = clip_l(l_tokens.to(clip_l.device), l_attn_mask, output_hidden_states=True) l_pooled = prompt_embeds[0] l_out = prompt_embeds.hidden_states[-2] @@ -114,13 +112,15 @@ def encode_tokens( lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) if l_tokens is not None else None lg_out = torch.cat([l_out, g_out], dim=-1) - if t5xxl is not None and t5_tokens is not None: + if t5xxl is None or t5_tokens is None: + t5_out = None + else: + t5_attn_mask = t5_attn_mask.to(t5xxl.device) if t5_attn_mask is not None else None with torch.no_grad(): t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), t5_attn_mask, return_dict=False, output_hidden_states=True) - else: - t5_out = None - return [lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask] # masks are used for attention masking in transformer + # masks are used for attention masking in transformer + return [lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask] def concat_encodings( self, lg_out: torch.Tensor, t5_out: Optional[torch.Tensor], lg_pooled: torch.Tensor diff --git a/sd3_train_network.py b/sd3_train_network.py index ecacf16cc..129afed54 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -134,7 +134,7 @@ def post_process_network(self, args, accelerator, network, text_encoders, unet): def get_models_for_text_encoding(self, args, accelerator, text_encoders): if args.cache_text_encoder_outputs: if self.train_clip and not self.train_t5xxl: - return text_encoders[0:2] # only CLIP-L/CLIP-G is needed for encoding because T5XXL is cached + return text_encoders[0:2] + [None] # only CLIP-L/CLIP-G is needed for encoding because T5XXL is cached else: return None # no text encoders are needed for encoding because both are cached else: From 014064fd8186420abf5dfc7c99ad0b39fee33f8a Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sat, 26 Oct 2024 18:59:45 +0900 Subject: [PATCH 196/582] fix sample image generation without seed failed close #1726 --- library/sd3_train_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index e3c649f73..b04b86fb3 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -316,6 +316,8 @@ def do_sample( # noise = get_noise(seed, latent).to(device) if seed is not None: generator = torch.manual_seed(seed) + else: + generator = None noise = ( torch.randn(latent.size(), dtype=torch.float32, 
layout=latent.layout, generator=generator, device="cpu") .to(latent.dtype) From db2b4d41b9637cffd40a694c8e25847446a57aad Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 27 Oct 2024 16:42:58 +0900 Subject: [PATCH 197/582] Add dropout rate arguments for CLIP-L, CLIP-G, and T5, fix Text Encoders LoRA not trained --- library/sd3_train_utils.py | 18 ++++++++ library/strategy_sd3.py | 93 ++++++++++++++++++++++++++++++++++---- sd3_train.py | 15 ++++-- sd3_train_network.py | 16 ++++++- train_network.py | 13 ++++-- 5 files changed, 138 insertions(+), 17 deletions(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index b04b86fb3..a0202ad40 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -214,6 +214,24 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): action="store_true", help="apply attention mask (zero embs) to T5-XXL / T5-XXLにアテンションマスク(ゼロ埋め)を適用する", ) + parser.add_argument( + "--clip_l_dropout_rate", + type=float, + default=0.0, + help="Dropout rate for CLIP-L encoder, default is 0.0 / CLIP-Lエンコーダのドロップアウト率、デフォルトは0.0", + ) + parser.add_argument( + "--clip_g_dropout_rate", + type=float, + default=0.0, + help="Dropout rate for CLIP-G encoder, default is 0.0 / CLIP-Gエンコーダのドロップアウト率、デフォルトは0.0", + ) + parser.add_argument( + "--t5_dropout_rate", + type=float, + default=0.0, + help="Dropout rate for T5 encoder, default is 0.0 / T5エンコーダのドロップアウト率、デフォルトは0.0", + ) # copy from Diffusers parser.add_argument( diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py index a27e99e63..d87ad7d15 100644 --- a/library/strategy_sd3.py +++ b/library/strategy_sd3.py @@ -1,5 +1,6 @@ import os import glob +import random from typing import Any, List, Optional, Tuple, Union import torch import numpy as np @@ -48,13 +49,23 @@ def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: class Sd3TextEncodingStrategy(TextEncodingStrategy): - def __init__(self, apply_lg_attn_mask: Optional[bool] = None, apply_t5_attn_mask: Optional[bool] = None) -> None: + def __init__( + self, + apply_lg_attn_mask: Optional[bool] = None, + apply_t5_attn_mask: Optional[bool] = None, + l_dropout_rate: float = 0.0, + g_dropout_rate: float = 0.0, + t5_dropout_rate: float = 0.0, + ) -> None: """ Args: apply_t5_attn_mask: Default value for apply_t5_attn_mask. """ self.apply_lg_attn_mask = apply_lg_attn_mask self.apply_t5_attn_mask = apply_t5_attn_mask + self.l_dropout_rate = l_dropout_rate + self.g_dropout_rate = g_dropout_rate + self.t5_dropout_rate = t5_dropout_rate def encode_tokens( self, @@ -63,6 +74,7 @@ def encode_tokens( tokens: List[torch.Tensor], apply_lg_attn_mask: Optional[bool] = False, apply_t5_attn_mask: Optional[bool] = False, + enable_dropout: bool = True, ) -> List[torch.Tensor]: """ returned embeddings are not masked @@ -91,37 +103,92 @@ def encode_tokens( g_attn_mask = None t5_attn_mask = None + # dropout: if enable_dropout is False, dropout is not applied. 
dropout means zeroing out embeddings + if l_tokens is None or clip_l is None: assert g_tokens is None, "g_tokens must be None if l_tokens is None" lg_out = None lg_pooled = None else: - with torch.no_grad(): - assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None" + assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None" + + drop_l = enable_dropout and (self.l_dropout_rate > 0.0 and random.random() < self.l_dropout_rate) + if drop_l: + l_pooled = torch.zeros((l_tokens.shape[0], 768), device=l_tokens.device, dtype=l_tokens.dtype) + l_out = torch.zeros((l_tokens.shape[0], l_tokens.shape[1], 768), device=l_tokens.device, dtype=l_tokens.dtype) + if l_attn_mask is not None: + l_attn_mask = torch.zeros_like(l_attn_mask) + else: l_attn_mask = l_attn_mask.to(clip_l.device) if l_attn_mask is not None else None - g_attn_mask = g_attn_mask.to(clip_g.device) if g_attn_mask is not None else None - prompt_embeds = clip_l(l_tokens.to(clip_l.device), l_attn_mask, output_hidden_states=True) l_pooled = prompt_embeds[0] l_out = prompt_embeds.hidden_states[-2] + drop_g = enable_dropout and (self.g_dropout_rate > 0.0 and random.random() < self.g_dropout_rate) + if drop_g: + g_pooled = torch.zeros((g_tokens.shape[0], 1280), device=g_tokens.device, dtype=g_tokens.dtype) + g_out = torch.zeros((g_tokens.shape[0], g_tokens.shape[1], 1280), device=g_tokens.device, dtype=g_tokens.dtype) + if g_attn_mask is not None: + g_attn_mask = torch.zeros_like(g_attn_mask) + else: + g_attn_mask = g_attn_mask.to(clip_g.device) if g_attn_mask is not None else None prompt_embeds = clip_g(g_tokens.to(clip_g.device), g_attn_mask, output_hidden_states=True) g_pooled = prompt_embeds[0] g_out = prompt_embeds.hidden_states[-2] - lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) if l_tokens is not None else None - lg_out = torch.cat([l_out, g_out], dim=-1) + lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) if l_tokens is not None else None + lg_out = torch.cat([l_out, g_out], dim=-1) if t5xxl is None or t5_tokens is None: t5_out = None else: - t5_attn_mask = t5_attn_mask.to(t5xxl.device) if t5_attn_mask is not None else None - with torch.no_grad(): + drop_t5 = enable_dropout and (self.t5_dropout_rate > 0.0 and random.random() < self.t5_dropout_rate) + if drop_t5: + t5_out = torch.zeros((t5_tokens.shape[0], t5_tokens.shape[1], 4096), device=t5_tokens.device, dtype=t5_tokens.dtype) + if t5_attn_mask is not None: + t5_attn_mask = torch.zeros_like(t5_attn_mask) + else: + t5_attn_mask = t5_attn_mask.to(t5xxl.device) if t5_attn_mask is not None else None t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), t5_attn_mask, return_dict=False, output_hidden_states=True) # masks are used for attention masking in transformer return [lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask] + def drop_cached_text_encoder_outputs( + self, + lg_out: torch.Tensor, + t5_out: torch.Tensor, + lg_pooled: torch.Tensor, + l_attn_mask: torch.Tensor, + g_attn_mask: torch.Tensor, + t5_attn_mask: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + # dropout: if enable_dropout is True, dropout is not applied. 
dropout means zeroing out embeddings + if lg_out is not None: + for i in range(lg_out.shape[0]): + drop_l = self.l_dropout_rate > 0.0 and random.random() < self.l_dropout_rate + if drop_l: + lg_out[i, :, :768] = torch.zeros_like(lg_out[i, :, :768]) + lg_pooled[i, :768] = torch.zeros_like(lg_pooled[i, :768]) + if l_attn_mask is not None: + l_attn_mask[i] = torch.zeros_like(l_attn_mask[i]) + drop_g = self.g_dropout_rate > 0.0 and random.random() < self.g_dropout_rate + if drop_g: + lg_out[i, :, 768:] = torch.zeros_like(lg_out[i, :, 768:]) + lg_pooled[i, 768:] = torch.zeros_like(lg_pooled[i, 768:]) + if g_attn_mask is not None: + g_attn_mask[i] = torch.zeros_like(g_attn_mask[i]) + + if t5_out is not None: + for i in range(t5_out.shape[0]): + drop_t5 = self.t5_dropout_rate > 0.0 and random.random() < self.t5_dropout_rate + if drop_t5: + t5_out[i] = torch.zeros_like(t5_out[i]) + if t5_attn_mask is not None: + t5_attn_mask[i] = torch.zeros_like(t5_attn_mask[i]) + + return lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask + def concat_encodings( self, lg_out: torch.Tensor, t5_out: Optional[torch.Tensor], lg_pooled: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -207,8 +274,14 @@ def cache_batch_outputs( tokens_and_masks = tokenize_strategy.tokenize(captions) with torch.no_grad(): + # always disable dropout during caching lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = sd3_text_encoding_strategy.encode_tokens( - tokenize_strategy, models, tokens_and_masks, self.apply_lg_attn_mask, self.apply_t5_attn_mask + tokenize_strategy, + models, + tokens_and_masks, + apply_lg_attn_mask=self.apply_lg_attn_mask, + apply_t5_attn_mask=self.apply_t5_attn_mask, + enable_dropout=False, ) if lg_out.dtype == torch.bfloat16: diff --git a/sd3_train.py b/sd3_train.py index d12f7f56b..cdac945e6 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -69,6 +69,11 @@ def train(args): # assert ( # not args.train_text_encoder or not args.cache_text_encoder_outputs # ), "cache_text_encoder_outputs is not supported when training text encoder / text encoderを学習するときはcache_text_encoder_outputsはサポートされていません" + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: + logger.warning( + "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" + ) + args.cache_text_encoder_outputs = True assert not args.train_text_encoder or (args.use_t5xxl_cache_only or not args.cache_text_encoder_outputs), ( "when training text encoder, text encoder outputs must not be cached (except for T5XXL)" @@ -232,7 +237,9 @@ def train(args): assert clip_l is not None and clip_g is not None and t5xxl is not None, "clip_l, clip_g, t5xxl must be specified" # prepare text encoding strategy - text_encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy(args.apply_lg_attn_mask, args.apply_t5_attn_mask) + text_encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy( + args.apply_lg_attn_mask, args.apply_t5_attn_mask, args.clip_l_dropout_rate, args.clip_g_dropout_rate, args.t5_dropout_rate + ) strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) # 学習を準備する:モデルを適切な状態にする @@ -311,6 +318,7 @@ def train(args): tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask, + enable_dropout=False, ) accelerator.wait_for_everyone() @@ -863,6 +871,7 @@ def optimizer_hook(parameter: torch.Tensor): text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) 
if text_encoder_outputs_list is not None: + text_encoder_outputs_list = text_encoding_strategy.drop_cached_text_encoder_outputs(*text_encoder_outputs_list) lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = text_encoder_outputs_list if args.use_t5xxl_cache_only: lg_out = None @@ -878,7 +887,7 @@ def optimizer_hook(parameter: torch.Tensor): if lg_out is None: # not cached or training, so get from text encoders input_ids_clip_l, input_ids_clip_g, _, l_attn_mask, g_attn_mask, _ = batch["input_ids_list"] - with torch.set_grad_enabled(args.train_text_encoder): + with torch.set_grad_enabled(train_clip): # TODO support weighted captions # text models in sd3_models require "cpu" for input_ids input_ids_clip_l = input_ids_clip_l.to("cpu") @@ -891,7 +900,7 @@ def optimizer_hook(parameter: torch.Tensor): if t5_out is None: _, _, input_ids_t5xxl, _, _, t5_attn_mask = batch["input_ids_list"] - with torch.no_grad(): + with torch.set_grad_enabled(train_t5xxl): input_ids_t5xxl = input_ids_t5xxl.to("cpu") if t5_out is None else None _, t5_out, _, _, _, t5_attn_mask = text_encoding_strategy.encode_tokens( sd3_tokenize_strategy, [None, None, t5xxl], [None, None, input_ids_t5xxl, None, None, t5_attn_mask] diff --git a/sd3_train_network.py b/sd3_train_network.py index 129afed54..7b5471274 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -120,7 +120,13 @@ def get_latents_caching_strategy(self, args): return latents_caching_strategy def get_text_encoding_strategy(self, args): - return strategy_sd3.Sd3TextEncodingStrategy(args.apply_lg_attn_mask, args.apply_t5_attn_mask) + return strategy_sd3.Sd3TextEncodingStrategy( + args.apply_lg_attn_mask, + args.apply_t5_attn_mask, + args.clip_l_dropout_rate, + args.clip_g_dropout_rate, + args.t5xxl_dropout_rate, + ) def post_process_network(self, args, accelerator, network, text_encoders, unet): # check t5xxl is trained or not @@ -408,6 +414,14 @@ def forward(hidden_states): text_encoder.to(te_weight_dtype) # fp8 prepare_fp8(text_encoder, weight_dtype) + def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): + # drop cached text encoder outputs + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + text_encodoing_strategy: strategy_sd3.Sd3TextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + text_encoder_outputs_list = text_encodoing_strategy.drop_cached_text_encoder_outputs(text_encoder_outputs_list) + batch["text_encoder_outputs_list"] = text_encoder_outputs_list + def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() diff --git a/train_network.py b/train_network.py index aab1d84be..9d78a4ef2 100644 --- a/train_network.py +++ b/train_network.py @@ -272,6 +272,9 @@ def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): text_encoder.text_model.embeddings.to(dtype=weight_dtype) + def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): + pass + # endregion def train(self, args): @@ -1030,9 +1033,9 @@ def load_model_hook(models, input_dir): # callback for step start if hasattr(accelerator.unwrap_model(network), "on_step_start"): - on_step_start = accelerator.unwrap_model(network).on_step_start + on_step_start_for_network = accelerator.unwrap_model(network).on_step_start else: - on_step_start = lambda *args, **kwargs: None + 
on_step_start_for_network = lambda *args, **kwargs: None # function for saving/removing def save_model(ckpt_name, unwrapped_nw, steps, epoch_no, force_sync_upload=False): @@ -1113,7 +1116,10 @@ def remove_model(old_ckpt_name): continue with accelerator.accumulate(training_model): - on_step_start(text_encoder, unet) + on_step_start_for_network(text_encoder, unet) + + # temporary, for batch processing + self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) @@ -1146,6 +1152,7 @@ def remove_model(old_ckpt_name): if text_encoder_outputs_list is not None: text_encoder_conds = text_encoder_outputs_list # List of text encoder outputs if len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder: + # TODO this does not work if 'some text_encoders are trained' and 'some are not and not cached' with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: From a1255d637f545b0d6defebf080ca31f2370bf311 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sun, 27 Oct 2024 17:03:36 +0900 Subject: [PATCH 198/582] Fix SD3 LoRA training to work (WIP) --- library/strategy_sd3.py | 20 ++++++++++---------- sd3_train_network.py | 15 ++++++++------- train_network.py | 20 ++++++++++++++++++++ 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py index d87ad7d15..e57bb337e 100644 --- a/library/strategy_sd3.py +++ b/library/strategy_sd3.py @@ -111,13 +111,13 @@ def encode_tokens( lg_pooled = None else: assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None" - + drop_l = enable_dropout and (self.l_dropout_rate > 0.0 and random.random() < self.l_dropout_rate) if drop_l: - l_pooled = torch.zeros((l_tokens.shape[0], 768), device=l_tokens.device, dtype=l_tokens.dtype) - l_out = torch.zeros((l_tokens.shape[0], l_tokens.shape[1], 768), device=l_tokens.device, dtype=l_tokens.dtype) + l_pooled = torch.zeros((l_tokens.shape[0], 768), device=clip_l.device, dtype=clip_l.dtype) + l_out = torch.zeros((l_tokens.shape[0], l_tokens.shape[1], 768), device=clip_l.device, dtype=clip_l.dtype) if l_attn_mask is not None: - l_attn_mask = torch.zeros_like(l_attn_mask) + l_attn_mask = torch.zeros_like(l_attn_mask, device=clip_l.device) else: l_attn_mask = l_attn_mask.to(clip_l.device) if l_attn_mask is not None else None prompt_embeds = clip_l(l_tokens.to(clip_l.device), l_attn_mask, output_hidden_states=True) @@ -126,10 +126,10 @@ def encode_tokens( drop_g = enable_dropout and (self.g_dropout_rate > 0.0 and random.random() < self.g_dropout_rate) if drop_g: - g_pooled = torch.zeros((g_tokens.shape[0], 1280), device=g_tokens.device, dtype=g_tokens.dtype) - g_out = torch.zeros((g_tokens.shape[0], g_tokens.shape[1], 1280), device=g_tokens.device, dtype=g_tokens.dtype) + g_pooled = torch.zeros((g_tokens.shape[0], 1280), device=clip_g.device, dtype=clip_g.dtype) + g_out = torch.zeros((g_tokens.shape[0], g_tokens.shape[1], 1280), device=clip_g.device, dtype=clip_g.dtype) if g_attn_mask is not None: - g_attn_mask = torch.zeros_like(g_attn_mask) + g_attn_mask = torch.zeros_like(g_attn_mask, device=clip_g.device) else: g_attn_mask = g_attn_mask.to(clip_g.device) if g_attn_mask is not None else None prompt_embeds = clip_g(g_tokens.to(clip_g.device), g_attn_mask, output_hidden_states=True) @@ -144,9 +144,9 @@ 
def encode_tokens( else: drop_t5 = enable_dropout and (self.t5_dropout_rate > 0.0 and random.random() < self.t5_dropout_rate) if drop_t5: - t5_out = torch.zeros((t5_tokens.shape[0], t5_tokens.shape[1], 4096), device=t5_tokens.device, dtype=t5_tokens.dtype) + t5_out = torch.zeros((t5_tokens.shape[0], t5_tokens.shape[1], 4096), device=t5xxl.device, dtype=t5xxl.dtype) if t5_attn_mask is not None: - t5_attn_mask = torch.zeros_like(t5_attn_mask) + t5_attn_mask = torch.zeros_like(t5_attn_mask, device=t5xxl.device) else: t5_attn_mask = t5_attn_mask.to(t5xxl.device) if t5_attn_mask is not None else None t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), t5_attn_mask, return_dict=False, output_hidden_states=True) @@ -187,7 +187,7 @@ def drop_cached_text_encoder_outputs( if t5_attn_mask is not None: t5_attn_mask[i] = torch.zeros_like(t5_attn_mask[i]) - return lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask + return [lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask] def concat_encodings( self, lg_out: torch.Tensor, t5_out: Optional[torch.Tensor], lg_pooled: torch.Tensor diff --git a/sd3_train_network.py b/sd3_train_network.py index 7b5471274..620a336fd 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -125,7 +125,7 @@ def get_text_encoding_strategy(self, args): args.apply_t5_attn_mask, args.clip_l_dropout_rate, args.clip_g_dropout_rate, - args.t5xxl_dropout_rate, + args.t5_dropout_rate, ) def post_process_network(self, args, accelerator, network, text_encoders, unet): @@ -415,12 +415,13 @@ def forward(hidden_states): prepare_fp8(text_encoder, weight_dtype) def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): - # drop cached text encoder outputs - text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) - if text_encoder_outputs_list is not None: - text_encodoing_strategy: strategy_sd3.Sd3TextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() - text_encoder_outputs_list = text_encodoing_strategy.drop_cached_text_encoder_outputs(text_encoder_outputs_list) - batch["text_encoder_outputs_list"] = text_encoder_outputs_list + # # drop cached text encoder outputs + # text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + # if text_encoder_outputs_list is not None: + # text_encodoing_strategy: strategy_sd3.Sd3TextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + # text_encoder_outputs_list = text_encodoing_strategy.drop_cached_text_encoder_outputs(*text_encoder_outputs_list) + # batch["text_encoder_outputs_list"] = text_encoder_outputs_list + pass def setup_parser() -> argparse.ArgumentParser: diff --git a/train_network.py b/train_network.py index 9d78a4ef2..76936b2ed 100644 --- a/train_network.py +++ b/train_network.py @@ -1151,6 +1151,17 @@ def remove_model(old_ckpt_name): text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: text_encoder_conds = text_encoder_outputs_list # List of text encoder outputs + + # if text_encoder_outputs_list is not None: + # lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = text_encoder_outputs_list + # for i in range(len(lg_out)): + # print( + # f"[{i}] cached L: {lg_out[i,:,:768].max()}, {lg_pooled[i][:768].max()}, cached G: {lg_out[i,:,768:].max()}, {lg_pooled[i][768:].max()}, " + # f"cached T5: {t5_out[i].max()}, " + # f"attn mask: {l_attn_mask[i].max() if l_attn_mask is not None else 0}," + # f" {g_attn_mask[i].max() if g_attn_mask is not 
None else 0}, {t5_attn_mask[i].max() if t5_attn_mask is not None else 0}" + # ) + if len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder: # TODO this does not work if 'some text_encoders are trained' and 'some are not and not cached' with torch.set_grad_enabled(train_text_encoder), accelerator.autocast(): @@ -1182,6 +1193,15 @@ def remove_model(old_ckpt_name): if encoded_text_encoder_conds[i] is not None: text_encoder_conds[i] = encoded_text_encoder_conds[i] + # lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = text_encoder_conds + # for i in range(len(lg_out)): + # print( + # f"[{i}] train L: {lg_out[i,:,:768].max()}, {lg_pooled[i][:768].max()}, train G: {lg_out[i,:,768:].max()}, {lg_pooled[i][768:].max()}, " + # f"train T5: {t5_out[i].max()}, " + # f"attn mask: {l_attn_mask[i].max() if l_attn_mask is not None else 0}," + # f" {g_attn_mask[i].max() if g_attn_mask is not None else 0}, {t5_attn_mask[i].max() if t5_attn_mask is not None else 0}" + # ) + # sample noise, call unet, get target noise_pred, target, timesteps, huber_c, weighting = self.get_noise_pred_and_target( args, From d4f7849592c78455ddd268423528830ec5e55f47 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sun, 27 Oct 2024 19:35:56 +0900 Subject: [PATCH 199/582] prevent unintended cast for disk cached TE outputs --- library/train_util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index d3c59ef98..d568523ca 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1615,7 +1615,6 @@ def __getitem__(self, index): text_encoder_outputs = self.text_encoder_output_caching_strategy.load_outputs_npz( image_info.text_encoder_outputs_npz ) - text_encoder_outputs = [torch.FloatTensor(x) for x in text_encoder_outputs] else: tokenization_required = True text_encoder_outputs_list.append(text_encoder_outputs) From 1065dd1b56b4b18e211d3827fe22b459c81dd12c Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sun, 27 Oct 2024 19:36:36 +0900 Subject: [PATCH 200/582] Fix to work dropout_rate for TEs --- flux_train_network.py | 2 +- library/strategy_flux.py | 1 + library/strategy_sd3.py | 142 +++++++++++++++++++++++++++------------ sd3_train_network.py | 15 ++--- train_network.py | 19 ------ 5 files changed, 108 insertions(+), 71 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index cffeb3b19..2b71a8979 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -363,7 +363,7 @@ def get_noise_pred_and_target( if args.gradient_checkpointing: noisy_model_input.requires_grad_(True) for t in text_encoder_conds: - if t.dtype.is_floating_point: + if t is not None and t.dtype.is_floating_point: t.requires_grad_(True) img_ids.requires_grad_(True) guidance_vec.requires_grad_(True) diff --git a/library/strategy_flux.py b/library/strategy_flux.py index 0b0c34af7..f662b62e9 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -190,6 +190,7 @@ def cache_batch_outputs( apply_t5_attn_mask=apply_t5_attn_mask_i, ) else: + # it's fine that attn mask is not None. 
it's overwritten before calling the model if necessary info.text_encoder_outputs = (l_pooled_i, t5_out_i, txt_ids_i, t5_attn_mask_i) diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py index e57bb337e..413169ecc 100644 --- a/library/strategy_sd3.py +++ b/library/strategy_sd3.py @@ -89,19 +89,7 @@ def encode_tokens( if apply_t5_attn_mask is None: apply_t5_attn_mask = self.apply_t5_attn_mask - l_tokens, g_tokens, t5_tokens = tokens[:3] - - if len(tokens) > 3: - l_attn_mask, g_attn_mask, t5_attn_mask = tokens[3:] - if not apply_lg_attn_mask: - l_attn_mask = None - g_attn_mask = None - if not apply_t5_attn_mask: - t5_attn_mask = None - else: - l_attn_mask = None - g_attn_mask = None - t5_attn_mask = None + l_tokens, g_tokens, t5_tokens, l_attn_mask, g_attn_mask, t5_attn_mask = tokens # dropout: if enable_dropout is False, dropout is not applied. dropout means zeroing out embeddings @@ -109,47 +97,114 @@ def encode_tokens( assert g_tokens is None, "g_tokens must be None if l_tokens is None" lg_out = None lg_pooled = None + l_attn_mask = None + g_attn_mask = None else: assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None" - drop_l = enable_dropout and (self.l_dropout_rate > 0.0 and random.random() < self.l_dropout_rate) - if drop_l: - l_pooled = torch.zeros((l_tokens.shape[0], 768), device=clip_l.device, dtype=clip_l.dtype) - l_out = torch.zeros((l_tokens.shape[0], l_tokens.shape[1], 768), device=clip_l.device, dtype=clip_l.dtype) - if l_attn_mask is not None: - l_attn_mask = torch.zeros_like(l_attn_mask, device=clip_l.device) + # drop some members of the batch: we do not call clip_l and clip_g for dropped members + batch_size, l_seq_len = l_tokens.shape + g_seq_len = g_tokens.shape[1] + + non_drop_l_indices = [] + non_drop_g_indices = [] + for i in range(l_tokens.shape[0]): + drop_l = enable_dropout and (self.l_dropout_rate > 0.0 and random.random() < self.l_dropout_rate) + drop_g = enable_dropout and (self.g_dropout_rate > 0.0 and random.random() < self.g_dropout_rate) + if not drop_l: + non_drop_l_indices.append(i) + if not drop_g: + non_drop_g_indices.append(i) + + # filter out dropped members + if len(non_drop_l_indices) > 0 and len(non_drop_l_indices) < batch_size: + l_tokens = l_tokens[non_drop_l_indices] + l_attn_mask = l_attn_mask[non_drop_l_indices] + if len(non_drop_g_indices) > 0 and len(non_drop_g_indices) < batch_size: + g_tokens = g_tokens[non_drop_g_indices] + g_attn_mask = g_attn_mask[non_drop_g_indices] + + # call clip_l for non-dropped members + if len(non_drop_l_indices) > 0: + nd_l_attn_mask = l_attn_mask.to(clip_l.device) + prompt_embeds = clip_l( + l_tokens.to(clip_l.device), nd_l_attn_mask if apply_lg_attn_mask else None, output_hidden_states=True + ) + nd_l_pooled = prompt_embeds[0] + nd_l_out = prompt_embeds.hidden_states[-2] + if len(non_drop_g_indices) > 0: + nd_g_attn_mask = g_attn_mask.to(clip_g.device) + prompt_embeds = clip_g( + g_tokens.to(clip_g.device), nd_g_attn_mask if apply_lg_attn_mask else None, output_hidden_states=True + ) + nd_g_pooled = prompt_embeds[0] + nd_g_out = prompt_embeds.hidden_states[-2] + + # fill in the dropped members + if len(non_drop_l_indices) == batch_size: + l_pooled = nd_l_pooled + l_out = nd_l_out else: - l_attn_mask = l_attn_mask.to(clip_l.device) if l_attn_mask is not None else None - prompt_embeds = clip_l(l_tokens.to(clip_l.device), l_attn_mask, output_hidden_states=True) - l_pooled = prompt_embeds[0] - l_out = prompt_embeds.hidden_states[-2] - - drop_g = enable_dropout and 
(self.g_dropout_rate > 0.0 and random.random() < self.g_dropout_rate) - if drop_g: - g_pooled = torch.zeros((g_tokens.shape[0], 1280), device=clip_g.device, dtype=clip_g.dtype) - g_out = torch.zeros((g_tokens.shape[0], g_tokens.shape[1], 1280), device=clip_g.device, dtype=clip_g.dtype) - if g_attn_mask is not None: - g_attn_mask = torch.zeros_like(g_attn_mask, device=clip_g.device) + # model output is always float32 because of the models are wrapped with Accelerator + l_pooled = torch.zeros((batch_size, 768), device=clip_l.device, dtype=torch.float32) + l_out = torch.zeros((batch_size, l_seq_len, 768), device=clip_l.device, dtype=torch.float32) + l_attn_mask = torch.zeros((batch_size, l_seq_len), device=clip_l.device, dtype=l_attn_mask.dtype) + if len(non_drop_l_indices) > 0: + l_pooled[non_drop_l_indices] = nd_l_pooled + l_out[non_drop_l_indices] = nd_l_out + l_attn_mask[non_drop_l_indices] = nd_l_attn_mask + + if len(non_drop_g_indices) == batch_size: + g_pooled = nd_g_pooled + g_out = nd_g_out else: - g_attn_mask = g_attn_mask.to(clip_g.device) if g_attn_mask is not None else None - prompt_embeds = clip_g(g_tokens.to(clip_g.device), g_attn_mask, output_hidden_states=True) - g_pooled = prompt_embeds[0] - g_out = prompt_embeds.hidden_states[-2] - - lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) if l_tokens is not None else None + g_pooled = torch.zeros((batch_size, 1280), device=clip_g.device, dtype=torch.float32) + g_out = torch.zeros((batch_size, g_seq_len, 1280), device=clip_g.device, dtype=torch.float32) + g_attn_mask = torch.zeros((batch_size, g_seq_len), device=clip_g.device, dtype=g_attn_mask.dtype) + if len(non_drop_g_indices) > 0: + g_pooled[non_drop_g_indices] = nd_g_pooled + g_out[non_drop_g_indices] = nd_g_out + g_attn_mask[non_drop_g_indices] = nd_g_attn_mask + + lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) lg_out = torch.cat([l_out, g_out], dim=-1) if t5xxl is None or t5_tokens is None: t5_out = None + t5_attn_mask = None else: - drop_t5 = enable_dropout and (self.t5_dropout_rate > 0.0 and random.random() < self.t5_dropout_rate) - if drop_t5: - t5_out = torch.zeros((t5_tokens.shape[0], t5_tokens.shape[1], 4096), device=t5xxl.device, dtype=t5xxl.dtype) - if t5_attn_mask is not None: - t5_attn_mask = torch.zeros_like(t5_attn_mask, device=t5xxl.device) + # drop some members of the batch: we do not call t5xxl for dropped members + batch_size, t5_seq_len = t5_tokens.shape + non_drop_t5_indices = [] + for i in range(t5_tokens.shape[0]): + drop_t5 = enable_dropout and (self.t5_dropout_rate > 0.0 and random.random() < self.t5_dropout_rate) + if not drop_t5: + non_drop_t5_indices.append(i) + + # filter out dropped members + if len(non_drop_t5_indices) > 0 and len(non_drop_t5_indices) < batch_size: + t5_tokens = t5_tokens[non_drop_t5_indices] + t5_attn_mask = t5_attn_mask[non_drop_t5_indices] + + # call t5xxl for non-dropped members + if len(non_drop_t5_indices) > 0: + nd_t5_attn_mask = t5_attn_mask.to(t5xxl.device) + nd_t5_out, _ = t5xxl( + t5_tokens.to(t5xxl.device), + nd_t5_attn_mask if apply_t5_attn_mask else None, + return_dict=False, + output_hidden_states=True, + ) + + # fill in the dropped members + if len(non_drop_t5_indices) == batch_size: + t5_out = nd_t5_out else: - t5_attn_mask = t5_attn_mask.to(t5xxl.device) if t5_attn_mask is not None else None - t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), t5_attn_mask, return_dict=False, output_hidden_states=True) + t5_out = torch.zeros((batch_size, t5_seq_len, 4096), device=t5xxl.device, dtype=torch.float32) + 
t5_attn_mask = torch.zeros((batch_size, t5_seq_len), device=t5xxl.device, dtype=t5_attn_mask.dtype) + if len(non_drop_t5_indices) > 0: + t5_out[non_drop_t5_indices] = nd_t5_out + t5_attn_mask[non_drop_t5_indices] = nd_t5_attn_mask # masks are used for attention masking in transformer return [lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask] @@ -322,6 +377,7 @@ def cache_batch_outputs( apply_t5_attn_mask=apply_t5_attn_mask, ) else: + # it's fine that attn mask is not None. it's overwritten before calling the model if necessary info.text_encoder_outputs = (lg_out_i, t5_out_i, lg_pooled_i, l_attn_mask_i, g_attn_mask_i, t5_attn_mask_i) diff --git a/sd3_train_network.py b/sd3_train_network.py index 620a336fd..3506404ae 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -300,7 +300,7 @@ def get_noise_pred_and_target( if args.gradient_checkpointing: noisy_model_input.requires_grad_(True) for t in text_encoder_conds: - if t.dtype.is_floating_point: + if t is not None and t.dtype.is_floating_point: t.requires_grad_(True) # Predict the noise residual @@ -415,13 +415,12 @@ def forward(hidden_states): prepare_fp8(text_encoder, weight_dtype) def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): - # # drop cached text encoder outputs - # text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) - # if text_encoder_outputs_list is not None: - # text_encodoing_strategy: strategy_sd3.Sd3TextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() - # text_encoder_outputs_list = text_encodoing_strategy.drop_cached_text_encoder_outputs(*text_encoder_outputs_list) - # batch["text_encoder_outputs_list"] = text_encoder_outputs_list - pass + # drop cached text encoder outputs + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + text_encodoing_strategy: strategy_sd3.Sd3TextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + text_encoder_outputs_list = text_encodoing_strategy.drop_cached_text_encoder_outputs(*text_encoder_outputs_list) + batch["text_encoder_outputs_list"] = text_encoder_outputs_list def setup_parser() -> argparse.ArgumentParser: diff --git a/train_network.py b/train_network.py index 76936b2ed..b90aa420e 100644 --- a/train_network.py +++ b/train_network.py @@ -1151,16 +1151,6 @@ def remove_model(old_ckpt_name): text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: text_encoder_conds = text_encoder_outputs_list # List of text encoder outputs - - # if text_encoder_outputs_list is not None: - # lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = text_encoder_outputs_list - # for i in range(len(lg_out)): - # print( - # f"[{i}] cached L: {lg_out[i,:,:768].max()}, {lg_pooled[i][:768].max()}, cached G: {lg_out[i,:,768:].max()}, {lg_pooled[i][768:].max()}, " - # f"cached T5: {t5_out[i].max()}, " - # f"attn mask: {l_attn_mask[i].max() if l_attn_mask is not None else 0}," - # f" {g_attn_mask[i].max() if g_attn_mask is not None else 0}, {t5_attn_mask[i].max() if t5_attn_mask is not None else 0}" - # ) if len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder: # TODO this does not work if 'some text_encoders are trained' and 'some are not and not cached' @@ -1193,15 +1183,6 @@ def remove_model(old_ckpt_name): if encoded_text_encoder_conds[i] is not None: text_encoder_conds[i] = encoded_text_encoder_conds[i] - # lg_out, 
t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = text_encoder_conds - # for i in range(len(lg_out)): - # print( - # f"[{i}] train L: {lg_out[i,:,:768].max()}, {lg_pooled[i][:768].max()}, train G: {lg_out[i,:,768:].max()}, {lg_pooled[i][768:].max()}, " - # f"train T5: {t5_out[i].max()}, " - # f"attn mask: {l_attn_mask[i].max() if l_attn_mask is not None else 0}," - # f" {g_attn_mask[i].max() if g_attn_mask is not None else 0}, {t5_attn_mask[i].max() if t5_attn_mask is not None else 0}" - # ) - # sample noise, call unet, get target noise_pred, target, timesteps, huber_c, weighting = self.get_noise_pred_and_target( args, From af8e216035128767234163a24debf2f4df5aa36d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 28 Oct 2024 22:08:57 +0900 Subject: [PATCH 201/582] Fix sample image gen to work with block swap --- library/sd3_train_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index a0202ad40..054d1b4a1 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -364,6 +364,7 @@ def do_sample( x_c_nc = torch.cat([x, x], dim=0) # print(x_c_nc.shape, timestep.shape, c_crossattn.shape, y.shape) + mmdit.prepare_block_swap_before_forward() model_output = mmdit(x_c_nc, timestep, context=c_crossattn, y=y) model_output = model_output.float() batched = model_sampling.calculate_denoised(sigma_hat, model_output, x) @@ -385,6 +386,7 @@ def do_sample( x = x + d * dt x = x.to(dtype) + mmdit.prepare_block_swap_before_forward() return x From 75554867ce390ec0957cc52a70c0695e19c71fe2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 29 Oct 2024 08:34:31 +0900 Subject: [PATCH 202/582] Fix error on saving T5XXL --- library/sd3_train_utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 054d1b4a1..1702e81c2 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -75,7 +75,14 @@ def update_sd(prefix, sd): save_file(clip_g.state_dict(), clip_g_path) if t5xxl is not None: t5xxl_path = ckpt_path.replace(".safetensors", "_t5xxl.safetensors") - save_file(t5xxl.state_dict(), t5xxl_path) + t5xxl_state_dict = t5xxl.state_dict() + + # replace "shared.weight" with copy of it to avoid annoying shared tensor error on safetensors.save_file + shared_weight = t5xxl_state_dict["shared.weight"] + shared_weight_copy = shared_weight.detach().clone() + t5xxl_state_dict["shared.weight"] = shared_weight_copy + + save_file(t5xxl_state_dict, t5xxl_path) def save_sd3_model_on_train_end( From 0af4edd8a63d7fcdf02bdcbd11b8770fd1cae162 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Tue, 29 Oct 2024 21:51:56 +0900 Subject: [PATCH 203/582] Fix split_qkv --- networks/lora_sd3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/networks/lora_sd3.py b/networks/lora_sd3.py index cbabf8da0..249298b39 100644 --- a/networks/lora_sd3.py +++ b/networks/lora_sd3.py @@ -540,8 +540,8 @@ def state_dict(self, destination=None, prefix="", keep_vars=False): down_weight = torch.cat(down_weights, dim=0) # (rank, split_dim) * 3 -> (rank*3, sum of split_dim) # merge up weight (sum of split_dim, rank*3) - qkv_dim, rank = up_weights[0].size() - split_dim = qkv_dim // 3 + split_dim, rank = up_weights[0].size() + qkv_dim = split_dim * 3 up_weight = torch.zeros((qkv_dim, down_weight.size(0)), device=down_weight.device, dtype=down_weight.dtype) i = 0 for j in range(3): From d4e19fbd5e34e90347f189a8ba1f77e8878fe0ca Mon Sep 17 00:00:00 2001 From: 
kohya-ss Date: Tue, 29 Oct 2024 21:52:04 +0900 Subject: [PATCH 204/582] Support Lora --- sd3_minimal_inference.py | 60 +++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py index d099fe18d..86dba246d 100644 --- a/sd3_minimal_inference.py +++ b/sd3_minimal_inference.py @@ -10,11 +10,13 @@ import torch from safetensors.torch import safe_open, load_file +import torch.amp from tqdm import tqdm from PIL import Image from transformers import CLIPTextModelWithProjection, T5EncoderModel from library.device_utils import init_ipex, get_preferred_device +from networks import lora_sd3 init_ipex() @@ -104,7 +106,8 @@ def do_sample( x_c_nc = torch.cat([x, x], dim=0) # print(x_c_nc.shape, timestep.shape, c_crossattn.shape, y.shape) - model_output = mmdit(x_c_nc, timestep, context=c_crossattn, y=y) + with torch.autocast(device_type=device.type, dtype=dtype): + model_output = mmdit(x_c_nc, timestep, context=c_crossattn, y=y) model_output = model_output.float() batched = model_sampling.calculate_denoised(sigma_hat, model_output, x) @@ -153,7 +156,7 @@ def generate_image( clip_g.to(device) t5xxl.to(device) - with torch.no_grad(): + with torch.autocast(device_type=device.type, dtype=mmdit.dtype), torch.no_grad(): tokens_and_masks = tokenize_strategy.tokenize(prompt) lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = encoding_strategy.encode_tokens( tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask @@ -233,13 +236,14 @@ def generate_image( parser.add_argument("--bf16", action="store_true") parser.add_argument("--seed", type=int, default=1) parser.add_argument("--steps", type=int, default=50) - # parser.add_argument( - # "--lora_weights", - # type=str, - # nargs="*", - # default=[], - # help="LoRA weights, only supports networks.lora, each argument is a `path;multiplier` (semi-colon separated)", - # ) + parser.add_argument( + "--lora_weights", + type=str, + nargs="*", + default=[], + help="LoRA weights, only supports networks.lora_sd3, each argument is a `path;multiplier` (semi-colon separated)", + ) + parser.add_argument("--merge_lora_weights", action="store_true", help="Merge LoRA weights to model") parser.add_argument("--width", type=int, default=target_width) parser.add_argument("--height", type=int, default=target_height) parser.add_argument("--interactive", action="store_true") @@ -294,6 +298,30 @@ def generate_image( tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_token_length) encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy() + # LoRA + lora_models: list[lora_sd3.LoRANetwork] = [] + for weights_file in args.lora_weights: + if ";" in weights_file: + weights_file, multiplier = weights_file.split(";") + multiplier = float(multiplier) + else: + multiplier = 1.0 + + weights_sd = load_file(weights_file) + module = lora_sd3 + lora_model, _ = module.create_network_from_weights(multiplier, None, vae, [clip_l, clip_g, t5xxl], mmdit, weights_sd, True) + + if args.merge_lora_weights: + lora_model.merge_to([clip_l, clip_g, t5xxl], mmdit, weights_sd) + else: + lora_model.apply_to([clip_l, clip_g, t5xxl], mmdit) + info = lora_model.load_state_dict(weights_sd, strict=True) + logger.info(f"Loaded LoRA weights from {weights_file}: {info}") + lora_model.eval() + lora_model.to(device) + + lora_models.append(lora_model) + if not args.interactive: generate_image( mmdit, @@ -344,13 +372,13 @@ def generate_image( steps = 
int(opt[1:].strip()) elif opt.startswith("d"): seed = int(opt[1:].strip()) - # elif opt.startswith("m"): - # mutipliers = opt[1:].strip().split(",") - # if len(mutipliers) != len(lora_models): - # logger.error(f"Invalid number of multipliers, expected {len(lora_models)}") - # continue - # for i, lora_model in enumerate(lora_models): - # lora_model.set_multiplier(float(mutipliers[i])) + elif opt.startswith("m"): + mutipliers = opt[1:].strip().split(",") + if len(mutipliers) != len(lora_models): + logger.error(f"Invalid number of multipliers, expected {len(lora_models)}") + continue + for i, lora_model in enumerate(lora_models): + lora_model.set_multiplier(float(mutipliers[i])) elif opt.startswith("n"): negative_prompt = opt[1:].strip() if negative_prompt == "-": From 1e2f7b0e44ee656cd8d0ca8268aa1371618031ac Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 29 Oct 2024 22:11:04 +0900 Subject: [PATCH 205/582] Support for checkpoint files with a mysterious prefix "model.diffusion_model." --- library/flux_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/library/flux_utils.py b/library/flux_utils.py index 7a1ec37b8..4403835f1 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -73,6 +73,10 @@ def analyze_checkpoint_state(ckpt_path: str) -> Tuple[bool, bool, Tuple[int, int with safe_open(ckpt_path, framework="pt") as f: keys.extend(f.keys()) + # if the key has annoying prefix, remove it + if keys[0].startswith("model.diffusion_model."): + keys = [key.replace("model.diffusion_model.", "") for key in keys] + is_diffusers = "transformer_blocks.0.attn.add_k_proj.bias" in keys is_schnell = not ("guidance_in.in_layer.bias" in keys or "time_text_embed.guidance_embedder.linear_1.bias" in keys) @@ -141,6 +145,13 @@ def load_flow_model( sd = convert_diffusers_sd_to_bfl(sd, num_double_blocks, num_single_blocks) logger.info("Converted Diffusers to BFL") + # if the key has annoying prefix, remove it + for key in list(sd.keys()): + new_key = key.replace("model.diffusion_model.", "") + if new_key == key: + break # the model doesn't have annoying prefix + sd[new_key] = sd.pop(key) + info = model.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded Flux: {info}") return is_schnell, model From ce5b5325829538c03ff9ce80a79fe2c84ca5283c Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Tue, 29 Oct 2024 22:29:24 +0900 Subject: [PATCH 206/582] Fix additional LoRA to work --- networks/lora_sd3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/networks/lora_sd3.py b/networks/lora_sd3.py index 249298b39..c1eb68b8a 100644 --- a/networks/lora_sd3.py +++ b/networks/lora_sd3.py @@ -428,7 +428,7 @@ def create_modules( for filter, in_dim in zip( [ "context_embedder", - "t_embedder", + "_t_embedder", # don't use "t_embedder" because it's used in "context_embedder" "x_embedder", "y_embedder", "final_layer_adaLN_modulation", From b502f584886fbf52f9a180981efe276ea8509de7 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Tue, 29 Oct 2024 23:29:50 +0900 Subject: [PATCH 207/582] Fix emb_dim to work. 
--- networks/lora_sd3.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/networks/lora_sd3.py b/networks/lora_sd3.py index c1eb68b8a..efe202451 100644 --- a/networks/lora_sd3.py +++ b/networks/lora_sd3.py @@ -307,6 +307,7 @@ def create_modules( target_replace_modules: List[str], filter: Optional[str] = None, default_dim: Optional[int] = None, + include_conv2d_if_filter: bool = False, ) -> List[LoRAModule]: prefix = ( self.LORA_PREFIX_SD3 @@ -332,8 +333,11 @@ def create_modules( lora_name = prefix + "." + (name + "." if name else "") + child_name lora_name = lora_name.replace(".", "_") - if filter is not None and not filter in lora_name: - continue + force_incl_conv2d = False + if filter is not None: + if not filter in lora_name: + continue + force_incl_conv2d = include_conv2d_if_filter dim = None alpha = None @@ -373,6 +377,10 @@ def create_modules( elif self.conv_lora_dim is not None: dim = self.conv_lora_dim alpha = self.conv_alpha + elif force_incl_conv2d: + # x_embedder + dim = default_dim if default_dim is not None else self.lora_dim + alpha = self.alpha if dim is None or dim == 0: # skipした情報を出力 @@ -428,7 +436,7 @@ def create_modules( for filter, in_dim in zip( [ "context_embedder", - "_t_embedder", # don't use "t_embedder" because it's used in "context_embedder" + "_t_embedder", # don't use "t_embedder" because it's used in "context_embedder" "x_embedder", "y_embedder", "final_layer_adaLN_modulation", @@ -436,7 +444,12 @@ def create_modules( ], self.emb_dims, ): - loras, _ = create_modules(True, None, unet, None, filter=filter, default_dim=in_dim) + # x_embedder is conv2d, so we need to include it + loras, _ = create_modules( + True, None, unet, None, filter=filter, default_dim=in_dim, include_conv2d_if_filter=filter == "x_embedder" + ) + # if len(loras) > 0: + # logger.info(f"create LoRA for {filter}: {len(loras)} modules.") self.unet_loras.extend(loras) logger.info(f"create LoRA for SD3 MMDiT: {len(self.unet_loras)} modules.") From bdddc20d68a7441cccfcf0009528fdd59403b94a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 30 Oct 2024 12:51:49 +0900 Subject: [PATCH 208/582] support SD3.5M --- library/sd3_models.py | 128 +++++++++++++++++++++++-------------- library/sd3_train_utils.py | 7 ++ library/sd3_utils.py | 13 ++-- sd3_train.py | 8 +-- sd3_train_network.py | 1 + 5 files changed, 99 insertions(+), 58 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index 5d09f74e8..840f91869 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -51,7 +51,7 @@ class SD3Params: pos_embed_max_size: int adm_in_channels: int qk_norm: Optional[str] - x_block_self_attn_layers: List[int] + x_block_self_attn_layers: list[int] context_embedder_in_features: int context_embedder_out_features: int model_type: str @@ -510,6 +510,7 @@ def __init__( scale_mod_only: bool = False, swiglu: bool = False, qk_norm: Optional[str] = None, + x_block_self_attn: bool = False, **block_kwargs, ): super().__init__() @@ -519,13 +520,14 @@ def __init__( self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) else: self.norm1 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6) - self.attn = AttentionLinears( - dim=hidden_size, - num_heads=num_heads, - qkv_bias=qkv_bias, - pre_only=pre_only, - qk_norm=qk_norm, - ) + self.attn = AttentionLinears(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, pre_only=pre_only, qk_norm=qk_norm) + + self.x_block_self_attn = x_block_self_attn + if self.x_block_self_attn: + assert not 
pre_only + assert not scale_mod_only + self.attn2 = AttentionLinears(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, pre_only=False, qk_norm=qk_norm) + if not pre_only: if not rmsnorm: self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) @@ -546,7 +548,9 @@ def __init__( multiple_of=256, ) self.scale_mod_only = scale_mod_only - if not scale_mod_only: + if self.x_block_self_attn: + n_mods = 9 + elif not scale_mod_only: n_mods = 6 if not pre_only else 2 else: n_mods = 4 if not pre_only else 1 @@ -556,63 +560,64 @@ def __init__( def pre_attention(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor: if not self.pre_only: if not self.scale_mod_only: - ( - shift_msa, - scale_msa, - gate_msa, - shift_mlp, - scale_mlp, - gate_mlp, - ) = self.adaLN_modulation( - c - ).chunk(6, dim=-1) + (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp) = self.adaLN_modulation(c).chunk(6, dim=-1) else: shift_msa = None shift_mlp = None - ( - scale_msa, - gate_msa, - scale_mlp, - gate_mlp, - ) = self.adaLN_modulation( - c - ).chunk(4, dim=-1) + (scale_msa, gate_msa, scale_mlp, gate_mlp) = self.adaLN_modulation(c).chunk(4, dim=-1) qkv = self.attn.pre_attention(modulate(self.norm1(x), shift_msa, scale_msa)) - return qkv, ( - x, - gate_msa, - shift_mlp, - scale_mlp, - gate_mlp, - ) + return qkv, (x, gate_msa, shift_mlp, scale_mlp, gate_mlp) else: if not self.scale_mod_only: - ( - shift_msa, - scale_msa, - ) = self.adaLN_modulation( - c - ).chunk(2, dim=-1) + (shift_msa, scale_msa) = self.adaLN_modulation(c).chunk(2, dim=-1) else: shift_msa = None scale_msa = self.adaLN_modulation(c) qkv = self.attn.pre_attention(modulate(self.norm1(x), shift_msa, scale_msa)) return qkv, None + def pre_attention_x(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor: + assert self.x_block_self_attn + (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp, shift_msa2, scale_msa2, gate_msa2) = self.adaLN_modulation( + c + ).chunk(9, dim=1) + x_norm = self.norm1(x) + qkv = self.attn.pre_attention(modulate(x_norm, shift_msa, scale_msa)) + qkv2 = self.attn2.pre_attention(modulate(x_norm, shift_msa2, scale_msa2)) + return qkv, qkv2, (x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2) + def post_attention(self, attn, x, gate_msa, shift_mlp, scale_mlp, gate_mlp): assert not self.pre_only x = x + gate_msa.unsqueeze(1) * self.attn.post_attention(attn) x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp)) return x + def post_attention_x(self, attn, attn2, x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2, attn1_dropout: float = 0.0): + assert not self.pre_only + if attn1_dropout > 0.0: + # Use torch.bernoulli to implement dropout, only dropout the batch dimension + attn1_dropout = torch.bernoulli(torch.full((attn.size(0), 1, 1), 1 - attn1_dropout, device=attn.device)) + attn_ = gate_msa.unsqueeze(1) * self.attn.post_attention(attn) * attn1_dropout + else: + attn_ = gate_msa.unsqueeze(1) * self.attn.post_attention(attn) + x = x + attn_ + attn2_ = gate_msa2.unsqueeze(1) * self.attn2.post_attention(attn2) + x = x + attn2_ + mlp_ = gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp)) + x = x + mlp_ + return x + # JointBlock + block_mixing in mmdit.py class MMDiTBlock(nn.Module): def __init__(self, *args, **kwargs): super().__init__() pre_only = kwargs.pop("pre_only") + x_block_self_attn = kwargs.pop("x_block_self_attn") + self.context_block = SingleDiTBlock(*args, pre_only=pre_only, **kwargs) - self.x_block = 
SingleDiTBlock(*args, pre_only=False, **kwargs) + self.x_block = SingleDiTBlock(*args, pre_only=False, x_block_self_attn=x_block_self_attn, **kwargs) + self.head_dim = self.x_block.attn.head_dim self.mode = self.x_block.attn_mode self.gradient_checkpointing = False @@ -622,7 +627,11 @@ def enable_gradient_checkpointing(self): def _forward(self, context, x, c): ctx_qkv, ctx_intermediate = self.context_block.pre_attention(context, c) - x_qkv, x_intermediate = self.x_block.pre_attention(x, c) + + if self.x_block.x_block_self_attn: + x_qkv, x_qkv2, x_intermediates = self.x_block.pre_attention_x(x, c) + else: + x_qkv, x_intermediates = self.x_block.pre_attention(x, c) ctx_len = ctx_qkv[0].size(1) @@ -634,11 +643,18 @@ def _forward(self, context, x, c): ctx_attn_out = attn[:, :ctx_len] x_attn_out = attn[:, ctx_len:] - x = self.x_block.post_attention(x_attn_out, *x_intermediate) + if self.x_block.x_block_self_attn: + x_q2, x_k2, x_v2 = x_qkv2 + attn2 = attention(x_q2, x_k2, x_v2, self.x_block.attn2.num_heads) + x = self.x_block.post_attention_x(x_attn_out, attn2, *x_intermediates) + else: + x = self.x_block.post_attention(x_attn_out, *x_intermediates) + if not self.context_block.pre_only: context = self.context_block.post_attention(ctx_attn_out, *ctx_intermediate) else: context = None + return context, x def forward(self, *args, **kwargs): @@ -678,7 +694,9 @@ def __init__( pos_embed_max_size: Optional[int] = None, num_patches=None, qk_norm: Optional[str] = None, + x_block_self_attn_layers: Optional[list[int]] = [], qkv_bias: bool = True, + pos_emb_random_crop_rate: float = 0.0, model_type: str = "sd3m", ): super().__init__() @@ -691,6 +709,8 @@ def __init__( self.pos_embed_scaling_factor = pos_embed_scaling_factor self.pos_embed_offset = pos_embed_offset self.pos_embed_max_size = pos_embed_max_size + self.x_block_self_attn_layers = x_block_self_attn_layers + self.pos_emb_random_crop_rate = pos_emb_random_crop_rate self.gradient_checkpointing = use_checkpoint # hidden_size = default(hidden_size, 64 * depth) @@ -751,6 +771,7 @@ def __init__( scale_mod_only=scale_mod_only, swiglu=swiglu, qk_norm=qk_norm, + x_block_self_attn=(i in self.x_block_self_attn_layers), ) for i in range(depth) ] @@ -832,7 +853,10 @@ def _basic_init(module): nn.init.constant_(self.final_layer.linear.weight, 0) nn.init.constant_(self.final_layer.linear.bias, 0) - def cropped_pos_embed(self, h, w, device=None): + def set_pos_emb_random_crop_rate(self, rate: float): + self.pos_emb_random_crop_rate = rate + + def cropped_pos_embed(self, h, w, device=None, random_crop: bool = False): p = self.x_embedder.patch_size # patched size h = (h + 1) // p @@ -842,8 +866,14 @@ def cropped_pos_embed(self, h, w, device=None): assert self.pos_embed_max_size is not None assert h <= self.pos_embed_max_size, (h, self.pos_embed_max_size) assert w <= self.pos_embed_max_size, (w, self.pos_embed_max_size) - top = (self.pos_embed_max_size - h) // 2 - left = (self.pos_embed_max_size - w) // 2 + + if not random_crop: + top = (self.pos_embed_max_size - h) // 2 + left = (self.pos_embed_max_size - w) // 2 + else: + top = torch.randint(0, self.pos_embed_max_size - h + 1, (1,)).item() + left = torch.randint(0, self.pos_embed_max_size - w + 1, (1,)).item() + spatial_pos_embed = self.pos_embed.reshape( 1, self.pos_embed_max_size, @@ -896,9 +926,12 @@ def forward( t: (N,) tensor of diffusion timesteps y: (N, D) tensor of class labels """ + pos_emb_random_crop = ( + False if self.pos_emb_random_crop_rate == 0.0 else torch.rand(1).item() < 
self.pos_emb_random_crop_rate + ) B, C, H, W = x.shape - x = self.x_embedder(x) + self.cropped_pos_embed(H, W, device=x.device).to(dtype=x.dtype) + x = self.x_embedder(x) + self.cropped_pos_embed(H, W, device=x.device, random_crop=pos_emb_random_crop).to(dtype=x.dtype) c = self.t_embedder(t, dtype=x.dtype) # (N, D) if y is not None and self.y_embedder is not None: y = self.y_embedder(y) # (N, D) @@ -977,6 +1010,7 @@ def create_sd3_mmdit(params: SD3Params, attn_mode: str = "torch") -> MMDiT: depth=params.depth, mlp_ratio=4, qk_norm=params.qk_norm, + x_block_self_attn_layers=params.x_block_self_attn_layers, num_patches=params.num_patches, attn_mode=attn_mode, model_type=params.model_type, diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 1702e81c2..86f0c9c04 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -239,6 +239,13 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): default=0.0, help="Dropout rate for T5 encoder, default is 0.0 / T5エンコーダのドロップアウト率、デフォルトは0.0", ) + parser.add_argument( + "--pos_emb_random_crop_rate", + type=float, + default=0.0, + help="Random crop rate for positional embeddings, default is 0.0. Only for SD3.5M" + " / 位置埋め込みのランダムクロップ率、デフォルトは0.0。SD3.5M以外では予期しない動作になります", + ) # copy from Diffusers parser.add_argument( diff --git a/library/sd3_utils.py b/library/sd3_utils.py index 71e50de36..1861dfbc2 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -41,20 +41,21 @@ def analyze_state_dict_state(state_dict: Dict, prefix: str = ""): # x_block_self_attn_layers.append(int(key.split(".x_block.attn2.ln_k.weight")[0].split(".")[-1])) x_block_self_attn_layers = [] - re_attn = re.compile(r".(\d+).x_block.attn2.ln_k.weight") + re_attn = re.compile(r"\.(\d+)\.x_block\.attn2\.ln_k\.weight") for key in list(state_dict.keys()): - m = re_attn.match(key) + m = re_attn.search(key) if m: x_block_self_attn_layers.append(int(m.group(1))) - assert len(x_block_self_attn_layers) == 0, "x_block_self_attn_layers is not supported" - context_embedder_in_features = context_shape[1] context_embedder_out_features = context_shape[0] - # only supports 3-5-large and 3-medium + # only supports 3-5-large, medium or 3-medium if qk_norm is not None: - model_type = "3-5-large" + if len(x_block_self_attn_layers) == 0: + model_type = "3-5-large" + else: + model_type = "3-5-medium" else: model_type = "3-medium" diff --git a/sd3_train.py b/sd3_train.py index cdac945e6..df2736901 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -353,17 +353,15 @@ def train(args): accelerator.wait_for_everyone() # load MMDIT - mmdit = sd3_utils.load_mmdit( - sd3_state_dict, - model_dtype, - "cpu", - ) + mmdit = sd3_utils.load_mmdit(sd3_state_dict, model_dtype, "cpu") # attn_mode = "xformers" if args.xformers else "torch" # assert ( # attn_mode == "torch" # ), f"attn_mode {attn_mode} is not supported yet. Please use `--sdpa` instead of `--xformers`. 
/ attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。" + mmdit.set_pos_emb_random_crop_rate(args.pos_emb_random_crop_rate) + if args.gradient_checkpointing: mmdit.enable_gradient_checkpointing() diff --git a/sd3_train_network.py b/sd3_train_network.py index 3506404ae..3d2a75710 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -65,6 +65,7 @@ def load_target_model(self, args, weight_dtype, accelerator): ) mmdit = sd3_utils.load_mmdit(state_dict, loading_dtype, "cpu") self.model_type = mmdit.model_type + mmdit.set_pos_emb_random_crop_rate(args.pos_emb_random_crop_rate) if args.fp8_base: # check dtype of model From 70a179e446219b66f208e4fbb37b74c5d77d6086 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 30 Oct 2024 14:34:19 +0900 Subject: [PATCH 209/582] Fix to use SDPA instead of xformers --- library/sd3_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index 840f91869..60356e82c 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -645,7 +645,7 @@ def _forward(self, context, x, c): if self.x_block.x_block_self_attn: x_q2, x_k2, x_v2 = x_qkv2 - attn2 = attention(x_q2, x_k2, x_v2, self.x_block.attn2.num_heads) + attn2 = attention(x_q2, x_k2, x_v2, self.x_block.attn2.num_heads, mode=self.mode) x = self.x_block.post_attention_x(x_attn_out, attn2, *x_intermediates) else: x = self.x_block.post_attention(x_attn_out, *x_intermediates) From 1434d8506f3ccc4ae6cc005a19531dba3cbb9fb9 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 31 Oct 2024 19:58:22 +0900 Subject: [PATCH 210/582] Support SD3.5M multi resolutional training --- library/sd3_models.py | 177 ++++++++++++++++++++++++++++++++++++- library/sd3_train_utils.py | 6 ++ library/strategy_base.py | 2 +- library/strategy_flux.py | 4 +- library/strategy_sd3.py | 11 ++- library/train_util.py | 3 + sd3_train.py | 9 +- sd3_train_network.py | 13 ++- 8 files changed, 215 insertions(+), 10 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index 60356e82c..0eca94e2f 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -88,6 +88,78 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): return emb +def get_scaled_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, sample_size=64, base_size=16): + """ + This function is contributed by KohakuBlueleaf. Thanks for the contribution! + + Creates scaled 2D sinusoidal positional embeddings that maintain consistent relative positions + when the resolution differs from the training resolution. + + Args: + embed_dim (int): Dimension of the positional embedding. + grid_size (int or tuple): Size of the position grid (H, W). If int, assumes square grid. + cls_token (bool): Whether to include class token. Defaults to False. + extra_tokens (int): Number of extra tokens (e.g., cls_token). Defaults to 0. + sample_size (int): Reference resolution (typically training resolution). Defaults to 64. + base_size (int): Base grid size used during training. Defaults to 16. + + Returns: + numpy.ndarray: Positional embeddings of shape (H*W, embed_dim) or + (H*W + extra_tokens, embed_dim) if cls_token is True. 
+ """ + # Convert grid_size to tuple if it's an integer + if isinstance(grid_size, int): + grid_size = (grid_size, grid_size) + + # Create normalized grid coordinates (0 to 1) + grid_h = np.arange(grid_size[0], dtype=np.float32) / grid_size[0] + grid_w = np.arange(grid_size[1], dtype=np.float32) / grid_size[1] + + # Calculate scaling factors for height and width + # This ensures that the central region matches the original resolution's embeddings + scale_h = base_size * grid_size[0] / (sample_size) + scale_w = base_size * grid_size[1] / (sample_size) + + # Calculate shift values to center the original resolution's embedding region + # This ensures that the central sample_size x sample_size region has similar + # positional embeddings to the original resolution + shift_h = 1 * scale_h * (grid_size[0] - sample_size) / (2 * grid_size[0]) + shift_w = 1 * scale_w * (grid_size[1] - sample_size) / (2 * grid_size[1]) + + # Apply scaling and shifting to create the final grid coordinates + grid_h = grid_h * scale_h - shift_h + grid_w = grid_w * scale_w - shift_w + + # Create 2D grid using meshgrid (note: w goes first) + grid = np.meshgrid(grid_w, grid_h) + grid = np.stack(grid, axis=0) + + # # Calculate the starting indices for the central region + # # This is used for debugging/visualization of the central region + # st_h = (grid_size[0] - sample_size) // 2 + # st_w = (grid_size[1] - sample_size) // 2 + # print(grid[:, st_h : st_h + sample_size, st_w : st_w + sample_size]) + + # Reshape grid for positional embedding calculation + grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) + + # Generate the sinusoidal positional embeddings + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + + # Add zeros for extra tokens (e.g., [CLS] token) if required + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + + return pos_embed + + +# if __name__ == "__main__": +# # This is what you get when you load SD3.5 state dict +# pos_emb = torch.from_numpy(get_scaled_2d_sincos_pos_embed( +# 1536, [384, 384], sample_size=64, base_size=16 +# )).float().unsqueeze(0) + + def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): """ embed_dim: output dimension for each position @@ -617,7 +689,7 @@ def __init__(self, *args, **kwargs): self.context_block = SingleDiTBlock(*args, pre_only=pre_only, **kwargs) self.x_block = SingleDiTBlock(*args, pre_only=False, x_block_self_attn=x_block_self_attn, **kwargs) - + self.head_dim = self.x_block.attn.head_dim self.mode = self.x_block.attn_mode self.gradient_checkpointing = False @@ -669,6 +741,9 @@ class MMDiT(nn.Module): Diffusion model with a Transformer backbone. 
""" + # prepare pos_embed for latent size * 2 + POS_EMBED_MAX_RATIO = 1.5 + def __init__( self, input_size: int = 32, @@ -697,6 +772,8 @@ def __init__( x_block_self_attn_layers: Optional[list[int]] = [], qkv_bias: bool = True, pos_emb_random_crop_rate: float = 0.0, + use_scaled_pos_embed: bool = False, + pos_embed_latent_sizes: Optional[list[int]] = None, model_type: str = "sd3m", ): super().__init__() @@ -722,6 +799,8 @@ def __init__( self.num_heads = num_heads + self.enable_scaled_pos_embed(use_scaled_pos_embed, pos_embed_latent_sizes) + self.x_embedder = PatchEmbed( input_size, patch_size, @@ -785,6 +864,43 @@ def __init__( self.blocks_to_swap = None self.thread_pool: Optional[ThreadPoolExecutor] = None + def enable_scaled_pos_embed(self, use_scaled_pos_embed: bool, latent_sizes: Optional[list[int]]): + self.use_scaled_pos_embed = use_scaled_pos_embed + + if self.use_scaled_pos_embed: + # # remove pos_embed to free up memory up to 0.4 GB + self.pos_embed = None + + # sort latent sizes in ascending order + latent_sizes = sorted(latent_sizes) + + patched_sizes = [latent_size // self.patch_size for latent_size in latent_sizes] + + # calculate value range for each latent area: this is used to determine the pos_emb size from the latent shape + max_areas = [] + for i in range(1, len(patched_sizes)): + prev_area = patched_sizes[i - 1] ** 2 + area = patched_sizes[i] ** 2 + max_areas.append((prev_area + area) // 2) + + # area of the last latent size, if the latent size exceeds this, error will be raised + max_areas.append(int((patched_sizes[-1] * MMDiT.POS_EMBED_MAX_RATIO) ** 2)) + # print("max_areas", max_areas) + + self.resolution_area_to_latent_size = [(area, latent_size) for area, latent_size in zip(max_areas, patched_sizes)] + + self.resolution_pos_embeds = {} + for patched_size in patched_sizes: + grid_size = int(patched_size * MMDiT.POS_EMBED_MAX_RATIO) + pos_embed = get_scaled_2d_sincos_pos_embed(self.hidden_size, grid_size, sample_size=patched_size) + pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0) + self.resolution_pos_embeds[patched_size] = pos_embed + # print(f"pos_embed for {patched_size}x{patched_size} latent size: {pos_embed.shape}") + + else: + self.resolution_area_to_latent_size = None + self.resolution_pos_embeds = None + @property def model_type(self): return self._model_type @@ -884,6 +1000,54 @@ def cropped_pos_embed(self, h, w, device=None, random_crop: bool = False): spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1]) return spatial_pos_embed + def cropped_scaled_pos_embed(self, h, w, device=None, dtype=None, random_crop: bool = False): + p = self.x_embedder.patch_size + # patched size + h = (h + 1) // p + w = (w + 1) // p + + # select pos_embed size based on area + area = h * w + patched_size = None + for area_, patched_size_ in self.resolution_area_to_latent_size: + if area <= area_: + patched_size = patched_size_ + break + if patched_size is None: + raise ValueError(f"Area {area} is too large for the given latent sizes {self.resolution_area_to_latent_size}.") + + pos_embed_size = int(patched_size * MMDiT.POS_EMBED_MAX_RATIO) + if h > pos_embed_size or w > pos_embed_size: + # fallback to normal pos_embed + logger.warning( + f"Using normal pos_embed for size {h}x{w} as it exceeds the scaled pos_embed size {pos_embed_size}. Image is too tall or wide." 
+ ) + return self.cropped_pos_embed(h, w, device=device, random_crop=random_crop) + + if not random_crop: + top = (pos_embed_size - h) // 2 + left = (pos_embed_size - w) // 2 + else: + top = torch.randint(0, pos_embed_size - h + 1, (1,)).item() + left = torch.randint(0, pos_embed_size - w + 1, (1,)).item() + + pos_embed = self.resolution_pos_embeds[patched_size] + if pos_embed.device != device: + pos_embed = pos_embed.to(device) + # which is better to update device, or transfer every time to device? -> 64x64 emb is 96*96*1536*4=56MB. It's okay to update device. + self.resolution_pos_embeds[patched_size] = pos_embed # update device + if pos_embed.dtype != dtype: + pos_embed = pos_embed.to(dtype) + self.resolution_pos_embeds[patched_size] = pos_embed # update dtype + + spatial_pos_embed = pos_embed.reshape(1, pos_embed_size, pos_embed_size, pos_embed.shape[-1]) + spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :] + spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1]) + # print( + # f"patched size: {h}x{w}, pos_embed size: {pos_embed_size}, pos_embed shape: {pos_embed.shape}, top: {top}, left: {left}" + # ) + return spatial_pos_embed + def enable_block_swap(self, num_blocks: int): self.blocks_to_swap = num_blocks @@ -931,7 +1095,16 @@ def forward( ) B, C, H, W = x.shape - x = self.x_embedder(x) + self.cropped_pos_embed(H, W, device=x.device, random_crop=pos_emb_random_crop).to(dtype=x.dtype) + + # x = self.x_embedder(x) + self.cropped_pos_embed(H, W, device=x.device, random_crop=pos_emb_random_crop).to(dtype=x.dtype) + if not self.use_scaled_pos_embed: + pos_embed = self.cropped_pos_embed(H, W, device=x.device, random_crop=pos_emb_random_crop).to(dtype=x.dtype) + else: + # print(f"Using scaled pos_embed for size {H}x{W}") + pos_embed = self.cropped_scaled_pos_embed(H, W, device=x.device, dtype=x.dtype, random_crop=pos_emb_random_crop) + x = self.x_embedder(x) + pos_embed + del pos_embed + c = self.t_embedder(t, dtype=x.dtype) # (N, D) if y is not None and self.y_embedder is not None: y = self.y_embedder(y) # (N, D) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 86f0c9c04..69878750e 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -246,6 +246,12 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): help="Random crop rate for positional embeddings, default is 0.0. Only for SD3.5M" " / 位置埋め込みのランダムクロップ率、デフォルトは0.0。SD3.5M以外では予期しない動作になります", ) + parser.add_argument( + "--enable_scaled_pos_embed", + action="store_true", + help="Scale position embeddings for each resolution during multi-resolution training. 
Only for SD3.5M" + " / 複数解像度学習時に解像度ごとに位置埋め込みをスケーリングする。SD3.5M以外では予期しない動作になります", + ) # copy from Diffusers parser.add_argument( diff --git a/library/strategy_base.py b/library/strategy_base.py index e390c5f35..358e42f1d 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -518,7 +518,7 @@ def load_latents_from_disk( self, npz_path: str, bucket_reso: Tuple[int, int] ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: """ - for SD/SDXL/SD3.0 + for SD/SDXL """ return self._default_load_latents_from_disk(None, npz_path, bucket_reso) diff --git a/library/strategy_flux.py b/library/strategy_flux.py index f662b62e9..5e65927f8 100644 --- a/library/strategy_flux.py +++ b/library/strategy_flux.py @@ -212,7 +212,7 @@ def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) ) def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): - return self._default_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask, True) + return self._default_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask, multi_resolution=True) def load_latents_from_disk( self, npz_path: str, bucket_reso: Tuple[int, int] @@ -226,7 +226,7 @@ def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask vae_dtype = vae.dtype self._default_cache_batch_latents( - encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop, True + encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop, multi_resolution=True ) if not train_util.HIGH_VRAM: diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py index 413169ecc..1d55fe21d 100644 --- a/library/strategy_sd3.py +++ b/library/strategy_sd3.py @@ -399,7 +399,12 @@ def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) ) def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): - return self._default_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask) + return self._default_is_disk_cached_latents_expected(8, bucket_reso, npz_path, flip_aug, alpha_mask, multi_resolution=True) + + def load_latents_from_disk( + self, npz_path: str, bucket_reso: Tuple[int, int] + ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: + return self._default_load_latents_from_disk(8, npz_path, bucket_reso) # support multi-resolution # TODO remove circular dependency for ImageInfo def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool): @@ -407,7 +412,9 @@ def cache_batch_latents(self, vae, image_infos: List, flip_aug: bool, alpha_mask vae_device = vae.device vae_dtype = vae.dtype - self._default_cache_batch_latents(encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop) + self._default_cache_batch_latents( + encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop, multi_resolution=True + ) if not train_util.HIGH_VRAM: train_util.clean_memory_on_device(vae.device) diff --git a/library/train_util.py b/library/train_util.py index d568523ca..bd2ff6ef4 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2510,6 +2510,9 @@ def verify_bucket_reso_steps(self, min_steps: int): for dataset in self.datasets: 
dataset.verify_bucket_reso_steps(min_steps) + def get_resolutions(self) -> List[Tuple[int, int]]: + return [(dataset.width, dataset.height) for dataset in self.datasets] + def is_latent_cacheable(self) -> bool: return all([dataset.is_latent_cacheable() for dataset in self.datasets]) diff --git a/sd3_train.py b/sd3_train.py index df2736901..40f8c7e1f 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -361,7 +361,14 @@ def train(args): # ), f"attn_mode {attn_mode} is not supported yet. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。" mmdit.set_pos_emb_random_crop_rate(args.pos_emb_random_crop_rate) - + + # set resolutions for positional embeddings + if args.enable_scaled_pos_embed: + resolutions = train_dataset_group.get_resolutions() + latent_sizes = [round(math.sqrt(res[0] * res[1])) // 8 for res in resolutions] # 8 is stride for latent + logger.info(f"Prepare scaled positional embeddings for resolutions: {resolutions}, sizes: {latent_sizes}") + mmdit.enable_scaled_pos_embed(True, latent_sizes) + if args.gradient_checkpointing: mmdit.enable_gradient_checkpointing() diff --git a/sd3_train_network.py b/sd3_train_network.py index 3d2a75710..9eeac05ca 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -26,8 +26,8 @@ def __init__(self): super().__init__() self.sample_prompts_te_outputs = None - def assert_extra_args(self, args, train_dataset_group): - super().assert_extra_args(args, train_dataset_group) + def assert_extra_args(self, args, train_dataset_group: train_util.DatasetGroup): + # super().assert_extra_args(args, train_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) if args.fp8_base_unet: @@ -53,6 +53,9 @@ def assert_extra_args(self, args, train_dataset_group): train_dataset_group.verify_bucket_reso_steps(32) # TODO check this + # enumerate resolutions from dataset for positional embeddings + self.resolutions = train_dataset_group.get_resolutions() + def load_target_model(self, args, weight_dtype, accelerator): # currently offload to cpu for some models @@ -67,6 +70,12 @@ def load_target_model(self, args, weight_dtype, accelerator): self.model_type = mmdit.model_type mmdit.set_pos_emb_random_crop_rate(args.pos_emb_random_crop_rate) + # set resolutions for positional embeddings + if args.enable_scaled_pos_embed: + latent_sizes = [round(math.sqrt(res[0] * res[1])) // 8 for res in self.resolutions] # 8 is stride for latent + logger.info(f"Prepare scaled positional embeddings for resolutions: {self.resolutions}, sizes: {latent_sizes}") + mmdit.enable_scaled_pos_embed(True, latent_sizes) + if args.fp8_base: # check dtype of model if mmdit.dtype == torch.float8_e4m3fnuz or mmdit.dtype == torch.float8_e5m2 or mmdit.dtype == torch.float8_e5m2fnuz: From 9e23368e3d6288e85c6fe34f4d5774bd4d948517 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 31 Oct 2024 19:58:41 +0900 Subject: [PATCH 211/582] Update SD3 training --- README.md | 195 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 163 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index ad2791e7f..aff78b2c6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ This repository contains training, generation and utility scripts for Stable Diffusion. -## FLUX.1 training (WIP) +## FLUX.1 and SD3 training (WIP) This feature is experimental. The options and the training script may change in the future. Please let us know if you have any idea to improve the training. @@ -9,8 +9,15 @@ __Please update PyTorch to 2.4.0. 
We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +- [FLUX.1 training](#flux1-training) +- [SD3 training](#sd3-training) + ### Recent Updates +Oct 31, 2024: + +- Added support for SD3.5L/M training. See [SD3 training](#sd3-training) for details. + Oct 19, 2024: - Added an implementation of Differential Output Preservation (temporary name) for SDXL/FLUX.1 LoRA training. SD1/2 is not tested yet. This is an experimental feature. @@ -139,7 +146,7 @@ Sep 1, 2024: Aug 29, 2024: Please update `safetensors` to `0.4.4` to fix the error when using `--resume`. `requirements.txt` is updated. -### Contents +## FLUX.1 training - [FLUX.1 LoRA training](#flux1-lora-training) - [Key Options for FLUX.1 LoRA training](#key-options-for-flux1-lora-training) @@ -586,53 +593,177 @@ python tools/convert_diffusers_to_flux.py --diffusers_path path/to/diffusers_fol ## SD3 training -SD3 training is done with `sd3_train.py`. +SD3.5L/M training is now available. + +### SD3 LoRA training + +The script is `sd3_train_network.py`. See `--help` for options. + +SD3 model, CLIP-L, CLIP-G, and T5XXL models are recommended to be in float/fp16 format. If you specify `--fp8_base`, you can use fp8 models for SD3. The fp8 model is only compatible with `float8_e4m3fn` format. + +Sample command is below. It will work with 16GB VRAM GPUs (SD3.5L). + +``` +accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 sd3_train_network.py +--pretrained_model_name_or_path path/to/sd3.5_large.safetensors --clip_l sd3/clip_l.safetensors --clip_g sd3/clip_g.safetensors --t5xxl sd3/t5xxl_fp16.safetensors +--cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers +--max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 +--network_module networks.lora_sd3 --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 +--cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base +--highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml +--output_dir path/to/output/dir --output_name sd3-lora-name +``` +(The command is multi-line for readability. Please combine it into one line.) + +The training can be done with 12GB VRAM GPUs with Adafactor optimizer. Please use settings like below: + +``` +--optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 +``` + +`--cpu_offload_checkpointing` and `--split_mode` are not available for SD3 LoRA training. -__Sep 1, 2024__: -- `--num_last_block_to_freeze` is added to `sd3_train.py`. This option is to freeze the last n blocks of the MMDiT. See [#1417](https://github.com/kohya-ss/sd-scripts/pull/1417) for details. Thanks to sdbds! +We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. -__Jul 27, 2024__: -- Latents and text encoder outputs caching mechanism is refactored significantly. - - Existing cache files for SD3 need to be recreated. Please delete the previous cache files. - - With this change, dataset initialization is significantly faster, especially for large datasets. +The trained LoRA model can be used with ComfyUI. -- Architecture-dependent parts are extracted from the dataset (`train_util.py`). 
This is expected to make it easier to add future architectures. +#### Key Options for SD3 LoRA training -- Architecture-dependent parts including the cache mechanism for SD1/2/SDXL are also extracted. The basic operation of SD1/2/SDXL training on the sd3 branch has been confirmed, but there may be bugs. Please use the main or dev branch for SD1/2/SDXL training. +Here are the arguments. The arguments and sample settings are still experimental and may change in the future. Feedback on the settings is welcome. ---- +- `--network_module` is the module for LoRA training. Specify `networks.lora_sd3` for SD3 LoRA training. +- `--pretrained_model_name_or_path` is the path to the pretrained model (SD3/3.5). If you specify `--fp8_base`, you can use fp8 models for SD3/3.5. The fp8 model is only compatible with `float8_e4m3fn` format. +- `--clip_l` is the path to the CLIP-L model. +- `--clip_g` is the path to the CLIP-G model. +- `--t5xxl` is the path to the T5XXL model. If you specify `--fp8_base`, you can use fp8 (float8_e4m3fn) models for T5XXL. However, it is recommended to use fp16 models for caching. +- `--vae` is the path to the autoencoder model. __This option is not necessary for SD3.__ VAE is included in the standard SD3 model. +- `--disable_mmap_load_safetensors` is to disable memory mapping when loading safetensors. __This option significantly reduces the memory usage when loading models for Windows users.__ +- `--clip_l_dropout_rate`, `--clip_g_dropout_rate` and `--t5_dropout_rate` are the dropout rates for the embeddings of CLIP-L, CLIP-G, and T5XXL, described in [SAI research papre](http://arxiv.org/pdf/2403.03206). The default is 0.0. For LoRA training, it is seems to be better to set 0.0. +- `--pos_emb_random_crop_rate` is the rate of random cropping of positional embeddings, described in [SD3.5M model card](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). The default is 0. It is seems to be better to set 0.0 for LoRA training. +- `--enable_scaled_pos_embed` is to enable the scaled positional embeddings. The default is False. This option is an experimental feature for SD3.5M. Details are described below. -`fp16` and `bf16` are available for mixed precision training. We are not sure which is better. +Other options are described below. -`optimizer_type = "adafactor"` is recommended for 24GB VRAM GPUs. `cache_text_encoder_outputs_to_disk` and `cache_latents_to_disk` are necessary currently. +#### Key Features for SD3 LoRA training -`clip_l`, `clip_g` and `t5xxl` can be specified if the checkpoint does not include them. +1. CLIP-L, G and T5XXL LoRA Support: + - SD3 LoRA training now supports CLIP-L, CLIP-G and T5XXL LoRA training. + - Remove `--network_train_unet_only` from your command. + - Add `train_t5xxl=True` to `--network_args` to train T5XXL LoRA. CLIP-L and G is also trained at the same time. + - T5XXL output can be cached for CLIP-L and G LoRA training. So, `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. + - The learning rates for CLIP-L, CLIP-G and T5XXL can be specified separately. Multiple numbers can be specified in `--text_encoder_lr`. For example, `--text_encoder_lr 1e-4 1e-5 5e-6`. The first value is the learning rate for CLIP-L, the second value is for CLIP-G, and the third value is for T5XXL. If you specify only one, the learning rates for CLIP-L, CLIP-G and T5XXL will be the same. If the third value is not specified, the second value is used for T5XXL. 
If `--text_encoder_lr` is not specified, the default learning rate `--learning_rate` is used for both CLIP-L and T5XXL. + - The trained LoRA can be used with ComfyUI. -t5xxl works with `fp16` now. + | trained LoRA|option|network_args|cache_text_encoder_outputs (*1)| + |---|---|---|---| + |MMDiT|`--network_train_unet_only`|-|o| + |MMDiT + CLIP-L + CLIP-G|-|-|o (*2)| + |MMDiT + CLIP-L + CLIP-G + T5XXL|-|`train_t5xxl=True`|-| + |CLIP-L + CLIP-G (*3)|`--network_train_text_encoder_only`|-|o (*2)| + |CLIP-L + CLIP-G + T5XXL (*3)|`--network_train_text_encoder_only`|`train_t5xxl=True`|-| -There are `t5xxl_device` and `t5xxl_dtype` options for `t5xxl` device and dtype. + - *1: `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. + - *2: T5XXL output can be cached for CLIP-L and G LoRA training. + - *3: Not tested yet. + +2. Experimental FP8/FP16 mixed training: + - `--fp8_base_unet` enables training with fp8 for MMDiT and bf16/fp16 for CLIP-L/G/T5XXL. + - When specifying this option, the `--fp8_base` option is automatically enabled. -`text_encoder_batch_size` is added experimentally for caching faster. +3. Split Q/K/V Projection Layers (Experimental): + - Same as FLUX.1. + +4. CLIP-L/G and T5 Attention Mask Application: + - This function is planned to be implemented in the future. + +5. Multi-resolution Training Support: + - Only for SD3.5M. + - Same as FLUX.1 for data preparation. + - If you train with multiple resolutions, specify `--enable_scaled_pos_embed` to enable the scaled positional embeddings. The default is False. This option is an experimental feature for SD3.5M. -```toml -learning_rate = 1e-6 # seems to depend on the batch size -optimizer_type = "adafactor" -optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ] -cache_text_encoder_outputs = true -cache_text_encoder_outputs_to_disk = true -vae_batch_size = 1 -text_encoder_batch_size = 4 -cache_latents = true -cache_latents_to_disk = true + +Technical details of multi-resolution training for SD3.5M: + +The values of the positional embeddings must be the same for each resolution. That is, the same value must be in the same position for 512x512, 768x768, and 1024x1024. To achieve this, the positional embeddings for each resolution are calculated in advance and switched according to the resolution of the training data. This feature is enabled by `--enable_scaled_pos_embed`. + +This idea and the code for calculating scaled positional embeddings are contributed by KohakuBlueleaf. Thanks to KohakuBlueleaf! + + +#### Specify rank for each layer in SD3 LoRA + +You can specify the rank for each layer in SD3 by specifying the following network_args. If you specify `0`, LoRA will not be applied to that layer. + +When network_args is not specified, the default value (`network_dim`) is applied, same as before. + +|network_args|target layer| +|---|---| +|context_attn_dim|attn in context_block| +|context_mlp_dim|mlp in context_block| +|context_mod_dim|adaLN_modulation in context_block| +|x_attn_dim|attn in x_block| +|x_mlp_dim|mlp in x_block| +|x_mod_dim|adaLN_modulation in x_block| + +`"verbose=True"` is also available for debugging. It shows the rank of each layer. + +example: ``` +--network_args "context_attn_dim=2" "context_mlp_dim=3" "context_mod_dim=4" "x_attn_dim=5" "x_mlp_dim=6" "x_mod_dim=7" "verbose=True" +``` + +You can apply LoRA to the conditioning layers of SD3 by specifying `emb_dims` in network_args. 
When specifying, be sure to specify 6 numbers in `[]` as a comma-separated list. + +example: +``` +--network_args "emb_dims=[2,3,4,5,6,7]" +``` + +Each number corresponds to `context_embedder`, `t_embedder`, `x_embedder`, `y_embedder`, `final_layer_adaLN_modulation`, `final_layer_linear`. The above example applies LoRA to all conditioning layers, with rank 2 for `context_embedder`, 3 for `t_embedder`, 4 for `context_embedder`, 5 for `y_embedder`, 6 for `final_layer_adaLN_modulation`, and 7 for `final_layer_linear`. + +If you specify `0`, LoRA will not be applied to that layer. For example, `[4,0,0,4,0,0]` applies LoRA only to `context_embedder` and `y_embedder`. + +#### Specify blocks to train in SD3 LoRA training + +You can specify the blocks to train in SD3 LoRA training by specifying `train_block_indices` in network_args. The indices are 0-based. The default (when omitted) is to train all blocks. The indices are specified as a list of integers or a range of integers, like `0,1,5,8` or `0,1,4-5,7`. + +The number of blocks depends on the model. The valid range is 0-(the number of blocks - 1). `all` is also available to train all blocks, `none` is also available to train no blocks. + +example: +``` +--network_args "train_block_indices=1,2,6-8" +``` + +### Inference for SD3 with LoRA model + +The inference script is also available. The script is `sd3_minimal_inference.py`. See `--help` for options. + +### SD3 fine-tuning + +Documentation is not available yet. Please refer to the FLUX.1 fine-tuning guide for now. The major difference are following: + +- `--clip_g` is also available for SD3 fine-tuning. +- `--timestep_sampling` `--discrete_flow_shift``--model_prediction_type` --guidance_scale` are not necessary for SD3 fine-tuning. +- Use `--vae` instead of `--ae` if necessary. __This option is not necessary for SD3.__ VAE is included in the standard SD3 model. +- `--disable_mmap_load_safetensors` is available. __This option significantly reduces the memory usage when loading models for Windows users.__ +- `--cpu_offload_checkpointing` is not available for SD3 fine-tuning. +- `--clip_l_dropout_rate`, `--clip_g_dropout_rate` and `--t5_dropout_rate` are available same as LoRA training. +- `--pos_emb_random_crop_rate` and `--enable_scaled_pos_embed` are available for SD3.5M fine-tuning. +- Training text encoders is available with `--train_text_encoder` option, similar to SDXL training. + - CLIP-L and G can be trained with `--train_text_encoder` option. Training T5XXL needs `--train_t5xxl` option. + - If you use the cached text encoder outputs for T5XXL with training CLIP-L and G, specify `--use_t5xxl_cache_only`. This option enables to use the cached text encoder outputs for T5XXL only. + - The learning rates for CLIP-L, CLIP-G and T5XXL can be specified separately. `--text_encoder_lr1`, `--text_encoder_lr2` and `--text_encoder_lr3` are available. + +### Extract LoRA from SD3 Models + +Not available yet. -__2024/7/27:__ +### Convert SD3 LoRA -Latents およびテキストエンコーダ出力のキャッシュの仕組みを大きくリファクタリングしました。SD3 用の既存のキャッシュファイルの再作成が必要になりますが、ご了承ください(以前のキャッシュファイルは削除してください)。これにより、特にデータセットの規模が大きい場合のデータセット初期化が大幅に高速化されます。 +Not available yet. -データセット (`train_util.py`) からアーキテクチャ依存の部分を切り出しました。これにより将来的なアーキテクチャ追加が容易になると期待しています。 +### Merge LoRA to SD3 checkpoint -SD1/2/SDXL のキャッシュ機構を含むアーキテクチャ依存の部分も切り出しました。sd3 ブランチの SD1/2/SDXL 学習について、基本的な動作は確認していますが、不具合があるかもしれません。SD1/2/SDXL の学習には main または dev ブランチをお使いください。 +Not available yet. 
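As a reader aid for the `emb_dims` option described earlier in this section, the following minimal Python sketch shows how a six-element list lines up with the conditioning layers. The helper `emb_dims_to_ranks` is purely illustrative and is not part of the training scripts.

```python
# Illustrative only: maps an emb_dims list to the conditioning layers named above.
# The order is fixed; a value of 0 means LoRA is not applied to that layer.
EMB_LAYER_NAMES = [
    "context_embedder",
    "t_embedder",
    "x_embedder",
    "y_embedder",
    "final_layer_adaLN_modulation",
    "final_layer_linear",
]


def emb_dims_to_ranks(emb_dims):
    if len(emb_dims) != len(EMB_LAYER_NAMES):
        raise ValueError("emb_dims must contain exactly 6 numbers")
    return {name: dim for name, dim in zip(EMB_LAYER_NAMES, emb_dims) if dim != 0}


# --network_args "emb_dims=[4,0,0,4,0,0]" -> LoRA only on context_embedder and y_embedder
print(emb_dims_to_ranks([4, 0, 0, 4, 0, 0]))  # {'context_embedder': 4, 'y_embedder': 4}
```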
--- From 830df4abcc85ffdfe08b8f97f2c8351c86149af3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 31 Oct 2024 21:39:07 +0900 Subject: [PATCH 212/582] Fix crashing if image is too tall or wide. --- library/sd3_models.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index 0eca94e2f..15a5b1db4 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -868,7 +868,7 @@ def enable_scaled_pos_embed(self, use_scaled_pos_embed: bool, latent_sizes: Opti self.use_scaled_pos_embed = use_scaled_pos_embed if self.use_scaled_pos_embed: - # # remove pos_embed to free up memory up to 0.4 GB + # remove pos_embed to free up memory up to 0.4 GB self.pos_embed = None # sort latent sizes in ascending order @@ -977,7 +977,7 @@ def cropped_pos_embed(self, h, w, device=None, random_crop: bool = False): # patched size h = (h + 1) // p w = (w + 1) // p - if self.pos_embed is None: + if self.pos_embed is None: # should not happen return get_2d_sincos_pos_embed_torch(self.hidden_size, w, h, device=device) assert self.pos_embed_max_size is not None assert h <= self.pos_embed_max_size, (h, self.pos_embed_max_size) @@ -1016,13 +1016,20 @@ def cropped_scaled_pos_embed(self, h, w, device=None, dtype=None, random_crop: b if patched_size is None: raise ValueError(f"Area {area} is too large for the given latent sizes {self.resolution_area_to_latent_size}.") - pos_embed_size = int(patched_size * MMDiT.POS_EMBED_MAX_RATIO) + pos_embed = self.resolution_pos_embeds[patched_size] + pos_embed_size = round(math.sqrt(pos_embed.shape[1])) if h > pos_embed_size or w > pos_embed_size: - # fallback to normal pos_embed + # # fallback to normal pos_embed + # return self.cropped_pos_embed(h * p, w * p, device=device, random_crop=random_crop) + # extend pos_embed size logger.warning( f"Using normal pos_embed for size {h}x{w} as it exceeds the scaled pos_embed size {pos_embed_size}. Image is too tall or wide." ) - return self.cropped_pos_embed(h, w, device=device, random_crop=random_crop) + pos_embed_size = max(h, w) + pos_embed = get_scaled_2d_sincos_pos_embed(self.hidden_size, pos_embed_size, sample_size=patched_size) + pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0) + self.resolution_pos_embeds[patched_size] = pos_embed + logger.info(f"Updated pos_embed for size {pos_embed_size}x{pos_embed_size}") if not random_crop: top = (pos_embed_size - h) // 2 @@ -1031,7 +1038,6 @@ def cropped_scaled_pos_embed(self, h, w, device=None, dtype=None, random_crop: b top = torch.randint(0, pos_embed_size - h + 1, (1,)).item() left = torch.randint(0, pos_embed_size - w + 1, (1,)).item() - pos_embed = self.resolution_pos_embeds[patched_size] if pos_embed.device != device: pos_embed = pos_embed.to(device) # which is better to update device, or transfer every time to device? -> 64x64 emb is 96*96*1536*4=56MB. It's okay to update device. From 9aa6f52ac3c1866d00675daf73c7560b8b76093f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 1 Nov 2024 21:43:21 +0900 Subject: [PATCH 213/582] Fix memory leak in latent caching. 
bmp failed to cache --- library/train_util.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index bd2ff6ef4..18d3cf6c2 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1082,6 +1082,10 @@ def submit_batch(batch, cond): info.image = info.image.result() # future to image caching_strategy.cache_batch_latents(model, batch, cond.flip_aug, cond.alpha_mask, cond.random_crop) + # remove image from memory + for info in batch: + info.image = None + # define ThreadPoolExecutor to load images in parallel max_workers = min(os.cpu_count(), len(image_infos)) max_workers = max(1, max_workers // num_processes) # consider multi-gpu @@ -1397,7 +1401,17 @@ def cache_text_encoder_outputs_common( ) def get_image_size(self, image_path): - return imagesize.get(image_path) + # return imagesize.get(image_path) + image_size = imagesize.get(image_path) + if image_size[0] <= 0: + # imagesize doesn't work for some images, so use cv2 + img = cv2.imread(image_path) + if img is not None: + image_size = (img.shape[1], img.shape[0]) + else: + logger.warning(f"failed to get image size: {image_path}") + image_size = (0, 0) + return image_size def load_image_with_face_info(self, subset: BaseSubset, image_path: str, alpha_mask=False): img = load_image(image_path, alpha_mask) From 82daa98fe865c30a34638acc145d6f4ea8c193db Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 1 Nov 2024 21:43:47 +0900 Subject: [PATCH 214/582] remove duplicate resolution for scaled pos embed --- library/sd3_models.py | 3 ++- sd3_train.py | 1 + sd3_train_network.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index 15a5b1db4..b09a57dbd 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -871,7 +871,8 @@ def enable_scaled_pos_embed(self, use_scaled_pos_embed: bool, latent_sizes: Opti # remove pos_embed to free up memory up to 0.4 GB self.pos_embed = None - # sort latent sizes in ascending order + # remove duplcates and sort latent sizes in ascending order + latent_sizes = list(set(latent_sizes)) latent_sizes = sorted(latent_sizes) patched_sizes = [latent_size // self.patch_size for latent_size in latent_sizes] diff --git a/sd3_train.py b/sd3_train.py index 40f8c7e1f..f64e2da2c 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -366,6 +366,7 @@ def train(args): if args.enable_scaled_pos_embed: resolutions = train_dataset_group.get_resolutions() latent_sizes = [round(math.sqrt(res[0] * res[1])) // 8 for res in resolutions] # 8 is stride for latent + latent_sizes = list(set(latent_sizes)) # remove duplicates logger.info(f"Prepare scaled positional embeddings for resolutions: {resolutions}, sizes: {latent_sizes}") mmdit.enable_scaled_pos_embed(True, latent_sizes) diff --git a/sd3_train_network.py b/sd3_train_network.py index 9eeac05ca..0739e094d 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -73,6 +73,7 @@ def load_target_model(self, args, weight_dtype, accelerator): # set resolutions for positional embeddings if args.enable_scaled_pos_embed: latent_sizes = [round(math.sqrt(res[0] * res[1])) // 8 for res in self.resolutions] # 8 is stride for latent + latent_sizes = list(set(latent_sizes)) # remove duplicates logger.info(f"Prepare scaled positional embeddings for resolutions: {self.resolutions}, sizes: {latent_sizes}") mmdit.enable_scaled_pos_embed(True, latent_sizes) From e0db59695fb56e6b7f42132b70e4f828820143ac Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 2 
Nov 2024 11:13:04 +0900 Subject: [PATCH 215/582] update multi-res training in SD3.5M --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aff78b2c6..fb087c234 100644 --- a/README.md +++ b/README.md @@ -679,12 +679,16 @@ Other options are described below. 5. Multi-resolution Training Support: - Only for SD3.5M. - Same as FLUX.1 for data preparation. - - If you train with multiple resolutions, specify `--enable_scaled_pos_embed` to enable the scaled positional embeddings. The default is False. This option is an experimental feature for SD3.5M. + - If you train with multiple resolutions, you can enable the scaled positional embeddings with `--enable_scaled_pos_embed`. The default is False. __This option is an experimental feature.__ + + Technical details of multi-resolution training for SD3.5M: -The values of the positional embeddings must be the same for each resolution. That is, the same value must be in the same position for 512x512, 768x768, and 1024x1024. To achieve this, the positional embeddings for each resolution are calculated in advance and switched according to the resolution of the training data. This feature is enabled by `--enable_scaled_pos_embed`. +SD3.5M does not use scaled positional embeddings for multi-resolution training, and is trained with a single positional embedding. Therefore, this feature is very experimental. + +Generally, in multi-resolution training, the values of the positional embeddings must be the same for each resolution. That is, the same value must be in the same position for 512x512, 768x768, and 1024x1024. To achieve this, the positional embeddings for each resolution are calculated in advance and switched according to the resolution of the training data. This feature is enabled by `--enable_scaled_pos_embed`. This idea and the code for calculating scaled positional embeddings are contributed by KohakuBlueleaf. Thanks to KohakuBlueleaf! 
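For reference, a minimal sketch of how the scaled positional embeddings are selected in the patches above: dataset resolutions are reduced to square-equivalent latent sizes (VAE stride 8), duplicates are removed, and one embedding table is pre-computed per size. This is a simplified illustration of the logic in `sd3_train.py` / `sd3_train_network.py`, not the exact code.

```python
import math

def resolutions_to_latent_sizes(resolutions):
    # (width, height) in pixels -> square-equivalent latent size, assuming stride 8
    latent_sizes = [round(math.sqrt(w * h)) // 8 for (w, h) in resolutions]
    return sorted(set(latent_sizes))  # remove duplicates, ascending order

sizes = resolutions_to_latent_sizes([(512, 512), (768, 768), (1024, 1024)])
print(sizes)  # [64, 96, 128]

# In the training scripts these sizes are then passed to
# mmdit.enable_scaled_pos_embed(True, latent_sizes), which pre-computes one
# scaled sinusoidal positional embedding per size and switches between them
# according to the resolution of each training batch.
```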
From 5e32ee26a13394fdee77149c4e96b78c58eabc5e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 2 Nov 2024 15:32:16 +0900 Subject: [PATCH 216/582] fix crashing in DDP training closes #1751 --- sd3_train.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/sd3_train.py b/sd3_train.py index f64e2da2c..e03d1708b 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -838,11 +838,31 @@ def optimizer_hook(parameter: torch.Tensor): accelerator.log({}, step=0) # show model device and dtype - logger.info(f"mmdit device: {mmdit.device}, dtype: {mmdit.dtype}" if mmdit else "mmdit is None") - logger.info(f"clip_l device: {clip_l.device}, dtype: {clip_l.dtype}" if clip_l else "clip_l is None") - logger.info(f"clip_g device: {clip_g.device}, dtype: {clip_g.dtype}" if clip_g else "clip_g is None") - logger.info(f"t5xxl device: {t5xxl.device}, dtype: {t5xxl.dtype}" if t5xxl else "t5xxl is None") - logger.info(f"vae device: {vae.device}, dtype: {vae.dtype}" if vae is not None else "vae is None") + logger.info( + f"mmdit device: {accelerator.unwrap_model(mmdit).device}, dtype: {accelerator.unwrap_model(mmdit).dtype}" + if mmdit + else "mmdit is None" + ) + logger.info( + f"clip_l device: {accelerator.unwrap_model(clip_l).device}, dtype: {accelerator.unwrap_model(clip_l).dtype}" + if clip_l + else "clip_l is None" + ) + logger.info( + f"clip_g device: {accelerator.unwrap_model(clip_g).device}, dtype: {accelerator.unwrap_model(clip_g).dtype}" + if clip_g + else "clip_g is None" + ) + logger.info( + f"t5xxl device: {accelerator.unwrap_model(t5xxl).device}, dtype: {accelerator.unwrap_model(t5xxl).dtype}" + if t5xxl + else "t5xxl is None" + ) + logger.info( + f"vae device: {accelerator.unwrap_model(vae).device}, dtype: {accelerator.unwrap_model(vae).dtype}" + if vae is not None + else "vae is None" + ) loss_recorder = train_util.LossRecorder() epoch = 0 # avoid error when max_train_steps is 0 From 81c0c965a24ce4f0f86dfa980f803d7616ca46d8 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 5 Nov 2024 21:22:42 +0900 Subject: [PATCH 217/582] faster block swap --- flux_train.py | 107 ++++++++++---------- library/flux_models.py | 138 ++++++++++++++----------- library/utils.py | 222 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 352 insertions(+), 115 deletions(-) diff --git a/flux_train.py b/flux_train.py index 79c44d7b4..afddc897f 100644 --- a/flux_train.py +++ b/flux_train.py @@ -17,12 +17,14 @@ import os from multiprocessing import Value import time -from typing import List +from typing import List, Optional, Tuple, Union import toml from tqdm import tqdm import torch +import torch.nn as nn +from library import utils from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -466,45 +468,28 @@ def train(args): # memory efficient block swapping - def get_block_unit(dbl_blocks, sgl_blocks, index: int): - if index < len(dbl_blocks): - return (dbl_blocks[index],) - else: - index -= len(dbl_blocks) - index *= 2 - return (sgl_blocks[index], sgl_blocks[index + 1]) - - def submit_move_blocks(futures, thread_pool, block_idx_to_cpu, block_idx_to_cuda, dbl_blocks, sgl_blocks, device): - def move_blocks(bidx_to_cpu, blocks_to_cpu, bidx_to_cuda, blocks_to_cuda, dvc): - # print(f"Backward: Move block {bidx_to_cpu} to CPU") - for block in blocks_to_cpu: - block = block.to("cpu", non_blocking=True) - torch.cuda.empty_cache() - - # print(f"Backward: Move block {bidx_to_cuda} to CUDA") - for block in blocks_to_cuda: - block = block.to(dvc, non_blocking=True) 
- - torch.cuda.synchronize() - # print(f"Backward: Moved blocks {bidx_to_cpu} and {bidx_to_cuda}") - return bidx_to_cpu, bidx_to_cuda - - blocks_to_cpu = get_block_unit(dbl_blocks, sgl_blocks, block_idx_to_cpu) - blocks_to_cuda = get_block_unit(dbl_blocks, sgl_blocks, block_idx_to_cuda) - - futures[block_idx_to_cuda] = thread_pool.submit( - move_blocks, block_idx_to_cpu, blocks_to_cpu, block_idx_to_cuda, blocks_to_cuda, device - ) + def submit_move_blocks(futures, thread_pool, block_idx_to_cpu, block_idx_to_cuda, blocks, block_id): + def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): + # start_time = time.perf_counter() + # print(f"Backward: Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to CUDA") + utils.swap_weight_devices(block_to_cpu, block_to_cuda) + # print(f"Backward: Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter()-start_time:.2f}s") + return bidx_to_cpu, bidx_to_cuda # , event + + block_to_cpu = blocks[block_idx_to_cpu] + block_to_cuda = blocks[block_idx_to_cuda] + + futures[block_id] = thread_pool.submit(move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda) - def wait_blocks_move(block_idx, futures): - if block_idx not in futures: + def wait_blocks_move(block_id, futures): + if block_id not in futures: return - # print(f"Backward: Wait for block {block_idx}") + # print(f"Backward: Wait for block {block_id}") # start_time = time.perf_counter() - future = futures.pop(block_idx) - future.result() - # print(f"Backward: Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s") - # torch.cuda.synchronize() + future = futures.pop(block_id) + _, bidx_to_cuda = future.result() + assert block_id[1] == bidx_to_cuda, f"Block index mismatch: {block_id[1]} != {bidx_to_cuda}" + # print(f"Backward: Waited for block {block_id}: {time.perf_counter()-start_time:.2f}s") # print(f"Backward: Synchronized: {time.perf_counter()-start_time:.2f}s") if args.fused_backward_pass: @@ -513,11 +498,11 @@ def wait_blocks_move(block_idx, futures): library.adafactor_fused.patch_adafactor_fused(optimizer) - blocks_to_swap = args.blocks_to_swap + double_blocks_to_swap = args.blocks_to_swap // 2 + single_blocks_to_swap = (args.blocks_to_swap - double_blocks_to_swap) * 2 num_double_blocks = len(accelerator.unwrap_model(flux).double_blocks) num_single_blocks = len(accelerator.unwrap_model(flux).single_blocks) - num_block_units = num_double_blocks + num_single_blocks // 2 - handled_unit_indices = set() + handled_block_ids = set() n = 1 # only asynchronous purpose, no need to increase this number # n = 2 @@ -530,28 +515,37 @@ def wait_blocks_move(block_idx, futures): if parameter.requires_grad: grad_hook = None - if blocks_to_swap: + if double_blocks_to_swap > 0 or single_blocks_to_swap > 0: is_double = param_name.startswith("double_blocks") is_single = param_name.startswith("single_blocks") - if is_double or is_single: + if is_double and double_blocks_to_swap > 0 or is_single and single_blocks_to_swap > 0: block_idx = int(param_name.split(".")[1]) - unit_idx = block_idx if is_double else num_double_blocks + block_idx // 2 - if unit_idx not in handled_unit_indices: + block_id = (is_double, block_idx) # double or single, block index + if block_id not in handled_block_ids: # swap following (already backpropagated) block - handled_unit_indices.add(unit_idx) + handled_block_ids.add(block_id) # if n blocks were already backpropagated - num_blocks_propagated = num_block_units - unit_idx - 1 + if is_double: + num_blocks = num_double_blocks 
+ blocks_to_swap = double_blocks_to_swap + else: + num_blocks = num_single_blocks + blocks_to_swap = single_blocks_to_swap + + # -1 for 0-based index, -1 for current block is not fully backpropagated yet + num_blocks_propagated = num_blocks - block_idx - 2 swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap - waiting = unit_idx > 0 and unit_idx <= blocks_to_swap + waiting = block_idx > 0 and block_idx <= blocks_to_swap + if swapping or waiting: - block_idx_to_cpu = num_block_units - num_blocks_propagated + block_idx_to_cpu = num_blocks - num_blocks_propagated block_idx_to_cuda = blocks_to_swap - num_blocks_propagated - block_idx_to_wait = unit_idx - 1 + block_idx_to_wait = block_idx - 1 # create swap hook def create_swap_grad_hook( - bidx_to_cpu, bidx_to_cuda, bidx_to_wait, uidx: int, swpng: bool, wtng: bool + is_dbl, bidx_to_cpu, bidx_to_cuda, bidx_to_wait, swpng: bool, wtng: bool ): def __grad_hook(tensor: torch.Tensor): if accelerator.sync_gradients and args.max_grad_norm != 0.0: @@ -559,24 +553,25 @@ def __grad_hook(tensor: torch.Tensor): optimizer.step_param(tensor, param_group) tensor.grad = None - # print(f"Backward: {uidx}, {swpng}, {wtng}") + # print( + # f"Backward: Block {is_dbl}, {bidx_to_cpu}, {bidx_to_cuda}, {bidx_to_wait}, {swpng}, {wtng}" + # ) if swpng: submit_move_blocks( futures, thread_pool, bidx_to_cpu, bidx_to_cuda, - flux.double_blocks, - flux.single_blocks, - accelerator.device, + flux.double_blocks if is_dbl else flux.single_blocks, + (is_dbl, bidx_to_cuda), # wait for this block ) if wtng: - wait_blocks_move(bidx_to_wait, futures) + wait_blocks_move((is_dbl, bidx_to_wait), futures) return __grad_hook grad_hook = create_swap_grad_hook( - block_idx_to_cpu, block_idx_to_cuda, block_idx_to_wait, unit_idx, swapping, waiting + is_double, block_idx_to_cpu, block_idx_to_cuda, block_idx_to_wait, swapping, waiting ) if grad_hook is None: diff --git a/library/flux_models.py b/library/flux_models.py index 0bc1c02b9..48dea4fc9 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -7,8 +7,9 @@ import math import os import time -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union +from library import utils from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -923,7 +924,8 @@ def __init__(self, params: FluxParams): self.blocks_to_swap = None self.thread_pool: Optional[ThreadPoolExecutor] = None - self.num_block_units = len(self.double_blocks) + len(self.single_blocks) // 2 + self.num_double_blocks = len(self.double_blocks) + self.num_single_blocks = len(self.single_blocks) @property def device(self): @@ -963,14 +965,17 @@ def disable_gradient_checkpointing(self): def enable_block_swap(self, num_blocks: int): self.blocks_to_swap = num_blocks + self.double_blocks_to_swap = num_blocks // 2 + self.single_blocks_to_swap = (num_blocks - self.double_blocks_to_swap) * 2 + print( + f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {self.double_blocks_to_swap}, single blocks: {self.single_blocks_to_swap}." + ) n = 1 # async block swap. 1 is enough - # n = 2 - # n = max(1, os.cpu_count() // 2) self.thread_pool = ThreadPoolExecutor(max_workers=n) def move_to_device_except_swap_blocks(self, device: torch.device): - # assume model is on cpu + # assume model is on cpu. 
do not move blocks to device to reduce temporary memory usage if self.blocks_to_swap: save_double_blocks = self.double_blocks save_single_blocks = self.single_blocks @@ -983,31 +988,55 @@ def move_to_device_except_swap_blocks(self, device: torch.device): self.double_blocks = save_double_blocks self.single_blocks = save_single_blocks - def get_block_unit(self, index: int): - if index < len(self.double_blocks): - return (self.double_blocks[index],) - else: - index -= len(self.double_blocks) - index *= 2 - return self.single_blocks[index], self.single_blocks[index + 1] + # def get_block_unit(self, index: int): + # if index < len(self.double_blocks): + # return (self.double_blocks[index],) + # else: + # index -= len(self.double_blocks) + # index *= 2 + # return self.single_blocks[index], self.single_blocks[index + 1] - def get_unit_index(self, is_double: bool, index: int): - if is_double: - return index - else: - return len(self.double_blocks) + index // 2 + # def get_unit_index(self, is_double: bool, index: int): + # if is_double: + # return index + # else: + # return len(self.double_blocks) + index // 2 def prepare_block_swap_before_forward(self): - # make: first n blocks are on cuda, and last n blocks are on cpu + # # make: first n blocks are on cuda, and last n blocks are on cpu + # if self.blocks_to_swap is None or self.blocks_to_swap == 0: + # # raise ValueError("Block swap is not enabled.") + # return + # for i in range(self.num_block_units - self.blocks_to_swap): + # for b in self.get_block_unit(i): + # b.to(self.device) + # for i in range(self.num_block_units - self.blocks_to_swap, self.num_block_units): + # for b in self.get_block_unit(i): + # b.to("cpu") + # clean_memory_on_device(self.device) + + # all blocks are on device, but some weights are on cpu + # make first n blocks weights on device, and last n blocks weights on cpu if self.blocks_to_swap is None or self.blocks_to_swap == 0: # raise ValueError("Block swap is not enabled.") return - for i in range(self.num_block_units - self.blocks_to_swap): - for b in self.get_block_unit(i): - b.to(self.device) - for i in range(self.num_block_units - self.blocks_to_swap, self.num_block_units): - for b in self.get_block_unit(i): - b.to("cpu") + + for b in self.double_blocks[0 : self.num_double_blocks - self.double_blocks_to_swap]: + b.to(self.device) + utils.weighs_to_device(b, self.device) # make sure weights are on device + for b in self.double_blocks[self.num_double_blocks - self.double_blocks_to_swap :]: + b.to(self.device) # move block to device first + utils.weighs_to_device(b, "cpu") # make sure weights are on cpu + torch.cuda.synchronize() + clean_memory_on_device(self.device) + + for b in self.single_blocks[0 : self.num_single_blocks - self.single_blocks_to_swap]: + b.to(self.device) + utils.weighs_to_device(b, self.device) # make sure weights are on device + for b in self.single_blocks[self.num_single_blocks - self.single_blocks_to_swap :]: + b.to(self.device) # move block to device first + utils.weighs_to_device(b, "cpu") # make sure weights are on cpu + torch.cuda.synchronize() clean_memory_on_device(self.device) def forward( @@ -1044,27 +1073,22 @@ def forward( for block in self.single_blocks: img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) else: - futures = {} - - def submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda): - def move_blocks(bidx_to_cpu, blocks_to_cpu, bidx_to_cuda, blocks_to_cuda): - # print(f"Moving {bidx_to_cpu} to cpu.") - for block in blocks_to_cpu: - block.to("cpu", 
non_blocking=True) - torch.cuda.empty_cache() + # device = self.device - # print(f"Moving {bidx_to_cuda} to cuda.") - for block in blocks_to_cuda: - block.to(self.device, non_blocking=True) - - torch.cuda.synchronize() + def submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda): + def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): + start_time = time.perf_counter() + # print(f"Moving {bidx_to_cpu} to cpu and {bidx_to_cuda} to cuda.") + utils.swap_weight_devices(block_to_cpu, block_to_cuda) # print(f"Block move done. {bidx_to_cpu} to cpu, {bidx_to_cuda} to cuda.") - return block_idx_to_cpu, block_idx_to_cuda - blocks_to_cpu = self.get_block_unit(block_idx_to_cpu) - blocks_to_cuda = self.get_block_unit(block_idx_to_cuda) + # print(f"Move blocks took {time.perf_counter() - start_time:.2f} seconds") + return block_idx_to_cpu, block_idx_to_cuda # , event + + block_to_cpu = blocks[block_idx_to_cpu] + block_to_cuda = blocks[block_idx_to_cuda] # print(f"Submit move blocks. {block_idx_to_cpu} to cpu, {block_idx_to_cuda} to cuda.") - return self.thread_pool.submit(move_blocks, block_idx_to_cpu, blocks_to_cpu, block_idx_to_cuda, blocks_to_cuda) + return self.thread_pool.submit(move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda) def wait_for_blocks_move(block_idx, ftrs): if block_idx not in ftrs: @@ -1073,37 +1097,35 @@ def wait_for_blocks_move(block_idx, ftrs): # start_time = time.perf_counter() ftr = ftrs.pop(block_idx) ftr.result() - # torch.cuda.synchronize() - # print(f"Move blocks took {time.perf_counter() - start_time:.2f} seconds") + # print(f"{block_idx} move blocks took {time.perf_counter() - start_time:.2f} seconds") + double_futures = {} for block_idx, block in enumerate(self.double_blocks): # print(f"Double block {block_idx}") - unit_idx = self.get_unit_index(is_double=True, index=block_idx) - wait_for_blocks_move(unit_idx, futures) + wait_for_blocks_move(block_idx, double_futures) img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if unit_idx < self.blocks_to_swap: - block_idx_to_cpu = unit_idx - block_idx_to_cuda = self.num_block_units - self.blocks_to_swap + unit_idx - future = submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda) - futures[block_idx_to_cuda] = future + if block_idx < self.double_blocks_to_swap: + block_idx_to_cpu = block_idx + block_idx_to_cuda = self.num_double_blocks - self.double_blocks_to_swap + block_idx + future = submit_move_blocks(self.double_blocks, block_idx_to_cpu, block_idx_to_cuda) + double_futures[block_idx_to_cuda] = future img = torch.cat((txt, img), 1) + single_futures = {} for block_idx, block in enumerate(self.single_blocks): # print(f"Single block {block_idx}") - unit_idx = self.get_unit_index(is_double=False, index=block_idx) - if block_idx % 2 == 0: - wait_for_blocks_move(unit_idx, futures) + wait_for_blocks_move(block_idx, single_futures) img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if block_idx % 2 == 1 and unit_idx < self.blocks_to_swap: - block_idx_to_cpu = unit_idx - block_idx_to_cuda = self.num_block_units - self.blocks_to_swap + unit_idx - future = submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda) - futures[block_idx_to_cuda] = future + if block_idx < self.single_blocks_to_swap: + block_idx_to_cpu = block_idx + block_idx_to_cuda = self.num_single_blocks - self.blocks_to_swap + block_idx + future = submit_move_blocks(self.single_blocks, block_idx_to_cpu, block_idx_to_cuda) + 
single_futures[block_idx_to_cuda] = future img = img[:, txt.shape[1] :, ...] diff --git a/library/utils.py b/library/utils.py index ca0f904d2..aed510074 100644 --- a/library/utils.py +++ b/library/utils.py @@ -6,6 +6,7 @@ import struct import torch +import torch.nn as nn from torchvision import transforms from diffusers import EulerAncestralDiscreteScheduler import diffusers.schedulers.scheduling_euler_ancestral_discrete @@ -93,6 +94,225 @@ def setup_logging(args=None, log_level=None, reset=False): # region PyTorch utils +# def swap_weights(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): +# assert layer_to_cpu.__class__ == layer_to_cuda.__class__ +# for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): +# if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: +# # print(f"Swapping {layer_to_cpu.__class__.__name__}-{module_to_cpu.__class__.__name__}.") +# # cpu_tensor = module_to_cuda.weight.data +# # cuda_tensor = module_to_cpu.weight.data +# # assert cuda_tensor.device.type == "cuda" +# # temp_cpu_tensor = cuda_tensor.to("cpu", non_blocking=True) +# # torch.cuda.current_stream().synchronize() +# # cuda_tensor.copy_(cpu_tensor, non_blocking=True) +# # torch.cuda.current_stream().synchronize() +# # cpu_tensor.copy_(temp_cpu_tensor, non_blocking=True) +# # module_to_cpu.weight.data, module_to_cuda.weight.data = cpu_tensor, cuda_tensor +# cuda_tensor_view = module_to_cpu.weight.data +# cpu_tensor_view = module_to_cuda.weight.data +# module_to_cpu.weight.data = module_to_cpu.weight.to("cpu", non_blocking=True).detach().clone() +# module_to_cuda.weight.data = cuda_tensor_view +# module_to_cuda.weight.data.copy_(cpu_tensor_view) + + +def swap_weight_devices(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): + assert layer_to_cpu.__class__ == layer_to_cuda.__class__ + + weight_swap_jobs = [] + for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): + if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: + weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + + torch.cuda.current_stream().synchronize() # this prevents the illegal loss value + + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + # cuda to cpu + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: + cuda_data_view.record_stream(stream) + module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True) + + stream.synchronize() + + # cpu to cuda + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: + cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) + module_to_cuda.weight.data = cuda_data_view + + stream.synchronize() + torch.cuda.current_stream().synchronize() # this prevents the illegal loss value + + +def swap_weight_devices_2st(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): + assert layer_to_cpu.__class__ == layer_to_cuda.__class__ + + weight_swap_jobs = [] + for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): + if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: + weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + + stream_to_cpu = torch.cuda.Stream() + stream_to_cuda = torch.cuda.Stream() + + events = [] + with torch.cuda.stream(stream_to_cpu): + # cuda to offload + offloaded_weights = [] + for module_to_cpu, 
module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: + offloaded_weights.append(cuda_data_view.to("cpu", non_blocking=True)) + event = torch.cuda.Event() + event.record(stream=stream_to_cpu) + events.append(event) + + with torch.cuda.stream(stream_to_cuda): + # cpu to cuda + for (module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view), event in zip(weight_swap_jobs, events): + event.synchronize() + cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) + module_to_cuda.weight.data = cuda_data_view + + # offload to cpu + for (module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view), offloaded_weight in zip( + weight_swap_jobs, offloaded_weights + ): + module_to_cpu.weight.data = offloaded_weight + + stream_to_cuda.synchronize() + + torch.cuda.current_stream().synchronize() # this prevents the illegal loss value + + +def swap_weight_devices_failed(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): + assert layer_to_cpu.__class__ == layer_to_cuda.__class__ + + weight_swap_jobs = [] + for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): + if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: + weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + + stream_to_cpu = torch.cuda.Stream() + stream_to_cuda = torch.cuda.Stream() + + # cuda to offload + events = [] + with torch.cuda.stream(stream_to_cpu): + offloaded_weights = [] + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: + cuda_data_view.record_stream(stream_to_cpu) + offloaded_weights.append(cuda_data_view.to("cpu", non_blocking=True)) + + event = torch.cuda.Event() + event.record(stream=stream_to_cpu) + events.append(event) + + # cpu to cuda + with torch.cuda.stream(stream_to_cuda): + for (module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view), event, offloaded_weight in zip( + weight_swap_jobs, events, offloaded_weights + ): + event.synchronize() + cuda_data_view.record_stream(stream_to_cuda) + cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) + module_to_cuda.weight.data = cuda_data_view + + module_to_cpu.weight.data = offloaded_weight + + stream_to_cuda.synchronize() + + torch.cuda.current_stream().synchronize() # this prevents the illegal loss value + # torch.cuda.current_stream().wait_stream(stream_to_cuda) + # for job in weight_swap_jobs: + # job[2].record_stream(torch.cuda.current_stream()) # record the ownership of the tensor + + +def swap_weight_devices_works_2(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): + assert layer_to_cpu.__class__ == layer_to_cuda.__class__ + + weight_swap_jobs = [] + for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): + if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: + if not (hasattr(module_to_cpu, "offloaded_weight") or hasattr(module_to_cuda, "offloaded_weight")): + # one of the modules must have the tensor to offload + module_to_cpu.offloaded_weight = torch.zeros_like(module_to_cpu.weight.data, device="cpu") + module_to_cpu.offloaded_weight.pin_memory() + offloaded_weight = ( + module_to_cpu.offloaded_weight if hasattr(module_to_cpu, "offloaded_weight") else module_to_cuda.offloaded_weight + ) + assert module_to_cpu.weight.device.type == "cuda" and module_to_cuda.weight.device.type == "cpu" + weight_swap_jobs.append( + (module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data, 
offloaded_weight) + ) + + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + # cuda to offload + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view, offloaded_weight in weight_swap_jobs: + cuda_data_view.record_stream(stream) + offloaded_weight.copy_(module_to_cpu.weight.data, non_blocking=True) + + stream.synchronize() + + # cpu to cuda + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view, offloaded_weight in weight_swap_jobs: + cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) + module_to_cuda.weight.data = cuda_data_view + + # offload to cpu + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view, offloaded_weight in weight_swap_jobs: + module_to_cpu.weight.data = offloaded_weight + offloaded_weight = cpu_data_view + module_to_cpu.offloaded_weight = offloaded_weight + module_to_cuda.offloaded_weight = offloaded_weight + + stream.synchronize() + + torch.cuda.current_stream().synchronize() # this prevents the illegal loss value + + +def swap_weight_devices_safe_works(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): + assert layer_to_cpu.__class__ == layer_to_cuda.__class__ + + weight_swap_jobs = [] + for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): + if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: + if not (hasattr(module_to_cpu, "__cached_cpu_weight") or hasattr(module_to_cuda, "__cached_cuda_weight")): + # one of the modules must have the tensor to cache + module_to_cpu.__cached_cpu_weight = torch.zeros_like(module_to_cpu.weight.data, device="cpu") + module_to_cpu.__cached_cpu_weight.pin_memory() + + weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + + for module_to_cpu, module_to_cuda, cuda_tensor_view, cpu_tensor_view in weight_swap_jobs: + module_to_cpu.weight.data = cuda_tensor_view.to("cpu", non_blocking=True) + module_to_cuda.weight.data = cpu_tensor_view.to("cuda", non_blocking=True) + + torch.cuda.current_stream().synchronize() # wait for the copy from cache to cpu to finish + torch.cuda.empty_cache() + + +# def swap_weight_devices(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): +# assert layer_to_cpu.__class__ == layer_to_cuda.__class__ +# for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): +# if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: +# assert module_to_cuda.weight.device.type == "cpu" and module_to_cpu.weight.device.type == "cuda" +# weight_on_cuda = module_to_cpu.weight +# weight_on_cpu = module_to_cuda.weight +# cuda_to_cpu_data = weight_on_cuda.data.to("cpu", non_blocking=True) +# event = torch.cuda.current_stream().record_event() +# event.synchronize() +# weight_on_cuda.data.copy_(weight_on_cpu.data, non_blocking=True) +# weight_on_cpu.data = cuda_to_cpu_data +# weight_on_cpu.grad, weight_on_cuda.grad = weight_on_cuda.grad, weight_on_cpu.grad + +# module_to_cpu.weight = weight_on_cpu +# module_to_cuda.weight = weight_on_cuda + + +def weighs_to_device(layer: nn.Module, device: torch.device): + for module in layer.modules(): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data = module.weight.data.to(device, non_blocking=True) + def str_to_dtype(s: Optional[str], default_dtype: Optional[torch.dtype] = None) -> torch.dtype: """ @@ -313,6 +533,7 @@ def _convert_float8(byte_tensor, dtype_str, shape): # return byte_tensor.view(torch.uint8).to(torch.float16).reshape(shape) 
raise ValueError(f"Unsupported float8 type: {dtype_str} (upgrade PyTorch to support float8 types)") + def load_safetensors( path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = torch.float32 ) -> dict[str, torch.Tensor]: @@ -336,7 +557,6 @@ def load_safetensors( return state_dict - # endregion # region Image utils From aab943cea3eb8a91041c857771f1642581133608 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 5 Nov 2024 23:27:41 +0900 Subject: [PATCH 218/582] remove unused weight swapping functions from utils.py --- library/utils.py | 185 ----------------------------------------------- 1 file changed, 185 deletions(-) diff --git a/library/utils.py b/library/utils.py index aed510074..07079c6d9 100644 --- a/library/utils.py +++ b/library/utils.py @@ -94,26 +94,6 @@ def setup_logging(args=None, log_level=None, reset=False): # region PyTorch utils -# def swap_weights(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): -# assert layer_to_cpu.__class__ == layer_to_cuda.__class__ -# for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): -# if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: -# # print(f"Swapping {layer_to_cpu.__class__.__name__}-{module_to_cpu.__class__.__name__}.") -# # cpu_tensor = module_to_cuda.weight.data -# # cuda_tensor = module_to_cpu.weight.data -# # assert cuda_tensor.device.type == "cuda" -# # temp_cpu_tensor = cuda_tensor.to("cpu", non_blocking=True) -# # torch.cuda.current_stream().synchronize() -# # cuda_tensor.copy_(cpu_tensor, non_blocking=True) -# # torch.cuda.current_stream().synchronize() -# # cpu_tensor.copy_(temp_cpu_tensor, non_blocking=True) -# # module_to_cpu.weight.data, module_to_cuda.weight.data = cpu_tensor, cuda_tensor -# cuda_tensor_view = module_to_cpu.weight.data -# cpu_tensor_view = module_to_cuda.weight.data -# module_to_cpu.weight.data = module_to_cpu.weight.to("cpu", non_blocking=True).detach().clone() -# module_to_cuda.weight.data = cuda_tensor_view -# module_to_cuda.weight.data.copy_(cpu_tensor_view) - def swap_weight_devices(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): assert layer_to_cpu.__class__ == layer_to_cuda.__class__ @@ -143,171 +123,6 @@ def swap_weight_devices(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): torch.cuda.current_stream().synchronize() # this prevents the illegal loss value -def swap_weight_devices_2st(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): - assert layer_to_cpu.__class__ == layer_to_cuda.__class__ - - weight_swap_jobs = [] - for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): - if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: - weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) - - stream_to_cpu = torch.cuda.Stream() - stream_to_cuda = torch.cuda.Stream() - - events = [] - with torch.cuda.stream(stream_to_cpu): - # cuda to offload - offloaded_weights = [] - for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: - offloaded_weights.append(cuda_data_view.to("cpu", non_blocking=True)) - event = torch.cuda.Event() - event.record(stream=stream_to_cpu) - events.append(event) - - with torch.cuda.stream(stream_to_cuda): - # cpu to cuda - for (module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view), event in zip(weight_swap_jobs, events): - event.synchronize() - cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) - 
module_to_cuda.weight.data = cuda_data_view - - # offload to cpu - for (module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view), offloaded_weight in zip( - weight_swap_jobs, offloaded_weights - ): - module_to_cpu.weight.data = offloaded_weight - - stream_to_cuda.synchronize() - - torch.cuda.current_stream().synchronize() # this prevents the illegal loss value - - -def swap_weight_devices_failed(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): - assert layer_to_cpu.__class__ == layer_to_cuda.__class__ - - weight_swap_jobs = [] - for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): - if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: - weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) - - stream_to_cpu = torch.cuda.Stream() - stream_to_cuda = torch.cuda.Stream() - - # cuda to offload - events = [] - with torch.cuda.stream(stream_to_cpu): - offloaded_weights = [] - for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: - cuda_data_view.record_stream(stream_to_cpu) - offloaded_weights.append(cuda_data_view.to("cpu", non_blocking=True)) - - event = torch.cuda.Event() - event.record(stream=stream_to_cpu) - events.append(event) - - # cpu to cuda - with torch.cuda.stream(stream_to_cuda): - for (module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view), event, offloaded_weight in zip( - weight_swap_jobs, events, offloaded_weights - ): - event.synchronize() - cuda_data_view.record_stream(stream_to_cuda) - cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) - module_to_cuda.weight.data = cuda_data_view - - module_to_cpu.weight.data = offloaded_weight - - stream_to_cuda.synchronize() - - torch.cuda.current_stream().synchronize() # this prevents the illegal loss value - # torch.cuda.current_stream().wait_stream(stream_to_cuda) - # for job in weight_swap_jobs: - # job[2].record_stream(torch.cuda.current_stream()) # record the ownership of the tensor - - -def swap_weight_devices_works_2(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): - assert layer_to_cpu.__class__ == layer_to_cuda.__class__ - - weight_swap_jobs = [] - for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): - if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: - if not (hasattr(module_to_cpu, "offloaded_weight") or hasattr(module_to_cuda, "offloaded_weight")): - # one of the modules must have the tensor to offload - module_to_cpu.offloaded_weight = torch.zeros_like(module_to_cpu.weight.data, device="cpu") - module_to_cpu.offloaded_weight.pin_memory() - offloaded_weight = ( - module_to_cpu.offloaded_weight if hasattr(module_to_cpu, "offloaded_weight") else module_to_cuda.offloaded_weight - ) - assert module_to_cpu.weight.device.type == "cuda" and module_to_cuda.weight.device.type == "cpu" - weight_swap_jobs.append( - (module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data, offloaded_weight) - ) - - stream = torch.cuda.Stream() - with torch.cuda.stream(stream): - # cuda to offload - for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view, offloaded_weight in weight_swap_jobs: - cuda_data_view.record_stream(stream) - offloaded_weight.copy_(module_to_cpu.weight.data, non_blocking=True) - - stream.synchronize() - - # cpu to cuda - for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view, offloaded_weight in weight_swap_jobs: - 
cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) - module_to_cuda.weight.data = cuda_data_view - - # offload to cpu - for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view, offloaded_weight in weight_swap_jobs: - module_to_cpu.weight.data = offloaded_weight - offloaded_weight = cpu_data_view - module_to_cpu.offloaded_weight = offloaded_weight - module_to_cuda.offloaded_weight = offloaded_weight - - stream.synchronize() - - torch.cuda.current_stream().synchronize() # this prevents the illegal loss value - - -def swap_weight_devices_safe_works(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): - assert layer_to_cpu.__class__ == layer_to_cuda.__class__ - - weight_swap_jobs = [] - for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): - if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: - if not (hasattr(module_to_cpu, "__cached_cpu_weight") or hasattr(module_to_cuda, "__cached_cuda_weight")): - # one of the modules must have the tensor to cache - module_to_cpu.__cached_cpu_weight = torch.zeros_like(module_to_cpu.weight.data, device="cpu") - module_to_cpu.__cached_cpu_weight.pin_memory() - - weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) - - for module_to_cpu, module_to_cuda, cuda_tensor_view, cpu_tensor_view in weight_swap_jobs: - module_to_cpu.weight.data = cuda_tensor_view.to("cpu", non_blocking=True) - module_to_cuda.weight.data = cpu_tensor_view.to("cuda", non_blocking=True) - - torch.cuda.current_stream().synchronize() # wait for the copy from cache to cpu to finish - torch.cuda.empty_cache() - - -# def swap_weight_devices(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): -# assert layer_to_cpu.__class__ == layer_to_cuda.__class__ -# for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): -# if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: -# assert module_to_cuda.weight.device.type == "cpu" and module_to_cpu.weight.device.type == "cuda" -# weight_on_cuda = module_to_cpu.weight -# weight_on_cpu = module_to_cuda.weight -# cuda_to_cpu_data = weight_on_cuda.data.to("cpu", non_blocking=True) -# event = torch.cuda.current_stream().record_event() -# event.synchronize() -# weight_on_cuda.data.copy_(weight_on_cpu.data, non_blocking=True) -# weight_on_cpu.data = cuda_to_cpu_data -# weight_on_cpu.grad, weight_on_cuda.grad = weight_on_cuda.grad, weight_on_cpu.grad - -# module_to_cpu.weight = weight_on_cpu -# module_to_cuda.weight = weight_on_cuda - - def weighs_to_device(layer: nn.Module, device: torch.device): for module in layer.modules(): if hasattr(module, "weight") and module.weight is not None: From 43849030cf35a7c854311e0bee9cb8a92b77dd83 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 6 Nov 2024 21:33:28 +0900 Subject: [PATCH 219/582] Fix to work without latent cache #1758 --- sd3_train.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sd3_train.py b/sd3_train.py index e03d1708b..b8a0d04fa 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -885,7 +885,9 @@ def optimizer_hook(parameter: torch.Tensor): else: with torch.no_grad(): # encode images to latents. 
images are [-1, 1] - latents = vae.encode(batch["images"]) + latents = vae.encode(batch["images"].to(vae.device, dtype=vae.dtype)).to( + accelerator.device, dtype=weight_dtype + ) # NaNが含まれていれば警告を表示し0に置き換える if torch.any(torch.isnan(latents)): @@ -927,7 +929,7 @@ def optimizer_hook(parameter: torch.Tensor): if t5_out is None: _, _, input_ids_t5xxl, _, _, t5_attn_mask = batch["input_ids_list"] with torch.set_grad_enabled(train_t5xxl): - input_ids_t5xxl = input_ids_t5xxl.to("cpu") if t5_out is None else None + input_ids_t5xxl = input_ids_t5xxl.to("cpu") _, t5_out, _, _, _, t5_attn_mask = text_encoding_strategy.encode_tokens( sd3_tokenize_strategy, [None, None, t5xxl], [None, None, input_ids_t5xxl, None, None, t5_attn_mask] ) From 40ed54bfc0ca666c45a4a5d4b7a3064612371005 Mon Sep 17 00:00:00 2001 From: Dango233 Date: Thu, 7 Nov 2024 09:53:54 +0000 Subject: [PATCH 220/582] Simplify Timestep weighting * Remove diffusers dependency in ts & sigma calc * support Shift setting * Add uniform distribution * Default to Uniform distribution and shift 1 --- library/sd3_train_utils.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 69878750e..bfe752d5e 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -253,12 +253,12 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): " / 複数解像度学習時に解像度ごとに位置埋め込みをスケーリングする。SD3.5M以外では予期しない動作になります", ) - # copy from Diffusers + # Dependencies of Diffusers noise sampler has been removed for clearity. parser.add_argument( "--weighting_scheme", type=str, - default="logit_normal", - choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"], + default="uniform", + choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "uniform"], help="weighting scheme for timestep distribution and loss / タイムステップ分布と損失のための重み付けスキーム", ) parser.add_argument( @@ -279,8 +279,13 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): default=1.29, help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`. / モード重み付けスキームのスケール。`'mode'`を`weighting_scheme`として使用する場合のみ有効", ) - - + parser.add_argument( + "--training_shift", + type=float, + default=1.0, + help="Discrete flow shift for training timestep distribution adjustment, applied in addition to the weighting scheme, default is 1.0. /タイムステップ分布のための離散フローシフト、重み付けスキームの上に適用される、デフォルトは1.0。", + ) + def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCaching: bool = True): assert not args.v2, "v2 cannot be enabled in SDXL training / SDXL学習ではv2を有効にすることはできません" if args.v_parameterization: @@ -965,14 +970,20 @@ def get_noisy_model_input_and_timesteps( logit_std=args.logit_std, mode_scale=args.mode_scale, ) - indices = (u * noise_scheduler.config.num_train_timesteps).long() - timesteps = noise_scheduler.timesteps[indices].to(device=device) + t_min = args.min_timestep if args.min_timestep is not None else 0 + t_max = args.max_timestep if args.max_timestep is not None else 1000 + shift = args.training_shift + + # weighting shift, value >1 will shift distribution to noisy side (focus more on overall structure), value <1 will shift towards less-noisy side (focus more on details) + u = (u * shift) / (1 + (shift - 1) * u) - # Add noise according to flow matching. 
- sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) + indices = (u * (t_max-t_min) + t_min).long() + timesteps = indices.to(device=device, dtype=dtype) + + # sigmas according to dlowmatching + sigmas = timesteps / 1000 + sigmas = sigmas.view(-1,1,1,1) noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents return noisy_model_input, timesteps, sigmas - -# endregion From e54462a4a9cb3d01c5635f8c191d28cbccfba6e0 Mon Sep 17 00:00:00 2001 From: Dango233 Date: Thu, 7 Nov 2024 09:54:12 +0000 Subject: [PATCH 221/582] Fix SD3 trained lora loading and merging --- networks/lora_sd3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/networks/lora_sd3.py b/networks/lora_sd3.py index efe202451..ce6d1a16f 100644 --- a/networks/lora_sd3.py +++ b/networks/lora_sd3.py @@ -601,7 +601,7 @@ def merge_to(self, text_encoders, mmdit, weights_sd, dtype=None, device=None): or key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_T5) ): apply_text_encoder = True - elif key.startswith(LoRANetwork.LORA_PREFIX_MMDIT): + elif key.startswith(LoRANetwork.LORA_PREFIX_SD3): apply_unet = True if apply_text_encoder: From bafd10d558bf318ccd7059c2b4dce2775b5758da Mon Sep 17 00:00:00 2001 From: Dango233 Date: Thu, 7 Nov 2024 18:21:04 +0800 Subject: [PATCH 222/582] Fix typo --- library/sd3_train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index bfe752d5e..afbe34cf5 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -980,7 +980,7 @@ def get_noisy_model_input_and_timesteps( indices = (u * (t_max-t_min) + t_min).long() timesteps = indices.to(device=device, dtype=dtype) - # sigmas according to dlowmatching + # sigmas according to flowmatching sigmas = timesteps / 1000 sigmas = sigmas.view(-1,1,1,1) noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents From 5e86323f12178605c0b99bc914b4bd970900ce75 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 7 Nov 2024 21:27:12 +0900 Subject: [PATCH 223/582] Update README and clean-up the code for SD3 timesteps --- README.md | 13 ++++++++++++- library/config_util.py | 2 +- library/sd3_models.py | 2 +- library/sd3_train_utils.py | 17 +++++++++-------- sd3_train.py | 8 ++++---- sd3_train_network.py | 7 +++---- 6 files changed, 30 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index fb087c234..dba76a3c5 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,13 @@ The command to install PyTorch is as follows: ### Recent Updates +Nov 7, 2024: + +- The distribution of timesteps during SD3/3.5 training has been adjusted. This applies to both fine-tuning and LoRA training. PR [#1768](https://github.com/kohya-ss/sd-scripts/pull/1768) Thanks to Dango233! + - Previously, the side closer to noise was more sampled, but now it is uniform by default. This may improve the problem of difficulty in learning details. + - Specifically, the problem of double shifting has been fixed. The default for `--weighting_scheme` has been changed to `uniform` (the previous default was `logit_normal`). + - A new option `--training_shift` has been added. The default is 1.0, and all timesteps are sampled uniformly. If less than 1.0, the side closer to the image is more sampled, and if more than 1.0, the side closer to noise is more sampled. + Oct 31, 2024: - Added support for SD3.5L/M training. See [SD3 training](#sd3-training) for details. @@ -641,6 +648,7 @@ Here are the arguments. 
The arguments and sample settings are still experimental - `--clip_l_dropout_rate`, `--clip_g_dropout_rate` and `--t5_dropout_rate` are the dropout rates for the embeddings of CLIP-L, CLIP-G, and T5XXL, described in [SAI research papre](http://arxiv.org/pdf/2403.03206). The default is 0.0. For LoRA training, it is seems to be better to set 0.0. - `--pos_emb_random_crop_rate` is the rate of random cropping of positional embeddings, described in [SD3.5M model card](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). The default is 0. It is seems to be better to set 0.0 for LoRA training. - `--enable_scaled_pos_embed` is to enable the scaled positional embeddings. The default is False. This option is an experimental feature for SD3.5M. Details are described below. +- `--training_shift` is the shift value for the training distribution of timesteps. The default is 1.0 (uniform distribution, no shift). If less than 1.0, the side closer to the image is more sampled, and if more than 1.0, the side closer to noise is more sampled. Other options are described below. @@ -681,7 +689,10 @@ Other options are described below. - Same as FLUX.1 for data preparation. - If you train with multiple resolutions, you can enable the scaled positional embeddings with `--enable_scaled_pos_embed`. The default is False. __This option is an experimental feature.__ - +6. Weighting scheme and training shift: + - The weighting scheme is described in the section 3.1 of the [SD3 paper](https://arxiv.org/abs/2403.03206v1). + - The uniform distribution is the default. If you want to change the distribution, see `--help` for options. + - `--training_shift` is the shift value for the training distribution of timesteps. Technical details of multi-resolution training for SD3.5M: diff --git a/library/config_util.py b/library/config_util.py index fc1fbf46d..12d0be173 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -526,7 +526,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu secondary_separator: {subset.secondary_separator} enable_wildcard: {subset.enable_wildcard} caption_dropout_rate: {subset.caption_dropout_rate} - caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} + caption_dropout_every_n_epochs: {subset.caption_dropout_every_n_epochs} caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} caption_prefix: {subset.caption_prefix} caption_suffix: {subset.caption_suffix} diff --git a/library/sd3_models.py b/library/sd3_models.py index b09a57dbd..89225fe4d 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -871,7 +871,7 @@ def enable_scaled_pos_embed(self, use_scaled_pos_embed: bool, latent_sizes: Opti # remove pos_embed to free up memory up to 0.4 GB self.pos_embed = None - # remove duplcates and sort latent sizes in ascending order + # remove duplicates and sort latent sizes in ascending order latent_sizes = list(set(latent_sizes)) latent_sizes = sorted(latent_sizes) diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index afbe34cf5..38f3c25f4 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -253,7 +253,7 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): " / 複数解像度学習時に解像度ごとに位置埋め込みをスケーリングする。SD3.5M以外では予期しない動作になります", ) - # Dependencies of Diffusers noise sampler has been removed for clearity. + # Dependencies of Diffusers noise sampler has been removed for clarity. 
parser.add_argument( "--weighting_scheme", type=str, @@ -285,7 +285,8 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): default=1.0, help="Discrete flow shift for training timestep distribution adjustment, applied in addition to the weighting scheme, default is 1.0. /タイムステップ分布のための離散フローシフト、重み付けスキームの上に適用される、デフォルトは1.0。", ) - + + def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCaching: bool = True): assert not args.v2, "v2 cannot be enabled in SDXL training / SDXL学習ではv2を有効にすることはできません" if args.v_parameterization: @@ -956,9 +957,10 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): return weighting -def get_noisy_model_input_and_timesteps( - args, noise_scheduler, latents, noise, device, dtype -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +# endregion + + +def get_noisy_model_input_and_timesteps(args, latents, noise, device, dtype) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: bsz = latents.shape[0] # Sample a random timestep for each image @@ -977,13 +979,12 @@ def get_noisy_model_input_and_timesteps( # weighting shift, value >1 will shift distribution to noisy side (focus more on overall structure), value <1 will shift towards less-noisy side (focus more on details) u = (u * shift) / (1 + (shift - 1) * u) - indices = (u * (t_max-t_min) + t_min).long() + indices = (u * (t_max - t_min) + t_min).long() timesteps = indices.to(device=device, dtype=dtype) # sigmas according to flowmatching sigmas = timesteps / 1000 - sigmas = sigmas.view(-1,1,1,1) + sigmas = sigmas.view(-1, 1, 1, 1) noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents return noisy_model_input, timesteps, sigmas - diff --git a/sd3_train.py b/sd3_train.py index b8a0d04fa..24ecbfb7d 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -811,8 +811,8 @@ def optimizer_hook(parameter: torch.Tensor): progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") global_step = 0 - noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) - noise_scheduler_copy = copy.deepcopy(noise_scheduler) + # noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) + # noise_scheduler_copy = copy.deepcopy(noise_scheduler) if accelerator.is_main_process: init_kwargs = {} @@ -940,11 +940,11 @@ def optimizer_hook(parameter: torch.Tensor): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) - bsz = latents.shape[0] + # bsz = latents.shape[0] # get noisy model input and timesteps noisy_model_input, timesteps, sigmas = sd3_train_utils.get_noisy_model_input_and_timesteps( - args, noise_scheduler_copy, latents, noise, accelerator.device, weight_dtype + args, latents, noise, accelerator.device, weight_dtype ) # debug: NaN check for all inputs diff --git a/sd3_train_network.py b/sd3_train_network.py index 0739e094d..bb02c7ac7 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -275,9 +275,8 @@ def sample_images(self, accelerator, args, epoch, global_step, device, vae, toke ) def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: - # shift 3.0 is the default value - noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) - self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) + # this scheduler is not used in training, but used to get num_train_timesteps etc. 
+ noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.training_shift) return noise_scheduler def encode_images_to_latents(self, args, accelerator, vae, images): @@ -304,7 +303,7 @@ def get_noise_pred_and_target( # get noisy model input and timesteps noisy_model_input, timesteps, sigmas = sd3_train_utils.get_noisy_model_input_and_timesteps( - args, self.noise_scheduler_copy, latents, noise, accelerator.device, weight_dtype + args, latents, noise, accelerator.device, weight_dtype ) # ensure the hidden state will require grad From f264f4091f734b4e4011257b8571ef97315a1343 Mon Sep 17 00:00:00 2001 From: "Kohya S." <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 7 Nov 2024 21:30:31 +0900 Subject: [PATCH 224/582] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dba76a3c5..9273fc8fb 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ Nov 7, 2024: - Previously, the side closer to noise was more sampled, but now it is uniform by default. This may improve the problem of difficulty in learning details. - Specifically, the problem of double shifting has been fixed. The default for `--weighting_scheme` has been changed to `uniform` (the previous default was `logit_normal`). - A new option `--training_shift` has been added. The default is 1.0, and all timesteps are sampled uniformly. If less than 1.0, the side closer to the image is more sampled, and if more than 1.0, the side closer to noise is more sampled. + - The effect of a shift in uniform distribution is shown in the figure below. + - ![Figure_1](https://github.com/user-attachments/assets/99a72c67-adfb-4440-81d4-a718985ff350) Oct 31, 2024: @@ -693,7 +695,8 @@ Other options are described below. - The weighting scheme is described in the section 3.1 of the [SD3 paper](https://arxiv.org/abs/2403.03206v1). - The uniform distribution is the default. If you want to change the distribution, see `--help` for options. - `--training_shift` is the shift value for the training distribution of timesteps. - + - The effect of a shift in uniform distribution is shown in the figure below. + - ![Figure_1](https://github.com/user-attachments/assets/99a72c67-adfb-4440-81d4-a718985ff350) Technical details of multi-resolution training for SD3.5M: From 5eb6d209d5b28d43bf611e0934297703eb041d07 Mon Sep 17 00:00:00 2001 From: Dango233 Date: Thu, 7 Nov 2024 20:33:31 +0800 Subject: [PATCH 225/582] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9273fc8fb..fe7c506cb 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Nov 7, 2024: - The distribution of timesteps during SD3/3.5 training has been adjusted. This applies to both fine-tuning and LoRA training. PR [#1768](https://github.com/kohya-ss/sd-scripts/pull/1768) Thanks to Dango233! - Previously, the side closer to noise was more sampled, but now it is uniform by default. This may improve the problem of difficulty in learning details. - Specifically, the problem of double shifting has been fixed. The default for `--weighting_scheme` has been changed to `uniform` (the previous default was `logit_normal`). - - A new option `--training_shift` has been added. The default is 1.0, and all timesteps are sampled uniformly. If less than 1.0, the side closer to the image is more sampled, and if more than 1.0, the side closer to noise is more sampled. + - A new option `--training_shift` has been added. 
The default is 1.0, and all timesteps are sampled uniformly. If less than 1.0, the side closer to the image is more sampled (training more on image details), and if more than 1.0, the side closer to noise is more sampled (training more on overall structure). - The effect of a shift in uniform distribution is shown in the figure below. - ![Figure_1](https://github.com/user-attachments/assets/99a72c67-adfb-4440-81d4-a718985ff350) From 186aa5b97d43700706bd8e986e2d5ac3f5d4c9b7 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 7 Nov 2024 22:16:05 +0900 Subject: [PATCH 226/582] fix illeagal block is swapped #1764 --- library/flux_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/flux_models.py b/library/flux_models.py index 48dea4fc9..4721fa02e 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1077,7 +1077,7 @@ def forward( def submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda): def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): - start_time = time.perf_counter() + # start_time = time.perf_counter() # print(f"Moving {bidx_to_cpu} to cpu and {bidx_to_cuda} to cuda.") utils.swap_weight_devices(block_to_cpu, block_to_cuda) # print(f"Block move done. {bidx_to_cpu} to cpu, {bidx_to_cuda} to cuda.") @@ -1123,7 +1123,7 @@ def wait_for_blocks_move(block_idx, ftrs): if block_idx < self.single_blocks_to_swap: block_idx_to_cpu = block_idx - block_idx_to_cuda = self.num_single_blocks - self.blocks_to_swap + block_idx + block_idx_to_cuda = self.num_single_blocks - self.single_blocks_to_swap + block_idx future = submit_move_blocks(self.single_blocks, block_idx_to_cpu, block_idx_to_cuda) single_futures[block_idx_to_cuda] = future From b3248a8eefe066e6502b535a19501363ec352974 Mon Sep 17 00:00:00 2001 From: feffy380 <114889020+feffy380@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:31:05 +0100 Subject: [PATCH 227/582] fix: sort order when getting image size from cache file --- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 18d3cf6c2..8b5cf214e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1887,7 +1887,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # make image path to npz path mapping npz_paths = glob.glob(os.path.join(subset.image_dir, "*" + strategy.cache_suffix)) - npz_paths.sort() + npz_paths.sort(key=lambda item: item.rsplit("_", maxsplit=2)[0]) # sort by name excluding resolution and cache_suffix npz_path_index = 0 size_set_count = 0 From 8fac3c3b088699f607392694beee76bc0036c8d9 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 9 Nov 2024 19:56:02 +0900 Subject: [PATCH 228/582] update README --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 87c810012..14328607e 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Nov 9, 2024: + +- Fixed an issue where the image size could not be obtained when caching latents was enabled and a specific file name existed, causing the latent size to be incorrect. See PR [#1770](https://github.com/kohya-ss/sd-scripts/pull/1770) for details. Thanks to feffy380! + Nov 7, 2024: - The distribution of timesteps during SD3/3.5 training has been adjusted. This applies to both fine-tuning and LoRA training. PR [#1768](https://github.com/kohya-ss/sd-scripts/pull/1768) Thanks to Dango233! 
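A minimal sketch of the timestep shift described above (illustrative only, not part of any patch): the helper name and batch size are arbitrary, but the `u = (u * shift) / (1 + (shift - 1) * u)` mapping, the index scaling, and `sigmas = timesteps / 1000` follow the code added to `get_noisy_model_input_and_timesteps` in `library/sd3_train_utils.py`, assuming the default `uniform` weighting scheme and the default timestep range 0-1000.

```python
# Illustrative sketch of the --training_shift remapping (not part of the patches).
import torch

def sample_shifted_timesteps(batch_size: int, shift: float = 1.0, t_min: int = 0, t_max: int = 1000):
    u = torch.rand(batch_size)               # uniform weighting scheme
    u = (u * shift) / (1 + (shift - 1) * u)  # shift > 1 favors the noisy side, shift < 1 the image side
    timesteps = (u * (t_max - t_min) + t_min).long()
    sigmas = timesteps / 1000                # flow-matching sigma; noisy input = sigma * noise + (1 - sigma) * latents
    return timesteps, sigmas

for shift in (0.5, 1.0, 3.0):
    t, _ = sample_shifted_timesteps(10_000, shift)
    print(f"shift={shift}: mean timestep {t.float().mean():.1f}")
```

With `shift=1.0` the mean stays near 500 (uniform sampling); larger values push the mean toward 1000 (more noise, overall structure), smaller values toward 0 (less noise, fine details), matching the README description above.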
From 26bd4540a6cc7e62100f4901507d8fa0c5a7f78b Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Mon, 11 Nov 2024 09:25:28 +0800 Subject: [PATCH 229/582] init --- library/train_util.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 8b5cf214e..7f396d36e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1405,11 +1405,11 @@ def get_image_size(self, image_path): image_size = imagesize.get(image_path) if image_size[0] <= 0: # imagesize doesn't work for some images, so use cv2 - img = cv2.imread(image_path) - if img is not None: - image_size = (img.shape[1], img.shape[0]) - else: - logger.warning(f"failed to get image size: {image_path}") + try: + with Image.open(image_path) as img: + image_size = img.size + except Exception as e: + logger.warning(f"failed to get image size: {image_path}, error: {e}") image_size = (0, 0) return image_size From 02bd76e6c719ad85c108a177405846c5c958bd78 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 11 Nov 2024 21:15:36 +0900 Subject: [PATCH 230/582] Refactor block swapping to utilize custom offloading utilities --- flux_train.py | 228 ++++++++--------------------- library/custom_offloading_utils.py | 216 +++++++++++++++++++++++++++ library/flux_models.py | 113 ++------------ 3 files changed, 295 insertions(+), 262 deletions(-) create mode 100644 library/custom_offloading_utils.py diff --git a/flux_train.py b/flux_train.py index afddc897f..02dede45e 100644 --- a/flux_train.py +++ b/flux_train.py @@ -295,7 +295,7 @@ def train(args): # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. # This idea is based on 2kpr's great work. Thank you! logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") - flux.enable_block_swap(args.blocks_to_swap) + flux.enable_block_swap(args.blocks_to_swap, accelerator.device) if not cache_latents: # load VAE here if not cached @@ -338,15 +338,15 @@ def train(args): # determine target layer and block index for each parameter block_type = "other" # double, single or other if np[0].startswith("double_blocks"): - block_idx = int(np[0].split(".")[1]) + block_index = int(np[0].split(".")[1]) block_type = "double" elif np[0].startswith("single_blocks"): - block_idx = int(np[0].split(".")[1]) + block_index = int(np[0].split(".")[1]) block_type = "single" else: - block_idx = -1 + block_index = -1 - param_group_key = (block_type, block_idx) + param_group_key = (block_type, block_index) if param_group_key not in param_group: param_group[param_group_key] = [] param_group[param_group_key].append(p) @@ -466,123 +466,21 @@ def train(args): # resumeする train_util.resume_from_local_or_hf_if_specified(accelerator, args) - # memory efficient block swapping - - def submit_move_blocks(futures, thread_pool, block_idx_to_cpu, block_idx_to_cuda, blocks, block_id): - def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): - # start_time = time.perf_counter() - # print(f"Backward: Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to CUDA") - utils.swap_weight_devices(block_to_cpu, block_to_cuda) - # print(f"Backward: Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter()-start_time:.2f}s") - return bidx_to_cpu, bidx_to_cuda # , event - - block_to_cpu = blocks[block_idx_to_cpu] - block_to_cuda = blocks[block_idx_to_cuda] - - futures[block_id] = thread_pool.submit(move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda) - - def 
wait_blocks_move(block_id, futures): - if block_id not in futures: - return - # print(f"Backward: Wait for block {block_id}") - # start_time = time.perf_counter() - future = futures.pop(block_id) - _, bidx_to_cuda = future.result() - assert block_id[1] == bidx_to_cuda, f"Block index mismatch: {block_id[1]} != {bidx_to_cuda}" - # print(f"Backward: Waited for block {block_id}: {time.perf_counter()-start_time:.2f}s") - # print(f"Backward: Synchronized: {time.perf_counter()-start_time:.2f}s") - if args.fused_backward_pass: # use fused optimizer for backward pass: other optimizers will be supported in the future import library.adafactor_fused library.adafactor_fused.patch_adafactor_fused(optimizer) - double_blocks_to_swap = args.blocks_to_swap // 2 - single_blocks_to_swap = (args.blocks_to_swap - double_blocks_to_swap) * 2 - num_double_blocks = len(accelerator.unwrap_model(flux).double_blocks) - num_single_blocks = len(accelerator.unwrap_model(flux).single_blocks) - handled_block_ids = set() - - n = 1 # only asynchronous purpose, no need to increase this number - # n = 2 - # n = max(1, os.cpu_count() // 2) - thread_pool = ThreadPoolExecutor(max_workers=n) - futures = {} - for param_group, param_name_group in zip(optimizer.param_groups, param_names): for parameter, param_name in zip(param_group["params"], param_name_group): if parameter.requires_grad: - grad_hook = None - - if double_blocks_to_swap > 0 or single_blocks_to_swap > 0: - is_double = param_name.startswith("double_blocks") - is_single = param_name.startswith("single_blocks") - if is_double and double_blocks_to_swap > 0 or is_single and single_blocks_to_swap > 0: - block_idx = int(param_name.split(".")[1]) - block_id = (is_double, block_idx) # double or single, block index - if block_id not in handled_block_ids: - # swap following (already backpropagated) block - handled_block_ids.add(block_id) - - # if n blocks were already backpropagated - if is_double: - num_blocks = num_double_blocks - blocks_to_swap = double_blocks_to_swap - else: - num_blocks = num_single_blocks - blocks_to_swap = single_blocks_to_swap - - # -1 for 0-based index, -1 for current block is not fully backpropagated yet - num_blocks_propagated = num_blocks - block_idx - 2 - swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap - waiting = block_idx > 0 and block_idx <= blocks_to_swap - - if swapping or waiting: - block_idx_to_cpu = num_blocks - num_blocks_propagated - block_idx_to_cuda = blocks_to_swap - num_blocks_propagated - block_idx_to_wait = block_idx - 1 - - # create swap hook - def create_swap_grad_hook( - is_dbl, bidx_to_cpu, bidx_to_cuda, bidx_to_wait, swpng: bool, wtng: bool - ): - def __grad_hook(tensor: torch.Tensor): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None - - # print( - # f"Backward: Block {is_dbl}, {bidx_to_cpu}, {bidx_to_cuda}, {bidx_to_wait}, {swpng}, {wtng}" - # ) - if swpng: - submit_move_blocks( - futures, - thread_pool, - bidx_to_cpu, - bidx_to_cuda, - flux.double_blocks if is_dbl else flux.single_blocks, - (is_dbl, bidx_to_cuda), # wait for this block - ) - if wtng: - wait_blocks_move((is_dbl, bidx_to_wait), futures) - - return __grad_hook - - grad_hook = create_swap_grad_hook( - is_double, block_idx_to_cpu, block_idx_to_cuda, block_idx_to_wait, swapping, waiting - ) - - if grad_hook is None: - - def __grad_hook(tensor: torch.Tensor, param_group=param_group): - if 
accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None - grad_hook = __grad_hook + def grad_hook(tensor: torch.Tensor, param_group=param_group): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None parameter.register_post_accumulate_grad_hook(grad_hook) @@ -601,66 +499,66 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): num_parameters_per_group = [0] * len(optimizers) parameter_optimizer_map = {} - blocks_to_swap = args.blocks_to_swap - num_double_blocks = len(accelerator.unwrap_model(flux).double_blocks) - num_single_blocks = len(accelerator.unwrap_model(flux).single_blocks) - num_block_units = num_double_blocks + num_single_blocks // 2 - - n = 1 # only asynchronous purpose, no need to increase this number - # n = max(1, os.cpu_count() // 2) - thread_pool = ThreadPoolExecutor(max_workers=n) - futures = {} - for opt_idx, optimizer in enumerate(optimizers): for param_group in optimizer.param_groups: for parameter in param_group["params"]: if parameter.requires_grad: - block_type, block_idx = block_types_and_indices[opt_idx] - - def create_optimizer_hook(btype, bidx): - def optimizer_hook(parameter: torch.Tensor): - # print(f"optimizer_hook: {btype}, {bidx}") - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(parameter, args.max_grad_norm) - - i = parameter_optimizer_map[parameter] - optimizer_hooked_count[i] += 1 - if optimizer_hooked_count[i] == num_parameters_per_group[i]: - optimizers[i].step() - optimizers[i].zero_grad(set_to_none=True) - - # swap blocks if necessary - if blocks_to_swap and (btype == "double" or (btype == "single" and bidx % 2 == 0)): - unit_idx = bidx if btype == "double" else num_double_blocks + bidx // 2 - num_blocks_propagated = num_block_units - unit_idx - - swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap - waiting = unit_idx > 0 and unit_idx <= blocks_to_swap - - if swapping: - block_idx_to_cpu = num_block_units - num_blocks_propagated - block_idx_to_cuda = blocks_to_swap - num_blocks_propagated - # print(f"Backward: Swap blocks {block_idx_to_cpu} and {block_idx_to_cuda}") - submit_move_blocks( - futures, - thread_pool, - block_idx_to_cpu, - block_idx_to_cuda, - flux.double_blocks, - flux.single_blocks, - accelerator.device, - ) - - if waiting: - block_idx_to_wait = unit_idx - 1 - wait_blocks_move(block_idx_to_wait, futures) - - return optimizer_hook - - parameter.register_post_accumulate_grad_hook(create_optimizer_hook(block_type, block_idx)) + + def grad_hook(parameter: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(parameter, args.max_grad_norm) + + i = parameter_optimizer_map[parameter] + optimizer_hooked_count[i] += 1 + if optimizer_hooked_count[i] == num_parameters_per_group[i]: + optimizers[i].step() + optimizers[i].zero_grad(set_to_none=True) + + parameter.register_post_accumulate_grad_hook(grad_hook) parameter_optimizer_map[parameter] = opt_idx num_parameters_per_group[opt_idx] += 1 + # add hooks for block swapping: this hook is called after fused_backward_pass hook or blockwise_fused_optimizers hook + if is_swapping_blocks: + import library.custom_offloading_utils as custom_offloading_utils + + num_double_blocks = 
len(accelerator.unwrap_model(flux).double_blocks) + num_single_blocks = len(accelerator.unwrap_model(flux).single_blocks) + double_blocks_to_swap = args.blocks_to_swap // 2 + single_blocks_to_swap = (args.blocks_to_swap - double_blocks_to_swap) * 2 + + offloader_double = custom_offloading_utils.TrainOffloader(num_double_blocks, double_blocks_to_swap, accelerator.device) + offloader_single = custom_offloading_utils.TrainOffloader(num_single_blocks, single_blocks_to_swap, accelerator.device) + + param_name_pairs = [] + if not args.blockwise_fused_optimizers: + for param_group, param_name_group in zip(optimizer.param_groups, param_names): + param_name_pairs.extend(zip(param_group["params"], param_name_group)) + else: + # named_parameters is a list of (name, parameter) pairs + param_name_pairs.extend([(p, n) for n, p in flux.named_parameters()]) + + for parameter, param_name in param_name_pairs: + if not parameter.requires_grad: + continue + + is_double = param_name.startswith("double_blocks") + is_single = param_name.startswith("single_blocks") + if not is_double and not is_single: + continue + + block_index = int(param_name.split(".")[1]) + if is_double: + blocks = flux.double_blocks + offloader = offloader_double + else: + blocks = flux.single_blocks + offloader = offloader_single + + grad_hook = offloader.create_grad_hook(blocks, block_index) + if grad_hook is not None: + parameter.register_post_accumulate_grad_hook(grad_hook) + # epoch数を計算する num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) diff --git a/library/custom_offloading_utils.py b/library/custom_offloading_utils.py new file mode 100644 index 000000000..33a413004 --- /dev/null +++ b/library/custom_offloading_utils.py @@ -0,0 +1,216 @@ +from concurrent.futures import ThreadPoolExecutor +import time +from typing import Optional +import torch +import torch.nn as nn + +from library.device_utils import clean_memory_on_device + + +def synchronize_device(device: torch.device): + if device.type == "cuda": + torch.cuda.synchronize() + elif device.type == "xpu": + torch.xpu.synchronize() + elif device.type == "mps": + torch.mps.synchronize() + + +def swap_weight_devices(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): + assert layer_to_cpu.__class__ == layer_to_cuda.__class__ + + weight_swap_jobs = [] + for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): + if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: + weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + + torch.cuda.current_stream().synchronize() # this prevents the illegal loss value + + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + # cuda to cpu + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: + cuda_data_view.record_stream(stream) + module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True) + + stream.synchronize() + + # cpu to cuda + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: + cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) + module_to_cuda.weight.data = cuda_data_view + + stream.synchronize() + torch.cuda.current_stream().synchronize() # this prevents the illegal loss value + + +def swap_weight_devices_no_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): + """ + 
not tested + """ + assert layer_to_cpu.__class__ == layer_to_cuda.__class__ + + weight_swap_jobs = [] + for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): + if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: + weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + + # device to cpu + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: + module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True) + + synchronize_device() + + # cpu to device + for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: + cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) + module_to_cuda.weight.data = cuda_data_view + + synchronize_device() + + +def weighs_to_device(layer: nn.Module, device: torch.device): + for module in layer.modules(): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data = module.weight.data.to(device, non_blocking=True) + + +class Offloader: + """ + common offloading class + """ + + def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): + self.num_blocks = num_blocks + self.blocks_to_swap = blocks_to_swap + self.device = device + self.debug = debug + + self.thread_pool = ThreadPoolExecutor(max_workers=1) + self.futures = {} + self.cuda_available = device.type == "cuda" + + def swap_weight_devices(self, block_to_cpu: nn.Module, block_to_cuda: nn.Module): + if self.cuda_available: + swap_weight_devices(block_to_cpu, block_to_cuda) + else: + swap_weight_devices_no_cuda(self.device, block_to_cpu, block_to_cuda) + + def _submit_move_blocks(self, blocks, block_idx_to_cpu, block_idx_to_cuda): + def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): + if self.debug: + start_time = time.perf_counter() + print(f"Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to {'CUDA' if self.cuda_available else 'device'}") + + self.swap_weight_devices(block_to_cpu, block_to_cuda) + + if self.debug: + print(f"Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter()-start_time:.2f}s") + return bidx_to_cpu, bidx_to_cuda # , event + + block_to_cpu = blocks[block_idx_to_cpu] + block_to_cuda = blocks[block_idx_to_cuda] + + self.futures[block_idx_to_cuda] = self.thread_pool.submit( + move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda + ) + + def _wait_blocks_move(self, block_idx): + if block_idx not in self.futures: + return + + if self.debug: + print(f"Wait for block {block_idx}") + start_time = time.perf_counter() + + future = self.futures.pop(block_idx) + _, bidx_to_cuda = future.result() + + assert block_idx == bidx_to_cuda, f"Block index mismatch: {block_idx} != {bidx_to_cuda}" + + if self.debug: + print(f"Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s") + + +class TrainOffloader(Offloader): + """ + supports backward offloading + """ + + def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): + super().__init__(num_blocks, blocks_to_swap, device, debug) + self.hook_added = set() + + def create_grad_hook(self, blocks: list[nn.Module], block_index: int) -> Optional[callable]: + if block_index in self.hook_added: + return None + self.hook_added.add(block_index) + + # -1 for 0-based index, -1 for current block is not fully backpropagated yet + num_blocks_propagated = self.num_blocks - block_index - 2 
+ swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap + waiting = block_index > 0 and block_index <= self.blocks_to_swap + + if not swapping and not waiting: + return None + + # create hook + block_idx_to_cpu = self.num_blocks - num_blocks_propagated + block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated + block_idx_to_wait = block_index - 1 + + if self.debug: + print( + f"Backward: Created grad hook for block {block_index} with {block_idx_to_cpu}, {block_idx_to_cuda}, {block_idx_to_wait}" + ) + if swapping: + + def grad_hook(tensor: torch.Tensor): + self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda) + + return grad_hook + + else: + + def grad_hook(tensor: torch.Tensor): + self._wait_blocks_move(block_idx_to_wait) + + return grad_hook + + +class ModelOffloader(Offloader): + """ + supports forward offloading + """ + + def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): + super().__init__(num_blocks, blocks_to_swap, device, debug) + + def prepare_block_devices_before_forward(self, blocks: list[nn.Module]): + if self.blocks_to_swap is None or self.blocks_to_swap == 0: + return + + for b in blocks[0 : self.num_blocks - self.blocks_to_swap]: + b.to(self.device) + weighs_to_device(b, self.device) # make sure weights are on device + + for b in blocks[self.num_blocks - self.blocks_to_swap :]: + b.to(self.device) # move block to device first + weighs_to_device(b, "cpu") # make sure weights are on cpu + + synchronize_device(self.device) + clean_memory_on_device(self.device) + + def wait_for_block(self, block_idx: int): + if self.blocks_to_swap is None or self.blocks_to_swap == 0: + return + self._wait_blocks_move(block_idx) + + def submit_move_blocks(self, blocks: list[nn.Module], block_idx: int): + if self.blocks_to_swap is None or self.blocks_to_swap == 0: + return + if block_idx >= self.blocks_to_swap: + return + block_idx_to_cpu = block_idx + block_idx_to_cuda = self.num_blocks - self.blocks_to_swap + block_idx + self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda) diff --git a/library/flux_models.py b/library/flux_models.py index 4721fa02e..e0bee160f 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -18,6 +18,7 @@ from einops import rearrange from torch import Tensor, nn from torch.utils.checkpoint import checkpoint +from library import custom_offloading_utils # USE_REENTRANT = True @@ -923,7 +924,8 @@ def __init__(self, params: FluxParams): self.cpu_offload_checkpointing = False self.blocks_to_swap = None - self.thread_pool: Optional[ThreadPoolExecutor] = None + self.offloader_double = None + self.offloader_single = None self.num_double_blocks = len(self.double_blocks) self.num_single_blocks = len(self.single_blocks) @@ -963,17 +965,17 @@ def disable_gradient_checkpointing(self): print("FLUX: Gradient checkpointing disabled.") - def enable_block_swap(self, num_blocks: int): + def enable_block_swap(self, num_blocks: int, device: torch.device): self.blocks_to_swap = num_blocks - self.double_blocks_to_swap = num_blocks // 2 - self.single_blocks_to_swap = (num_blocks - self.double_blocks_to_swap) * 2 + double_blocks_to_swap = num_blocks // 2 + single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + + self.offloader_double = custom_offloading_utils.ModelOffloader(self.num_double_blocks, double_blocks_to_swap, device) + self.offloader_single = custom_offloading_utils.ModelOffloader(self.num_single_blocks, single_blocks_to_swap, device) print( 
- f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {self.double_blocks_to_swap}, single blocks: {self.single_blocks_to_swap}." + f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." ) - n = 1 # async block swap. 1 is enough - self.thread_pool = ThreadPoolExecutor(max_workers=n) - def move_to_device_except_swap_blocks(self, device: torch.device): # assume model is on cpu. do not move blocks to device to reduce temporary memory usage if self.blocks_to_swap: @@ -988,56 +990,11 @@ def move_to_device_except_swap_blocks(self, device: torch.device): self.double_blocks = save_double_blocks self.single_blocks = save_single_blocks - # def get_block_unit(self, index: int): - # if index < len(self.double_blocks): - # return (self.double_blocks[index],) - # else: - # index -= len(self.double_blocks) - # index *= 2 - # return self.single_blocks[index], self.single_blocks[index + 1] - - # def get_unit_index(self, is_double: bool, index: int): - # if is_double: - # return index - # else: - # return len(self.double_blocks) + index // 2 - def prepare_block_swap_before_forward(self): - # # make: first n blocks are on cuda, and last n blocks are on cpu - # if self.blocks_to_swap is None or self.blocks_to_swap == 0: - # # raise ValueError("Block swap is not enabled.") - # return - # for i in range(self.num_block_units - self.blocks_to_swap): - # for b in self.get_block_unit(i): - # b.to(self.device) - # for i in range(self.num_block_units - self.blocks_to_swap, self.num_block_units): - # for b in self.get_block_unit(i): - # b.to("cpu") - # clean_memory_on_device(self.device) - - # all blocks are on device, but some weights are on cpu - # make first n blocks weights on device, and last n blocks weights on cpu if self.blocks_to_swap is None or self.blocks_to_swap == 0: - # raise ValueError("Block swap is not enabled.") return - - for b in self.double_blocks[0 : self.num_double_blocks - self.double_blocks_to_swap]: - b.to(self.device) - utils.weighs_to_device(b, self.device) # make sure weights are on device - for b in self.double_blocks[self.num_double_blocks - self.double_blocks_to_swap :]: - b.to(self.device) # move block to device first - utils.weighs_to_device(b, "cpu") # make sure weights are on cpu - torch.cuda.synchronize() - clean_memory_on_device(self.device) - - for b in self.single_blocks[0 : self.num_single_blocks - self.single_blocks_to_swap]: - b.to(self.device) - utils.weighs_to_device(b, self.device) # make sure weights are on device - for b in self.single_blocks[self.num_single_blocks - self.single_blocks_to_swap :]: - b.to(self.device) # move block to device first - utils.weighs_to_device(b, "cpu") # make sure weights are on cpu - torch.cuda.synchronize() - clean_memory_on_device(self.device) + self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) + self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) def forward( self, @@ -1073,59 +1030,21 @@ def forward( for block in self.single_blocks: img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) else: - # device = self.device - - def submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda): - def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): - # start_time = time.perf_counter() - # print(f"Moving {bidx_to_cpu} to cpu and {bidx_to_cuda} to cuda.") - utils.swap_weight_devices(block_to_cpu, block_to_cuda) - # print(f"Block move done. 
{bidx_to_cpu} to cpu, {bidx_to_cuda} to cuda.") - - # print(f"Move blocks took {time.perf_counter() - start_time:.2f} seconds") - return block_idx_to_cpu, block_idx_to_cuda # , event - - block_to_cpu = blocks[block_idx_to_cpu] - block_to_cuda = blocks[block_idx_to_cuda] - # print(f"Submit move blocks. {block_idx_to_cpu} to cpu, {block_idx_to_cuda} to cuda.") - return self.thread_pool.submit(move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda) - - def wait_for_blocks_move(block_idx, ftrs): - if block_idx not in ftrs: - return - # print(f"Waiting for move blocks: {block_idx}") - # start_time = time.perf_counter() - ftr = ftrs.pop(block_idx) - ftr.result() - # print(f"{block_idx} move blocks took {time.perf_counter() - start_time:.2f} seconds") - - double_futures = {} for block_idx, block in enumerate(self.double_blocks): - # print(f"Double block {block_idx}") - wait_for_blocks_move(block_idx, double_futures) + self.offloader_double.wait_for_block(block_idx) img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if block_idx < self.double_blocks_to_swap: - block_idx_to_cpu = block_idx - block_idx_to_cuda = self.num_double_blocks - self.double_blocks_to_swap + block_idx - future = submit_move_blocks(self.double_blocks, block_idx_to_cpu, block_idx_to_cuda) - double_futures[block_idx_to_cuda] = future + self.offloader_double.submit_move_blocks(self.double_blocks, block_idx) img = torch.cat((txt, img), 1) - single_futures = {} for block_idx, block in enumerate(self.single_blocks): - # print(f"Single block {block_idx}") - wait_for_blocks_move(block_idx, single_futures) + self.offloader_single.wait_for_block(block_idx) img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if block_idx < self.single_blocks_to_swap: - block_idx_to_cpu = block_idx - block_idx_to_cuda = self.num_single_blocks - self.single_blocks_to_swap + block_idx - future = submit_move_blocks(self.single_blocks, block_idx_to_cpu, block_idx_to_cuda) - single_futures[block_idx_to_cuda] = future + self.offloader_single.submit_move_blocks(self.single_blocks, block_idx) img = img[:, txt.shape[1] :, ...] 
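The forward pass rewritten in this patch interleaves computation with asynchronous weight transfers: each block first waits for its weights to arrive on the device, runs, and then hands itself back to the offloader to be swapped with a block still parked on the CPU. The following is a minimal, forward-only sketch of that pattern under the assumption that the first `num_blocks - blocks_to_swap` blocks start on the device and the rest on the CPU; `ToyOffloader` and `run_blocks` are illustrative names, not part of this patch.

```python
from concurrent.futures import ThreadPoolExecutor

import torch
import torch.nn as nn


class ToyOffloader:
    def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device):
        self.num_blocks = num_blocks
        self.blocks_to_swap = blocks_to_swap
        self.device = device
        self.futures = {}
        # one worker is enough: at most one swap is in flight per block step
        self.pool = ThreadPoolExecutor(max_workers=1)

    def wait_for_block(self, block_idx: int):
        # block until the weights for block_idx are on the device, if a swap was scheduled
        future = self.futures.pop(block_idx, None)
        if future is not None:
            future.result()

    def submit_move_blocks(self, blocks: nn.ModuleList, block_idx: int):
        # once block_idx has run, send it to CPU and bring in one of the blocks still on CPU
        if block_idx >= self.blocks_to_swap:
            return
        idx_to_cuda = self.num_blocks - self.blocks_to_swap + block_idx

        def move():
            blocks[block_idx].to("cpu")
            blocks[idx_to_cuda].to(self.device)
            if self.device.type == "cuda":
                torch.cuda.synchronize(self.device)

        self.futures[idx_to_cuda] = self.pool.submit(move)


def run_blocks(blocks: nn.ModuleList, x: torch.Tensor, offloader: ToyOffloader) -> torch.Tensor:
    # same interleaving as the rewritten forward pass: wait, compute, then swap asynchronously
    for block_idx, block in enumerate(blocks):
        offloader.wait_for_block(block_idx)
        x = block(x)
        offloader.submit_move_blocks(blocks, block_idx)
    return x
```

Because the move of the next needed block is submitted as soon as the current block finishes, the transfer overlaps with the computation of the blocks in between, which is what keeps the slowdown from swapping moderate.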
From 3fe94b058a039b69b6b178bc086e200e40bfa887 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 12 Nov 2024 08:09:07 +0900 Subject: [PATCH 231/582] update comment --- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 7f396d36e..a5d6fdd21 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1404,7 +1404,7 @@ def get_image_size(self, image_path): # return imagesize.get(image_path) image_size = imagesize.get(image_path) if image_size[0] <= 0: - # imagesize doesn't work for some images, so use cv2 + # imagesize doesn't work for some images, so use PIL as a fallback try: with Image.open(image_path) as img: image_size = img.size From cde90b8903870b6b28dae274d07ed27978055e3c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 12 Nov 2024 08:49:05 +0900 Subject: [PATCH 232/582] feat: implement block swapping for FLUX.1 LoRA (WIP) --- flux_train.py | 2 +- flux_train_network.py | 33 ++++++++++++++++++++++++ library/custom_offloading_utils.py | 40 +++++++++++++++++++++++++++++- library/flux_models.py | 8 ++++-- train_network.py | 9 ++++++- 5 files changed, 87 insertions(+), 5 deletions(-) diff --git a/flux_train.py b/flux_train.py index 02dede45e..346fe8fbd 100644 --- a/flux_train.py +++ b/flux_train.py @@ -519,7 +519,7 @@ def grad_hook(parameter: torch.Tensor): num_parameters_per_group[opt_idx] += 1 # add hooks for block swapping: this hook is called after fused_backward_pass hook or blockwise_fused_optimizers hook - if is_swapping_blocks: + if False: # is_swapping_blocks: import library.custom_offloading_utils as custom_offloading_utils num_double_blocks = len(accelerator.unwrap_model(flux).double_blocks) diff --git a/flux_train_network.py b/flux_train_network.py index 2b71a8979..376cc1597 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -25,6 +25,7 @@ def __init__(self): super().__init__() self.sample_prompts_te_outputs = None self.is_schnell: Optional[bool] = None + self.is_swapping_blocks: bool = False def assert_extra_args(self, args, train_dataset_group): super().assert_extra_args(args, train_dataset_group) @@ -78,6 +79,12 @@ def load_target_model(self, args, weight_dtype, accelerator): if args.split_mode: model = self.prepare_split_model(model, weight_dtype, accelerator) + self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 + if self.is_swapping_blocks: + # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. 
+ logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") + model.enable_block_swap(args.blocks_to_swap, accelerator.device) + clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) clip_l.eval() @@ -285,6 +292,8 @@ def sample_images(self, accelerator, args, epoch, global_step, device, ae, token text_encoders = self.get_models_for_text_encoding(args, accelerator, text_encoders) if not args.split_mode: + if self.is_swapping_blocks: + accelerator.unwrap_model(flux).prepare_block_swap_before_forward() flux_train_utils.sample_images( accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs ) @@ -539,6 +548,19 @@ def forward(hidden_states): text_encoder.to(te_weight_dtype) # fp8 prepare_fp8(text_encoder, weight_dtype) + def prepare_unet_with_accelerator( + self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module + ) -> torch.nn.Module: + if not self.is_swapping_blocks: + return super().prepare_unet_with_accelerator(args, accelerator, unet) + + # if we doesn't swap blocks, we can move the model to device + flux: flux_models.Flux = unet + flux = accelerator.prepare(flux, device_placement=[not self.is_swapping_blocks]) + accelerator.unwrap_model(flux).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage + + return flux + def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() @@ -550,6 +572,17 @@ def setup_parser() -> argparse.ArgumentParser: help="[EXPERIMENTAL] use split mode for Flux model, network arg `train_blocks=single` is required" + "/[実験的] Fluxモデルの分割モードを使用する。ネットワーク引数`train_blocks=single`が必要", ) + + parser.add_argument( + "--blocks_to_swap", + type=int, + default=None, + help="[EXPERIMENTAL] " + "Sets the number of blocks to swap during the forward and backward passes." + "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." 
+ " / 順伝播および逆伝播中にスワップするブロックの数を設定します。" + "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", + ) return parser diff --git a/library/custom_offloading_utils.py b/library/custom_offloading_utils.py index 33a413004..70da93902 100644 --- a/library/custom_offloading_utils.py +++ b/library/custom_offloading_utils.py @@ -183,9 +183,47 @@ class ModelOffloader(Offloader): supports forward offloading """ - def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): + def __init__(self, blocks: list[nn.Module], num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): super().__init__(num_blocks, blocks_to_swap, device, debug) + # register backward hooks + self.remove_handles = [] + for i, block in enumerate(blocks): + hook = self.create_backward_hook(blocks, i) + if hook is not None: + handle = block.register_full_backward_hook(hook) + self.remove_handles.append(handle) + + def __del__(self): + for handle in self.remove_handles: + handle.remove() + + def create_backward_hook(self, blocks: list[nn.Module], block_index: int) -> Optional[callable]: + # -1 for 0-based index + num_blocks_propagated = self.num_blocks - block_index - 1 + swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap + waiting = block_index > 0 and block_index <= self.blocks_to_swap + + if not swapping and not waiting: + return None + + # create hook + block_idx_to_cpu = self.num_blocks - num_blocks_propagated + block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated + block_idx_to_wait = block_index - 1 + + def backward_hook(module, grad_input, grad_output): + if self.debug: + print(f"Backward hook for block {block_index}") + + if swapping: + self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda) + if waiting: + self._wait_blocks_move(block_idx_to_wait) + return None + + return backward_hook + def prepare_block_devices_before_forward(self, blocks: list[nn.Module]): if self.blocks_to_swap is None or self.blocks_to_swap == 0: return diff --git a/library/flux_models.py b/library/flux_models.py index e0bee160f..4fa272522 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -970,8 +970,12 @@ def enable_block_swap(self, num_blocks: int, device: torch.device): double_blocks_to_swap = num_blocks // 2 single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 - self.offloader_double = custom_offloading_utils.ModelOffloader(self.num_double_blocks, double_blocks_to_swap, device) - self.offloader_single = custom_offloading_utils.ModelOffloader(self.num_single_blocks, single_blocks_to_swap, device) + self.offloader_double = custom_offloading_utils.ModelOffloader( + self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device #, debug=True + ) + self.offloader_single = custom_offloading_utils.ModelOffloader( + self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device #, debug=True + ) print( f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." 
) diff --git a/train_network.py b/train_network.py index b90aa420e..d70f14ad3 100644 --- a/train_network.py +++ b/train_network.py @@ -18,6 +18,7 @@ init_ipex() from accelerate.utils import set_seed +from accelerate import Accelerator from diffusers import DDPMScheduler from library import deepspeed_utils, model_util, strategy_base, strategy_sd @@ -272,6 +273,11 @@ def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): text_encoder.text_model.embeddings.to(dtype=weight_dtype) + def prepare_unet_with_accelerator( + self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module + ) -> torch.nn.Module: + return accelerator.prepare(unet) + def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): pass @@ -627,7 +633,8 @@ def train(self, args): training_model = ds_model else: if train_unet: - unet = accelerator.prepare(unet) + # default implementation is: unet = accelerator.prepare(unet) + unet = self.prepare_unet_with_accelerator(args, accelerator, unet) # accelerator does some magic here else: unet.to(accelerator.device, dtype=unet_weight_dtype) # move to device because unet is not prepared by accelerator if train_text_encoder: From 2cb7a6db02ae001355f4830581b9fc2ffffe01c6 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 12 Nov 2024 21:39:13 +0900 Subject: [PATCH 233/582] feat: add block swap for FLUX.1/SD3 LoRA training --- README.md | 212 ++++++---------------------- flux_train.py | 56 +------- flux_train_network.py | 95 +++++++------ library/custom_offloading_utils.py | 75 ++++------ library/flux_models.py | 19 ++- library/flux_train_utils.py | 48 +------ library/sd3_models.py | 71 +++------- library/sd3_train_utils.py | 49 +------ library/train_util.py | 74 +++++++++- sd3_train.py | 186 +++--------------------- sd3_train_network.py | 30 ++++ tools/cache_latents.py | 1 + tools/cache_text_encoder_outputs.py | 1 + train_network.py | 6 +- 14 files changed, 291 insertions(+), 632 deletions(-) diff --git a/README.md b/README.md index 14328607e..1e63b5830 100644 --- a/README.md +++ b/README.md @@ -14,150 +14,11 @@ The command to install PyTorch is as follows: ### Recent Updates -Nov 9, 2024: +Nov 12, 2024: -- Fixed an issue where the image size could not be obtained when caching latents was enabled and a specific file name existed, causing the latent size to be incorrect. See PR [#1770](https://github.com/kohya-ss/sd-scripts/pull/1770) for details. Thanks to feffy380! - -Nov 7, 2024: - -- The distribution of timesteps during SD3/3.5 training has been adjusted. This applies to both fine-tuning and LoRA training. PR [#1768](https://github.com/kohya-ss/sd-scripts/pull/1768) Thanks to Dango233! - - Previously, the side closer to noise was more sampled, but now it is uniform by default. This may improve the problem of difficulty in learning details. - - Specifically, the problem of double shifting has been fixed. The default for `--weighting_scheme` has been changed to `uniform` (the previous default was `logit_normal`). - - A new option `--training_shift` has been added. The default is 1.0, and all timesteps are sampled uniformly. If less than 1.0, the side closer to the image is more sampled (training more on image details), and if more than 1.0, the side closer to noise is more sampled (training more on overall structure). - - The effect of a shift in uniform distribution is shown in the figure below. 
- - ![Figure_1](https://github.com/user-attachments/assets/99a72c67-adfb-4440-81d4-a718985ff350) - -Oct 31, 2024: - -- Added support for SD3.5L/M training. See [SD3 training](#sd3-training) for details. - -Oct 19, 2024: - -- Added an implementation of Differential Output Preservation (temporary name) for SDXL/FLUX.1 LoRA training. SD1/2 is not tested yet. This is an experimental feature. - - A method to make the output of LoRA closer to the output when LoRA is not applied, with captions that do not contain trigger words. - - Define a Dataset subset for the regularization image (`is_reg = true`) with `.toml`. Add `custom_attributes.diff_output_preservation = true`. - - See [dataset configuration](docs/config_README-en.md) for the regularization dataset. - - Specify "number of training images x number of repeats >= number of regularization images x number of repeats". - - The weights of DOP is specified by `--prior_loss_weight` option (not dataset config). - - The appropriate value is still unknown. For FLUX, according to the comments in the [PR](https://github.com/kohya-ss/sd-scripts/pull/1710), the value may be 1 (thanks to dxqbYD!). For SDXL, a larger value may be needed (10-100 may be good starting points). - - It may be good to adjust the value so that the loss is about half to three-quarters of the loss when DOP is not applied. -``` -[[datasets.subsets]] -image_dir = "path/to/image/dir" -num_repeats = 1 -is_reg = true -custom_attributes.diff_output_preservation = true # Add this -``` - - -Oct 13, 2024: - -- Fixed an issue where it took a long time to load the image size when initializing the dataset, especially when the number of images in the dataset was large. - -- During multi-GPU training, caching of latents and Text Encoder outputs is now done in multi-GPU. - - Please make sure that `--highvram` and `--vae_batch_size` are specified correctly. If you have enough VRAM, you can increase the batch size to speed up the caching. - - `--text_encoder_batch_size` option is enabled for FLUX.1 LoRA training and fine tuning. This option specifies the batch size for caching Text Encoder outputs (not for training). The default is same as the dataset batch size. If you have enough VRAM, you can increase the batch size to speed up the caching. - - Multi-threading is also implemented for caching of latents. This may speed up the caching process about 5% (depends on the environment). - - `tools/cache_latents.py` and `tools/cache_text_encoder_outputs.py` also have been updated to support multi-GPU caching. -- `--skip_cache_check` option is added to each training script. - - When specified, the consistency check of the cache file `*.npz` contents (e.g., image size and flip for latents, mask for Text Encoder outputs) is skipped. - - Specify this option if you have a large number of cache files and the consistency check takes time. - - Even if this option is specified, the cache will be created if the file does not exist. - - `--skip_latents_validity_check` in SD3/FLUX.1 is deprecated. Please use `--skip_cache_check` instead. - -Oct 12, 2024 (update 1): - -- [Experimental] FLUX.1 fine-tuning and LoRA training now support "FLUX.1 __compact__" models. - - A compact model is a model that retains the FLUX.1 architecture but reduces the number of double/single blocks from the default 19/38. - - The model is automatically determined based on the keys in *.safetensors. - - Specifications for compact model safetensors: - - Please specify the block indices as consecutive numbers. 
An error will occur if there are missing numbers. For example, if you reduce the double blocks to 15, the maximum key will be `double_blocks.14.*`. The same applies to single blocks. - - LoRA training is unverified. - - The trained model can be used for inference with `flux_minimal_inference.py`. Other inference environments are unverified. - -Oct 12, 2024: - -- Multi-GPU training now works on Windows. Thanks to Akegarasu for PR [#1686](https://github.com/kohya-ss/sd-scripts/pull/1686)! - - In simple tests, SDXL and FLUX.1 LoRA training worked. FLUX.1 fine-tuning did not work, probably due to a PyTorch-related error. Other scripts are unverified. - - Set up multi-GPU training with `accelerate config`. - - Specify `--rdzv_backend=c10d` when launching `accelerate launch`. You can also edit `config.yaml` directly. - ``` - accelerate launch --rdzv_backend=c10d sdxl_train_network.py ... - ``` - - In multi-GPU training, the memory of multiple GPUs is not integrated. In other words, even if you have two 12GB VRAM GPUs, you cannot train the model that requires 24GB VRAM. Training that can be done with 12GB VRAM is executed at (up to) twice the speed. - -Oct 11, 2024: -- ControlNet training for SDXL has been implemented in this branch. Please use `sdxl_train_control_net.py`. - - For details on defining the dataset, see [here](docs/train_lllite_README.md#creating-a-dataset-configuration-file). - - The learning rate for the copy part of the U-Net is specified by `--learning_rate`. The learning rate for the added modules in ControlNet is specified by `--control_net_lr`. The optimal value is still unknown, but try around U-Net `1e-5` and ControlNet `1e-4`. - - If you want to generate sample images, specify the control image as `--cn path/to/control/image`. - - The trained weights are automatically converted and saved in Diffusers format. It should be available in ComfyUI. -- Weighting of prompts (captions) during training in SDXL is now supported (e.g., `(some text)`, `[some text]`, `(some text:1.4)`, etc.). The function is enabled by specifying `--weighted_captions`. - - The default is `False`. It is same as before, and the parentheses are used as normal text. - - If `--weighted_captions` is specified, please use `\` to escape the parentheses in the prompt. For example, `\(some text:1.4\)`. - -Oct 6, 2024: -- In FLUX.1 LoRA training and fine-tuning, the specified weight file (*.safetensors) is automatically determined to be dev or schnell. This allows schnell models to be loaded correctly. Note that LoRA training with schnell models and fine-tuning with schnell models are unverified. -- FLUX.1 LoRA training and fine-tuning can now load weights in Diffusers format in addition to BFL format (a single *.safetensors file). Please specify the parent directory of `transformer` or `diffusion_pytorch_model-00001-of-00003.safetensors` with the full path. However, Diffusers format CLIP/T5XXL is not supported. Saving is supported only in BFL format. - -Sep 26, 2024: -The implementation of block swap during FLUX.1 fine-tuning has been changed to improve speed about 10% (depends on the environment). A new `--blocks_to_swap` option has been added, and `--double_blocks_to_swap` and `--single_blocks_to_swap` are deprecated. `--double_blocks_to_swap` and `--single_blocks_to_swap` are working as before, but they will be removed in the future. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. 
- - -Sep 18, 2024 (update 1): -Fixed an issue where train()/eval() was not called properly with the schedule-free optimizer. The schedule-free optimizer can be used in FLUX.1 LoRA training and fine-tuning for now. - -Sep 18, 2024: - -- Schedule-free optimizer is added. Thanks to sdbds! See PR [#1600](https://github.com/kohya-ss/sd-scripts/pull/1600) for details. - - Details of the schedule-free optimizer can be found in [facebookresearch/schedule_free](https://github.com/facebookresearch/schedule_free). - - `schedulefree` is added to the dependencies. Please update the library if necessary. - - AdamWScheduleFree or SGDScheduleFree can be used. Specify `adamwschedulefree` or `sgdschedulefree` in `--optimizer_type`. - - Wrapper classes are not available for now. - - These can be used not only for FLUX.1 training but also for other training scripts after merging to the dev/main branch. - -Sep 16, 2024: - - Added `train_double_block_indices` and `train_double_block_indices` to the LoRA training script to specify the indices of the blocks to train. See [Specify blocks to train in FLUX.1 LoRA training](#specify-blocks-to-train-in-flux1-lora-training) for details. - -Sep 15, 2024: - -Added a script `convert_diffusers_to_flux.py` to convert Diffusers format FLUX.1 models (checkpoints) to BFL format. See `--help` for usage. Only Flux models are supported. AE/CLIP/T5XXL are not supported. - -The implementation is based on 2kpr's code. Thanks to 2kpr! - -Sep 14, 2024: -- You can now specify the rank for each layer in FLUX.1. See [Specify rank for each layer in FLUX.1](#specify-rank-for-each-layer-in-flux1) for details. -- OFT is now supported with FLUX.1. See [FLUX.1 OFT training](#flux1-oft-training) for details. - -Sep 11, 2024: -Logging to wandb is improved. See PR [#1576](https://github.com/kohya-ss/sd-scripts/pull/1576) for details. Thanks to p1atdev! - -Sep 10, 2024: -In FLUX.1 LoRA training, individual learning rates can be specified for CLIP-L and T5XXL. By specifying multiple numbers in `--text_encoder_lr`, you can set the learning rates for CLIP-L and T5XXL separately. Specify like `--text_encoder_lr 1e-4 1e-5`. The first value is the learning rate for CLIP-L, and the second value is for T5XXL. If you specify only one, the learning rates for CLIP-L and T5XXL will be the same. - -Sep 9, 2024: -Added `--negative_prompt` and `--cfg_scale` to `flux_minimal_inference.py`. Negative prompts can be used. - -Sep 5, 2024 (update 1): - -Added `--cpu_offload_checkpointing` option to LoRA training script. Offloads gradient checkpointing to CPU. This reduces up to 1GB of VRAM usage but slows down the training by about 15%. Cannot be used with `--split_mode`. - -Sep 5, 2024: - -The LoRA merge script now supports CLIP-L and T5XXL LoRA. Please specify `--clip_l` and `--t5xxl`. `--clip_l_save_to` and `--t5xxl_save_to` specify the save destination for CLIP-L and T5XXL. See [Merge LoRA to FLUX.1 checkpoint](#merge-lora-to-flux1-checkpoint) for details. - -Sep 4, 2024: -- T5XXL LoRA is supported in LoRA training. Remove `--network_train_unet_only` and add `train_t5xxl=True` to `--network_args`. CLIP-L is also trained at the same time (T5XXL only cannot be trained). The trained model can be used with ComfyUI. See [Key Features for FLUX.1 LoRA training](#key-features-for-flux1-lora-training) for details. -- In LoRA training, when `--fp8_base` is specified, you can specify `t5xxl_fp8_e4m3fn.safetensors` as the T5XXL weights. However, it is recommended to use fp16 weights for caching. 
-- Fixed an issue where the training CLIP-L LoRA was not used in sample image generation during LoRA training. - -Sep 1, 2024: -- `--timestamp_sampling` has `flux_shift` option. Thanks to sdbds! - - This is the same shift as FLUX.1 dev inference, adjusting the timestep sampling depending on the resolution. `--discrete_flow_shift` is ignored when `flux_shift` is specified. It is not verified which is better, `shift` or `flux_shift`. - -Aug 29, 2024: -Please update `safetensors` to `0.4.4` to fix the error when using `--resume`. `requirements.txt` is updated. +- Improved the implementation of block swap and made it available for both FLUX.1 and SD3 LoRA training. See [FLUX.1 LoRA training](#flux1-lora-training) etc. for how to use the new options. Training is possible with about 8-10GB of VRAM. +- During fine-tuning, the memory usage when specifying the same number of blocks has increased slightly, but the training speed when specifying block swap has been significantly improved. +- There may be bugs due to the significant changes. Feedback is welcome. ## FLUX.1 training @@ -190,7 +51,8 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_t --pretrained_model_name_or_path flux1-dev.safetensors --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.safetensors --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 ---network_module networks.lora_flux --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 +--network_module networks.lora_flux --network_dim 4 --network_train_unet_only +--optimizer_type adamw8bit --learning_rate 1e-4 --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml --output_dir path/to/output/dir --output_name flux-lora-name @@ -198,23 +60,39 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_t ``` (The command is multi-line for readability. Please combine it into one line.) -The training can be done with 16GB VRAM GPUs with Adafactor optimizer. Please use settings like below: +We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. + +The trained LoRA model can be used with ComfyUI. + +When training LoRA for Text Encoder (without `--network_train_unet_only`), more VRAM is required. Please refer to the settings below to reduce VRAM usage. + +__Options for GPUs with less VRAM:__ + +By specifying `--block_to_swap`, you can save VRAM by swapping some blocks between CPU and GPU. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. + +Specify a number like `--block_to_swap 10`. A larger number will swap more blocks, saving more VRAM, but training will be slower. In FLUX.1, you can swap up to 35 blocks. + +`--cpu_offload_checkpointing` offloads gradient checkpointing to CPU. This reduces up to 1GB of VRAM usage but slows down the training by about 15%. Cannot be used with `--block_to_swap`. + +Adafactor optimizer may reduce the VRAM usage than 8bit AdamW. Please use settings like below: ``` --optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 ``` -The training can be done with 12GB VRAM GPUs with Adafactor optimizer, `--split_mode` and `train_blocks=single` options. 
Please use settings like below: +The training can be done with 16GB VRAM GPUs with the batch size of 1. Please change your dataset configuration. + +The training can be done with 12GB VRAM GPUs with `--block_to_swap 16` with 8bit AdamW. Please use settings like below: ``` ---optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --split_mode --network_args "train_blocks=single" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 +--blocks_to_swap 16 ``` -`--cpu_offload_checkpointing` offloads gradient checkpointing to CPU. This reduces up to 1GB of VRAM usage but slows down the training by about 15%. Cannot be used with `--split_mode`. +For GPUs with less than 10GB of VRAM, it is recommended to use an fp8 checkpoint for T5XXL. You can download `t5xxl_fp8_e4m3fn.safetensors` from [comfyanonymous/flux_text_encoders](https://huggingface.co/comfyanonymous/flux_text_encoders) (please use without `scaled`). -We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. +10GB VRAM GPUs will work with 22 blocks swapped, and 8GB VRAM GPUs will work with 28 blocks swapped. -The trained LoRA model can be used with ComfyUI. +__`--split_mode` is deprecated. This option is still available, but they will be removed in the future. Please use `--blocks_to_swap` instead. If this option is specified and `--blocks_to_swap` is not specified, `--blocks_to_swap 18` is automatically enabled.__ #### Key Options for FLUX.1 LoRA training @@ -239,6 +117,7 @@ There are many unknown points in FLUX.1 training, so some settings can be specif - `additive`: add to noisy input - `sigma_scaled`: apply sigma scaling, same as SD3 - `--discrete_flow_shift` is the discrete flow shift for the Euler Discrete Scheduler, default is 3.0 (same as SD3). +- `--blocks_to_swap`. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. The existing `--loss_type` option may be useful for FLUX.1 training. The default is `l2`. @@ -426,9 +305,9 @@ Options are almost the same as LoRA training. The difference is `--full_bf16`, ` `--blockwise_fused_optimizers` enables the fusing of the optimizer step into the backward pass for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency and stochastic rounding. `--blockwise_fused_optimizers` cannot be used with `--fused_backward_pass`. Stochastic rounding is not supported for now. -`--blocks_to_swap` is the number of blocks to swap. The default is None (no swap). These options must be combined with `--fused_backward_pass` or `--blockwise_fused_optimizers`. The recommended maximum value is 36. +`--blocks_to_swap` is the number of blocks to swap. The default is None (no swap). The maximum value is 35. -`--cpu_offload_checkpointing` is to offload the gradient checkpointing to CPU. This reduces about 2GB of VRAM usage. +`--cpu_offload_checkpointing` is to offload the gradient checkpointing to CPU. This reduces about 2GB of VRAM usage. This option cannot be used with `--blocks_to_swap`. All these options are experimental and may change in the future. @@ -448,13 +327,13 @@ There are two possible ways to use block swap. It is unknown which is better. 2. Swap many blocks to increase the batch size and shorten the training speed per data. - For example, swapping 20 blocks seems to increase the batch size to about 6. In this case, the training speed per data will be relatively faster than 1. 
+ For example, swapping 35 blocks seems to increase the batch size to about 5. In this case, the training speed per data will be relatively faster than 1. #### Training with <24GB VRAM GPUs Swap 28 blocks without cpu offload checkpointing may be working with 12GB VRAM GPUs. Please try different settings according to VRAM size of your GPU. -T5XXL requires about 10GB of VRAM, so 10GB of VRAM will be minimum requirement for FLUX.1 fine-tuning. +T5XXL requires about 10GB of VRAM, so 10GB of VRAM will be minimum requirement for FLUX.1 fine-tuning. #### Key Features for FLUX.1 fine-tuning @@ -465,17 +344,19 @@ T5XXL requires about 10GB of VRAM, so 10GB of VRAM will be minimum requirement f - Since the transfer between CPU and GPU takes time, the training will be slower. - `--blocks_to_swap` specify the number of blocks to swap. - About 640MB of memory can be saved per block. - - Since the memory usage of one double block and two single blocks is almost the same, the transfer of single blocks is done in units of two. For example, consider the case of `--blocks_to_swap 6`. - - Before the forward pass, all double blocks and 26 (=38-12) single blocks are on the GPU. The last 12 single blocks are on the CPU. - - In the forward pass, the 6 double blocks that have finished calculation (the first 6 blocks) are transferred to the CPU, and the 12 single blocks to be calculated (the last 12 blocks) are transferred to the GPU. - - The same is true for the backward pass, but in reverse order. The 12 single blocks that have finished calculation are transferred to the CPU, and the 6 double blocks to be calculated are transferred to the GPU. - - After the backward pass, the blocks are back to their original locations. + - (Update 1: Nov 12, 2024) + - The maximum number of blocks that can be swapped is 35. + - We are exchanging only the data of the weights (weight.data) in reference to the implementation of OneTrainer (thanks to OneTrainer). However, the mechanism of the exchange is a custom implementation. + - Since it takes time to free CUDA memory (torch.cuda.empty_cache()), we reuse the CUDA memory allocated to weight.data as it is and exchange the weights between modules. + - This shortens the time it takes to exchange weights between modules. + - Since the weights must be almost identical to be exchanged, FLUX.1 exchanges the weights between double blocks and single blocks. + - In SD3, all blocks are similar, but some weights are different, so there are weights that always remain on the GPU. 2. Sample Image Generation: - Sample image generation during training is now supported. - The prompts are cached and used for generation if `--cache_latents` is specified. So changing the prompts during training will not affect the generated images. - Specify options such as `--sample_prompts` and `--sample_every_n_epochs`. - - Note: It will be very slow when `--split_mode` is specified. + - Note: It will be very slow when `--blocks_to_swap` is specified. 3. Experimental Memory-Efficient Saving: - `--mem_eff_save` option can further reduce memory consumption during model saving (about 22GB). 
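Item 1 above explains that block swap exchanges only `weight.data` between the block leaving the GPU and the block arriving from the CPU, so the CUDA buffers that are already allocated get reused instead of freed and reallocated (avoiding slow `torch.cuda.empty_cache()` calls). The sketch below illustrates that idea; `swap_block_weights` is a hypothetical helper, not the library's own function, it matches submodules by name as the patch does, and it simplifies the CPU side by allocating a fresh staging copy.

```python
import torch
import torch.nn as nn


def swap_block_weights(block_on_device: nn.Module, block_on_cpu: nn.Module, device: torch.device):
    # Exchange only weight.data so the CUDA allocation already owned by
    # block_on_device is reused by block_on_cpu's weights.
    cpu_modules = dict(block_on_cpu.named_modules())
    for name, mod_dev in block_on_device.named_modules():
        mod_cpu = cpu_modules.get(name)
        if mod_cpu is None:
            continue
        if getattr(mod_dev, "weight", None) is None or getattr(mod_cpu, "weight", None) is None:
            continue
        if mod_dev.weight.shape != mod_cpu.weight.shape:
            continue  # only identically-shaped weights can reuse each other's buffers
        gpu_tensor = mod_dev.weight.data                           # existing CUDA allocation
        cpu_copy = gpu_tensor.to("cpu")                            # stage the outgoing weights on CPU
        gpu_tensor.copy_(mod_cpu.weight.data, non_blocking=True)   # refill the same CUDA buffer
        mod_cpu.weight.data = gpu_tensor                           # incoming block now lives on the GPU
        mod_dev.weight.data = cpu_copy                             # outgoing block keeps a CPU copy
    if device.type == "cuda":
        torch.cuda.synchronize(device)
```

Because the exchange requires identically-shaped weights, FLUX.1 swaps double blocks and single blocks with separate offloaders, and in SD3 the few weights whose shapes differ simply stay on the GPU, as noted above.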
@@ -621,20 +502,19 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 sd3_tr --pretrained_model_name_or_path path/to/sd3.5_large.safetensors --clip_l sd3/clip_l.safetensors --clip_g sd3/clip_g.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 ---network_module networks.lora_sd3 --network_dim 4 --optimizer_type adamw8bit --learning_rate 1e-4 +--network_module networks.lora_sd3 --network_dim 4 --network_train_unet_only +--optimizer_type adamw8bit --learning_rate 1e-4 --cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base --highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml --output_dir path/to/output/dir --output_name sd3-lora-name ``` (The command is multi-line for readability. Please combine it into one line.) -The training can be done with 12GB VRAM GPUs with Adafactor optimizer. Please use settings like below: +Like FLUX.1 training, the `--blocks_to_swap` option for memory reduction is available. The maximum number of blocks that can be swapped is 36 for SD3.5L and 22 for SD3.5M. -``` ---optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 -``` +Adafactor optimizer is also available. -`--cpu_offload_checkpointing` and `--split_mode` are not available for SD3 LoRA training. +`--cpu_offload_checkpointing` option is not available. We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. diff --git a/flux_train.py b/flux_train.py index 346fe8fbd..ad2c7722b 100644 --- a/flux_train.py +++ b/flux_train.py @@ -78,6 +78,10 @@ def train(args): ) args.gradient_checkpointing = True + assert ( + args.blocks_to_swap is None or args.blocks_to_swap == 0 + ) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" + cache_latents = args.cache_latents use_dreambooth_method = args.in_json is None @@ -518,47 +522,6 @@ def grad_hook(parameter: torch.Tensor): parameter_optimizer_map[parameter] = opt_idx num_parameters_per_group[opt_idx] += 1 - # add hooks for block swapping: this hook is called after fused_backward_pass hook or blockwise_fused_optimizers hook - if False: # is_swapping_blocks: - import library.custom_offloading_utils as custom_offloading_utils - - num_double_blocks = len(accelerator.unwrap_model(flux).double_blocks) - num_single_blocks = len(accelerator.unwrap_model(flux).single_blocks) - double_blocks_to_swap = args.blocks_to_swap // 2 - single_blocks_to_swap = (args.blocks_to_swap - double_blocks_to_swap) * 2 - - offloader_double = custom_offloading_utils.TrainOffloader(num_double_blocks, double_blocks_to_swap, accelerator.device) - offloader_single = custom_offloading_utils.TrainOffloader(num_single_blocks, single_blocks_to_swap, accelerator.device) - - param_name_pairs = [] - if not args.blockwise_fused_optimizers: - for param_group, param_name_group in zip(optimizer.param_groups, param_names): - param_name_pairs.extend(zip(param_group["params"], param_name_group)) - else: - # named_parameters is a list of (name, parameter) pairs - param_name_pairs.extend([(p, n) for n, p in flux.named_parameters()]) - - for parameter, param_name in param_name_pairs: - 
if not parameter.requires_grad: - continue - - is_double = param_name.startswith("double_blocks") - is_single = param_name.startswith("single_blocks") - if not is_double and not is_single: - continue - - block_index = int(param_name.split(".")[1]) - if is_double: - blocks = flux.double_blocks - offloader = offloader_double - else: - blocks = flux.single_blocks - offloader = offloader_single - - grad_hook = offloader.create_grad_hook(blocks, block_index) - if grad_hook is not None: - parameter.register_post_accumulate_grad_hook(grad_hook) - # epoch数を計算する num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) @@ -827,6 +790,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) add_custom_train_arguments(parser) # TODO remove this from here + train_util.add_dit_training_arguments(parser) flux_train_utils.add_flux_train_arguments(parser) parser.add_argument( @@ -851,16 +815,6 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="[Deprecated] use 'skip_cache_check' instead / 代わりに 'skip_cache_check' を使用してください", ) - parser.add_argument( - "--blocks_to_swap", - type=int, - default=None, - help="[EXPERIMENTAL] " - "Sets the number of blocks (~640MB) to swap during the forward and backward passes." - "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." - " / 順伝播および逆伝播中にスワップするブロック(約640MB)の数を設定します。" - "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", - ) parser.add_argument( "--double_blocks_to_swap", type=int, diff --git a/flux_train_network.py b/flux_train_network.py index 376cc1597..9bcd59282 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -52,10 +52,23 @@ def assert_extra_args(self, args, train_dataset_group): if args.max_token_length is not None: logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません") - assert not args.split_mode or not args.cpu_offload_checkpointing, ( - "split_mode and cpu_offload_checkpointing cannot be used together" - " / split_modeとcpu_offload_checkpointingは同時に使用できません" - ) + assert ( + args.blocks_to_swap is None or args.blocks_to_swap == 0 + ) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" + + # deprecated split_mode option + if args.split_mode: + if args.blocks_to_swap is not None: + logger.warning( + "split_mode is deprecated. Because `--blocks_to_swap` is set, `--split_mode` is ignored." + " / split_modeは非推奨です。`--blocks_to_swap`が設定されているため、`--split_mode`は無視されます。" + ) + else: + logger.warning( + "split_mode is deprecated. Please use `--blocks_to_swap` instead. `--blocks_to_swap 18` is automatically set." + " / split_modeは非推奨です。代わりに`--blocks_to_swap`を使用してください。`--blocks_to_swap 18`が自動的に設定されました。" + ) + args.blocks_to_swap = 18 # 18 is safe for most cases train_dataset_group.verify_bucket_reso_steps(32) # TODO check this @@ -75,9 +88,15 @@ def load_target_model(self, args, weight_dtype, accelerator): raise ValueError(f"Unsupported fp8 model dtype: {model.dtype}") elif model.dtype == torch.float8_e4m3fn: logger.info("Loaded fp8 FLUX model") + else: + logger.info( + "Cast FLUX model to fp8. This may take a while. You can reduce the time by using fp8 checkpoint." 
+ " / FLUXモデルをfp8に変換しています。これには時間がかかる場合があります。fp8チェックポイントを使用することで時間を短縮できます。" + ) + model.to(torch.float8_e4m3fn) - if args.split_mode: - model = self.prepare_split_model(model, weight_dtype, accelerator) + # if args.split_mode: + # model = self.prepare_split_model(model, weight_dtype, accelerator) self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 if self.is_swapping_blocks: @@ -108,6 +127,7 @@ def load_target_model(self, args, weight_dtype, accelerator): return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model + """ def prepare_split_model(self, model, weight_dtype, accelerator): from accelerate import init_empty_weights @@ -144,6 +164,7 @@ def prepare_split_model(self, model, weight_dtype, accelerator): logger.info("split model prepared") return flux_lower + """ def get_tokenize_strategy(self, args): _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) @@ -291,14 +312,12 @@ def sample_images(self, accelerator, args, epoch, global_step, device, ae, token text_encoders = text_encoder # for compatibility text_encoders = self.get_models_for_text_encoding(args, accelerator, text_encoders) - if not args.split_mode: - if self.is_swapping_blocks: - accelerator.unwrap_model(flux).prepare_block_swap_before_forward() - flux_train_utils.sample_images( - accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs - ) - return + flux_train_utils.sample_images( + accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs + ) + # return + """ class FluxUpperLowerWrapper(torch.nn.Module): def __init__(self, flux_upper: flux_models.FluxUpper, flux_lower: flux_models.FluxLower, device: torch.device): super().__init__() @@ -325,6 +344,7 @@ def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None, txt_a accelerator, args, epoch, global_step, wrapper, ae, text_encoders, self.sample_prompts_te_outputs ) clean_memory_on_device(accelerator.device) + """ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) @@ -383,20 +403,21 @@ def get_noise_pred_and_target( t5_attn_mask = None def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask): - if not args.split_mode: - # normal forward - with accelerator.autocast(): - # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) - model_pred = unet( - img=img, - img_ids=img_ids, - txt=t5_out, - txt_ids=txt_ids, - y=l_pooled, - timesteps=timesteps / 1000, - guidance=guidance_vec, - txt_attention_mask=t5_attn_mask, - ) + # if not args.split_mode: + # normal forward + with accelerator.autocast(): + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) + model_pred = unet( + img=img, + img_ids=img_ids, + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + timesteps=timesteps / 1000, + guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, + ) + """ else: # split forward to reduce memory usage assert network.train_blocks == "single", "train_blocks must be single for split mode" @@ -430,6 +451,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t 
vec.requires_grad_(True) pe.requires_grad_(True) model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask) + """ return model_pred @@ -558,30 +580,23 @@ def prepare_unet_with_accelerator( flux: flux_models.Flux = unet flux = accelerator.prepare(flux, device_placement=[not self.is_swapping_blocks]) accelerator.unwrap_model(flux).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage + accelerator.unwrap_model(flux).prepare_block_swap_before_forward() return flux def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() + train_util.add_dit_training_arguments(parser) flux_train_utils.add_flux_train_arguments(parser) parser.add_argument( "--split_mode", action="store_true", - help="[EXPERIMENTAL] use split mode for Flux model, network arg `train_blocks=single` is required" - + "/[実験的] Fluxモデルの分割モードを使用する。ネットワーク引数`train_blocks=single`が必要", - ) - - parser.add_argument( - "--blocks_to_swap", - type=int, - default=None, - help="[EXPERIMENTAL] " - "Sets the number of blocks to swap during the forward and backward passes." - "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." - " / 順伝播および逆伝播中にスワップするブロックの数を設定します。" - "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", + # help="[EXPERIMENTAL] use split mode for Flux model, network arg `train_blocks=single` is required" + # + "/[実験的] Fluxモデルの分割モードを使用する。ネットワーク引数`train_blocks=single`が必要", + help="[Deprecated] This option is deprecated. Please use `--blocks_to_swap` instead." + " / このオプションは非推奨です。代わりに`--blocks_to_swap`を使用してください。", ) return parser diff --git a/library/custom_offloading_utils.py b/library/custom_offloading_utils.py index 70da93902..84c2b743e 100644 --- a/library/custom_offloading_utils.py +++ b/library/custom_offloading_utils.py @@ -16,13 +16,29 @@ def synchronize_device(device: torch.device): torch.mps.synchronize() -def swap_weight_devices(layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): +def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): assert layer_to_cpu.__class__ == layer_to_cuda.__class__ weight_swap_jobs = [] - for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): - if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: - weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + + # This is not working for all cases (e.g. 
SD3), so we need to find the corresponding modules + # for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): + # print(module_to_cpu.__class__, module_to_cuda.__class__) + # if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: + # weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + + modules_to_cpu = {k: v for k, v in layer_to_cpu.named_modules()} + for module_to_cuda_name, module_to_cuda in layer_to_cuda.named_modules(): + if hasattr(module_to_cuda, "weight") and module_to_cuda.weight is not None: + module_to_cpu = modules_to_cpu.get(module_to_cuda_name, None) + if module_to_cpu is not None and module_to_cpu.weight.shape == module_to_cuda.weight.shape: + weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + else: + if module_to_cuda.weight.data.device.type != device.type: + # print( + # f"Module {module_to_cuda_name} not found in CPU model or shape mismatch, so not swapping and moving to device" + # ) + module_to_cuda.weight.data = module_to_cuda.weight.data.to(device) torch.cuda.current_stream().synchronize() # this prevents the illegal loss value @@ -92,7 +108,7 @@ def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, d def swap_weight_devices(self, block_to_cpu: nn.Module, block_to_cuda: nn.Module): if self.cuda_available: - swap_weight_devices(block_to_cpu, block_to_cuda) + swap_weight_devices_cuda(self.device, block_to_cpu, block_to_cuda) else: swap_weight_devices_no_cuda(self.device, block_to_cpu, block_to_cuda) @@ -132,52 +148,6 @@ def _wait_blocks_move(self, block_idx): print(f"Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s") -class TrainOffloader(Offloader): - """ - supports backward offloading - """ - - def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): - super().__init__(num_blocks, blocks_to_swap, device, debug) - self.hook_added = set() - - def create_grad_hook(self, blocks: list[nn.Module], block_index: int) -> Optional[callable]: - if block_index in self.hook_added: - return None - self.hook_added.add(block_index) - - # -1 for 0-based index, -1 for current block is not fully backpropagated yet - num_blocks_propagated = self.num_blocks - block_index - 2 - swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap - waiting = block_index > 0 and block_index <= self.blocks_to_swap - - if not swapping and not waiting: - return None - - # create hook - block_idx_to_cpu = self.num_blocks - num_blocks_propagated - block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated - block_idx_to_wait = block_index - 1 - - if self.debug: - print( - f"Backward: Created grad hook for block {block_index} with {block_idx_to_cpu}, {block_idx_to_cuda}, {block_idx_to_wait}" - ) - if swapping: - - def grad_hook(tensor: torch.Tensor): - self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda) - - return grad_hook - - else: - - def grad_hook(tensor: torch.Tensor): - self._wait_blocks_move(block_idx_to_wait) - - return grad_hook - - class ModelOffloader(Offloader): """ supports forward offloading @@ -228,6 +198,9 @@ def prepare_block_devices_before_forward(self, blocks: list[nn.Module]): if self.blocks_to_swap is None or self.blocks_to_swap == 0: return + if self.debug: + print("Prepare block devices before forward") + for b in blocks[0 : self.num_blocks - self.blocks_to_swap]: 
b.to(self.device) weighs_to_device(b, self.device) # make sure weights are on device diff --git a/library/flux_models.py b/library/flux_models.py index 4fa272522..fa3c7ad2b 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -970,11 +970,16 @@ def enable_block_swap(self, num_blocks: int, device: torch.device): double_blocks_to_swap = num_blocks // 2 single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + assert double_blocks_to_swap <= self.num_double_blocks - 2 and single_blocks_to_swap <= self.num_single_blocks - 2, ( + f"Cannot swap more than {self.num_double_blocks - 2} double blocks and {self.num_single_blocks - 2} single blocks. " + f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks." + ) + self.offloader_double = custom_offloading_utils.ModelOffloader( - self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device #, debug=True + self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device # , debug=True ) self.offloader_single = custom_offloading_utils.ModelOffloader( - self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device #, debug=True + self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device # , debug=True ) print( f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." @@ -1061,10 +1066,11 @@ def forward( return img +""" class FluxUpper(nn.Module): - """ + "" Transformer model for flow matching on sequences. - """ + "" def __init__(self, params: FluxParams): super().__init__() @@ -1168,9 +1174,9 @@ def forward( class FluxLower(nn.Module): - """ + "" Transformer model for flow matching on sequences. - """ + "" def __init__(self, params: FluxParams): super().__init__() @@ -1228,3 +1234,4 @@ def forward( img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) return img +""" diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index fa673a2f0..d90644a25 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -257,14 +257,9 @@ def sample_image_inference( wandb_tracker = accelerator.get_tracker("wandb") import wandb + # not to commit images to avoid inconsistency between training and logging steps - wandb_tracker.log( - {f"sample_{i}": wandb.Image( - image, - caption=prompt # positive prompt as a caption - )}, - commit=False - ) + wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption def time_shift(mu: float, sigma: float, t: torch.Tensor): @@ -324,7 +319,7 @@ def denoise( ) img = img + (t_prev - t_curr) * pred - + model.prepare_block_swap_before_forward() return img @@ -549,44 +544,7 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): action="store_true", help="apply attention mask to T5-XXL encode and FLUX double blocks / T5-XXLエンコードとFLUXダブルブロックにアテンションマスクを適用する", ) - parser.add_argument( - "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする" - ) - parser.add_argument( - "--cache_text_encoder_outputs_to_disk", - action="store_true", - help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", - ) - parser.add_argument( - "--text_encoder_batch_size", - type=int, - default=None, - help="text encoder batch size (default: None, use dataset's batch size)" - + " / text encoderのバッチサイズ(デフォルト: None, データセットのバッチサイズを使用)", - ) - parser.add_argument( - 
"--disable_mmap_load_safetensors", - action="store_true", - help="disable mmap load for safetensors. Speed up model loading in WSL environment / safetensorsのmmapロードを無効にする。WSL環境等でモデル読み込みを高速化できる", - ) - # copy from Diffusers - parser.add_argument( - "--weighting_scheme", - type=str, - default="none", - choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"], - ) - parser.add_argument( - "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme." - ) - parser.add_argument("--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme.") - parser.add_argument( - "--mode_scale", - type=float, - default=1.29, - help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.", - ) parser.add_argument( "--guidance_scale", type=float, diff --git a/library/sd3_models.py b/library/sd3_models.py index 89225fe4d..8b90205db 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -18,6 +18,7 @@ from torch.utils.checkpoint import checkpoint from transformers import CLIPTokenizer, T5TokenizerFast +from library import custom_offloading_utils from library.device_utils import clean_memory_on_device from .utils import setup_logging @@ -862,7 +863,8 @@ def __init__( # self.initialize_weights() self.blocks_to_swap = None - self.thread_pool: Optional[ThreadPoolExecutor] = None + self.offloader = None + self.num_blocks = len(self.joint_blocks) def enable_scaled_pos_embed(self, use_scaled_pos_embed: bool, latent_sizes: Optional[list[int]]): self.use_scaled_pos_embed = use_scaled_pos_embed @@ -1055,14 +1057,20 @@ def cropped_scaled_pos_embed(self, h, w, device=None, dtype=None, random_crop: b # ) return spatial_pos_embed - def enable_block_swap(self, num_blocks: int): + def enable_block_swap(self, num_blocks: int, device: torch.device): self.blocks_to_swap = num_blocks - n = 1 # async block swap. 1 is enough - self.thread_pool = ThreadPoolExecutor(max_workers=n) + assert ( + self.blocks_to_swap <= self.num_blocks - 2 + ), f"Cannot swap more than {self.num_blocks - 2} blocks. Requested: {self.blocks_to_swap} blocks." + + self.offloader = custom_offloading_utils.ModelOffloader( + self.joint_blocks, self.num_blocks, self.blocks_to_swap, device # , debug=True + ) + print(f"SD3: Block swap enabled. Swapping {num_blocks} blocks, total blocks: {self.num_blocks}, device: {device}.") def move_to_device_except_swap_blocks(self, device: torch.device): - # assume model is on cpu + # assume model is on cpu. do not move blocks to device to reduce temporary memory usage if self.blocks_to_swap: save_blocks = self.joint_blocks self.joint_blocks = None @@ -1073,16 +1081,9 @@ def move_to_device_except_swap_blocks(self, device: torch.device): self.joint_blocks = save_blocks def prepare_block_swap_before_forward(self): - # make: first n blocks are on cuda, and last n blocks are on cpu if self.blocks_to_swap is None or self.blocks_to_swap == 0: - # raise ValueError("Block swap is not enabled.") return - num_blocks = len(self.joint_blocks) - for i in range(num_blocks - self.blocks_to_swap): - self.joint_blocks[i].to(self.device) - for i in range(num_blocks - self.blocks_to_swap, num_blocks): - self.joint_blocks[i].to("cpu") - clean_memory_on_device(self.device) + self.offloader.prepare_block_devices_before_forward(self.joint_blocks) def forward( self, @@ -1122,57 +1123,19 @@ def forward( if self.register_length > 0: context = torch.cat( - ( - einops.repeat(self.register, "1 ... 
-> b ...", b=x.shape[0]), - default(context, torch.Tensor([]).type_as(x)), - ), - 1, + (einops.repeat(self.register, "1 ... -> b ...", b=x.shape[0]), default(context, torch.Tensor([]).type_as(x))), 1 ) if not self.blocks_to_swap: for block in self.joint_blocks: context, x = block(context, x, c) else: - futures = {} - - def submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda): - def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): - # print(f"Moving {bidx_to_cpu} to cpu.") - block_to_cpu.to("cpu", non_blocking=True) - torch.cuda.empty_cache() - - # print(f"Moving {bidx_to_cuda} to cuda.") - block_to_cuda.to(self.device, non_blocking=True) - - torch.cuda.synchronize() - # print(f"Block move done. {bidx_to_cpu} to cpu, {bidx_to_cuda} to cuda.") - return block_idx_to_cpu, block_idx_to_cuda - - block_to_cpu = self.joint_blocks[block_idx_to_cpu] - block_to_cuda = self.joint_blocks[block_idx_to_cuda] - # print(f"Submit move blocks. {block_idx_to_cpu} to cpu, {block_idx_to_cuda} to cuda.") - return self.thread_pool.submit(move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda) - - def wait_for_blocks_move(block_idx, ftrs): - if block_idx not in ftrs: - return - # print(f"Waiting for move blocks: {block_idx}") - # start_time = time.perf_counter() - ftr = ftrs.pop(block_idx) - ftr.result() - # torch.cuda.synchronize() - # print(f"Move blocks took {time.perf_counter() - start_time:.2f} seconds") - for block_idx, block in enumerate(self.joint_blocks): - wait_for_blocks_move(block_idx, futures) + self.offloader.wait_for_block(block_idx) context, x = block(context, x, c) - if block_idx < self.blocks_to_swap: - block_idx_to_cpu = block_idx - block_idx_to_cuda = len(self.joint_blocks) - self.blocks_to_swap + block_idx - future = submit_move_blocks(block_idx_to_cpu, block_idx_to_cuda) - futures[block_idx_to_cuda] = future + self.offloader.submit_move_blocks(self.joint_blocks, block_idx) x = self.final_layer(x, c, H, W) # Our final layer combined UnPatchify return x[:, :, :H, :W] diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 38f3c25f4..c40798846 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -142,27 +142,6 @@ def sd_saver(ckpt_file, epoch_no, global_step): def add_sd3_training_arguments(parser: argparse.ArgumentParser): - parser.add_argument( - "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする" - ) - parser.add_argument( - "--cache_text_encoder_outputs_to_disk", - action="store_true", - help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", - ) - parser.add_argument( - "--text_encoder_batch_size", - type=int, - default=None, - help="text encoder batch size (default: None, use dataset's batch size)" - + " / text encoderのバッチサイズ(デフォルト: None, データセットのバッチサイズを使用)", - ) - parser.add_argument( - "--disable_mmap_load_safetensors", - action="store_true", - help="disable mmap load for safetensors. Speed up model loading in WSL environment / safetensorsのmmapロードを無効にする。WSL環境等でモデル読み込みを高速化できる", - ) - parser.add_argument( "--clip_l", type=str, @@ -253,32 +232,8 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser): " / 複数解像度学習時に解像度ごとに位置埋め込みをスケーリングする。SD3.5M以外では予期しない動作になります", ) - # Dependencies of Diffusers noise sampler has been removed for clarity. 
- parser.add_argument( - "--weighting_scheme", - type=str, - default="uniform", - choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "uniform"], - help="weighting scheme for timestep distribution and loss / タイムステップ分布と損失のための重み付けスキーム", - ) - parser.add_argument( - "--logit_mean", - type=float, - default=0.0, - help="mean to use when using the `'logit_normal'` weighting scheme for timestep distribution. / タイムステップ分布のために`'logit_normal'`重み付けスキームを使用する場合の平均", - ) - parser.add_argument( - "--logit_std", - type=float, - default=1.0, - help="std to use when using the `'logit_normal'` weighting scheme for timestep distribution. / タイムステップ分布のために`'logit_normal'`重み付けスキームを使用する場合のstd", - ) - parser.add_argument( - "--mode_scale", - type=float, - default=1.29, - help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`. / モード重み付けスキームのスケール。`'mode'`を`weighting_scheme`として使用する場合のみ有効", - ) + # Dependencies of Diffusers noise sampler has been removed for clarity in training + parser.add_argument( "--training_shift", type=float, diff --git a/library/train_util.py b/library/train_util.py index a5d6fdd21..e1dfeecdb 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1887,7 +1887,9 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # make image path to npz path mapping npz_paths = glob.glob(os.path.join(subset.image_dir, "*" + strategy.cache_suffix)) - npz_paths.sort(key=lambda item: item.rsplit("_", maxsplit=2)[0]) # sort by name excluding resolution and cache_suffix + npz_paths.sort( + key=lambda item: item.rsplit("_", maxsplit=2)[0] + ) # sort by name excluding resolution and cache_suffix npz_path_index = 0 size_set_count = 0 @@ -3537,8 +3539,8 @@ def int_or_float(value): parser.add_argument( "--fused_backward_pass", action="store_true", - help="Combines backward pass and optimizer step to reduce VRAM usage. Only available in SDXL" - + " / バックワードパスとオプティマイザステップを組み合わせてVRAMの使用量を削減します。SDXLでのみ有効", + help="Combines backward pass and optimizer step to reduce VRAM usage. Only available in SDXL, SD3 and FLUX" + " / バックワードパスとオプティマイザステップを組み合わせてVRAMの使用量を削減します。SDXL、SD3、FLUXでのみ利用可能", ) parser.add_argument( "--lr_scheduler_timescale", @@ -4027,6 +4029,72 @@ def add_masked_loss_arguments(parser: argparse.ArgumentParser): ) +def add_dit_training_arguments(parser: argparse.ArgumentParser): + # Text encoder related arguments + parser.add_argument( + "--cache_text_encoder_outputs", action="store_true", help="cache text encoder outputs / text encoderの出力をキャッシュする" + ) + parser.add_argument( + "--cache_text_encoder_outputs_to_disk", + action="store_true", + help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", + ) + parser.add_argument( + "--text_encoder_batch_size", + type=int, + default=None, + help="text encoder batch size (default: None, use dataset's batch size)" + + " / text encoderのバッチサイズ(デフォルト: None, データセットのバッチサイズを使用)", + ) + + # Model loading optimization + parser.add_argument( + "--disable_mmap_load_safetensors", + action="store_true", + help="disable mmap load for safetensors. Speed up model loading in WSL environment / safetensorsのmmapロードを無効にする。WSL環境等でモデル読み込みを高速化できる", + ) + + # Training arguments. partial copy from Diffusers + parser.add_argument( + "--weighting_scheme", + type=str, + default="uniform", + choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none", "uniform"], + help="weighting scheme for timestep distribution. 
Default is uniform, uniform and none are the same behavior" + " / タイムステップ分布の重み付けスキーム、デフォルトはuniform、uniform と none は同じ挙動", + ) + parser.add_argument( + "--logit_mean", + type=float, + default=0.0, + help="mean to use when using the `'logit_normal'` weighting scheme / `'logit_normal'`重み付けスキームを使用する場合の平均", + ) + parser.add_argument( + "--logit_std", + type=float, + default=1.0, + help="std to use when using the `'logit_normal'` weighting scheme / `'logit_normal'`重み付けスキームを使用する場合のstd", + ) + parser.add_argument( + "--mode_scale", + type=float, + default=1.29, + help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme` / モード重み付けスキームのスケール", + ) + + # offloading + parser.add_argument( + "--blocks_to_swap", + type=int, + default=None, + help="[EXPERIMENTAL] " + "Sets the number of blocks to swap during the forward and backward passes." + "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." + " / 順伝播および逆伝播中にスワップするブロックの数を設定します。" + "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", + ) + + def get_sanitized_config_or_none(args: argparse.Namespace): # if `--log_config` is enabled, return args for logging. if not, return None. # when `--log_config is enabled, filter out sensitive values from args diff --git a/sd3_train.py b/sd3_train.py index 24ecbfb7d..a4fc2eec8 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -201,21 +201,6 @@ def train(args): # モデルを読み込む # t5xxl_dtype = weight_dtype - # if args.t5xxl_dtype is not None: - # if args.t5xxl_dtype == "fp16": - # t5xxl_dtype = torch.float16 - # elif args.t5xxl_dtype == "bf16": - # t5xxl_dtype = torch.bfloat16 - # elif args.t5xxl_dtype == "fp32" or args.t5xxl_dtype == "float": - # t5xxl_dtype = torch.float32 - # else: - # raise ValueError(f"unexpected t5xxl_dtype: {args.t5xxl_dtype}") - # t5xxl_device = accelerator.device if args.t5xxl_device is None else args.t5xxl_device - # clip_dtype = weight_dtype # if not args.train_text_encoder else None - - # if clip_l is not specified, the checkpoint must contain clip_l, so we load state dict here - # if full_fp16/bf16, model_dtype is casted to fp16/bf16. If not, model_dtype is None (float32). - # by loading with model_dtype, we can reduce memory usage. model_dtype = match_mixed_precision(args, weight_dtype) # None (default) or fp16/bf16 (full_xxxx) if args.clip_l is None: sd3_state_dict = utils.load_safetensors( @@ -384,7 +369,7 @@ def train(args): # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. # This idea is based on 2kpr's great work. Thank you! 
logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") - mmdit.enable_block_swap(args.blocks_to_swap) + mmdit.enable_block_swap(args.blocks_to_swap, accelerator.device) if not cache_latents: # move to accelerator device @@ -611,108 +596,21 @@ def train(args): # resumeする train_util.resume_from_local_or_hf_if_specified(accelerator, args) - # memory efficient block swapping - - def submit_move_blocks(futures, thread_pool, block_idx_to_cpu, block_idx_to_cuda, blocks, device): - def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda, dvc): - # print(f"Backward: Move block {bidx_to_cpu} to CPU") - block_to_cpu = block_to_cpu.to("cpu", non_blocking=True) - torch.cuda.empty_cache() - - # print(f"Backward: Move block {bidx_to_cuda} to CUDA") - block_to_cuda = block_to_cuda.to(dvc, non_blocking=True) - torch.cuda.synchronize() - # print(f"Backward: Done moving blocks {bidx_to_cpu} and {bidx_to_cuda}") - return bidx_to_cpu, bidx_to_cuda - - block_to_cpu = blocks[block_idx_to_cpu] - block_to_cuda = blocks[block_idx_to_cuda] - - futures[block_idx_to_cuda] = thread_pool.submit( - move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda, device - ) - - def wait_blocks_move(block_idx, futures): - if block_idx not in futures: - return - future = futures.pop(block_idx) - future.result() - if args.fused_backward_pass: # use fused optimizer for backward pass: other optimizers will be supported in the future import library.adafactor_fused library.adafactor_fused.patch_adafactor_fused(optimizer) - blocks_to_swap = args.blocks_to_swap - num_blocks = len(accelerator.unwrap_model(mmdit).joint_blocks) - handled_block_indices = set() - - n = 1 # only asynchronous purpose, no need to increase this number - # n = 2 - # n = max(1, os.cpu_count() // 2) - thread_pool = ThreadPoolExecutor(max_workers=n) - futures = {} - for param_group, param_name_group in zip(optimizer.param_groups, param_names): for parameter, param_name in zip(param_group["params"], param_name_group): if parameter.requires_grad: - grad_hook = None - - if blocks_to_swap: - is_block = param_name.startswith("joint_blocks") - if is_block: - block_idx = int(param_name.split(".")[1]) - if block_idx not in handled_block_indices: - # swap following (already backpropagated) block - handled_block_indices.add(block_idx) - - # if n blocks were already backpropagated - num_blocks_propagated = num_blocks - block_idx - 1 - swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap - waiting = block_idx > 0 and block_idx <= blocks_to_swap - if swapping or waiting: - block_idx_to_cpu = num_blocks - num_blocks_propagated - block_idx_to_cuda = blocks_to_swap - num_blocks_propagated - block_idx_to_wait = block_idx - 1 - - # create swap hook - def create_swap_grad_hook( - bidx_to_cpu, bidx_to_cuda, bidx_to_wait, bidx: int, swpng: bool, wtng: bool - ): - def __grad_hook(tensor: torch.Tensor): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None - - if swpng: - submit_move_blocks( - futures, - thread_pool, - bidx_to_cpu, - bidx_to_cuda, - mmdit.joint_blocks, - accelerator.device, - ) - if wtng: - wait_blocks_move(bidx_to_wait, futures) - - return __grad_hook - - grad_hook = create_swap_grad_hook( - block_idx_to_cpu, block_idx_to_cuda, block_idx_to_wait, block_idx, swapping, waiting - ) - - if grad_hook is None: - - def __grad_hook(tensor: torch.Tensor, 
param_group=param_group): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None - grad_hook = __grad_hook + def grad_hook(tensor: torch.Tensor, param_group=param_group): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None parameter.register_post_accumulate_grad_hook(grad_hook) @@ -731,59 +629,22 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): num_parameters_per_group = [0] * len(optimizers) parameter_optimizer_map = {} - blocks_to_swap = args.blocks_to_swap - num_blocks = len(accelerator.unwrap_model(mmdit).joint_blocks) - - n = 1 # only asynchronous purpose, no need to increase this number - # n = max(1, os.cpu_count() // 2) - thread_pool = ThreadPoolExecutor(max_workers=n) - futures = {} - for opt_idx, optimizer in enumerate(optimizers): for param_group in optimizer.param_groups: for parameter in param_group["params"]: if parameter.requires_grad: - block_type, block_idx = block_types_and_indices[opt_idx] - - def create_optimizer_hook(btype, bidx): - def optimizer_hook(parameter: torch.Tensor): - # print(f"optimizer_hook: {btype}, {bidx}") - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(parameter, args.max_grad_norm) - - i = parameter_optimizer_map[parameter] - optimizer_hooked_count[i] += 1 - if optimizer_hooked_count[i] == num_parameters_per_group[i]: - optimizers[i].step() - optimizers[i].zero_grad(set_to_none=True) - - # swap blocks if necessary - if blocks_to_swap and btype == "joint": - num_blocks_propagated = num_blocks - bidx - - swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap - waiting = bidx > 0 and bidx <= blocks_to_swap - - if swapping: - block_idx_to_cpu = num_blocks - num_blocks_propagated - block_idx_to_cuda = blocks_to_swap - num_blocks_propagated - # print(f"Backward: Swap blocks {block_idx_to_cpu} and {block_idx_to_cuda}") - submit_move_blocks( - futures, - thread_pool, - block_idx_to_cpu, - block_idx_to_cuda, - mmdit.joint_blocks, - accelerator.device, - ) - - if waiting: - block_idx_to_wait = bidx - 1 - wait_blocks_move(block_idx_to_wait, futures) - - return optimizer_hook - - parameter.register_post_accumulate_grad_hook(create_optimizer_hook(block_type, block_idx)) + + def grad_hook(parameter: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(parameter, args.max_grad_norm) + + i = parameter_optimizer_map[parameter] + optimizer_hooked_count[i] += 1 + if optimizer_hooked_count[i] == num_parameters_per_group[i]: + optimizers[i].step() + optimizers[i].zero_grad(set_to_none=True) + + parameter.register_post_accumulate_grad_hook(grad_hook) parameter_optimizer_map[parameter] = opt_idx num_parameters_per_group[opt_idx] += 1 @@ -1130,6 +991,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) add_custom_train_arguments(parser) + train_util.add_dit_training_arguments(parser) sd3_train_utils.add_sd3_training_arguments(parser) parser.add_argument( @@ -1190,16 +1052,6 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="[Deprecated] use 'skip_cache_check' instead / 代わりに 'skip_cache_check' を使用してください", ) - parser.add_argument( - "--blocks_to_swap", - type=int, - 
default=None, - help="[EXPERIMENTAL] " - "Sets the number of blocks (~640MB) to swap during the forward and backward passes." - "Increasing this number lowers the overall VRAM used during training at the expense of training speed (s/it)." - " / 順伝播および逆伝播中にスワップするブロック(約640MB)の数を設定します。" - "この数を増やすと、トレーニング中のVRAM使用量が減りますが、トレーニング速度(s/it)も低下します。", - ) parser.add_argument( "--num_last_block_to_freeze", type=int, diff --git a/sd3_train_network.py b/sd3_train_network.py index bb02c7ac7..1726e325f 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -51,6 +51,10 @@ def assert_extra_args(self, args, train_dataset_group: train_util.DatasetGroup): if args.max_token_length is not None: logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません") + assert ( + args.blocks_to_swap is None or args.blocks_to_swap == 0 + ) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" + train_dataset_group.verify_bucket_reso_steps(32) # TODO check this # enumerate resolutions from dataset for positional embeddings @@ -83,6 +87,17 @@ def load_target_model(self, args, weight_dtype, accelerator): raise ValueError(f"Unsupported fp8 model dtype: {mmdit.dtype}") elif mmdit.dtype == torch.float8_e4m3fn: logger.info("Loaded fp8 SD3 model") + else: + logger.info( + "Cast SD3 model to fp8. This may take a while. You can reduce the time by using fp8 checkpoint." + " / SD3モデルをfp8に変換しています。これには時間がかかる場合があります。fp8チェックポイントを使用することで時間を短縮できます。" + ) + mmdit.to(torch.float8_e4m3fn) + self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 + if self.is_swapping_blocks: + # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. 
+ logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") + mmdit.enable_block_swap(args.blocks_to_swap, accelerator.device) clip_l = sd3_utils.load_clip_l( args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors, state_dict=state_dict @@ -432,9 +447,24 @@ def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, text_encoder_outputs_list = text_encodoing_strategy.drop_cached_text_encoder_outputs(*text_encoder_outputs_list) batch["text_encoder_outputs_list"] = text_encoder_outputs_list + def prepare_unet_with_accelerator( + self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module + ) -> torch.nn.Module: + if not self.is_swapping_blocks: + return super().prepare_unet_with_accelerator(args, accelerator, unet) + + # if we doesn't swap blocks, we can move the model to device + mmdit: sd3_models.MMDiT = unet + mmdit = accelerator.prepare(mmdit, device_placement=[not self.is_swapping_blocks]) + accelerator.unwrap_model(mmdit).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage + accelerator.unwrap_model(mmdit).prepare_block_swap_before_forward() + + return mmdit + def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() + train_util.add_dit_training_arguments(parser) sd3_train_utils.add_sd3_training_arguments(parser) return parser diff --git a/tools/cache_latents.py b/tools/cache_latents.py index e2faa58a7..c034f949a 100644 --- a/tools/cache_latents.py +++ b/tools/cache_latents.py @@ -164,6 +164,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_dataset_arguments(parser, True, True, True) train_util.add_masked_loss_arguments(parser) config_util.add_config_arguments(parser) + train_util.add_dit_training_arguments(parser) flux_train_utils.add_flux_train_arguments(parser) parser.add_argument("--sdxl", action="store_true", help="Use SDXL model / SDXLモデルを使用する") diff --git a/tools/cache_text_encoder_outputs.py b/tools/cache_text_encoder_outputs.py index 7be9ad781..5888b8e3d 100644 --- a/tools/cache_text_encoder_outputs.py +++ b/tools/cache_text_encoder_outputs.py @@ -191,6 +191,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_dataset_arguments(parser, True, True, True) train_util.add_masked_loss_arguments(parser) config_util.add_config_arguments(parser) + train_util.add_dit_training_arguments(parser) flux_train_utils.add_flux_train_arguments(parser) parser.add_argument("--sdxl", action="store_true", help="Use SDXL model / SDXLモデルを使用する") diff --git a/train_network.py b/train_network.py index d70f14ad3..bbf381f99 100644 --- a/train_network.py +++ b/train_network.py @@ -601,8 +601,10 @@ def train(self, args): # unet.to(accelerator.device) # this makes faster `to(dtype)` below, but consumes 23 GB VRAM # unet.to(dtype=unet_weight_dtype) # without moving to gpu, this takes a lot of time and main memory - logger.info(f"set U-Net weight dtype to {unet_weight_dtype}, device to {accelerator.device}") - unet.to(accelerator.device, dtype=unet_weight_dtype) # this seems to be safer than above + # logger.info(f"set U-Net weight dtype to {unet_weight_dtype}, device to {accelerator.device}") + # unet.to(accelerator.device, dtype=unet_weight_dtype) # this seems to be safer than above + logger.info(f"set U-Net weight dtype to {unet_weight_dtype}") + unet.to(dtype=unet_weight_dtype) # do not move to device because unet is not prepared by accelerator unet.requires_grad_(False) unet.to(dtype=unet_weight_dtype) From 
2bb0f547d72cd0256cafebd46d0f61fbe54012ac Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 14 Nov 2024 19:33:12 +0900 Subject: [PATCH 234/582] update grad hook creation to fix TE lr in sd3 fine tuning --- flux_train.py | 19 ++++++++++++------- library/train_util.py | 1 + sd3_train.py | 15 +++++++++------ 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/flux_train.py b/flux_train.py index ad2c7722b..a89e2f139 100644 --- a/flux_train.py +++ b/flux_train.py @@ -80,7 +80,9 @@ def train(args): assert ( args.blocks_to_swap is None or args.blocks_to_swap == 0 - ) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" + ) or not args.cpu_offload_checkpointing, ( + "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" + ) cache_latents = args.cache_latents use_dreambooth_method = args.in_json is None @@ -480,13 +482,16 @@ def train(args): for parameter, param_name in zip(param_group["params"], param_name_group): if parameter.requires_grad: - def grad_hook(tensor: torch.Tensor, param_group=param_group): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None + def create_grad_hook(p_name, p_group): + def grad_hook(tensor: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, p_group) + tensor.grad = None + + return grad_hook - parameter.register_post_accumulate_grad_hook(grad_hook) + parameter.register_post_accumulate_grad_hook(create_grad_hook(param_name, param_group)) elif args.blockwise_fused_optimizers: # prepare for additional optimizers and lr schedulers diff --git a/library/train_util.py b/library/train_util.py index e1dfeecdb..25cf7640d 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5913,6 +5913,7 @@ def append_lr_to_logs(logs, lr_scheduler, optimizer_type, including_unet=True): names.append("unet") names.append("text_encoder1") names.append("text_encoder2") + names.append("text_encoder3") # SD3 append_lr_to_logs_with_names(logs, lr_scheduler, optimizer_type, names) diff --git a/sd3_train.py b/sd3_train.py index a4fc2eec8..96ec951b9 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -606,13 +606,16 @@ def train(args): for parameter, param_name in zip(param_group["params"], param_name_group): if parameter.requires_grad: - def grad_hook(tensor: torch.Tensor, param_group=param_group): - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - accelerator.clip_grad_norm_(tensor, args.max_grad_norm) - optimizer.step_param(tensor, param_group) - tensor.grad = None + def create_grad_hook(p_name, p_group): + def grad_hook(tensor: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, p_group) + tensor.grad = None + + return grad_hook - parameter.register_post_accumulate_grad_hook(grad_hook) + parameter.register_post_accumulate_grad_hook(create_grad_hook(param_name, param_group)) elif args.blockwise_fused_optimizers: # prepare for additional optimizers and lr schedulers From 5c5b544b91ac434c12a372cbf1dc123a367ec878 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 14 Nov 2024 19:35:43 +0900 Subject: [PATCH 235/582] refactor: remove unused 
prepare_split_model method from FluxNetworkTrainer --- flux_train_network.py | 39 --------------------------------------- 1 file changed, 39 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index 9bcd59282..704c4d32e 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -127,45 +127,6 @@ def load_target_model(self, args, weight_dtype, accelerator): return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model - """ - def prepare_split_model(self, model, weight_dtype, accelerator): - from accelerate import init_empty_weights - - logger.info("prepare split model") - with init_empty_weights(): - flux_upper = flux_models.FluxUpper(model.params) - flux_lower = flux_models.FluxLower(model.params) - sd = model.state_dict() - - # lower (trainable) - logger.info("load state dict for lower") - flux_lower.load_state_dict(sd, strict=False, assign=True) - flux_lower.to(dtype=weight_dtype) - - # upper (frozen) - logger.info("load state dict for upper") - flux_upper.load_state_dict(sd, strict=False, assign=True) - - logger.info("prepare upper model") - target_dtype = torch.float8_e4m3fn if args.fp8_base else weight_dtype - flux_upper.to(accelerator.device, dtype=target_dtype) - flux_upper.eval() - - if args.fp8_base: - # this is required to run on fp8 - flux_upper = accelerator.prepare(flux_upper) - - flux_upper.to("cpu") - - self.flux_upper = flux_upper - del model # we don't need model anymore - clean_memory_on_device(accelerator.device) - - logger.info("split model prepared") - - return flux_lower - """ - def get_tokenize_strategy(self, args): _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) From fd2d879ac883b8bdf1e03b6ca545c33200dbdff2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 14 Nov 2024 19:43:08 +0900 Subject: [PATCH 236/582] docs: update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1e63b5830..81a3199bc 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The command to install PyTorch is as follows: ### Recent Updates -Nov 12, 2024: +Nov 14, 2024: - Improved the implementation of block swap and made it available for both FLUX.1 and SD3 LoRA training. See [FLUX.1 LoRA training](#flux1-lora-training) etc. for how to use the new options. Training is possible with about 8-10GB of VRAM. - During fine-tuning, the memory usage when specifying the same number of blocks has increased slightly, but the training speed when specifying block swap has been significantly improved. 
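For reference, the per-block pattern that the block-swap patches above converge on, in both MMDiT.forward (library/sd3_models.py) and Flux.forward (library/flux_models.py), is wait, compute, submit. A minimal sketch, assuming the ModelOffloader from library/custom_offloading_utils.py handles the asynchronous CPU/GPU copies (its internals are not part of these hunks):

    def forward_blocks_with_swap(blocks, offloader, context, x, c):
        # The caller runs prepare_block_swap_before_forward() once beforehand, which calls
        # offloader.prepare_block_devices_before_forward(blocks) so block devices are laid
        # out before the loop starts.
        for block_idx, block in enumerate(blocks):
            offloader.wait_for_block(block_idx)              # wait until this block's weights are on the GPU
            context, x = block(context, x, c)                # run the block as usual
            offloader.submit_move_blocks(blocks, block_idx)  # asynchronously swap it out / prefetch a later block
        return context, x

Flux.forward applies the same pattern twice (double_blocks with offloader_double, then single_blocks with offloader_single), and enable_block_swap(num_blocks, device) splits the --blocks_to_swap count between the two groups.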
From ccfaa001e74f80798e528b4b3ea6ef811017c07b Mon Sep 17 00:00:00 2001 From: minux302 Date: Fri, 15 Nov 2024 20:21:28 +0900 Subject: [PATCH 237/582] add flux controlnet base module --- flux_train_control_net.py | 573 ++++++++++++++++++++++++++++++++++++++ flux_train_network.py | 5 +- library/flux_models.py | 257 ++++++++++++++++- library/flux_utils.py | 8 + 4 files changed, 841 insertions(+), 2 deletions(-) create mode 100644 flux_train_control_net.py diff --git a/flux_train_control_net.py b/flux_train_control_net.py new file mode 100644 index 000000000..704c4d32e --- /dev/null +++ b/flux_train_control_net.py @@ -0,0 +1,573 @@ +import argparse +import copy +import math +import random +from typing import Any, Optional + +import torch +from accelerate import Accelerator +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() + +from library import flux_models, flux_train_utils, flux_utils, sd3_train_utils, strategy_base, strategy_flux, train_util +import train_network +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +class FluxNetworkTrainer(train_network.NetworkTrainer): + def __init__(self): + super().__init__() + self.sample_prompts_te_outputs = None + self.is_schnell: Optional[bool] = None + self.is_swapping_blocks: bool = False + + def assert_extra_args(self, args, train_dataset_group): + super().assert_extra_args(args, train_dataset_group) + # sdxl_train_util.verify_sdxl_training_args(args) + + if args.fp8_base_unet: + args.fp8_base = True # if fp8_base_unet is enabled, fp8_base is also enabled for FLUX.1 + + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: + logger.warning( + "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" + ) + args.cache_text_encoder_outputs = True + + if args.cache_text_encoder_outputs: + assert ( + train_dataset_group.is_text_encoder_output_cacheable() + ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" + + # prepare CLIP-L/T5XXL training flags + self.train_clip_l = not args.network_train_unet_only + self.train_t5xxl = False # default is False even if args.network_train_unet_only is False + + if args.max_token_length is not None: + logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません") + + assert ( + args.blocks_to_swap is None or args.blocks_to_swap == 0 + ) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" + + # deprecated split_mode option + if args.split_mode: + if args.blocks_to_swap is not None: + logger.warning( + "split_mode is deprecated. Because `--blocks_to_swap` is set, `--split_mode` is ignored." + " / split_modeは非推奨です。`--blocks_to_swap`が設定されているため、`--split_mode`は無視されます。" + ) + else: + logger.warning( + "split_mode is deprecated. Please use `--blocks_to_swap` instead. `--blocks_to_swap 18` is automatically set." 
+ " / split_modeは非推奨です。代わりに`--blocks_to_swap`を使用してください。`--blocks_to_swap 18`が自動的に設定されました。" + ) + args.blocks_to_swap = 18 # 18 is safe for most cases + + train_dataset_group.verify_bucket_reso_steps(32) # TODO check this + + def load_target_model(self, args, weight_dtype, accelerator): + # currently offload to cpu for some models + + # if the file is fp8 and we are using fp8_base, we can load it as is (fp8) + loading_dtype = None if args.fp8_base else weight_dtype + + # if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future + self.is_schnell, model = flux_utils.load_flow_model( + args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors + ) + if args.fp8_base: + # check dtype of model + if model.dtype == torch.float8_e4m3fnuz or model.dtype == torch.float8_e5m2 or model.dtype == torch.float8_e5m2fnuz: + raise ValueError(f"Unsupported fp8 model dtype: {model.dtype}") + elif model.dtype == torch.float8_e4m3fn: + logger.info("Loaded fp8 FLUX model") + else: + logger.info( + "Cast FLUX model to fp8. This may take a while. You can reduce the time by using fp8 checkpoint." + " / FLUXモデルをfp8に変換しています。これには時間がかかる場合があります。fp8チェックポイントを使用することで時間を短縮できます。" + ) + model.to(torch.float8_e4m3fn) + + # if args.split_mode: + # model = self.prepare_split_model(model, weight_dtype, accelerator) + + self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 + if self.is_swapping_blocks: + # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. + logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") + model.enable_block_swap(args.blocks_to_swap, accelerator.device) + + clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + clip_l.eval() + + # if the file is fp8 and we are using fp8_base (not unet), we can load it as is (fp8) + if args.fp8_base and not args.fp8_base_unet: + loading_dtype = None # as is + else: + loading_dtype = weight_dtype + + # loading t5xxl to cpu takes a long time, so we should load to gpu in future + t5xxl = flux_utils.load_t5xxl(args.t5xxl, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + t5xxl.eval() + if args.fp8_base and not args.fp8_base_unet: + # check dtype of model + if t5xxl.dtype == torch.float8_e4m3fnuz or t5xxl.dtype == torch.float8_e5m2 or t5xxl.dtype == torch.float8_e5m2fnuz: + raise ValueError(f"Unsupported fp8 model dtype: {t5xxl.dtype}") + elif t5xxl.dtype == torch.float8_e4m3fn: + logger.info("Loaded fp8 T5XXL model") + + ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + + return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model + + def get_tokenize_strategy(self, args): + _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) + + if args.t5xxl_max_token_length is None: + if is_schnell: + t5xxl_max_token_length = 256 + else: + t5xxl_max_token_length = 512 + else: + t5xxl_max_token_length = args.t5xxl_max_token_length + + logger.info(f"t5xxl_max_token_length: {t5xxl_max_token_length}") + return strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length, args.tokenizer_cache_dir) + + def get_tokenizers(self, tokenize_strategy: strategy_flux.FluxTokenizeStrategy): + return [tokenize_strategy.clip_l, tokenize_strategy.t5xxl] + + def get_latents_caching_strategy(self, args): + latents_caching_strategy = 
strategy_flux.FluxLatentsCachingStrategy(args.cache_latents_to_disk, args.vae_batch_size, False) + return latents_caching_strategy + + def get_text_encoding_strategy(self, args): + return strategy_flux.FluxTextEncodingStrategy(apply_t5_attn_mask=args.apply_t5_attn_mask) + + def post_process_network(self, args, accelerator, network, text_encoders, unet): + # check t5xxl is trained or not + self.train_t5xxl = network.train_t5xxl + + if self.train_t5xxl and args.cache_text_encoder_outputs: + raise ValueError( + "T5XXL is trained, so cache_text_encoder_outputs cannot be used / T5XXL学習時はcache_text_encoder_outputsは使用できません" + ) + + def get_models_for_text_encoding(self, args, accelerator, text_encoders): + if args.cache_text_encoder_outputs: + if self.train_clip_l and not self.train_t5xxl: + return text_encoders[0:1] # only CLIP-L is needed for encoding because T5XXL is cached + else: + return None # no text encoders are needed for encoding because both are cached + else: + return text_encoders # both CLIP-L and T5XXL are needed for encoding + + def get_text_encoders_train_flags(self, args, text_encoders): + return [self.train_clip_l, self.train_t5xxl] + + def get_text_encoder_outputs_caching_strategy(self, args): + if args.cache_text_encoder_outputs: + # if the text encoders is trained, we need tokenization, so is_partial is True + return strategy_flux.FluxTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, + args.text_encoder_batch_size, + args.skip_cache_check, + is_partial=self.train_clip_l or self.train_t5xxl, + apply_t5_attn_mask=args.apply_t5_attn_mask, + ) + else: + return None + + def cache_text_encoder_outputs_if_needed( + self, args, accelerator: Accelerator, unet, vae, text_encoders, dataset: train_util.DatasetGroup, weight_dtype + ): + if args.cache_text_encoder_outputs: + if not args.lowram: + # メモリ消費を減らす + logger.info("move vae and unet to cpu to save memory") + org_vae_device = vae.device + org_unet_device = unet.device + vae.to("cpu") + unet.to("cpu") + clean_memory_on_device(accelerator.device) + + # When TE is not be trained, it will not be prepared so we need to use explicit autocast + logger.info("move text encoders to gpu") + text_encoders[0].to(accelerator.device, dtype=weight_dtype) # always not fp8 + text_encoders[1].to(accelerator.device) + + if text_encoders[1].dtype == torch.float8_e4m3fn: + # if we load fp8 weights, the model is already fp8, so we use it as is + self.prepare_text_encoder_fp8(1, text_encoders[1], text_encoders[1].dtype, weight_dtype) + else: + # otherwise, we need to convert it to target dtype + text_encoders[1].to(weight_dtype) + + with accelerator.autocast(): + dataset.new_cache_text_encoder_outputs(text_encoders, accelerator) + + # cache sample prompts + if args.sample_prompts is not None: + logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") + + tokenize_strategy: strategy_flux.FluxTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() + text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + + prompts = train_util.load_prompts(args.sample_prompts) + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + with accelerator.autocast(), torch.no_grad(): + for prompt_dict in prompts: + for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: + if p not in sample_prompts_te_outputs: + logger.info(f"cache Text Encoder outputs for prompt: {p}") + tokens_and_masks = 
tokenize_strategy.tokenize(p) + sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( + tokenize_strategy, text_encoders, tokens_and_masks, args.apply_t5_attn_mask + ) + self.sample_prompts_te_outputs = sample_prompts_te_outputs + + accelerator.wait_for_everyone() + + # move back to cpu + if not self.is_train_text_encoder(args): + logger.info("move CLIP-L back to cpu") + text_encoders[0].to("cpu") + logger.info("move t5XXL back to cpu") + text_encoders[1].to("cpu") + clean_memory_on_device(accelerator.device) + + if not args.lowram: + logger.info("move vae and unet back to original device") + vae.to(org_vae_device) + unet.to(org_unet_device) + else: + # Text Encoderから毎回出力を取得するので、GPUに乗せておく + text_encoders[0].to(accelerator.device, dtype=weight_dtype) + text_encoders[1].to(accelerator.device) + + # def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): + # noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype + + # # get size embeddings + # orig_size = batch["original_sizes_hw"] + # crop_size = batch["crop_top_lefts"] + # target_size = batch["target_sizes_hw"] + # embs = sdxl_train_util.get_size_embeddings(orig_size, crop_size, target_size, accelerator.device).to(weight_dtype) + + # # concat embeddings + # encoder_hidden_states1, encoder_hidden_states2, pool2 = text_conds + # vector_embedding = torch.cat([pool2, embs], dim=1).to(weight_dtype) + # text_embedding = torch.cat([encoder_hidden_states1, encoder_hidden_states2], dim=2).to(weight_dtype) + + # noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) + # return noise_pred + + def sample_images(self, accelerator, args, epoch, global_step, device, ae, tokenizer, text_encoder, flux): + text_encoders = text_encoder # for compatibility + text_encoders = self.get_models_for_text_encoding(args, accelerator, text_encoders) + + flux_train_utils.sample_images( + accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs + ) + # return + + """ + class FluxUpperLowerWrapper(torch.nn.Module): + def __init__(self, flux_upper: flux_models.FluxUpper, flux_lower: flux_models.FluxLower, device: torch.device): + super().__init__() + self.flux_upper = flux_upper + self.flux_lower = flux_lower + self.target_device = device + + def prepare_block_swap_before_forward(self): + pass + + def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None, txt_attention_mask=None): + self.flux_lower.to("cpu") + clean_memory_on_device(self.target_device) + self.flux_upper.to(self.target_device) + img, txt, vec, pe = self.flux_upper(img, img_ids, txt, txt_ids, timesteps, y, guidance, txt_attention_mask) + self.flux_upper.to("cpu") + clean_memory_on_device(self.target_device) + self.flux_lower.to(self.target_device) + return self.flux_lower(img, txt, vec, pe, txt_attention_mask) + + wrapper = FluxUpperLowerWrapper(self.flux_upper, flux, accelerator.device) + clean_memory_on_device(accelerator.device) + flux_train_utils.sample_images( + accelerator, args, epoch, global_step, wrapper, ae, text_encoders, self.sample_prompts_te_outputs + ) + clean_memory_on_device(accelerator.device) + """ + + def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: + noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) + self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) + return noise_scheduler + + def 
encode_images_to_latents(self, args, accelerator, vae, images): + return vae.encode(images) + + def shift_scale_latents(self, args, latents): + return latents + + def get_noise_pred_and_target( + self, + args, + accelerator, + noise_scheduler, + latents, + batch, + text_encoder_conds, + unet: flux_models.Flux, + network, + weight_dtype, + train_unet, + ): + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, accelerator.device, weight_dtype + ) + + # pack latents and get img_ids + packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4 + packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2 + img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device) + + # get guidance + # ensure guidance_scale in args is float + guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) + + # ensure the hidden state will require grad + if args.gradient_checkpointing: + noisy_model_input.requires_grad_(True) + for t in text_encoder_conds: + if t is not None and t.dtype.is_floating_point: + t.requires_grad_(True) + img_ids.requires_grad_(True) + guidance_vec.requires_grad_(True) + + # Predict the noise residual + l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds + if not args.apply_t5_attn_mask: + t5_attn_mask = None + + def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask): + # if not args.split_mode: + # normal forward + with accelerator.autocast(): + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) + model_pred = unet( + img=img, + img_ids=img_ids, + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + timesteps=timesteps / 1000, + guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, + ) + """ + else: + # split forward to reduce memory usage + assert network.train_blocks == "single", "train_blocks must be single for split mode" + with accelerator.autocast(): + # move flux lower to cpu, and then move flux upper to gpu + unet.to("cpu") + clean_memory_on_device(accelerator.device) + self.flux_upper.to(accelerator.device) + + # upper model does not require grad + with torch.no_grad(): + intermediate_img, intermediate_txt, vec, pe = self.flux_upper( + img=packed_noisy_model_input, + img_ids=img_ids, + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + timesteps=timesteps / 1000, + guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, + ) + + # move flux upper back to cpu, and then move flux lower to gpu + self.flux_upper.to("cpu") + clean_memory_on_device(accelerator.device) + unet.to(accelerator.device) + + # lower model requires grad + intermediate_img.requires_grad_(True) + intermediate_txt.requires_grad_(True) + vec.requires_grad_(True) + pe.requires_grad_(True) + model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask) + """ + + return model_pred + + model_pred = call_dit( + img=packed_noisy_model_input, + img_ids=img_ids, + t5_out=t5_out, + txt_ids=txt_ids, + l_pooled=l_pooled, + timesteps=timesteps, + guidance_vec=guidance_vec, + t5_attn_mask=t5_attn_mask, + ) + + 
# unpack latents + model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) + + # apply model prediction type + model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) + + # flow matching loss: this is different from SD3 + target = noise - latents + + # differential output preservation + if "custom_attributes" in batch: + diff_output_pr_indices = [] + for i, custom_attributes in enumerate(batch["custom_attributes"]): + if "diff_output_preservation" in custom_attributes and custom_attributes["diff_output_preservation"]: + diff_output_pr_indices.append(i) + + if len(diff_output_pr_indices) > 0: + network.set_multiplier(0.0) + with torch.no_grad(): + model_pred_prior = call_dit( + img=packed_noisy_model_input[diff_output_pr_indices], + img_ids=img_ids[diff_output_pr_indices], + t5_out=t5_out[diff_output_pr_indices], + txt_ids=txt_ids[diff_output_pr_indices], + l_pooled=l_pooled[diff_output_pr_indices], + timesteps=timesteps[diff_output_pr_indices], + guidance_vec=guidance_vec[diff_output_pr_indices] if guidance_vec is not None else None, + t5_attn_mask=t5_attn_mask[diff_output_pr_indices] if t5_attn_mask is not None else None, + ) + network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step + + model_pred_prior = flux_utils.unpack_latents(model_pred_prior, packed_latent_height, packed_latent_width) + model_pred_prior, _ = flux_train_utils.apply_model_prediction_type( + args, + model_pred_prior, + noisy_model_input[diff_output_pr_indices], + sigmas[diff_output_pr_indices] if sigmas is not None else None, + ) + target[diff_output_pr_indices] = model_pred_prior.to(target.dtype) + + return model_pred, target, timesteps, None, weighting + + def post_process_loss(self, loss, args, timesteps, noise_scheduler): + return loss + + def get_sai_model_spec(self, args): + return train_util.get_sai_model_spec(None, args, False, True, False, flux="dev") + + def update_metadata(self, metadata, args): + metadata["ss_apply_t5_attn_mask"] = args.apply_t5_attn_mask + metadata["ss_weighting_scheme"] = args.weighting_scheme + metadata["ss_logit_mean"] = args.logit_mean + metadata["ss_logit_std"] = args.logit_std + metadata["ss_mode_scale"] = args.mode_scale + metadata["ss_guidance_scale"] = args.guidance_scale + metadata["ss_timestep_sampling"] = args.timestep_sampling + metadata["ss_sigmoid_scale"] = args.sigmoid_scale + metadata["ss_model_prediction_type"] = args.model_prediction_type + metadata["ss_discrete_flow_shift"] = args.discrete_flow_shift + + def is_text_encoder_not_needed_for_training(self, args): + return args.cache_text_encoder_outputs and not self.is_train_text_encoder(args) + + def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): + if index == 0: # CLIP-L + return super().prepare_text_encoder_grad_ckpt_workaround(index, text_encoder) + else: # T5XXL + text_encoder.encoder.embed_tokens.requires_grad_(True) + + def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): + if index == 0: # CLIP-L + logger.info(f"prepare CLIP-L for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}") + text_encoder.to(te_weight_dtype) # fp8 + text_encoder.text_model.embeddings.to(dtype=weight_dtype) + else: # T5XXL + + def prepare_fp8(text_encoder, target_dtype): + def forward_hook(module): + def forward(hidden_states): + hidden_gelu = module.act(module.wi_0(hidden_states)) + hidden_linear = module.wi_1(hidden_states) + hidden_states 
= hidden_gelu * hidden_linear + hidden_states = module.dropout(hidden_states) + + hidden_states = module.wo(hidden_states) + return hidden_states + + return forward + + for module in text_encoder.modules(): + if module.__class__.__name__ in ["T5LayerNorm", "Embedding"]: + # print("set", module.__class__.__name__, "to", target_dtype) + module.to(target_dtype) + if module.__class__.__name__ in ["T5DenseGatedActDense"]: + # print("set", module.__class__.__name__, "hooks") + module.forward = forward_hook(module) + + if flux_utils.get_t5xxl_actual_dtype(text_encoder) == torch.float8_e4m3fn and text_encoder.dtype == weight_dtype: + logger.info(f"T5XXL already prepared for fp8") + else: + logger.info(f"prepare T5XXL for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}, add hooks") + text_encoder.to(te_weight_dtype) # fp8 + prepare_fp8(text_encoder, weight_dtype) + + def prepare_unet_with_accelerator( + self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module + ) -> torch.nn.Module: + if not self.is_swapping_blocks: + return super().prepare_unet_with_accelerator(args, accelerator, unet) + + # if we doesn't swap blocks, we can move the model to device + flux: flux_models.Flux = unet + flux = accelerator.prepare(flux, device_placement=[not self.is_swapping_blocks]) + accelerator.unwrap_model(flux).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage + accelerator.unwrap_model(flux).prepare_block_swap_before_forward() + + return flux + + +def setup_parser() -> argparse.ArgumentParser: + parser = train_network.setup_parser() + train_util.add_dit_training_arguments(parser) + flux_train_utils.add_flux_train_arguments(parser) + + parser.add_argument( + "--split_mode", + action="store_true", + # help="[EXPERIMENTAL] use split mode for Flux model, network arg `train_blocks=single` is required" + # + "/[実験的] Fluxモデルの分割モードを使用する。ネットワーク引数`train_blocks=single`が必要", + help="[Deprecated] This option is deprecated. Please use `--blocks_to_swap` instead." 
+ " / このオプションは非推奨です。代わりに`--blocks_to_swap`を使用してください。", + ) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + trainer = FluxNetworkTrainer() + trainer.train(args) diff --git a/flux_train_network.py b/flux_train_network.py index 704c4d32e..0feb9b011 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -125,7 +125,10 @@ def load_target_model(self, args, weight_dtype, accelerator): ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) - return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model + controlnet = flux_utils.load_controlnet() + controlnet.train() + + return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model, controlnet def get_tokenize_strategy(self, args): _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) diff --git a/library/flux_models.py b/library/flux_models.py index fa3c7ad2b..a3bd19743 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1013,6 +1013,8 @@ def forward( txt_ids: Tensor, timesteps: Tensor, y: Tensor, + block_controlnet_hidden_states=None, + block_controlnet_single_hidden_states=None, guidance: Tensor | None = None, txt_attention_mask: Tensor | None = None, ) -> Tensor: @@ -1031,18 +1033,29 @@ def forward( ids = torch.cat((txt_ids, img_ids), dim=1) pe = self.pe_embedder(ids) + if block_controlnet_hidden_states is not None: + controlnet_depth = len(block_controlnet_hidden_states) + if block_controlnet_single_hidden_states is not None: + controlnet_single_depth = len(block_controlnet_single_hidden_states) if not self.blocks_to_swap: - for block in self.double_blocks: + for block_idx, block in enumerate(self.double_blocks): img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) + if block_controlnet_hidden_states is not None: + img = img + block_controlnet_hidden_states[block_idx % controlnet_depth] + img = torch.cat((txt, img), 1) for block in self.single_blocks: img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) + if block_controlnet_single_hidden_states is not None: + img = img + block_controlnet_single_hidden_states[block_idx % controlnet_single_depth] else: for block_idx, block in enumerate(self.double_blocks): self.offloader_double.wait_for_block(block_idx) img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) + if block_controlnet_hidden_states is not None: + img = img + block_controlnet_hidden_states[block_idx % controlnet_depth] self.offloader_double.submit_move_blocks(self.double_blocks, block_idx) @@ -1052,6 +1065,8 @@ def forward( self.offloader_single.wait_for_block(block_idx) img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) + if block_controlnet_single_hidden_states is not None: + img = img + block_controlnet_single_hidden_states[block_idx % controlnet_single_depth] self.offloader_single.submit_move_blocks(self.single_blocks, block_idx) @@ -1066,6 +1081,246 @@ def forward( return img +def zero_module(module): + for p in module.parameters(): + nn.init.zeros_(p) + return module + + +class ControlNetFlux(nn.Module): + """ + Transformer model for flow matching on sequences. 
+ """ + + def __init__(self, params: FluxParams, controlnet_depth=2): + super().__init__() + + self.params = params + self.in_channels = params.in_channels + self.out_channels = self.in_channels + if params.hidden_size % params.num_heads != 0: + raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}") + pe_dim = params.hidden_size // params.num_heads + if sum(params.axes_dim) != pe_dim: + raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size) + self.guidance_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity() + self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size) + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + qkv_bias=params.qkv_bias, + ) + for _ in range(params.depth) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio) + for _ in range(0) # TMP + ] + ) + + self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False + self.blocks_to_swap = None + + self.offloader_double = None + self.offloader_single = None + self.num_double_blocks = len(self.double_blocks) + self.num_single_blocks = len(self.single_blocks) + + # add ControlNet blocks + self.controlnet_blocks_for_double = nn.ModuleList([]) + for _ in range(controlnet_depth): + controlnet_block = nn.Linear(self.hidden_size, self.hidden_size) + controlnet_block = zero_module(controlnet_block) + self.controlnet_blocks_for_double.append(controlnet_block) + self.controlnet_blocks_for_single = nn.ModuleList([]) + for _ in range(controlnet_depth): + controlnet_block = nn.Linear(self.hidden_size, self.hidden_size) + controlnet_block = zero_module(controlnet_block) + self.controlnet_blocks_for_single.append(controlnet_block) + self.pos_embed_input = nn.Linear(self.in_channels, self.hidden_size, bias=True) + self.gradient_checkpointing = False + self.input_hint_block = nn.Sequential( + nn.Conv2d(3, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1, stride=2), + nn.SiLU(), + zero_module(nn.Conv2d(16, 16, 3, padding=1)) + ) + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + def enable_gradient_checkpointing(self, cpu_offload: bool = False): + self.gradient_checkpointing = True + self.cpu_offload_checkpointing = cpu_offload + + self.time_in.enable_gradient_checkpointing() + self.vector_in.enable_gradient_checkpointing() + if self.guidance_in.__class__ != nn.Identity: + self.guidance_in.enable_gradient_checkpointing() + + for block in self.double_blocks + self.single_blocks: + block.enable_gradient_checkpointing(cpu_offload=cpu_offload) + + print(f"FLUX: Gradient checkpointing enabled. 
CPU offload: {cpu_offload}") + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False + + self.time_in.disable_gradient_checkpointing() + self.vector_in.disable_gradient_checkpointing() + if self.guidance_in.__class__ != nn.Identity: + self.guidance_in.disable_gradient_checkpointing() + + for block in self.double_blocks + self.single_blocks: + block.disable_gradient_checkpointing() + + print("FLUX: Gradient checkpointing disabled.") + + def enable_block_swap(self, num_blocks: int, device: torch.device): + self.blocks_to_swap = num_blocks + double_blocks_to_swap = num_blocks // 2 + single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + + assert double_blocks_to_swap <= self.num_double_blocks - 2 and single_blocks_to_swap <= self.num_single_blocks - 2, ( + f"Cannot swap more than {self.num_double_blocks - 2} double blocks and {self.num_single_blocks - 2} single blocks. " + f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks." + ) + + self.offloader_double = custom_offloading_utils.ModelOffloader( + self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device # , debug=True + ) + self.offloader_single = custom_offloading_utils.ModelOffloader( + self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device # , debug=True + ) + print( + f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." + ) + + def move_to_device_except_swap_blocks(self, device: torch.device): + # assume model is on cpu. do not move blocks to device to reduce temporary memory usage + if self.blocks_to_swap: + save_double_blocks = self.double_blocks + save_single_blocks = self.single_blocks + self.double_blocks = None + self.single_blocks = None + + self.to(device) + + if self.blocks_to_swap: + self.double_blocks = save_double_blocks + self.single_blocks = save_single_blocks + + def prepare_block_swap_before_forward(self): + if self.blocks_to_swap is None or self.blocks_to_swap == 0: + return + self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) + self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) + + def forward( + self, + img: Tensor, + img_ids: Tensor, + controlnet_cond: Tensor, + txt: Tensor, + txt_ids: Tensor, + timesteps: Tensor, + y: Tensor, + guidance: Tensor | None = None, + txt_attention_mask: Tensor | None = None, + ) -> tuple[tuple[Tensor]]: + if img.ndim != 3 or txt.ndim != 3: + raise ValueError("Input img and txt tensors must have 3 dimensions.") + + # running on sequences img + img = self.img_in(img) + controlnet_cond = self.input_hint_block(controlnet_cond) + controlnet_cond = rearrange(controlnet_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) + controlnet_cond = self.pos_embed_input(controlnet_cond) + img = img + controlnet_cond + vec = self.time_in(timestep_embedding(timesteps, 256)) + if self.params.guidance_embed: + if guidance is None: + raise ValueError("Didn't get guidance strength for guidance distilled model.") + vec = vec + self.guidance_in(timestep_embedding(guidance, 256)) + vec = vec + self.vector_in(y) + txt = self.txt_in(txt) + + ids = torch.cat((txt_ids, img_ids), dim=1) + pe = self.pe_embedder(ids) + + block_samples = () + block_single_samples = () + if not self.blocks_to_swap: + for block_idx, block in enumerate(self.double_blocks): + img, txt = block(img=img, txt=txt, vec=vec, pe=pe, 
txt_attention_mask=txt_attention_mask) + block_samples = block_samples + (img,) + + img = torch.cat((txt, img), 1) + for block in self.single_blocks: + img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) + block_single_samples = block_single_samples + (img,) + else: + for block_idx, block in enumerate(self.double_blocks): + self.offloader_double.wait_for_block(block_idx) + + img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) + block_samples = block_samples + (img,) + + self.offloader_double.submit_move_blocks(self.double_blocks, block_idx) + + img = torch.cat((txt, img), 1) + + for block_idx, block in enumerate(self.single_blocks): + self.offloader_single.wait_for_block(block_idx) + + img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) + block_single_samples = block_single_samples + (img,) + + self.offloader_single.submit_move_blocks(self.single_blocks, block_idx) + + controlnet_block_samples = () + controlnet_single_block_samples = () + for block_sample, controlnet_block in zip(block_samples, self.controlnet_blocks_for_double): + block_sample = controlnet_block(block_sample) + controlnet_block_samples = controlnet_block_samples + (block_sample,) + for block_sample, controlnet_block in zip(block_samples, self.controlnet_single_blocks_for_single): + block_sample = controlnet_block(block_sample) + controlnet_single_block_samples = controlnet_single_block_samples + (block_sample,) + + return controlnet_block_samples, controlnet_single_block_samples + + """ class FluxUpper(nn.Module): "" diff --git a/library/flux_utils.py b/library/flux_utils.py index f3093615d..678efbc8a 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -153,6 +153,14 @@ def load_ae( return ae +def load_controlnet(name, device, transformer=None): + with torch.device(device): + controlnet = flux_models.ControlNetFlux(flux_models.configs[name].params) + if transformer is not None: + controlnet.load_state_dict(transformer.state_dict(), strict=False) + return controlnet + + def load_clip_l( ckpt_path: Optional[str], dtype: torch.dtype, From 42f6edf3a886287b99770bc7a8c0bafd3fa03f39 Mon Sep 17 00:00:00 2001 From: minux302 Date: Fri, 15 Nov 2024 23:48:51 +0900 Subject: [PATCH 238/582] fix for adding controlnet --- flux_train_control_net.py | 1270 +++++++++++++++++++++-------------- flux_train_network.py | 3 - library/flux_train_utils.py | 32 +- library/flux_utils.py | 11 +- 4 files changed, 820 insertions(+), 496 deletions(-) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index 704c4d32e..8a7be75f2 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -1,563 +1,860 @@ +# training with captions + +# Swap blocks between CPU and GPU: +# This implementation is inspired by and based on the work of 2kpr. +# Many thanks to 2kpr for the original concept and implementation of memory-efficient offloading. +# The original idea has been adapted and extended to fit the current project's needs. 
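The header comment above summarizes the memory-saving strategy: transformer blocks are kept in host RAM and streamed onto the GPU only while they are executing. As a rough, forward-only sketch of that idea (simplified single-tensor interface; the repository's `custom_offloading_utils.ModelOffloader` additionally overlaps the copies with compute and covers the backward pass):

```python
# Illustrative only: keep every block on CPU and move each one to the GPU
# just for its own forward pass, trading transfer time for peak VRAM.
import torch
import torch.nn as nn

@torch.no_grad()
def forward_with_block_swap(blocks: nn.ModuleList, x: torch.Tensor, device: torch.device) -> torch.Tensor:
    for block in blocks:
        block.to(device)   # upload this block's weights
        x = block(x)       # run it on the GPU
        block.to("cpu")    # release VRAM before the next block is loaded
    return x
```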
+ +# Key features: +# - CPU offloading during forward and backward passes +# - Use of fused optimizer and grad_hook for efficient gradient processing +# - Per-block fused optimizer instances + import argparse +from concurrent.futures import ThreadPoolExecutor import copy import math -import random -from typing import Any, Optional +import os +from multiprocessing import Value +import time +from typing import List, Optional, Tuple, Union +import toml + +from tqdm import tqdm import torch -from accelerate import Accelerator +import torch.nn as nn +from library import utils from library.device_utils import init_ipex, clean_memory_on_device init_ipex() -from library import flux_models, flux_train_utils, flux_utils, sd3_train_utils, strategy_base, strategy_flux, train_util -import train_network -from library.utils import setup_logging +from accelerate.utils import set_seed +from library import deepspeed_utils, flux_train_utils, flux_utils, strategy_base, strategy_flux +from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler + +import library.train_util as train_util + +from library.utils import setup_logging, add_logging_arguments setup_logging() import logging logger = logging.getLogger(__name__) +import library.config_util as config_util + +# import library.sdxl_train_util as sdxl_train_util +from library.config_util import ( + ConfigSanitizer, + BlueprintGenerator, +) +from library.custom_train_functions import apply_masked_loss, add_custom_train_arguments + + +def train(args): + train_util.verify_training_args(args) + train_util.prepare_dataset_args(args, True) + # sdxl_train_util.verify_sdxl_training_args(args) + deepspeed_utils.prepare_deepspeed_args(args) + setup_logging(args, reset=True) + + # temporary: backward compatibility for deprecated options. 
remove in the future + if not args.skip_cache_check: + args.skip_cache_check = args.skip_latents_validity_check + + # assert ( + # not args.weighted_captions + # ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: + logger.warning( + "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" + ) + args.cache_text_encoder_outputs = True -class FluxNetworkTrainer(train_network.NetworkTrainer): - def __init__(self): - super().__init__() - self.sample_prompts_te_outputs = None - self.is_schnell: Optional[bool] = None - self.is_swapping_blocks: bool = False + if args.cpu_offload_checkpointing and not args.gradient_checkpointing: + logger.warning( + "cpu_offload_checkpointing is enabled, so gradient_checkpointing is also enabled / cpu_offload_checkpointingが有効になっているため、gradient_checkpointingも有効になります" + ) + args.gradient_checkpointing = True - def assert_extra_args(self, args, train_dataset_group): - super().assert_extra_args(args, train_dataset_group) - # sdxl_train_util.verify_sdxl_training_args(args) + assert ( + args.blocks_to_swap is None or args.blocks_to_swap == 0 + ) or not args.cpu_offload_checkpointing, ( + "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" + ) - if args.fp8_base_unet: - args.fp8_base = True # if fp8_base_unet is enabled, fp8_base is also enabled for FLUX.1 + cache_latents = args.cache_latents + use_dreambooth_method = args.in_json is None - if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: - logger.warning( - "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" - ) - args.cache_text_encoder_outputs = True + if args.seed is not None: + set_seed(args.seed) # 乱数系列を初期化する + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
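Latent caching follows the same pattern as the caching strategy configured here: each training image goes through the AE exactly once and the resulting latent is stored, so the VAE stays off the GPU during the actual training loop. A simplified sketch with hypothetical helper names (`load_image_tensor` is assumed to return a CHW tensor in [-1, 1], which is the range the AE encode path expects):

```python
import torch

@torch.no_grad()
def cache_latents(image_paths, load_image_tensor, ae, device):
    for path in image_paths:
        img = load_image_tensor(path).unsqueeze(0).to(device)  # 1xCxHxW in [-1, 1]
        latents = ae.encode(img)                                # FLUX AE returns latents directly
        torch.save(latents.squeeze(0).cpu(), path + ".latent.pt")
```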
+ if args.cache_latents: + latents_caching_strategy = strategy_flux.FluxLatentsCachingStrategy( + args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check + ) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) + + # データセットを準備する + if args.dataset_class is None: + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) + if args.dataset_config is not None: + logger.info(f"Load dataset config from {args.dataset_config}") + user_config = config_util.load_user_config(args.dataset_config) + ignored = ["train_data_dir", "in_json"] + if any(getattr(args, attr) is not None for attr in ignored): + logger.warning( + "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + ", ".join(ignored) + ) + ) + else: + if use_dreambooth_method: + logger.info("Using DreamBooth method.") + user_config = { + "datasets": [ + { + "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs( + args.train_data_dir, args.reg_data_dir + ) + } + ] + } + else: + logger.info("Training with captions.") + user_config = { + "datasets": [ + { + "subsets": [ + { + "image_dir": args.train_data_dir, + "metadata_file": args.in_json, + } + ] + } + ] + } + + blueprint = blueprint_generator.generate(user_config, args) + train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + else: + train_dataset_group = train_util.load_arbitrary_dataset(args) + + current_epoch = Value("i", 0) + current_step = Value("i", 0) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) + + train_dataset_group.verify_bucket_reso_steps(16) # TODO これでいいか確認 + + _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) + if args.debug_dataset: if args.cache_text_encoder_outputs: - assert ( - train_dataset_group.is_text_encoder_output_cacheable() - ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" - - # prepare CLIP-L/T5XXL training flags - self.train_clip_l = not args.network_train_unet_only - self.train_t5xxl = False # default is False even if args.network_train_unet_only is False + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy( + strategy_flux.FluxTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, args.skip_cache_check, False + ) + ) + t5xxl_max_token_length = ( + args.t5xxl_max_token_length if args.t5xxl_max_token_length is not None else (256 if is_schnell else 512) + ) + strategy_base.TokenizeStrategy.set_strategy(strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length)) + + train_dataset_group.set_current_strategies() + train_util.debug_dataset(train_dataset_group, True) + return + if len(train_dataset_group) == 0: + logger.error( + "No data found. Please verify the metadata file and train_data_dir option. 
/ 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。" + ) + return - if args.max_token_length is not None: - logger.warning("max_token_length is not used in Flux training / max_token_lengthはFluxのトレーニングでは使用されません") + if cache_latents: + assert ( + train_dataset_group.is_latent_cacheable() + ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + if args.cache_text_encoder_outputs: assert ( - args.blocks_to_swap is None or args.blocks_to_swap == 0 - ) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" + train_dataset_group.is_text_encoder_output_cacheable() + ), "when caching text encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / text encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" - # deprecated split_mode option - if args.split_mode: - if args.blocks_to_swap is not None: - logger.warning( - "split_mode is deprecated. Because `--blocks_to_swap` is set, `--split_mode` is ignored." - " / split_modeは非推奨です。`--blocks_to_swap`が設定されているため、`--split_mode`は無視されます。" - ) - else: - logger.warning( - "split_mode is deprecated. Please use `--blocks_to_swap` instead. `--blocks_to_swap 18` is automatically set." - " / split_modeは非推奨です。代わりに`--blocks_to_swap`を使用してください。`--blocks_to_swap 18`が自動的に設定されました。" - ) - args.blocks_to_swap = 18 # 18 is safe for most cases + # acceleratorを準備する + logger.info("prepare accelerator") + accelerator = train_util.prepare_accelerator(args) - train_dataset_group.verify_bucket_reso_steps(32) # TODO check this + # mixed precisionに対応した型を用意しておき適宜castする + weight_dtype, save_dtype = train_util.prepare_dtype(args) - def load_target_model(self, args, weight_dtype, accelerator): - # currently offload to cpu for some models + # モデルを読み込む - # if the file is fp8 and we are using fp8_base, we can load it as is (fp8) - loading_dtype = None if args.fp8_base else weight_dtype + # load VAE for caching latents + ae = None + if cache_latents: + ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors) + ae.to(accelerator.device, dtype=weight_dtype) + ae.requires_grad_(False) + ae.eval() - # if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future - self.is_schnell, model = flux_utils.load_flow_model( - args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors + train_dataset_group.new_cache_latents(ae, accelerator) + + ae.to("cpu") # if no sampling, vae can be deleted + clean_memory_on_device(accelerator.device) + + accelerator.wait_for_everyone() + + # prepare tokenize strategy + if args.t5xxl_max_token_length is None: + if is_schnell: + t5xxl_max_token_length = 256 + else: + t5xxl_max_token_length = 512 + else: + t5xxl_max_token_length = args.t5xxl_max_token_length + + flux_tokenize_strategy = strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length) + strategy_base.TokenizeStrategy.set_strategy(flux_tokenize_strategy) + + # load clip_l, t5xxl for caching text encoder outputs + clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", args.disable_mmap_load_safetensors) + t5xxl = flux_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu", args.disable_mmap_load_safetensors) + clip_l.eval() + t5xxl.eval() + clip_l.requires_grad_(False) + t5xxl.requires_grad_(False) + + 
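Both encoders are loaded frozen because, with `--cache_text_encoder_outputs`, every caption only needs to be encoded once; the cached tensors are reused for all epochs and CLIP-L/T5-XXL can then be dropped from GPU memory entirely (they are set to None a little further down). A rough sketch of that one-off caching step, with hypothetical `tokenize`/`encode` callables standing in for the tokenize and text-encoding strategies:

```python
import torch

@torch.no_grad()
def cache_text_encoder_outputs(captions, tokenize, encode, device):
    cache = {}
    for caption in captions:
        tokens = tokenize(caption)  # assumed to return a list of token tensors
        # encode once on the GPU, keep the results on CPU until a training step needs them
        cache[caption] = [t.cpu() for t in encode([tok.to(device) for tok in tokens])]
    return cache
```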
text_encoding_strategy = strategy_flux.FluxTextEncodingStrategy(args.apply_t5_attn_mask) + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + + # cache text encoder outputs + sample_prompts_te_outputs = None + if args.cache_text_encoder_outputs: + # Text Encodes are eval and no grad here + clip_l.to(accelerator.device) + t5xxl.to(accelerator.device) + + text_encoder_caching_strategy = strategy_flux.FluxTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, False, False, args.apply_t5_attn_mask ) - if args.fp8_base: - # check dtype of model - if model.dtype == torch.float8_e4m3fnuz or model.dtype == torch.float8_e5m2 or model.dtype == torch.float8_e5m2fnuz: - raise ValueError(f"Unsupported fp8 model dtype: {model.dtype}") - elif model.dtype == torch.float8_e4m3fn: - logger.info("Loaded fp8 FLUX model") - else: - logger.info( - "Cast FLUX model to fp8. This may take a while. You can reduce the time by using fp8 checkpoint." - " / FLUXモデルをfp8に変換しています。これには時間がかかる場合があります。fp8チェックポイントを使用することで時間を短縮できます。" - ) - model.to(torch.float8_e4m3fn) + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy) + + with accelerator.autocast(): + train_dataset_group.new_cache_text_encoder_outputs([clip_l, t5xxl], accelerator) + + # cache sample prompt's embeddings to free text encoder's memory + if args.sample_prompts is not None: + logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") + + text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + + prompts = train_util.load_prompts(args.sample_prompts) + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + with accelerator.autocast(), torch.no_grad(): + for prompt_dict in prompts: + for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: + if p not in sample_prompts_te_outputs: + logger.info(f"cache Text Encoder outputs for prompt: {p}") + tokens_and_masks = flux_tokenize_strategy.tokenize(p) + sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( + flux_tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask + ) + + accelerator.wait_for_everyone() + + # now we can delete Text Encoders to free memory + clip_l = None + t5xxl = None + clean_memory_on_device(accelerator.device) - # if args.split_mode: - # model = self.prepare_split_model(model, weight_dtype, accelerator) + # load FLUX + _, flux = flux_utils.load_flow_model( + args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors + ) + flux.requires_grad_(False) - self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 - if self.is_swapping_blocks: - # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. 
- logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") - model.enable_block_swap(args.blocks_to_swap, accelerator.device) + # load controlnet + controlnet = flux_utils.load_controlnet() + controlnet.requires_grad_(True) - clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) - clip_l.eval() + if args.gradient_checkpointing: + controlnet.enable_gradient_checkpointing(cpu_offload=args.cpu_offload_checkpointing) - # if the file is fp8 and we are using fp8_base (not unet), we can load it as is (fp8) - if args.fp8_base and not args.fp8_base_unet: - loading_dtype = None # as is - else: - loading_dtype = weight_dtype + # block swap - # loading t5xxl to cpu takes a long time, so we should load to gpu in future - t5xxl = flux_utils.load_t5xxl(args.t5xxl, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) - t5xxl.eval() - if args.fp8_base and not args.fp8_base_unet: - # check dtype of model - if t5xxl.dtype == torch.float8_e4m3fnuz or t5xxl.dtype == torch.float8_e5m2 or t5xxl.dtype == torch.float8_e5m2fnuz: - raise ValueError(f"Unsupported fp8 model dtype: {t5xxl.dtype}") - elif t5xxl.dtype == torch.float8_e4m3fn: - logger.info("Loaded fp8 T5XXL model") + # backward compatibility + if args.blocks_to_swap is None: + blocks_to_swap = args.double_blocks_to_swap or 0 + if args.single_blocks_to_swap is not None: + blocks_to_swap += args.single_blocks_to_swap // 2 + if blocks_to_swap > 0: + logger.warning( + "double_blocks_to_swap and single_blocks_to_swap are deprecated. Use blocks_to_swap instead." + " / double_blocks_to_swapとsingle_blocks_to_swapは非推奨です。blocks_to_swapを使ってください。" + ) + logger.info( + f"double_blocks_to_swap={args.double_blocks_to_swap} and single_blocks_to_swap={args.single_blocks_to_swap} are converted to blocks_to_swap={blocks_to_swap}." + ) + args.blocks_to_swap = blocks_to_swap + del blocks_to_swap + + is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 + if is_swapping_blocks: + # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. + # This idea is based on 2kpr's great work. Thank you! + logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") + flux.enable_block_swap(args.blocks_to_swap, accelerator.device) + controlnet.enable_block_swap(args.blocks_to_swap, accelerator.device) + + if not cache_latents: + # load VAE here if not cached + ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu") + ae.requires_grad_(False) + ae.eval() + ae.to(accelerator.device, dtype=weight_dtype) + + training_models = [] + params_to_optimize = [] + training_models.append(controlnet) + name_and_params = list(controlnet.named_parameters()) + # single param group for now + params_to_optimize.append({"params": [p for _, p in name_and_params], "lr": args.learning_rate}) + param_names = [[n for n, _ in name_and_params]] + + # calculate number of trainable parameters + n_params = 0 + for group in params_to_optimize: + for p in group["params"]: + n_params += p.numel() + + accelerator.print(f"number of trainable parameters: {n_params}") + + # 学習に必要なクラスを準備する + accelerator.print("prepare optimizer, data loader etc.") + + if args.blockwise_fused_optimizers: + # fused backward pass: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html + # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each block of parameters. 
+ # This balances memory usage and management complexity. + + # split params into groups. currently different learning rates are not supported + grouped_params = [] + param_group = {} + for group in params_to_optimize: + named_parameters = list(controlnet.named_parameters()) + assert len(named_parameters) == len(group["params"]), "number of parameters does not match" + for p, np in zip(group["params"], named_parameters): + # determine target layer and block index for each parameter + block_type = "other" # double, single or other + if np[0].startswith("double_blocks"): + block_index = int(np[0].split(".")[1]) + block_type = "double" + elif np[0].startswith("single_blocks"): + block_index = int(np[0].split(".")[1]) + block_type = "single" + else: + block_index = -1 + + param_group_key = (block_type, block_index) + if param_group_key not in param_group: + param_group[param_group_key] = [] + param_group[param_group_key].append(p) + + block_types_and_indices = [] + for param_group_key, param_group in param_group.items(): + block_types_and_indices.append(param_group_key) + grouped_params.append({"params": param_group, "lr": args.learning_rate}) + + num_params = 0 + for p in param_group: + num_params += p.numel() + accelerator.print(f"block {param_group_key}: {num_params} parameters") + + # prepare optimizers for each group + optimizers = [] + for group in grouped_params: + _, _, optimizer = train_util.get_optimizer(args, trainable_params=[group]) + optimizers.append(optimizer) + optimizer = optimizers[0] # avoid error in the following code + + logger.info(f"using {len(optimizers)} optimizers for blockwise fused optimizers") + + if train_util.is_schedulefree_optimizer(optimizers[0], args): + raise ValueError("Schedule-free optimizer is not supported with blockwise fused optimizers") + optimizer_train_fn = lambda: None # dummy function + optimizer_eval_fn = lambda: None # dummy function + else: + _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) + optimizer_train_fn, optimizer_eval_fn = train_util.get_optimizer_train_eval_fn(optimizer, args) + + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 + n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers + train_dataloader = torch.utils.data.DataLoader( + train_dataset_group, + batch_size=1, + shuffle=True, + collate_fn=collator, + num_workers=n_workers, + persistent_workers=args.persistent_data_loader_workers, + ) - ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + # 学習ステップ数を計算する + if args.max_train_epochs is not None: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. 
steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) - return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model + # データセット側にも学習ステップを送信 + train_dataset_group.set_max_train_steps(args.max_train_steps) - def get_tokenize_strategy(self, args): - _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) + # lr schedulerを用意する + if args.blockwise_fused_optimizers: + # prepare lr schedulers for each optimizer + lr_schedulers = [train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) for optimizer in optimizers] + lr_scheduler = lr_schedulers[0] # avoid error in the following code + else: + lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) - if args.t5xxl_max_token_length is None: - if is_schnell: - t5xxl_max_token_length = 256 - else: - t5xxl_max_token_length = 512 - else: - t5xxl_max_token_length = args.t5xxl_max_token_length + # 実験的機能:勾配も含めたfp16/bf16学習を行う モデル全体をfp16/bf16にする + if args.full_fp16: + assert ( + args.mixed_precision == "fp16" + ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" + accelerator.print("enable full fp16 training.") + flux.to(weight_dtype) + controlnet.to(weight_dtype) + if clip_l is not None: + clip_l.to(weight_dtype) + t5xxl.to(weight_dtype) # TODO check works with fp16 or not + elif args.full_bf16: + assert ( + args.mixed_precision == "bf16" + ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" + accelerator.print("enable full bf16 training.") + flux.to(weight_dtype) + controlnet.to(weight_dtype) + if clip_l is not None: + clip_l.to(weight_dtype) + t5xxl.to(weight_dtype) + + # if we don't cache text encoder outputs, move them to device + if not args.cache_text_encoder_outputs: + clip_l.to(accelerator.device) + t5xxl.to(accelerator.device) + + clean_memory_on_device(accelerator.device) + + if args.deepspeed: + ds_model = deepspeed_utils.prepare_deepspeed_model(args, mmdit=controlnet) + # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007 + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_models = [ds_model] - logger.info(f"t5xxl_max_token_length: {t5xxl_max_token_length}") - return strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length, args.tokenizer_cache_dir) + else: + # accelerator does some magic + # if we doesn't swap blocks, we can move the model to device + controlnet = accelerator.prepare(controlnet, device_placement=[not is_swapping_blocks]) + if is_swapping_blocks: + accelerator.unwrap_model(controlnet).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + + # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする + if args.full_fp16: + # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. + # -> But we think it's ok to patch accelerator even if deepspeed is enabled. 
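The fused and blockwise-fused optimizer paths that follow both rely on the PyTorch mechanism referenced in the tutorial link earlier: a `register_post_accumulate_grad_hook` on each parameter steps its optimizer as soon as that parameter's gradient is ready, so gradients can be freed immediately instead of being held until a single global `optimizer.step()`. A minimal sketch of the pattern, assuming PyTorch 2.1+ and using one optimizer per parameter (the script groups per block instead):

```python
import torch

model = torch.nn.Linear(16, 16)

# one small optimizer per parameter; stepping inside the hook lets the grad be freed right away
optimizers = {p: torch.optim.AdamW([p], lr=1e-4) for p in model.parameters()}

def make_hook(param: torch.Tensor):
    def hook(p: torch.Tensor):
        optimizers[param].step()
        optimizers[param].zero_grad(set_to_none=True)
    return hook

for p in model.parameters():
    if p.requires_grad:
        p.register_post_accumulate_grad_hook(make_hook(p))

loss = model(torch.randn(4, 16)).sum()
loss.backward()  # each parameter's optimizer steps as its gradient is accumulated
```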
+ train_util.patch_accelerator_for_fp16_training(accelerator) + + # resumeする + train_util.resume_from_local_or_hf_if_specified(accelerator, args) + + if args.fused_backward_pass: + # use fused optimizer for backward pass: other optimizers will be supported in the future + import library.adafactor_fused + + library.adafactor_fused.patch_adafactor_fused(optimizer) + + for param_group, param_name_group in zip(optimizer.param_groups, param_names): + for parameter, param_name in zip(param_group["params"], param_name_group): + if parameter.requires_grad: + + def create_grad_hook(p_name, p_group): + def grad_hook(tensor: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, p_group) + tensor.grad = None + + return grad_hook + + parameter.register_post_accumulate_grad_hook(create_grad_hook(param_name, param_group)) + + elif args.blockwise_fused_optimizers: + # prepare for additional optimizers and lr schedulers + for i in range(1, len(optimizers)): + optimizers[i] = accelerator.prepare(optimizers[i]) + lr_schedulers[i] = accelerator.prepare(lr_schedulers[i]) + + # counters are used to determine when to step the optimizer + global optimizer_hooked_count + global num_parameters_per_group + global parameter_optimizer_map + + optimizer_hooked_count = {} + num_parameters_per_group = [0] * len(optimizers) + parameter_optimizer_map = {} + + for opt_idx, optimizer in enumerate(optimizers): + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + + def grad_hook(parameter: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(parameter, args.max_grad_norm) + + i = parameter_optimizer_map[parameter] + optimizer_hooked_count[i] += 1 + if optimizer_hooked_count[i] == num_parameters_per_group[i]: + optimizers[i].step() + optimizers[i].zero_grad(set_to_none=True) + + parameter.register_post_accumulate_grad_hook(grad_hook) + parameter_optimizer_map[parameter] = opt_idx + num_parameters_per_group[opt_idx] += 1 + + # epoch数を計算する + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0): + args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1 + + # 学習する + # total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + accelerator.print("running training / 学習開始") + accelerator.print(f" num examples / サンプル数: {train_dataset_group.num_train_images}") + accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}") + accelerator.print(f" num epochs / epoch数: {num_train_epochs}") + accelerator.print( + f" batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}" + ) + # accelerator.print( + # f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}" + # ) + accelerator.print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") + accelerator.print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}") + + progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") + global_step = 0 + + noise_scheduler 
= FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) + noise_scheduler_copy = copy.deepcopy(noise_scheduler) + + if accelerator.is_main_process: + init_kwargs = {} + if args.wandb_run_name: + init_kwargs["wandb"] = {"name": args.wandb_run_name} + if args.log_tracker_config is not None: + init_kwargs = toml.load(args.log_tracker_config) + accelerator.init_trackers( + "finetuning" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, + ) - def get_tokenizers(self, tokenize_strategy: strategy_flux.FluxTokenizeStrategy): - return [tokenize_strategy.clip_l, tokenize_strategy.t5xxl] + if is_swapping_blocks: + accelerator.unwrap_model(controlnet).prepare_block_swap_before_forward() - def get_latents_caching_strategy(self, args): - latents_caching_strategy = strategy_flux.FluxLatentsCachingStrategy(args.cache_latents_to_disk, args.vae_batch_size, False) - return latents_caching_strategy + # For --sample_at_first + optimizer_eval_fn() + flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs) + optimizer_train_fn() + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) - def get_text_encoding_strategy(self, args): - return strategy_flux.FluxTextEncodingStrategy(apply_t5_attn_mask=args.apply_t5_attn_mask) + loss_recorder = train_util.LossRecorder() + epoch = 0 # avoid error when max_train_steps is 0 + for epoch in range(num_train_epochs): + accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") + current_epoch.value = epoch + 1 - def post_process_network(self, args, accelerator, network, text_encoders, unet): - # check t5xxl is trained or not - self.train_t5xxl = network.train_t5xxl + for m in training_models: + m.train() - if self.train_t5xxl and args.cache_text_encoder_outputs: - raise ValueError( - "T5XXL is trained, so cache_text_encoder_outputs cannot be used / T5XXL学習時はcache_text_encoder_outputsは使用できません" - ) + for step, batch in enumerate(train_dataloader): + current_step.value = global_step - def get_models_for_text_encoding(self, args, accelerator, text_encoders): - if args.cache_text_encoder_outputs: - if self.train_clip_l and not self.train_t5xxl: - return text_encoders[0:1] # only CLIP-L is needed for encoding because T5XXL is cached - else: - return None # no text encoders are needed for encoding because both are cached - else: - return text_encoders # both CLIP-L and T5XXL are needed for encoding + if args.blockwise_fused_optimizers: + optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} # reset counter for each step - def get_text_encoders_train_flags(self, args, text_encoders): - return [self.train_clip_l, self.train_t5xxl] + with accelerator.accumulate(*training_models): + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device, dtype=weight_dtype) + else: + with torch.no_grad(): + # encode images to latents. 
images are [-1, 1] + latents = ae.encode(batch["images"].to(ae.dtype)).to(accelerator.device, dtype=weight_dtype) + + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(latents)): + accelerator.print("NaN found in latents, replacing with zeros") + latents = torch.nan_to_num(latents, 0, out=latents) + + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + text_encoder_conds = text_encoder_outputs_list + else: + # not cached or training, so get from text encoders + tokens_and_masks = batch["input_ids_list"] + with torch.no_grad(): + input_ids = [ids.to(accelerator.device) for ids in batch["input_ids_list"]] + text_encoder_conds = text_encoding_strategy.encode_tokens( + flux_tokenize_strategy, [clip_l, t5xxl], input_ids, args.apply_t5_attn_mask + ) + if args.full_fp16: + text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] - def get_text_encoder_outputs_caching_strategy(self, args): - if args.cache_text_encoder_outputs: - # if the text encoders is trained, we need tokenization, so is_partial is True - return strategy_flux.FluxTextEncoderOutputsCachingStrategy( - args.cache_text_encoder_outputs_to_disk, - args.text_encoder_batch_size, - args.skip_cache_check, - is_partial=self.train_clip_l or self.train_t5xxl, - apply_t5_attn_mask=args.apply_t5_attn_mask, - ) - else: - return None + # TODO support some features for noise implemented in get_noise_noisy_latents_and_timesteps - def cache_text_encoder_outputs_if_needed( - self, args, accelerator: Accelerator, unet, vae, text_encoders, dataset: train_util.DatasetGroup, weight_dtype - ): - if args.cache_text_encoder_outputs: - if not args.lowram: - # メモリ消費を減らす - logger.info("move vae and unet to cpu to save memory") - org_vae_device = vae.device - org_unet_device = unet.device - vae.to("cpu") - unet.to("cpu") - clean_memory_on_device(accelerator.device) - - # When TE is not be trained, it will not be prepared so we need to use explicit autocast - logger.info("move text encoders to gpu") - text_encoders[0].to(accelerator.device, dtype=weight_dtype) # always not fp8 - text_encoders[1].to(accelerator.device) - - if text_encoders[1].dtype == torch.float8_e4m3fn: - # if we load fp8 weights, the model is already fp8, so we use it as is - self.prepare_text_encoder_fp8(1, text_encoders[1], text_encoders[1].dtype, weight_dtype) - else: - # otherwise, we need to convert it to target dtype - text_encoders[1].to(weight_dtype) - - with accelerator.autocast(): - dataset.new_cache_text_encoder_outputs(text_encoders, accelerator) - - # cache sample prompts - if args.sample_prompts is not None: - logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") - - tokenize_strategy: strategy_flux.FluxTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() - text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() - - prompts = train_util.load_prompts(args.sample_prompts) - sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs - with accelerator.autocast(), torch.no_grad(): - for prompt_dict in prompts: - for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: - if p not in sample_prompts_te_outputs: - logger.info(f"cache Text Encoder outputs for prompt: {p}") - tokens_and_masks = tokenize_strategy.tokenize(p) - sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( - tokenize_strategy, text_encoders, tokens_and_masks, 
args.apply_t5_attn_mask - ) - self.sample_prompts_te_outputs = sample_prompts_te_outputs - - accelerator.wait_for_everyone() - - # move back to cpu - if not self.is_train_text_encoder(args): - logger.info("move CLIP-L back to cpu") - text_encoders[0].to("cpu") - logger.info("move t5XXL back to cpu") - text_encoders[1].to("cpu") - clean_memory_on_device(accelerator.device) - - if not args.lowram: - logger.info("move vae and unet back to original device") - vae.to(org_vae_device) - unet.to(org_unet_device) - else: - # Text Encoderから毎回出力を取得するので、GPUに乗せておく - text_encoders[0].to(accelerator.device, dtype=weight_dtype) - text_encoders[1].to(accelerator.device) + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] - # def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): - # noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps( + args, noise_scheduler_copy, latents, noise, accelerator.device, weight_dtype + ) - # # get size embeddings - # orig_size = batch["original_sizes_hw"] - # crop_size = batch["crop_top_lefts"] - # target_size = batch["target_sizes_hw"] - # embs = sdxl_train_util.get_size_embeddings(orig_size, crop_size, target_size, accelerator.device).to(weight_dtype) + # pack latents and get img_ids + packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4 + packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2 + img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device) - # # concat embeddings - # encoder_hidden_states1, encoder_hidden_states2, pool2 = text_conds - # vector_embedding = torch.cat([pool2, embs], dim=1).to(weight_dtype) - # text_embedding = torch.cat([encoder_hidden_states1, encoder_hidden_states2], dim=2).to(weight_dtype) + # get guidance: ensure args.guidance_scale is float + guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) - # noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) - # return noise_pred + # call model + l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds + if not args.apply_t5_attn_mask: + t5_attn_mask = None - def sample_images(self, accelerator, args, epoch, global_step, device, ae, tokenizer, text_encoder, flux): - text_encoders = text_encoder # for compatibility - text_encoders = self.get_models_for_text_encoding(args, accelerator, text_encoders) + with accelerator.autocast(): + block_samples, block_single_samples = controlnet( + img=packed_noisy_model_input, + img_ids=img_ids, + controlnet_cond=batch["control_image"].to(accelerator.device), + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + timesteps=timesteps / 1000, + guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, + ) + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) + model_pred = flux( + img=packed_noisy_model_input, + img_ids=img_ids, + txt=t5_out, + txt_ids=txt_ids, + y=l_pooled, + block_controlnet_hidden_states=block_samples, + block_controlnet_single_hidden_states=block_single_samples, + timesteps=timesteps / 1000, + guidance=guidance_vec, + 
txt_attention_mask=t5_attn_mask, + ) - flux_train_utils.sample_images( - accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs - ) - # return - - """ - class FluxUpperLowerWrapper(torch.nn.Module): - def __init__(self, flux_upper: flux_models.FluxUpper, flux_lower: flux_models.FluxLower, device: torch.device): - super().__init__() - self.flux_upper = flux_upper - self.flux_lower = flux_lower - self.target_device = device - - def prepare_block_swap_before_forward(self): - pass - - def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None, txt_attention_mask=None): - self.flux_lower.to("cpu") - clean_memory_on_device(self.target_device) - self.flux_upper.to(self.target_device) - img, txt, vec, pe = self.flux_upper(img, img_ids, txt, txt_ids, timesteps, y, guidance, txt_attention_mask) - self.flux_upper.to("cpu") - clean_memory_on_device(self.target_device) - self.flux_lower.to(self.target_device) - return self.flux_lower(img, txt, vec, pe, txt_attention_mask) - - wrapper = FluxUpperLowerWrapper(self.flux_upper, flux, accelerator.device) - clean_memory_on_device(accelerator.device) - flux_train_utils.sample_images( - accelerator, args, epoch, global_step, wrapper, ae, text_encoders, self.sample_prompts_te_outputs - ) - clean_memory_on_device(accelerator.device) - """ - - def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: - noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) - self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) - return noise_scheduler - - def encode_images_to_latents(self, args, accelerator, vae, images): - return vae.encode(images) - - def shift_scale_latents(self, args, latents): - return latents - - def get_noise_pred_and_target( - self, - args, - accelerator, - noise_scheduler, - latents, - batch, - text_encoder_conds, - unet: flux_models.Flux, - network, - weight_dtype, - train_unet, - ): - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents) - bsz = latents.shape[0] - - # get noisy model input and timesteps - noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps( - args, noise_scheduler, latents, noise, accelerator.device, weight_dtype - ) + # unpack latents + model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) + + # apply model prediction type + model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) - # pack latents and get img_ids - packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4 - packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2 - img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device) - - # get guidance - # ensure guidance_scale in args is float - guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) - - # ensure the hidden state will require grad - if args.gradient_checkpointing: - noisy_model_input.requires_grad_(True) - for t in text_encoder_conds: - if t is not None and t.dtype.is_floating_point: - t.requires_grad_(True) - img_ids.requires_grad_(True) - guidance_vec.requires_grad_(True) - - # Predict the noise residual - l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds - if not args.apply_t5_attn_mask: - 
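The flow-matching objective used above can be stated compactly: the noisy input is a linear interpolation between the clean latents and Gaussian noise, and the model is trained to predict the constant velocity between them, which is why the target is `noise - latents` rather than the noise itself as in epsilon-prediction. A minimal sketch, assuming `sigmas` has already been broadcast to the latent shape:

```python
import torch

def flow_matching_pair(latents: torch.Tensor, sigmas: torch.Tensor):
    noise = torch.randn_like(latents)
    noisy_model_input = (1.0 - sigmas) * latents + sigmas * noise  # straight-line interpolation
    target = noise - latents                                       # velocity the model should predict
    return noisy_model_input, target
```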
t5_attn_mask = None - - def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask): - # if not args.split_mode: - # normal forward - with accelerator.autocast(): - # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) - model_pred = unet( - img=img, - img_ids=img_ids, - txt=t5_out, - txt_ids=txt_ids, - y=l_pooled, - timesteps=timesteps / 1000, - guidance=guidance_vec, - txt_attention_mask=t5_attn_mask, + # flow matching loss: this is different from SD3 + target = noise - latents + + # calculate loss + loss = train_util.conditional_loss( + model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=None + ) + if weighting is not None: + loss = loss * weighting + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) + loss = loss.mean([1, 2, 3]) + + loss_weights = batch["loss_weights"] # 各sampleごとのweight + loss = loss * loss_weights + loss = loss.mean() + + # backward + accelerator.backward(loss) + + if not (args.fused_backward_pass or args.blockwise_fused_optimizers): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + params_to_clip = [] + for m in training_models: + params_to_clip.extend(m.parameters()) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=True) + else: + # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook + lr_scheduler.step() + if args.blockwise_fused_optimizers: + for i in range(1, len(optimizers)): + lr_schedulers[i].step() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + optimizer_eval_fn() + flux_train_utils.sample_images( + accelerator, args, None, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs ) - """ - else: - # split forward to reduce memory usage - assert network.train_blocks == "single", "train_blocks must be single for split mode" - with accelerator.autocast(): - # move flux lower to cpu, and then move flux upper to gpu - unet.to("cpu") - clean_memory_on_device(accelerator.device) - self.flux_upper.to(accelerator.device) - # upper model does not require grad - with torch.no_grad(): - intermediate_img, intermediate_txt, vec, pe = self.flux_upper( - img=packed_noisy_model_input, - img_ids=img_ids, - txt=t5_out, - txt_ids=txt_ids, - y=l_pooled, - timesteps=timesteps / 1000, - guidance=guidance_vec, - txt_attention_mask=t5_attn_mask, + # 指定ステップごとにモデルを保存 + if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: + accelerator.wait_for_everyone() + if accelerator.is_main_process: + flux_train_utils.save_flux_model_on_epoch_end_or_stepwise( + args, + False, + accelerator, + save_dtype, + epoch, + num_train_epochs, + global_step, + accelerator.unwrap_model(flux), ) + optimizer_train_fn() - # move flux upper back to cpu, and then move flux lower to gpu - self.flux_upper.to("cpu") - clean_memory_on_device(accelerator.device) - unet.to(accelerator.device) - - # lower model requires grad - intermediate_img.requires_grad_(True) - intermediate_txt.requires_grad_(True) - vec.requires_grad_(True) - pe.requires_grad_(True) - model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask) 
- """ - - return model_pred - - model_pred = call_dit( - img=packed_noisy_model_input, - img_ids=img_ids, - t5_out=t5_out, - txt_ids=txt_ids, - l_pooled=l_pooled, - timesteps=timesteps, - guidance_vec=guidance_vec, - t5_attn_mask=t5_attn_mask, - ) + current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず + if len(accelerator.trackers) > 0: + logs = {"loss": current_loss} + train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True) - # unpack latents - model_pred = flux_utils.unpack_latents(model_pred, packed_latent_height, packed_latent_width) - - # apply model prediction type - model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) - - # flow matching loss: this is different from SD3 - target = noise - latents - - # differential output preservation - if "custom_attributes" in batch: - diff_output_pr_indices = [] - for i, custom_attributes in enumerate(batch["custom_attributes"]): - if "diff_output_preservation" in custom_attributes and custom_attributes["diff_output_preservation"]: - diff_output_pr_indices.append(i) - - if len(diff_output_pr_indices) > 0: - network.set_multiplier(0.0) - with torch.no_grad(): - model_pred_prior = call_dit( - img=packed_noisy_model_input[diff_output_pr_indices], - img_ids=img_ids[diff_output_pr_indices], - t5_out=t5_out[diff_output_pr_indices], - txt_ids=txt_ids[diff_output_pr_indices], - l_pooled=l_pooled[diff_output_pr_indices], - timesteps=timesteps[diff_output_pr_indices], - guidance_vec=guidance_vec[diff_output_pr_indices] if guidance_vec is not None else None, - t5_attn_mask=t5_attn_mask[diff_output_pr_indices] if t5_attn_mask is not None else None, - ) - network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step + accelerator.log(logs, step=global_step) - model_pred_prior = flux_utils.unpack_latents(model_pred_prior, packed_latent_height, packed_latent_width) - model_pred_prior, _ = flux_train_utils.apply_model_prediction_type( + loss_recorder.add(epoch=epoch, step=step, loss=current_loss) + avr_loss: float = loss_recorder.moving_average + logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + + if global_step >= args.max_train_steps: + break + + if len(accelerator.trackers) > 0: + logs = {"loss/epoch": loss_recorder.moving_average} + accelerator.log(logs, step=epoch + 1) + + accelerator.wait_for_everyone() + + optimizer_eval_fn() + if args.save_every_n_epochs is not None: + if accelerator.is_main_process: + flux_train_utils.save_flux_model_on_epoch_end_or_stepwise( args, - model_pred_prior, - noisy_model_input[diff_output_pr_indices], - sigmas[diff_output_pr_indices] if sigmas is not None else None, + True, + accelerator, + save_dtype, + epoch, + num_train_epochs, + global_step, + accelerator.unwrap_model(flux), ) - target[diff_output_pr_indices] = model_pred_prior.to(target.dtype) - - return model_pred, target, timesteps, None, weighting - - def post_process_loss(self, loss, args, timesteps, noise_scheduler): - return loss - - def get_sai_model_spec(self, args): - return train_util.get_sai_model_spec(None, args, False, True, False, flux="dev") - - def update_metadata(self, metadata, args): - metadata["ss_apply_t5_attn_mask"] = args.apply_t5_attn_mask - metadata["ss_weighting_scheme"] = args.weighting_scheme - metadata["ss_logit_mean"] = args.logit_mean - metadata["ss_logit_std"] = args.logit_std - metadata["ss_mode_scale"] = args.mode_scale - 
metadata["ss_guidance_scale"] = args.guidance_scale - metadata["ss_timestep_sampling"] = args.timestep_sampling - metadata["ss_sigmoid_scale"] = args.sigmoid_scale - metadata["ss_model_prediction_type"] = args.model_prediction_type - metadata["ss_discrete_flow_shift"] = args.discrete_flow_shift - - def is_text_encoder_not_needed_for_training(self, args): - return args.cache_text_encoder_outputs and not self.is_train_text_encoder(args) - - def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): - if index == 0: # CLIP-L - return super().prepare_text_encoder_grad_ckpt_workaround(index, text_encoder) - else: # T5XXL - text_encoder.encoder.embed_tokens.requires_grad_(True) - - def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): - if index == 0: # CLIP-L - logger.info(f"prepare CLIP-L for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}") - text_encoder.to(te_weight_dtype) # fp8 - text_encoder.text_model.embeddings.to(dtype=weight_dtype) - else: # T5XXL - - def prepare_fp8(text_encoder, target_dtype): - def forward_hook(module): - def forward(hidden_states): - hidden_gelu = module.act(module.wi_0(hidden_states)) - hidden_linear = module.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = module.dropout(hidden_states) - - hidden_states = module.wo(hidden_states) - return hidden_states - - return forward - - for module in text_encoder.modules(): - if module.__class__.__name__ in ["T5LayerNorm", "Embedding"]: - # print("set", module.__class__.__name__, "to", target_dtype) - module.to(target_dtype) - if module.__class__.__name__ in ["T5DenseGatedActDense"]: - # print("set", module.__class__.__name__, "hooks") - module.forward = forward_hook(module) - - if flux_utils.get_t5xxl_actual_dtype(text_encoder) == torch.float8_e4m3fn and text_encoder.dtype == weight_dtype: - logger.info(f"T5XXL already prepared for fp8") - else: - logger.info(f"prepare T5XXL for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}, add hooks") - text_encoder.to(te_weight_dtype) # fp8 - prepare_fp8(text_encoder, weight_dtype) - def prepare_unet_with_accelerator( - self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module - ) -> torch.nn.Module: - if not self.is_swapping_blocks: - return super().prepare_unet_with_accelerator(args, accelerator, unet) + flux_train_utils.sample_images( + accelerator, args, epoch + 1, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs + ) + optimizer_train_fn() + + is_main_process = accelerator.is_main_process + # if is_main_process: + controlnet = accelerator.unwrap_model(controlnet) - # if we doesn't swap blocks, we can move the model to device - flux: flux_models.Flux = unet - flux = accelerator.prepare(flux, device_placement=[not self.is_swapping_blocks]) - accelerator.unwrap_model(flux).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage - accelerator.unwrap_model(flux).prepare_block_swap_before_forward() + accelerator.end_training() + optimizer_eval_fn() + + if args.save_state or args.save_state_on_train_end: + train_util.save_state_on_train_end(args, accelerator) + + del accelerator # この後メモリを使うのでこれは消す - return flux + if is_main_process: + flux_train_utils.save_flux_model_on_train_end(args, save_dtype, epoch, global_step, flux) + logger.info("model saved.") def setup_parser() -> argparse.ArgumentParser: - parser = train_network.setup_parser() + parser = argparse.ArgumentParser() + + add_logging_arguments(parser) + 
train_util.add_sd_models_arguments(parser) # TODO split this + train_util.add_dataset_arguments(parser, True, True, True) + train_util.add_training_arguments(parser, False) + train_util.add_masked_loss_arguments(parser) + deepspeed_utils.add_deepspeed_arguments(parser) + train_util.add_sd_saving_arguments(parser) + train_util.add_optimizer_arguments(parser) + config_util.add_config_arguments(parser) + add_custom_train_arguments(parser) # TODO remove this from here train_util.add_dit_training_arguments(parser) flux_train_utils.add_flux_train_arguments(parser) parser.add_argument( - "--split_mode", + "--mem_eff_save", + action="store_true", + help="[EXPERIMENTAL] use memory efficient custom model saving method / メモリ効率の良い独自のモデル保存方法を使う", + ) + + parser.add_argument( + "--fused_optimizer_groups", + type=int, + default=None, + help="**this option is not working** will be removed in the future / このオプションは動作しません。将来削除されます", + ) + parser.add_argument( + "--blockwise_fused_optimizers", + action="store_true", + help="enable blockwise optimizers for fused backward pass and optimizer step / fused backward passとoptimizer step のためブロック単位のoptimizerを有効にする", + ) + parser.add_argument( + "--skip_latents_validity_check", + action="store_true", + help="[Deprecated] use 'skip_cache_check' instead / 代わりに 'skip_cache_check' を使用してください", + ) + parser.add_argument( + "--double_blocks_to_swap", + type=int, + default=None, + help="[Deprecated] use 'blocks_to_swap' instead / 代わりに 'blocks_to_swap' を使用してください", + ) + parser.add_argument( + "--single_blocks_to_swap", + type=int, + default=None, + help="[Deprecated] use 'blocks_to_swap' instead / 代わりに 'blocks_to_swap' を使用してください", + ) + parser.add_argument( + "--cpu_offload_checkpointing", action="store_true", - # help="[EXPERIMENTAL] use split mode for Flux model, network arg `train_blocks=single` is required" - # + "/[実験的] Fluxモデルの分割モードを使用する。ネットワーク引数`train_blocks=single`が必要", - help="[Deprecated] This option is deprecated. Please use `--blocks_to_swap` instead." 
- " / このオプションは非推奨です。代わりに`--blocks_to_swap`を使用してください。", + help="[EXPERIMENTAL] enable offloading of tensors to CPU during checkpointing / チェックポイント時にテンソルをCPUにオフロードする", ) return parser @@ -569,5 +866,4 @@ def setup_parser() -> argparse.ArgumentParser: train_util.verify_command_line_training_args(args) args = train_util.read_config_from_file(args, parser) - trainer = FluxNetworkTrainer() - trainer.train(args) + train(args) diff --git a/flux_train_network.py b/flux_train_network.py index 0feb9b011..6668012e4 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -125,9 +125,6 @@ def load_target_model(self, args, weight_dtype, accelerator): ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) - controlnet = flux_utils.load_controlnet() - controlnet.train() - return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model, controlnet def get_tokenize_strategy(self, args): diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index d90644a25..cc3bcb0ec 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -40,6 +40,7 @@ def sample_images( text_encoders, sample_prompts_te_outputs, prompt_replacement=None, + controlnet=None ): if steps == 0: if not args.sample_at_first: @@ -67,6 +68,8 @@ def sample_images( flux = accelerator.unwrap_model(flux) if text_encoders is not None: text_encoders = [accelerator.unwrap_model(te) for te in text_encoders] + if controlnet is not None: + controlnet = accelerator.unwrap_model(controlnet) # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) prompts = train_util.load_prompts(args.sample_prompts) @@ -98,6 +101,7 @@ def sample_images( steps, sample_prompts_te_outputs, prompt_replacement, + controlnet ) else: # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available) @@ -121,6 +125,7 @@ def sample_images( steps, sample_prompts_te_outputs, prompt_replacement, + controlnet ) torch.set_rng_state(rng_state) @@ -142,6 +147,7 @@ def sample_image_inference( steps, sample_prompts_te_outputs, prompt_replacement, + controlnet ): assert isinstance(prompt_dict, dict) # negative_prompt = prompt_dict.get("negative_prompt") @@ -150,7 +156,7 @@ def sample_image_inference( height = prompt_dict.get("height", 512) scale = prompt_dict.get("scale", 3.5) seed = prompt_dict.get("seed") - # controlnet_image = prompt_dict.get("controlnet_image") + controlnet_image = prompt_dict.get("controlnet_image") prompt: str = prompt_dict.get("prompt", "") # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) @@ -169,6 +175,9 @@ def sample_image_inference( # if negative_prompt is None: # negative_prompt = "" + if controlnet_image is not None: + controlnet_image = Image.open(controlnet_image).convert("RGB") + controlnet_image = controlnet_image.resize((width, height), Image.LANCZOS) height = max(64, height - height % 16) # round to divisible by 16 width = max(64, width - width % 16) # round to divisible by 16 @@ -224,7 +233,7 @@ def sample_image_inference( t5_attn_mask = t5_attn_mask.to(accelerator.device) if args.apply_t5_attn_mask else None with accelerator.autocast(), torch.no_grad(): - x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask) + x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask, 
controlnet=controlnet, controlnet_img=controlnet_image) x = x.float() x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width) @@ -301,18 +310,37 @@ def denoise( timesteps: list[float], guidance: float = 4.0, t5_attn_mask: Optional[torch.Tensor] = None, + controlnet: Optional[flux_models.ControlNetFlux] = None, + controlnet_img: Optional[torch.Tensor] = None, ): # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) model.prepare_block_swap_before_forward() + if controlnet is not None: + block_samples, block_single_samples = controlnet( + img=img, + img_ids=img_ids, + controlnet_cond=controlnet_img, + txt=txt, + txt_ids=txt_ids, + y=vec, + timesteps=t_vec, + guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, + ) + else: + block_samples = None + block_single_samples = None pred = model( img=img, img_ids=img_ids, txt=txt, txt_ids=txt_ids, y=vec, + block_controlnet_hidden_states=block_samples, + block_controlnet_single_hidden_states=block_single_samples, timesteps=t_vec, guidance=guidance_vec, txt_attention_mask=t5_attn_mask, diff --git a/library/flux_utils.py b/library/flux_utils.py index 678efbc8a..7b538d133 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -153,11 +153,14 @@ def load_ae( return ae -def load_controlnet(name, device, transformer=None): - with torch.device(device): +def load_controlnet(): + # TODO + is_schnell = False + name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL + with torch.device("meta"): controlnet = flux_models.ControlNetFlux(flux_models.configs[name].params) - if transformer is not None: - controlnet.load_state_dict(transformer.state_dict(), strict=False) + # if transformer is not None: + # controlnet.load_state_dict(transformer.state_dict(), strict=False) return controlnet From e358b118afbc93f63dbb5ab6d2412ec553ea9cd7 Mon Sep 17 00:00:00 2001 From: minux302 Date: Sat, 16 Nov 2024 14:49:29 +0900 Subject: [PATCH 239/582] fix dataloader --- flux_train_control_net.py | 84 ++++++++++++++++++++------------------- library/flux_models.py | 17 ++++---- 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index 8a7be75f2..ee4d0ebf3 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -11,31 +11,36 @@ # - Per-block fused optimizer instances import argparse -from concurrent.futures import ThreadPoolExecutor import copy import math import os -from multiprocessing import Value import time +from concurrent.futures import ThreadPoolExecutor +from multiprocessing import Value from typing import List, Optional, Tuple, Union -import toml - -from tqdm import tqdm +import toml import torch import torch.nn as nn +from tqdm import tqdm + from library import utils -from library.device_utils import init_ipex, clean_memory_on_device +from library.device_utils import clean_memory_on_device, init_ipex init_ipex() from accelerate.utils import set_seed -from library import deepspeed_utils, flux_train_utils, flux_utils, strategy_base, strategy_flux -from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler import library.train_util as train_util - -from library.utils import setup_logging, add_logging_arguments +from library import ( + deepspeed_utils, + flux_train_utils, + flux_utils, + strategy_base, + strategy_flux, +) +from 
library.sd3_train_utils import FlowMatchEulerDiscreteScheduler +from library.utils import add_logging_arguments, setup_logging setup_logging() import logging @@ -46,10 +51,10 @@ # import library.sdxl_train_util as sdxl_train_util from library.config_util import ( - ConfigSanitizer, BlueprintGenerator, + ConfigSanitizer, ) -from library.custom_train_functions import apply_masked_loss, add_custom_train_arguments +from library.custom_train_functions import add_custom_train_arguments, apply_masked_loss def train(args): @@ -85,7 +90,6 @@ def train(args): ) cache_latents = args.cache_latents - use_dreambooth_method = args.in_json is None if args.seed is not None: set_seed(args.seed) # 乱数系列を初期化する @@ -103,7 +107,7 @@ def train(args): if args.dataset_config is not None: logger.info(f"Load dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) - ignored = ["train_data_dir", "in_json"] + ignored = ["train_data_dir", "conditioing_data_dir"] if any(getattr(args, attr) is not None for attr in ignored): logger.warning( "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( @@ -111,31 +115,17 @@ def train(args): ) ) else: - if use_dreambooth_method: - logger.info("Using DreamBooth method.") - user_config = { - "datasets": [ - { - "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs( - args.train_data_dir, args.reg_data_dir - ) - } - ] - } - else: - logger.info("Training with captions.") - user_config = { - "datasets": [ - { - "subsets": [ - { - "image_dir": args.train_data_dir, - "metadata_file": args.in_json, - } - ] - } - ] - } + user_config = { + "datasets": [ + { + "subsets": config_util.generate_controlnet_subsets_config_by_subdirs( + args.train_data_dir, + args.conditioning_data_dir, + args.caption_extension + ) + } + ] + } blueprint = blueprint_generator.generate(user_config, args) train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) @@ -648,12 +638,12 @@ def grad_hook(parameter: torch.Tensor): l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds if not args.apply_t5_attn_mask: t5_attn_mask = None - + with accelerator.autocast(): block_samples, block_single_samples = controlnet( img=packed_noisy_model_input, img_ids=img_ids, - controlnet_cond=batch["control_image"].to(accelerator.device), + controlnet_img=batch["conditioing_image"].to(accelerator.device), txt=t5_out, txt_ids=txt_ids, y=l_pooled, @@ -856,6 +846,18 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="[EXPERIMENTAL] enable offloading of tensors to CPU during checkpointing / チェックポイント時にテンソルをCPUにオフロードする", ) + parser.add_argument( + "--controlnet_model_name_or_path", + type=str, + default=None, + help="controlnet model name or path / controlnetのモデル名またはパス", + ) + parser.add_argument( + "--conditioning_data_dir", + type=str, + default=None, + help="conditioning data directory / 条件付けデータのディレクトリ", + ) return parser diff --git a/library/flux_models.py b/library/flux_models.py index a3bd19743..b52ea6f0b 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -2,15 +2,15 @@ # license: Apache-2.0 License -from concurrent.futures import Future, ThreadPoolExecutor -from dataclasses import dataclass import math import os import time +from concurrent.futures import Future, ThreadPoolExecutor +from dataclasses import dataclass from typing import Dict, List, Optional, Union from library import utils -from library.device_utils import init_ipex, 
clean_memory_on_device +from library.device_utils import clean_memory_on_device, init_ipex init_ipex() @@ -18,6 +18,7 @@ from einops import rearrange from torch import Tensor, nn from torch.utils.checkpoint import checkpoint + from library import custom_offloading_utils # USE_REENTRANT = True @@ -1251,7 +1252,7 @@ def forward( self, img: Tensor, img_ids: Tensor, - controlnet_cond: Tensor, + controlnet_img: Tensor, txt: Tensor, txt_ids: Tensor, timesteps: Tensor, @@ -1264,10 +1265,10 @@ def forward( # running on sequences img img = self.img_in(img) - controlnet_cond = self.input_hint_block(controlnet_cond) - controlnet_cond = rearrange(controlnet_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) - controlnet_cond = self.pos_embed_input(controlnet_cond) - img = img + controlnet_cond + controlnet_img = self.input_hint_block(controlnet_img) + controlnet_img = rearrange(controlnet_img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) + controlnet_img = self.pos_embed_input(controlnet_img) + img = img + controlnet_img vec = self.time_in(timestep_embedding(timesteps, 256)) if self.params.guidance_embed: if guidance is None: From 2a188f07e682ed5dd958821a223d48c17a9aeb83 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 17 Nov 2024 16:12:10 +0900 Subject: [PATCH 240/582] Fix to work DOP with bock swap --- flux_train_network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flux_train_network.py b/flux_train_network.py index 704c4d32e..679db62b6 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -445,6 +445,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t if len(diff_output_pr_indices) > 0: network.set_multiplier(0.0) + unet.prepare_block_swap_before_forward() with torch.no_grad(): model_pred_prior = call_dit( img=packed_noisy_model_input[diff_output_pr_indices], From b2660bbe7410d7ffa40906a7a09f84a17139cb46 Mon Sep 17 00:00:00 2001 From: minux302 Date: Sun, 17 Nov 2024 10:24:57 +0000 Subject: [PATCH 241/582] train run --- flux_train_control_net.py | 39 ++++++++++++++++++++++--------------- library/flux_models.py | 30 ++++++++++++++-------------- library/flux_train_utils.py | 2 +- library/flux_utils.py | 2 +- 4 files changed, 40 insertions(+), 33 deletions(-) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index ee4d0ebf3..205ff6b6a 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -103,11 +103,11 @@ def train(args): # データセットを準備する if args.dataset_class is None: - blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) + blueprint_generator = BlueprintGenerator(ConfigSanitizer(False, False, True, True)) if args.dataset_config is not None: logger.info(f"Load dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) - ignored = ["train_data_dir", "conditioing_data_dir"] + ignored = ["train_data_dir", "conditioning_data_dir"] if any(getattr(args, attr) is not None for attr in ignored): logger.warning( "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( @@ -263,10 +263,11 @@ def train(args): args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors ) flux.requires_grad_(False) + flux.to(accelerator.device) # load controlnet controlnet = flux_utils.load_controlnet() - controlnet.requires_grad_(True) + controlnet.train() if args.gradient_checkpointing: 
controlnet.enable_gradient_checkpointing(cpu_offload=args.cpu_offload_checkpointing) @@ -443,7 +444,8 @@ def train(args): clean_memory_on_device(accelerator.device) - if args.deepspeed: + # if args.deepspeed: + if True: ds_model = deepspeed_utils.prepare_deepspeed_model(args, mmdit=controlnet) # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007 ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( @@ -612,8 +614,10 @@ def grad_hook(parameter: torch.Tensor): text_encoder_conds = text_encoding_strategy.encode_tokens( flux_tokenize_strategy, [clip_l, t5xxl], input_ids, args.apply_t5_attn_mask ) - if args.full_fp16: - text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] + # if args.full_fp16: + # text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] + # TODO: check + text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] # TODO support some features for noise implemented in get_noise_noisy_latents_and_timesteps @@ -629,10 +633,10 @@ def grad_hook(parameter: torch.Tensor): # pack latents and get img_ids packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4 packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2 - img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device) + img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device).to(weight_dtype) # get guidance: ensure args.guidance_scale is float - guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) + guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device, dtype=weight_dtype) # call model l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds @@ -640,10 +644,11 @@ def grad_hook(parameter: torch.Tensor): t5_attn_mask = None with accelerator.autocast(): + print("control start") block_samples, block_single_samples = controlnet( img=packed_noisy_model_input, img_ids=img_ids, - controlnet_img=batch["conditioing_image"].to(accelerator.device), + controlnet_cond=batch["conditioning_images"].to(accelerator.device).to(weight_dtype), txt=t5_out, txt_ids=txt_ids, y=l_pooled, @@ -651,6 +656,8 @@ def grad_hook(parameter: torch.Tensor): guidance=guidance_vec, txt_attention_mask=t5_attn_mask, ) + print("control end") + print("dit start") # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) model_pred = flux( img=packed_noisy_model_input, @@ -796,7 +803,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) # TODO split this - train_util.add_dataset_arguments(parser, True, True, True) + train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) train_util.add_masked_loss_arguments(parser) deepspeed_utils.add_deepspeed_arguments(parser) @@ -852,12 +859,12 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="controlnet model name or path / controlnetのモデル名またはパス", ) - parser.add_argument( - "--conditioning_data_dir", - type=str, - default=None, - help="conditioning data directory / 条件付けデータのディレクトリ", - ) + # parser.add_argument( + # "--conditioning_data_dir", + 
# type=str, + # default=None, + # help="conditioning data directory / 条件付けデータのディレクトリ", + # ) return parser diff --git a/library/flux_models.py b/library/flux_models.py index b52ea6f0b..2fc21db9d 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1042,20 +1042,20 @@ def forward( if not self.blocks_to_swap: for block_idx, block in enumerate(self.double_blocks): img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if block_controlnet_hidden_states is not None: + if block_controlnet_hidden_states is not None and controlnet_depth > 0: img = img + block_controlnet_hidden_states[block_idx % controlnet_depth] img = torch.cat((txt, img), 1) - for block in self.single_blocks: + for block_idx, block in enumerate(self.single_blocks): img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if block_controlnet_single_hidden_states is not None: + if block_controlnet_single_hidden_states is not None and controlnet_single_depth > 0: img = img + block_controlnet_single_hidden_states[block_idx % controlnet_single_depth] else: for block_idx, block in enumerate(self.double_blocks): self.offloader_double.wait_for_block(block_idx) img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if block_controlnet_hidden_states is not None: + if block_controlnet_hidden_states is not None and controlnet_depth > 0: img = img + block_controlnet_hidden_states[block_idx % controlnet_depth] self.offloader_double.submit_move_blocks(self.double_blocks, block_idx) @@ -1066,7 +1066,7 @@ def forward( self.offloader_single.wait_for_block(block_idx) img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - if block_controlnet_single_hidden_states is not None: + if block_controlnet_single_hidden_states is not None and controlnet_single_depth > 0: img = img + block_controlnet_single_hidden_states[block_idx % controlnet_single_depth] self.offloader_single.submit_move_blocks(self.single_blocks, block_idx) @@ -1121,14 +1121,14 @@ def __init__(self, params: FluxParams, controlnet_depth=2): mlp_ratio=params.mlp_ratio, qkv_bias=params.qkv_bias, ) - for _ in range(params.depth) + for _ in range(controlnet_depth) ] ) self.single_blocks = nn.ModuleList( [ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio) - for _ in range(0) # TMP + for _ in range(0) # TODO ] ) @@ -1148,7 +1148,7 @@ def __init__(self, params: FluxParams, controlnet_depth=2): controlnet_block = zero_module(controlnet_block) self.controlnet_blocks_for_double.append(controlnet_block) self.controlnet_blocks_for_single = nn.ModuleList([]) - for _ in range(controlnet_depth): + for _ in range(0): # TODO controlnet_block = nn.Linear(self.hidden_size, self.hidden_size) controlnet_block = zero_module(controlnet_block) self.controlnet_blocks_for_single.append(controlnet_block) @@ -1252,7 +1252,7 @@ def forward( self, img: Tensor, img_ids: Tensor, - controlnet_img: Tensor, + controlnet_cond: Tensor, txt: Tensor, txt_ids: Tensor, timesteps: Tensor, @@ -1265,10 +1265,10 @@ def forward( # running on sequences img img = self.img_in(img) - controlnet_img = self.input_hint_block(controlnet_img) - controlnet_img = rearrange(controlnet_img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) - controlnet_img = self.pos_embed_input(controlnet_img) - img = img + controlnet_img + controlnet_cond = self.input_hint_block(controlnet_cond) + controlnet_cond = rearrange(controlnet_cond, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) + 
controlnet_cond = self.pos_embed_input(controlnet_cond) + img = img + controlnet_cond vec = self.time_in(timestep_embedding(timesteps, 256)) if self.params.guidance_embed: if guidance is None: @@ -1283,7 +1283,7 @@ def forward( block_samples = () block_single_samples = () if not self.blocks_to_swap: - for block_idx, block in enumerate(self.double_blocks): + for block in self.double_blocks: img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) block_samples = block_samples + (img,) @@ -1315,7 +1315,7 @@ def forward( for block_sample, controlnet_block in zip(block_samples, self.controlnet_blocks_for_double): block_sample = controlnet_block(block_sample) controlnet_block_samples = controlnet_block_samples + (block_sample,) - for block_sample, controlnet_block in zip(block_samples, self.controlnet_single_blocks_for_single): + for block_sample, controlnet_block in zip(block_samples, self.controlnet_blocks_for_single): block_sample = controlnet_block(block_sample) controlnet_single_block_samples = controlnet_single_block_samples + (block_sample,) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index cc3bcb0ec..d82bde91c 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -460,7 +460,7 @@ def get_noisy_model_input_and_timesteps( sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents - return noisy_model_input, timesteps, sigmas + return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas): diff --git a/library/flux_utils.py b/library/flux_utils.py index 7b538d133..4a3817fdb 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -157,7 +157,7 @@ def load_controlnet(): # TODO is_schnell = False name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL - with torch.device("meta"): + with torch.device("cuda:0"): controlnet = flux_models.ControlNetFlux(flux_models.configs[name].params) # if transformer is not None: # controlnet.load_state_dict(transformer.state_dict(), strict=False) From 35778f021897796410372aed8540547ba317c2a3 Mon Sep 17 00:00:00 2001 From: minux302 Date: Sun, 17 Nov 2024 11:09:05 +0000 Subject: [PATCH 242/582] fix sample_images type --- flux_train_control_net.py | 31 ++++++++++++++----------------- library/flux_train_utils.py | 2 +- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index 205ff6b6a..791900d17 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -444,8 +444,7 @@ def train(args): clean_memory_on_device(accelerator.device) - # if args.deepspeed: - if True: + if args.deepspeed: ds_model = deepspeed_utils.prepare_deepspeed_model(args, mmdit=controlnet) # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. 
# pull/1139#issuecomment-1986790007 ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( @@ -644,7 +643,6 @@ def grad_hook(parameter: torch.Tensor): t5_attn_mask = None with accelerator.autocast(): - print("control start") block_samples, block_single_samples = controlnet( img=packed_noisy_model_input, img_ids=img_ids, @@ -656,8 +654,6 @@ def grad_hook(parameter: torch.Tensor): guidance=guidance_vec, txt_attention_mask=t5_attn_mask, ) - print("control end") - print("dit start") # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) model_pred = flux( img=packed_noisy_model_input, @@ -763,18 +759,19 @@ def grad_hook(parameter: torch.Tensor): accelerator.wait_for_everyone() optimizer_eval_fn() - if args.save_every_n_epochs is not None: - if accelerator.is_main_process: - flux_train_utils.save_flux_model_on_epoch_end_or_stepwise( - args, - True, - accelerator, - save_dtype, - epoch, - num_train_epochs, - global_step, - accelerator.unwrap_model(flux), - ) + # TODO: save cn models + # if args.save_every_n_epochs is not None: + # if accelerator.is_main_process: + # flux_train_utils.save_flux_model_on_epoch_end_or_stepwise( + # args, + # True, + # accelerator, + # save_dtype, + # epoch, + # num_train_epochs, + # global_step, + # accelerator.unwrap_model(flux), + # ) flux_train_utils.sample_images( accelerator, args, epoch + 1, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index d82bde91c..de2ee030a 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -235,7 +235,7 @@ def sample_image_inference( with accelerator.autocast(), torch.no_grad(): x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask, controlnet=controlnet, controlnet_img=controlnet_image) - x = x.float() + # x = x.float() # TODO: check x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width) # latent to image From 4dd4cd6ec8c55fa94b53217181ed9c95e59eed56 Mon Sep 17 00:00:00 2001 From: minux302 Date: Mon, 18 Nov 2024 12:47:01 +0000 Subject: [PATCH 243/582] work cn load and validation --- flux_train_control_net.py | 20 ++++---------------- library/flux_models.py | 6 +++--- library/flux_train_utils.py | 18 ++++++++++++++---- library/flux_utils.py | 25 ++++++++++++++++--------- 4 files changed, 37 insertions(+), 32 deletions(-) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index 791900d17..cbfac418f 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -266,7 +266,7 @@ def train(args): flux.to(accelerator.device) # load controlnet - controlnet = flux_utils.load_controlnet() + controlnet = flux_utils.load_controlnet(args.controlnet, weight_dtype, "cpu", args.disable_mmap_load_safetensors) controlnet.train() if args.gradient_checkpointing: @@ -568,7 +568,7 @@ def grad_hook(parameter: torch.Tensor): # For --sample_at_first optimizer_eval_fn() - flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs) + flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs, controlnet=controlnet) optimizer_train_fn() if len(accelerator.trackers) > 0: # log empty object to commit the sample images to wandb @@ -718,7 +718,7 @@ def grad_hook(parameter: 
torch.Tensor): optimizer_eval_fn() flux_train_utils.sample_images( - accelerator, args, None, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs + accelerator, args, None, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs, controlnet=controlnet ) # 指定ステップごとにモデルを保存 @@ -774,7 +774,7 @@ def grad_hook(parameter: torch.Tensor): # ) flux_train_utils.sample_images( - accelerator, args, epoch + 1, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs + accelerator, args, epoch + 1, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs, controlnet=controlnet ) optimizer_train_fn() @@ -850,18 +850,6 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="[EXPERIMENTAL] enable offloading of tensors to CPU during checkpointing / チェックポイント時にテンソルをCPUにオフロードする", ) - parser.add_argument( - "--controlnet_model_name_or_path", - type=str, - default=None, - help="controlnet model name or path / controlnetのモデル名またはパス", - ) - # parser.add_argument( - # "--conditioning_data_dir", - # type=str, - # default=None, - # help="conditioning data directory / 条件付けデータのディレクトリ", - # ) return parser diff --git a/library/flux_models.py b/library/flux_models.py index 2fc21db9d..4123b40e5 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1142,11 +1142,11 @@ def __init__(self, params: FluxParams, controlnet_depth=2): self.num_single_blocks = len(self.single_blocks) # add ControlNet blocks - self.controlnet_blocks_for_double = nn.ModuleList([]) + self.controlnet_blocks = nn.ModuleList([]) for _ in range(controlnet_depth): controlnet_block = nn.Linear(self.hidden_size, self.hidden_size) controlnet_block = zero_module(controlnet_block) - self.controlnet_blocks_for_double.append(controlnet_block) + self.controlnet_blocks.append(controlnet_block) self.controlnet_blocks_for_single = nn.ModuleList([]) for _ in range(0): # TODO controlnet_block = nn.Linear(self.hidden_size, self.hidden_size) @@ -1312,7 +1312,7 @@ def forward( controlnet_block_samples = () controlnet_single_block_samples = () - for block_sample, controlnet_block in zip(block_samples, self.controlnet_blocks_for_double): + for block_sample, controlnet_block in zip(block_samples, self.controlnet_blocks): block_sample = controlnet_block(block_sample) controlnet_block_samples = controlnet_block_samples + (block_sample,) for block_sample, controlnet_block in zip(block_samples, self.controlnet_blocks_for_single): diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index de2ee030a..dbbaba734 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -175,10 +175,6 @@ def sample_image_inference( # if negative_prompt is None: # negative_prompt = "" - if controlnet_image is not None: - controlnet_image = Image.open(controlnet_image).convert("RGB") - controlnet_image = controlnet_image.resize((width, height), Image.LANCZOS) - height = max(64, height - height % 16) # round to divisible by 16 width = max(64, width - width % 16) # round to divisible by 16 logger.info(f"prompt: {prompt}") @@ -232,6 +228,12 @@ def sample_image_inference( img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype) t5_attn_mask = t5_attn_mask.to(accelerator.device) if args.apply_t5_attn_mask else None + if controlnet_image is not None: + controlnet_image = Image.open(controlnet_image).convert("RGB") + controlnet_image = controlnet_image.resize((width, height), Image.LANCZOS) + controlnet_image = 
torch.from_numpy((np.array(controlnet_image) / 127.5) - 1) + controlnet_image = controlnet_image.permute(2, 0, 1).unsqueeze(0).to(weight_dtype).to(accelerator.device) + with accelerator.autocast(), torch.no_grad(): x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask, controlnet=controlnet, controlnet_img=controlnet_image) @@ -315,6 +317,8 @@ def denoise( ): # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) + + for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) model.prepare_block_swap_before_forward() @@ -560,6 +564,12 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): help="path to t5xxl (*.sft or *.safetensors), should be float16 / t5xxlのパス(*.sftまたは*.safetensors)、float16が前提", ) parser.add_argument("--ae", type=str, help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)") + parser.add_argument( + "--controlnet", + type=str, + default=None, + help="path to controlnet (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)" + ) parser.add_argument( "--t5xxl_max_token_length", type=int, diff --git a/library/flux_utils.py b/library/flux_utils.py index 4a3817fdb..fb7a30749 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -153,15 +153,22 @@ def load_ae( return ae -def load_controlnet(): - # TODO - is_schnell = False - name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL - with torch.device("cuda:0"): - controlnet = flux_models.ControlNetFlux(flux_models.configs[name].params) - # if transformer is not None: - # controlnet.load_state_dict(transformer.state_dict(), strict=False) - return controlnet +def load_controlnet( + ckpt_path: Optional[str], dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False +): + logger.info("Building ControlNet") + # is_diffusers, is_schnell, (num_double_blocks, num_single_blocks), ckpt_paths = analyze_checkpoint_state(ckpt_path) + is_schnell = False + name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL + with torch.device("meta"): + controlnet = flux_models.ControlNetFlux(flux_models.configs[name].params).to(dtype) + + if ckpt_path is not None: + logger.info(f"Loading state dict from {ckpt_path}") + sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) + info = controlnet.load_state_dict(sd, strict=False, assign=True) + logger.info(f"Loaded ControlNet: {info}") + return controlnet def load_clip_l( From 31ca899b6b5425466c814d0d9e2e4e8bfbf93001 Mon Sep 17 00:00:00 2001 From: minux302 Date: Mon, 18 Nov 2024 13:03:28 +0000 Subject: [PATCH 244/582] fix depth value --- library/flux_models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/library/flux_models.py b/library/flux_models.py index 4123b40e5..328ad481d 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1093,7 +1093,7 @@ class ControlNetFlux(nn.Module): Transformer model for flow matching on sequences. 
""" - def __init__(self, params: FluxParams, controlnet_depth=2): + def __init__(self, params: FluxParams, controlnet_depth=2, controlnet_single_depth=0): super().__init__() self.params = params @@ -1128,7 +1128,7 @@ def __init__(self, params: FluxParams, controlnet_depth=2): self.single_blocks = nn.ModuleList( [ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio) - for _ in range(0) # TODO + for _ in range(controlnet_single_depth) ] ) @@ -1148,7 +1148,7 @@ def __init__(self, params: FluxParams, controlnet_depth=2): controlnet_block = zero_module(controlnet_block) self.controlnet_blocks.append(controlnet_block) self.controlnet_blocks_for_single = nn.ModuleList([]) - for _ in range(0): # TODO + for _ in range(controlnet_single_depth): controlnet_block = nn.Linear(self.hidden_size, self.hidden_size) controlnet_block = zero_module(controlnet_block) self.controlnet_blocks_for_single.append(controlnet_block) From 2a61fc07846dc919ea64b568f7e18c010e5c8e06 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Wed, 20 Nov 2024 21:20:35 +0900 Subject: [PATCH 245/582] docs: fix typo from block_to_swap to blocks_to_swap in README --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 81a3199bc..f9c85e3ac 100644 --- a/README.md +++ b/README.md @@ -68,11 +68,11 @@ When training LoRA for Text Encoder (without `--network_train_unet_only`), more __Options for GPUs with less VRAM:__ -By specifying `--block_to_swap`, you can save VRAM by swapping some blocks between CPU and GPU. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. +By specifying `--blocks_to_swap`, you can save VRAM by swapping some blocks between CPU and GPU. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. -Specify a number like `--block_to_swap 10`. A larger number will swap more blocks, saving more VRAM, but training will be slower. In FLUX.1, you can swap up to 35 blocks. +Specify a number like `--blocks_to_swap 10`. A larger number will swap more blocks, saving more VRAM, but training will be slower. In FLUX.1, you can swap up to 35 blocks. -`--cpu_offload_checkpointing` offloads gradient checkpointing to CPU. This reduces up to 1GB of VRAM usage but slows down the training by about 15%. Cannot be used with `--block_to_swap`. +`--cpu_offload_checkpointing` offloads gradient checkpointing to CPU. This reduces up to 1GB of VRAM usage but slows down the training by about 15%. Cannot be used with `--blocks_to_swap`. Adafactor optimizer may reduce the VRAM usage than 8bit AdamW. Please use settings like below: @@ -82,7 +82,7 @@ Adafactor optimizer may reduce the VRAM usage than 8bit AdamW. Please use settin The training can be done with 16GB VRAM GPUs with the batch size of 1. Please change your dataset configuration. -The training can be done with 12GB VRAM GPUs with `--block_to_swap 16` with 8bit AdamW. Please use settings like below: +The training can be done with 12GB VRAM GPUs with `--blocks_to_swap 16` with 8bit AdamW. 
Please use settings like below: ``` --blocks_to_swap 16 From 0b5229a9550cb921b83d22472c4785a15c42ba90 Mon Sep 17 00:00:00 2001 From: minux302 Date: Thu, 21 Nov 2024 15:55:27 +0000 Subject: [PATCH 246/582] save cn --- flux_train_control_net.py | 34 +++++++++++++++------------------- library/flux_train_utils.py | 1 - 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index cbfac418f..0f38b7094 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -266,7 +266,7 @@ def train(args): flux.to(accelerator.device) # load controlnet - controlnet = flux_utils.load_controlnet(args.controlnet, weight_dtype, "cpu", args.disable_mmap_load_safetensors) + controlnet = flux_utils.load_controlnet(args.controlnet, torch.float32, "cpu", args.disable_mmap_load_safetensors) controlnet.train() if args.gradient_checkpointing: @@ -613,9 +613,6 @@ def grad_hook(parameter: torch.Tensor): text_encoder_conds = text_encoding_strategy.encode_tokens( flux_tokenize_strategy, [clip_l, t5xxl], input_ids, args.apply_t5_attn_mask ) - # if args.full_fp16: - # text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] - # TODO: check text_encoder_conds = [c.to(weight_dtype) for c in text_encoder_conds] # TODO support some features for noise implemented in get_noise_noisy_latents_and_timesteps @@ -733,7 +730,7 @@ def grad_hook(parameter: torch.Tensor): epoch, num_train_epochs, global_step, - accelerator.unwrap_model(flux), + accelerator.unwrap_model(controlnet), ) optimizer_train_fn() @@ -759,19 +756,18 @@ def grad_hook(parameter: torch.Tensor): accelerator.wait_for_everyone() optimizer_eval_fn() - # TODO: save cn models - # if args.save_every_n_epochs is not None: - # if accelerator.is_main_process: - # flux_train_utils.save_flux_model_on_epoch_end_or_stepwise( - # args, - # True, - # accelerator, - # save_dtype, - # epoch, - # num_train_epochs, - # global_step, - # accelerator.unwrap_model(flux), - # ) + if args.save_every_n_epochs is not None: + if accelerator.is_main_process: + flux_train_utils.save_flux_model_on_epoch_end_or_stepwise( + args, + True, + accelerator, + save_dtype, + epoch, + num_train_epochs, + global_step, + accelerator.unwrap_model(controlnet), + ) flux_train_utils.sample_images( accelerator, args, epoch + 1, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs, controlnet=controlnet @@ -791,7 +787,7 @@ def grad_hook(parameter: torch.Tensor): del accelerator # この後メモリを使うのでこれは消す if is_main_process: - flux_train_utils.save_flux_model_on_train_end(args, save_dtype, epoch, global_step, flux) + flux_train_utils.save_flux_model_on_train_end(args, save_dtype, epoch, global_step, controlnet) logger.info("model saved.") diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index dbbaba734..5e25c7feb 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -237,7 +237,6 @@ def sample_image_inference( with accelerator.autocast(), torch.no_grad(): x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask, controlnet=controlnet, controlnet_img=controlnet_image) - # x = x.float() # TODO: check x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width) # latent to image From 420a180d938c7b5a6e3006b1719dbfeaae72a2cc Mon Sep 17 00:00:00 2001 From: recris Date: Wed, 27 Nov 2024 18:11:51 +0000 Subject: [PATCH 247/582] Implement pseudo Huber loss for Flux and SD3 --- fine_tune.py | 6 +-- flux_train.py 
| 2 +- flux_train_network.py | 2 +- library/train_util.py | 74 ++++++++++++++++------------ sd3_train.py | 2 +- sd3_train_network.py | 2 +- sdxl_train.py | 6 +-- sdxl_train_control_net.py | 4 +- sdxl_train_control_net_lllite.py | 4 +- sdxl_train_control_net_lllite_old.py | 6 ++- train_controlnet.py | 6 +-- train_db.py | 4 +- train_network.py | 9 ++-- train_textual_inversion.py | 4 +- train_textual_inversion_XTI.py | 6 ++- 15 files changed, 76 insertions(+), 61 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 0090bd190..70959a751 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -380,7 +380,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( args, noise_scheduler, latents ) @@ -397,7 +397,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.min_snr_gamma or args.scale_v_pred_loss_like_noise_pred or args.debiased_estimation_loss: # do not mean over batch dimension for snr weight or scale v-pred loss loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) loss = loss.mean([1, 2, 3]) @@ -411,7 +411,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): loss = loss.mean() # mean over batch dimension else: loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) accelerator.backward(loss) diff --git a/flux_train.py b/flux_train.py index a89e2f139..f6e43b27a 100644 --- a/flux_train.py +++ b/flux_train.py @@ -667,7 +667,7 @@ def grad_hook(parameter: torch.Tensor): # calculate loss loss = train_util.conditional_loss( - model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=None + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) if weighting is not None: loss = loss * weighting diff --git a/flux_train_network.py b/flux_train_network.py index 679db62b6..04287f399 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -468,7 +468,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t ) target[diff_output_pr_indices] = model_pred_prior.to(target.dtype) - return model_pred, target, timesteps, None, weighting + return model_pred, target, timesteps, weighting def post_process_loss(self, loss, args, timesteps, noise_scheduler): return loss diff --git a/library/train_util.py b/library/train_util.py index 25cf7640d..c204ebd38 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3905,7 +3905,14 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: "--huber_c", type=float, default=0.1, - help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type. default is 0.1 / Huber損失のパラメータ。loss_typeがhuberまたはsmooth l1の場合に有効。デフォルトは0.1", + help="The Huber loss decay parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type. 
default is 0.1 / Huber損失のパラメータ。loss_typeがhuberまたはsmooth l1の場合に有効。デフォルトは0.1", + ) + + parser.add_argument( + "--huber_scale", + type=float, + default=1.0, + help="The Huber loss scale parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type. default is 1.0 / Huber損失のパラメータ。loss_typeがhuberまたはsmooth l1の場合に有効。デフォルトは0.1", ) parser.add_argument( @@ -5821,29 +5828,10 @@ def save_sd_model_on_train_end_common( huggingface_util.upload(args, out_dir, "/" + model_name, force_sync_upload=True) -def get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, device): - timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device="cpu") - - if args.loss_type == "huber" or args.loss_type == "smooth_l1": - if args.huber_schedule == "exponential": - alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps - huber_c = torch.exp(-alpha * timesteps) - elif args.huber_schedule == "snr": - alphas_cumprod = torch.index_select(noise_scheduler.alphas_cumprod, 0, timesteps) - sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5 - huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c - elif args.huber_schedule == "constant": - huber_c = torch.full((b_size,), args.huber_c) - else: - raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!") - huber_c = huber_c.to(device) - elif args.loss_type == "l2": - huber_c = None # may be anything, as it's not used - else: - raise NotImplementedError(f"Unknown loss type {args.loss_type}") - - timesteps = timesteps.long().to(device) - return timesteps, huber_c +def get_timesteps(min_timestep, max_timestep, b_size, device): + timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device=device) + timesteps = timesteps.long() + return timesteps def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents): @@ -5865,7 +5853,7 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents): min_timestep = 0 if args.min_timestep is None else args.min_timestep max_timestep = noise_scheduler.config.num_train_timesteps if args.max_timestep is None else args.max_timestep - timesteps, huber_c = get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, latents.device) + timesteps = get_timesteps(min_timestep, max_timestep, b_size, latents.device) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) @@ -5878,24 +5866,46 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents): else: noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - return noise, noisy_latents, timesteps, huber_c + return noise, noisy_latents, timesteps + + +def get_huber_threshold(args, timesteps: torch.Tensor, noise_scheduler) -> torch.Tensor: + b_size = timesteps.shape[0] + if args.huber_schedule == "exponential": + alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps + result = torch.exp(-alpha * timesteps) * args.huber_scale + elif args.huber_schedule == "snr": + if not hasattr(noise_scheduler, 'alphas_cumprod'): + raise NotImplementedError(f"Huber schedule 'snr' is not supported with the current model.") + alphas_cumprod = torch.index_select(noise_scheduler.alphas_cumprod, 0, timesteps.cpu()) + sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5 + result = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c + result = result.to(timesteps.device) + elif args.huber_schedule == "constant": + 
result = torch.full((b_size,), args.huber_c * args.huber_scale, device=timesteps.device) + else: + raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!") + + return result def conditional_loss( - model_pred: torch.Tensor, target: torch.Tensor, reduction: str, loss_type: str, huber_c: Optional[torch.Tensor] + args, model_pred: torch.Tensor, target: torch.Tensor, timesteps: torch.Tensor, reduction: str, noise_scheduler ): - if loss_type == "l2": + if args.loss_type == "l2": loss = torch.nn.functional.mse_loss(model_pred, target, reduction=reduction) - elif loss_type == "l1": + elif args.loss_type == "l1": loss = torch.nn.functional.l1_loss(model_pred, target, reduction=reduction) - elif loss_type == "huber": + elif args.loss_type == "huber": + huber_c = get_huber_threshold(args, timesteps, noise_scheduler) huber_c = huber_c.view(-1, 1, 1, 1) loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c) if reduction == "mean": loss = torch.mean(loss) elif reduction == "sum": loss = torch.sum(loss) - elif loss_type == "smooth_l1": + elif args.loss_type == "smooth_l1": + huber_c = get_huber_threshold(args, timesteps, noise_scheduler) huber_c = huber_c.view(-1, 1, 1, 1) loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c) if reduction == "mean": @@ -5903,7 +5913,7 @@ def conditional_loss( elif reduction == "sum": loss = torch.sum(loss) else: - raise NotImplementedError(f"Unsupported Loss Type {loss_type}") + raise NotImplementedError(f"Unsupported Loss Type: {args.loss_type}") return loss diff --git a/sd3_train.py b/sd3_train.py index 96ec951b9..cf2bdf938 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -845,7 +845,7 @@ def grad_hook(parameter: torch.Tensor): # ) # calculate loss loss = train_util.conditional_loss( - model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=None + args, model_pred.float(), target.float(), timesteps, "none", noise_scheduler ) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) diff --git a/sd3_train_network.py b/sd3_train_network.py index 1726e325f..fb7711bda 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -378,7 +378,7 @@ def get_noise_pred_and_target( target[diff_output_pr_indices] = model_pred_prior.to(target.dtype) - return model_pred, target, timesteps, None, weighting + return model_pred, target, timesteps, weighting def post_process_loss(self, loss, args, timesteps, noise_scheduler): return loss diff --git a/sdxl_train.py b/sdxl_train.py index e26f4aa19..1bc27ec6c 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -695,7 +695,7 @@ def optimizer_hook(parameter: torch.Tensor): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( args, noise_scheduler, latents ) @@ -720,7 +720,7 @@ def optimizer_hook(parameter: torch.Tensor): ): # do not mean over batch dimension for snr weight or scale v-pred loss loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = 
apply_masked_loss(loss, batch) @@ -738,7 +738,7 @@ def optimizer_hook(parameter: torch.Tensor): loss = loss.mean() # mean over batch dimension else: loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) accelerator.backward(loss) diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index 24080afbd..d0051d18f 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -512,7 +512,7 @@ def remove_model(old_ckpt_name): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( args, noise_scheduler, latents ) @@ -534,7 +534,7 @@ def remove_model(old_ckpt_name): target = noise loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) loss = loss.mean([1, 2, 3]) diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 2946c97d4..66214f5df 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -463,7 +463,7 @@ def remove_model(old_ckpt_name): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( args, noise_scheduler, latents ) @@ -485,7 +485,7 @@ def remove_model(old_ckpt_name): target = noise loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) loss = loss.mean([1, 2, 3]) diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py index 2d4465234..5e10654b9 100644 --- a/sdxl_train_control_net_lllite_old.py +++ b/sdxl_train_control_net_lllite_old.py @@ -406,7 +406,7 @@ def remove_model(old_ckpt_name): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype @@ -426,7 +426,9 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler + ) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/train_controlnet.py b/train_controlnet.py index 8c7882c8f..da7a08d69 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -464,8 +464,8 @@ def remove_model(old_ckpt_name): ) # Sample a 
random timestep for each image - timesteps, huber_c = train_util.get_timesteps_and_huber_c( - args, 0, noise_scheduler.config.num_train_timesteps, noise_scheduler, b_size, latents.device + timesteps = train_util.get_timesteps( + 0, noise_scheduler.config.num_train_timesteps, b_size, latents.device ) # Add noise to the latents according to the noise magnitude at each timestep @@ -499,7 +499,7 @@ def remove_model(old_ckpt_name): target = noise loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) loss = loss.mean([1, 2, 3]) diff --git a/train_db.py b/train_db.py index 51e209f34..a185b31b3 100644 --- a/train_db.py +++ b/train_db.py @@ -370,7 +370,7 @@ def train(args): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( args, noise_scheduler, latents ) @@ -385,7 +385,7 @@ def train(args): target = noise loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) diff --git a/train_network.py b/train_network.py index bbf381f99..c7d4f5dc5 100644 --- a/train_network.py +++ b/train_network.py @@ -192,7 +192,7 @@ def get_noise_pred_and_target( ): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) # ensure the hidden state will require grad if args.gradient_checkpointing: @@ -244,7 +244,7 @@ def get_noise_pred_and_target( network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step target[diff_output_pr_indices] = noise_pred_prior.to(target.dtype) - return noise_pred, target, timesteps, huber_c, None + return noise_pred, target, timesteps, None def post_process_loss(self, loss, args, timesteps, noise_scheduler): if args.min_snr_gamma: @@ -806,6 +806,7 @@ def load_model_hook(models, input_dir): "ss_ip_noise_gamma_random_strength": args.ip_noise_gamma_random_strength, "ss_loss_type": args.loss_type, "ss_huber_schedule": args.huber_schedule, + "ss_huber_scale": args.huber_scale, "ss_huber_c": args.huber_c, "ss_fp8_base": bool(args.fp8_base), "ss_fp8_base_unet": bool(args.fp8_base_unet), @@ -1193,7 +1194,7 @@ def remove_model(old_ckpt_name): text_encoder_conds[i] = encoded_text_encoder_conds[i] # sample noise, call unet, get target - noise_pred, target, timesteps, huber_c, weighting = self.get_noise_pred_and_target( + noise_pred, target, timesteps, weighting = self.get_noise_pred_and_target( args, accelerator, noise_scheduler, @@ -1207,7 +1208,7 @@ def remove_model(old_ckpt_name): ) loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), 
target.float(), timesteps, "none", noise_scheduler ) if weighting is not None: loss = loss * weighting diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 5f4657eb9..9e1e57c48 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -585,7 +585,7 @@ def remove_model(old_ckpt_name): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( args, noise_scheduler, latents ) @@ -602,7 +602,7 @@ def remove_model(old_ckpt_name): target = noise loss = train_util.conditional_loss( - noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler ) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 52d525fc5..944733602 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -461,7 +461,7 @@ def remove_model(old_ckpt_name): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) # Predict the noise residual with accelerator.autocast(): @@ -473,7 +473,9 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler + ) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) From 740ec1d5265fa321659589ae6a75a4a9898ef8be Mon Sep 17 00:00:00 2001 From: recris Date: Thu, 28 Nov 2024 20:38:32 +0000 Subject: [PATCH 248/582] Fix issues found in review --- fine_tune.py | 2 +- library/train_util.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 70959a751..401a40f08 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -411,7 +411,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): loss = loss.mean() # mean over batch dimension else: loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler + args, noise_pred.float(), target.float(), timesteps, "mean", noise_scheduler ) accelerator.backward(loss) diff --git a/library/train_util.py b/library/train_util.py index c204ebd38..eaf6ec004 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5829,8 +5829,8 @@ def save_sd_model_on_train_end_common( def get_timesteps(min_timestep, max_timestep, b_size, device): - timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device=device) - timesteps = timesteps.long() + timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device="cpu") + timesteps = timesteps.long().to(device) return timesteps @@ -5875,8 +5875,8 @@ def 
get_huber_threshold(args, timesteps: torch.Tensor, noise_scheduler) -> torch alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps result = torch.exp(-alpha * timesteps) * args.huber_scale elif args.huber_schedule == "snr": - if not hasattr(noise_scheduler, 'alphas_cumprod'): - raise NotImplementedError(f"Huber schedule 'snr' is not supported with the current model.") + if not hasattr(noise_scheduler, "alphas_cumprod"): + raise NotImplementedError("Huber schedule 'snr' is not supported with the current model.") alphas_cumprod = torch.index_select(noise_scheduler.alphas_cumprod, 0, timesteps.cpu()) sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5 result = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c From 575f583fd9cbaf7f7b644a31437ed9094810b99a Mon Sep 17 00:00:00 2001 From: minux302 Date: Fri, 29 Nov 2024 23:55:52 +0900 Subject: [PATCH 249/582] add README --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index f9c85e3ac..2b1ca3f8c 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ Nov 14, 2024: - [Key Features for FLUX.1 LoRA training](#key-features-for-flux1-lora-training) - [Specify rank for each layer in FLUX.1](#specify-rank-for-each-layer-in-flux1) - [Specify blocks to train in FLUX.1 LoRA training](#specify-blocks-to-train-in-flux1-lora-training) +- [FLUX.1 ControlNet training](#flux1-controlnet-training) - [FLUX.1 OFT training](#flux1-oft-training) - [Inference for FLUX.1 with LoRA model](#inference-for-flux1-with-lora-model) - [FLUX.1 fine-tuning](#flux1-fine-tuning) @@ -245,6 +246,22 @@ example: If you specify one of `train_double_block_indices` or `train_single_block_indices`, the other will be trained as usual. +### FLUX.1 ControlNet training +We have added a new training script for ControlNet training. The script is flux_train_control_net.py. See --help for options. + +Sample command is below. It will work with 80GB VRAM GPUs. +``` +accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_control_net.py +--pretrained_model_name_or_path flux1-dev.safetensors --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors +--ae ae.safetensors --save_model_as safetensors --sdpa --persistent_data_loader_workers +--max_data_loader_n_workers 1 --seed 42 --gradient_checkpointing --mixed_precision bf16 +--optimizer_type adamw8bit --learning_rate 2e-5 +--highvram --max_train_epochs 1 --save_every_n_steps 1000 --dataset_config dataset.toml +--output_dir /path/to/output/dir --output_name flux-cn +--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 --deepspeed +``` + + ### FLUX.1 OFT training You can train OFT with almost the same options as LoRA, such as `--timestamp_sampling`. The following points are different. 
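As background for the pseudo-Huber loss that the surrounding patches wire through `conditional_loss`: it behaves like a squared error for small residuals and like a (scaled) absolute error for large ones, with the crossover controlled by a per-timestep threshold. The sketch below is a standalone illustration in plain PyTorch, independent of the library code; the constants mirror the documented defaults of `--huber_c` (0.1) and `--huber_scale` (1.0) plus a 1000-step scheduler, and the tensor shapes are only examples.

```
import math

import torch

huber_c = 0.1         # illustrative default of --huber_c
huber_scale = 1.0     # illustrative default of --huber_scale
num_train_timesteps = 1000


def pseudo_huber(pred: torch.Tensor, target: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
    # 2*c*(sqrt(d^2 + c^2) - c): ~d^2 when |d| << c, ~2*c*|d| when |d| >> c
    return 2 * c * (torch.sqrt((pred - target) ** 2 + c**2) - c)


def exponential_threshold(timesteps: torch.Tensor) -> torch.Tensor:
    # "exponential" schedule: decays from huber_scale at t=0 to huber_c * huber_scale at t=num_train_timesteps
    alpha = -math.log(huber_c) / num_train_timesteps
    return torch.exp(-alpha * timesteps) * huber_scale


timesteps = torch.randint(0, num_train_timesteps, (4,))
c = exponential_threshold(timesteps).view(-1, 1, 1, 1)           # one threshold per sample
pred, target = torch.randn(4, 4, 64, 64), torch.randn(4, 4, 64, 64)
per_sample_loss = pseudo_huber(pred, target, c).mean([1, 2, 3])  # mean over C, H, W as in the training scripts
```

In the training scripts this path is selected with `--loss_type huber` (or `--loss_type smooth_l1`, which drops the leading `huber_c` factor), together with `--huber_c`, `--huber_scale` and `--huber_schedule`; the sketch only mirrors the math, not the actual library entry points.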
From be5860f8e266c5562f123fe9e0cb3febef615290 Mon Sep 17 00:00:00 2001 From: minux302 Date: Sat, 30 Nov 2024 00:08:21 +0900 Subject: [PATCH 250/582] add schnell option to load_cn --- flux_train_control_net.py | 4 ++-- library/flux_utils.py | 14 ++++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index a17c811e3..bb27c35ed 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -259,14 +259,14 @@ def train(args): clean_memory_on_device(accelerator.device) # load FLUX - _, flux = flux_utils.load_flow_model( + is_schnell, flux = flux_utils.load_flow_model( args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors ) flux.requires_grad_(False) flux.to(accelerator.device) # load controlnet - controlnet = flux_utils.load_controlnet(args.controlnet, torch.float32, accelerator.device, args.disable_mmap_load_safetensors) + controlnet = flux_utils.load_controlnet(args.controlnet, is_schnell, torch.float32, accelerator.device, args.disable_mmap_load_safetensors) controlnet.train() if args.gradient_checkpointing: diff --git a/library/flux_utils.py b/library/flux_utils.py index f2759c375..8be1d63ee 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -1,14 +1,14 @@ -from dataclasses import replace import json import os +from dataclasses import replace from typing import List, Optional, Tuple, Union + import einops import torch - -from safetensors.torch import load_file -from safetensors import safe_open from accelerate import init_empty_weights -from transformers import CLIPTextModel, CLIPConfig, T5EncoderModel, T5Config +from safetensors import safe_open +from safetensors.torch import load_file +from transformers import CLIPConfig, CLIPTextModel, T5Config, T5EncoderModel from library.utils import setup_logging @@ -154,11 +154,9 @@ def load_ae( def load_controlnet( - ckpt_path: Optional[str], dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False + ckpt_path: Optional[str], is_schnell: bool, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False ): logger.info("Building ControlNet") - # is_diffusers, is_schnell, (num_double_blocks, num_single_blocks), ckpt_paths = analyze_checkpoint_state(ckpt_path) - is_schnell = False name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL with torch.device(device): controlnet = flux_models.ControlNetFlux(flux_models.configs[name].params).to(dtype) From f40632bac6704886a7640c327d64820f8f017df8 Mon Sep 17 00:00:00 2001 From: minux302 Date: Sat, 30 Nov 2024 00:15:47 +0900 Subject: [PATCH 251/582] rm abundant arg --- flux_train_network.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index 314335366..fa3810e34 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -6,12 +6,21 @@ import torch from accelerate import Accelerator -from library.device_utils import init_ipex, clean_memory_on_device + +from library.device_utils import clean_memory_on_device, init_ipex init_ipex() -from library import flux_models, flux_train_utils, flux_utils, sd3_train_utils, strategy_base, strategy_flux, train_util import train_network +from library import ( + flux_models, + flux_train_utils, + flux_utils, + sd3_train_utils, + strategy_base, + strategy_flux, + train_util, +) from library.utils import setup_logging setup_logging() @@ -125,7 +134,7 @@ def load_target_model(self, args, weight_dtype, 
accelerator): ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) - return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model, controlnet + return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model def get_tokenize_strategy(self, args): _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) From 928b9393daac252d0b6c4c9dd277d549b3dad8e9 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 20 Nov 2024 11:15:30 -0500 Subject: [PATCH 252/582] Allow unknown schedule-free optimizers to continue to module loader --- library/train_util.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 25cf7640d..74050880a 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4600,7 +4600,7 @@ def task(): def get_optimizer(args, trainable_params): # "Optimizer to use: AdamW, AdamW8bit, Lion, SGDNesterov, SGDNesterov8bit, PagedAdamW, PagedAdamW8bit, PagedAdamW32bit, Lion8bit, PagedLion8bit, AdEMAMix8bit, PagedAdEMAMix8bit, DAdaptation(DAdaptAdamPreprint), DAdaptAdaGrad, DAdaptAdam, DAdaptAdan, DAdaptAdanIP, DAdaptLion, DAdaptSGD, Adafactor" - + optimizer_type = args.optimizer_type if args.use_8bit_adam: assert ( @@ -4874,6 +4874,7 @@ def get_optimizer(args, trainable_params): optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) elif optimizer_type.endswith("schedulefree".lower()): + should_train_optimizer = True try: import schedulefree as sf except ImportError: @@ -4885,10 +4886,10 @@ def get_optimizer(args, trainable_params): optimizer_class = sf.SGDScheduleFree logger.info(f"use SGDScheduleFree optimizer | {optimizer_kwargs}") else: - raise ValueError(f"Unknown optimizer type: {optimizer_type}") - optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) - # make optimizer as train mode: we don't need to call train again, because eval will not be called in training loop - optimizer.train() + optimizer_class = None + + if optimizer_class is not None: + optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) if optimizer is None: # 任意のoptimizerを使う @@ -4990,6 +4991,10 @@ def __instancecheck__(self, instance): optimizer_name = optimizer_class.__module__ + "." 
+ optimizer_class.__name__ optimizer_args = ",".join([f"{k}={v}" for k, v in optimizer_kwargs.items()]) + if hasattr(optimizer, 'train') and callable(optimizer.train): + # make optimizer as train mode: we don't need to call train again, because eval will not be called in training loop + optimizer.train() + return optimizer_name, optimizer_args, optimizer From 87f5224e2d19254748158939cbca75802fc024f2 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 20 Nov 2024 11:57:15 -0500 Subject: [PATCH 253/582] Support d*lr for ProdigyPlus optimizer --- train_network.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index bbf381f99..65962bd74 100644 --- a/train_network.py +++ b/train_network.py @@ -61,6 +61,7 @@ def generate_step_logs( avr_loss, lr_scheduler, lr_descriptions, + optimizer=None, keys_scaled=None, mean_norm=None, maximum_norm=None, @@ -93,6 +94,30 @@ def generate_step_logs( logs[f"lr/d*lr/{lr_desc}"] = ( lr_scheduler.optimizers[-1].param_groups[i]["d"] * lr_scheduler.optimizers[-1].param_groups[i]["lr"] ) + if ( + args.optimizer_type.lower().endswith("ProdigyPlusScheduleFree".lower()) and optimizer is not None + ): # tracking d*lr value of unet. + logs["lr/d*lr"] = ( + optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] + ) + else: + idx = 0 + if not args.network_train_unet_only: + logs["lr/textencoder"] = float(lrs[0]) + idx = 1 + + for i in range(idx, len(lrs)): + logs[f"lr/group{i}"] = float(lrs[i]) + if args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower(): + logs[f"lr/d*lr/group{i}"] = ( + lr_scheduler.optimizers[-1].param_groups[i]["d"] * lr_scheduler.optimizers[-1].param_groups[i]["lr"] + ) + if ( + args.optimizer_type.lower().endswith("ProdigyPlusScheduleFree".lower()) and optimizer is not None + ): + logs[f"lr/d*lr/group{i}"] = ( + optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] + ) return logs @@ -1279,7 +1304,7 @@ def remove_model(old_ckpt_name): if len(accelerator.trackers) > 0: logs = self.generate_step_logs( - args, current_loss, avr_loss, lr_scheduler, lr_descriptions, keys_scaled, mean_norm, maximum_norm + args, current_loss, avr_loss, lr_scheduler, lr_descriptions, optimizer, keys_scaled, mean_norm, maximum_norm ) accelerator.log(logs, step=global_step) From 6593cfbec14c0be70407b5d6d85d569ecf8160f1 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 21 Nov 2024 14:41:37 -0500 Subject: [PATCH 254/582] Fix d * lr step log --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 65962bd74..c236a2c95 100644 --- a/train_network.py +++ b/train_network.py @@ -116,7 +116,7 @@ def generate_step_logs( args.optimizer_type.lower().endswith("ProdigyPlusScheduleFree".lower()) and optimizer is not None ): logs[f"lr/d*lr/group{i}"] = ( - optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] + optimizer.param_groups[i]["d"] * optimizer.param_groups[i]["lr"] ) return logs From c7cadbc8c73b48eaacbfb44b18121d20df373e19 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 29 Nov 2024 15:52:03 -0500 Subject: [PATCH 255/582] Add pytest testing --- .github/workflows/tests.yml | 54 +++++++++++++ library/train_util.py | 4 +- pytest.ini | 7 ++ tests/test_optimizer.py | 153 ++++++++++++++++++++++++++++++++++++ 4 files changed, 216 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/tests.yml create mode 100644 pytest.ini create mode 
100644 tests/test_optimizer.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..50b08243a --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,54 @@ + +name: Python package + +on: [push] + +jobs: + build: + + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + python-version: ["3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install dependencies + run: python -m pip install --upgrade pip setuptools wheel + + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + - name: Test with pytest + run: | + pip install pytest pytest-cov + pytest --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html + + - name: Upload pytest test results + uses: actions/upload-artifact@v4 + with: + name: pytest-results-${{ matrix.python-version }} + path: junit/test-results-${{ matrix.python-version }}.xml + # Use always() to always run this step to publish test results when there are test failures + if: ${{ always() }} diff --git a/library/train_util.py b/library/train_util.py index 25cf7640d..823cd3663 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -21,7 +21,7 @@ Optional, Sequence, Tuple, - Union, + Union ) from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs, PartialState import glob @@ -4598,7 +4598,7 @@ def task(): accelerator.load_state(dirname) -def get_optimizer(args, trainable_params): +def get_optimizer(args, trainable_params) -> tuple[str, str, object]: # "Optimizer to use: AdamW, AdamW8bit, Lion, SGDNesterov, SGDNesterov8bit, PagedAdamW, PagedAdamW8bit, PagedAdamW32bit, Lion8bit, PagedLion8bit, AdEMAMix8bit, PagedAdEMAMix8bit, DAdaptation(DAdaptAdamPreprint), DAdaptAdaGrad, DAdaptAdam, DAdaptAdan, DAdaptAdanIP, DAdaptLion, DAdaptSGD, Adafactor" optimizer_type = args.optimizer_type diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 000000000..63e03efc5 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +minversion = 6.0 +testpaths = + tests +filterwarnings = + ignore::DeprecationWarning + ignore::UserWarning diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py new file mode 100644 index 000000000..f6ade91a6 --- /dev/null +++ b/tests/test_optimizer.py @@ -0,0 +1,153 @@ +from unittest.mock import patch +from library.train_util import get_optimizer +from train_network import setup_parser +import torch +from torch.nn import Parameter + +# Optimizer libraries +import bitsandbytes as bnb +from lion_pytorch import lion_pytorch +import schedulefree + +import dadaptation +import dadaptation.experimental as dadapt_experimental + +import prodigyopt +import schedulefree as sf +import transformers + + +def test_default_get_optimizer(): + with patch("sys.argv", [""]): + parser = setup_parser() + args = parser.parse_args() + params_t = torch.tensor([1.5, 1.5]) + + param = Parameter(params_t) + optimizer_name, optimizer_args, optimizer = get_optimizer(args, [param]) + assert optimizer_name == "torch.optim.adamw.AdamW" + 
assert optimizer_args == "" + assert isinstance(optimizer, torch.optim.AdamW) + + +def test_get_schedulefree_optimizer(): + with patch("sys.argv", ["", "--optimizer_type", "AdamWScheduleFree"]): + parser = setup_parser() + args = parser.parse_args() + params_t = torch.tensor([1.5, 1.5]) + + param = Parameter(params_t) + optimizer_name, optimizer_args, optimizer = get_optimizer(args, [param]) + assert optimizer_name == "schedulefree.adamw_schedulefree.AdamWScheduleFree" + assert optimizer_args == "" + assert isinstance(optimizer, schedulefree.adamw_schedulefree.AdamWScheduleFree) + + +def test_all_supported_optimizers(): + optimizers = [ + { + "name": "bitsandbytes.optim.adamw.AdamW8bit", + "alias": "AdamW8bit", + "instance": bnb.optim.AdamW8bit, + }, + { + "name": "lion_pytorch.lion_pytorch.Lion", + "alias": "Lion", + "instance": lion_pytorch.Lion, + }, + { + "name": "torch.optim.adamw.AdamW", + "alias": "AdamW", + "instance": torch.optim.AdamW, + }, + { + "name": "bitsandbytes.optim.lion.Lion8bit", + "alias": "Lion8bit", + "instance": bnb.optim.Lion8bit, + }, + { + "name": "bitsandbytes.optim.adamw.PagedAdamW8bit", + "alias": "PagedAdamW8bit", + "instance": bnb.optim.PagedAdamW8bit, + }, + { + "name": "bitsandbytes.optim.lion.PagedLion8bit", + "alias": "PagedLion8bit", + "instance": bnb.optim.PagedLion8bit, + }, + { + "name": "bitsandbytes.optim.adamw.PagedAdamW", + "alias": "PagedAdamW", + "instance": bnb.optim.PagedAdamW, + }, + { + "name": "bitsandbytes.optim.adamw.PagedAdamW32bit", + "alias": "PagedAdamW32bit", + "instance": bnb.optim.PagedAdamW32bit, + }, + {"name": "torch.optim.sgd.SGD", "alias": "SGD", "instance": torch.optim.SGD}, + { + "name": "dadaptation.experimental.dadapt_adam_preprint.DAdaptAdamPreprint", + "alias": "DAdaptAdamPreprint", + "instance": dadapt_experimental.DAdaptAdamPreprint, + }, + { + "name": "dadaptation.dadapt_adagrad.DAdaptAdaGrad", + "alias": "DAdaptAdaGrad", + "instance": dadaptation.DAdaptAdaGrad, + }, + { + "name": "dadaptation.dadapt_adan.DAdaptAdan", + "alias": "DAdaptAdan", + "instance": dadaptation.DAdaptAdan, + }, + { + "name": "dadaptation.experimental.dadapt_adan_ip.DAdaptAdanIP", + "alias": "DAdaptAdanIP", + "instance": dadapt_experimental.DAdaptAdanIP, + }, + { + "name": "dadaptation.dadapt_lion.DAdaptLion", + "alias": "DAdaptLion", + "instance": dadaptation.DAdaptLion, + }, + { + "name": "dadaptation.dadapt_sgd.DAdaptSGD", + "alias": "DAdaptSGD", + "instance": dadaptation.DAdaptSGD, + }, + { + "name": "prodigyopt.prodigy.Prodigy", + "alias": "Prodigy", + "instance": prodigyopt.Prodigy, + }, + { + "name": "transformers.optimization.Adafactor", + "alias": "Adafactor", + "instance": transformers.optimization.Adafactor, + }, + { + "name": "schedulefree.adamw_schedulefree.AdamWScheduleFree", + "alias": "AdamWScheduleFree", + "instance": sf.AdamWScheduleFree, + }, + { + "name": "schedulefree.sgd_schedulefree.SGDScheduleFree", + "alias": "SGDScheduleFree", + "instance": sf.SGDScheduleFree, + }, + ] + + for opt in optimizers: + with patch("sys.argv", ["", "--optimizer_type", opt.get("alias")]): + parser = setup_parser() + args = parser.parse_args() + params_t = torch.tensor([1.5, 1.5]) + + param = Parameter(params_t) + optimizer_name, _, optimizer = get_optimizer(args, [param]) + assert optimizer_name == opt.get("name") + + instance = opt.get("instance") + assert instance is not None + assert isinstance(optimizer, instance) From 2dd063a679effae2538c474fece1e7aacad0c9c5 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 29 Nov 2024 15:57:31 -0500 
Subject: [PATCH 256/582] add torch torchvision accelerate versions --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 50b08243a..96ab612d8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -40,6 +40,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt + pip install torch==2.4.0 torchvision==0.19.0 accelerate==0.33.0 - name: Test with pytest run: | pip install pytest pytest-cov From e59e276fb948a1dc8a64672d8fd6d3a7eb166c80 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 29 Nov 2024 16:03:29 -0500 Subject: [PATCH 257/582] Add dadaptation --- .github/workflows/tests.yml | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 96ab612d8..433c326bf 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,7 +10,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ["3.10", "3.11"] + python-version: ["3.10"] steps: - uses: actions/checkout@v4 @@ -26,30 +26,14 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.x' + cache: 'pip' # caching pip dependencies - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r requirements.txt - - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - pip install torch==2.4.0 torchvision==0.19.0 accelerate==0.33.0 + pip install dadaptation==3.2 torch==2.4.0 torchvision==0.19.0 accelerate==0.33.0 - name: Test with pytest run: | - pip install pytest pytest-cov - pytest --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html + pip install pytest + pytest - - name: Upload pytest test results - uses: actions/upload-artifact@v4 - with: - name: pytest-results-${{ matrix.python-version }} - path: junit/test-results-${{ matrix.python-version }}.xml - # Use always() to always run this step to publish test results when there are test failures - if: ${{ always() }} From dd3b846b54814b605bd33ae08ed480ea5075483b Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 29 Nov 2024 16:18:05 -0500 Subject: [PATCH 258/582] Install pytorch first to pin version --- .github/workflows/tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 433c326bf..9ae67b0e9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,6 +18,7 @@ jobs: uses: actions/setup-python@v5 with: python-version: '3.x' + - name: Install dependencies run: python -m pip install --upgrade pip setuptools wheel @@ -27,11 +28,13 @@ jobs: with: python-version: '3.x' cache: 'pip' # caching pip dependencies + - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r requirements.txt pip install dadaptation==3.2 torch==2.4.0 torchvision==0.19.0 accelerate==0.33.0 + pip install -r requirements.txt + - name: Test with pytest run: | pip install pytest From 89825d6898ba6629b18cc8c1f9fbd93a730ff36e Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 29 Nov 2024 16:27:13 -0500 Subject: [PATCH 259/582] Run typos workflows once where appropriate --- .github/workflows/typos.yml | 6 ++++-- pytest.ini | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/typos.yml b/.github/workflows/typos.yml index 0149dcdd3..667146a7a 100644 --- a/.github/workflows/typos.yml +++ b/.github/workflows/typos.yml @@ -1,9 +1,11 @@ --- -# yamllint disable rule:line-length name: Typos -on: # yamllint disable-line rule:truthy +on: push: + branches: + - main + - dev pull_request: types: - opened diff --git a/pytest.ini b/pytest.ini index 63e03efc5..484d3aef6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,3 +5,4 @@ testpaths = filterwarnings = ignore::DeprecationWarning ignore::UserWarning + ignore::FutureWarning From 4f7f248071c93f539c12c8a35380b6d983bfff4c Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 29 Nov 2024 16:28:51 -0500 Subject: [PATCH 260/582] Bump typos action --- .github/workflows/typos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml index 667146a7a..87ebdf894 100644 --- a/.github/workflows/typos.yml +++ b/.github/workflows/typos.yml @@ -20,4 +20,4 @@ jobs: - uses: actions/checkout@v4 - name: typos-action - uses: crate-ci/typos@v1.24.3 + uses: crate-ci/typos@v1.28.1 From 9c885e549dbb5535b37f2a3220b5a8f53ad4d211 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 30 Nov 2024 18:25:50 +0900 Subject: [PATCH 261/582] fix: improve pos_embed handling for oversized images and update resolution_area_to_latent_size, when sample image size > train image size --- library/sd3_models.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/library/sd3_models.py b/library/sd3_models.py index 8b90205db..2f3c82eed 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -1017,22 +1017,35 @@ def cropped_scaled_pos_embed(self, h, w, device=None, dtype=None, random_crop: b patched_size = patched_size_ break if patched_size is None: - raise ValueError(f"Area {area} is too large for the given latent sizes {self.resolution_area_to_latent_size}.") + # raise ValueError(f"Area {area} is too large for the given latent sizes {self.resolution_area_to_latent_size}.") + # use largest latent size + patched_size = self.resolution_area_to_latent_size[-1][1] pos_embed = self.resolution_pos_embeds[patched_size] - pos_embed_size = round(math.sqrt(pos_embed.shape[1])) + pos_embed_size = round(math.sqrt(pos_embed.shape[1])) # max size, patched_size * POS_EMBED_MAX_RATIO if h > pos_embed_size or w > pos_embed_size: # # fallback to normal pos_embed # return self.cropped_pos_embed(h * p, w * p, device=device, random_crop=random_crop) # extend pos_embed size logger.warning( - f"Using normal pos_embed for size {h}x{w} as it exceeds the scaled pos_embed size {pos_embed_size}. Image is too tall or wide." + f"Add new pos_embed for size {h}x{w} as it exceeds the scaled pos_embed size {pos_embed_size}. Image is too tall or wide." 
) - pos_embed_size = max(h, w) - pos_embed = get_scaled_2d_sincos_pos_embed(self.hidden_size, pos_embed_size, sample_size=patched_size) + patched_size = max(h, w) + grid_size = int(patched_size * MMDiT.POS_EMBED_MAX_RATIO) + pos_embed_size = grid_size + pos_embed = get_scaled_2d_sincos_pos_embed(self.hidden_size, grid_size, sample_size=patched_size) pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0) self.resolution_pos_embeds[patched_size] = pos_embed - logger.info(f"Updated pos_embed for size {pos_embed_size}x{pos_embed_size}") + logger.info(f"Added pos_embed for size {patched_size}x{patched_size}") + + # print(torch.allclose(pos_embed.to(torch.float32).cpu(), self.pos_embed.to(torch.float32).cpu(), atol=5e-2)) + # diff = pos_embed.to(torch.float32).cpu() - self.pos_embed.to(torch.float32).cpu() + # print(diff.abs().max(), diff.abs().mean()) + + # insert to resolution_area_to_latent_size, by adding and sorting + area = pos_embed_size**2 + self.resolution_area_to_latent_size.append((area, patched_size)) + self.resolution_area_to_latent_size = sorted(self.resolution_area_to_latent_size) if not random_crop: top = (pos_embed_size - h) // 2 From 7b61e9eb58e0a004b451e8f06c9f90b861f81b45 Mon Sep 17 00:00:00 2001 From: recris Date: Sat, 30 Nov 2024 11:36:40 +0000 Subject: [PATCH 262/582] Fix issues found in review (pt 2) --- library/train_util.py | 2 +- sd3_train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index eaf6ec004..d5e72323a 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5875,7 +5875,7 @@ def get_huber_threshold(args, timesteps: torch.Tensor, noise_scheduler) -> torch alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps result = torch.exp(-alpha * timesteps) * args.huber_scale elif args.huber_schedule == "snr": - if not hasattr(noise_scheduler, "alphas_cumprod"): + if noise_scheduler is None or not hasattr(noise_scheduler, "alphas_cumprod"): raise NotImplementedError("Huber schedule 'snr' is not supported with the current model.") alphas_cumprod = torch.index_select(noise_scheduler.alphas_cumprod, 0, timesteps.cpu()) sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5 diff --git a/sd3_train.py b/sd3_train.py index cf2bdf938..909c5ead6 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -845,7 +845,7 @@ def grad_hook(parameter: torch.Tensor): # ) # calculate loss loss = train_util.conditional_loss( - args, model_pred.float(), target.float(), timesteps, "none", noise_scheduler + args, model_pred.float(), target.float(), timesteps, "none", None ) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) From 14f642f88be888ce1a4157b550186347c159ca42 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Dec 2024 13:30:35 +0900 Subject: [PATCH 263/582] fix: huber_schedule exponential not working on sd3_train.py --- library/train_util.py | 2 +- sd3_train.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index d5e72323a..eaf6ec004 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5875,7 +5875,7 @@ def get_huber_threshold(args, timesteps: torch.Tensor, noise_scheduler) -> torch alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps result = torch.exp(-alpha * timesteps) * args.huber_scale elif args.huber_schedule == "snr": - if noise_scheduler is None or not hasattr(noise_scheduler, 
"alphas_cumprod"): + if not hasattr(noise_scheduler, "alphas_cumprod"): raise NotImplementedError("Huber schedule 'snr' is not supported with the current model.") alphas_cumprod = torch.index_select(noise_scheduler.alphas_cumprod, 0, timesteps.cpu()) sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5 diff --git a/sd3_train.py b/sd3_train.py index 909c5ead6..73a68aa6a 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -675,8 +675,8 @@ def grad_hook(parameter: torch.Tensor): progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") global_step = 0 - # noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) - # noise_scheduler_copy = copy.deepcopy(noise_scheduler) + # only used to get timesteps, etc. TODO manage timesteps etc. separately + dummy_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0) if accelerator.is_main_process: init_kwargs = {} @@ -844,9 +844,7 @@ def grad_hook(parameter: torch.Tensor): # 1, # ) # calculate loss - loss = train_util.conditional_loss( - args, model_pred.float(), target.float(), timesteps, "none", None - ) + loss = train_util.conditional_loss(args, model_pred.float(), target.float(), timesteps, "none", dummy_scheduler) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) From 0fe6320f09a61859c3faa134affb810cb42b62cd Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Dec 2024 14:13:37 +0900 Subject: [PATCH 264/582] fix flux_train.py is not working --- flux_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flux_train.py b/flux_train.py index f6e43b27a..cfe14885e 100644 --- a/flux_train.py +++ b/flux_train.py @@ -667,7 +667,7 @@ def grad_hook(parameter: torch.Tensor): # calculate loss loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler + args, model_pred.float(), target.float(), timesteps, "none", noise_scheduler ) if weighting is not None: loss = loss * weighting From cc11989755d0dd61f10eeec85983c751fd7ebb47 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Dec 2024 21:20:28 +0900 Subject: [PATCH 265/582] fix: refactor huber-loss calculation in multiple training scripts --- fine_tune.py | 13 ++++--------- flux_train.py | 5 ++--- library/train_util.py | 21 +++++++++++---------- sd3_train.py | 3 ++- sdxl_train.py | 13 ++++--------- sdxl_train_control_net.py | 9 +++------ sdxl_train_control_net_lllite.py | 9 +++------ sdxl_train_control_net_lllite_old.py | 10 ++++++---- train_controlnet.py | 11 +++++------ train_db.py | 9 +++------ train_network.py | 5 ++--- train_textual_inversion.py | 5 ++--- train_textual_inversion_XTI.py | 9 +++++---- 13 files changed, 52 insertions(+), 70 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 401a40f08..176087065 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -380,9 +380,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents - ) + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) # Predict the noise residual with accelerator.autocast(): @@ -394,11 
+392,10 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): else: target = noise + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) if args.min_snr_gamma or args.scale_v_pred_loss_like_noise_pred or args.debiased_estimation_loss: # do not mean over batch dimension for snr weight or scale v-pred loss - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) loss = loss.mean([1, 2, 3]) if args.min_snr_gamma: @@ -410,9 +407,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): loss = loss.mean() # mean over batch dimension else: - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "mean", noise_scheduler - ) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "mean", huber_c) accelerator.backward(loss) if accelerator.sync_gradients and args.max_grad_norm != 0.0: diff --git a/flux_train.py b/flux_train.py index cfe14885e..fced3bef9 100644 --- a/flux_train.py +++ b/flux_train.py @@ -666,9 +666,8 @@ def grad_hook(parameter: torch.Tensor): target = noise - latents # calculate loss - loss = train_util.conditional_loss( - args, model_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(model_pred.float(), target.float(), args.loss_type, "none", huber_c) if weighting is not None: loss = loss * weighting if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): diff --git a/library/train_util.py b/library/train_util.py index eaf6ec004..fe74ddc7e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5869,7 +5869,10 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents): return noise, noisy_latents, timesteps -def get_huber_threshold(args, timesteps: torch.Tensor, noise_scheduler) -> torch.Tensor: +def get_huber_threshold_if_needed(args, timesteps: torch.Tensor, noise_scheduler) -> Optional[torch.Tensor]: + if not (args.loss_type == "huber" or args.loss_type == "smooth_l1"): + return None + b_size = timesteps.shape[0] if args.huber_schedule == "exponential": alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps @@ -5890,22 +5893,20 @@ def get_huber_threshold(args, timesteps: torch.Tensor, noise_scheduler) -> torch def conditional_loss( - args, model_pred: torch.Tensor, target: torch.Tensor, timesteps: torch.Tensor, reduction: str, noise_scheduler + model_pred: torch.Tensor, target: torch.Tensor, loss_type: str, reduction: str, huber_c: Optional[torch.Tensor] = None ): - if args.loss_type == "l2": + if loss_type == "l2": loss = torch.nn.functional.mse_loss(model_pred, target, reduction=reduction) - elif args.loss_type == "l1": + elif loss_type == "l1": loss = torch.nn.functional.l1_loss(model_pred, target, reduction=reduction) - elif args.loss_type == "huber": - huber_c = get_huber_threshold(args, timesteps, noise_scheduler) + elif loss_type == "huber": huber_c = huber_c.view(-1, 1, 1, 1) loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c) if reduction == "mean": loss = torch.mean(loss) elif reduction == "sum": loss = torch.sum(loss) - elif args.loss_type == "smooth_l1": - huber_c = get_huber_threshold(args, timesteps, noise_scheduler) + elif loss_type == 
"smooth_l1": huber_c = huber_c.view(-1, 1, 1, 1) loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c) if reduction == "mean": @@ -5913,7 +5914,7 @@ def conditional_loss( elif reduction == "sum": loss = torch.sum(loss) else: - raise NotImplementedError(f"Unsupported Loss Type: {args.loss_type}") + raise NotImplementedError(f"Unsupported Loss Type: {loss_type}") return loss @@ -5923,7 +5924,7 @@ def append_lr_to_logs(logs, lr_scheduler, optimizer_type, including_unet=True): names.append("unet") names.append("text_encoder1") names.append("text_encoder2") - names.append("text_encoder3") # SD3 + names.append("text_encoder3") # SD3 append_lr_to_logs_with_names(logs, lr_scheduler, optimizer_type, names) diff --git a/sd3_train.py b/sd3_train.py index 73a68aa6a..120455e7b 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -844,7 +844,8 @@ def grad_hook(parameter: torch.Tensor): # 1, # ) # calculate loss - loss = train_util.conditional_loss(args, model_pred.float(), target.float(), timesteps, "none", dummy_scheduler) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, dummy_scheduler) + loss = train_util.conditional_loss(model_pred.float(), target.float(), args.loss_type, "none", huber_c) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) diff --git a/sdxl_train.py b/sdxl_train.py index 1bc27ec6c..b9d529243 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -695,9 +695,7 @@ def optimizer_hook(parameter: torch.Tensor): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents - ) + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype @@ -711,6 +709,7 @@ def optimizer_hook(parameter: torch.Tensor): else: target = noise + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) if ( args.min_snr_gamma or args.scale_v_pred_loss_like_noise_pred @@ -719,9 +718,7 @@ def optimizer_hook(parameter: torch.Tensor): or args.masked_loss ): # do not mean over batch dimension for snr weight or scale v-pred loss - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) @@ -737,9 +734,7 @@ def optimizer_hook(parameter: torch.Tensor): loss = loss.mean() # mean over batch dimension else: - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "mean", huber_c) accelerator.backward(loss) diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index d0051d18f..01387409a 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -512,9 +512,7 @@ def remove_model(old_ckpt_name): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if 
specified - noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents - ) + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) controlnet_image = batch["conditioning_images"].to(dtype=weight_dtype) @@ -533,9 +531,8 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 66214f5df..365059b75 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -463,9 +463,7 @@ def remove_model(old_ckpt_name): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents - ) + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype @@ -484,9 +482,8 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py index 5e10654b9..5b372befc 100644 --- a/sdxl_train_control_net_lllite_old.py +++ b/sdxl_train_control_net_lllite_old.py @@ -12,6 +12,7 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from torch.nn.parallel import DistributedDataParallel as DDP @@ -324,7 +325,9 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs + "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, ) loss_recorder = train_util.LossRecorder() @@ -426,9 +429,8 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/train_controlnet.py b/train_controlnet.py index da7a08d69..177d2b11f 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -307,10 +307,12 @@ def 
__contains__(self, name): if args.fused_backward_pass: import library.adafactor_fused + library.adafactor_fused.patch_adafactor_fused(optimizer) for param_group in optimizer.param_groups: for parameter in param_group["params"]: if parameter.requires_grad: + def __grad_hook(tensor: torch.Tensor, param_group=param_group): if accelerator.sync_gradients and args.max_grad_norm != 0.0: accelerator.clip_grad_norm_(tensor, args.max_grad_norm) @@ -464,9 +466,7 @@ def remove_model(old_ckpt_name): ) # Sample a random timestep for each image - timesteps = train_util.get_timesteps( - 0, noise_scheduler.config.num_train_timesteps, b_size, latents.device - ) + timesteps = train_util.get_timesteps(0, noise_scheduler.config.num_train_timesteps, b_size, latents.device) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) @@ -498,9 +498,8 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/train_db.py b/train_db.py index a185b31b3..ad21f8d1b 100644 --- a/train_db.py +++ b/train_db.py @@ -370,9 +370,7 @@ def train(args): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps( - args, noise_scheduler, latents - ) + noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) # Predict the noise residual with accelerator.autocast(): @@ -384,9 +382,8 @@ def train(args): else: target = noise - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) diff --git a/train_network.py b/train_network.py index c7d4f5dc5..0b4208187 100644 --- a/train_network.py +++ b/train_network.py @@ -1207,9 +1207,8 @@ def remove_model(old_ckpt_name): train_unet, ) - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) if weighting is not None: loss = loss * weighting if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 9e1e57c48..65da4859b 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -601,9 +601,8 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, 
noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 944733602..2a2b42310 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -407,7 +407,9 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs + "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, ) # function for saving/removing @@ -473,9 +475,8 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss( - args, noise_pred.float(), target.float(), timesteps, "none", noise_scheduler - ) + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) From 14760407871c7eaa26210c7db71ce2740a817c4c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Dec 2024 21:26:39 +0900 Subject: [PATCH 266/582] fix: update help text for huber loss parameters in train_util.py --- library/train_util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index fe74ddc7e..a40983a68 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3905,14 +3905,16 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: "--huber_c", type=float, default=0.1, - help="The Huber loss decay parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type. default is 0.1 / Huber損失のパラメータ。loss_typeがhuberまたはsmooth l1の場合に有効。デフォルトは0.1", + help="The Huber loss decay parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type. default is 0.1" + " / Huber損失の減衰パラメータ。loss_typeがhuberまたはsmooth l1の場合に有効。デフォルトは0.1", ) parser.add_argument( "--huber_scale", type=float, default=1.0, - help="The Huber loss scale parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type. default is 1.0 / Huber損失のパラメータ。loss_typeがhuberまたはsmooth l1の場合に有効。デフォルトは0.1", + help="The Huber loss scale parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type. default is 1.0" + " / Huber損失のスケールパラメータ。loss_typeがhuberまたはsmooth l1の場合に有効。デフォルトは1.0", ) parser.add_argument( From 34e7f509c41491f9a08c16c8ead2adf5cb210ec1 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Dec 2024 21:36:24 +0900 Subject: [PATCH 267/582] docs: update README for huber loss --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index f9c85e3ac..89a96827c 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,11 @@ The command to install PyTorch is as follows: ### Recent Updates +1 Dec, 2024: + +- Pseudo Huber loss is now available for FLUX.1 and SD3.5 training. 
See [#1808](https://github.com/kohya-ss/sd-scripts/pull/1808) for details. Thanks to recris! + - Specify `--loss_type huber` or `--loss_type smooth_l1` to use it. `--huber_c` and `--huber_scale` are also available. + Nov 14, 2024: - Improved the implementation of block swap and made it available for both FLUX.1 and SD3 LoRA training. See [FLUX.1 LoRA training](#flux1-lora-training) etc. for how to use the new options. Training is possible with about 8-10GB of VRAM. From 1dc873d9b463d50e27ae8572c28a473ce9a1254f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 1 Dec 2024 22:00:44 +0900 Subject: [PATCH 268/582] update README and clean up code for schedulefree optimizer --- README.md | 4 +++- library/train_util.py | 7 +++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 89a96827c..8db5c4d42 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,11 @@ The command to install PyTorch is as follows: 1 Dec, 2024: -- Pseudo Huber loss is now available for FLUX.1 and SD3.5 training. See [#1808](https://github.com/kohya-ss/sd-scripts/pull/1808) for details. Thanks to recris! +- Pseudo Huber loss is now available for FLUX.1 and SD3.5 training. See PR [#1808](https://github.com/kohya-ss/sd-scripts/pull/1808) for details. Thanks to recris! - Specify `--loss_type huber` or `--loss_type smooth_l1` to use it. `--huber_c` and `--huber_scale` are also available. +- [Prodigy + ScheduleFree](https://github.com/LoganBooker/prodigy-plus-schedule-free) is supported. See PR [#1811](https://github.com/kohya-ss/sd-scripts/pull/1811) for details. Thanks to rockerBOO! + Nov 14, 2024: - Improved the implementation of block swap and made it available for both FLUX.1 and SD3 LoRA training. See [FLUX.1 LoRA training](#flux1-lora-training) etc. for how to use the new options. Training is possible with about 8-10GB of VRAM. diff --git a/library/train_util.py b/library/train_util.py index 289ab8235..6cfd14d5e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4609,7 +4609,7 @@ def task(): def get_optimizer(args, trainable_params): # "Optimizer to use: AdamW, AdamW8bit, Lion, SGDNesterov, SGDNesterov8bit, PagedAdamW, PagedAdamW8bit, PagedAdamW32bit, Lion8bit, PagedLion8bit, AdEMAMix8bit, PagedAdEMAMix8bit, DAdaptation(DAdaptAdamPreprint), DAdaptAdaGrad, DAdaptAdam, DAdaptAdan, DAdaptAdanIP, DAdaptLion, DAdaptSGD, Adafactor" - + optimizer_type = args.optimizer_type if args.use_8bit_adam: assert ( @@ -4883,7 +4883,6 @@ def get_optimizer(args, trainable_params): optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) elif optimizer_type.endswith("schedulefree".lower()): - should_train_optimizer = True try: import schedulefree as sf except ImportError: @@ -5000,8 +4999,8 @@ def __instancecheck__(self, instance): optimizer_name = optimizer_class.__module__ + "." + optimizer_class.__name__ optimizer_args = ",".join([f"{k}={v}" for k, v in optimizer_kwargs.items()]) - if hasattr(optimizer, 'train') and callable(optimizer.train): - # make optimizer as train mode: we don't need to call train again, because eval will not be called in training loop + if hasattr(optimizer, "train") and callable(optimizer.train): + # make optimizer as train mode before training for schedulefree optimizer. the optimizer will be in eval mode in sampling and saving. 
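        # schedule-free optimizers take their gradient steps at an interpolated point but
        # should be evaluated (and saved) at the averaged point, so the two modes have to be
        # switched explicitly with optimizer.train() / optimizer.eval()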
optimizer.train() return optimizer_name, optimizer_args, optimizer From e369b9a252b90d1f57ea20dd6f5d05ec0c287ae1 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Mon, 2 Dec 2024 23:38:54 +0900 Subject: [PATCH 269/582] docs: update README with FLUX.1 ControlNet training details and improve argument help text --- README.md | 10 +++++++++- library/flux_train_utils.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 45e3cb7ab..6a5cdd342 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,15 @@ The command to install PyTorch is as follows: ### Recent Updates -1 Dec, 2024: +Dec 2, 2024: + +- FLUX.1 ControlNet training is supported. PR [#1813](https://github.com/kohya-ss/sd-scripts/pull/1813). Thanks to minux302! See PR and [here](#flux1-controlnet-training) for details. + - Not fully tested. Feedback is welcome. + - 80GB VRAM is required for 1024x1024 resolution, and 48GB VRAM is required for 512x512 resolution. + - Currently, it only works in Linux environment (or Windows WSL2) because DeepSpeed is required. + - Multi-GPU training is not tested. + +Dec 1, 2024: - Pseudo Huber loss is now available for FLUX.1 and SD3.5 training. See PR [#1808](https://github.com/kohya-ss/sd-scripts/pull/1808) for details. Thanks to recris! - Specify `--loss_type huber` or `--loss_type smooth_l1` to use it. `--huber_c` and `--huber_scale` are also available. diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 5e25c7feb..de2e2b48d 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -567,7 +567,7 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): "--controlnet", type=str, default=None, - help="path to controlnet (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)" + help="path to controlnet (*.sft or *.safetensors) / controlnetのパス(*.sftまたは*.safetensors)" ) parser.add_argument( "--t5xxl_max_token_length", From 5ab00f9b49b5a3958bb0267fdb9236a96d503dbd Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 2 Dec 2024 13:39:51 -0500 Subject: [PATCH 270/582] Update workflow tests with cleanup and documentation --- .github/workflows/tests.yml | 48 +++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9ae67b0e9..5a790d570 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,42 +1,44 @@ - -name: Python package - -on: [push] +name: Test with pytet + +on: + push: + branches: + - main + - dev + - sd3 + pull_request: + branches: + - main + - dev + - sd3 jobs: build: - runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest] - python-version: ["3.10"] + python-version: ["3.10"] # Python versions to test + pytorch-version: ["2.4.0"] # PyTorch versions to test steps: - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 + - uses: actions/setup-python@v5 with: - python-version: '3.x' - - - name: Install dependencies - run: python -m pip install --upgrade pip setuptools wheel + python-version: ${{ matrix.python-version }} + cache: 'pip' - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.x' - cache: 'pip' # caching pip dependencies + - name: Install and update pip, setuptools, wheel + run: | + # Setuptools, wheel for compiling some packages + python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install 
dadaptation==3.2 torch==2.4.0 torchvision==0.19.0 accelerate==0.33.0 + # Pre-install torch to pin version (requirements.txt has dependencies like transformers which requires pytorch) + pip install dadaptation==3.2 torch==${{ matrix.pytorch-version }} torchvision==0.19.0 pytest==8.3.4 pip install -r requirements.txt - name: Test with pytest - run: | - pip install pytest - pytest + run: pytest # See pytest.ini for configuration From 63738ecb0758a02555392d2c283a83bba1c6f98e Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 2 Dec 2024 13:48:30 -0500 Subject: [PATCH 271/582] Add tests documentation --- tests/README.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/README.md diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 000000000..19eeab0e2 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,32 @@ +# Tests + +## Install + +``` +pip install pytest +``` + +## Usage + +``` +pytest +``` + +## Contribution + +Pytest is configured to run tests in this directory. It might be a good idea to add tests closer in the code, as well as doctests. + +Tests are functions starting with `test_` and files with the pattern `test_*.py`. + +``` +def test_x(): + assert 1 == 2, "Invalid test response" +``` + +## Resources + +- https://circleci.com/blog/testing-pytorch-model-with-pytest/ +- https://pytorch.org/docs/stable/testing.html +- https://github.com/pytorch/pytorch/wiki/Running-and-writing-tests +- https://github.com/huggingface/pytorch-image-models/tree/main/tests +- https://github.com/pytorch/pytorch/tree/main/test From 2610e96e9e3d0605d5a16615efa26ae8935ed3aa Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 2 Dec 2024 13:49:58 -0500 Subject: [PATCH 272/582] Pytest --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5a790d570..672a657bf 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: Test with pytet +name: Test with pytest on: push: From 3e5d89c76c287872e20c4a967d36b51384285be8 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 2 Dec 2024 13:51:57 -0500 Subject: [PATCH 273/582] Add more resources --- tests/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/README.md b/tests/README.md index 19eeab0e2..9836da8b4 100644 --- a/tests/README.md +++ b/tests/README.md @@ -25,8 +25,17 @@ def test_x(): ## Resources +### pytest + +- https://docs.pytest.org/en/stable/index.html +- https://docs.pytest.org/en/stable/how-to/assert.html +- https://docs.pytest.org/en/stable/how-to/doctest.html + +### PyTorch testing + - https://circleci.com/blog/testing-pytorch-model-with-pytest/ - https://pytorch.org/docs/stable/testing.html - https://github.com/pytorch/pytorch/wiki/Running-and-writing-tests - https://github.com/huggingface/pytorch-image-models/tree/main/tests - https://github.com/pytorch/pytorch/tree/main/test + From 8b36d907d8635dca64224574b5cb15013e00809d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 3 Dec 2024 08:43:26 +0900 Subject: [PATCH 274/582] feat: support block_to_swap for FLUX.1 ControlNet training --- README.md | 13 +++++++++++ flux_train_control_net.py | 46 +++++++++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 6a5cdd342..f02725191 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,11 @@ The command to install PyTorch is as follows: ### Recent Updates + +Dec 3, 2024: + 
+-`--blocks_to_swap` now works in FLUX.1 ControlNet training. Sample commands for 24GB VRAM and 16GB VRAM are added [here](#flux1-controlnet-training). + Dec 2, 2024: - FLUX.1 ControlNet training is supported. PR [#1813](https://github.com/kohya-ss/sd-scripts/pull/1813). Thanks to minux302! See PR and [here](#flux1-controlnet-training) for details. @@ -276,6 +281,14 @@ accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_tr --timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 --deepspeed ``` +For 24GB VRAM GPUs, you can train with 16 blocks swapped and caching latents and text encoder outputs with the batch size of 1. Remove `--deepspeed` . Sample command is below. Not fully tested. +``` + --blocks_to_swap 16 --cache_latents_to_disk --cache_text_encoder_outputs_to_disk +``` + +The training can be done with 16GB VRAM GPUs with around 30 blocks swapped. + +`--gradient_accumulation_steps` is also available. The default value is 1 (no accumulation), but according to the original PR, 8 is used. ### FLUX.1 OFT training diff --git a/flux_train_control_net.py b/flux_train_control_net.py index bb27c35ed..5548fd991 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -119,9 +119,7 @@ def train(args): "datasets": [ { "subsets": config_util.generate_controlnet_subsets_config_by_subdirs( - args.train_data_dir, - args.conditioning_data_dir, - args.caption_extension + args.train_data_dir, args.conditioning_data_dir, args.caption_extension ) } ] @@ -263,13 +261,17 @@ def train(args): args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors ) flux.requires_grad_(False) - flux.to(accelerator.device) # load controlnet - controlnet = flux_utils.load_controlnet(args.controlnet, is_schnell, torch.float32, accelerator.device, args.disable_mmap_load_safetensors) + controlnet_dtype = torch.float32 if args.deepspeed else weight_dtype + controlnet = flux_utils.load_controlnet( + args.controlnet, is_schnell, controlnet_dtype, accelerator.device, args.disable_mmap_load_safetensors + ) controlnet.train() if args.gradient_checkpointing: + if not args.deepspeed: + flux.enable_gradient_checkpointing(cpu_offload=args.cpu_offload_checkpointing) controlnet.enable_gradient_checkpointing(cpu_offload=args.cpu_offload_checkpointing) # block swap @@ -296,7 +298,11 @@ def train(args): # This idea is based on 2kpr's great work. Thank you! 
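            # block swap keeps some of the transformer blocks in CPU RAM and moves each block
            # onto the GPU only while it is needed in the forward/backward pass,
            # trading some throughput for a lower peak VRAM footprint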
logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") flux.enable_block_swap(args.blocks_to_swap, accelerator.device) - controlnet.enable_block_swap(args.blocks_to_swap, accelerator.device) + flux.move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage + # ControlNet only has two blocks, so we can keep it on GPU + # controlnet.enable_block_swap(args.blocks_to_swap, accelerator.device) + else: + flux.to(accelerator.device) if not cache_latents: # load VAE here if not cached @@ -455,9 +461,7 @@ def train(args): else: # accelerator does some magic # if we doesn't swap blocks, we can move the model to device - controlnet = accelerator.prepare(controlnet, device_placement=[not is_swapping_blocks]) - if is_swapping_blocks: - accelerator.unwrap_model(controlnet).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage + controlnet = accelerator.prepare(controlnet) # , device_placement=[not is_swapping_blocks]) optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする @@ -564,11 +568,13 @@ def grad_hook(parameter: torch.Tensor): ) if is_swapping_blocks: - accelerator.unwrap_model(controlnet).prepare_block_swap_before_forward() + flux.prepare_block_swap_before_forward() # For --sample_at_first optimizer_eval_fn() - flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs, controlnet=controlnet) + flux_train_utils.sample_images( + accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs, controlnet=controlnet + ) optimizer_train_fn() if len(accelerator.trackers) > 0: # log empty object to commit the sample images to wandb @@ -629,7 +635,11 @@ def grad_hook(parameter: torch.Tensor): # pack latents and get img_ids packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input) # b, c, h*2, w*2 -> b, h*w, c*4 packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2 - img_ids = flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width).to(device=accelerator.device).to(weight_dtype) + img_ids = ( + flux_utils.prepare_img_ids(bsz, packed_latent_height, packed_latent_width) + .to(device=accelerator.device) + .to(weight_dtype) + ) # get guidance: ensure args.guidance_scale is float guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device, dtype=weight_dtype) @@ -638,7 +648,7 @@ def grad_hook(parameter: torch.Tensor): l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds if not args.apply_t5_attn_mask: t5_attn_mask = None - + with accelerator.autocast(): block_samples, block_single_samples = controlnet( img=packed_noisy_model_input, @@ -715,7 +725,15 @@ def grad_hook(parameter: torch.Tensor): optimizer_eval_fn() flux_train_utils.sample_images( - accelerator, args, None, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs, controlnet=controlnet + accelerator, + args, + None, + global_step, + flux, + ae, + [clip_l, t5xxl], + sample_prompts_te_outputs, + controlnet=controlnet, ) # 指定ステップごとにモデルを保存 From 6bee18db4fbf62ebd2a1da88a5851c48f2e06c54 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 7 Dec 2024 15:12:27 +0900 Subject: [PATCH 275/582] fix: resolve model corruption issue with pos_embed when using --enable_scaled_pos_embed --- README.md | 2 ++ library/sd3_models.py | 6 ++++-- 2 files changed, 6 insertions(+), 2 
deletions(-) diff --git a/README.md b/README.md index f02725191..6162359d1 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,8 @@ The command to install PyTorch is as follows: ### Recent Updates +Dec 7, 2024: +- Fixed an issue where the saved model would be corrupted (pos_embed would not be saved) when `--enable_scaled_pos_embed` was specified in `sd3_train.py`. Dec 3, 2024: diff --git a/library/sd3_models.py b/library/sd3_models.py index 2f3c82eed..e4a931861 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -870,8 +870,10 @@ def enable_scaled_pos_embed(self, use_scaled_pos_embed: bool, latent_sizes: Opti self.use_scaled_pos_embed = use_scaled_pos_embed if self.use_scaled_pos_embed: - # remove pos_embed to free up memory up to 0.4 GB - self.pos_embed = None + # # remove pos_embed to free up memory up to 0.4 GB -> this causes error because pos_embed is not saved + # self.pos_embed = None + # move pos_embed to CPU to free up memory up to 0.4 GB + self.pos_embed = self.pos_embed.cpu() # remove duplicates and sort latent sizes in ascending order latent_sizes = list(set(latent_sizes)) From abff4b0ec7bb37b338924e38392593f2bea2b8d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= Date: Sat, 7 Dec 2024 16:12:46 +0800 Subject: [PATCH 276/582] Unify controlnet parameters name and change scripts name. (#1821) * Update sd3_train.py * add freeze block lr * Update train_util.py * update * Revert "add freeze block lr" This reverts commit 8b1653548f8f219e5be2cde96f65a8813cf9ea1f. # Conflicts: # library/train_util.py # sd3_train.py * use same control net model path * use controlnet_model_name_or_path --- flux_train_control_net.py | 2 +- library/flux_train_utils.py | 2 +- sdxl_train_control_net.py | 8 ++++---- train_controlnet.py => train_control_net.py | 0 4 files changed, 6 insertions(+), 6 deletions(-) rename train_controlnet.py => train_control_net.py (100%) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index 5548fd991..9d36a41d3 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -265,7 +265,7 @@ def train(args): # load controlnet controlnet_dtype = torch.float32 if args.deepspeed else weight_dtype controlnet = flux_utils.load_controlnet( - args.controlnet, is_schnell, controlnet_dtype, accelerator.device, args.disable_mmap_load_safetensors + args.controlnet_model_name_or_path, is_schnell, controlnet_dtype, accelerator.device, args.disable_mmap_load_safetensors ) controlnet.train() diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index de2e2b48d..f7f06c5cf 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -564,7 +564,7 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): ) parser.add_argument("--ae", type=str, help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)") parser.add_argument( - "--controlnet", + "--controlnet_model_name_or_path", type=str, default=None, help="path to controlnet (*.sft or *.safetensors) / controlnetのパス(*.sftまたは*.safetensors)" diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index 01387409a..ffbf03cab 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -184,12 +184,12 @@ def unwrap_model(model): # make control net logger.info("make ControlNet") - if args.controlnet_model_path: + if args.controlnet_model_name_or_path: with init_empty_weights(): control_net = SdxlControlNet() - logger.info(f"load ControlNet from {args.controlnet_model_path}") - filename = 
args.controlnet_model_path + logger.info(f"load ControlNet from {args.controlnet_model_name_or_path}") + filename = args.controlnet_model_name_or_path if os.path.splitext(filename)[1] == ".safetensors": state_dict = load_file(filename) else: @@ -675,7 +675,7 @@ def setup_parser() -> argparse.ArgumentParser: sdxl_train_util.add_sdxl_training_arguments(parser) parser.add_argument( - "--controlnet_model_path", + "--controlnet_model_name_or_path", type=str, default=None, help="controlnet model name or path / controlnetのモデル名またはパス", diff --git a/train_controlnet.py b/train_control_net.py similarity index 100% rename from train_controlnet.py rename to train_control_net.py From e425996a5953f0479384e70b6490e751c2d00b1f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 7 Dec 2024 17:28:19 +0900 Subject: [PATCH 277/582] feat: unify ControlNet model name option and deprecate old training script --- README.md | 7 +++++++ train_controlnet.py | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 train_controlnet.py diff --git a/README.md b/README.md index 6162359d1..67836ddf0 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,13 @@ The command to install PyTorch is as follows: ### Recent Updates Dec 7, 2024: + +- The option to specify the model name during ControlNet training was different in each script. It has been unified. Please specify `--controlnet_model_name_or_path`. PR [#1821](https://github.com/kohya-ss/sd-scripts/pull/1821) Thanks to sdbds! + + - Fixed an issue where the saved model would be corrupted (pos_embed would not be saved) when `--enable_scaled_pos_embed` was specified in `sd3_train.py`. Dec 3, 2024: diff --git a/train_controlnet.py b/train_controlnet.py new file mode 100644 index 000000000..365e35c8c --- /dev/null +++ b/train_controlnet.py @@ -0,0 +1,23 @@ +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +from library import train_util +from train_control_net import setup_parser, train + +if __name__ == "__main__": + logger.warning( + "The module 'train_controlnet.py' is deprecated. 
Please use 'train_control_net.py' instead" + " / 'train_controlnet.py'は非推奨です。代わりに'train_control_net.py'を使用してください。" + ) + parser = setup_parser() + + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + train(args) From 3cb8cb2d4fd697a49135193ac0873204e0139e62 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 9 Dec 2024 15:20:04 -0500 Subject: [PATCH 278/582] Prevent git credentials from leaking into other actions --- .github/workflows/tests.yml | 4 ++++ .github/workflows/typos.yml | 3 +++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 672a657bf..2eddedc7b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,6 +23,10 @@ jobs: steps: - uses: actions/checkout@v4 + with: + # https://woodruffw.github.io/zizmor/audits/#artipacked + persist-credentials: false + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml index 87ebdf894..f53cda218 100644 --- a/.github/workflows/typos.yml +++ b/.github/workflows/typos.yml @@ -18,6 +18,9 @@ jobs: steps: - uses: actions/checkout@v4 + with: + # https://woodruffw.github.io/zizmor/audits/#artipacked + persist-credentials: false - name: typos-action uses: crate-ci/typos@v1.28.1 From 8e378cf03df645cef897a342559dc5fa7f66a35d Mon Sep 17 00:00:00 2001 From: nhamanasu Date: Wed, 11 Dec 2024 19:43:44 +0900 Subject: [PATCH 279/582] add RAdamScheduleFree support --- library/train_util.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index a35388fee..72b5b24db 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4887,7 +4887,11 @@ def get_optimizer(args, trainable_params) -> tuple[str, str, object]: import schedulefree as sf except ImportError: raise ImportError("No schedulefree / schedulefreeがインストールされていないようです") - if optimizer_type == "AdamWScheduleFree".lower(): + + if optimizer_type == "RAdamScheduleFree".lower(): + optimizer_class = sf.RAdamScheduleFree + logger.info(f"use RAdamScheduleFree optimizer | {optimizer_kwargs}") + elif optimizer_type == "AdamWScheduleFree".lower(): optimizer_class = sf.AdamWScheduleFree logger.info(f"use AdamWScheduleFree optimizer | {optimizer_kwargs}") elif optimizer_type == "SGDScheduleFree".lower(): From e89653975ddf429cdf0c0fd268da0a5a3e8dba1f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 15 Dec 2024 19:39:47 +0900 Subject: [PATCH 280/582] update requirements.txt and README to include RAdamScheduleFree optimizer support --- README.md | 6 ++++++ requirements.txt | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 67836ddf0..bfb22bcf1 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,12 @@ The command to install PyTorch is as follows: ### Recent Updates +Dec 15, 2024: + +- RAdamScheduleFree optimizer is supported. PR [#1830](https://github.com/kohya-ss/sd-scripts/pull/1830) Thanks to nhamanasu! + - Update to `schedulefree==1.4` is required. Please update individually or with `pip install --use-pep517 --upgrade -r requirements.txt`. + - Available with `--optimizer_type=RAdamScheduleFree`. No need to specify warm up steps as well as learning rate scheduler. + Dec 7, 2024: - The option to specify the model name during ControlNet training was different in each script. It has been unified. 
Please specify `--controlnet_model_name_or_path`. PR [#1821](https://github.com/kohya-ss/sd-scripts/pull/1821) Thanks to sdbds! diff --git a/requirements.txt b/requirements.txt index 0dd1c69cc..e0091749a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ pytorch-lightning==1.9.0 bitsandbytes==0.44.0 prodigyopt==1.0 lion-pytorch==0.0.6 -schedulefree==1.2.7 +schedulefree==1.4 tensorboard safetensors==0.4.4 # gradio==3.16.2 From 05bb9183fae18c62a1730fe5060f80c0b99a21f3 Mon Sep 17 00:00:00 2001 From: Hina Chen Date: Fri, 27 Dec 2024 16:47:59 +0800 Subject: [PATCH 281/582] Add Validation loss for LoRA training --- library/config_util.py | 78 +++++++++++++++++++++++- library/train_util.py | 54 ++++++++++++++++- train_network.py | 131 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 257 insertions(+), 6 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 12d0be173..a57cd36f0 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -73,6 +73,8 @@ class BaseSubsetParams: token_warmup_min: int = 1 token_warmup_step: float = 0 custom_attributes: Optional[Dict[str, Any]] = None + validation_seed: int = 0 + validation_split: float = 0.0 @dataclass @@ -102,6 +104,8 @@ class BaseDatasetParams: resolution: Optional[Tuple[int, int]] = None network_multiplier: float = 1.0 debug_dataset: bool = False + validation_seed: Optional[int] = None + validation_split: float = 0.0 @dataclass @@ -478,9 +482,27 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset_klass = FineTuningDataset subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] - dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params)) + dataset = dataset_klass(subsets=subsets, is_train=True, **asdict(dataset_blueprint.params)) datasets.append(dataset) + val_datasets:List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] + for dataset_blueprint in dataset_group_blueprint.datasets: + if dataset_blueprint.params.validation_split <= 0.0: + continue + if dataset_blueprint.is_controlnet: + subset_klass = ControlNetSubset + dataset_klass = ControlNetDataset + elif dataset_blueprint.is_dreambooth: + subset_klass = DreamBoothSubset + dataset_klass = DreamBoothDataset + else: + subset_klass = FineTuningSubset + dataset_klass = FineTuningDataset + + subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] + dataset = dataset_klass(subsets=subsets, is_train=False, **asdict(dataset_blueprint.params)) + val_datasets.append(dataset) + # print info info = "" for i, dataset in enumerate(datasets): @@ -566,6 +588,50 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu logger.info(f"{info}") + if len(val_datasets) > 0: + info = "" + + for i, dataset in enumerate(val_datasets): + info += dedent( + f"""\ + [Validation Dataset {i}] + batch_size: {dataset.batch_size} + resolution: {(dataset.width, dataset.height)} + enable_bucket: {dataset.enable_bucket} + network_multiplier: {dataset.network_multiplier} + """ + ) + + if dataset.enable_bucket: + info += indent( + dedent( + f"""\ + min_bucket_reso: {dataset.min_bucket_reso} + max_bucket_reso: {dataset.max_bucket_reso} + bucket_reso_steps: {dataset.bucket_reso_steps} + bucket_no_upscale: {dataset.bucket_no_upscale} + \n""" + ), + " ", + ) + else: + info += "\n" + + for j, subset in enumerate(dataset.subsets): + info += indent( + dedent( + f"""\ + 
[Subset {j} of Validation Dataset {i}] + image_dir: "{subset.image_dir}" + image_count: {subset.img_count} + num_repeats: {subset.num_repeats} + """ + ), + " ", + ) + + logger.info(f"{info}") + # make buckets first because it determines the length of dataset # and set the same seed for all datasets seed = random.randint(0, 2**31) # actual seed is seed + epoch_no @@ -574,7 +640,15 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset.make_buckets() dataset.set_seed(seed) - return DatasetGroup(datasets) + for i, dataset in enumerate(val_datasets): + logger.info(f"[Validation Dataset {i}]") + dataset.make_buckets() + dataset.set_seed(seed) + + return ( + DatasetGroup(datasets), + DatasetGroup(val_datasets) if val_datasets else None + ) def generate_dreambooth_subsets_config_by_subdirs(train_data_dir: Optional[str] = None, reg_data_dir: Optional[str] = None): diff --git a/library/train_util.py b/library/train_util.py index 72b5b24db..a3fa98e99 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -145,6 +145,17 @@ TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX = "_te_outputs.npz" TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX_SD3 = "_sd3_te.npz" +def split_train_val(paths: List[str], validation_split: float, validation_seed: int) -> List[str]: + if validation_seed is not None: + print(f"Using validation seed: {validation_seed}") + prevstate = random.getstate() + random.seed(validation_seed) + random.shuffle(paths) + random.setstate(prevstate) + else: + random.shuffle(paths) + + return paths[len(paths) - round(len(paths) * validation_split):] class ImageInfo: def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, absolute_path: str) -> None: @@ -397,6 +408,8 @@ def __init__( token_warmup_min: int, token_warmup_step: Union[float, int], custom_attributes: Optional[Dict[str, Any]] = None, + validation_seed: Optional[int] = None, + validation_split: Optional[float] = 0.0, ) -> None: self.image_dir = image_dir self.alpha_mask = alpha_mask if alpha_mask is not None else False @@ -424,6 +437,9 @@ def __init__( self.img_count = 0 + self.validation_seed = validation_seed + self.validation_split = validation_split + class DreamBoothSubset(BaseSubset): def __init__( @@ -453,6 +469,8 @@ def __init__( token_warmup_min, token_warmup_step, custom_attributes: Optional[Dict[str, Any]] = None, + validation_seed: Optional[int] = None, + validation_split: Optional[float] = 0.0, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -478,6 +496,8 @@ def __init__( token_warmup_min, token_warmup_step, custom_attributes=custom_attributes, + validation_seed=validation_seed, + validation_split=validation_split, ) self.is_reg = is_reg @@ -518,6 +538,8 @@ def __init__( token_warmup_min, token_warmup_step, custom_attributes: Optional[Dict[str, Any]] = None, + validation_seed: Optional[int] = None, + validation_split: Optional[float] = 0.0, ) -> None: assert metadata_file is not None, "metadata_file must be specified / metadata_fileは指定が必須です" @@ -543,6 +565,8 @@ def __init__( token_warmup_min, token_warmup_step, custom_attributes=custom_attributes, + validation_seed=validation_seed, + validation_split=validation_split, ) self.metadata_file = metadata_file @@ -579,6 +603,8 @@ def __init__( token_warmup_min, token_warmup_step, custom_attributes: Optional[Dict[str, Any]] = None, + validation_seed: Optional[int] = None, + validation_split: Optional[float] = 0.0, ) -> None: assert image_dir is not None, "image_dir must be specified / 
image_dirは指定が必須です" @@ -604,6 +630,8 @@ def __init__( token_warmup_min, token_warmup_step, custom_attributes=custom_attributes, + validation_seed=validation_seed, + validation_split=validation_split, ) self.conditioning_data_dir = conditioning_data_dir @@ -1799,6 +1827,9 @@ def __init__( bucket_no_upscale: bool, prior_loss_weight: float, debug_dataset: bool, + is_train: bool, + validation_seed: int, + validation_split: float, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset) @@ -1808,6 +1839,9 @@ def __init__( self.size = min(self.width, self.height) # 短いほう self.prior_loss_weight = prior_loss_weight self.latents_cache = None + self.is_train = is_train + self.validation_seed = validation_seed + self.validation_split = validation_split self.enable_bucket = enable_bucket if self.enable_bucket: @@ -1992,6 +2026,9 @@ def load_dreambooth_dir(subset: DreamBoothSubset): ) continue + if self.is_train == False: + img_paths = split_train_val(img_paths, self.validation_split, self.validation_seed) + if subset.is_reg: num_reg_images += subset.num_repeats * len(img_paths) else: @@ -2009,7 +2046,11 @@ def load_dreambooth_dir(subset: DreamBoothSubset): subset.img_count = len(img_paths) self.subsets.append(subset) - logger.info(f"{num_train_images} train images with repeating.") + if self.is_train: + logger.info(f"{num_train_images} train images with repeating.") + else: + logger.info(f"{num_train_images} validation images with repeating.") + self.num_train_images = num_train_images logger.info(f"{num_reg_images} reg images.") @@ -2050,6 +2091,9 @@ def __init__( bucket_reso_steps: int, bucket_no_upscale: bool, debug_dataset: bool, + is_train: bool, + validation_seed: int, + validation_split: float, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset) @@ -2276,6 +2320,9 @@ def __init__( bucket_reso_steps: int, bucket_no_upscale: bool, debug_dataset: float, + is_train: bool, + validation_seed: int, + validation_split: float, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset) @@ -2324,6 +2371,9 @@ def __init__( bucket_no_upscale, 1.0, debug_dataset, + is_train, + validation_seed, + validation_split, ) # config_util等から参照される値をいれておく(若干微妙なのでなんとかしたい) @@ -4887,7 +4937,7 @@ def get_optimizer(args, trainable_params) -> tuple[str, str, object]: import schedulefree as sf except ImportError: raise ImportError("No schedulefree / schedulefreeがインストールされていないようです") - + if optimizer_type == "RAdamScheduleFree".lower(): optimizer_class = sf.RAdamScheduleFree logger.info(f"use RAdamScheduleFree optimizer | {optimizer_kwargs}") diff --git a/train_network.py b/train_network.py index 5e82b307c..776feaf76 100644 --- a/train_network.py +++ b/train_network.py @@ -9,6 +9,7 @@ from multiprocessing import Value from typing import Any, List import toml +import itertools from tqdm import tqdm @@ -114,7 +115,7 @@ def generate_step_logs( ) if ( args.optimizer_type.lower().endswith("ProdigyPlusScheduleFree".lower()) and optimizer is not None - ): + ): logs[f"lr/d*lr/group{i}"] = ( optimizer.param_groups[i]["d"] * optimizer.param_groups[i]["lr"] ) @@ -373,10 +374,11 @@ def train(self, args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: # use arbitrary dataset class train_dataset_group = train_util.load_arbitrary_dataset(args) + 
val_dataset_group = None current_epoch = Value("i", 0) current_step = Value("i", 0) @@ -398,6 +400,11 @@ def train(self, args): train_dataset_group.is_latent_cacheable() ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + if val_dataset_group is not None: + assert ( + val_dataset_group.is_latent_cacheable() + ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + self.assert_extra_args(args, train_dataset_group) # may change some args # acceleratorを準備する @@ -444,6 +451,8 @@ def train(self, args): vae.eval() train_dataset_group.new_cache_latents(vae, accelerator) + if val_dataset_group is not None: + val_dataset_group.new_cache_latents(vae, accelerator) vae.to("cpu") clean_memory_on_device(accelerator.device) @@ -459,6 +468,8 @@ def train(self, args): if text_encoder_outputs_caching_strategy is not None: strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_outputs_caching_strategy) self.cache_text_encoder_outputs_if_needed(args, accelerator, unet, vae, text_encoders, train_dataset_group, weight_dtype) + if val_dataset_group is not None: + self.cache_text_encoder_outputs_if_needed(args, accelerator, unet, vae, text_encoders, val_dataset_group, weight_dtype) # prepare network net_kwargs = {} @@ -567,6 +578,8 @@ def train(self, args): # strategies are set here because they cannot be referenced in another process. Copy them with the dataset # some strategies can be None train_dataset_group.set_current_strategies() + if val_dataset_group is not None: + val_dataset_group.set_current_strategies() # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers @@ -580,6 +593,17 @@ def train(self, args): persistent_workers=args.persistent_data_loader_workers, ) + val_dataloader = torch.utils.data.DataLoader( + val_dataset_group if val_dataset_group is not None else [], + batch_size=1, + shuffle=False, + collate_fn=collator, + num_workers=n_workers, + persistent_workers=args.persistent_data_loader_workers, + ) + + cyclic_val_dataloader = itertools.cycle(val_dataloader) + # 学習ステップ数を計算する if args.max_train_epochs is not None: args.max_train_steps = args.max_train_epochs * math.ceil( @@ -592,6 +616,10 @@ def train(self, args): # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) + # Not for sure here. 
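        # note: args.max_train_steps above is derived from the training dataloader only;
        # validation batches are drawn separately via cyclic_val_dataloader and do not
        # influence the step count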
+ # if val_dataset_group is not None: + # val_dataset_group.set_max_train_steps(args.max_train_steps) + # lr schedulerを用意する lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) @@ -1064,7 +1092,11 @@ def load_model_hook(models, input_dir): ) loss_recorder = train_util.LossRecorder() + # val_loss_recorder = train_util.LossRecorder() + del train_dataset_group + if val_dataset_group is not None: + del val_dataset_group # callback for step start if hasattr(accelerator.unwrap_model(network), "on_step_start"): @@ -1308,6 +1340,77 @@ def remove_model(old_ckpt_name): ) accelerator.log(logs, step=global_step) + if len(val_dataloader) > 0: + if ((args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps): + accelerator.print("\nValidating バリデーション処理...") + + total_loss = 0.0 + + with torch.no_grad(): + validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + for val_step in tqdm(range(validation_steps), desc="Validation Steps バリデーションテップ"): + batch = next(cyclic_val_dataloader) + + timesteps_list = [10, 350, 500, 650, 990] + + val_loss = 0.0 + + for fixed_timesteps in timesteps_list: + with torch.set_grad_enabled(False), accelerator.autocast(): + noise = torch.randn_like(latents, device=latents.device) + b_size = latents.shape[0] + + timesteps = torch.full((b_size,), fixed_timesteps, dtype=torch.long, device="cpu") + timesteps = timesteps.long().to(latents.device) + + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + with accelerator.autocast(): + noise_pred = self.call_unet( + args, + accelerator, + unet, + noisy_latents.requires_grad_(False), + timesteps, + text_encoder_conds, + batch, + weight_dtype, + ) + + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise + + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) + if weighting is not None: + loss = loss * weighting + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) + loss = loss.mean([1, 2, 3]) + + # min snr gamma, scale v pred loss like noise pred, v pred like loss, debiased estimation etc. + loss = self.post_process_loss(loss, args, timesteps, noise_scheduler) + + loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + + val_loss += loss / len(timesteps_list) + + total_loss += val_loss.detach().item() + + current_val_loss = total_loss / validation_steps + # val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_val_loss) + + if len(accelerator.trackers) > 0: + logs = {"loss/current_val_loss": current_val_loss} + accelerator.log(logs, step=global_step) + + # avr_loss: float = val_loss_recorder.moving_average + # logs = {"loss/average_val_loss": avr_loss} + # accelerator.log(logs, step=global_step) + if global_step >= args.max_train_steps: break @@ -1496,6 +1599,30 @@ def setup_parser() -> argparse.ArgumentParser: help="initial step number including all epochs, 0 means first step (same as not specifying). overwrites initial_epoch." 
+ " / 初期ステップ数、全エポックを含むステップ数、0で最初のステップ(未指定時と同じ)。initial_epochを上書きする", ) + parser.add_argument( + "--validation_seed", + type=int, + default=None, + help="Validation seed / 検証シード" + ) + parser.add_argument( + "--validation_split", + type=float, + default=0.0, + help="Split for validation images out of the training dataset / 学習画像から検証画像に分割する割合" + ) + parser.add_argument( + "--validation_every_n_step", + type=int, + default=None, + help="Number of train steps for counting validation loss. By default, validation per train epoch is performed / 学習エポックごとに検証を行う場合はNoneを指定する" + ) + parser.add_argument( + "--max_validation_steps", + type=int, + default=None, + help="Number of max validation steps for counting validation loss. By default, validation will run entire validation dataset / 検証データセット全体を検証する場合はNoneを指定する" + ) # parser.add_argument("--loraplus_lr_ratio", default=None, type=float, help="LoRA+ learning rate ratio") # parser.add_argument("--loraplus_unet_lr_ratio", default=None, type=float, help="LoRA+ UNet learning rate ratio") # parser.add_argument("--loraplus_text_encoder_lr_ratio", default=None, type=float, help="LoRA+ text encoder learning rate ratio") From 62164e57925125ed6268983ffa441f1ffecc0e6d Mon Sep 17 00:00:00 2001 From: Hina Chen Date: Fri, 27 Dec 2024 17:28:05 +0800 Subject: [PATCH 282/582] Change val loss calculate method --- train_network.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/train_network.py b/train_network.py index 776feaf76..5fd1b212f 100644 --- a/train_network.py +++ b/train_network.py @@ -1383,16 +1383,20 @@ def remove_model(old_ckpt_name): else: target = noise - huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) - loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) - if weighting is not None: - loss = loss * weighting - if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): - loss = apply_masked_loss(loss, batch) - loss = loss.mean([1, 2, 3]) + # huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + # loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) + # if weighting is not None: + # loss = loss * weighting + # if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + # loss = apply_masked_loss(loss, batch) + # loss = loss.mean([1, 2, 3]) # min snr gamma, scale v pred loss like noise pred, v pred like loss, debiased estimation etc. - loss = self.post_process_loss(loss, args, timesteps, noise_scheduler) + # loss = self.post_process_loss(loss, args, timesteps, noise_scheduler) + + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + loss = loss.mean([1, 2, 3]) + loss = apply_debiased_estimation(loss, timesteps, noise_scheduler, args.v_parameterization) loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし From 64bd5317dc9cb39d69ab7728f36b03157c9b341f Mon Sep 17 00:00:00 2001 From: Hina Chen Date: Sat, 28 Dec 2024 11:42:15 +0800 Subject: [PATCH 283/582] Split val latents/batch and pick up val latents shape size which equal to training batch. 
--- train_network.py | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/train_network.py b/train_network.py index 5fd1b212f..6bce9e964 100644 --- a/train_network.py +++ b/train_network.py @@ -1349,7 +1349,27 @@ def remove_model(old_ckpt_name): with torch.no_grad(): validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) for val_step in tqdm(range(validation_steps), desc="Validation Steps バリデーションテップ"): - batch = next(cyclic_val_dataloader) + + while True: + val_batch = next(cyclic_val_dataloader) + + if "latents" in val_batch and val_batch["latents"] is not None: + val_latents = val_batch["latents"].to(accelerator.device).to(dtype=weight_dtype) + else: + with torch.no_grad(): + # latentに変換 + val_latents = self.encode_images_to_latents(args, accelerator, vae, val_batch["images"].to(vae_dtype)) + val_latents = val_latents.to(dtype=weight_dtype) + + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(val_latents)): + accelerator.print("NaN found in validation latents, replacing with zeros") + val_latents = torch.nan_to_num(val_latents, 0, out=val_latents) + + val_latents = self.shift_scale_latents(args, val_latents) + + if val_latents.shape == latents.shape: + break timesteps_list = [10, 350, 500, 650, 990] @@ -1357,13 +1377,13 @@ def remove_model(old_ckpt_name): for fixed_timesteps in timesteps_list: with torch.set_grad_enabled(False), accelerator.autocast(): - noise = torch.randn_like(latents, device=latents.device) - b_size = latents.shape[0] + noise = torch.randn_like(val_latents, device=val_latents.device) + b_size = val_latents.shape[0] timesteps = torch.full((b_size,), fixed_timesteps, dtype=torch.long, device="cpu") - timesteps = timesteps.long().to(latents.device) + timesteps = timesteps.long().to(val_latents.device) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + noisy_latents = noise_scheduler.add_noise(val_latents, noise, timesteps) with accelerator.autocast(): noise_pred = self.call_unet( @@ -1373,27 +1393,16 @@ def remove_model(old_ckpt_name): noisy_latents.requires_grad_(False), timesteps, text_encoder_conds, - batch, + val_batch, weight_dtype, ) if args.v_parameterization: # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, timesteps) + target = noise_scheduler.get_velocity(val_latents, noise, timesteps) else: target = noise - # huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) - # loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) - # if weighting is not None: - # loss = loss * weighting - # if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): - # loss = apply_masked_loss(loss, batch) - # loss = loss.mean([1, 2, 3]) - - # min snr gamma, scale v pred loss like noise pred, v pred like loss, debiased estimation etc. 
- # loss = self.post_process_loss(loss, args, timesteps, noise_scheduler) - loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") loss = loss.mean([1, 2, 3]) loss = apply_debiased_estimation(loss, timesteps, noise_scheduler, args.v_parameterization) From cb89e0284e1a25b41401861107159e6b943ee387 Mon Sep 17 00:00:00 2001 From: Hina Chen Date: Sat, 28 Dec 2024 11:57:04 +0800 Subject: [PATCH 284/582] Change val latent loss compare --- train_network.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/train_network.py b/train_network.py index 6bce9e964..7276d5dc0 100644 --- a/train_network.py +++ b/train_network.py @@ -1350,6 +1350,8 @@ def remove_model(old_ckpt_name): validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) for val_step in tqdm(range(validation_steps), desc="Validation Steps バリデーションテップ"): + val_latents = None + while True: val_batch = next(cyclic_val_dataloader) @@ -1371,19 +1373,22 @@ def remove_model(old_ckpt_name): if val_latents.shape == latents.shape: break + if val_latents is not None: + del val_latents + timesteps_list = [10, 350, 500, 650, 990] val_loss = 0.0 for fixed_timesteps in timesteps_list: with torch.set_grad_enabled(False), accelerator.autocast(): - noise = torch.randn_like(val_latents, device=val_latents.device) - b_size = val_latents.shape[0] + noise = torch.randn_like(latents, device=latents.device) + b_size = latents.shape[0] timesteps = torch.full((b_size,), fixed_timesteps, dtype=torch.long, device="cpu") - timesteps = timesteps.long().to(val_latents.device) + timesteps = timesteps.long().to(latents.device) - noisy_latents = noise_scheduler.add_noise(val_latents, noise, timesteps) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) with accelerator.autocast(): noise_pred = self.call_unet( @@ -1399,7 +1404,7 @@ def remove_model(old_ckpt_name): if args.v_parameterization: # v-parameterization training - target = noise_scheduler.get_velocity(val_latents, noise, timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: target = noise From 874353296304c753b452511a412472f8a3e4ba09 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 10 Mar 2024 04:37:16 +0800 Subject: [PATCH 285/582] val --- library/config_util.py | 32 +++++++------ library/train_util.py | 20 ++++++-- train_network.py | 104 +++++++++++++++++++++++++++-------------- 3 files changed, 103 insertions(+), 53 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 1bf7ed955..cb2c5b68f 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -81,23 +81,24 @@ class ControlNetSubsetParams(BaseSubsetParams): @dataclass class BaseDatasetParams: - tokenizer: Union[CLIPTokenizer, List[CLIPTokenizer]] = None - max_token_length: int = None - resolution: Optional[Tuple[int, int]] = None - debug_dataset: bool = False - validation_seed: Optional[int] = None - validation_split: float = 0.0 + tokenizer: Union[CLIPTokenizer, List[CLIPTokenizer]] = None + max_token_length: int = None + resolution: Optional[Tuple[int, int]] = None + network_multiplier: float = 1.0 + debug_dataset: bool = False + validation_seed: Optional[int] = None + validation_split: float = 0.0 @dataclass class DreamBoothDatasetParams(BaseDatasetParams): - batch_size: int = 1 - enable_bucket: bool = False - min_bucket_reso: int = 256 - max_bucket_reso: int = 1024 - 
bucket_reso_steps: int = 64 - bucket_no_upscale: bool = False - prior_loss_weight: float = 1.0 - + batch_size: int = 1 + enable_bucket: bool = False + min_bucket_reso: int = 256 + max_bucket_reso: int = 1024 + bucket_reso_steps: int = 64 + bucket_no_upscale: bool = False + prior_loss_weight: float = 1.0 + @dataclass class FineTuningDatasetParams(BaseDatasetParams): batch_size: int = 1 @@ -203,8 +204,9 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "max_bucket_reso": int, "min_bucket_reso": int, "validation_seed": int, - "validation_split": float, + "validation_split": float, "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int), + "network_multiplier": float, } # options handled by argparse but not handled by user config diff --git a/library/train_util.py b/library/train_util.py index 1979207b0..2364d62b3 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -122,6 +122,20 @@ TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX = "_te_outputs.npz" +def split_train_val(paths, is_train, validation_split, validation_seed): + if validation_seed is not None: + print(f"Using validation seed: {validation_seed}") + prevstate = random.getstate() + random.seed(validation_seed) + random.shuffle(paths) + random.setstate(prevstate) + else: + random.shuffle(paths) + + if is_train: + return paths[0:math.ceil(len(paths) * (1 - validation_split))] + else: + return paths[len(paths) - round(len(paths) * validation_split):] def split_train_val(paths, is_train, validation_split, validation_seed): if validation_seed is not None: @@ -1352,7 +1366,6 @@ def __init__( self.is_train = is_train self.validation_split = validation_split self.validation_seed = validation_seed - self.batch_size = batch_size self.size = min(self.width, self.height) # 短いほう self.prior_loss_weight = prior_loss_weight @@ -1405,10 +1418,9 @@ def load_dreambooth_dir(subset: DreamBoothSubset): return [], [] img_paths = glob_images(subset.image_dir, "*") - if self.validation_split > 0.0: - img_paths = split_train_val(img_paths, self.is_train, self.validation_split, self.validation_seed) - print(f"found directory {subset.image_dir} contains {len(img_paths)} image files") + img_paths = split_train_val(img_paths, self.is_train, self.validation_split, self.validation_seed) + logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う captions = [] diff --git a/train_network.py b/train_network.py index edd3ff944..48885503f 100644 --- a/train_network.py +++ b/train_network.py @@ -130,7 +130,9 @@ def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_cond def sample_images(self, accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet): train_util.sample_images(accelerator, args, epoch, global_step, device, vae, tokenizer, text_encoder, unet) - def process_batch(self, batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, train_text_encoder=True): + def process_batch(self, batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, train_text_encoder=True, timesteps_list=None): + total_loss = 0.0 + with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device) @@ -167,37 +169,40 @@ def process_batch(self, batch, is_train, tokenizers, text_encoders, unet, vae, n args, noise_scheduler, latents ) - # Predict 
the noise residual - with torch.set_grad_enabled(is_train), accelerator.autocast(): - noise_pred = self.call_unet( - args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype - ) - - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - target = noise + # Use input timesteps_list or use described timesteps above + timesteps_list = timesteps_list or [timesteps] + for timesteps in timesteps_list: + # Predict the noise residual + with torch.set_grad_enabled(is_train), accelerator.autocast(): + noise_pred = self.call_unet( + args, accelerator, unet, noisy_latents, timesteps, text_encoder_conds, batch, weight_dtype + ) - loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") - loss = loss.mean([1, 2, 3]) + if args.v_parameterization: + # v-parameterization training + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + target = noise - loss_weights = batch["loss_weights"].to(accelerator.device) # 各sampleごとのweight - loss = loss * loss_weights + loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + loss = loss.mean([1, 2, 3]) - if args.min_snr_gamma: - loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma) - if args.scale_v_pred_loss_like_noise_pred: - loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) - if args.v_pred_like_loss: - loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss) - if args.debiased_estimation_loss: - loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) + loss_weights = batch["loss_weights"].to(accelerator.device) # 各sampleごとのweight + loss = loss * loss_weights - loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし + if args.min_snr_gamma: + loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma) + if args.scale_v_pred_loss_like_noise_pred: + loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler) + if args.v_pred_like_loss: + loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss) + if args.debiased_estimation_loss: + loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) - return loss + total_loss += loss.mean() # 平均なのでbatch_sizeで割る必要なし + average_loss = total_loss / len(timesteps_list) + return average_loss def train(self, args): session_id = random.randint(0, 2**32) @@ -283,10 +288,10 @@ def train(self, args): train_dataset_group.is_latent_cacheable() ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" if val_dataset_group is not None: - assert ( - val_dataset_group.is_latent_cacheable() - ), "when caching validation latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" - + assert ( + val_dataset_group.is_latent_cacheable() + ), "when caching validation latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + self.assert_extra_args(args, train_dataset_group) # acceleratorを準備する @@ -430,6 +435,15 @@ def train(self, args): num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) + + val_dataloader = torch.utils.data.DataLoader( + val_dataset_group if val_dataset_group is not None else [], + shuffle=False, + batch_size=1, + collate_fn=collator, + num_workers=n_workers, + 
persistent_workers=args.persistent_data_loader_workers, + ) val_dataloader = torch.utils.data.DataLoader( val_dataset_group if val_dataset_group is not None else [], @@ -798,7 +812,6 @@ def train(self, args): loss_recorder = train_util.LossRecorder() val_loss_recorder = train_util.LossRecorder() - del train_dataset_group # callback for step start @@ -848,7 +861,6 @@ def remove_model(old_ckpt_name): on_step_start(text_encoder, unet) is_train = True loss = self.process_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, train_text_encoder=train_text_encoder) - accelerator.backward(loss) if accelerator.sync_gradients and args.max_grad_norm != 0.0: params_to_clip = network.get_trainable_params() @@ -900,7 +912,25 @@ def remove_model(old_ckpt_name): if args.logging_dir is not None: logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) accelerator.log(logs, step=global_step) - + + if global_step % 25 == 0: + if len(val_dataloader) > 0: + print("Validating バリデーション処理...") + + with torch.no_grad(): + val_dataloader_iter = iter(val_dataloader) + batch = next(val_dataloader_iter) + is_train = False + loss = self.process_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, timesteps_list=[10, 350, 500, 650, 990]) + + current_loss = loss.detach().item() + val_loss_recorder.add(epoch=epoch, step=global_step, loss=current_loss) + + if args.logging_dir is not None: + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/validation_current": current_loss} + accelerator.log(logs, step=global_step) + if global_step >= args.max_train_steps: break @@ -912,7 +942,7 @@ def remove_model(old_ckpt_name): with torch.no_grad(): for val_step, batch in enumerate(val_dataloader): is_train = False - loss = self.process_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) + loss = self.process_batch(batch, is_train, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, timesteps_list=[10, 350, 500, 650, 990]) current_loss = loss.detach().item() val_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) @@ -933,6 +963,12 @@ def remove_model(old_ckpt_name): logs = {"loss/epoch_average": loss_recorder.moving_average} accelerator.log(logs, step=epoch + 1) + if len(val_dataloader) > 0: + if args.logging_dir is not None: + avr_loss: float = val_loss_recorder.moving_average + logs = {"loss/validation_epoch_average": avr_loss} + accelerator.log(logs, step=epoch + 1) + accelerator.wait_for_everyone() # 指定エポックごとにモデルを保存 From 449c1c5c502375713e609ad9e00e747b4013063a Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 2 Jan 2025 15:59:20 -0500 Subject: [PATCH 286/582] Adding modified train_util and config_util --- library/config_util.py | 1 - library/train_util.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index cb2c5b68f..727e1a409 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -84,7 +84,6 @@ class BaseDatasetParams: tokenizer: Union[CLIPTokenizer, List[CLIPTokenizer]] = None max_token_length: int = None resolution: Optional[Tuple[int, int]] = None - network_multiplier: float = 1.0 debug_dataset: bool = False validation_seed: Optional[int] = None validation_split: float = 0.0 diff --git a/library/train_util.py 
b/library/train_util.py index 2364d62b3..394337397 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1420,7 +1420,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): img_paths = glob_images(subset.image_dir, "*") if self.validation_split > 0.0: img_paths = split_train_val(img_paths, self.is_train, self.validation_split, self.validation_seed) - logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") + print(f"found directory {subset.image_dir} contains {len(img_paths)} image files") # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う captions = [] From 7470173044ca5b700bc4723709bd9c012e2216f3 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 01:13:57 -0500 Subject: [PATCH 287/582] Remove defunct code for train_controlnet.py --- train_controlnet.py | 569 -------------------------------------------- 1 file changed, 569 deletions(-) diff --git a/train_controlnet.py b/train_controlnet.py index 09a911a00..365e35c8c 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -6,577 +6,8 @@ logger = logging.getLogger(__name__) -<<<<<<< HEAD -# TODO 他のスクリプトと共通化する -def generate_step_logs(args: argparse.Namespace, current_loss, avr_loss, lr_scheduler): - logs = { - "loss/current": current_loss, - "loss/average": avr_loss, - "lr": lr_scheduler.get_last_lr()[0], - } - - if args.optimizer_type.lower().startswith("DAdapt".lower()): - logs["lr/d*lr"] = lr_scheduler.optimizers[-1].param_groups[0]["d"] * lr_scheduler.optimizers[-1].param_groups[0]["lr"] - - return logs - - -def train(args): - # session_id = random.randint(0, 2**32) - # training_started_at = time.time() - train_util.verify_training_args(args) - train_util.prepare_dataset_args(args, True) - setup_logging(args, reset=True) - - cache_latents = args.cache_latents - use_user_config = args.dataset_config is not None - - if args.seed is None: - args.seed = random.randint(0, 2**32) - set_seed(args.seed) - - tokenizer = train_util.load_tokenizer(args) - - # データセットを準備する - blueprint_generator = BlueprintGenerator(ConfigSanitizer(False, False, True, True)) - if use_user_config: - logger.info(f"Load dataset config from {args.dataset_config}") - user_config = config_util.load_user_config(args.dataset_config) - ignored = ["train_data_dir", "conditioning_data_dir"] - if any(getattr(args, attr) is not None for attr in ignored): - logger.warning( - "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( - ", ".join(ignored) - ) - ) - else: - user_config = { - "datasets": [ - { - "subsets": config_util.generate_controlnet_subsets_config_by_subdirs( - args.train_data_dir, - args.conditioning_data_dir, - args.caption_extension, - ) - } - ] - } - - blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) - - current_epoch = Value("i", 0) - current_step = Value("i", 0) - ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) - - if args.debug_dataset: - train_util.debug_dataset(train_dataset_group) - return - if len(train_dataset_group) == 0: - logger.error( - "No data found. 
Please verify arguments (train_data_dir must be the parent of folders with images) / 画像がありません。引数指定を確認してください(train_data_dirには画像があるフォルダではなく、画像があるフォルダの親フォルダを指定する必要があります)" - ) - return - - if cache_latents: - assert ( - train_dataset_group.is_latent_cacheable() - ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" - - # acceleratorを準備する - logger.info("prepare accelerator") - accelerator = train_util.prepare_accelerator(args) - is_main_process = accelerator.is_main_process - - # mixed precisionに対応した型を用意しておき適宜castする - weight_dtype, save_dtype = train_util.prepare_dtype(args) - - # モデルを読み込む - text_encoder, vae, unet, _ = train_util.load_target_model( - args, weight_dtype, accelerator, unet_use_linear_projection_in_v2=True - ) - - # DiffusersのControlNetが使用するデータを準備する - if args.v2: - unet.config = { - "act_fn": "silu", - "attention_head_dim": [5, 10, 20, 20], - "block_out_channels": [320, 640, 1280, 1280], - "center_input_sample": False, - "cross_attention_dim": 1024, - "down_block_types": ["CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"], - "downsample_padding": 1, - "dual_cross_attention": False, - "flip_sin_to_cos": True, - "freq_shift": 0, - "in_channels": 4, - "layers_per_block": 2, - "mid_block_scale_factor": 1, - "norm_eps": 1e-05, - "norm_num_groups": 32, - "num_class_embeds": None, - "only_cross_attention": False, - "out_channels": 4, - "sample_size": 96, - "up_block_types": ["UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"], - "use_linear_projection": True, - "upcast_attention": True, - "only_cross_attention": False, - "downsample_padding": 1, - "use_linear_projection": True, - "class_embed_type": None, - "num_class_embeds": None, - "resnet_time_scale_shift": "default", - "projection_class_embeddings_input_dim": None, - } - else: - unet.config = { - "act_fn": "silu", - "attention_head_dim": 8, - "block_out_channels": [320, 640, 1280, 1280], - "center_input_sample": False, - "cross_attention_dim": 768, - "down_block_types": ["CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"], - "downsample_padding": 1, - "flip_sin_to_cos": True, - "freq_shift": 0, - "in_channels": 4, - "layers_per_block": 2, - "mid_block_scale_factor": 1, - "norm_eps": 1e-05, - "norm_num_groups": 32, - "out_channels": 4, - "sample_size": 64, - "up_block_types": ["UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"], - "only_cross_attention": False, - "downsample_padding": 1, - "use_linear_projection": False, - "class_embed_type": None, - "num_class_embeds": None, - "upcast_attention": False, - "resnet_time_scale_shift": "default", - "projection_class_embeddings_input_dim": None, - } - unet.config = SimpleNamespace(**unet.config) - - controlnet = ControlNetModel.from_unet(unet) - - if args.controlnet_model_name_or_path: - filename = args.controlnet_model_name_or_path - if os.path.isfile(filename): - if os.path.splitext(filename)[1] == ".safetensors": - state_dict = load_file(filename) - else: - state_dict = torch.load(filename) - state_dict = model_util.convert_controlnet_state_dict_to_diffusers(state_dict) - controlnet.load_state_dict(state_dict) - elif os.path.isdir(filename): - controlnet = ControlNetModel.from_pretrained(filename) - - # モデルに xformers とか memory efficient attention を組み込む - train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa) - - # 学習を準備する - if cache_latents: - 
vae.to(accelerator.device, dtype=weight_dtype) - vae.requires_grad_(False) - vae.eval() - with torch.no_grad(): - train_dataset_group.cache_latents( - vae, - args.vae_batch_size, - args.cache_latents_to_disk, - accelerator.is_main_process, - ) - vae.to("cpu") - clean_memory_on_device(accelerator.device) - - accelerator.wait_for_everyone() - - if args.gradient_checkpointing: - controlnet.enable_gradient_checkpointing() - - # 学習に必要なクラスを準備する - accelerator.print("prepare optimizer, data loader etc.") - - trainable_params = controlnet.parameters() - - _, _, optimizer = train_util.get_optimizer(args, trainable_params) - - # dataloaderを準備する - # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 - n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers - - train_dataloader = torch.utils.data.DataLoader( - train_dataset_group, - batch_size=1, - shuffle=True, - collate_fn=collator, - num_workers=n_workers, - persistent_workers=args.persistent_data_loader_workers, - ) - - # 学習ステップ数を計算する - if args.max_train_epochs is not None: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print( - f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) - - # データセット側にも学習ステップを送信 - train_dataset_group.set_max_train_steps(args.max_train_steps) - - # lr schedulerを用意する - lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) - - # 実験的機能:勾配も含めたfp16学習を行う モデル全体をfp16にする - if args.full_fp16: - assert ( - args.mixed_precision == "fp16" - ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" - accelerator.print("enable full fp16 training.") - controlnet.to(weight_dtype) - - # acceleratorがなんかよろしくやってくれるらしい - controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - controlnet, optimizer, train_dataloader, lr_scheduler - ) - - unet.requires_grad_(False) - text_encoder.requires_grad_(False) - unet.to(accelerator.device) - text_encoder.to(accelerator.device) - - # transform DDP after prepare - controlnet = controlnet.module if isinstance(controlnet, DDP) else controlnet - - controlnet.train() - - if not cache_latents: - vae.requires_grad_(False) - vae.eval() - vae.to(accelerator.device, dtype=weight_dtype) - - # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする - if args.full_fp16: - train_util.patch_accelerator_for_fp16_training(accelerator) - - # resumeする - train_util.resume_from_local_or_hf_if_specified(accelerator, args) - - # epoch数を計算する - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0): - args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1 - - # 学習する - # TODO: find a way to handle total batch size when there are multiple datasets - accelerator.print("running training / 学習開始") - accelerator.print(f" num train images * repeats / 学習画像の数×繰り返し回数: {train_dataset_group.num_train_images}") - accelerator.print(f" num reg images / 正則化画像の数: {train_dataset_group.num_reg_images}") - accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}") - accelerator.print(f" num epochs / epoch数: {num_train_epochs}") - accelerator.print( - f" batch size per device / 
バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}" - ) - # logger.info(f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}") - accelerator.print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}") - accelerator.print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}") - - progress_bar = tqdm( - range(args.max_train_steps), - smoothing=0, - disable=not accelerator.is_local_main_process, - desc="steps", - ) - global_step = 0 - - noise_scheduler = DDPMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - clip_sample=False, - ) - if accelerator.is_main_process: - init_kwargs = {} - if args.wandb_run_name: - init_kwargs["wandb"] = {"name": args.wandb_run_name} - if args.log_tracker_config is not None: - init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers( - "controlnet_train" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs - ) - - loss_recorder = train_util.LossRecorder() - del train_dataset_group - - # function for saving/removing - def save_model(ckpt_name, model, force_sync_upload=False): - os.makedirs(args.output_dir, exist_ok=True) - ckpt_file = os.path.join(args.output_dir, ckpt_name) - - accelerator.print(f"\nsaving checkpoint: {ckpt_file}") - - state_dict = model_util.convert_controlnet_state_dict_to_sd(model.state_dict()) - - if save_dtype is not None: - for key in list(state_dict.keys()): - v = state_dict[key] - v = v.detach().clone().to("cpu").to(save_dtype) - state_dict[key] = v - - if os.path.splitext(ckpt_file)[1] == ".safetensors": - from safetensors.torch import save_file - - save_file(state_dict, ckpt_file) - else: - torch.save(state_dict, ckpt_file) - - if args.huggingface_repo_id is not None: - huggingface_util.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload) - - def remove_model(old_ckpt_name): - old_ckpt_file = os.path.join(args.output_dir, old_ckpt_name) - if os.path.exists(old_ckpt_file): - accelerator.print(f"removing old checkpoint: {old_ckpt_file}") - os.remove(old_ckpt_file) - - # For --sample_at_first - train_util.sample_images( - accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet, controlnet=controlnet - ) - - # training loop - for epoch in range(num_train_epochs): - if is_main_process: - accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") - current_epoch.value = epoch + 1 - - for step, batch in enumerate(train_dataloader): - current_step.value = global_step - with accelerator.accumulate(controlnet): - with torch.no_grad(): - if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) - else: - # latentに変換 - latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() - latents = latents * 0.18215 - b_size = latents.shape[0] - - input_ids = batch["input_ids"].to(accelerator.device) - encoder_hidden_states = train_util.get_hidden_states(args, input_ids, tokenizer, text_encoder, weight_dtype) - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents, device=latents.device) - if args.noise_offset: - noise = apply_noise_offset(latents, noise, args.noise_offset, args.adaptive_noise_scale) - elif args.multires_noise_iterations: - noise = pyramid_noise_like( - noise, - latents.device, - args.multires_noise_iterations, - 
args.multires_noise_discount, - ) - - # Sample a random timestep for each image - timesteps = train_util.get_timesteps(args, 0, noise_scheduler.config.num_train_timesteps, b_size) - huber_c = train_util.get_huber_c(args, noise_scheduler, timesteps.item(), latents.device) - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - controlnet_image = batch["conditioning_images"].to(dtype=weight_dtype) - - with accelerator.autocast(): - down_block_res_samples, mid_block_res_sample = controlnet( - noisy_latents, - timesteps, - encoder_hidden_states=encoder_hidden_states, - controlnet_cond=controlnet_image, - return_dict=False, - ) - - # Predict the noise residual - noise_pred = unet( - noisy_latents, - timesteps, - encoder_hidden_states, - down_block_additional_residuals=[sample.to(dtype=weight_dtype) for sample in down_block_res_samples], - mid_block_additional_residual=mid_block_res_sample.to(dtype=weight_dtype), - ).sample - - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - target = noise - - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) - loss = loss.mean([1, 2, 3]) - - loss_weights = batch["loss_weights"] # 各sampleごとのweight - loss = loss * loss_weights - - if args.min_snr_gamma: - loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma, args.v_parameterization) - - loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし - - accelerator.backward(loss) - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - params_to_clip = controlnet.parameters() - accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad(set_to_none=True) - - # Checks if the accelerator has performed an optimization step behind the scenes - if accelerator.sync_gradients: - progress_bar.update(1) - global_step += 1 - - train_util.sample_images( - accelerator, - args, - None, - global_step, - accelerator.device, - vae, - tokenizer, - text_encoder, - unet, - controlnet=controlnet, - ) - - # 指定ステップごとにモデルを保存 - if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: - accelerator.wait_for_everyone() - if accelerator.is_main_process: - ckpt_name = train_util.get_step_ckpt_name(args, "." + args.save_model_as, global_step) - save_model( - ckpt_name, - accelerator.unwrap_model(controlnet), - ) - - if args.save_state: - train_util.save_and_remove_state_stepwise(args, accelerator, global_step) - - remove_step_no = train_util.get_remove_step_no(args, global_step) - if remove_step_no is not None: - remove_ckpt_name = train_util.get_step_ckpt_name(args, "." 
+ args.save_model_as, remove_step_no) - remove_model(remove_ckpt_name) - - current_loss = loss.detach().item() - loss_recorder.add(epoch=epoch, step=step, loss=current_loss) - avr_loss: float = loss_recorder.moving_average - logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} - progress_bar.set_postfix(**logs) - - if args.logging_dir is not None: - logs = generate_step_logs(args, current_loss, avr_loss, lr_scheduler) - accelerator.log(logs, step=global_step) - - if global_step >= args.max_train_steps: - break - - if args.logging_dir is not None: - logs = {"loss/epoch": loss_recorder.moving_average} - accelerator.log(logs, step=epoch + 1) - - accelerator.wait_for_everyone() - - # 指定エポックごとにモデルを保存 - if args.save_every_n_epochs is not None: - saving = (epoch + 1) % args.save_every_n_epochs == 0 and (epoch + 1) < num_train_epochs - if is_main_process and saving: - ckpt_name = train_util.get_epoch_ckpt_name(args, "." + args.save_model_as, epoch + 1) - save_model(ckpt_name, accelerator.unwrap_model(controlnet)) - - remove_epoch_no = train_util.get_remove_epoch_no(args, epoch + 1) - if remove_epoch_no is not None: - remove_ckpt_name = train_util.get_epoch_ckpt_name(args, "." + args.save_model_as, remove_epoch_no) - remove_model(remove_ckpt_name) - - if args.save_state: - train_util.save_and_remove_state_on_epoch_end(args, accelerator, epoch + 1) - - train_util.sample_images( - accelerator, - args, - epoch + 1, - global_step, - accelerator.device, - vae, - tokenizer, - text_encoder, - unet, - controlnet=controlnet, - ) - - # end of epoch - if is_main_process: - controlnet = accelerator.unwrap_model(controlnet) - - accelerator.end_training() - - if is_main_process and (args.save_state or args.save_state_on_train_end): - train_util.save_state_on_train_end(args, accelerator) - - # del accelerator # この後メモリを使うのでこれは消す→printで使うので消さずにおく - - if is_main_process: - ckpt_name = train_util.get_last_ckpt_name(args, "." 
+ args.save_model_as) - save_model(ckpt_name, controlnet, force_sync_upload=True) - - logger.info("model saved.") - - -def setup_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - - add_logging_arguments(parser) - train_util.add_sd_models_arguments(parser) - train_util.add_dataset_arguments(parser, False, True, True) - train_util.add_training_arguments(parser, False) - deepspeed_utils.add_deepspeed_arguments(parser) - train_util.add_optimizer_arguments(parser) - config_util.add_config_arguments(parser) - custom_train_functions.add_custom_train_arguments(parser) - - parser.add_argument( - "--save_model_as", - type=str, - default="safetensors", - choices=[None, "ckpt", "pt", "safetensors"], - help="format to save the model (default is .safetensors) / モデル保存時の形式(デフォルトはsafetensors)", - ) - parser.add_argument( - "--controlnet_model_name_or_path", - type=str, - default=None, - help="controlnet model name or path / controlnetのモデル名またはパス", - ) - parser.add_argument( - "--conditioning_data_dir", - type=str, - default=None, - help="conditioning data directory / 条件付けデータのディレクトリ", - ) - - return parser - -======= from library import train_util from train_control_net import setup_parser, train ->>>>>>> hina/feature/val-loss if __name__ == "__main__": logger.warning( From 534059dea517d44de387e7d467d64209f9dcfba2 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 01:18:15 -0500 Subject: [PATCH 288/582] Typos and lingering is_train --- library/config_util.py | 2 +- library/train_util.py | 4 ---- train_network.py | 6 +++--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index a09d2c7ca..418c179dc 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -535,7 +535,7 @@ def print_info(_datasets): shuffle_caption: {subset.shuffle_caption} keep_tokens: {subset.keep_tokens} caption_dropout_rate: {subset.caption_dropout_rate} - caption_dropout_every_n_epoches: {subset.caption_dropout_every_n_epochs} + caption_dropout_every_n_epochs: {subset.caption_dropout_every_n_epochs} caption_tag_dropout_rate: {subset.caption_tag_dropout_rate} caption_prefix: {subset.caption_prefix} caption_suffix: {subset.caption_suffix} diff --git a/library/train_util.py b/library/train_util.py index bf1b6731c..220d4702b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2092,7 +2092,6 @@ def __init__( bucket_reso_steps: int, bucket_no_upscale: bool, debug_dataset: bool, - is_train: bool, validation_seed: int, validation_split: float, ) -> None: @@ -2312,7 +2311,6 @@ class ControlNetDataset(BaseDataset): def __init__( self, subsets: Sequence[ControlNetSubset], - is_train: bool, batch_size: int, resolution, network_multiplier: float, @@ -2362,7 +2360,6 @@ def __init__( self.dreambooth_dataset_delegate = DreamBoothDataset( db_subsets, - is_train, batch_size, resolution, network_multiplier, @@ -2382,7 +2379,6 @@ def __init__( self.batch_size = batch_size self.num_train_images = self.dreambooth_dataset_delegate.num_train_images self.num_reg_images = self.dreambooth_dataset_delegate.num_reg_images - self.is_train = is_train self.validation_split = validation_split self.validation_seed = validation_seed diff --git a/train_network.py b/train_network.py index 99b9717a5..4bcfc0ac7 100644 --- a/train_network.py +++ b/train_network.py @@ -380,11 +380,11 @@ def pick_timesteps_list() -> torch.IntTensor: else: return typing.cast(torch.IntTensor, torch.tensor(timesteps_list).unsqueeze(1).repeat(1, batch_size).to(latents.device)) - 
choosen_timesteps_list = pick_timesteps_list() + chosen_timesteps_list = pick_timesteps_list() total_loss = torch.zeros((batch_size, 1)).to(latents.device) # Use input timesteps_list or use described timesteps above - for fixed_timestep in choosen_timesteps_list: + for fixed_timestep in chosen_timesteps_list: fixed_timestep = typing.cast(torch.IntTensor, fixed_timestep) # Predict the noise residual @@ -447,7 +447,7 @@ def pick_timesteps_list() -> torch.IntTensor: total_loss += loss - return total_loss / len(choosen_timesteps_list) + return total_loss / len(chosen_timesteps_list) def train(self, args): session_id = random.randint(0, 2**32) From c8c3569df292109fe3be4d209c9f6131afe2ba5f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 01:26:45 -0500 Subject: [PATCH 289/582] Cleanup order, types, print to logger --- library/config_util.py | 7 +++---- library/train_util.py | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 418c179dc..5a4d3aa2d 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -485,7 +485,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params)) datasets.append(dataset) - val_datasets:List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] + val_datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] for dataset_blueprint in dataset_group_blueprint.datasets: if dataset_blueprint.params.validation_split <= 0.0: continue @@ -503,7 +503,6 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params)) val_datasets.append(dataset) - # print info def print_info(_datasets): info = "" for i, dataset in enumerate(_datasets): @@ -565,7 +564,7 @@ def print_info(_datasets): print_info(datasets) if len(val_datasets) > 0: - print("Validation dataset") + logger.info("Validation dataset") print_info(val_datasets) if len(val_datasets) > 0: @@ -610,7 +609,7 @@ def print_info(_datasets): " ", ) - logger.info(f"{info}") + logger.info(info) # make buckets first because it determines the length of dataset # and set the same seed for all datasets diff --git a/library/train_util.py b/library/train_util.py index 220d4702b..782f57e8f 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1833,9 +1833,9 @@ def __init__( bucket_reso_steps: int, bucket_no_upscale: bool, prior_loss_weight: float, + debug_dataset: bool, validation_split: float, validation_seed: Optional[int], - debug_dataset, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset) @@ -2319,9 +2319,9 @@ def __init__( max_bucket_reso: int, bucket_reso_steps: int, bucket_no_upscale: bool, + debug_dataset: bool, validation_split: float, validation_seed: Optional[int], - debug_dataset: float, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset) @@ -2369,9 +2369,9 @@ def __init__( bucket_reso_steps, bucket_no_upscale, 1.0, + debug_dataset, validation_split, validation_seed, - debug_dataset ) # config_util等から参照される値をいれておく(若干微妙なのでなんとかしたい) From fbfc2753eb7fa57724eb525ee65d851b5e80b8ea Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 01:53:12 -0500 Subject: [PATCH 290/582] Update text for train/reg with repeats --- library/train_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/train_util.py 
b/library/train_util.py index 782f57e8f..77a6a9f9a 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2050,11 +2050,11 @@ def load_dreambooth_dir(subset: DreamBoothSubset): subset.img_count = len(img_paths) self.subsets.append(subset) - logger.info(f"{num_train_images} images with repeating.") + logger.info(f"{num_train_images} train images with repeats.") self.num_train_images = num_train_images - logger.info(f"{num_reg_images} reg images.") + logger.info(f"{num_reg_images} reg images with repeats.") if num_train_images < num_reg_images: logger.warning("some of reg images are not used / 正則化画像の数が多いので、一部使用されない正則化画像があります") From 58bfa36d0275d864d5a2d64c51632e808f789ddd Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 02:00:28 -0500 Subject: [PATCH 291/582] Add seed help clarifying info --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 4bcfc0ac7..7d064d210 100644 --- a/train_network.py +++ b/train_network.py @@ -1639,7 +1639,7 @@ def setup_parser() -> argparse.ArgumentParser: "--validation_seed", type=int, default=None, - help="Validation seed / 検証シード" + help="Validation seed for shuffling validation dataset, training `--seed` used otherwise / 検証シード" ) parser.add_argument( "--validation_split", From 6604b36044a83f3531faed508096f3e6bfe48fc9 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 02:04:59 -0500 Subject: [PATCH 292/582] Remove duplicate assignment --- library/train_util.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 77a6a9f9a..3710c865d 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -86,8 +86,6 @@ import library.deepspeed_utils as deepspeed_utils from library.utils import setup_logging, pil_resize - - setup_logging() import logging @@ -1841,8 +1839,6 @@ def __init__( assert resolution is not None, f"resolution is required / resolution(解像度)指定は必須です" - self.validation_split = validation_split - self.validation_seed = validation_seed self.batch_size = batch_size self.size = min(self.width, self.height) # 短いほう self.prior_loss_weight = prior_loss_weight From 0522070d197d92745dbdb408d74c9c3f869bff76 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 15:20:25 -0500 Subject: [PATCH 293/582] Fix training, validation split, revert to using upstream implemenation --- library/config_util.py | 67 +++----------- library/custom_train_functions.py | 6 +- library/strategy_sd.py | 2 +- library/train_util.py | 143 +++++++++++++++++------------- train_network.py | 94 ++++++++++++-------- 5 files changed, 152 insertions(+), 160 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 5a4d3aa2d..63d28c969 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -482,7 +482,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset_klass = FineTuningDataset subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] - dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params)) + dataset = dataset_klass(subsets=subsets, is_training_dataset=True, **asdict(dataset_blueprint.params)) datasets.append(dataset) val_datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] @@ -500,16 +500,16 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu dataset_klass = FineTuningDataset subsets = 
[subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] - dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params)) + dataset = dataset_klass(subsets=subsets, is_training_dataset=False, **asdict(dataset_blueprint.params)) val_datasets.append(dataset) - def print_info(_datasets): + def print_info(_datasets, dataset_type: str): info = "" for i, dataset in enumerate(_datasets): is_dreambooth = isinstance(dataset, DreamBoothDataset) is_controlnet = isinstance(dataset, ControlNetDataset) info += dedent(f"""\ - [Dataset {i}] + [{dataset_type} {i}] batch_size: {dataset.batch_size} resolution: {(dataset.width, dataset.height)} enable_bucket: {dataset.enable_bucket} @@ -527,7 +527,7 @@ def print_info(_datasets): for j, subset in enumerate(dataset.subsets): info += indent(dedent(f"""\ - [Subset {j} of Dataset {i}] + [Subset {j} of {dataset_type} {i}] image_dir: "{subset.image_dir}" image_count: {subset.img_count} num_repeats: {subset.num_repeats} @@ -544,8 +544,8 @@ def print_info(_datasets): random_crop: {subset.random_crop} token_warmup_min: {subset.token_warmup_min}, token_warmup_step: {subset.token_warmup_step}, - alpha_mask: {subset.alpha_mask} - custom_attributes: {subset.custom_attributes} + alpha_mask: {subset.alpha_mask} + custom_attributes: {subset.custom_attributes} """), " ") if is_dreambooth: @@ -561,67 +561,22 @@ def print_info(_datasets): logger.info(info) - print_info(datasets) + print_info(datasets, "Dataset") if len(val_datasets) > 0: - logger.info("Validation dataset") - print_info(val_datasets) - - if len(val_datasets) > 0: - info = "" - - for i, dataset in enumerate(val_datasets): - info += dedent( - f"""\ - [Validation Dataset {i}] - batch_size: {dataset.batch_size} - resolution: {(dataset.width, dataset.height)} - enable_bucket: {dataset.enable_bucket} - network_multiplier: {dataset.network_multiplier} - """ - ) - - if dataset.enable_bucket: - info += indent( - dedent( - f"""\ - min_bucket_reso: {dataset.min_bucket_reso} - max_bucket_reso: {dataset.max_bucket_reso} - bucket_reso_steps: {dataset.bucket_reso_steps} - bucket_no_upscale: {dataset.bucket_no_upscale} - \n""" - ), - " ", - ) - else: - info += "\n" - - for j, subset in enumerate(dataset.subsets): - info += indent( - dedent( - f"""\ - [Subset {j} of Validation Dataset {i}] - image_dir: "{subset.image_dir}" - image_count: {subset.img_count} - num_repeats: {subset.num_repeats} - """ - ), - " ", - ) - - logger.info(info) + print_info(val_datasets, "Validation Dataset") # make buckets first because it determines the length of dataset # and set the same seed for all datasets seed = random.randint(0, 2**31) # actual seed is seed + epoch_no for i, dataset in enumerate(datasets): - logger.info(f"[Dataset {i}]") + logger.info(f"[Prepare dataset {i}]") dataset.make_buckets() dataset.set_seed(seed) for i, dataset in enumerate(val_datasets): - logger.info(f"[Validation Dataset {i}]") + logger.info(f"[Prepare validation dataset {i}]") dataset.make_buckets() dataset.set_seed(seed) diff --git a/library/custom_train_functions.py b/library/custom_train_functions.py index 9a7c21a3e..ad3e69ffb 100644 --- a/library/custom_train_functions.py +++ b/library/custom_train_functions.py @@ -455,7 +455,7 @@ def get_weighted_text_embeddings( # https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2 -def pyramid_noise_like(noise, device, iterations=6, discount=0.4): +def pyramid_noise_like(noise, device, iterations=6, 
discount=0.4) -> torch.FloatTensor: b, c, w, h = noise.shape # EDIT: w and h get over-written, rename for a different variant! u = torch.nn.Upsample(size=(w, h), mode="bilinear").to(device) for i in range(iterations): @@ -468,7 +468,7 @@ def pyramid_noise_like(noise, device, iterations=6, discount=0.4): # https://www.crosslabs.org//blog/diffusion-with-offset-noise -def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale): +def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale) -> torch.FloatTensor: if noise_offset is None: return noise if adaptive_noise_scale is not None: @@ -484,7 +484,7 @@ def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale): return noise -def apply_masked_loss(loss, batch): +def apply_masked_loss(loss, batch) -> torch.FloatTensor: if "conditioning_images" in batch: # conditioning image is -1 to 1. we need to convert it to 0 to 1 mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel diff --git a/library/strategy_sd.py b/library/strategy_sd.py index d0a3a68bf..a44fc4092 100644 --- a/library/strategy_sd.py +++ b/library/strategy_sd.py @@ -40,7 +40,7 @@ def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: text = [text] if isinstance(text, str) else text return [torch.stack([self._get_input_ids(self.tokenizer, t, self.max_length) for t in text], dim=0)] - def tokenize_with_weights(self, text: str | List[str]) -> Tuple[List[torch.Tensor]]: + def tokenize_with_weights(self, text: str | List[str]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: text = [text] if isinstance(text, str) else text tokens_list = [] weights_list = [] diff --git a/library/train_util.py b/library/train_util.py index 3710c865d..0f16a4f31 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -146,7 +146,15 @@ TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX = "_te_outputs.npz" TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX_SD3 = "_sd3_te.npz" -def split_train_val(paths: List[str], is_train: bool, validation_split: float, validation_seed: int) -> List[str]: +def split_train_val(paths: List[str], is_training_dataset: bool, validation_split: float, validation_seed: int) -> List[str]: + """ + Split the dataset into train and validation + + Shuffle the dataset based on the validation_seed or the current random seed. + For example if the split of 0.2 of 100 images. 
+ [0:79] = 80 training images + [80:] = 20 validation images + """ if validation_seed is not None: print(f"Using validation seed: {validation_seed}") prevstate = random.getstate() @@ -156,9 +164,12 @@ def split_train_val(paths: List[str], is_train: bool, validation_split: float, v else: random.shuffle(paths) - if is_train: + # Split the dataset between training and validation + if is_training_dataset: + # Training dataset we split to the first part return paths[0:math.ceil(len(paths) * (1 - validation_split))] else: + # Validation dataset we split to the second part return paths[len(paths) - round(len(paths) * validation_split):] @@ -1822,6 +1833,7 @@ class DreamBoothDataset(BaseDataset): def __init__( self, subsets: Sequence[DreamBoothSubset], + is_training_dataset: bool, batch_size: int, resolution, network_multiplier: float, @@ -1843,6 +1855,7 @@ def __init__( self.size = min(self.width, self.height) # 短いほう self.prior_loss_weight = prior_loss_weight self.latents_cache = None + self.is_training_dataset = is_training_dataset self.validation_seed = validation_seed self.validation_split = validation_split @@ -1952,6 +1965,9 @@ def load_dreambooth_dir(subset: DreamBoothSubset): size_set_count += 1 logger.info(f"set image size from cache files: {size_set_count}/{len(img_paths)}") + if self.validation_split > 0.0: + img_paths = split_train_val(img_paths, self.is_training_dataset, self.validation_split, self.validation_seed) + logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") if use_cached_info_for_subset: @@ -2046,7 +2062,8 @@ def load_dreambooth_dir(subset: DreamBoothSubset): subset.img_count = len(img_paths) self.subsets.append(subset) - logger.info(f"{num_train_images} train images with repeats.") + images_split_name = "train" if self.is_training_dataset else "validation" + logger.info(f"{num_train_images} {images_split_name} images with repeats.") self.num_train_images = num_train_images @@ -2411,8 +2428,12 @@ def __init__( conditioning_img_paths = [os.path.abspath(p) for p in conditioning_img_paths] # normalize path extra_imgs.extend([p for p in conditioning_img_paths if os.path.splitext(p)[0] not in cond_imgs_with_pair]) - assert len(missing_imgs) == 0, f"missing conditioning data for {len(missing_imgs)} images: {missing_imgs}" - assert len(extra_imgs) == 0, f"extra conditioning data for {len(extra_imgs)} images: {extra_imgs}" + assert ( + len(missing_imgs) == 0 + ), f"missing conditioning data for {len(missing_imgs)} images / 制御用画像が見つかりませんでした: {missing_imgs}" + assert ( + len(extra_imgs) == 0 + ), f"extra conditioning data for {len(extra_imgs)} images / 余分な制御用画像があります: {extra_imgs}" self.conditioning_image_transforms = IMAGE_TRANSFORMS @@ -4586,7 +4607,6 @@ def read_config_from_file(args: argparse.Namespace, parser: argparse.ArgumentPar config_args = argparse.Namespace(**ignore_nesting_dict) args = parser.parse_args(namespace=config_args) args.config_file = os.path.splitext(args.config_file)[0] - logger.info(args.config_file) return args @@ -5880,55 +5900,35 @@ def save_sd_model_on_train_end_common( huggingface_util.upload(args, out_dir, "/" + model_name, force_sync_upload=True) -def get_random_timesteps(args, min_timestep: int, max_timestep: int, batch_size: int, device: torch.device) -> torch.IntTensor: - """ - Get a random timestep between the min and max timesteps - Can error (NotImplementedError) if the loss type is not supported - """ - # TODO: if a huber loss is selected, it will use constant timesteps for each batch - # as. 
In the future there may be a smarter way - if args.loss_type == "huber" or args.loss_type == "smooth_l1": - timesteps = torch.randint(min_timestep, max_timestep, (1,), device="cpu") - timesteps = timesteps.repeat(batch_size).to(device) - elif args.loss_type == "l2": - timesteps = torch.randint(min_timestep, max_timestep, (batch_size,), device=device) - else: - raise NotImplementedError(f"Unknown loss type {args.loss_type}") - - return typing.cast(torch.IntTensor, timesteps) - +def get_timesteps(min_timestep: int, max_timestep: int, b_size: int, device: torch.device = torch.device("cpu")) -> torch.IntTensor: + timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device=device) + return timesteps -def get_huber_c(args, noise_scheduler: DDPMScheduler, timesteps: torch.IntTensor) -> Optional[float]: - """ - Calculate the Huber convolution (huber_c) value - Huber loss is a loss function used in robust regression, that is less sensitive - to outliers in data than the squared error loss. - https://en.wikipedia.org/wiki/Huber_loss - """ - if args.loss_type == "huber" or args.loss_type == "smooth_l1": - if args.huber_schedule == "exponential": - alpha = -math.log(args.huber_c) / noise_scheduler.config.get('num_train_timesteps', 1000) - huber_c = math.exp(-alpha * timesteps.item()) - elif args.huber_schedule == "snr": - if not hasattr(noise_scheduler, "alphas_cumprod"): - raise NotImplementedError("Huber schedule 'snr' is not supported with the current model.") - alphas_cumprod = noise_scheduler.alphas_cumprod.index_select(0, timesteps) - sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5 - huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c - elif args.huber_schedule == "constant": - huber_c = args.huber_c - else: - raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!") - elif args.loss_type == "l2": +def get_huber_threshold_if_needed(args, timesteps: torch.Tensor, noise_scheduler) -> Optional[torch.Tensor]: + if not (args.loss_type == "huber" or args.loss_type == "smooth_l1"): return None + + b_size = timesteps.shape[0] + if args.huber_schedule == "exponential": + alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps + result = torch.exp(-alpha * timesteps) * args.huber_scale + elif args.huber_schedule == "snr": + if not hasattr(noise_scheduler, "alphas_cumprod"): + raise NotImplementedError("Huber schedule 'snr' is not supported with the current model.") + alphas_cumprod = torch.index_select(noise_scheduler.alphas_cumprod, 0, timesteps.cpu()) + sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5 + result = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c + result = result.to(timesteps.device) + elif args.huber_schedule == "constant": + result = torch.full((b_size,), args.huber_c * args.huber_scale, device=timesteps.device) else: - raise NotImplementedError(f"Unknown loss type {args.loss_type}") + raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!") - return huber_c + return result -def modify_noise(args, noise: torch.Tensor, latents: torch.Tensor): +def modify_noise(args, noise: torch.Tensor, latents: torch.Tensor) -> torch.FloatTensor: """ Apply noise modifications like noise offset and multires noise """ @@ -5964,27 +5964,44 @@ def make_random_timesteps(args, noise_scheduler: DDPMScheduler, batch_size: int, max_timestep = noise_scheduler.config.get('num_train_timesteps', 1000) if args.max_timestep is None else args.max_timestep # Sample a random timestep for each image - 
timesteps = get_random_timesteps(args, min_timestep, max_timestep, batch_size, device) + timesteps = get_timesteps(min_timestep, max_timestep, batch_size, device) return timesteps -def get_noise_noisy_latents_and_timesteps(args, noise_scheduler: DDPMScheduler, latents: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.IntTensor, Optional[float]]: - """ - Unified noise, noisy_latents, timesteps and huber loss convolution calculations - """ - batch_size = latents.shape[0] +def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.IntTensor]: + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents, device=latents.device) + if args.noise_offset: + if args.noise_offset_random_strength: + noise_offset = torch.rand(1, device=latents.device) * args.noise_offset + else: + noise_offset = args.noise_offset + noise = custom_train_functions.apply_noise_offset(latents, noise, noise_offset, args.adaptive_noise_scale) + if args.multires_noise_iterations: + noise = custom_train_functions.pyramid_noise_like( + noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount + ) + + # Sample a random timestep for each image + b_size = latents.shape[0] min_timestep = 0 if args.min_timestep is None else args.min_timestep - max_timestep = noise_scheduler.config.get("num_train_timesteps", 1000) if args.max_timestep is None else args.max_timestep + max_timestep = noise_scheduler.config.num_train_timesteps if args.max_timestep is None else args.max_timestep - # A random timestep for each image in the batch - timesteps = get_random_timesteps(args, min_timestep, max_timestep, batch_size, latents.device) - huber_c = get_huber_c(args, noise_scheduler, timesteps) + timesteps = get_timesteps(min_timestep, max_timestep, b_size, latents.device) - noise = make_noise(args, latents) - noisy_latents = get_noisy_latents(args, noise, noise_scheduler, latents, timesteps) + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + if args.ip_noise_gamma: + if args.ip_noise_gamma_random_strength: + strength = torch.rand(1, device=latents.device) * args.ip_noise_gamma + else: + strength = args.ip_noise_gamma + noisy_latents = noise_scheduler.add_noise(latents, noise + strength * torch.randn_like(latents), timesteps) + else: + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - return noise, noisy_latents, timesteps, huber_c + return noise, noisy_latents, timesteps def get_noisy_latents(args, noise: torch.FloatTensor, noise_scheduler: DDPMScheduler, latents: torch.FloatTensor, timesteps: torch.IntTensor) -> torch.FloatTensor: @@ -6015,6 +6032,8 @@ def conditional_loss( elif loss_type == "l1": loss = torch.nn.functional.l1_loss(model_pred, target, reduction=reduction) elif loss_type == "huber": + if huber_c is None: + raise NotImplementedError("huber_c not implemented correctly") huber_c = huber_c.view(-1, 1, 1, 1) loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c) if reduction == "mean": @@ -6022,6 +6041,8 @@ def conditional_loss( elif reduction == "sum": loss = torch.sum(loss) elif loss_type == "smooth_l1": + if huber_c is None: + raise NotImplementedError("huber_c not implemented correctly") huber_c = huber_c.view(-1, 1, 1, 1) loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c) if reduction == "mean": diff --git a/train_network.py 
b/train_network.py index 7d064d210..f870734fd 100644 --- a/train_network.py +++ b/train_network.py @@ -205,10 +205,10 @@ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler) return noise_scheduler - def encode_images_to_latents(self, args, accelerator, vae, images): + def encode_images_to_latents(self, args, vae: AutoencoderKL, images: torch.FloatTensor) -> torch.FloatTensor: return vae.encode(images).latent_dist.sample() - def shift_scale_latents(self, args, latents): + def shift_scale_latents(self, args, latents: torch.FloatTensor) -> torch.FloatTensor: return latents * self.vae_scale_factor def get_noise_pred_and_target( @@ -280,7 +280,7 @@ def get_noise_pred_and_target( return noise_pred, target, timesteps, None - def post_process_loss(self, loss, args, timesteps, noise_scheduler): + def post_process_loss(self, loss, args, timesteps: torch.IntTensor, noise_scheduler) -> torch.FloatTensor: if args.min_snr_gamma: loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma, args.v_parameterization) if args.scale_v_pred_loss_like_noise_pred: @@ -317,20 +317,21 @@ def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, # endregion - def process_batch(self, batch, tokenizers, text_encoders, unet, vae: AutoencoderKL, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy: strategy_sd.SdTextEncodingStrategy, tokenize_strategy: strategy_sd.SdTokenizeStrategy, is_train=True, train_text_encoder=True, train_unet=True, timesteps_list: Optional[List[Number]]=None) -> torch.Tensor: + def process_batch(self, batch, tokenizers, text_encoders, unet, network, vae: AutoencoderKL, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy: strategy_sd.SdTextEncodingStrategy, tokenize_strategy: strategy_sd.SdTokenizeStrategy, is_train=True, train_text_encoder=True, train_unet=True, timesteps_list: Optional[List[Number]]=None) -> torch.Tensor: with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: - latents: torch.Tensor = typing.cast(torch.FloatTensor, batch["latents"].to(accelerator.device)) + latents = typing.cast(torch.FloatTensor, batch["latents"].to(accelerator.device)) else: # latentに変換 - latents: torch.Tensor = typing.cast(torch.FloatTensor, typing.cast(AutoencoderKLOutput, vae.encode(batch["images"].to(accelerator.device, dtype=vae_dtype))).latent_dist.sample()) + latents = self.encode_images_to_latents(args, vae, batch["images"].to(accelerator.device, dtype=vae_dtype)) # NaNが含まれていれば警告を表示し0に置き換える if torch.any(torch.isnan(latents)): accelerator.print("NaN found in latents, replacing with zeros") - latents = typing.cast(torch.FloatTensor, torch.where(torch.isnan(latents), torch.zeros_like(latents), latents)) - latents = typing.cast(torch.FloatTensor, latents * self.vae_scale_factor) + latents = typing.cast(torch.FloatTensor, torch.nan_to_num(latents, 0, out=latents)) + + latents = self.shift_scale_latents(args, latents) text_encoder_conds = [] @@ -384,22 +385,36 @@ def pick_timesteps_list() -> torch.IntTensor: total_loss = torch.zeros((batch_size, 1)).to(latents.device) # Use input timesteps_list or use described timesteps above - for fixed_timestep in chosen_timesteps_list: - fixed_timestep = typing.cast(torch.IntTensor, fixed_timestep) + for fixed_timesteps in chosen_timesteps_list: + fixed_timesteps = typing.cast(torch.IntTensor, fixed_timesteps) # Predict the noise 
residual # and add noise to the latents # with noise offset and/or multires noise if specified - noisy_latents = train_util.get_noisy_latents(args, noise, noise_scheduler, latents, fixed_timestep) + noisy_latents = train_util.get_noisy_latents(args, noise, noise_scheduler, latents, fixed_timesteps) + + # ensure the hidden state will require grad + if args.gradient_checkpointing: + for x in noisy_latents: + x.requires_grad_(True) + for t in text_encoder_conds: + t.requires_grad_(True) with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): noise_pred = self.call_unet( - args, accelerator, unet, noisy_latents.requires_grad_(train_unet), fixed_timestep, text_encoder_conds, batch, weight_dtype + args, + accelerator, + unet, + noisy_latents.requires_grad_(train_unet), + fixed_timesteps, + text_encoder_conds, + batch, + weight_dtype, ) if args.v_parameterization: # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, fixed_timestep) + target = noise_scheduler.get_velocity(latents, noise, fixed_timesteps) else: target = noise @@ -418,7 +433,7 @@ def pick_timesteps_list() -> torch.IntTensor: accelerator, unet, noisy_latents, - timesteps, + fixed_timesteps, text_encoder_conds, batch, weight_dtype, @@ -427,7 +442,8 @@ def pick_timesteps_list() -> torch.IntTensor: network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step target[diff_output_pr_indices] = noise_pred_prior.to(target.dtype) - loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + huber_c = train_util.get_huber_threshold_if_needed(args, fixed_timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) loss = loss.mean([1, 2, 3]) # 平均なのでbatch_sizeで割る必要なし if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): @@ -436,14 +452,7 @@ def pick_timesteps_list() -> torch.IntTensor: loss_weights = batch["loss_weights"].to(accelerator.device) # 各sampleごとのweight loss = loss * loss_weights - if args.min_snr_gamma: - loss = apply_snr_weight(loss, fixed_timestep, noise_scheduler, args.min_snr_gamma) - if args.scale_v_pred_loss_like_noise_pred: - loss = scale_v_prediction_loss_like_noise_prediction(loss, fixed_timestep, noise_scheduler) - if args.v_pred_like_loss: - loss = add_v_prediction_like_loss(loss, fixed_timestep, noise_scheduler, args.v_pred_like_loss) - if args.debiased_estimation_loss: - loss = apply_debiased_estimation(loss, fixed_timestep, noise_scheduler) + loss = self.post_process_loss(loss, args, fixed_timesteps, noise_scheduler) total_loss += loss @@ -526,8 +535,12 @@ def train(self, args): collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) if args.debug_dataset: - train_dataset_group.set_current_strategies() # dasaset needs to know the strategies explicitly + train_dataset_group.set_current_strategies() # dataset needs to know the strategies explicitly train_util.debug_dataset(train_dataset_group) + + if val_dataset_group is not None: + val_dataset_group.set_current_strategies() # dataset needs to know the strategies explicitly + train_util.debug_dataset(val_dataset_group) return if len(train_dataset_group) == 0: logger.error( @@ -753,10 +766,6 @@ def train(self, args): # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) - # Not for sure here. 
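
The fixed timesteps used for validation here (for example [10, 350, 500, 650, 990]) trade randomness for comparability: sampling random timesteps makes the validation loss very noisy, while evaluating the same spread of low, middle and high timesteps every time yields a metric that can be compared across steps and runs. A minimal sketch of that idea, assuming an epsilon-prediction model and a diffusers-style scheduler; `unet`, `noise_scheduler` and `cond` are placeholder names, not the script's actual objects:

    import torch

    def validation_loss_at_fixed_timesteps(unet, noise_scheduler, latents, cond,
                                            timesteps_list=(10, 350, 500, 650, 990)):
        batch_size = latents.shape[0]
        noise = torch.randn_like(latents)
        total = 0.0
        for t in timesteps_list:
            # broadcast one fixed timestep across the batch, as pick_timesteps_list() does
            timesteps = torch.full((batch_size,), t, dtype=torch.long, device=latents.device)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
            with torch.no_grad():
                noise_pred = unet(noisy_latents, timesteps, cond).sample
            total += torch.nn.functional.mse_loss(noise_pred.float(), noise.float())
        # averaging over the fixed list keeps the metric on a stable scale
        return total / len(timesteps_list)

The real process_batch additionally handles v-parameterization, per-sample loss weights and the SNR-based weightings shown above; this sketch only covers the plain epsilon-prediction case.
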
- # if val_dataset_group is not None: - # val_dataset_group.set_max_train_steps(args.max_train_steps) - # lr schedulerを用意する lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) @@ -1304,7 +1313,7 @@ def remove_model(old_ckpt_name): clean_memory_on_device(accelerator.device) for epoch in range(epoch_to_start, num_train_epochs): - accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") + accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}\n") current_epoch.value = epoch + 1 metadata["ss_epoch"] = str(epoch + 1) @@ -1324,7 +1333,7 @@ def remove_model(old_ckpt_name): continue with accelerator.accumulate(training_model): - loss = self.process_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=True, train_text_encoder=train_text_encoder, train_unet=train_unet) + loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=True, train_text_encoder=train_text_encoder, train_unet=train_unet) accelerator.backward(loss) if accelerator.sync_gradients: self.all_reduce_network(accelerator, network) # sync DDP grad manually @@ -1384,7 +1393,8 @@ def remove_model(old_ckpt_name): logs = self.generate_step_logs( args, current_loss, avr_loss, lr_scheduler, lr_descriptions, optimizer, keys_scaled, mean_norm, maximum_norm ) - accelerator.log(logs, step=global_step) + # accelerator.log(logs, step=global_step) + accelerator.log(logs) # VALIDATION PER STEP should_validate = (args.validation_every_n_step is not None @@ -1401,7 +1411,7 @@ def remove_model(old_ckpt_name): if val_step >= validation_steps: break - loss = self.process_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False, timesteps_list=[10, 350, 500, 650, 990]) + loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False, timesteps_list=[10, 350, 500, 650, 990]) val_loss_recorder.add(epoch=epoch, step=val_step, loss=loss.detach().item()) val_progress_bar.update(1) @@ -1409,10 +1419,12 @@ def remove_model(old_ckpt_name): if is_tracking: logs = {"loss/current_val_loss": loss.detach().item()} - accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) + # accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) + accelerator.log(logs) logs = {"loss/average_val_loss": val_loss_recorder.moving_average} - accelerator.log(logs, step=global_step) + # accelerator.log(logs, step=global_step) + accelerator.log(logs) if global_step >= args.max_train_steps: break @@ -1427,7 +1439,7 @@ def remove_model(old_ckpt_name): ) for val_step, batch in enumerate(val_dataloader): - loss = self.process_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False, timesteps_list=[10, 350, 500, 650, 990]) + loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False, timesteps_list=[10, 350, 500, 650, 990]) current_loss = loss.detach().item() 
val_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) @@ -1437,22 +1449,26 @@ def remove_model(old_ckpt_name): if is_tracking: avr_loss: float = val_loss_recorder.moving_average logs = {"loss/validation_current": current_loss} - accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) + # accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) + accelerator.log(logs) if is_tracking: avr_loss: float = val_loss_recorder.moving_average logs = {"loss/validation_average": avr_loss} - accelerator.log(logs, step=epoch + 1) + # accelerator.log(logs, step=epoch + 1) + accelerator.log(logs) # END OF EPOCH if is_tracking: logs = {"loss/epoch_average": loss_recorder.moving_average} - accelerator.log(logs, step=epoch + 1) + # accelerator.log(logs, step=epoch + 1) + accelerator.log(logs) if len(val_dataloader) > 0 and is_tracking: avr_loss: float = val_loss_recorder.moving_average logs = {"loss/validation_epoch_average": avr_loss} - accelerator.log(logs, step=epoch + 1) + # accelerator.log(logs, step=epoch + 1) + accelerator.log(logs) accelerator.wait_for_everyone() From 695f38962ce279adfee3fabb3479b84b1076b4e8 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 15:25:12 -0500 Subject: [PATCH 294/582] Move get_huber_threshold_if_needed --- library/train_util.py | 44 ++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 0f16a4f31..0907a8c03 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5905,27 +5905,6 @@ def get_timesteps(min_timestep: int, max_timestep: int, b_size: int, device: tor return timesteps -def get_huber_threshold_if_needed(args, timesteps: torch.Tensor, noise_scheduler) -> Optional[torch.Tensor]: - if not (args.loss_type == "huber" or args.loss_type == "smooth_l1"): - return None - - b_size = timesteps.shape[0] - if args.huber_schedule == "exponential": - alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps - result = torch.exp(-alpha * timesteps) * args.huber_scale - elif args.huber_schedule == "snr": - if not hasattr(noise_scheduler, "alphas_cumprod"): - raise NotImplementedError("Huber schedule 'snr' is not supported with the current model.") - alphas_cumprod = torch.index_select(noise_scheduler.alphas_cumprod, 0, timesteps.cpu()) - sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5 - result = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c - result = result.to(timesteps.device) - elif args.huber_schedule == "constant": - result = torch.full((b_size,), args.huber_c * args.huber_scale, device=timesteps.device) - else: - raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!") - - return result def modify_noise(args, noise: torch.Tensor, latents: torch.Tensor) -> torch.FloatTensor: @@ -6004,6 +5983,29 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents: torch. 
return noise, noisy_latents, timesteps +def get_huber_threshold_if_needed(args, timesteps: torch.Tensor, noise_scheduler) -> Optional[torch.Tensor]: + if not (args.loss_type == "huber" or args.loss_type == "smooth_l1"): + return None + + b_size = timesteps.shape[0] + if args.huber_schedule == "exponential": + alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps + result = torch.exp(-alpha * timesteps) * args.huber_scale + elif args.huber_schedule == "snr": + if not hasattr(noise_scheduler, "alphas_cumprod"): + raise NotImplementedError("Huber schedule 'snr' is not supported with the current model.") + alphas_cumprod = torch.index_select(noise_scheduler.alphas_cumprod, 0, timesteps.cpu()) + sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5 + result = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c + result = result.to(timesteps.device) + elif args.huber_schedule == "constant": + result = torch.full((b_size,), args.huber_c * args.huber_scale, device=timesteps.device) + else: + raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!") + + return result + + def get_noisy_latents(args, noise: torch.FloatTensor, noise_scheduler: DDPMScheduler, latents: torch.FloatTensor, timesteps: torch.IntTensor) -> torch.FloatTensor: """ Add noise to the latents according to the noise magnitude at each timestep From 1f9ba40b8b70fd08e6b87a70727d5e789666a925 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 15:32:07 -0500 Subject: [PATCH 295/582] Add step break for validation epoch. Remove unused variable --- train_network.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index f870734fd..ce34f26d3 100644 --- a/train_network.py +++ b/train_network.py @@ -1439,6 +1439,9 @@ def remove_model(old_ckpt_name): ) for val_step, batch in enumerate(val_dataloader): + if val_step >= validation_steps: + break + loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False, timesteps_list=[10, 350, 500, 650, 990]) current_loss = loss.detach().item() @@ -1447,7 +1450,6 @@ def remove_model(old_ckpt_name): val_progress_bar.set_postfix({ "val_avg_loss": val_loss_recorder.moving_average }) if is_tracking: - avr_loss: float = val_loss_recorder.moving_average logs = {"loss/validation_current": current_loss} # accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) accelerator.log(logs) From 1c0ae306e551ede5bd162819debb4d80a7fe620b Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 3 Jan 2025 15:43:02 -0500 Subject: [PATCH 296/582] Add missing functions for training batch --- train_network.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index ce34f26d3..377ddf48e 100644 --- a/train_network.py +++ b/train_network.py @@ -318,7 +318,7 @@ def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, # endregion def process_batch(self, batch, tokenizers, text_encoders, unet, network, vae: AutoencoderKL, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy: strategy_sd.SdTextEncodingStrategy, tokenize_strategy: strategy_sd.SdTokenizeStrategy, is_train=True, train_text_encoder=True, train_unet=True, timesteps_list: Optional[List[Number]]=None) -> torch.Tensor: - + with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: 
latents = typing.cast(torch.FloatTensor, batch["latents"].to(accelerator.device)) @@ -1333,6 +1333,11 @@ def remove_model(old_ckpt_name): continue with accelerator.accumulate(training_model): + on_step_start_for_network(text_encoder, unet) + + # temporary, for batch processing + self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) + loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=True, train_text_encoder=train_text_encoder, train_unet=train_unet) accelerator.backward(loss) if accelerator.sync_gradients: From a9c5aa1f9336cedf1e294fd3c8c22bb649d51015 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 5 Jan 2025 22:28:51 +0900 Subject: [PATCH 297/582] add CFG to FLUX.1 sample image --- library/flux_train_utils.py | 156 ++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 50 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index f7f06c5cf..9f954f58c 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -40,7 +40,7 @@ def sample_images( text_encoders, sample_prompts_te_outputs, prompt_replacement=None, - controlnet=None + controlnet=None, ): if steps == 0: if not args.sample_at_first: @@ -101,7 +101,7 @@ def sample_images( steps, sample_prompts_te_outputs, prompt_replacement, - controlnet + controlnet, ) else: # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available) @@ -125,7 +125,7 @@ def sample_images( steps, sample_prompts_te_outputs, prompt_replacement, - controlnet + controlnet, ) torch.set_rng_state(rng_state) @@ -147,14 +147,14 @@ def sample_image_inference( steps, sample_prompts_te_outputs, prompt_replacement, - controlnet + controlnet, ): assert isinstance(prompt_dict, dict) - # negative_prompt = prompt_dict.get("negative_prompt") + negative_prompt = prompt_dict.get("negative_prompt") sample_steps = prompt_dict.get("sample_steps", 20) width = prompt_dict.get("width", 512) height = prompt_dict.get("height", 512) - scale = prompt_dict.get("scale", 3.5) + scale = prompt_dict.get("scale", 1.0) # 1.0 means no guidance seed = prompt_dict.get("seed") controlnet_image = prompt_dict.get("controlnet_image") prompt: str = prompt_dict.get("prompt", "") @@ -162,8 +162,8 @@ def sample_image_inference( if prompt_replacement is not None: prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) - # if negative_prompt is not None: - # negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) + if negative_prompt is not None: + negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) if seed is not None: torch.manual_seed(seed) @@ -173,16 +173,18 @@ def sample_image_inference( torch.seed() torch.cuda.seed() - # if negative_prompt is None: - # negative_prompt = "" + if negative_prompt is None: + negative_prompt = "" height = max(64, height - height % 16) # round to divisible by 16 width = max(64, width - width % 16) # round to divisible by 16 logger.info(f"prompt: {prompt}") - # logger.info(f"negative_prompt: {negative_prompt}") + if scale != 1.0: + logger.info(f"negative_prompt: {negative_prompt}") logger.info(f"height: {height}") logger.info(f"width: {width}") logger.info(f"sample_steps: {sample_steps}") - logger.info(f"scale: {scale}") + if scale != 1.0: + logger.info(f"scale: 
{scale}") # logger.info(f"sample_sampler: {sampler_name}") if seed is not None: logger.info(f"seed: {seed}") @@ -191,26 +193,37 @@ def sample_image_inference( tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() - text_encoder_conds = [] - if sample_prompts_te_outputs and prompt in sample_prompts_te_outputs: - text_encoder_conds = sample_prompts_te_outputs[prompt] - print(f"Using cached text encoder outputs for prompt: {prompt}") - if text_encoders is not None: - print(f"Encoding prompt: {prompt}") - tokens_and_masks = tokenize_strategy.tokenize(prompt) - # strategy has apply_t5_attn_mask option - encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) - - # if text_encoder_conds is not cached, use encoded_text_encoder_conds - if len(text_encoder_conds) == 0: - text_encoder_conds = encoded_text_encoder_conds - else: - # if encoded_text_encoder_conds is not None, update cached text_encoder_conds - for i in range(len(encoded_text_encoder_conds)): - if encoded_text_encoder_conds[i] is not None: - text_encoder_conds[i] = encoded_text_encoder_conds[i] - - l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds + def encode_prompt(prpt): + text_encoder_conds = [] + if sample_prompts_te_outputs and prpt in sample_prompts_te_outputs: + text_encoder_conds = sample_prompts_te_outputs[prpt] + print(f"Using cached text encoder outputs for prompt: {prpt}") + if text_encoders is not None: + print(f"Encoding prompt: {prpt}") + tokens_and_masks = tokenize_strategy.tokenize(prpt) + # strategy has apply_t5_attn_mask option + encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) + + # if text_encoder_conds is not cached, use encoded_text_encoder_conds + if len(text_encoder_conds) == 0: + text_encoder_conds = encoded_text_encoder_conds + else: + # if encoded_text_encoder_conds is not None, update cached text_encoder_conds + for i in range(len(encoded_text_encoder_conds)): + if encoded_text_encoder_conds[i] is not None: + text_encoder_conds[i] = encoded_text_encoder_conds[i] + return text_encoder_conds + + l_pooled, t5_out, txt_ids, t5_attn_mask = encode_prompt(prompt) + # encode negative prompts + if scale != 1.0: + neg_l_pooled, neg_t5_out, _, neg_t5_attn_mask = encode_prompt(negative_prompt) + neg_t5_attn_mask = ( + neg_t5_attn_mask.to(accelerator.device) if args.apply_t5_attn_mask and neg_t5_attn_mask is not None else None + ) + neg_cond = (scale, neg_l_pooled, neg_t5_out, neg_t5_attn_mask) + else: + neg_cond = None # sample image weight_dtype = ae.dtype # TOFO give dtype as argument @@ -235,7 +248,20 @@ def sample_image_inference( controlnet_image = controlnet_image.permute(2, 0, 1).unsqueeze(0).to(weight_dtype).to(accelerator.device) with accelerator.autocast(), torch.no_grad(): - x = denoise(flux, noise, img_ids, t5_out, txt_ids, l_pooled, timesteps=timesteps, guidance=scale, t5_attn_mask=t5_attn_mask, controlnet=controlnet, controlnet_img=controlnet_image) + x = denoise( + flux, + noise, + img_ids, + t5_out, + txt_ids, + l_pooled, + timesteps=timesteps, + guidance=scale, + t5_attn_mask=t5_attn_mask, + controlnet=controlnet, + controlnet_img=controlnet_image, + neg_cond=neg_cond, + ) x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width) @@ -305,22 +331,24 @@ def denoise( model: flux_models.Flux, img: torch.Tensor, img_ids: torch.Tensor, - txt: torch.Tensor, + txt: torch.Tensor, # 
t5_out txt_ids: torch.Tensor, - vec: torch.Tensor, + vec: torch.Tensor, # l_pooled timesteps: list[float], guidance: float = 4.0, t5_attn_mask: Optional[torch.Tensor] = None, controlnet: Optional[flux_models.ControlNetFlux] = None, controlnet_img: Optional[torch.Tensor] = None, + neg_cond: Optional[Tuple[float, torch.Tensor, torch.Tensor, torch.Tensor]] = None, ): # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) - + do_cfg = neg_cond is not None for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) model.prepare_block_swap_before_forward() + if controlnet is not None: block_samples, block_single_samples = controlnet( img=img, @@ -336,20 +364,48 @@ def denoise( else: block_samples = None block_single_samples = None - pred = model( - img=img, - img_ids=img_ids, - txt=txt, - txt_ids=txt_ids, - y=vec, - block_controlnet_hidden_states=block_samples, - block_controlnet_single_hidden_states=block_single_samples, - timesteps=t_vec, - guidance=guidance_vec, - txt_attention_mask=t5_attn_mask, - ) - img = img + (t_prev - t_curr) * pred + if not do_cfg: + pred = model( + img=img, + img_ids=img_ids, + txt=txt, + txt_ids=txt_ids, + y=vec, + block_controlnet_hidden_states=block_samples, + block_controlnet_single_hidden_states=block_single_samples, + timesteps=t_vec, + guidance=guidance_vec, + txt_attention_mask=t5_attn_mask, + ) + + img = img + (t_prev - t_curr) * pred + else: + cfg_scale, neg_l_pooled, neg_t5_out, neg_t5_attn_mask = neg_cond + nc_c_t5_attn_mask = None if t5_attn_mask is None else torch.cat([neg_t5_attn_mask, t5_attn_mask], dim=0) + + # TODO is it ok to use the same block samples for both cond and uncond? 
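# Doubling the batch below lets the conditional and unconditional branches
# share a single forward pass: the ControlNet block outputs are simply
# repeated for the negative half (the reuse questioned in the TODO above),
# and the two halves of the prediction are then blended with the usual
# classifier-free guidance rule
#   pred = neg_pred + cfg_scale * (pred - neg_pred)
# so cfg_scale == 1.0 falls back to the unguided conditional prediction.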
+ block_samples = None if block_samples is None else torch.cat([block_samples, block_samples], dim=0) + block_single_samples = ( + None if block_single_samples is None else torch.cat([block_single_samples, block_single_samples], dim=0) + ) + + nc_c_pred = model( + img=torch.cat([img, img], dim=0), + img_ids=torch.cat([img_ids, img_ids], dim=0), + txt=torch.cat([neg_t5_out, txt], dim=0), + txt_ids=torch.cat([txt_ids, txt_ids], dim=0), + y=torch.cat([neg_l_pooled, vec], dim=0), + block_controlnet_hidden_states=block_samples, + block_controlnet_single_hidden_states=block_single_samples, + timesteps=t_vec, + guidance=guidance_vec, + txt_attention_mask=nc_c_t5_attn_mask, + ) + neg_pred, pred = torch.chunk(nc_c_pred, 2, dim=0) + pred = neg_pred + (pred - neg_pred) * cfg_scale + + img = img + (t_prev - t_curr) * pred model.prepare_block_swap_before_forward() return img @@ -567,7 +623,7 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): "--controlnet_model_name_or_path", type=str, default=None, - help="path to controlnet (*.sft or *.safetensors) / controlnetのパス(*.sftまたは*.safetensors)" + help="path to controlnet (*.sft or *.safetensors) / controlnetのパス(*.sftまたは*.safetensors)", ) parser.add_argument( "--t5xxl_max_token_length", From bbf6bbd5ea27231066cec98b8bf2a65f162cb18f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 6 Jan 2025 10:48:38 -0500 Subject: [PATCH 298/582] Use self.get_noise_pred_and_target and drop fixed timesteps --- flux_train_network.py | 7 ++- sd3_train_network.py | 3 +- train_network.py | 116 ++++++++++++------------------------------ 3 files changed, 40 insertions(+), 86 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index 75e975bae..b3aebecc7 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -339,6 +339,7 @@ def get_noise_pred_and_target( network, weight_dtype, train_unet, + is_train=True ): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) @@ -375,7 +376,7 @@ def get_noise_pred_and_target( def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask): # if not args.split_mode: # normal forward - with accelerator.autocast(): + with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) model_pred = unet( img=img, @@ -420,7 +421,9 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t intermediate_txt.requires_grad_(True) vec.requires_grad_(True) pe.requires_grad_(True) - model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask) + + with torch.set_grad_enabled(is_train and train_unet): + model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask) """ return model_pred diff --git a/sd3_train_network.py b/sd3_train_network.py index fb7711bda..c7417802d 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -312,6 +312,7 @@ def get_noise_pred_and_target( network, weight_dtype, train_unet, + is_train=True ): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) @@ -339,7 +340,7 @@ def get_noise_pred_and_target( t5_attn_mask = None # call model - with accelerator.autocast(): + with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): # TODO support attention mask model_pred = 
unet(noisy_model_input, timesteps, context=context, y=lg_pooled) diff --git a/train_network.py b/train_network.py index 377ddf48e..61e6369ae 100644 --- a/train_network.py +++ b/train_network.py @@ -223,6 +223,7 @@ def get_noise_pred_and_target( network, weight_dtype, train_unet, + is_train=True ): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified @@ -236,7 +237,7 @@ def get_noise_pred_and_target( t.requires_grad_(True) # Predict the noise residual - with accelerator.autocast(): + with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): noise_pred = self.call_unet( args, accelerator, @@ -317,7 +318,7 @@ def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, # endregion - def process_batch(self, batch, tokenizers, text_encoders, unet, network, vae: AutoencoderKL, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy: strategy_sd.SdTextEncodingStrategy, tokenize_strategy: strategy_sd.SdTokenizeStrategy, is_train=True, train_text_encoder=True, train_unet=True, timesteps_list: Optional[List[Number]]=None) -> torch.Tensor: + def process_batch(self, batch, tokenizers, text_encoders, unet, network, vae: AutoencoderKL, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy: strategy_sd.SdTextEncodingStrategy, tokenize_strategy: strategy_sd.SdTokenizeStrategy, is_train=True, train_text_encoder=True, train_unet=True) -> torch.Tensor: with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: @@ -372,91 +373,40 @@ def process_batch(self, batch, tokenizers, text_encoders, unet, network, vae: Au batch_size = latents.shape[0] - # Sample noise, - noise = train_util.make_noise(args, latents) - def pick_timesteps_list() -> torch.IntTensor: - if timesteps_list is None or timesteps_list == []: - return typing.cast(torch.IntTensor, train_util.make_random_timesteps(args, noise_scheduler, batch_size, latents.device).unsqueeze(1)) - else: - return typing.cast(torch.IntTensor, torch.tensor(timesteps_list).unsqueeze(1).repeat(1, batch_size).to(latents.device)) - - chosen_timesteps_list = pick_timesteps_list() - total_loss = torch.zeros((batch_size, 1)).to(latents.device) - - # Use input timesteps_list or use described timesteps above - for fixed_timesteps in chosen_timesteps_list: - fixed_timesteps = typing.cast(torch.IntTensor, fixed_timesteps) - - # Predict the noise residual - # and add noise to the latents - # with noise offset and/or multires noise if specified - noisy_latents = train_util.get_noisy_latents(args, noise, noise_scheduler, latents, fixed_timesteps) - - # ensure the hidden state will require grad - if args.gradient_checkpointing: - for x in noisy_latents: - x.requires_grad_(True) - for t in text_encoder_conds: - t.requires_grad_(True) - - with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): - noise_pred = self.call_unet( - args, - accelerator, - unet, - noisy_latents.requires_grad_(train_unet), - fixed_timesteps, - text_encoder_conds, - batch, - weight_dtype, - ) - - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, fixed_timesteps) - else: - target = noise - - # differential output preservation - if "custom_attributes" in batch: - diff_output_pr_indices = [] - for i, custom_attributes in enumerate(batch["custom_attributes"]): - if "diff_output_preservation" in custom_attributes and 
custom_attributes["diff_output_preservation"]: - diff_output_pr_indices.append(i) - - if len(diff_output_pr_indices) > 0: - network.set_multiplier(0.0) - with torch.no_grad(), accelerator.autocast(): - noise_pred_prior = self.call_unet( - args, - accelerator, - unet, - noisy_latents, - fixed_timesteps, - text_encoder_conds, - batch, - weight_dtype, - indices=diff_output_pr_indices, - ) - network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step - target[diff_output_pr_indices] = noise_pred_prior.to(target.dtype) - - huber_c = train_util.get_huber_threshold_if_needed(args, fixed_timesteps, noise_scheduler) - loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) - loss = loss.mean([1, 2, 3]) # 平均なのでbatch_sizeで割る必要なし + # Predict the noise residual + # and add noise to the latents + # with noise offset and/or multires noise if specified - if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): - loss = apply_masked_loss(loss, batch) + # sample noise, call unet, get target + noise_pred, target, timesteps, weighting = self.get_noise_pred_and_target( + args, + accelerator, + noise_scheduler, + latents, + batch, + text_encoder_conds, + unet, + network, + weight_dtype, + train_unet, + is_train=is_train + ) - loss_weights = batch["loss_weights"].to(accelerator.device) # 各sampleごとのweight - loss = loss * loss_weights + huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) + loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c) + if weighting is not None: + loss = loss * weighting + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) + loss = loss.mean([1, 2, 3]) - loss = self.post_process_loss(loss, args, fixed_timesteps, noise_scheduler) + loss_weights = batch["loss_weights"] # 各sampleごとのweight + loss = loss * loss_weights - total_loss += loss + loss = self.post_process_loss(loss, args, timesteps, noise_scheduler) - return total_loss / len(chosen_timesteps_list) + return loss.mean() def train(self, args): session_id = random.randint(0, 2**32) @@ -1416,7 +1366,7 @@ def remove_model(old_ckpt_name): if val_step >= validation_steps: break - loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False, timesteps_list=[10, 350, 500, 650, 990]) + loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False) val_loss_recorder.add(epoch=epoch, step=val_step, loss=loss.detach().item()) val_progress_bar.update(1) @@ -1447,7 +1397,7 @@ def remove_model(old_ckpt_name): if val_step >= validation_steps: break - loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False, timesteps_list=[10, 350, 500, 650, 990]) + loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False) current_loss = loss.detach().item() val_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) From 
f4840ef29ef67878d7c7ccec92bdce89c3b61c6d Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 6 Jan 2025 10:52:07 -0500 Subject: [PATCH 299/582] Revert train_db.py --- train_db.py | 121 ++-------------------------------------------------- 1 file changed, 3 insertions(+), 118 deletions(-) diff --git a/train_db.py b/train_db.py index 398489ffe..ad21f8d1b 100644 --- a/train_db.py +++ b/train_db.py @@ -2,6 +2,7 @@ # XXX dropped option: fine_tune import argparse +import itertools import math import os from multiprocessing import Value @@ -41,73 +42,11 @@ setup_logging() import logging -import itertools logger = logging.getLogger(__name__) # perlin_noise, -def process_val_batch(*training_models, batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args): - total_loss = 0.0 - timesteps_list = [10, 350, 500, 650, 990] - - with accelerator.accumulate(*training_models): - with torch.no_grad(): - # latentに変換 - if cache_latents: - latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) - else: - latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() - latents = latents * 0.18215 - b_size = latents.shape[0] - - with torch.set_grad_enabled(False), accelerator.autocast(): - if args.weighted_captions: - encoder_hidden_states = get_weighted_text_embeddings( - tokenizer, - text_encoder, - batch["captions"], - accelerator.device, - args.max_token_length // 75 if args.max_token_length else 1, - clip_skip=args.clip_skip, - ) - else: - input_ids = batch["input_ids"].to(accelerator.device) - encoder_hidden_states = train_util.get_hidden_states( - args, input_ids, tokenizer, text_encoder, None if not args.full_fp16 else weight_dtype - ) - # Sample noise, sample a random timestep for each image, and add noise to the latents, - # with noise offset and/or multires noise if specified - - for fixed_timesteps in timesteps_list: - with torch.set_grad_enabled(False), accelerator.autocast(): - # Sample noise, sample a random timestep for each image, and add noise to the latents, - # with noise offset and/or multires noise if specified - noise = torch.randn_like(latents, device=latents.device) - b_size = latents.shape[0] - timesteps = torch.full((b_size,), fixed_timesteps, dtype=torch.long, device=latents.device) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - # Predict the noise residual - with accelerator.autocast(): - noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - if args.v_parameterization: - # v-parameterization training - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - target = noise - - loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") - if args.masked_loss: - loss = apply_masked_loss(loss, batch) - loss = loss.mean([1, 2, 3]) - loss = apply_debiased_estimation(loss, timesteps, noise_scheduler) - loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし - total_loss += loss - - average_loss = total_loss / len(timesteps_list) - return average_loss def train(args): train_util.verify_training_args(args) @@ -150,10 +89,9 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: train_dataset_group = train_util.load_arbitrary_dataset(args) - val_dataset_group = None current_epoch = 
Value("i", 0) current_step = Value("i", 0) @@ -274,15 +212,6 @@ def train(args): num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) - val_dataloader = torch.utils.data.DataLoader( - val_dataset_group if val_dataset_group is not None else [], - shuffle=False, - batch_size=1, - collate_fn=collator, - num_workers=n_workers, - persistent_workers=args.persistent_data_loader_workers, - ) - cyclic_val_dataloader = itertools.cycle(val_dataloader) # 学習ステップ数を計算する if args.max_train_epochs is not None: @@ -393,8 +322,6 @@ def train(args): accelerator.log({}, step=0) loss_recorder = train_util.LossRecorder() - val_loss_recorder = train_util.LossRecorder() - for epoch in range(num_train_epochs): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 @@ -525,25 +452,6 @@ def train(args): avr_loss: float = loss_recorder.moving_average logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) - if len(val_dataloader) > 0: - if (args.validation_every_n_step is not None and global_step % args.validation_every_n_step == 0) or (args.validation_every_n_step is None and step == len(train_dataloader) - 1) or global_step >= args.max_train_steps: - accelerator.print("Validating バリデーション処理...") - total_loss = 0.0 - with torch.no_grad(): - validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - for val_step in tqdm(range(validation_steps), desc='Validation Steps'): - batch = next(cyclic_val_dataloader) - loss = self.process_val_batch(batch, tokenizers, text_encoders, unet, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args) - total_loss += loss.detach().item() - current_loss = total_loss / validation_steps - val_loss_recorder.add(epoch=0, step=global_step, loss=current_loss) - - if args.logging_dir is not None: - logs = {"loss/current_val_loss": current_loss} - accelerator.log(logs, step=global_step) - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/average_val_loss": avr_loss} - accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break @@ -634,30 +542,7 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) - parser.add_argument( - "--validation_seed", - type=int, - default=None, - help="Validation seed" - ) - parser.add_argument( - "--validation_split", - type=float, - default=0.0, - help="Split for validation images out of the training dataset" - ) - parser.add_argument( - "--validation_every_n_step", - type=int, - default=None, - help="Number of train steps for counting validation loss. By default, validation per train epoch is performed" - ) - parser.add_argument( - "--max_validation_steps", - type=int, - default=None, - help="Number of max validation steps for counting validation loss. 
By default, validation will run entire validation dataset" - ) + return parser From 1c63e7cc4979b528417b5bfe181e0a9ac119209c Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 6 Jan 2025 11:07:47 -0500 Subject: [PATCH 300/582] Cleanup unused code and formatting --- train_network.py | 85 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 15 deletions(-) diff --git a/train_network.py b/train_network.py index 61e6369ae..5a80d825d 100644 --- a/train_network.py +++ b/train_network.py @@ -318,8 +318,27 @@ def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, # endregion - def process_batch(self, batch, tokenizers, text_encoders, unet, network, vae: AutoencoderKL, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy: strategy_sd.SdTextEncodingStrategy, tokenize_strategy: strategy_sd.SdTokenizeStrategy, is_train=True, train_text_encoder=True, train_unet=True) -> torch.Tensor: - + def process_batch( + self, + batch, + text_encoders, + unet, + network, + vae, + noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy: strategy_sd.SdTextEncodingStrategy, + tokenize_strategy: strategy_sd.SdTokenizeStrategy, + is_train=True, + train_text_encoder=True, + train_unet=True + ) -> torch.Tensor: + """ + Process a batch for the network + """ with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: latents = typing.cast(torch.FloatTensor, batch["latents"].to(accelerator.device)) @@ -334,7 +353,6 @@ def process_batch(self, batch, tokenizers, text_encoders, unet, network, vae: Au latents = self.shift_scale_latents(args, latents) - text_encoder_conds = [] text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: @@ -371,13 +389,6 @@ def process_batch(self, batch, tokenizers, text_encoders, unet, network, vae: Au if encoded_text_encoder_conds[i] is not None: text_encoder_conds[i] = encoded_text_encoder_conds[i] - batch_size = latents.shape[0] - - - # Predict the noise residual - # and add noise to the latents - # with noise offset and/or multires noise if specified - # sample noise, call unet, get target noise_pred, target, timesteps, weighting = self.get_noise_pred_and_target( args, @@ -1288,7 +1299,23 @@ def remove_model(old_ckpt_name): # temporary, for batch processing self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) - loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=True, train_text_encoder=train_text_encoder, train_unet=train_unet) + loss = self.process_batch(batch, + text_encoders, + unet, + network, + vae, + noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy, + tokenize_strategy, + is_train=True, + train_text_encoder=train_text_encoder, + train_unet=train_unet + ) + accelerator.backward(loss) if accelerator.sync_gradients: self.all_reduce_network(accelerator, network) # sync DDP grad manually @@ -1366,12 +1393,26 @@ def remove_model(old_ckpt_name): if val_step >= validation_steps: break - loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False) - + loss = self.process_batch( + batch, + text_encoders, + unet, + network, + vae, + 
noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy, + tokenize_strategy, + is_train=False + ) + val_loss_recorder.add(epoch=epoch, step=val_step, loss=loss.detach().item()) val_progress_bar.update(1) val_progress_bar.set_postfix({ "val_avg_loss": val_loss_recorder.moving_average }) - + if is_tracking: logs = {"loss/current_val_loss": loss.detach().item()} # accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) @@ -1397,7 +1438,21 @@ def remove_model(old_ckpt_name): if val_step >= validation_steps: break - loss = self.process_batch(batch, tokenizers, text_encoders, unet, network, vae, noise_scheduler, vae_dtype, weight_dtype, accelerator, args, text_encoding_strategy, tokenize_strategy, is_train=False) + loss = self.process_batch( + batch, + text_encoders, + unet, + network, + vae, + noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy, + tokenize_strategy, + is_train=False + ) current_loss = loss.detach().item() val_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) From c64d1a22fc4ff25625873e50d63d480b297301c6 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 6 Jan 2025 11:30:21 -0500 Subject: [PATCH 301/582] Add validate_every_n_epochs, change name validate_every_n_steps --- train_network.py | 69 ++++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/train_network.py b/train_network.py index 5a80d825d..f3c8d8c96 100644 --- a/train_network.py +++ b/train_network.py @@ -1199,7 +1199,8 @@ def load_model_hook(models, input_dir): ) loss_recorder = train_util.LossRecorder() - val_loss_recorder = train_util.LossRecorder() + val_step_loss_recorder = train_util.LossRecorder() + val_epoch_loss_recorder = train_util.LossRecorder() del train_dataset_group if val_dataset_group is not None: @@ -1299,7 +1300,8 @@ def remove_model(old_ckpt_name): # temporary, for batch processing self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) - loss = self.process_batch(batch, + loss = self.process_batch( + batch, text_encoders, unet, network, @@ -1373,15 +1375,25 @@ def remove_model(old_ckpt_name): if is_tracking: logs = self.generate_step_logs( - args, current_loss, avr_loss, lr_scheduler, lr_descriptions, optimizer, keys_scaled, mean_norm, maximum_norm + args, + current_loss, + avr_loss, + lr_scheduler, + lr_descriptions, + optimizer, + keys_scaled, + mean_norm, + maximum_norm ) # accelerator.log(logs, step=global_step) accelerator.log(logs) # VALIDATION PER STEP - should_validate = (args.validation_every_n_step is not None - and global_step % args.validation_every_n_step == 0) - if validation_steps > 0 and should_validate: + should_validate_epoch = ( + args.validate_every_n_steps is not None + and global_step % args.validate_every_n_steps == 0 + ) + if validation_steps > 0 and should_validate_epoch: accelerator.print("Validating バリデーション処理...") val_progress_bar = tqdm( @@ -1409,16 +1421,17 @@ def remove_model(old_ckpt_name): is_train=False ) - val_loss_recorder.add(epoch=epoch, step=val_step, loss=loss.detach().item()) + current_loss = loss.detach().item() + val_step_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) val_progress_bar.update(1) - val_progress_bar.set_postfix({ "val_avg_loss": val_loss_recorder.moving_average }) + val_progress_bar.set_postfix({ "val_avg_loss": val_step_loss_recorder.moving_average }) if is_tracking: - logs = {"loss/current_val_loss": loss.detach().item()} + 
logs = {"loss/step_validation_current": current_loss} # accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) accelerator.log(logs) - logs = {"loss/average_val_loss": val_loss_recorder.moving_average} + logs = {"loss/step_validation_average": val_step_loss_recorder.moving_average} # accelerator.log(logs, step=global_step) accelerator.log(logs) @@ -1426,12 +1439,18 @@ def remove_model(old_ckpt_name): break # VALIDATION EPOCH - if len(val_dataloader) > 0: + should_validate_epoch = ( + (epoch + 1) % args.validate_every_n_epochs == 0 + if args.validate_every_n_epochs is not None + else False + ) + + if should_validate_epoch and len(val_dataloader) > 0: accelerator.print("Validating バリデーション処理...") val_progress_bar = tqdm( range(validation_steps), smoothing=0, disable=not accelerator.is_local_main_process, - desc="validation steps" + desc="epoch validation steps" ) for val_step, batch in enumerate(val_dataloader): @@ -1455,18 +1474,18 @@ def remove_model(old_ckpt_name): ) current_loss = loss.detach().item() - val_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) + val_epoch_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) val_progress_bar.update(1) - val_progress_bar.set_postfix({ "val_avg_loss": val_loss_recorder.moving_average }) + val_progress_bar.set_postfix({ "val_epoch_avg_loss": val_epoch_loss_recorder.moving_average }) if is_tracking: - logs = {"loss/validation_current": current_loss} + logs = {"loss/epoch_validation_current": current_loss} # accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) accelerator.log(logs) if is_tracking: - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/validation_average": avr_loss} + avr_loss: float = val_epoch_loss_recorder.moving_average + logs = {"loss/epoch_validation_average": avr_loss} # accelerator.log(logs, step=epoch + 1) accelerator.log(logs) @@ -1475,12 +1494,6 @@ def remove_model(old_ckpt_name): logs = {"loss/epoch_average": loss_recorder.moving_average} # accelerator.log(logs, step=epoch + 1) accelerator.log(logs) - - if len(val_dataloader) > 0 and is_tracking: - avr_loss: float = val_loss_recorder.moving_average - logs = {"loss/validation_epoch_average": avr_loss} - # accelerator.log(logs, step=epoch + 1) - accelerator.log(logs) accelerator.wait_for_everyone() @@ -1676,10 +1689,16 @@ def setup_parser() -> argparse.ArgumentParser: help="Split for validation images out of the training dataset / 学習画像から検証画像に分割する割合" ) parser.add_argument( - "--validation_every_n_step", + "--validate_every_n_steps", + type=int, + default=None, + help="Run validation dataset every N steps" + ) + parser.add_argument( + "--validate_every_n_epochs", type=int, default=None, - help="Number of train steps for counting validation loss. By default, validation per train epoch is performed / 学習エポックごとに検証を行う場合はNoneを指定する" + help="Run validation dataset every N epochs. 
By default, validation will run every epoch if a validation dataset is available" ) parser.add_argument( "--max_validation_steps", From f8850296c83ef2091bf1cb0f6e9ba462adfd9045 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 6 Jan 2025 11:34:10 -0500 Subject: [PATCH 302/582] Fix validate epoch, cleanup imports --- train_network.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/train_network.py b/train_network.py index f3c8d8c96..11bba71e8 100644 --- a/train_network.py +++ b/train_network.py @@ -3,15 +3,13 @@ import math import os import typing -from typing import List, Optional, Union +from typing import Any, List import sys import random import time import json from multiprocessing import Value -from typing import Any, List import toml -import itertools from tqdm import tqdm @@ -23,8 +21,8 @@ from accelerate.utils import set_seed from accelerate import Accelerator -from diffusers import DDPMScheduler, AutoencoderKL -from diffusers.models.modeling_outputs import AutoencoderKLOutput +from diffusers import DDPMScheduler +from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL from library import deepspeed_utils, model_util, strategy_base, strategy_sd import library.train_util as train_util @@ -49,7 +47,6 @@ setup_logging() import logging -import itertools logger = logging.getLogger(__name__) @@ -1442,7 +1439,7 @@ def remove_model(old_ckpt_name): should_validate_epoch = ( (epoch + 1) % args.validate_every_n_epochs == 0 if args.validate_every_n_epochs is not None - else False + else True ) if should_validate_epoch and len(val_dataloader) > 0: From fcb2ff010cf2e42c50b3745a17317f2d4b4319d9 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 6 Jan 2025 11:39:32 -0500 Subject: [PATCH 303/582] Clean up some validation help documentation --- train_network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train_network.py b/train_network.py index 11bba71e8..af180c455 100644 --- a/train_network.py +++ b/train_network.py @@ -1677,7 +1677,7 @@ def setup_parser() -> argparse.ArgumentParser: "--validation_seed", type=int, default=None, - help="Validation seed for shuffling validation dataset, training `--seed` used otherwise / 検証シード" + help="Validation seed for shuffling validation dataset, training `--seed` used otherwise / 検証データセットをシャッフルするための検証シード、それ以外の場合はトレーニング `--seed` を使用する" ) parser.add_argument( "--validation_split", @@ -1689,19 +1689,19 @@ def setup_parser() -> argparse.ArgumentParser: "--validate_every_n_steps", type=int, default=None, - help="Run validation dataset every N steps" + help="Run validation on validation dataset every N steps if a validation dataset is available / 検証データセットが利用可能な場合は、Nステップごとに検証データセットの検証を実行します" ) parser.add_argument( "--validate_every_n_epochs", type=int, default=None, - help="Run validation dataset every N epochs. By default, validation will run every epoch if a validation dataset is available" + help="Run validation dataset every N epochs. By default, validation will run every epoch if a validation dataset is available / 検証データセットをNエポックごとに実行します。デフォルトでは、検証データセットが利用可能な場合、検証はエポックごとに実行されます" ) parser.add_argument( "--max_validation_steps", type=int, default=None, - help="Number of max validation steps for counting validation loss. By default, validation will run entire validation dataset / 検証データセット全体を検証する場合はNoneを指定する" + help="Max number of validation dataset items processed. 
By default, validation will run the entire validation dataset / 処理される検証データセット項目の最大数。デフォルトでは、検証は検証データセット全体を実行します" ) return parser From 742bee9738e9d190a39f5a36adf4515fa415e9b7 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 6 Jan 2025 17:34:23 -0500 Subject: [PATCH 304/582] Set validation steps in multiple lines for readability --- train_network.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/train_network.py b/train_network.py index af180c455..d0596fcae 100644 --- a/train_network.py +++ b/train_network.py @@ -1251,7 +1251,11 @@ def remove_model(old_ckpt_name): # log empty object to commit the sample images to wandb accelerator.log({}, step=0) - validation_steps = min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + validation_steps = ( + min(args.max_validation_steps, len(val_dataloader)) + if args.max_validation_steps is not None + else len(val_dataloader) + ) # training loop if initial_step > 0: # only if skip_until_initial_step is specified @@ -1689,7 +1693,7 @@ def setup_parser() -> argparse.ArgumentParser: "--validate_every_n_steps", type=int, default=None, - help="Run validation on validation dataset every N steps if a validation dataset is available / 検証データセットが利用可能な場合は、Nステップごとに検証データセットの検証を実行します" + help="Run validation on validation dataset every N steps. By default, validation will only occur every epoch if a validation dataset is available / 検証データセットの検証をNステップごとに実行します。デフォルトでは、検証データセットが利用可能な場合にのみ、検証はエポックごとに実行されます" ) parser.add_argument( "--validate_every_n_epochs", From 1231f5114ccd6a0a26a53da82b89083299ccc333 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 7 Jan 2025 22:31:41 -0500 Subject: [PATCH 305/582] Remove unused train_util code, fix accelerate.log for wandb, add init_trackers library code --- library/train_util.py | 70 ++++++++++++++++--------------------------- train_network.py | 66 ++++++++++++++++++++-------------------- 2 files changed, 59 insertions(+), 77 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 0907a8c03..b8894752e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5900,51 +5900,9 @@ def save_sd_model_on_train_end_common( huggingface_util.upload(args, out_dir, "/" + model_name, force_sync_upload=True) -def get_timesteps(min_timestep: int, max_timestep: int, b_size: int, device: torch.device = torch.device("cpu")) -> torch.IntTensor: +def get_timesteps(min_timestep: int, max_timestep: int, b_size: int, device: torch.device = torch.device("cpu")) -> torch.Tensor: timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device=device) - return timesteps - - - - -def modify_noise(args, noise: torch.Tensor, latents: torch.Tensor) -> torch.FloatTensor: - """ - Apply noise modifications like noise offset and multires noise - """ - if args.noise_offset: - if args.noise_offset_random_strength: - noise_offset = torch.rand(1, device=latents.device) * args.noise_offset - else: - noise_offset = args.noise_offset - noise = custom_train_functions.apply_noise_offset(latents, noise, noise_offset, args.adaptive_noise_scale) - if args.multires_noise_iterations: - noise = custom_train_functions.pyramid_noise_like( - noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount - ) - return noise - - -def make_noise(args, latents: torch.Tensor) -> torch.FloatTensor: - """ - Make a noise tensor to denoise and apply noise modifications (noise offset, multires noise). 
See `modify_noise` - """ - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents, device=latents.device) - noise = modify_noise(args, noise, latents) - - return typing.cast(torch.FloatTensor, noise) - - -def make_random_timesteps(args, noise_scheduler: DDPMScheduler, batch_size: int, device: torch.device) -> torch.IntTensor: - """ - From args, produce random timesteps for each image in the batch - """ - min_timestep = 0 if args.min_timestep is None else args.min_timestep - max_timestep = noise_scheduler.config.get('num_train_timesteps', 1000) if args.max_timestep is None else args.max_timestep - - # Sample a random timestep for each image - timesteps = get_timesteps(min_timestep, max_timestep, batch_size, device) - + timesteps = timesteps.long().to(device) return timesteps @@ -6457,6 +6415,30 @@ def sample_image_inference( wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption +def init_trackers(accelerator: Accelerator, args: argparse.Namespace, default_tracker_name: str): + """ + Initialize experiment trackers with tracker specific behaviors + """ + if accelerator.is_main_process: + init_kwargs = {} + if args.wandb_run_name: + init_kwargs["wandb"] = {"name": args.wandb_run_name} + if args.log_tracker_config is not None: + init_kwargs = toml.load(args.log_tracker_config) + accelerator.init_trackers( + default_tracker_name if args.log_tracker_name is None else args.log_tracker_name, + config=get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, + ) + + if "wandb" in [tracker.name for tracker in accelerator.trackers]: + import wandb + wandb_tracker = accelerator.get_tracker("wandb", unwrap=True) + + # Define specific metrics to handle validation and epochs "steps" + wandb_tracker.define_metric("epoch", hidden=True) + wandb_tracker.define_metric("val_step", hidden=True) + # endregion diff --git a/train_network.py b/train_network.py index d0596fcae..199f589b0 100644 --- a/train_network.py +++ b/train_network.py @@ -327,8 +327,8 @@ def process_batch( weight_dtype, accelerator, args, - text_encoding_strategy: strategy_sd.SdTextEncodingStrategy, - tokenize_strategy: strategy_sd.SdTokenizeStrategy, + text_encoding_strategy: strategy_base.TextEncodingStrategy, + tokenize_strategy: strategy_base.TokenizeStrategy, is_train=True, train_text_encoder=True, train_unet=True @@ -1183,17 +1183,7 @@ def load_model_hook(models, input_dir): noise_scheduler = self.get_noise_scheduler(args, accelerator.device) - if accelerator.is_main_process: - init_kwargs = {} - if args.wandb_run_name: - init_kwargs["wandb"] = {"name": args.wandb_run_name} - if args.log_tracker_config is not None: - init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers( - "network_train" if args.log_tracker_name is None else args.log_tracker_name, - config=train_util.get_sanitized_config_or_none(args), - init_kwargs=init_kwargs, - ) + train_util.init_trackers(accelerator, args, "network_train") loss_recorder = train_util.LossRecorder() val_step_loss_recorder = train_util.LossRecorder() @@ -1386,15 +1376,14 @@ def remove_model(old_ckpt_name): mean_norm, maximum_norm ) - # accelerator.log(logs, step=global_step) - accelerator.log(logs) + accelerator.log(logs, step=global_step) # VALIDATION PER STEP - should_validate_epoch = ( + should_validate_step = ( args.validate_every_n_steps is not None and global_step % args.validate_every_n_steps == 0 ) - if validation_steps > 0 and should_validate_epoch: + if validation_steps > 0 and 
should_validate_step: accelerator.print("Validating バリデーション処理...") val_progress_bar = tqdm( @@ -1406,6 +1395,9 @@ def remove_model(old_ckpt_name): if val_step >= validation_steps: break + # temporary, for batch processing + self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) + loss = self.process_batch( batch, text_encoders, @@ -1428,18 +1420,22 @@ def remove_model(old_ckpt_name): val_progress_bar.set_postfix({ "val_avg_loss": val_step_loss_recorder.moving_average }) if is_tracking: - logs = {"loss/step_validation_current": current_loss} - # accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) - accelerator.log(logs) + logs = { + "loss/validation/step/current": current_loss, + "val_step": (epoch * validation_steps) + val_step, + } + accelerator.log(logs, step=global_step) - logs = {"loss/step_validation_average": val_step_loss_recorder.moving_average} - # accelerator.log(logs, step=global_step) - accelerator.log(logs) + if is_tracking: + logs = { + "loss/validation/step/average": val_step_loss_recorder.moving_average, + } + accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break - # VALIDATION EPOCH + # EPOCH VALIDATION should_validate_epoch = ( (epoch + 1) % args.validate_every_n_epochs == 0 if args.validate_every_n_epochs is not None @@ -1458,6 +1454,9 @@ def remove_model(old_ckpt_name): if val_step >= validation_steps: break + # temporary, for batch processing + self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) + loss = self.process_batch( batch, text_encoders, @@ -1480,21 +1479,22 @@ def remove_model(old_ckpt_name): val_progress_bar.set_postfix({ "val_epoch_avg_loss": val_epoch_loss_recorder.moving_average }) if is_tracking: - logs = {"loss/epoch_validation_current": current_loss} - # accelerator.log(logs, step=(len(val_dataloader) * epoch) + 1 + val_step) - accelerator.log(logs) + logs = { + "loss/validation/epoch_current": current_loss, + "epoch": epoch + 1, + "val_step": (epoch * validation_steps) + val_step + } + accelerator.log(logs, step=global_step) if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average - logs = {"loss/epoch_validation_average": avr_loss} - # accelerator.log(logs, step=epoch + 1) - accelerator.log(logs) + logs = {"loss/validation/epoch_average": avr_loss, "epoch": epoch + 1} + accelerator.log(logs, step=global_step) # END OF EPOCH if is_tracking: - logs = {"loss/epoch_average": loss_recorder.moving_average} - # accelerator.log(logs, step=epoch + 1) - accelerator.log(logs) + logs = {"loss/epoch_average": loss_recorder.moving_average, "epoch": epoch + 1} + accelerator.log(logs, step=global_step) accelerator.wait_for_everyone() From 556f3f1696eadcc16ee77425243b732a84c7a2aa Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 8 Jan 2025 13:41:15 -0500 Subject: [PATCH 306/582] Fix documentation, remove unused function, fix bucket reso for sd1.5, fix multiple datasets --- library/config_util.py | 6 +++--- library/train_util.py | 25 ++++--------------------- train_network.py | 5 +---- 3 files changed, 8 insertions(+), 28 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 63d28c969..de1e154a1 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -481,9 +481,9 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu subset_klass = FineTuningSubset dataset_klass = FineTuningDataset - subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint 
in dataset_blueprint.subsets] - dataset = dataset_klass(subsets=subsets, is_training_dataset=True, **asdict(dataset_blueprint.params)) - datasets.append(dataset) + subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] + dataset = dataset_klass(subsets=subsets, is_training_dataset=True, **asdict(dataset_blueprint.params)) + datasets.append(dataset) val_datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] for dataset_blueprint in dataset_group_blueprint.datasets: diff --git a/library/train_util.py b/library/train_util.py index b8894752e..62aae37ef 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -152,11 +152,11 @@ def split_train_val(paths: List[str], is_training_dataset: bool, validation_spli Shuffle the dataset based on the validation_seed or the current random seed. For example if the split of 0.2 of 100 images. - [0:79] = 80 training images + [0:80] = 80 training images [80:] = 20 validation images """ if validation_seed is not None: - print(f"Using validation seed: {validation_seed}") + logging.info(f"Using validation seed: {validation_seed}") prevstate = random.getstate() random.seed(validation_seed) random.shuffle(paths) @@ -5900,8 +5900,8 @@ def save_sd_model_on_train_end_common( huggingface_util.upload(args, out_dir, "/" + model_name, force_sync_upload=True) -def get_timesteps(min_timestep: int, max_timestep: int, b_size: int, device: torch.device = torch.device("cpu")) -> torch.Tensor: - timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device=device) +def get_timesteps(min_timestep: int, max_timestep: int, b_size: int, device: torch.device) -> torch.Tensor: + timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device="cpu") timesteps = timesteps.long().to(device) return timesteps @@ -5964,23 +5964,6 @@ def get_huber_threshold_if_needed(args, timesteps: torch.Tensor, noise_scheduler return result -def get_noisy_latents(args, noise: torch.FloatTensor, noise_scheduler: DDPMScheduler, latents: torch.FloatTensor, timesteps: torch.IntTensor) -> torch.FloatTensor: - """ - Add noise to the latents according to the noise magnitude at each timestep - (this is the forward diffusion process) - """ - if args.ip_noise_gamma: - if args.ip_noise_gamma_random_strength: - strength = torch.rand(1, device=latents.device) * args.ip_noise_gamma - else: - strength = args.ip_noise_gamma - noisy_latents = noise_scheduler.add_noise(latents, noise + strength * torch.randn_like(latents), timesteps) - else: - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - return noisy_latents - - def conditional_loss( model_pred: torch.Tensor, target: torch.Tensor, loss_type: str, reduction: str, huber_c: Optional[torch.Tensor] = None ): diff --git a/train_network.py b/train_network.py index 199f589b0..7dbd12e88 100644 --- a/train_network.py +++ b/train_network.py @@ -125,10 +125,7 @@ def generate_step_logs( return logs def assert_extra_args(self, args, train_dataset_group): - # train_dataset_group.verify_bucket_reso_steps(64) - # TODO: Number of bucket reso steps may differ for each model, so a static number won't work - # and prevents models like SD1.5 with 64 - pass + train_dataset_group.verify_bucket_reso_steps(32) def load_target_model(self, args, weight_dtype, accelerator): text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) From 9fde0d797282c0cb9fcea01682e2e6e2eece47bc Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 8 Jan 
2025 18:38:20 -0500 Subject: [PATCH 307/582] Handle tuple return from generate_dataset_group_by_blueprint --- fine_tune.py | 4 ++-- flux_train.py | 3 ++- flux_train_control_net.py | 4 ++-- library/config_util.py | 2 +- sd3_train.py | 3 ++- sdxl_train.py | 3 ++- sdxl_train_control_net.py | 2 +- sdxl_train_control_net_lllite.py | 2 +- sdxl_train_control_net_lllite_old.py | 2 +- tools/cache_latents.py | 3 ++- tools/cache_text_encoder_outputs.py | 3 ++- train_control_net.py | 2 +- train_db.py | 3 ++- train_textual_inversion.py | 3 ++- train_textual_inversion_XTI.py | 2 +- 15 files changed, 24 insertions(+), 17 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 176087065..6be2f98ca 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -91,9 +91,9 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group = train_util.load_arbitrary_dataset(args) + train_dataset_group, val_dataset_group = train_util.load_arbitrary_dataset(args) current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/flux_train.py b/flux_train.py index fced3bef9..6f98adea8 100644 --- a/flux_train.py +++ b/flux_train.py @@ -138,9 +138,10 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index 9d36a41d3..54dec2a77 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -126,9 +126,9 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group = train_util.load_arbitrary_dataset(args) + train_dataset_group, val_dataset_group = train_util.load_arbitrary_dataset(args) current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/library/config_util.py b/library/config_util.py index de1e154a1..834d6bfaf 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -467,7 +467,7 @@ def search_value(key: str, fallbacks: Sequence[dict], default_value=None): return default_value -def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint): +def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint) -> Tuple[DatasetGroup, Optional[DatasetGroup]]: datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] for dataset_blueprint in dataset_group_blueprint.datasets: diff --git a/sd3_train.py b/sd3_train.py index 120455e7b..3bff6a50f 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -149,9 +149,10 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = 
config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/sdxl_train.py b/sdxl_train.py index b9d529243..a60f6df63 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -176,9 +176,10 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index ffbf03cab..c6e8136f7 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -114,7 +114,7 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 365059b75..00e51a673 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -123,7 +123,7 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py index 5b372befc..63457cc61 100644 --- a/sdxl_train_control_net_lllite_old.py +++ b/sdxl_train_control_net_lllite_old.py @@ -103,7 +103,7 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args, tokenizer=[tokenizer1, tokenizer2]) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/tools/cache_latents.py b/tools/cache_latents.py index c034f949a..515ece98d 100644 --- a/tools/cache_latents.py +++ b/tools/cache_latents.py @@ -116,10 +116,11 @@ def cache_to_disk(args: argparse.Namespace) -> None: } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: # use arbitrary dataset class train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None # acceleratorを準備する logger.info("prepare accelerator") diff --git a/tools/cache_text_encoder_outputs.py b/tools/cache_text_encoder_outputs.py index 5888b8e3d..00459658e 100644 --- a/tools/cache_text_encoder_outputs.py +++ b/tools/cache_text_encoder_outputs.py @@ -103,10 +103,11 @@ def cache_to_disk(args: argparse.Namespace) -> None: } blueprint = blueprint_generator.generate(user_config, args) - 
train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: # use arbitrary dataset class train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None # acceleratorを準備する logger.info("prepare accelerator") diff --git a/train_control_net.py b/train_control_net.py index 177d2b11f..ba016ac5d 100644 --- a/train_control_net.py +++ b/train_control_net.py @@ -100,7 +100,7 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/train_db.py b/train_db.py index ad21f8d1b..edd674034 100644 --- a/train_db.py +++ b/train_db.py @@ -89,9 +89,10 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 65da4859b..113f35997 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -320,9 +320,10 @@ def train(self, args): } blueprint = blueprint_generator.generate(user_config, args) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None self.assert_extra_args(args, train_dataset_group) diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 2a2b42310..6ff97d03f 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -239,7 +239,7 @@ def train(args): } blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer) - train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) train_dataset_group.enable_XTI(XTI_layers, token_strings=token_strings) current_epoch = Value("i", 0) current_step = Value("i", 0) From 1e61392cf2f601e1c66aaede6846ef70f599c34f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 8 Jan 2025 18:43:26 -0500 Subject: [PATCH 308/582] Revert bucket_reso_steps to correct 64 --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 7dbd12e88..7e9f12659 100644 --- a/train_network.py +++ b/train_network.py @@ -125,7 +125,7 @@ def generate_step_logs( return logs def assert_extra_args(self, args, train_dataset_group): - train_dataset_group.verify_bucket_reso_steps(32) + train_dataset_group.verify_bucket_reso_steps(64) def load_target_model(self, args, weight_dtype, accelerator): text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) From 
d6f158ddf6a3631df7db10ac97453b12de8eadbe Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 8 Jan 2025 18:48:05 -0500 Subject: [PATCH 309/582] Fix incorrect destructuring for load_arbitrary_dataset --- fine_tune.py | 3 ++- flux_train_control_net.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 6be2f98ca..e1ed47496 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -93,7 +93,8 @@ def train(args): blueprint = blueprint_generator.generate(user_config, args) train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group, val_dataset_group = train_util.load_arbitrary_dataset(args) + train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None current_epoch = Value("i", 0) current_step = Value("i", 0) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index 54dec2a77..cecd00019 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -128,7 +128,8 @@ def train(args): blueprint = blueprint_generator.generate(user_config, args) train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) else: - train_dataset_group, val_dataset_group = train_util.load_arbitrary_dataset(args) + train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None current_epoch = Value("i", 0) current_step = Value("i", 0) From 264167fa1636c79f106c63c3cdb67b6bee80aceb Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 9 Jan 2025 12:43:58 -0500 Subject: [PATCH 310/582] Apply is_training_dataset only to DreamBoothDataset. Add validation_split check and warning --- library/config_util.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 834d6bfaf..a2e07dc6c 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -471,36 +471,49 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] for dataset_blueprint in dataset_group_blueprint.datasets: + extra_dataset_params = {} + if dataset_blueprint.is_controlnet: subset_klass = ControlNetSubset dataset_klass = ControlNetDataset elif dataset_blueprint.is_dreambooth: subset_klass = DreamBoothSubset dataset_klass = DreamBoothDataset + # DreamBooth datasets support splitting training and validation datasets + extra_dataset_params = {"is_training_dataset": True} else: subset_klass = FineTuningSubset dataset_klass = FineTuningDataset subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] - dataset = dataset_klass(subsets=subsets, is_training_dataset=True, **asdict(dataset_blueprint.params)) + dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params), **extra_dataset_params) datasets.append(dataset) val_datasets: List[Union[DreamBoothDataset, FineTuningDataset, ControlNetDataset]] = [] for dataset_blueprint in dataset_group_blueprint.datasets: - if dataset_blueprint.params.validation_split <= 0.0: + if dataset_blueprint.params.validation_split < 0.0 or dataset_blueprint.params.validation_split > 1.0: + logging.warning(f"Dataset param `validation_split` ({dataset_blueprint.params.validation_split}) is not a valid number between 0.0 and 1.0, skipping validation split...") + continue + + # if the dataset isn't setting a validation split, 
there is no current validation dataset + if dataset_blueprint.params.validation_split == 0.0: continue + + extra_dataset_params = {} if dataset_blueprint.is_controlnet: subset_klass = ControlNetSubset dataset_klass = ControlNetDataset elif dataset_blueprint.is_dreambooth: subset_klass = DreamBoothSubset dataset_klass = DreamBoothDataset + # DreamBooth datasets support splitting training and validation datasets + extra_dataset_params = {"is_training_dataset": False} else: subset_klass = FineTuningSubset dataset_klass = FineTuningDataset subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets] - dataset = dataset_klass(subsets=subsets, is_training_dataset=False, **asdict(dataset_blueprint.params)) + dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params), **extra_dataset_params) val_datasets.append(dataset) def print_info(_datasets, dataset_type: str): From 4c61adc9965df6861ae3705c96143f4299074744 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 12 Jan 2025 13:18:26 -0500 Subject: [PATCH 311/582] Add divergence to logs Divergence is the difference between training and validation to allow a clear value to indicate the difference between the two in the logs. --- train_network.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/train_network.py b/train_network.py index 7e9f12659..5ed92b7e2 100644 --- a/train_network.py +++ b/train_network.py @@ -1418,14 +1418,16 @@ def remove_model(old_ckpt_name): if is_tracking: logs = { - "loss/validation/step/current": current_loss, + "loss/validation/step_current": current_loss, "val_step": (epoch * validation_steps) + val_step, } accelerator.log(logs, step=global_step) if is_tracking: + loss_validation_divergence = val_step_loss_recorder.moving_average - loss_recorder.moving_average logs = { - "loss/validation/step/average": val_step_loss_recorder.moving_average, + "loss/validation/step_average": val_step_loss_recorder.moving_average, + "loss/validation/step_divergence": loss_validation_divergence, } accelerator.log(logs, step=global_step) @@ -1485,7 +1487,12 @@ def remove_model(old_ckpt_name): if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average - logs = {"loss/validation/epoch_average": avr_loss, "epoch": epoch + 1} + loss_validation_divergence = val_step_loss_recorder.moving_average - avr_loss + logs = { + "loss/validation/epoch_average": avr_loss, + "loss/validation/epoch_divergence": loss_validation_divergence, + "epoch": epoch + 1 + } accelerator.log(logs, step=global_step) # END OF EPOCH From 2bbb40ce51d5be3ce8c3e1990d30455201f9e852 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 12 Jan 2025 14:29:50 -0500 Subject: [PATCH 312/582] Fix regularization images with validation Adding metadata recording for validation arguments Add comments about the validation split for clarity of intention --- library/train_util.py | 33 +++++++++++++++++++++++++++++++-- train_network.py | 7 +++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 62aae37ef..6d3a772bb 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -146,7 +146,12 @@ TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX = "_te_outputs.npz" TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX_SD3 = "_sd3_te.npz" -def split_train_val(paths: List[str], is_training_dataset: bool, validation_split: float, validation_seed: int) -> List[str]: +def split_train_val( + paths: List[str], + is_training_dataset: bool, + validation_split: float, 
+ validation_seed: int | None +) -> List[str]: """ Split the dataset into train and validation @@ -1830,6 +1835,9 @@ def get_item_for_caching(self, bucket, bucket_batch_size, image_index): class DreamBoothDataset(BaseDataset): IMAGE_INFO_CACHE_FILE = "metadata_cache.json" + # The is_training_dataset defines the type of dataset, training or validation + # if is_training_dataset is True -> training dataset + # if is_training_dataset is False -> validation dataset def __init__( self, subsets: Sequence[DreamBoothSubset], @@ -1965,8 +1973,29 @@ def load_dreambooth_dir(subset: DreamBoothSubset): size_set_count += 1 logger.info(f"set image size from cache files: {size_set_count}/{len(img_paths)}") + # We want to create a training and validation split. This should be improved in the future + # to allow a clearer distinction between training and validation. This can be seen as a + # short-term solution to limit what is necessary to implement validation datasets + # + # We split the dataset for the subset based on if we are doing a validation split + # The self.is_training_dataset defines the type of dataset, training or validation + # if self.is_training_dataset is True -> training dataset + # if self.is_training_dataset is False -> validation dataset if self.validation_split > 0.0: - img_paths = split_train_val(img_paths, self.is_training_dataset, self.validation_split, self.validation_seed) + # For regularization images we do not want to split this dataset. + if subset.is_reg is True: + # Skip any validation dataset for regularization images + if self.is_training_dataset is False: + img_paths = [] + # Otherwise the img_paths remain as original img_paths and no split + # required for training images dataset of regularization images + else: + img_paths = split_train_val( + img_paths, + self.is_training_dataset, + self.validation_split, + self.validation_seed + ) logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") diff --git a/train_network.py b/train_network.py index 5ed92b7e2..605dbc60c 100644 --- a/train_network.py +++ b/train_network.py @@ -898,6 +898,7 @@ def load_model_hook(models, input_dir): accelerator.print("running training / 学習開始") accelerator.print(f" num train images * repeats / 学習画像の数×繰り返し回数: {train_dataset_group.num_train_images}") + accelerator.print(f" num validation images * repeats / 学習画像の数×繰り返し回数: {val_dataset_group.num_train_images if val_dataset_group is not None else 0}") accelerator.print(f" num reg images / 正則化画像の数: {train_dataset_group.num_reg_images}") accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}") accelerator.print(f" num epochs / epoch数: {num_train_epochs}") @@ -917,6 +918,7 @@ def load_model_hook(models, input_dir): "ss_text_encoder_lr": text_encoder_lr, "ss_unet_lr": args.unet_lr, "ss_num_train_images": train_dataset_group.num_train_images, + "ss_num_validation_images": val_dataset_group.num_train_images if val_dataset_group is not None else 0, "ss_num_reg_images": train_dataset_group.num_reg_images, "ss_num_batches_per_epoch": len(train_dataloader), "ss_num_epochs": num_train_epochs, @@ -964,6 +966,11 @@ def load_model_hook(models, input_dir): "ss_huber_c": args.huber_c, "ss_fp8_base": bool(args.fp8_base), "ss_fp8_base_unet": bool(args.fp8_base_unet), + "ss_validation_seed": args.validation_seed, + "ss_validation_split": args.validation_split, + "ss_max_validation_steps": args.max_validation_steps, + "ss_validate_every_n_epochs": args.validate_every_n_epochs, + "ss_validate_every_n_steps": 
args.validate_every_n_steps, } self.update_metadata(metadata, args) # architecture specific metadata From 0456858992909ca0b821ec1b2ca40fa633113224 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 12 Jan 2025 14:47:49 -0500 Subject: [PATCH 313/582] Fix validate_every_n_steps always running first step --- train_network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train_network.py b/train_network.py index 605dbc60c..75e36dca9 100644 --- a/train_network.py +++ b/train_network.py @@ -1385,6 +1385,7 @@ def remove_model(old_ckpt_name): # VALIDATION PER STEP should_validate_step = ( args.validate_every_n_steps is not None + and global_step != 0 # Skip first step and global_step % args.validate_every_n_steps == 0 ) if validation_steps > 0 and should_validate_step: From ee9265cf2678df5c9dfa6c1148d20fb738a9e6ce Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 12 Jan 2025 14:56:35 -0500 Subject: [PATCH 314/582] Fix validate_every_n_steps for gradient accumulation --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 75e36dca9..2f3203c94 100644 --- a/train_network.py +++ b/train_network.py @@ -1388,7 +1388,7 @@ def remove_model(old_ckpt_name): and global_step != 0 # Skip first step and global_step % args.validate_every_n_steps == 0 ) - if validation_steps > 0 and should_validate_step: + if accelerator.sync_gradients and validation_steps > 0 and should_validate_step: accelerator.print("Validating バリデーション処理...") val_progress_bar = tqdm( From 25929dd0d733144859008479c374968102e5d3a3 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 12 Jan 2025 15:38:57 -0500 Subject: [PATCH 315/582] Remove Validating... print to fix output layout --- train_network.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/train_network.py b/train_network.py index 2f3203c94..e7d93a108 100644 --- a/train_network.py +++ b/train_network.py @@ -1389,8 +1389,6 @@ def remove_model(old_ckpt_name): and global_step % args.validate_every_n_steps == 0 ) if accelerator.sync_gradients and validation_steps > 0 and should_validate_step: - accelerator.print("Validating バリデーション処理...") - val_progress_bar = tqdm( range(validation_steps), smoothing=0, disable=not accelerator.is_local_main_process, @@ -1450,7 +1448,6 @@ def remove_model(old_ckpt_name): ) if should_validate_epoch and len(val_dataloader) > 0: - accelerator.print("Validating バリデーション処理...") val_progress_bar = tqdm( range(validation_steps), smoothing=0, disable=not accelerator.is_local_main_process, From b489082495ba6779385f282797227799413715f5 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 12 Jan 2025 16:42:04 -0500 Subject: [PATCH 316/582] Disable repeats for validation datasets --- library/train_util.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 6d3a772bb..4d143c373 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2055,9 +2055,10 @@ def load_dreambooth_dir(subset: DreamBoothSubset): num_reg_images = 0 reg_infos: List[Tuple[ImageInfo, DreamBoothSubset]] = [] for subset in subsets: - if subset.num_repeats < 1: + num_repeats = subset.num_repeats if self.is_training_dataset else 1 + if num_repeats < 1: logger.warning( - f"ignore subset with image_dir='{subset.image_dir}': num_repeats is less than 1 / num_repeatsが1を下回っているためサブセットを無視します: {subset.num_repeats}" + f"ignore subset with image_dir='{subset.image_dir}': num_repeats is less than 1 / num_repeatsが1を下回っているためサブセットを無視します: {num_repeats}" ) 
continue @@ -2075,12 +2076,12 @@ def load_dreambooth_dir(subset: DreamBoothSubset): continue if subset.is_reg: - num_reg_images += subset.num_repeats * len(img_paths) + num_reg_images += num_repeats * len(img_paths) else: - num_train_images += subset.num_repeats * len(img_paths) + num_train_images += num_repeats * len(img_paths) for img_path, caption, size in zip(img_paths, captions, sizes): - info = ImageInfo(img_path, subset.num_repeats, caption, subset.is_reg, img_path) + info = ImageInfo(img_path, num_repeats, caption, subset.is_reg, img_path) if size is not None: info.image_size = size if subset.is_reg: From c04e5dfe92250a4790dc5f6e092cd85809a4e81d Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 23 Jan 2025 09:57:24 -0500 Subject: [PATCH 317/582] Fix loss recorder on 0. Fix validation for cached runs. Assert on validation dataset --- flux_train_network.py | 8 +++++--- library/train_util.py | 8 +++++++- requirements.txt | 1 + sd3_train_network.py | 11 ++++++++--- sdxl_train_network.py | 8 +++++--- sdxl_train_textual_inversion.py | 5 +++-- train_network.py | 16 +++++++++++----- train_textual_inversion.py | 9 ++++++--- 8 files changed, 46 insertions(+), 20 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index b3aebecc7..5cd1b9d51 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -2,7 +2,7 @@ import copy import math import random -from typing import Any, Optional +from typing import Any, Optional, Union import torch from accelerate import Accelerator @@ -36,8 +36,8 @@ def __init__(self): self.is_schnell: Optional[bool] = None self.is_swapping_blocks: bool = False - def assert_extra_args(self, args, train_dataset_group): - super().assert_extra_args(args, train_dataset_group) + def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): + super().assert_extra_args(args, train_dataset_group, val_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) if args.fp8_base_unet: @@ -80,6 +80,8 @@ def assert_extra_args(self, args, train_dataset_group): args.blocks_to_swap = 18 # 18 is safe for most cases train_dataset_group.verify_bucket_reso_steps(32) # TODO check this + if val_dataset_group is not None: + val_dataset_group.verify_bucket_reso_steps(32) # TODO check this def load_target_model(self, args, weight_dtype, accelerator): # currently offload to cpu for some models diff --git a/library/train_util.py b/library/train_util.py index 4d143c373..56fea4a8c 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2893,6 +2893,9 @@ def __getitem__(self, idx): """ raise NotImplementedError + def get_resolutions(self) -> List[Tuple[int, int]]: + return [] + def load_arbitrary_dataset(args, tokenizer=None) -> MinimalDataset: module = ".".join(args.dataset_class.split(".")[:-1]) @@ -6520,4 +6523,7 @@ def add(self, *, epoch: int, step: int, loss: float) -> None: @property def moving_average(self) -> float: - return self.loss_total / len(self.loss_list) + losses = len(self.loss_list) + if losses == 0: + return 0 + return self.loss_total / losses diff --git a/requirements.txt b/requirements.txt index e0091749a..de39f5887 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ voluptuous==0.13.1 huggingface-hub==0.24.5 # for Image utils imagesize==1.4.1 +numpy<=2.0 # for BLIP captioning # requests==2.28.2 # timm==0.6.12 diff --git a/sd3_train_network.py b/sd3_train_network.py index c7417802d..dcf497f53 100644 --- 
a/sd3_train_network.py +++ b/sd3_train_network.py @@ -2,7 +2,7 @@ import copy import math import random -from typing import Any, Optional +from typing import Any, Optional, Union import torch from accelerate import Accelerator @@ -26,7 +26,7 @@ def __init__(self): super().__init__() self.sample_prompts_te_outputs = None - def assert_extra_args(self, args, train_dataset_group: train_util.DatasetGroup): + def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): # super().assert_extra_args(args, train_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) @@ -56,9 +56,14 @@ def assert_extra_args(self, args, train_dataset_group: train_util.DatasetGroup): ) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" train_dataset_group.verify_bucket_reso_steps(32) # TODO check this + if val_dataset_group is not None: + val_dataset_group.verify_bucket_reso_steps(32) # TODO check this # enumerate resolutions from dataset for positional embeddings - self.resolutions = train_dataset_group.get_resolutions() + resolutions = train_dataset_group.get_resolutions() + if val_dataset_group is not None: + resolutions = resolutions + val_dataset_group.get_resolutions() + self.resolutions = resolutions def load_target_model(self, args, weight_dtype, accelerator): # currently offload to cpu for some models diff --git a/sdxl_train_network.py b/sdxl_train_network.py index d45df6e05..eb09831ec 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -1,5 +1,5 @@ import argparse -from typing import List, Optional +from typing import List, Optional, Union import torch from accelerate import Accelerator @@ -23,8 +23,8 @@ def __init__(self): self.vae_scale_factor = sdxl_model_util.VAE_SCALE_FACTOR self.is_sdxl = True - def assert_extra_args(self, args, train_dataset_group): - super().assert_extra_args(args, train_dataset_group) + def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): + super().assert_extra_args(args, train_dataset_group, val_dataset_group) sdxl_train_util.verify_sdxl_training_args(args) if args.cache_text_encoder_outputs: @@ -37,6 +37,8 @@ def assert_extra_args(self, args, train_dataset_group): ), "network for Text Encoder cannot be trained with caching Text Encoder outputs / Text Encoderの出力をキャッシュしながらText Encoderのネットワークを学習することはできません" train_dataset_group.verify_bucket_reso_steps(32) + if val_dataset_group is not None: + val_dataset_group.verify_bucket_reso_steps(32) def load_target_model(self, args, weight_dtype, accelerator): ( diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index 821a69558..bf56faf34 100644 --- a/sdxl_train_textual_inversion.py +++ b/sdxl_train_textual_inversion.py @@ -18,11 +18,12 @@ def __init__(self): self.vae_scale_factor = sdxl_model_util.VAE_SCALE_FACTOR self.is_sdxl = True - def assert_extra_args(self, args, train_dataset_group): - super().assert_extra_args(args, train_dataset_group) + def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): + super().assert_extra_args(args, train_dataset_group, val_dataset_group) sdxl_train_util.verify_sdxl_training_args(args, supportTextEncoderCaching=False) 
train_dataset_group.verify_bucket_reso_steps(32) + val_dataset_group.verify_bucket_reso_steps(32) def load_target_model(self, args, weight_dtype, accelerator): ( diff --git a/train_network.py b/train_network.py index e7d93a108..2c3bb2aae 100644 --- a/train_network.py +++ b/train_network.py @@ -3,7 +3,7 @@ import math import os import typing -from typing import Any, List +from typing import Any, List, Union, Optional import sys import random import time @@ -124,8 +124,10 @@ def generate_step_logs( return logs - def assert_extra_args(self, args, train_dataset_group): + def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): train_dataset_group.verify_bucket_reso_steps(64) + if val_dataset_group is not None: + val_dataset_group.verify_bucket_reso_steps(64) def load_target_model(self, args, weight_dtype, accelerator): text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) @@ -512,7 +514,7 @@ def train(self, args): val_dataset_group.is_latent_cacheable() ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" - self.assert_extra_args(args, train_dataset_group) # may change some args + self.assert_extra_args(args, train_dataset_group, val_dataset_group) # may change some args # acceleratorを準備する logger.info("preparing accelerator") @@ -1414,7 +1416,9 @@ def remove_model(old_ckpt_name): args, text_encoding_strategy, tokenize_strategy, - is_train=False + is_train=False, + train_text_encoder=False, + train_unet=False ) current_loss = loss.detach().item() @@ -1474,7 +1478,9 @@ def remove_model(old_ckpt_name): args, text_encoding_strategy, tokenize_strategy, - is_train=False + is_train=False, + train_text_encoder=False, + train_unet=False ) current_loss = loss.detach().item() diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 113f35997..0c6568b08 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -2,7 +2,7 @@ import math import os from multiprocessing import Value -from typing import Any, List +from typing import Any, List, Optional, Union import toml from tqdm import tqdm @@ -99,9 +99,12 @@ def __init__(self): self.vae_scale_factor = 0.18215 self.is_sdxl = False - def assert_extra_args(self, args, train_dataset_group): + def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): train_dataset_group.verify_bucket_reso_steps(64) + if val_dataset_group is not None: + val_dataset_group.verify_bucket_reso_steps(64) + def load_target_model(self, args, weight_dtype, accelerator): text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) return model_util.get_model_version_str_for_sd1_sd2(args.v2, args.v_parameterization), [text_encoder], vae, unet @@ -325,7 +328,7 @@ def train(self, args): train_dataset_group = train_util.load_arbitrary_dataset(args) val_dataset_group = None - self.assert_extra_args(args, train_dataset_group) + self.assert_extra_args(args, train_dataset_group, val_dataset_group) current_epoch = Value("i", 0) current_step = Value("i", 0) From 58b82a576e32c2157e476840339ddafa98222dfc Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 26 Jan 2025 21:21:21 +0900 Subject: [PATCH 318/582] Fix to work with validation dataset --- library/train_util.py | 1 + sdxl_train_textual_inversion.py | 4 +++- 2 
files changed, 4 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 56fea4a8c..37ed0a994 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2403,6 +2403,7 @@ def __init__( self.dreambooth_dataset_delegate = DreamBoothDataset( db_subsets, + True, batch_size, resolution, network_multiplier, diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index bf56faf34..982007601 100644 --- a/sdxl_train_textual_inversion.py +++ b/sdxl_train_textual_inversion.py @@ -1,5 +1,6 @@ import argparse import os +from typing import Optional, Union import regex @@ -23,7 +24,8 @@ def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetG sdxl_train_util.verify_sdxl_training_args(args, supportTextEncoderCaching=False) train_dataset_group.verify_bucket_reso_steps(32) - val_dataset_group.verify_bucket_reso_steps(32) + if val_dataset_group is not None: + val_dataset_group.verify_bucket_reso_steps(32) def load_target_model(self, args, weight_dtype, accelerator): ( From e8529613d8a06ce91d3b304bccf85a172b1b4b31 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 26 Jan 2025 21:27:22 +0900 Subject: [PATCH 319/582] README.md: Update recent updates section to include validation loss support for training scripts --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 4dff15440..053354103 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,13 @@ The command to install PyTorch is as follows: ### Recent Updates +Jan 25, 2025: + +- `train_network.py`, `sdxl_train_network.py`, `flux_train_network.py`, and `sd3_train_network.py` now support validation loss. PR [#1864](https://github.com/kohya-ss/sd-scripts/pull/1864) Thank you to rockerBOO! + - For details on how to set it up, please refer to the PR. The documentation will be updated as needed. + - It will be added to other scripts as well. + - As a current limitation, validation loss is not supported when `--block_to_swap` is specified. + Dec 15, 2024: - RAdamScheduleFree optimizer is supported. PR [#1830](https://github.com/kohya-ss/sd-scripts/pull/1830) Thanks to nhamanasu! From 59b3b94faf827e3a7f01829fed0232d89dec9e33 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 26 Jan 2025 21:52:58 +0900 Subject: [PATCH 320/582] README.md: Update limitation for validation loss support to include schedule-free optimizer --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 053354103..4bbd7617e 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Jan 25, 2025: - `train_network.py`, `sdxl_train_network.py`, `flux_train_network.py`, and `sd3_train_network.py` now support validation loss. PR [#1864](https://github.com/kohya-ss/sd-scripts/pull/1864) Thank you to rockerBOO! - For details on how to set it up, please refer to the PR. The documentation will be updated as needed. - It will be added to other scripts as well. - - As a current limitation, validation loss is not supported when `--block_to_swap` is specified. + - As a current limitation, validation loss is not supported when `--block_to_swap` is specified, or when schedule-free optimizer is used. 
Dec 15, 2024: From 532f5c58a6e83a3400f82103f5854ff3f63d77d7 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 27 Jan 2025 20:50:42 +0900 Subject: [PATCH 321/582] formatting --- train_network.py | 229 ++++++++++++++++++++++------------------------- 1 file changed, 108 insertions(+), 121 deletions(-) diff --git a/train_network.py b/train_network.py index 2c3bb2aae..cc54be7cc 100644 --- a/train_network.py +++ b/train_network.py @@ -100,9 +100,7 @@ def generate_step_logs( if ( args.optimizer_type.lower().endswith("ProdigyPlusScheduleFree".lower()) and optimizer is not None ): # tracking d*lr value of unet. - logs["lr/d*lr"] = ( - optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] - ) + logs["lr/d*lr"] = optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"] else: idx = 0 if not args.network_train_unet_only: @@ -115,16 +113,17 @@ def generate_step_logs( logs[f"lr/d*lr/group{i}"] = ( lr_scheduler.optimizers[-1].param_groups[i]["d"] * lr_scheduler.optimizers[-1].param_groups[i]["lr"] ) - if ( - args.optimizer_type.lower().endswith("ProdigyPlusScheduleFree".lower()) and optimizer is not None - ): - logs[f"lr/d*lr/group{i}"] = ( - optimizer.param_groups[i]["d"] * optimizer.param_groups[i]["lr"] - ) + if args.optimizer_type.lower().endswith("ProdigyPlusScheduleFree".lower()) and optimizer is not None: + logs[f"lr/d*lr/group{i}"] = optimizer.param_groups[i]["d"] * optimizer.param_groups[i]["lr"] return logs - def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): + def assert_extra_args( + self, + args, + train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], + val_dataset_group: Optional[train_util.DatasetGroup], + ): train_dataset_group.verify_bucket_reso_steps(64) if val_dataset_group is not None: val_dataset_group.verify_bucket_reso_steps(64) @@ -219,7 +218,7 @@ def get_noise_pred_and_target( network, weight_dtype, train_unet, - is_train=True + is_train=True, ): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified @@ -315,22 +314,22 @@ def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, # endregion def process_batch( - self, - batch, - text_encoders, - unet, - network, - vae, - noise_scheduler, - vae_dtype, - weight_dtype, - accelerator, - args, - text_encoding_strategy: strategy_base.TextEncodingStrategy, - tokenize_strategy: strategy_base.TokenizeStrategy, - is_train=True, - train_text_encoder=True, - train_unet=True + self, + batch, + text_encoders, + unet, + network, + vae, + noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy: strategy_base.TextEncodingStrategy, + tokenize_strategy: strategy_base.TokenizeStrategy, + is_train=True, + train_text_encoder=True, + train_unet=True, ) -> torch.Tensor: """ Process a batch for the network @@ -397,7 +396,7 @@ def process_batch( network, weight_dtype, train_unet, - is_train=is_train + is_train=is_train, ) huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler) @@ -484,7 +483,7 @@ def train(self, args): else: # use arbitrary dataset class train_dataset_group = train_util.load_arbitrary_dataset(args) - val_dataset_group = None # placeholder until validation dataset supported for arbitrary + val_dataset_group = None # placeholder until validation dataset supported for arbitrary current_epoch = Value("i", 0) 
current_step = Value("i", 0) @@ -701,7 +700,7 @@ def train(self, args): num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) - + val_dataloader = torch.utils.data.DataLoader( val_dataset_group if val_dataset_group is not None else [], shuffle=False, @@ -900,7 +899,9 @@ def load_model_hook(models, input_dir): accelerator.print("running training / 学習開始") accelerator.print(f" num train images * repeats / 学習画像の数×繰り返し回数: {train_dataset_group.num_train_images}") - accelerator.print(f" num validation images * repeats / 学習画像の数×繰り返し回数: {val_dataset_group.num_train_images if val_dataset_group is not None else 0}") + accelerator.print( + f" num validation images * repeats / 学習画像の数×繰り返し回数: {val_dataset_group.num_train_images if val_dataset_group is not None else 0}" + ) accelerator.print(f" num reg images / 正則化画像の数: {train_dataset_group.num_reg_images}") accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}") accelerator.print(f" num epochs / epoch数: {num_train_epochs}") @@ -968,11 +969,11 @@ def load_model_hook(models, input_dir): "ss_huber_c": args.huber_c, "ss_fp8_base": bool(args.fp8_base), "ss_fp8_base_unet": bool(args.fp8_base_unet), - "ss_validation_seed": args.validation_seed, - "ss_validation_split": args.validation_split, - "ss_max_validation_steps": args.max_validation_steps, - "ss_validate_every_n_epochs": args.validate_every_n_epochs, - "ss_validate_every_n_steps": args.validate_every_n_steps, + "ss_validation_seed": args.validation_seed, + "ss_validation_split": args.validation_split, + "ss_max_validation_steps": args.max_validation_steps, + "ss_validate_every_n_epochs": args.validate_every_n_epochs, + "ss_validate_every_n_steps": args.validate_every_n_steps, } self.update_metadata(metadata, args) # architecture specific metadata @@ -1248,9 +1249,7 @@ def remove_model(old_ckpt_name): accelerator.log({}, step=0) validation_steps = ( - min(args.max_validation_steps, len(val_dataloader)) - if args.max_validation_steps is not None - else len(val_dataloader) + min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) ) # training loop @@ -1298,21 +1297,21 @@ def remove_model(old_ckpt_name): self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) loss = self.process_batch( - batch, - text_encoders, - unet, - network, - vae, - noise_scheduler, - vae_dtype, - weight_dtype, - accelerator, - args, - text_encoding_strategy, - tokenize_strategy, - is_train=True, - train_text_encoder=train_text_encoder, - train_unet=train_unet + batch, + text_encoders, + unet, + network, + vae, + noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy, + tokenize_strategy, + is_train=True, + train_text_encoder=train_text_encoder, + train_unet=train_unet, ) accelerator.backward(loss) @@ -1369,32 +1368,21 @@ def remove_model(old_ckpt_name): if args.scale_weight_norms: progress_bar.set_postfix(**{**max_mean_logs, **logs}) - if is_tracking: logs = self.generate_step_logs( - args, - current_loss, - avr_loss, - lr_scheduler, - lr_descriptions, - optimizer, - keys_scaled, - mean_norm, - maximum_norm + args, current_loss, avr_loss, lr_scheduler, lr_descriptions, optimizer, keys_scaled, mean_norm, maximum_norm ) accelerator.log(logs, step=global_step) # VALIDATION PER STEP should_validate_step = ( - args.validate_every_n_steps is not None - and global_step != 0 # Skip first step + args.validate_every_n_steps is not None + and global_step != 0 
# Skip first step and global_step % args.validate_every_n_steps == 0 ) if accelerator.sync_gradients and validation_steps > 0 and should_validate_step: val_progress_bar = tqdm( - range(validation_steps), smoothing=0, - disable=not accelerator.is_local_main_process, - desc="validation steps" + range(validation_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="validation steps" ) for val_step, batch in enumerate(val_dataloader): if val_step >= validation_steps: @@ -1404,27 +1392,27 @@ def remove_model(old_ckpt_name): self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) loss = self.process_batch( - batch, - text_encoders, - unet, - network, - vae, - noise_scheduler, - vae_dtype, - weight_dtype, - accelerator, - args, - text_encoding_strategy, - tokenize_strategy, + batch, + text_encoders, + unet, + network, + vae, + noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy, + tokenize_strategy, is_train=False, - train_text_encoder=False, - train_unet=False + train_text_encoder=False, + train_unet=False, ) current_loss = loss.detach().item() val_step_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) val_progress_bar.update(1) - val_progress_bar.set_postfix({ "val_avg_loss": val_step_loss_recorder.moving_average }) + val_progress_bar.set_postfix({"val_avg_loss": val_step_loss_recorder.moving_average}) if is_tracking: logs = { @@ -1436,26 +1424,25 @@ def remove_model(old_ckpt_name): if is_tracking: loss_validation_divergence = val_step_loss_recorder.moving_average - loss_recorder.moving_average logs = { - "loss/validation/step_average": val_step_loss_recorder.moving_average, - "loss/validation/step_divergence": loss_validation_divergence, + "loss/validation/step_average": val_step_loss_recorder.moving_average, + "loss/validation/step_divergence": loss_validation_divergence, } accelerator.log(logs, step=global_step) - + if global_step >= args.max_train_steps: break # EPOCH VALIDATION should_validate_epoch = ( - (epoch + 1) % args.validate_every_n_epochs == 0 - if args.validate_every_n_epochs is not None - else True + (epoch + 1) % args.validate_every_n_epochs == 0 if args.validate_every_n_epochs is not None else True ) if should_validate_epoch and len(val_dataloader) > 0: val_progress_bar = tqdm( - range(validation_steps), smoothing=0, - disable=not accelerator.is_local_main_process, - desc="epoch validation steps" + range(validation_steps), + smoothing=0, + disable=not accelerator.is_local_main_process, + desc="epoch validation steps", ) for val_step, batch in enumerate(val_dataloader): @@ -1466,43 +1453,43 @@ def remove_model(old_ckpt_name): self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) loss = self.process_batch( - batch, - text_encoders, - unet, - network, - vae, - noise_scheduler, - vae_dtype, - weight_dtype, - accelerator, - args, - text_encoding_strategy, - tokenize_strategy, + batch, + text_encoders, + unet, + network, + vae, + noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy, + tokenize_strategy, is_train=False, - train_text_encoder=False, - train_unet=False + train_text_encoder=False, + train_unet=False, ) current_loss = loss.detach().item() val_epoch_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) val_progress_bar.update(1) - val_progress_bar.set_postfix({ "val_epoch_avg_loss": val_epoch_loss_recorder.moving_average }) + val_progress_bar.set_postfix({"val_epoch_avg_loss": 
val_epoch_loss_recorder.moving_average}) if is_tracking: logs = { - "loss/validation/epoch_current": current_loss, - "epoch": epoch + 1, - "val_step": (epoch * validation_steps) + val_step + "loss/validation/epoch_current": current_loss, + "epoch": epoch + 1, + "val_step": (epoch * validation_steps) + val_step, } accelerator.log(logs, step=global_step) if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average - loss_validation_divergence = val_step_loss_recorder.moving_average - avr_loss + loss_validation_divergence = val_step_loss_recorder.moving_average - avr_loss logs = { - "loss/validation/epoch_average": avr_loss, - "loss/validation/epoch_divergence": loss_validation_divergence, - "epoch": epoch + 1 + "loss/validation/epoch_average": avr_loss, + "loss/validation/epoch_divergence": loss_validation_divergence, + "epoch": epoch + 1, } accelerator.log(logs, step=global_step) @@ -1510,7 +1497,7 @@ def remove_model(old_ckpt_name): if is_tracking: logs = {"loss/epoch_average": loss_recorder.moving_average, "epoch": epoch + 1} accelerator.log(logs, step=global_step) - + accelerator.wait_for_everyone() # 指定エポックごとにモデルを保存 @@ -1696,31 +1683,31 @@ def setup_parser() -> argparse.ArgumentParser: "--validation_seed", type=int, default=None, - help="Validation seed for shuffling validation dataset, training `--seed` used otherwise / 検証データセットをシャッフルするための検証シード、それ以外の場合はトレーニング `--seed` を使用する" + help="Validation seed for shuffling validation dataset, training `--seed` used otherwise / 検証データセットをシャッフルするための検証シード、それ以外の場合はトレーニング `--seed` を使用する", ) parser.add_argument( "--validation_split", type=float, default=0.0, - help="Split for validation images out of the training dataset / 学習画像から検証画像に分割する割合" + help="Split for validation images out of the training dataset / 学習画像から検証画像に分割する割合", ) parser.add_argument( "--validate_every_n_steps", type=int, default=None, - help="Run validation on validation dataset every N steps. By default, validation will only occur every epoch if a validation dataset is available / 検証データセットの検証をNステップごとに実行します。デフォルトでは、検証データセットが利用可能な場合にのみ、検証はエポックごとに実行されます" + help="Run validation on validation dataset every N steps. By default, validation will only occur every epoch if a validation dataset is available / 検証データセットの検証をNステップごとに実行します。デフォルトでは、検証データセットが利用可能な場合にのみ、検証はエポックごとに実行されます", ) parser.add_argument( "--validate_every_n_epochs", type=int, default=None, - help="Run validation dataset every N epochs. By default, validation will run every epoch if a validation dataset is available / 検証データセットをNエポックごとに実行します。デフォルトでは、検証データセットが利用可能な場合、検証はエポックごとに実行されます" + help="Run validation dataset every N epochs. By default, validation will run every epoch if a validation dataset is available / 検証データセットをNエポックごとに実行します。デフォルトでは、検証データセットが利用可能な場合、検証はエポックごとに実行されます", ) parser.add_argument( "--max_validation_steps", type=int, default=None, - help="Max number of validation dataset items processed. By default, validation will run the entire validation dataset / 処理される検証データセット項目の最大数。デフォルトでは、検証は検証データセット全体を実行します" + help="Max number of validation dataset items processed. 
By default, validation will run the entire validation dataset / 処理される検証データセット項目の最大数。デフォルトでは、検証は検証データセット全体を実行します", ) return parser From 86a2f3fd262e52b3249d9f5508efe4774f1fa3ed Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 27 Jan 2025 21:10:52 +0900 Subject: [PATCH 322/582] Fix gradient handling when Text Encoders are trained --- flux_train_network.py | 43 ++----------------------------------------- sd3_train_network.py | 2 +- train_network.py | 10 +++++----- 3 files changed, 8 insertions(+), 47 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index 5cd1b9d51..475bd751b 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -376,9 +376,8 @@ def get_noise_pred_and_target( t5_attn_mask = None def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask): - # if not args.split_mode: - # normal forward - with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): + # grad is enabled even if unet is not in train mode, because Text Encoder is in train mode + with torch.set_grad_enabled(is_train), accelerator.autocast(): # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) model_pred = unet( img=img, @@ -390,44 +389,6 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t guidance=guidance_vec, txt_attention_mask=t5_attn_mask, ) - """ - else: - # split forward to reduce memory usage - assert network.train_blocks == "single", "train_blocks must be single for split mode" - with accelerator.autocast(): - # move flux lower to cpu, and then move flux upper to gpu - unet.to("cpu") - clean_memory_on_device(accelerator.device) - self.flux_upper.to(accelerator.device) - - # upper model does not require grad - with torch.no_grad(): - intermediate_img, intermediate_txt, vec, pe = self.flux_upper( - img=packed_noisy_model_input, - img_ids=img_ids, - txt=t5_out, - txt_ids=txt_ids, - y=l_pooled, - timesteps=timesteps / 1000, - guidance=guidance_vec, - txt_attention_mask=t5_attn_mask, - ) - - # move flux upper back to cpu, and then move flux lower to gpu - self.flux_upper.to("cpu") - clean_memory_on_device(accelerator.device) - unet.to(accelerator.device) - - # lower model requires grad - intermediate_img.requires_grad_(True) - intermediate_txt.requires_grad_(True) - vec.requires_grad_(True) - pe.requires_grad_(True) - - with torch.set_grad_enabled(is_train and train_unet): - model_pred = unet(img=intermediate_img, txt=intermediate_txt, vec=vec, pe=pe, txt_attention_mask=t5_attn_mask) - """ - return model_pred model_pred = call_dit( diff --git a/sd3_train_network.py b/sd3_train_network.py index dcf497f53..2f4579492 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -345,7 +345,7 @@ def get_noise_pred_and_target( t5_attn_mask = None # call model - with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): + with torch.set_grad_enabled(is_train), accelerator.autocast(): # TODO support attention mask model_pred = unet(noisy_model_input, timesteps, context=context, y=lg_pooled) diff --git a/train_network.py b/train_network.py index cc54be7cc..6f1652fd9 100644 --- a/train_network.py +++ b/train_network.py @@ -232,7 +232,7 @@ def get_noise_pred_and_target( t.requires_grad_(True) # Predict the noise residual - with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): + with torch.set_grad_enabled(is_train), accelerator.autocast(): 
noise_pred = self.call_unet( args, accelerator, @@ -1405,8 +1405,8 @@ def remove_model(old_ckpt_name): text_encoding_strategy, tokenize_strategy, is_train=False, - train_text_encoder=False, - train_unet=False, + train_text_encoder=train_text_encoder, # this is needed for validation because Text Encoders must be called if train_text_encoder is True + train_unet=train_unet, ) current_loss = loss.detach().item() @@ -1466,8 +1466,8 @@ def remove_model(old_ckpt_name): text_encoding_strategy, tokenize_strategy, is_train=False, - train_text_encoder=False, - train_unet=False, + train_text_encoder=train_text_encoder, + train_unet=train_unet, ) current_loss = loss.detach().item() From b6a309321675b5d0a59b776ffb4d0ecdd3d28ec2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 27 Jan 2025 21:22:11 +0900 Subject: [PATCH 323/582] call optimizer eval/train fn before/after validation --- train_network.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/train_network.py b/train_network.py index 6f1652fd9..e735c582d 100644 --- a/train_network.py +++ b/train_network.py @@ -1381,6 +1381,8 @@ def remove_model(old_ckpt_name): and global_step % args.validate_every_n_steps == 0 ) if accelerator.sync_gradients and validation_steps > 0 and should_validate_step: + optimizer_eval_fn() + val_progress_bar = tqdm( range(validation_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="validation steps" ) @@ -1429,6 +1431,8 @@ def remove_model(old_ckpt_name): } accelerator.log(logs, step=global_step) + optimizer_train_fn() + if global_step >= args.max_train_steps: break @@ -1438,6 +1442,8 @@ def remove_model(old_ckpt_name): ) if should_validate_epoch and len(val_dataloader) > 0: + optimizer_eval_fn() + val_progress_bar = tqdm( range(validation_steps), smoothing=0, @@ -1493,6 +1499,8 @@ def remove_model(old_ckpt_name): } accelerator.log(logs, step=global_step) + optimizer_train_fn() + # END OF EPOCH if is_tracking: logs = {"loss/epoch_average": loss_recorder.moving_average, "epoch": epoch + 1} From 29f31d005f12a08650389164fa9c60504928d451 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 27 Jan 2025 21:35:43 +0900 Subject: [PATCH 324/582] add network.train()/eval() for validation --- train_network.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index e735c582d..9b8036f8b 100644 --- a/train_network.py +++ b/train_network.py @@ -1276,7 +1276,7 @@ def remove_model(old_ckpt_name): metadata["ss_epoch"] = str(epoch + 1) - accelerator.unwrap_model(network).on_epoch_start(text_encoder, unet) + accelerator.unwrap_model(network).on_epoch_start(text_encoder, unet) # network.train() is called here # TRAINING skipped_dataloader = None @@ -1382,6 +1382,7 @@ def remove_model(old_ckpt_name): ) if accelerator.sync_gradients and validation_steps > 0 and should_validate_step: optimizer_eval_fn() + accelerator.unwrap_model(network).eval() val_progress_bar = tqdm( range(validation_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="validation steps" @@ -1432,6 +1433,7 @@ def remove_model(old_ckpt_name): accelerator.log(logs, step=global_step) optimizer_train_fn() + accelerator.unwrap_model(network).train() if global_step >= args.max_train_steps: break @@ -1443,6 +1445,7 @@ def remove_model(old_ckpt_name): if should_validate_epoch and len(val_dataloader) > 0: optimizer_eval_fn() + accelerator.unwrap_model(network).eval() val_progress_bar = tqdm( range(validation_steps), @@ -1500,6 +1503,7 @@ def remove_model(old_ckpt_name): 
accelerator.log(logs, step=global_step) optimizer_train_fn() + accelerator.unwrap_model(network).train() # END OF EPOCH if is_tracking: From 0750859133eec7858052cd3f79106113fa786e94 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 27 Jan 2025 21:56:59 +0900 Subject: [PATCH 325/582] validation: Implement timestep-based validation processing --- sd3_train_network.py | 1 + train_network.py | 167 +++++++++++++++++++++++++------------------ 2 files changed, 100 insertions(+), 68 deletions(-) diff --git a/sd3_train_network.py b/sd3_train_network.py index 2f4579492..d4f131252 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -446,6 +446,7 @@ def forward(hidden_states): prepare_fp8(text_encoder, weight_dtype) def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): + # TODO consider validation # drop cached text encoder outputs text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: diff --git a/train_network.py b/train_network.py index 9b8036f8b..a63e9d1e9 100644 --- a/train_network.py +++ b/train_network.py @@ -9,6 +9,7 @@ import time import json from multiprocessing import Value +import numpy as np import toml from tqdm import tqdm @@ -1248,10 +1249,6 @@ def remove_model(old_ckpt_name): # log empty object to commit the sample images to wandb accelerator.log({}, step=0) - validation_steps = ( - min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) - ) - # training loop if initial_step > 0: # only if skip_until_initial_step is specified for skip_epoch in range(epoch_to_start): # skip epochs @@ -1270,6 +1267,17 @@ def remove_model(old_ckpt_name): clean_memory_on_device(accelerator.device) + validation_steps = ( + min(args.max_validation_steps, len(val_dataloader)) if args.max_validation_steps is not None else len(val_dataloader) + ) + NUM_VALIDATION_TIMESTEPS = 4 # 200, 400, 600, 800 TODO make this configurable + min_timestep = 0 if args.min_timestep is None else args.min_timestep + max_timestep = noise_scheduler.num_train_timesteps if args.max_timestep is None else args.max_timestep + validation_timesteps = np.linspace(min_timestep, max_timestep, (NUM_VALIDATION_TIMESTEPS + 2), dtype=int)[1:-1] + validation_total_steps = validation_steps * len(validation_timesteps) + original_args_min_timestep = args.min_timestep + original_args_max_timestep = args.max_timestep + for epoch in range(epoch_to_start, num_train_epochs): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}\n") current_epoch.value = epoch + 1 @@ -1385,44 +1393,55 @@ def remove_model(old_ckpt_name): accelerator.unwrap_model(network).eval() val_progress_bar = tqdm( - range(validation_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="validation steps" + range(validation_total_steps), + smoothing=0, + disable=not accelerator.is_local_main_process, + desc="validation steps", ) + val_ts_step = 0 for val_step, batch in enumerate(val_dataloader): if val_step >= validation_steps: break - # temporary, for batch processing - self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) - - loss = self.process_batch( - batch, - text_encoders, - unet, - network, - vae, - noise_scheduler, - vae_dtype, - weight_dtype, - accelerator, - args, - text_encoding_strategy, - tokenize_strategy, - is_train=False, - train_text_encoder=train_text_encoder, # this is needed for validation because Text Encoders must be called if 
train_text_encoder is True - train_unet=train_unet, - ) - - current_loss = loss.detach().item() - val_step_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) - val_progress_bar.update(1) - val_progress_bar.set_postfix({"val_avg_loss": val_step_loss_recorder.moving_average}) - - if is_tracking: - logs = { - "loss/validation/step_current": current_loss, - "val_step": (epoch * validation_steps) + val_step, - } - accelerator.log(logs, step=global_step) + for timestep in validation_timesteps: + # temporary, for batch processing + self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) + + args.min_timestep = args.max_timestep = timestep # dirty hack to change timestep + + loss = self.process_batch( + batch, + text_encoders, + unet, + network, + vae, + noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy, + tokenize_strategy, + is_train=False, + train_text_encoder=train_text_encoder, # this is needed for validation because Text Encoders must be called if train_text_encoder is True + train_unet=train_unet, + ) + + current_loss = loss.detach().item() + val_step_loss_recorder.add(epoch=epoch, step=val_ts_step, loss=current_loss) + val_progress_bar.update(1) + val_progress_bar.set_postfix( + {"val_avg_loss": val_step_loss_recorder.moving_average, "timestep": timestep} + ) + + if is_tracking: + logs = { + "loss/validation/step_current": current_loss, + "val_step": (epoch * validation_total_steps) + val_ts_step, + } + accelerator.log(logs, step=global_step) + + val_ts_step += 1 if is_tracking: loss_validation_divergence = val_step_loss_recorder.moving_average - loss_recorder.moving_average @@ -1432,6 +1451,8 @@ def remove_model(old_ckpt_name): } accelerator.log(logs, step=global_step) + args.min_timestep = original_args_min_timestep + args.max_timestep = original_args_max_timestep optimizer_train_fn() accelerator.unwrap_model(network).train() @@ -1448,49 +1469,57 @@ def remove_model(old_ckpt_name): accelerator.unwrap_model(network).eval() val_progress_bar = tqdm( - range(validation_steps), + range(validation_total_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="epoch validation steps", ) + val_ts_step = 0 for val_step, batch in enumerate(val_dataloader): if val_step >= validation_steps: break - # temporary, for batch processing - self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) + for timestep in validation_timesteps: + args.min_timestep = args.max_timestep = timestep - loss = self.process_batch( - batch, - text_encoders, - unet, - network, - vae, - noise_scheduler, - vae_dtype, - weight_dtype, - accelerator, - args, - text_encoding_strategy, - tokenize_strategy, - is_train=False, - train_text_encoder=train_text_encoder, - train_unet=train_unet, - ) + # temporary, for batch processing + self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) + + loss = self.process_batch( + batch, + text_encoders, + unet, + network, + vae, + noise_scheduler, + vae_dtype, + weight_dtype, + accelerator, + args, + text_encoding_strategy, + tokenize_strategy, + is_train=False, + train_text_encoder=train_text_encoder, + train_unet=train_unet, + ) - current_loss = loss.detach().item() - val_epoch_loss_recorder.add(epoch=epoch, step=val_step, loss=current_loss) - val_progress_bar.update(1) - val_progress_bar.set_postfix({"val_epoch_avg_loss": val_epoch_loss_recorder.moving_average}) + current_loss = loss.detach().item() + 
val_epoch_loss_recorder.add(epoch=epoch, step=val_ts_step, loss=current_loss) + val_progress_bar.update(1) + val_progress_bar.set_postfix( + {"val_epoch_avg_loss": val_epoch_loss_recorder.moving_average, "timestep": timestep} + ) - if is_tracking: - logs = { - "loss/validation/epoch_current": current_loss, - "epoch": epoch + 1, - "val_step": (epoch * validation_steps) + val_step, - } - accelerator.log(logs, step=global_step) + if is_tracking: + logs = { + "loss/validation/epoch_current": current_loss, + "epoch": epoch + 1, + "val_step": (epoch * validation_total_steps) + val_ts_step, + } + accelerator.log(logs, step=global_step) + + val_ts_step += 1 if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average @@ -1502,6 +1531,8 @@ def remove_model(old_ckpt_name): } accelerator.log(logs, step=global_step) + args.min_timestep = original_args_min_timestep + args.max_timestep = original_args_max_timestep optimizer_train_fn() accelerator.unwrap_model(network).train() From 0778dd9b1df0d6aa33287ded3ce4195f3d03251b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 27 Jan 2025 22:03:42 +0900 Subject: [PATCH 326/582] fix Text Encoder only LoRA training --- flux_train_network.py | 2 +- sd3_train_network.py | 2 +- train_network.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index 5cd1b9d51..ae4b62f5c 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -378,7 +378,7 @@ def get_noise_pred_and_target( def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask): # if not args.split_mode: # normal forward - with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): + with torch.set_grad_enabled(is_train), accelerator.autocast(): # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) model_pred = unet( img=img, diff --git a/sd3_train_network.py b/sd3_train_network.py index dcf497f53..2f4579492 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -345,7 +345,7 @@ def get_noise_pred_and_target( t5_attn_mask = None # call model - with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): + with torch.set_grad_enabled(is_train), accelerator.autocast(): # TODO support attention mask model_pred = unet(noisy_model_input, timesteps, context=context, y=lg_pooled) diff --git a/train_network.py b/train_network.py index 2c3bb2aae..c3879531d 100644 --- a/train_network.py +++ b/train_network.py @@ -233,7 +233,7 @@ def get_noise_pred_and_target( t.requires_grad_(True) # Predict the noise residual - with torch.set_grad_enabled(is_train and train_unet), accelerator.autocast(): + with torch.set_grad_enabled(is_train), accelerator.autocast(): noise_pred = self.call_unet( args, accelerator, From 45ec02b2a8b5eb5af8f5b4877381dc4dcc596cb9 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 27 Jan 2025 22:10:38 +0900 Subject: [PATCH 327/582] use same noise for every validation --- flux_train_network.py | 1 - train_network.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/flux_train_network.py b/flux_train_network.py index aab025735..475bd751b 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -377,7 +377,6 @@ def get_noise_pred_and_target( def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask): # grad is enabled even if unet is not in train mode, because Text 
Encoder is in train mode - with torch.set_grad_enabled(is_train), accelerator.autocast(): # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) model_pred = unet( diff --git a/train_network.py b/train_network.py index a63e9d1e9..f0deb67ab 100644 --- a/train_network.py +++ b/train_network.py @@ -1391,6 +1391,8 @@ def remove_model(old_ckpt_name): if accelerator.sync_gradients and validation_steps > 0 and should_validate_step: optimizer_eval_fn() accelerator.unwrap_model(network).eval() + rng_state = torch.get_rng_state() + torch.manual_seed(args.validation_seed if args.validation_seed is not None else args.seed) val_progress_bar = tqdm( range(validation_total_steps), @@ -1451,6 +1453,7 @@ def remove_model(old_ckpt_name): } accelerator.log(logs, step=global_step) + torch.set_rng_state(rng_state) args.min_timestep = original_args_min_timestep args.max_timestep = original_args_max_timestep optimizer_train_fn() @@ -1467,6 +1470,8 @@ def remove_model(old_ckpt_name): if should_validate_epoch and len(val_dataloader) > 0: optimizer_eval_fn() accelerator.unwrap_model(network).eval() + rng_state = torch.get_rng_state() + torch.manual_seed(args.validation_seed if args.validation_seed is not None else args.seed) val_progress_bar = tqdm( range(validation_total_steps), @@ -1531,6 +1536,7 @@ def remove_model(old_ckpt_name): } accelerator.log(logs, step=global_step) + torch.set_rng_state(rng_state) args.min_timestep = original_args_min_timestep args.max_timestep = original_args_max_timestep optimizer_train_fn() From de830b89416f0671d7a1364a9262fa850c0669df Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 29 Jan 2025 00:02:45 -0500 Subject: [PATCH 328/582] Move progress bar to account for sampling image first --- train_network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train_network.py b/train_network.py index c3879531d..2deb736d6 100644 --- a/train_network.py +++ b/train_network.py @@ -1163,10 +1163,6 @@ def load_model_hook(models, input_dir): args.max_train_steps > initial_step ), f"max_train_steps should be greater than initial step / max_train_stepsは初期ステップより大きい必要があります: {args.max_train_steps} vs {initial_step}" - progress_bar = tqdm( - range(args.max_train_steps - initial_step), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps" - ) - epoch_to_start = 0 if initial_step > 0: if args.skip_until_initial_step: @@ -1271,6 +1267,10 @@ def remove_model(old_ckpt_name): clean_memory_on_device(accelerator.device) + progress_bar = tqdm( + range(args.max_train_steps - initial_step), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps" + ) + for epoch in range(epoch_to_start, num_train_epochs): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}\n") current_epoch.value = epoch + 1 From c5b803ce94bd70812e6979ac7b986a769659b14e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 4 Feb 2025 21:59:09 +0900 Subject: [PATCH 329/582] rng state management: Implement functions to get and set RNG states for consistent validation --- train_network.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/train_network.py b/train_network.py index f0deb67ab..b3c7ff524 100644 --- a/train_network.py +++ b/train_network.py @@ -1278,6 +1278,31 @@ def remove_model(old_ckpt_name): original_args_min_timestep = args.min_timestep original_args_max_timestep = args.max_timestep + def 
get_rng_state() -> tuple[torch.ByteTensor, Optional[torch.ByteTensor], tuple]: + cpu_rng_state = torch.get_rng_state() + if accelerator.device.type == "cuda": + gpu_rng_state = torch.cuda.get_rng_state() + elif accelerator.device.type == "xpu": + gpu_rng_state = torch.xpu.get_rng_state() + elif accelerator.device.type == "mps": + gpu_rng_state = torch.cuda.get_rng_state() + else: + gpu_rng_state = None + python_rng_state = random.getstate() + return (cpu_rng_state, gpu_rng_state, python_rng_state) + + def set_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTensor], tuple]): + cpu_rng_state, gpu_rng_state, python_rng_state = rng_states + torch.set_rng_state(cpu_rng_state) + if gpu_rng_state is not None: + if accelerator.device.type == "cuda": + torch.cuda.set_rng_state(gpu_rng_state) + elif accelerator.device.type == "xpu": + torch.xpu.set_rng_state(gpu_rng_state) + elif accelerator.device.type == "mps": + torch.cuda.set_rng_state(gpu_rng_state) + random.setstate(python_rng_state) + for epoch in range(epoch_to_start, num_train_epochs): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}\n") current_epoch.value = epoch + 1 @@ -1391,7 +1416,7 @@ def remove_model(old_ckpt_name): if accelerator.sync_gradients and validation_steps > 0 and should_validate_step: optimizer_eval_fn() accelerator.unwrap_model(network).eval() - rng_state = torch.get_rng_state() + rng_states = get_rng_state() torch.manual_seed(args.validation_seed if args.validation_seed is not None else args.seed) val_progress_bar = tqdm( @@ -1453,7 +1478,7 @@ def remove_model(old_ckpt_name): } accelerator.log(logs, step=global_step) - torch.set_rng_state(rng_state) + set_rng_state(rng_states) args.min_timestep = original_args_min_timestep args.max_timestep = original_args_max_timestep optimizer_train_fn() @@ -1470,7 +1495,7 @@ def remove_model(old_ckpt_name): if should_validate_epoch and len(val_dataloader) > 0: optimizer_eval_fn() accelerator.unwrap_model(network).eval() - rng_state = torch.get_rng_state() + rng_states = get_rng_state() torch.manual_seed(args.validation_seed if args.validation_seed is not None else args.seed) val_progress_bar = tqdm( @@ -1536,7 +1561,7 @@ def remove_model(old_ckpt_name): } accelerator.log(logs, step=global_step) - torch.set_rng_state(rng_state) + set_rng_state(rng_states) args.min_timestep = original_args_min_timestep args.max_timestep = original_args_max_timestep optimizer_train_fn() From a24db1d532a95cc9dd91aba25a06b8eb58db5cff Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 4 Feb 2025 22:02:42 +0900 Subject: [PATCH 330/582] fix: validation timestep generation fails on SD/SDXL training --- library/train_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 37ed0a994..01fa64674 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5935,7 +5935,10 @@ def save_sd_model_on_train_end_common( def get_timesteps(min_timestep: int, max_timestep: int, b_size: int, device: torch.device) -> torch.Tensor: - timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device="cpu") + if min_timestep < max_timestep: + timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device="cpu") + else: + timesteps = torch.full((b_size,), max_timestep, device="cpu") timesteps = timesteps.long().to(device) return timesteps From 0911683717e439676bba758a5f7a29356984966c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 9 Feb 2025 20:53:49 +0900 Subject: [PATCH 331/582] set python random state --- 
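Note: switching the Python RNG in addition to the torch RNG makes any random.random()-based
decisions taken during a validation step reproducible as well, not only the torch-generated
noise and timesteps. A minimal sketch of the switch/restore pattern used below (assuming a
CUDA device; the actual code also handles xpu and mps):

    import random
    import torch

    def switch_rng_state(seed: int):
        # save current states, then reseed everything with the validation seed
        saved = (torch.get_rng_state(), torch.cuda.get_rng_state(), random.getstate())
        torch.manual_seed(seed)  # seeds CPU and all CUDA devices
        random.seed(seed)
        return saved

    def restore_rng_state(saved):
        cpu_state, gpu_state, py_state = saved
        torch.set_rng_state(cpu_state)
        torch.cuda.set_rng_state(gpu_state)
        random.setstate(py_state)
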
train_network.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/train_network.py b/train_network.py index b3c7ff524..083e5993d 100644 --- a/train_network.py +++ b/train_network.py @@ -1278,7 +1278,7 @@ def remove_model(old_ckpt_name): original_args_min_timestep = args.min_timestep original_args_max_timestep = args.max_timestep - def get_rng_state() -> tuple[torch.ByteTensor, Optional[torch.ByteTensor], tuple]: + def switch_rng_state(seed:int) -> tuple[torch.ByteTensor, Optional[torch.ByteTensor], tuple]: cpu_rng_state = torch.get_rng_state() if accelerator.device.type == "cuda": gpu_rng_state = torch.cuda.get_rng_state() @@ -1289,9 +1289,13 @@ def get_rng_state() -> tuple[torch.ByteTensor, Optional[torch.ByteTensor], tuple else: gpu_rng_state = None python_rng_state = random.getstate() + + torch.manual_seed(seed) + random.seed(seed) + return (cpu_rng_state, gpu_rng_state, python_rng_state) - def set_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTensor], tuple]): + def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTensor], tuple]): cpu_rng_state, gpu_rng_state, python_rng_state = rng_states torch.set_rng_state(cpu_rng_state) if gpu_rng_state is not None: @@ -1416,8 +1420,7 @@ def set_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTensor] if accelerator.sync_gradients and validation_steps > 0 and should_validate_step: optimizer_eval_fn() accelerator.unwrap_model(network).eval() - rng_states = get_rng_state() - torch.manual_seed(args.validation_seed if args.validation_seed is not None else args.seed) + rng_states = switch_rng_state(args.validation_seed if args.validation_seed is not None else args.seed) val_progress_bar = tqdm( range(validation_total_steps), @@ -1478,7 +1481,7 @@ def set_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTensor] } accelerator.log(logs, step=global_step) - set_rng_state(rng_states) + restore_rng_state(rng_states) args.min_timestep = original_args_min_timestep args.max_timestep = original_args_max_timestep optimizer_train_fn() @@ -1495,8 +1498,7 @@ def set_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTensor] if should_validate_epoch and len(val_dataloader) > 0: optimizer_eval_fn() accelerator.unwrap_model(network).eval() - rng_states = get_rng_state() - torch.manual_seed(args.validation_seed if args.validation_seed is not None else args.seed) + rng_states = switch_rng_state(args.validation_seed if args.validation_seed is not None else args.seed) val_progress_bar = tqdm( range(validation_total_steps), @@ -1561,7 +1563,7 @@ def set_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTensor] } accelerator.log(logs, step=global_step) - set_rng_state(rng_states) + restore_rng_state(rng_states) args.min_timestep = original_args_min_timestep args.max_timestep = original_args_max_timestep optimizer_train_fn() From 344845b42941b48956dce94d614fbf32e900c70e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 9 Feb 2025 21:25:40 +0900 Subject: [PATCH 332/582] fix: validation with block swap --- flux_train_network.py | 14 ++++++++++++-- sd3_train_network.py | 19 ++++++++++++++----- train_network.py | 18 +++++++++++------- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index 475bd751b..e97dfc5b8 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -36,7 +36,12 @@ def __init__(self): self.is_schnell: Optional[bool] = None self.is_swapping_blocks: bool = 
False - def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): + def assert_extra_args( + self, + args, + train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], + val_dataset_group: Optional[train_util.DatasetGroup], + ): super().assert_extra_args(args, train_dataset_group, val_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) @@ -341,7 +346,7 @@ def get_noise_pred_and_target( network, weight_dtype, train_unet, - is_train=True + is_train=True, ): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) @@ -507,6 +512,11 @@ def forward(hidden_states): text_encoder.to(te_weight_dtype) # fp8 prepare_fp8(text_encoder, weight_dtype) + def on_validation_step_end(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): + if self.is_swapping_blocks: + # prepare for next forward: because backward pass is not called, we need to prepare it here + accelerator.unwrap_model(unet).prepare_block_swap_before_forward() + def prepare_unet_with_accelerator( self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module ) -> torch.nn.Module: diff --git a/sd3_train_network.py b/sd3_train_network.py index d4f131252..216d93c58 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -26,7 +26,12 @@ def __init__(self): super().__init__() self.sample_prompts_te_outputs = None - def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): + def assert_extra_args( + self, + args, + train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], + val_dataset_group: Optional[train_util.DatasetGroup], + ): # super().assert_extra_args(args, train_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) @@ -317,7 +322,7 @@ def get_noise_pred_and_target( network, weight_dtype, train_unet, - is_train=True + is_train=True, ): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) @@ -445,15 +450,19 @@ def forward(hidden_states): text_encoder.to(te_weight_dtype) # fp8 prepare_fp8(text_encoder, weight_dtype) - def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): - # TODO consider validation - # drop cached text encoder outputs + def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype, is_train=True): + # drop cached text encoder outputs: in validation, we drop cached outputs deterministically by fixed seed text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) if text_encoder_outputs_list is not None: text_encodoing_strategy: strategy_sd3.Sd3TextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() text_encoder_outputs_list = text_encodoing_strategy.drop_cached_text_encoder_outputs(*text_encoder_outputs_list) batch["text_encoder_outputs_list"] = text_encoder_outputs_list + def on_validation_step_end(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): + if self.is_swapping_blocks: + # prepare for next forward: because backward pass is not called, we need to prepare it here + accelerator.unwrap_model(unet).prepare_block_swap_before_forward() + def prepare_unet_with_accelerator( self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module ) -> torch.nn.Module: diff --git a/train_network.py 
b/train_network.py index 083e5993d..49013c708 100644 --- a/train_network.py +++ b/train_network.py @@ -309,7 +309,10 @@ def prepare_unet_with_accelerator( ) -> torch.nn.Module: return accelerator.prepare(unet) - def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): + def on_step_start(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype, is_train: bool = True): + pass + + def on_validation_step_end(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): pass # endregion @@ -1278,7 +1281,7 @@ def remove_model(old_ckpt_name): original_args_min_timestep = args.min_timestep original_args_max_timestep = args.max_timestep - def switch_rng_state(seed:int) -> tuple[torch.ByteTensor, Optional[torch.ByteTensor], tuple]: + def switch_rng_state(seed: int) -> tuple[torch.ByteTensor, Optional[torch.ByteTensor], tuple]: cpu_rng_state = torch.get_rng_state() if accelerator.device.type == "cuda": gpu_rng_state = torch.cuda.get_rng_state() @@ -1330,8 +1333,8 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen with accelerator.accumulate(training_model): on_step_start_for_network(text_encoder, unet) - # temporary, for batch processing - self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) + # preprocess batch for each model + self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype, is_train=True) loss = self.process_batch( batch, @@ -1434,8 +1437,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen break for timestep in validation_timesteps: - # temporary, for batch processing - self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) + self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype, is_train=False) args.min_timestep = args.max_timestep = timestep # dirty hack to change timestep @@ -1471,6 +1473,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen } accelerator.log(logs, step=global_step) + self.on_validation_step_end(args, accelerator, network, text_encoders, unet, batch, weight_dtype) val_ts_step += 1 if is_tracking: @@ -1516,7 +1519,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen args.min_timestep = args.max_timestep = timestep # temporary, for batch processing - self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype) + self.on_step_start(args, accelerator, network, text_encoders, unet, batch, weight_dtype, is_train=False) loss = self.process_batch( batch, @@ -1551,6 +1554,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen } accelerator.log(logs, step=global_step) + self.on_validation_step_end(args, accelerator, network, text_encoders, unet, batch, weight_dtype) val_ts_step += 1 if is_tracking: From 177203818a024329efa74640a588674323363373 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 11 Feb 2025 21:42:46 +0900 Subject: [PATCH 333/582] fix: unpause training progress bar after vaidation --- train_network.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train_network.py b/train_network.py index 49013c708..8bfb19258 100644 --- a/train_network.py +++ b/train_network.py @@ -1489,6 +1489,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen args.max_timestep = original_args_max_timestep optimizer_train_fn() accelerator.unwrap_model(network).train() 
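+                    # unpause the tqdm timer so time spent in validation is not counted in the step rate / ETA estimate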
+ progress_bar.unpause() if global_step >= args.max_train_steps: break @@ -1572,6 +1573,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen args.max_timestep = original_args_max_timestep optimizer_train_fn() accelerator.unwrap_model(network).train() + progress_bar.unpause() # END OF EPOCH if is_tracking: From cd80752175c663ede2cb7995da652ed5f5f7f749 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 11 Feb 2025 21:42:58 +0900 Subject: [PATCH 334/582] fix: remove unused parameter 'accelerator' from encode_images_to_latents method --- flux_train_network.py | 2 +- sd3_train_network.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index e97dfc5b8..def441559 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -328,7 +328,7 @@ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) return noise_scheduler - def encode_images_to_latents(self, args, accelerator, vae, images): + def encode_images_to_latents(self, args, vae, images): return vae.encode(images) def shift_scale_latents(self, args, latents): diff --git a/sd3_train_network.py b/sd3_train_network.py index 216d93c58..cdb7aa4e3 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -304,7 +304,7 @@ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.training_shift) return noise_scheduler - def encode_images_to_latents(self, args, accelerator, vae, images): + def encode_images_to_latents(self, args, vae, images): return vae.encode(images) def shift_scale_latents(self, args, latents): From 76b761943b5166f496aa1cb8ffbcc2d04469346a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 11 Feb 2025 21:53:57 +0900 Subject: [PATCH 335/582] fix: simplify validation step condition in NetworkTrainer --- train_network.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/train_network.py b/train_network.py index 8bfb19258..99c58f49f 100644 --- a/train_network.py +++ b/train_network.py @@ -1414,12 +1414,9 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen ) accelerator.log(logs, step=global_step) - # VALIDATION PER STEP - should_validate_step = ( - args.validate_every_n_steps is not None - and global_step != 0 # Skip first step - and global_step % args.validate_every_n_steps == 0 - ) + # VALIDATION PER STEP: global_step is already incremented + # for example, if validate_every_n_steps=100, validate at step 100, 200, 300, ... 
+ should_validate_step = args.validate_every_n_steps is not None and global_step % args.validate_every_n_steps == 0 if accelerator.sync_gradients and validation_steps > 0 and should_validate_step: optimizer_eval_fn() accelerator.unwrap_model(network).eval() From d154e76c457a526d8af0853c92edab98cade22f6 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Wed, 12 Feb 2025 16:30:05 +0800 Subject: [PATCH 336/582] init --- library/lumina_models.py | 1144 ++++++++++++++++++++++++++++++++++ library/lumina_train_util.py | 554 ++++++++++++++++ library/lumina_util.py | 194 ++++++ library/sai_model_spec.py | 12 + library/strategy_lumina.py | 275 ++++++++ library/train_util.py | 2 + lumina_train_network.py | 192 ++++++ 7 files changed, 2373 insertions(+) create mode 100644 library/lumina_models.py create mode 100644 library/lumina_train_util.py create mode 100644 library/lumina_util.py create mode 100644 library/strategy_lumina.py create mode 100644 lumina_train_network.py diff --git a/library/lumina_models.py b/library/lumina_models.py new file mode 100644 index 000000000..43b1e9c64 --- /dev/null +++ b/library/lumina_models.py @@ -0,0 +1,1144 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# GLIDE: https://github.com/openai/glide-text2im +# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py +# -------------------------------------------------------- + +import math +from typing import List, Optional, Tuple +from dataclasses import dataclass + +from flash_attn import flash_attn_varlen_func +from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +import torch +import torch.nn as nn +import torch.nn.functional as F + +try: + from apex.normalization import FusedRMSNorm as RMSNorm +except ImportError: + warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation") + +memory_efficient_attention = None +try: + import xformers +except: + pass + +try: + from xformers.ops import memory_efficient_attention +except: + memory_efficient_attention = None + +@dataclass +class LuminaParams: + """Parameters for Lumina model configuration""" + patch_size: int = 2 + dim: int = 2592 + n_layers: int = 30 + n_heads: int = 24 + n_kv_heads: int = 8 + axes_dims: List[int] = None + axes_lens: List[int] = None + qk_norm: bool = False, + ffn_dim_multiplier: Optional[float] = None, + norm_eps: float = 1e-5, + scaling_factor: float = 1.0, + cap_feat_dim: int = 32, + + def __post_init__(self): + if self.axes_dims is None: + self.axes_dims = [36, 36, 36] + if self.axes_lens is None: + self.axes_lens = [300, 512, 512] + + @classmethod + def get_2b_config(cls) -> "LuminaParams": + """Returns the configuration for the 2B parameter model""" + return cls( + patch_size=2, + dim=2592, + n_layers=30, + n_heads=24, + n_kv_heads=8, + axes_dims=[36, 36, 36], + axes_lens=[300, 512, 512] + ) + + @classmethod + def get_7b_config(cls) -> "LuminaParams": + """Returns the configuration for the 7B parameter model""" + return cls( + patch_size=2, + dim=4096, + n_layers=32, + n_heads=32, + n_kv_heads=8, + axes_dims=[64, 64, 64], + axes_lens=[300, 512, 512] + ) + + +############################################################################# +# RMSNorm # +############################################################################# + + +class 
RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + """ + Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + """ + Apply the RMSNorm normalization to the input tensor. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The normalized tensor. + + """ + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + """ + Forward pass through the RMSNorm layer. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor after applying RMSNorm. + + """ + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +def modulate(x, scale): + return x * (1 + scale.unsqueeze(1)) + + +############################################################################# +# Embedding Layers for Timesteps and Class Labels # +############################################################################# + + +class TimestepEmbedder(nn.Module): + """ + Embeds scalar timesteps into vector representations. + """ + + def __init__(self, hidden_size, frequency_embedding_size=256): + super().__init__() + self.mlp = nn.Sequential( + nn.Linear( + frequency_embedding_size, + hidden_size, + bias=True, + ), + nn.SiLU(), + nn.Linear( + hidden_size, + hidden_size, + bias=True, + ), + ) + nn.init.normal_(self.mlp[0].weight, std=0.02) + nn.init.zeros_(self.mlp[0].bias) + nn.init.normal_(self.mlp[2].weight, std=0.02) + nn.init.zeros_(self.mlp[2].bias) + + self.frequency_embedding_size = frequency_embedding_size + + @staticmethod + def timestep_embedding(t, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings. + :param t: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an (N, D) Tensor of positional embeddings. + """ + # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) + * torch.arange(start=0, end=half, dtype=torch.float32) + / half + ).to(device=t.device) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat( + [embedding, torch.zeros_like(embedding[:, :1])], dim=-1 + ) + return embedding + + def forward(self, t): + t_freq = self.timestep_embedding(t, self.frequency_embedding_size) + t_emb = self.mlp(t_freq.to(self.mlp[0].weight.dtype)) + return t_emb + + +############################################################################# +# Core NextDiT Model # +############################################################################# + + +class JointAttention(nn.Module): + """Multi-head attention module.""" + + def __init__( + self, + dim: int, + n_heads: int, + n_kv_heads: Optional[int], + qk_norm: bool, + ): + """ + Initialize the Attention module. + + Args: + dim (int): Number of input dimensions. + n_heads (int): Number of heads. + n_kv_heads (Optional[int]): Number of kv heads, if using GQA. 
+ + """ + super().__init__() + self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads + self.n_local_heads = n_heads + self.n_local_kv_heads = self.n_kv_heads + self.n_rep = self.n_local_heads // self.n_local_kv_heads + self.head_dim = dim // n_heads + + self.qkv = nn.Linear( + dim, + (n_heads + self.n_kv_heads + self.n_kv_heads) * self.head_dim, + bias=False, + ) + nn.init.xavier_uniform_(self.qkv.weight) + + self.out = nn.Linear( + n_heads * self.head_dim, + dim, + bias=False, + ) + nn.init.xavier_uniform_(self.out.weight) + + if qk_norm: + self.q_norm = RMSNorm(self.head_dim) + self.k_norm = RMSNorm(self.head_dim) + else: + self.q_norm = self.k_norm = nn.Identity() + + @staticmethod + def apply_rotary_emb( + x_in: torch.Tensor, + freqs_cis: torch.Tensor, + ) -> torch.Tensor: + """ + Apply rotary embeddings to input tensors using the given frequency + tensor. + + This function applies rotary embeddings to the given query 'xq' and + key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The + input tensors are reshaped as complex numbers, and the frequency tensor + is reshaped for broadcasting compatibility. The resulting tensors + contain rotary embeddings and are returned as real tensors. + + Args: + x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings. + freqs_cis (torch.Tensor): Precomputed frequency tensor for complex + exponentials. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor + and key tensor with rotary embeddings. + """ + with torch.amp.autocast("cuda",enabled=False): + x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2)) + freqs_cis = freqs_cis.unsqueeze(2) + x_out = torch.view_as_real(x * freqs_cis).flatten(3) + return x_out.type_as(x_in) + + # copied from huggingface modeling_llama.py + def _upad_input( + self, query_layer, key_layer, value_layer, attention_mask, query_length + ): + def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad( + torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0) + ) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape( + batch_size * kv_seq_len, self.n_local_heads, head_dim + ), + indices_k, + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( + query_layer, attention_mask + ) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + freqs_cis: torch.Tensor, + ) -> torch.Tensor: + """ + + Args: + x: + x_mask: + freqs_cis: + + Returns: + + """ + bsz, seqlen, _ = x.shape + dtype = x.dtype + + xq, xk, xv = torch.split( + self.qkv(x), + [ + self.n_local_heads * self.head_dim, + self.n_local_kv_heads * self.head_dim, + self.n_local_kv_heads * self.head_dim, + ], + dim=-1, + ) + xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + xq = self.q_norm(xq) + xk = self.k_norm(xk) + xq = JointAttention.apply_rotary_emb(xq, freqs_cis=freqs_cis) + xk = JointAttention.apply_rotary_emb(xk, freqs_cis=freqs_cis) + xq, xk = xq.to(dtype), xk.to(dtype) + + softmax_scale = math.sqrt(1 / self.head_dim) + + if dtype in [torch.float16, torch.bfloat16]: + # begin var_len flash attn + ( + query_states, + key_states, + value_states, + indices_q, + cu_seq_lens, + max_seq_lens, + ) = self._upad_input(xq, xk, xv, x_mask, seqlen) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=0.0, + causal=False, + softmax_scale=softmax_scale, + ) + output = pad_input(attn_output_unpad, indices_q, bsz, seqlen) + # end var_len_flash_attn + + else: + n_rep = self.n_local_heads // self.n_local_kv_heads + if n_rep >= 1: + xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) + xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) + output = ( + F.scaled_dot_product_attention( + xq.permute(0, 2, 1, 3), + xk.permute(0, 2, 1, 3), + xv.permute(0, 2, 1, 3), + attn_mask=x_mask.bool() + .view(bsz, 1, 1, seqlen) + .expand(-1, self.n_local_heads, seqlen, -1), + scale=softmax_scale, + ) + .permute(0, 2, 1, 3) + .to(dtype) + ) + + output = output.flatten(-2) + + return self.out(output) + + +class FeedForward(nn.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ffn_dim_multiplier: Optional[float], + ): + """ + Initialize the FeedForward module. + + Args: + dim (int): Input dimension. + hidden_dim (int): Hidden dimension of the feedforward layer. + multiple_of (int): Value to ensure hidden dimension is a multiple + of this value. + ffn_dim_multiplier (float, optional): Custom multiplier for hidden + dimension. Defaults to None. 
+ + """ + super().__init__() + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Linear( + dim, + hidden_dim, + bias=False, + ) + nn.init.xavier_uniform_(self.w1.weight) + self.w2 = nn.Linear( + hidden_dim, + dim, + bias=False, + ) + nn.init.xavier_uniform_(self.w2.weight) + self.w3 = nn.Linear( + dim, + hidden_dim, + bias=False, + ) + nn.init.xavier_uniform_(self.w3.weight) + + # @torch.compile + def _forward_silu_gating(self, x1, x3): + return F.silu(x1) * x3 + + def forward(self, x): + return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x))) + + +class JointTransformerBlock(nn.Module): + def __init__( + self, + layer_id: int, + dim: int, + n_heads: int, + n_kv_heads: int, + multiple_of: int, + ffn_dim_multiplier: float, + norm_eps: float, + qk_norm: bool, + modulation=True, + ) -> None: + """ + Initialize a TransformerBlock. + + Args: + layer_id (int): Identifier for the layer. + dim (int): Embedding dimension of the input features. + n_heads (int): Number of attention heads. + n_kv_heads (Optional[int]): Number of attention heads in key and + value features (if using GQA), or set to None for the same as + query. + multiple_of (int): + ffn_dim_multiplier (float): + norm_eps (float): + + """ + super().__init__() + self.dim = dim + self.head_dim = dim // n_heads + self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm) + self.feed_forward = FeedForward( + dim=dim, + hidden_dim=4 * dim, + multiple_of=multiple_of, + ffn_dim_multiplier=ffn_dim_multiplier, + ) + self.layer_id = layer_id + self.attention_norm1 = RMSNorm(dim, eps=norm_eps) + self.ffn_norm1 = RMSNorm(dim, eps=norm_eps) + + self.attention_norm2 = RMSNorm(dim, eps=norm_eps) + self.ffn_norm2 = RMSNorm(dim, eps=norm_eps) + + self.modulation = modulation + if modulation: + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + nn.Linear( + min(dim, 1024), + 4 * dim, + bias=True, + ), + ) + nn.init.zeros_(self.adaLN_modulation[1].weight) + nn.init.zeros_(self.adaLN_modulation[1].bias) + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + freqs_cis: torch.Tensor, + adaln_input: Optional[torch.Tensor] = None, + ): + """ + Perform a forward pass through the TransformerBlock. + + Args: + x (torch.Tensor): Input tensor. + freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies. + + Returns: + torch.Tensor: Output tensor after applying attention and + feedforward layers. + + """ + if self.modulation: + assert adaln_input is not None + scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation( + adaln_input + ).chunk(4, dim=1) + + x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2( + self.attention( + modulate(self.attention_norm1(x), scale_msa), + x_mask, + freqs_cis, + ) + ) + x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2( + self.feed_forward( + modulate(self.ffn_norm1(x), scale_mlp), + ) + ) + else: + assert adaln_input is None + x = x + self.attention_norm2( + self.attention( + self.attention_norm1(x), + x_mask, + freqs_cis, + ) + ) + x = x + self.ffn_norm2( + self.feed_forward( + self.ffn_norm1(x), + ) + ) + return x + + +class FinalLayer(nn.Module): + """ + The final layer of NextDiT. 
+ """ + + def __init__(self, hidden_size, patch_size, out_channels): + super().__init__() + self.norm_final = nn.LayerNorm( + hidden_size, + elementwise_affine=False, + eps=1e-6, + ) + self.linear = nn.Linear( + hidden_size, + patch_size * patch_size * out_channels, + bias=True, + ) + nn.init.zeros_(self.linear.weight) + nn.init.zeros_(self.linear.bias) + + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + nn.Linear( + min(hidden_size, 1024), + hidden_size, + bias=True, + ), + ) + nn.init.zeros_(self.adaLN_modulation[1].weight) + nn.init.zeros_(self.adaLN_modulation[1].bias) + + def forward(self, x, c): + scale = self.adaLN_modulation(c) + x = modulate(self.norm_final(x), scale) + x = self.linear(x) + return x + + +class RopeEmbedder: + def __init__( + self, + theta: float = 10000.0, + axes_dims: List[int] = (16, 56, 56), + axes_lens: List[int] = (1, 512, 512), + ): + super().__init__() + self.theta = theta + self.axes_dims = axes_dims + self.axes_lens = axes_lens + self.freqs_cis = NextDiT.precompute_freqs_cis( + self.axes_dims, self.axes_lens, theta=self.theta + ) + + def __call__(self, ids: torch.Tensor): + self.freqs_cis = [freqs_cis.to(ids.device) for freqs_cis in self.freqs_cis] + result = [] + for i in range(len(self.axes_dims)): + # import torch.distributed as dist + # if not dist.is_initialized() or dist.get_rank() == 0: + # import pdb + # pdb.set_trace() + index = ( + ids[:, :, i : i + 1] + .repeat(1, 1, self.freqs_cis[i].shape[-1]) + .to(torch.int64) + ) + result.append( + torch.gather( + self.freqs_cis[i].unsqueeze(0).repeat(index.shape[0], 1, 1), + dim=1, + index=index, + ) + ) + return torch.cat(result, dim=-1) + + +class NextDiT(nn.Module): + """ + Diffusion model with a Transformer backbone. + """ + + def __init__( + self, + patch_size: int = 2, + in_channels: int = 4, + dim: int = 4096, + n_layers: int = 32, + n_refiner_layers: int = 2, + n_heads: int = 32, + n_kv_heads: Optional[int] = None, + multiple_of: int = 256, + ffn_dim_multiplier: Optional[float] = None, + norm_eps: float = 1e-5, + qk_norm: bool = False, + cap_feat_dim: int = 5120, + axes_dims: List[int] = (16, 56, 56), + axes_lens: List[int] = (1, 512, 512), + ) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = in_channels + self.patch_size = patch_size + + self.x_embedder = nn.Linear( + in_features=patch_size * patch_size * in_channels, + out_features=dim, + bias=True, + ) + nn.init.xavier_uniform_(self.x_embedder.weight) + nn.init.constant_(self.x_embedder.bias, 0.0) + + self.noise_refiner = nn.ModuleList( + [ + JointTransformerBlock( + layer_id, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + modulation=True, + ) + for layer_id in range(n_refiner_layers) + ] + ) + self.context_refiner = nn.ModuleList( + [ + JointTransformerBlock( + layer_id, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + modulation=False, + ) + for layer_id in range(n_refiner_layers) + ] + ) + + self.t_embedder = TimestepEmbedder(min(dim, 1024)) + self.cap_embedder = nn.Sequential( + RMSNorm(cap_feat_dim, eps=norm_eps), + nn.Linear( + cap_feat_dim, + dim, + bias=True, + ), + ) + nn.init.trunc_normal_(self.cap_embedder[1].weight, std=0.02) + # nn.init.zeros_(self.cap_embedder[1].weight) + nn.init.zeros_(self.cap_embedder[1].bias) + + self.layers = nn.ModuleList( + [ + JointTransformerBlock( + layer_id, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + ) + for layer_id in 
range(n_layers) + ] + ) + self.norm_final = RMSNorm(dim, eps=norm_eps) + self.final_layer = FinalLayer(dim, patch_size, self.out_channels) + + assert (dim // n_heads) == sum(axes_dims) + self.axes_dims = axes_dims + self.axes_lens = axes_lens + self.rope_embedder = RopeEmbedder(axes_dims=axes_dims, axes_lens=axes_lens) + self.dim = dim + self.n_heads = n_heads + + def unpatchify( + self, + x: torch.Tensor, + img_size: List[Tuple[int, int]], + cap_size: List[int], + return_tensor=False, + ) -> List[torch.Tensor]: + """ + x: (N, T, patch_size**2 * C) + imgs: (N, H, W, C) + """ + pH = pW = self.patch_size + imgs = [] + for i in range(x.size(0)): + H, W = img_size[i] + begin = cap_size[i] + end = begin + (H // pH) * (W // pW) + imgs.append( + x[i][begin:end] + .view(H // pH, W // pW, pH, pW, self.out_channels) + .permute(4, 0, 2, 1, 3) + .flatten(3, 4) + .flatten(1, 2) + ) + + if return_tensor: + imgs = torch.stack(imgs, dim=0) + return imgs + + def patchify_and_embed( + self, + x: List[torch.Tensor] | torch.Tensor, + cap_feats: torch.Tensor, + cap_mask: torch.Tensor, + t: torch.Tensor, + ) -> Tuple[ + torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor + ]: + bsz = len(x) + pH = pW = self.patch_size + device = x[0].device + + l_effective_cap_len = cap_mask.sum(dim=1).tolist() + img_sizes = [(img.size(1), img.size(2)) for img in x] + l_effective_img_len = [(H // pH) * (W // pW) for (H, W) in img_sizes] + + max_seq_len = max( + ( + cap_len + img_len + for cap_len, img_len in zip(l_effective_cap_len, l_effective_img_len) + ) + ) + max_cap_len = max(l_effective_cap_len) + max_img_len = max(l_effective_img_len) + + position_ids = torch.zeros( + bsz, max_seq_len, 3, dtype=torch.int32, device=device + ) + + for i in range(bsz): + cap_len = l_effective_cap_len[i] + img_len = l_effective_img_len[i] + H, W = img_sizes[i] + H_tokens, W_tokens = H // pH, W // pW + assert H_tokens * W_tokens == img_len + + position_ids[i, :cap_len, 0] = torch.arange( + cap_len, dtype=torch.int32, device=device + ) + position_ids[i, cap_len : cap_len + img_len, 0] = cap_len + row_ids = ( + torch.arange(H_tokens, dtype=torch.int32, device=device) + .view(-1, 1) + .repeat(1, W_tokens) + .flatten() + ) + col_ids = ( + torch.arange(W_tokens, dtype=torch.int32, device=device) + .view(1, -1) + .repeat(H_tokens, 1) + .flatten() + ) + position_ids[i, cap_len : cap_len + img_len, 1] = row_ids + position_ids[i, cap_len : cap_len + img_len, 2] = col_ids + + freqs_cis = self.rope_embedder(position_ids) + + # build freqs_cis for cap and image individually + cap_freqs_cis_shape = list(freqs_cis.shape) + # cap_freqs_cis_shape[1] = max_cap_len + cap_freqs_cis_shape[1] = cap_feats.shape[1] + cap_freqs_cis = torch.zeros( + *cap_freqs_cis_shape, device=device, dtype=freqs_cis.dtype + ) + + img_freqs_cis_shape = list(freqs_cis.shape) + img_freqs_cis_shape[1] = max_img_len + img_freqs_cis = torch.zeros( + *img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype + ) + + for i in range(bsz): + cap_len = l_effective_cap_len[i] + img_len = l_effective_img_len[i] + cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len] + img_freqs_cis[i, :img_len] = freqs_cis[i, cap_len : cap_len + img_len] + + # refine context + for layer in self.context_refiner: + cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis) + + # refine image + flat_x = [] + for i in range(bsz): + img = x[i] + C, H, W = img.size() + img = ( + img.view(C, H // pH, pH, W // pW, pW) + .permute(1, 3, 2, 4, 0) + .flatten(2) + .flatten(0, 1) + ) + flat_x.append(img) + x 
= flat_x + padded_img_embed = torch.zeros( + bsz, max_img_len, x[0].shape[-1], device=device, dtype=x[0].dtype + ) + padded_img_mask = torch.zeros(bsz, max_img_len, dtype=torch.bool, device=device) + for i in range(bsz): + padded_img_embed[i, : l_effective_img_len[i]] = x[i] + padded_img_mask[i, : l_effective_img_len[i]] = True + + padded_img_embed = self.x_embedder(padded_img_embed) + for layer in self.noise_refiner: + padded_img_embed = layer( + padded_img_embed, padded_img_mask, img_freqs_cis, t + ) + + mask = torch.zeros(bsz, max_seq_len, dtype=torch.bool, device=device) + padded_full_embed = torch.zeros( + bsz, max_seq_len, self.dim, device=device, dtype=x[0].dtype + ) + for i in range(bsz): + cap_len = l_effective_cap_len[i] + img_len = l_effective_img_len[i] + + mask[i, : cap_len + img_len] = True + padded_full_embed[i, :cap_len] = cap_feats[i, :cap_len] + padded_full_embed[i, cap_len : cap_len + img_len] = padded_img_embed[ + i, :img_len + ] + + return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis + + def forward(self, x, t, cap_feats, cap_mask): + """ + Forward pass of NextDiT. + t: (N,) tensor of diffusion timesteps + y: (N,) tensor of text tokens/features + """ + + # import torch.distributed as dist + # if not dist.is_initialized() or dist.get_rank() == 0: + # import pdb + # pdb.set_trace() + # torch.save([x, t, cap_feats, cap_mask], "./fake_input.pt") + t = self.t_embedder(t) # (N, D) + adaln_input = t + + cap_feats = self.cap_embedder( + cap_feats + ) # (N, L, D) # todo check if able to batchify w.o. redundant compute + + x_is_tensor = isinstance(x, torch.Tensor) + x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed( + x, cap_feats, cap_mask, t + ) + freqs_cis = freqs_cis.to(x.device) + + for layer in self.layers: + x = layer(x, mask, freqs_cis, adaln_input) + + x = self.final_layer(x, adaln_input) + x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor) + + return x + + def forward_with_cfg( + self, x, t, cap_feats, cap_mask, cfg_scale, cfg_trunc=100, renorm_cfg=1 + ): + """ + Forward pass of NextDiT, but also batches the unconditional forward pass + for classifier-free guidance. + """ + # # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb + half = x[: len(x) // 2] + if t[0] < cfg_trunc: + combined = torch.cat([half, half], dim=0) # [2, 16, 128, 128] + model_out = self.forward( + combined, t, cap_feats, cap_mask + ) # [2, 16, 128, 128] + # For exact reproducibility reasons, we apply classifier-free guidance on only + # three channels by default. The standard approach to cfg applies it to all channels. + # This can be done by uncommenting the following line and commenting-out the line following that. 
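+            # classifier-free guidance: eps = eps_uncond + cfg_scale * (eps_cond - eps_uncond);
+            # when renorm_cfg > 0, the guided eps is rescaled so its norm does not exceed renorm_cfg * ||eps_cond||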
+ eps, rest = ( + model_out[:, : self.in_channels], + model_out[:, self.in_channels :], + ) + cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0) + half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps) + if float(renorm_cfg) > 0.0: + ori_pos_norm = torch.linalg.vector_norm( + cond_eps, dim=tuple(range(1, len(cond_eps.shape))), keepdim=True + ) + max_new_norm = ori_pos_norm * float(renorm_cfg) + new_pos_norm = torch.linalg.vector_norm( + half_eps, dim=tuple(range(1, len(half_eps.shape))), keepdim=True + ) + if new_pos_norm >= max_new_norm: + half_eps = half_eps * (max_new_norm / new_pos_norm) + else: + combined = half + model_out = self.forward( + combined, + t[: len(x) // 2], + cap_feats[: len(x) // 2], + cap_mask[: len(x) // 2], + ) + eps, rest = ( + model_out[:, : self.in_channels], + model_out[:, self.in_channels :], + ) + half_eps = eps + + output = torch.cat([half_eps, half_eps], dim=0) + return output + + @staticmethod + def precompute_freqs_cis( + dim: List[int], + end: List[int], + theta: float = 10000.0, + ): + """ + Precompute the frequency tensor for complex exponentials (cis) with + given dimensions. + + This function calculates a frequency tensor with complex exponentials + using the given dimension 'dim' and the end index 'end'. The 'theta' + parameter scales the frequencies. The returned tensor contains complex + values in complex64 data type. + + Args: + dim (list): Dimension of the frequency tensor. + end (list): End index for precomputing frequencies. + theta (float, optional): Scaling factor for frequency computation. + Defaults to 10000.0. + + Returns: + torch.Tensor: Precomputed frequency tensor with complex + exponentials. + """ + freqs_cis = [] + for i, (d, e) in enumerate(zip(dim, end)): + freqs = 1.0 / ( + theta ** (torch.arange(0, d, 2, dtype=torch.float64, device="cpu") / d) + ) + timestep = torch.arange(e, device=freqs.device, dtype=torch.float64) + freqs = torch.outer(timestep, freqs).float() + freqs_cis_i = torch.polar(torch.ones_like(freqs), freqs).to( + torch.complex64 + ) # complex64 + freqs_cis.append(freqs_cis_i) + + return freqs_cis + + def parameter_count(self) -> int: + total_params = 0 + + def _recursive_count_params(module): + nonlocal total_params + for param in module.parameters(recurse=False): + total_params += param.numel() + for submodule in module.children(): + _recursive_count_params(submodule) + + _recursive_count_params(self) + return total_params + + def get_fsdp_wrap_module_list(self) -> List[nn.Module]: + return list(self.layers) + + def get_checkpointing_wrap_module_list(self) -> List[nn.Module]: + return list(self.layers) + + +############################################################################# +# NextDiT Configs # +############################################################################# + + +def NextDiT_2B_GQA_patch2_Adaln_Refiner(params: Optional[LuminaParams] = None, **kwargs): + if params is None: + params = LuminaParams.get_2b_config() + + return NextDiT( + patch_size=params.patch_size, + dim=params.dim, + n_layers=params.n_layers, + n_heads=params.n_heads, + n_kv_heads=params.n_kv_heads, + axes_dims=params.axes_dims, + axes_lens=params.axes_lens, + qk_norm=params.qk_norm, + ffn_dim_multiplier=params.ffn_dim_multiplier, + norm_eps=params.norm_eps, + scaling_factor=params.scaling_factor, + cap_feat_dim=params.cap_feat_dim, + **kwargs, + ) + + +def NextDiT_3B_GQA_patch2_Adaln_Refiner(**kwargs): + return NextDiT( + patch_size=2, + dim=2592, + n_layers=30, + n_heads=24, + n_kv_heads=8, + axes_dims=[36, 36, 
36], + axes_lens=[300, 512, 512], + **kwargs, + ) + + +def NextDiT_4B_GQA_patch2_Adaln_Refiner(**kwargs): + return NextDiT( + patch_size=2, + dim=2880, + n_layers=32, + n_heads=24, + n_kv_heads=8, + axes_dims=[40, 40, 40], + axes_lens=[300, 512, 512], + **kwargs, + ) + + +def NextDiT_7B_GQA_patch2_Adaln_Refiner(**kwargs): + return NextDiT( + patch_size=2, + dim=3840, + n_layers=32, + n_heads=32, + n_kv_heads=8, + axes_dims=[40, 40, 40], + axes_lens=[300, 512, 512], + **kwargs, + ) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py new file mode 100644 index 000000000..d3edd262c --- /dev/null +++ b/library/lumina_train_util.py @@ -0,0 +1,554 @@ +import argparse +import math +import os +import numpy as np +import toml +import json +import time +from typing import Callable, Dict, List, Optional, Tuple, Union + +import torch +from accelerate import Accelerator, PartialState +from transformers import AutoTokenizer, AutoModelForCausalLM +from tqdm import tqdm +from PIL import Image +from safetensors.torch import save_file + +from library import lumina_models, lumina_util, strategy_base, train_util +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() + +from .utils import setup_logging, mem_eff_save_file + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +# region sample images + + +def sample_images( + accelerator: Accelerator, + args: argparse.Namespace, + epoch, + steps, + nextdit, + ae, + gemma2_model, + sample_prompts_gemma2_outputs, + prompt_replacement=None, + controlnet=None +): + if steps == 0: + if not args.sample_at_first: + return + else: + if args.sample_every_n_steps is None and args.sample_every_n_epochs is None: + return + if args.sample_every_n_epochs is not None: + # sample_every_n_steps は無視する + if epoch is None or epoch % args.sample_every_n_epochs != 0: + return + else: + if steps % args.sample_every_n_steps != 0 or epoch is not None: # steps is not divisible or end of epoch + return + + logger.info("") + logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}") + if not os.path.isfile(args.sample_prompts) and sample_prompts_gemma2_outputs is None: + logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}") + return + + distributed_state = PartialState() # for multi gpu distributed inference. this is a singleton, so it's safe to use it here + + # unwrap nextdit and gemma2_model + nextdit = accelerator.unwrap_model(nextdit) + if gemma2_model is not None: + gemma2_model = accelerator.unwrap_model(gemma2_model) + # if controlnet is not None: + # controlnet = accelerator.unwrap_model(controlnet) + # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) + + prompts = train_util.load_prompts(args.sample_prompts) + + save_dir = args.output_dir + "/sample" + os.makedirs(save_dir, exist_ok=True) + + # save random state to restore later + rng_state = torch.get_rng_state() + cuda_rng_state = None + try: + cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None + except Exception: + pass + + if distributed_state.num_processes <= 1: + # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts. 
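+        # Sampling runs under no_grad/autocast; the RNG state captured above is restored
+        # after all prompts are processed, so per-prompt manual seeds do not disturb the
+        # training run's random state.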
+ with torch.no_grad(), accelerator.autocast(): + for prompt_dict in prompts: + sample_image_inference( + accelerator, + args, + nextdit, + gemma2_model, + ae, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_gemma2_outputs, + prompt_replacement, + controlnet + ) + else: + # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available) + # prompt_dicts are assigned to lists based on order of processes, to attempt to time the image creation time to match enum order. Probably only works when steps and sampler are identical. + per_process_prompts = [] # list of lists + for i in range(distributed_state.num_processes): + per_process_prompts.append(prompts[i :: distributed_state.num_processes]) + + with torch.no_grad(): + with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists: + for prompt_dict in prompt_dict_lists[0]: + sample_image_inference( + accelerator, + args, + nextdit, + gemma2_model, + ae, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_gemma2_outputs, + prompt_replacement, + controlnet + ) + + torch.set_rng_state(rng_state) + if cuda_rng_state is not None: + torch.cuda.set_rng_state(cuda_rng_state) + + clean_memory_on_device(accelerator.device) + + +def sample_image_inference( + accelerator: Accelerator, + args: argparse.Namespace, + nextdit, + gemma2_model, + ae, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_gemma2_outputs, + prompt_replacement, + # controlnet +): + assert isinstance(prompt_dict, dict) + # negative_prompt = prompt_dict.get("negative_prompt") + sample_steps = prompt_dict.get("sample_steps", 20) + width = prompt_dict.get("width", 512) + height = prompt_dict.get("height", 512) + scale = prompt_dict.get("scale", 3.5) + seed = prompt_dict.get("seed") + controlnet_image = prompt_dict.get("controlnet_image") + prompt: str = prompt_dict.get("prompt", "") + # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) + + if prompt_replacement is not None: + prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) + # if negative_prompt is not None: + # negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) + + if seed is not None: + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + else: + # True random sample image generation + torch.seed() + torch.cuda.seed() + + # if negative_prompt is None: + # negative_prompt = "" + height = max(64, height - height % 16) # round to divisible by 16 + width = max(64, width - width % 16) # round to divisible by 16 + logger.info(f"prompt: {prompt}") + # logger.info(f"negative_prompt: {negative_prompt}") + logger.info(f"height: {height}") + logger.info(f"width: {width}") + logger.info(f"sample_steps: {sample_steps}") + logger.info(f"scale: {scale}") + # logger.info(f"sample_sampler: {sampler_name}") + if seed is not None: + logger.info(f"seed: {seed}") + + # encode prompts + tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() + encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() + + gemma2_conds = [] + if sample_prompts_gemma2_outputs and prompt in sample_prompts_gemma2_outputs: + gemma2_conds = sample_prompts_gemma2_outputs[prompt] + print(f"Using cached Gemma2 outputs for prompt: {prompt}") + if gemma2_model is not None: + print(f"Encoding prompt with Gemma2: {prompt}") + tokens_and_masks = tokenize_strategy.tokenize(prompt) + # strategy has apply_gemma2_attn_mask option + 
encoded_gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) + + # if gemma2_conds is not cached, use encoded_gemma2_conds + if len(gemma2_conds) == 0: + gemma2_conds = encoded_gemma2_conds + else: + # if encoded_gemma2_conds is not None, update cached gemma2_conds + for i in range(len(encoded_gemma2_conds)): + if encoded_gemma2_conds[i] is not None: + gemma2_conds[i] = encoded_gemma2_conds[i] + + # Unpack Gemma2 outputs + gemma2_hidden_states, gemma2_attn_mask, input_ids = gemma2_conds + + # sample image + weight_dtype = ae.dtype # TOFO give dtype as argument + packed_latent_height = height // 16 + packed_latent_width = width // 16 + noise = torch.randn( + 1, + packed_latent_height * packed_latent_width, + 16 * 2 * 2, + device=accelerator.device, + dtype=weight_dtype, + generator=torch.Generator(device=accelerator.device).manual_seed(seed) if seed is not None else None, + ) + timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) + img_ids = lumina_util.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype) + gemma2_attn_mask = gemma2_attn_mask.to(accelerator.device) if args.apply_gemma2_attn_mask else None + + # if controlnet_image is not None: + # controlnet_image = Image.open(controlnet_image).convert("RGB") + # controlnet_image = controlnet_image.resize((width, height), Image.LANCZOS) + # controlnet_image = torch.from_numpy((np.array(controlnet_image) / 127.5) - 1) + # controlnet_image = controlnet_image.permute(2, 0, 1).unsqueeze(0).to(weight_dtype).to(accelerator.device) + + with accelerator.autocast(), torch.no_grad(): + x = denoise(nextdit, noise, img_ids, gemma2_hidden_states, input_ids, None, timesteps=timesteps, guidance=scale, gemma2_attn_mask=gemma2_attn_mask, controlnet=controlnet, controlnet_img=controlnet_image) + + x = lumina_util.unpack_latents(x, packed_latent_height, packed_latent_width) + + # latent to image + clean_memory_on_device(accelerator.device) + org_vae_device = ae.device # will be on cpu + ae.to(accelerator.device) # distributed_state.device is same as accelerator.device + with accelerator.autocast(), torch.no_grad(): + x = ae.decode(x) + ae.to(org_vae_device) + clean_memory_on_device(accelerator.device) + + x = x.clamp(-1, 1) + x = x.permute(0, 2, 3, 1) + image = Image.fromarray((127.5 * (x + 1.0)).float().cpu().numpy().astype(np.uint8)[0]) + + # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are saved in the same order as the original prompt list + # but adding 'enum' to the filename should be enough + + ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime()) + num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}" + seed_suffix = "" if seed is None else f"_{seed}" + i: int = prompt_dict["enum"] + img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png" + image.save(os.path.join(save_dir, img_filename)) + + # send images to wandb if enabled + if "wandb" in [tracker.name for tracker in accelerator.trackers]: + wandb_tracker = accelerator.get_tracker("wandb") + + import wandb + + # not to commit images to avoid inconsistency between training and logging steps + wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption + + +def time_shift(mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + +def get_lin_function(x1: float 
= 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]: + m = (y2 - y1) / (x2 - x1) + b = y1 - m * x1 + return lambda x: m * x + b + + +def get_schedule( + num_steps: int, + image_seq_len: int, + base_shift: float = 0.5, + max_shift: float = 1.15, + shift: bool = True, +) -> list[float]: + # extra step for zero + timesteps = torch.linspace(1, 0, num_steps + 1) + + # shifting the schedule to favor high timesteps for higher signal images + if shift: + # eastimate mu based on linear estimation between two points + mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len) + timesteps = time_shift(mu, 1.0, timesteps) + + return timesteps.tolist() + +# endregion + + +# region train +def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32): + sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype) + schedule_timesteps = noise_scheduler.timesteps.to(device) + timesteps = timesteps.to(device) + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < n_dim: + sigma = sigma.unsqueeze(-1) + return sigma + + +def compute_density_for_timestep_sampling( + weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None +): + """Compute the density for sampling the timesteps when doing SD3 training. + + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. + """ + if weighting_scheme == "logit_normal": + # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$). + u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu") + u = torch.nn.functional.sigmoid(u) + elif weighting_scheme == "mode": + u = torch.rand(size=(batch_size,), device="cpu") + u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u) + else: + u = torch.rand(size=(batch_size,), device="cpu") + return u + + +def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): + """Computes loss weighting scheme for SD3 training. + + Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. + + SD3 paper reference: https://arxiv.org/abs/2403.03206v1. 
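+
+    Schemes handled below: "sigma_sqrt" weights the loss by sigmas**-2, "cosmap" by
+    2 / (pi * (1 - 2 * sigmas + 2 * sigmas**2)), and any other value falls back to
+    uniform (all-ones) weighting.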
+ """ + if weighting_scheme == "sigma_sqrt": + weighting = (sigmas**-2.0).float() + elif weighting_scheme == "cosmap": + bot = 1 - 2 * sigmas + 2 * sigmas**2 + weighting = 2 / (math.pi * bot) + else: + weighting = torch.ones_like(sigmas) + return weighting + + +def get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, device, dtype +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + bsz, _, h, w = latents.shape + sigmas = None + + if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": + # Simple random t-based noise sampling + if args.timestep_sampling == "sigmoid": + # https://github.com/XLabs-AI/x-flux/tree/main + t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=device)) + else: + t = torch.rand((bsz,), device=device) + + timesteps = t * 1000.0 + t = t.view(-1, 1, 1, 1) + noisy_model_input = (1 - t) * latents + t * noise + elif args.timestep_sampling == "shift": + shift = args.discrete_flow_shift + logits_norm = torch.randn(bsz, device=device) + logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling + timesteps = logits_norm.sigmoid() + timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps) + + t = timesteps.view(-1, 1, 1, 1) + timesteps = timesteps * 1000.0 + noisy_model_input = (1 - t) * latents + t * noise + elif args.timestep_sampling == "nextdit_shift": + logits_norm = torch.randn(bsz, device=device) + logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling + timesteps = logits_norm.sigmoid() + mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2)) + timesteps = time_shift(mu, 1.0, timesteps) + + t = timesteps.view(-1, 1, 1, 1) + timesteps = timesteps * 1000.0 + noisy_model_input = (1 - t) * latents + t * noise + else: + # Sample a random timestep for each image + # for weighting schemes where we sample timesteps non-uniformly + u = compute_density_for_timestep_sampling( + weighting_scheme=args.weighting_scheme, + batch_size=bsz, + logit_mean=args.logit_mean, + logit_std=args.logit_std, + mode_scale=args.mode_scale, + ) + indices = (u * noise_scheduler.config.num_train_timesteps).long() + timesteps = noise_scheduler.timesteps[indices].to(device=device) + + # Add noise according to flow matching. 
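+        # Same interpolation as the branches above: with sigma in [0, 1],
+        # noisy = sigma * noise + (1 - sigma) * latents, i.e. sigma = 1 is pure noise
+        # and sigma = 0 recovers the clean latents.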
+ sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) + noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + + return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas + + +def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas): + weighting = None + if args.model_prediction_type == "raw": + pass + elif args.model_prediction_type == "additive": + # add the model_pred to the noisy_model_input + model_pred = model_pred + noisy_model_input + elif args.model_prediction_type == "sigma_scaled": + # apply sigma scaling + model_pred = model_pred * (-sigmas) + noisy_model_input + + # these weighting schemes use a uniform timestep sampling + # and instead post-weight the loss + weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) + + return model_pred, weighting + + +def save_models( + ckpt_path: str, + lumina: lumina_models.NextDiT, + sai_metadata: Optional[dict], + save_dtype: Optional[torch.dtype] = None, + use_mem_eff_save: bool = False, +): + state_dict = {} + + def update_sd(prefix, sd): + for k, v in sd.items(): + key = prefix + k + if save_dtype is not None and v.dtype != save_dtype: + v = v.detach().clone().to("cpu").to(save_dtype) + state_dict[key] = v + + update_sd("", lumina.state_dict()) + + if not use_mem_eff_save: + save_file(state_dict, ckpt_path, metadata=sai_metadata) + else: + mem_eff_save_file(state_dict, ckpt_path, metadata=sai_metadata) + + +def save_lumina_model_on_train_end( + args: argparse.Namespace, save_dtype: torch.dtype, epoch: int, global_step: int, lumina: lumina_models.NextDiT +): + def sd_saver(ckpt_file, epoch_no, global_step): + sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, lumina="lumina2") + save_models(ckpt_file, lumina, sai_metadata, save_dtype, args.mem_eff_save) + + train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None) + + +# epochとstepの保存、メタデータにepoch/stepが含まれ引数が同じになるため、統合してている +# on_epoch_end: Trueならepoch終了時、Falseならstep経過時 +def save_lumina_model_on_epoch_end_or_stepwise( + args: argparse.Namespace, + on_epoch_end: bool, + accelerator, + save_dtype: torch.dtype, + epoch: int, + num_train_epochs: int, + global_step: int, + lumina: lumina_models.NextDiT, +): + def sd_saver(ckpt_file, epoch_no, global_step): + sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, lumina="lumina2") + save_models(ckpt_file, lumina, sai_metadata, save_dtype, args.mem_eff_save) + + train_util.save_sd_model_on_epoch_end_or_stepwise_common( + args, + on_epoch_end, + accelerator, + True, + True, + epoch, + num_train_epochs, + global_step, + sd_saver, + None, + ) + + +# endregion + + +def add_lumina_train_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--gemma2", + type=str, + help="path to gemma2 model (*.sft or *.safetensors), should be float16 / gemma2のパス(*.sftまたは*.safetensors)、float16が前提", + ) + parser.add_argument("--ae", type=str, help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)") + parser.add_argument( + "--gemma2_max_token_length", + type=int, + default=None, + help="maximum token length for Gemma2. 
if omitted, 256 for schnell and 512 for dev" + " / Gemma2の最大トークン長。省略された場合、schnellの場合は256、devの場合は512", + ) + parser.add_argument( + "--apply_gemma2_attn_mask", + action="store_true", + help="apply attention mask to Gemma2 encode and NextDIT double blocks / Gemma2エンコードとNextDITダブルブロックにアテンションマスクを適用する", + ) + + parser.add_argument( + "--guidance_scale", + type=float, + default=3.5, + help="the NextDIT.1 dev variant is a guidance distilled model", + ) + + parser.add_argument( + "--timestep_sampling", + choices=["sigma", "uniform", "sigmoid", "shift", "nextdit_shift"], + default="sigma", + help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal, shift of sigmoid and NextDIT.1 shifting." + " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト、NextDIT.1のシフト。", + ) + parser.add_argument( + "--sigmoid_scale", + type=float, + default=1.0, + help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). / sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"の場合のみ有効)。', + ) + parser.add_argument( + "--model_prediction_type", + choices=["raw", "additive", "sigma_scaled"], + default="sigma_scaled", + help="How to interpret and process the model prediction: " + "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)." + " / モデル予測の解釈と処理方法:" + "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。", + ) + parser.add_argument( + "--discrete_flow_shift", + type=float, + default=3.0, + help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。", + ) diff --git a/library/lumina_util.py b/library/lumina_util.py new file mode 100644 index 000000000..990f8c684 --- /dev/null +++ b/library/lumina_util.py @@ -0,0 +1,194 @@ +import json +import os +from dataclasses import replace +from typing import List, Optional, Tuple, Union + +import einops +import torch +from accelerate import init_empty_weights +from safetensors import safe_open +from safetensors.torch import load_file +from transformers import Gemma2Config, Gemma2Model + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +from library import lumina_models, flux_models +from library.utils import load_safetensors + +MODEL_VERSION_LUMINA_V2 = "lumina2" + +def load_lumina_model( + ckpt_path: str, + dtype: torch.dtype, + device: Union[str, torch.device], + disable_mmap: bool = False, +) -> lumina_models.Lumina: + logger.info("Building Lumina") + with torch.device("meta"): + model = lumina_models.NextDiT_2B_GQA_patch2_Adaln_Refiner().to(dtype) + + logger.info(f"Loading state dict from {ckpt_path}") + state_dict = load_safetensors( + ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype + ) + info = model.load_state_dict(state_dict, strict=False, assign=True) + logger.info(f"Loaded Lumina: {info}") + return model + +def load_ae( + ckpt_path: str, + dtype: torch.dtype, + device: Union[str, torch.device], + disable_mmap: bool = False, +) -> flux_models.AutoEncoder: + logger.info("Building AutoEncoder") + with torch.device("meta"): + # dev and schnell have the same AE params + ae = flux_models.AutoEncoder(flux_models.configs["schnell"].ae_params).to(dtype) + + logger.info(f"Loading state dict from {ckpt_path}") + sd = load_safetensors( + ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype + ) + info = ae.load_state_dict(sd, strict=False, assign=True) + logger.info(f"Loaded AE: {info}") + 
return ae
+
+
+def load_gemma2(
+    ckpt_path: Optional[str],
+    dtype: torch.dtype,
+    device: Union[str, torch.device],
+    disable_mmap: bool = False,
+    state_dict: Optional[dict] = None,
+) -> Gemma2Model:
+    logger.info("Building Gemma2")
+    GEMMA2_CONFIG = {
+        "_name_or_path": "google/gemma-2b",
+        "attention_bias": False,
+        "attention_dropout": 0.0,
+        "bos_token_id": 2,
+        "eos_token_id": 1,
+        "head_dim": 256,
+        "hidden_act": "gelu",
+        "hidden_size": 2048,
+        "initializer_range": 0.02,
+        "intermediate_size": 16384,
+        "max_position_embeddings": 8192,
+        "model_type": "gemma",
+        "num_attention_heads": 8,
+        "num_hidden_layers": 18,
+        "num_key_value_heads": 1,
+        "pad_token_id": 0,
+        "rms_norm_eps": 1e-06,
+        "rope_scaling": None,
+        "rope_theta": 10000.0,
+        "torch_dtype": "bfloat16",
+        "transformers_version": "4.38.0.dev0",
+        "use_cache": True,
+        "vocab_size": 256000
+    }
+    config = Gemma2Config(**GEMMA2_CONFIG)
+    with init_empty_weights():
+        gemma2 = Gemma2Model._from_config(config)
+
+    if state_dict is not None:
+        sd = state_dict
+    else:
+        logger.info(f"Loading state dict from {ckpt_path}")
+        sd = load_safetensors(
+            ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype
+        )
+    info = gemma2.load_state_dict(sd, strict=False, assign=True)
+    logger.info(f"Loaded Gemma2: {info}")
+    return gemma2
+
+def prepare_img_ids(batch_size: int, packed_latent_height: int, packed_latent_width: int):
+    img_ids = torch.zeros(packed_latent_height, packed_latent_width, 3)
+    img_ids[..., 1] = img_ids[..., 1] + torch.arange(packed_latent_height)[:, None]
+    img_ids[..., 2] = img_ids[..., 2] + torch.arange(packed_latent_width)[None, :]
+    img_ids = einops.repeat(img_ids, "h w c -> b (h w) c", b=batch_size)
+    return img_ids
+
+
+def unpack_latents(x: torch.Tensor, packed_latent_height: int, packed_latent_width: int) -> torch.Tensor:
+    """
+    x: [b (h w) (c ph pw)] -> [b c (h ph) (w pw)], ph=2, pw=2
+    """
+    x = einops.rearrange(x, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=packed_latent_height, w=packed_latent_width, ph=2, pw=2)
+    return x
+
+
+def pack_latents(x: torch.Tensor) -> torch.Tensor:
+    """
+    x: [b c (h ph) (w pw)] -> [b (h w) (c ph pw)], ph=2, pw=2
+    """
+    x = einops.rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
+    return x
+
+DIFFUSERS_TO_ALPHA_VLLM_MAP = {
+    # Embedding layers
+    "cap_embedder.0.weight": ["time_caption_embed.caption_embedder.0.weight"],
+    "cap_embedder.1.weight": "time_caption_embed.caption_embedder.1.weight",
+    "cap_embedder.1.bias": "text_embedder.1.bias",
+    "x_embedder.weight": "patch_embedder.proj.weight",
+    "x_embedder.bias": "patch_embedder.proj.bias",
+    # Attention modulation
+    "layers.().adaLN_modulation.1.weight": "transformer_blocks.().adaln_modulation.1.weight",
+    "layers.().adaLN_modulation.1.bias": "transformer_blocks.().adaln_modulation.1.bias",
+    # Final layers
+    "final_layer.adaLN_modulation.1.weight": "final_adaln_modulation.1.weight",
+    "final_layer.adaLN_modulation.1.bias": "final_adaln_modulation.1.bias",
+    "final_layer.linear.weight": "final_linear.weight",
+    "final_layer.linear.bias": "final_linear.bias",
+    # Noise refiner
+    "noise_refiner.().adaLN_modulation.1.weight": "single_transformer_blocks.().adaln_modulation.1.weight",
+    "noise_refiner.().adaLN_modulation.1.bias": "single_transformer_blocks.().adaln_modulation.1.bias",
+    "noise_refiner.().attention.qkv.weight": "single_transformer_blocks.().attn.to_qkv.weight",
+    "noise_refiner.().attention.out.weight": "single_transformer_blocks.().attn.to_out.0.weight",
+    # Time embedding
+    
"t_embedder.mlp.0.weight": "time_embedder.0.weight", + "t_embedder.mlp.0.bias": "time_embedder.0.bias", + "t_embedder.mlp.2.weight": "time_embedder.2.weight", + "t_embedder.mlp.2.bias": "time_embedder.2.bias", + # Context attention + "context_refiner.().attention.qkv.weight": "transformer_blocks.().attn2.to_qkv.weight", + "context_refiner.().attention.out.weight": "transformer_blocks.().attn2.to_out.0.weight", + # Normalization + "layers.().attention_norm1.weight": "transformer_blocks.().norm1.weight", + "layers.().attention_norm2.weight": "transformer_blocks.().norm2.weight", + # FFN + "layers.().feed_forward.w1.weight": "transformer_blocks.().ff.net.0.proj.weight", + "layers.().feed_forward.w2.weight": "transformer_blocks.().ff.net.2.weight", + "layers.().feed_forward.w3.weight": "transformer_blocks.().ff.net.4.weight", +} + + +def convert_diffusers_sd_to_alpha_vllm(sd: dict, num_double_blocks: int) -> dict: + """Convert Diffusers checkpoint to Alpha-VLLM format""" + logger.info("Converting Diffusers checkpoint to Alpha-VLLM format") + new_sd = {} + + for key, value in sd.items(): + new_key = key + for pattern, replacement in DIFFUSERS_TO_ALPHA_VLLM_MAP.items(): + if "()." in pattern: + for block_idx in range(num_double_blocks): + if str(block_idx) in key: + converted = pattern.replace("()", str(block_idx)) + new_key = key.replace( + converted, replacement.replace("()", str(block_idx)) + ) + break + + if new_key == key: + logger.debug(f"Unmatched key in conversion: {key}") + new_sd[new_key] = value + + logger.info(f"Converted {len(new_sd)} keys to Alpha-VLLM format") + return new_sd diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index 8896c047e..1e97c9cd2 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -61,6 +61,8 @@ # ARCH_SD3_UNKNOWN = "stable-diffusion-3" ARCH_FLUX_1_DEV = "flux-1-dev" ARCH_FLUX_1_UNKNOWN = "flux-1" +ARCH_LUMINA_2 = "lumina-2" +ARCH_LUMINA_UNKNOWN = "lumina" ADAPTER_LORA = "lora" ADAPTER_TEXTUAL_INVERSION = "textual-inversion" @@ -69,6 +71,7 @@ IMPL_COMFY_UI = "https://github.com/comfyanonymous/ComfyUI" IMPL_DIFFUSERS = "diffusers" IMPL_FLUX = "https://github.com/black-forest-labs/flux" +IMPL_LUMINA = "https://github.com/Alpha-VLLM/Lumina-Image-2.0" PRED_TYPE_EPSILON = "epsilon" PRED_TYPE_V = "v" @@ -123,6 +126,7 @@ def build_metadata( clip_skip: Optional[int] = None, sd3: Optional[str] = None, flux: Optional[str] = None, + lumina: Optional[str] = None, ): """ sd3: only supports "m", flux: only supports "dev" @@ -146,6 +150,11 @@ def build_metadata( arch = ARCH_FLUX_1_DEV else: arch = ARCH_FLUX_1_UNKNOWN + elif lumina is not None: + if lumina == "lumina2": + arch = ARCH_LUMINA_2 + else: + arch = ARCH_LUMINA_UNKNOWN elif v2: if v_parameterization: arch = ARCH_SD_V2_768_V @@ -167,6 +176,9 @@ def build_metadata( if flux is not None: # Flux impl = IMPL_FLUX + elif lumina is not None: + # Lumina + impl = IMPL_LUMINA elif (lora and sdxl) or textual_inversion or is_stable_diffusion_ckpt: # Stable Diffusion ckpt, TI, SDXL LoRA impl = IMPL_STABILITY_AI diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py new file mode 100644 index 000000000..622c019a4 --- /dev/null +++ b/library/strategy_lumina.py @@ -0,0 +1,275 @@ +import glob +import os +from typing import Any, List, Optional, Tuple, Union + +import torch +from transformers import AutoTokenizer, AutoModel +from library import train_util +from library.strategy_base import ( + LatentsCachingStrategy, + TokenizeStrategy, + TextEncodingStrategy, +) +from library.utils 
import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +GEMMA_ID = "google/gemma-2-2b" + + +class LuminaTokenizeStrategy(TokenizeStrategy): + def __init__( + self, max_length: Optional[int], tokenizer_cache_dir: Optional[str] = None + ) -> None: + self.tokenizer = AutoTokenizer.from_pretrained( + GEMMA_ID, cache_dir=tokenizer_cache_dir + ) + self.tokenizer.padding_side = "right" + + if max_length is None: + self.max_length = self.tokenizer.model_max_length + else: + self.max_length = max_length + + def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: + text = [text] if isinstance(text, str) else text + encodings = self.tokenizer( + text, + padding="max_length", + max_length=self.max_length, + return_tensors="pt", + truncation=True, + ) + return [encodings.input_ids] + + def tokenize_with_weights( + self, text: str | List[str] + ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + # Gemma doesn't support weighted prompts, return uniform weights + tokens = self.tokenize(text) + weights = [torch.ones_like(t) for t in tokens] + return tokens, weights + + +class LuminaTextEncodingStrategy(TextEncodingStrategy): + def __init__(self, apply_gemma2_attn_mask: Optional[bool] = None) -> None: + super().__init__() + self.apply_gemma2_attn_mask = apply_gemma2_attn_mask + + def encode_tokens( + self, + tokenize_strategy: TokenizeStrategy, + models: List[Any], + tokens: List[torch.Tensor], + apply_gemma2_attn_mask: Optional[bool] = None, + ) -> List[torch.Tensor]: + + if apply_gemma2_attn_mask is None: + apply_gemma2_attn_mask = self.apply_gemma2_attn_mask + + text_encoder = models[0] + input_ids = tokens[0].to(text_encoder.device) + + attention_mask = None + position_ids = None + if apply_gemma2_attn_mask: + # Create attention mask (1 for non-padding, 0 for padding) + attention_mask = (input_ids != tokenize_strategy.tokenizer.pad_token_id).to( + text_encoder.device + ) + + # Create position IDs + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + + with torch.no_grad(): + outputs = text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_hidden_states=True, + return_dict=True, + ) + # Get the last hidden state + hidden_states = outputs.last_hidden_state + + return [hidden_states] + + def encode_tokens_with_weights( + self, + tokenize_strategy: TokenizeStrategy, + models: List[Any], + tokens_list: List[torch.Tensor], + weights_list: List[torch.Tensor], + ) -> List[torch.Tensor]: + # For simplicity, use uniform weighting + return self.encode_tokens(tokenize_strategy, models, tokens_list) + + +class LuminaTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): + LUMINA_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX = "_lumina_te.npz" + + def __init__( + self, + cache_to_disk: bool, + batch_size: int, + skip_disk_cache_validity_check: bool, + is_partial: bool = False, + apply_gemma2_attn_mask: bool = False, + ) -> None: + super().__init__( + cache_to_disk, + batch_size, + skip_disk_cache_validity_check, + is_partial, + ) + self.apply_gemma2_attn_mask = apply_gemma2_attn_mask + + def get_outputs_npz_path(self, image_abs_path: str) -> str: + return ( + os.path.splitext(image_abs_path)[0] + + LuminaTextEncoderOutputsCachingStrategy.LUMINA_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX + ) + + def is_disk_cached_outputs_expected(self, npz_path: str): + if not self.cache_to_disk: + return False + if not os.path.exists(npz_path): + return False + if 
self.skip_disk_cache_validity_check: + return True + + try: + npz = np.load(npz_path) + if "hidden_state" not in npz: + return False + except Exception as e: + logger.error(f"Error loading file: {npz_path}") + raise e + + return True + + def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: + data = np.load(npz_path) + hidden_state = data["hidden_state"] + return [hidden_state] + + def cache_batch_outputs( + self, + tokenize_strategy: TokenizeStrategy, + models: List[Any], + text_encoding_strategy: TextEncodingStrategy, + infos: List, + ): + lumina_text_encoding_strategy: LuminaTextEncodingStrategy = ( + text_encoding_strategy + ) + captions = [info.caption for info in infos] + + if self.is_weighted: + tokens_list, weights_list = tokenize_strategy.tokenize_with_weights( + captions + ) + with torch.no_grad(): + hidden_state = lumina_text_encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, models, tokens_list, weights_list + )[0] + else: + tokens = tokenize_strategy.tokenize(captions) + with torch.no_grad(): + hidden_state = lumina_text_encoding_strategy.encode_tokens( + tokenize_strategy, models, tokens + )[0] + + if hidden_state.dtype == torch.bfloat16: + hidden_state = hidden_state.float() + + hidden_state = hidden_state.cpu().numpy() + + for i, info in enumerate(infos): + hidden_state_i = hidden_state[i] + + if self.cache_to_disk: + np.savez( + info.text_encoder_outputs_npz, + hidden_state=hidden_state_i, + ) + else: + info.text_encoder_outputs = [hidden_state_i] + + +class LuminaLatentsCachingStrategy(LatentsCachingStrategy): + LUMINA_LATENTS_NPZ_SUFFIX = "_lumina.npz" + + def __init__( + self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool + ) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check) + + @property + def cache_suffix(self) -> str: + return LuminaLatentsCachingStrategy.LUMINA_LATENTS_NPZ_SUFFIX + + def get_latents_npz_path( + self, absolute_path: str, image_size: Tuple[int, int] + ) -> str: + return ( + os.path.splitext(absolute_path)[0] + + f"_{image_size[0]:04d}x{image_size[1]:04d}" + + LuminaLatentsCachingStrategy.LUMINA_LATENTS_NPZ_SUFFIX + ) + + def is_disk_cached_latents_expected( + self, + bucket_reso: Tuple[int, int], + npz_path: str, + flip_aug: bool, + alpha_mask: bool, + ): + return self._default_is_disk_cached_latents_expected( + 8, bucket_reso, npz_path, flip_aug, alpha_mask, multi_resolution=True + ) + + def load_latents_from_disk( + self, npz_path: str, bucket_reso: Tuple[int, int] + ) -> Tuple[ + Optional[np.ndarray], + Optional[List[int]], + Optional[List[int]], + Optional[np.ndarray], + Optional[np.ndarray], + ]: + return self._default_load_latents_from_disk( + 8, npz_path, bucket_reso + ) # support multi-resolution + + # TODO remove circular dependency for ImageInfo + def cache_batch_latents( + self, + vae, + image_infos: List, + flip_aug: bool, + alpha_mask: bool, + random_crop: bool, + ): + encode_by_vae = lambda img_tensor: vae.encode(img_tensor).to("cpu") + vae_device = vae.device + vae_dtype = vae.dtype + + self._default_cache_batch_latents( + encode_by_vae, + vae_device, + vae_dtype, + image_infos, + flip_aug, + alpha_mask, + random_crop, + multi_resolution=True, + ) + + if not train_util.HIGH_VRAM: + train_util.clean_memory_on_device(vae.device) diff --git a/library/train_util.py b/library/train_util.py index 37ed0a994..34ffe22b1 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3463,6 +3463,7 @@ def get_sai_model_spec( is_stable_diffusion_ckpt: 
Optional[bool] = None, # None for TI and LoRA sd3: str = None, flux: str = None, + lumina: str = None, ): timestamp = time.time() @@ -3498,6 +3499,7 @@ def get_sai_model_spec( clip_skip=args.clip_skip, # None or int sd3=sd3, flux=flux, + lumina=lumina, ) return metadata diff --git a/lumina_train_network.py b/lumina_train_network.py new file mode 100644 index 000000000..40b84e149 --- /dev/null +++ b/lumina_train_network.py @@ -0,0 +1,192 @@ +import argparse +import copy +import math +import random +from typing import Any, Optional, Union + +import torch +from accelerate import Accelerator + +from library.device_utils import clean_memory_on_device, init_ipex + +init_ipex() + +import train_network +from library import ( + lumina_models, + flux_train_utils, + lumina_util, + lumina_train_util, + sd3_train_utils, + strategy_base, + strategy_lumina, + train_util, +) +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +class LuminaNetworkTrainer(train_network.NetworkTrainer): + def __init__(self): + super().__init__() + self.sample_prompts_te_outputs = None + self.is_swapping_blocks: bool = False + + def assert_extra_args(self, args, train_dataset_group, val_dataset_group): + super().assert_extra_args(args, train_dataset_group, val_dataset_group) + + if ( + args.cache_text_encoder_outputs_to_disk + and not args.cache_text_encoder_outputs + ): + logger.warning("Enabling cache_text_encoder_outputs due to disk caching") + args.cache_text_encoder_outputs = True + + train_dataset_group.verify_bucket_reso_steps(32) + if val_dataset_group is not None: + val_dataset_group.verify_bucket_reso_steps(32) + + self.train_gemma2 = not args.network_train_unet_only + + def load_target_model(self, args, weight_dtype, accelerator): + loading_dtype = None if args.fp8 else weight_dtype + + model = lumina_util.load_lumina_model( + args.pretrained_model_name_or_path, + loading_dtype, + "cpu", + disable_mmap=args.disable_mmap_load_safetensors, + ) + + # if args.blocks_to_swap: + # logger.info(f'Enabling block swap: {args.blocks_to_swap}') + # model.enable_block_swap(args.blocks_to_swap, accelerator.device) + # self.is_swapping_blocks = True + + gemma2 = lumina_util.load_gemma2(args.gemma2, weight_dtype, "cpu") + ae = lumina_util.load_ae(args.ae, weight_dtype, "cpu") + + return lumina_util.MODEL_VERSION_LUMINA_V2, [gemma2], ae, model + + def get_tokenize_strategy(self, args): + return strategy_lumina.LuminaTokenizeStrategy( + args.gemma2_max_token_length, args.tokenizer_cache_dir + ) + + def get_tokenizers(self, tokenize_strategy: strategy_lumina.LuminaTokenizeStrategy): + return [tokenize_strategy.tokenizer] + + def get_latents_caching_strategy(self, args): + return strategy_lumina.LuminaLatentsCachingStrategy( + args.cache_latents_to_disk, args.vae_batch_size, False + ) + + def get_text_encoding_strategy(self, args): + return strategy_lumina.LuminaTextEncodingStrategy(args.apply_gemma2_attn_mask) + + def get_text_encoders_train_flags(self, args, text_encoders): + return [self.train_gemma2] + + def get_text_encoder_outputs_caching_strategy(self, args): + if args.cache_text_encoder_outputs: + # if the text encoders is trained, we need tokenization, so is_partial is True + return strategy_lumina.LuminaTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, + args.text_encoder_batch_size, + args.skip_cache_check, + is_partial=self.train_gemma2, + apply_gemma2_attn_mask=args.apply_gemma2_attn_mask, + ) + else: + return None + + def 
cache_text_encoder_outputs_if_needed( + self, + args, + accelerator: Accelerator, + unet, + vae, + text_encoders, + dataset, + weight_dtype, + ): + for text_encoder in text_encoders: + text_encoder_outputs_caching_strategy = ( + self.get_text_encoder_outputs_caching_strategy(args) + ) + if text_encoder_outputs_caching_strategy is not None: + text_encoder_outputs_caching_strategy.cache_batch_outputs( + self.get_tokenize_strategy(args), + [text_encoder], + self.get_text_encoding_strategy(args), + dataset, + ) + + def sample_images( + self, + accelerator, + args, + epoch, + global_step, + device, + ae, + tokenizer, + text_encoder, + lumina, + ): + lumina_train_util.sample_images( + accelerator, + args, + epoch, + global_step, + lumina, + ae, + self.get_models_for_text_encoding(args, accelerator, text_encoder), + self.sample_prompts_te_outputs, + ) + + # Remaining methods maintain similar structure to flux implementation + # with Lumina-specific model calls and strategies + + def get_noise_scheduler( + self, args: argparse.Namespace, device: torch.device + ) -> Any: + noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler( + num_train_timesteps=1000, shift=args.discrete_flow_shift + ) + self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) + return noise_scheduler + + def encode_images_to_latents(self, args, accelerator, vae, images): + return vae.encode(images) + + # not sure, they use same flux vae + def shift_scale_latents(self, args, latents): + return latents + + def post_process_loss(self, loss, args, timesteps, noise_scheduler): + return loss + + def get_sai_model_spec(self, args): + return train_util.get_sai_model_spec(None, args, False, True, False, flux="dev") + + +def setup_parser() -> argparse.ArgumentParser: + parser = train_network.setup_parser() + train_util.add_dit_training_arguments(parser) + lumina_train_utils.add_lumina_train_arguments(parser) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + trainer = LuminaNetworkTrainer() + trainer.train(args) From ab88b431b0c903f7a60ae59e22fbb8a7cf9d78a1 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 14 Feb 2025 11:14:38 -0500 Subject: [PATCH 337/582] Fix validation epoch divergence --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index c3879531d..b5f92e06b 100644 --- a/train_network.py +++ b/train_network.py @@ -1498,7 +1498,7 @@ def remove_model(old_ckpt_name): if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average - loss_validation_divergence = val_step_loss_recorder.moving_average - avr_loss + loss_validation_divergence = val_epoch_loss_recorder.moving_average - avr_loss logs = { "loss/validation/epoch_average": avr_loss, "loss/validation/epoch_divergence": loss_validation_divergence, From c0caf33e3fa7a99c2160946e42d4ef7b8d7660a4 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Sat, 15 Feb 2025 16:38:59 +0800 Subject: [PATCH 338/582] update --- library/lumina_util.py | 8 -- lumina_train_network.py | 175 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 171 insertions(+), 12 deletions(-) diff --git a/library/lumina_util.py b/library/lumina_util.py index 990f8c684..b47e057a9 100644 --- a/library/lumina_util.py +++ b/library/lumina_util.py @@ -108,14 +108,6 @@ def load_gemma2( logger.info(f"Loaded Gemma2: {info}") return gemma2 -def 
prepare_img_ids(batch_size: int, packed_latent_height: int, packed_latent_width: int): - img_ids = torch.zeros(packed_latent_height, packed_latent_width, 3) - img_ids[..., 1] = img_ids[..., 1] + torch.arange(packed_latent_height)[:, None] - img_ids[..., 2] = img_ids[..., 2] + torch.arange(packed_latent_width)[None, :] - img_ids = einops.repeat(img_ids, "h w c -> b (h w) c", b=batch_size) - return img_ids - - def unpack_latents(x: torch.Tensor, packed_latent_height: int, packed_latent_width: int) -> torch.Tensor: """ x: [b (h w) (c ph pw)] -> [b c (h ph) (w pw)], ph=2, pw=2 diff --git a/lumina_train_network.py b/lumina_train_network.py index 40b84e149..db329a9b1 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -53,7 +53,7 @@ def assert_extra_args(self, args, train_dataset_group, val_dataset_group): self.train_gemma2 = not args.network_train_unet_only def load_target_model(self, args, weight_dtype, accelerator): - loading_dtype = None if args.fp8 else weight_dtype + loading_dtype = None if args.fp8_base else weight_dtype model = lumina_util.load_lumina_model( args.pretrained_model_name_or_path, @@ -67,8 +67,12 @@ def load_target_model(self, args, weight_dtype, accelerator): # model.enable_block_swap(args.blocks_to_swap, accelerator.device) # self.is_swapping_blocks = True - gemma2 = lumina_util.load_gemma2(args.gemma2, weight_dtype, "cpu") - ae = lumina_util.load_ae(args.ae, weight_dtype, "cpu") + gemma2 = lumina_util.load_gemma2( + args.gemma2, weight_dtype, "cpu" + ) + ae = lumina_util.load_ae( + args.ae, weight_dtype, "cpu" + ) return lumina_util.MODEL_VERSION_LUMINA_V2, [gemma2], ae, model @@ -168,11 +172,174 @@ def encode_images_to_latents(self, args, accelerator, vae, images): def shift_scale_latents(self, args, latents): return latents + def get_noise_pred_and_target( + self, + args, + accelerator, + noise_scheduler, + latents, + batch, + text_encoder_conds, + unet: lumina_models.NextDiT, + network, + weight_dtype, + train_unet, + is_train=True, + ): + noise = torch.randn_like(latents) + bsz = latents.shape[0] + + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = ( + flux_train_utils.get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, accelerator.device, weight_dtype + ) + ) + + # pack latents and get img_ids - 这部分可以保留因为NextDiT也需要packed格式的输入 + packed_noisy_model_input = lumina_util.pack_latents(noisy_model_input) + packed_latent_height, packed_latent_width = ( + noisy_model_input.shape[2] // 2, + noisy_model_input.shape[3] // 2, + ) + + # ensure the hidden state will require grad + if args.gradient_checkpointing: + noisy_model_input.requires_grad_(True) + for t in text_encoder_conds: + if t is not None and t.dtype.is_floating_point: + t.requires_grad_(True) + + # Unpack Gemma2 outputs + gemma2_hidden_states, gemma2_attn_mask, input_ids = text_encoder_conds + if not args.apply_gemma2_attn_mask: + gemma2_attn_mask = None + + def call_dit(img, gemma2_hidden_states, input_ids, timesteps, gemma2_attn_mask): + with torch.set_grad_enabled(is_train), accelerator.autocast(): + # NextDiT forward expects (x, t, cap_feats, cap_mask) + model_pred = unet( + x=img, # packed latents + t=timesteps / 1000, # timesteps需要除以1000来匹配模型预期 + cap_feats=gemma2_hidden_states, # Gemma2的hidden states作为caption features + cap_mask=gemma2_attn_mask, # Gemma2的attention mask + ) + return model_pred + + model_pred = call_dit( + img=packed_noisy_model_input, + gemma2_hidden_states=gemma2_hidden_states, + input_ids=input_ids, + 
timesteps=timesteps, + gemma2_attn_mask=gemma2_attn_mask, + ) + + # unpack latents + model_pred = lumina_util.unpack_latents( + model_pred, packed_latent_height, packed_latent_width + ) + + # apply model prediction type + model_pred, weighting = flux_train_utils.apply_model_prediction_type( + args, model_pred, noisy_model_input, sigmas + ) + + # flow matching loss: this is different from SD3 + target = noise - latents + + # differential output preservation + if "custom_attributes" in batch: + diff_output_pr_indices = [] + for i, custom_attributes in enumerate(batch["custom_attributes"]): + if ( + "diff_output_preservation" in custom_attributes + and custom_attributes["diff_output_preservation"] + ): + diff_output_pr_indices.append(i) + + if len(diff_output_pr_indices) > 0: + network.set_multiplier(0.0) + with torch.no_grad(): + model_pred_prior = call_dit( + img=packed_noisy_model_input[diff_output_pr_indices], + gemma2_hidden_states=gemma2_hidden_states[ + diff_output_pr_indices + ], + input_ids=input_ids[diff_output_pr_indices], + timesteps=timesteps[diff_output_pr_indices], + gemma2_attn_mask=( + gemma2_attn_mask[diff_output_pr_indices] + if gemma2_attn_mask is not None + else None + ), + ) + network.set_multiplier(1.0) + + model_pred_prior = lumina_util.unpack_latents( + model_pred_prior, packed_latent_height, packed_latent_width + ) + model_pred_prior, _ = flux_train_utils.apply_model_prediction_type( + args, + model_pred_prior, + noisy_model_input[diff_output_pr_indices], + sigmas[diff_output_pr_indices] if sigmas is not None else None, + ) + target[diff_output_pr_indices] = model_pred_prior.to(target.dtype) + + return model_pred, target, timesteps, weighting + def post_process_loss(self, loss, args, timesteps, noise_scheduler): return loss def get_sai_model_spec(self, args): - return train_util.get_sai_model_spec(None, args, False, True, False, flux="dev") + return train_util.get_sai_model_spec( + None, args, False, True, False, lumina="lumina2" + ) + + def update_metadata(self, metadata, args): + metadata["ss_apply_gemma2_attn_mask"] = args.apply_gemma2_attn_mask + metadata["ss_weighting_scheme"] = args.weighting_scheme + metadata["ss_logit_mean"] = args.logit_mean + metadata["ss_logit_std"] = args.logit_std + metadata["ss_mode_scale"] = args.mode_scale + metadata["ss_guidance_scale"] = args.guidance_scale + metadata["ss_timestep_sampling"] = args.timestep_sampling + metadata["ss_sigmoid_scale"] = args.sigmoid_scale + metadata["ss_model_prediction_type"] = args.model_prediction_type + metadata["ss_discrete_flow_shift"] = args.discrete_flow_shift + + def is_text_encoder_not_needed_for_training(self, args): + return args.cache_text_encoder_outputs and not self.is_train_text_encoder(args) + + def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): + text_encoder.model.embed_tokens.requires_grad_(True) + + def prepare_text_encoder_fp8( + self, index, text_encoder, te_weight_dtype, weight_dtype + ): + logger.info( + f"prepare Gemma2 for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}" + ) + text_encoder.to(te_weight_dtype) # fp8 + text_encoder.model.embed_tokens.to(dtype=weight_dtype) + + def prepare_unet_with_accelerator( + self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module + ) -> torch.nn.Module: + if not self.is_swapping_blocks: + return super().prepare_unet_with_accelerator(args, accelerator, unet) + + # if we doesn't swap blocks, we can move the model to device + nextdit: lumina_models.Nextdit = unet + nextdit = 
accelerator.prepare( + nextdit, device_placement=[not self.is_swapping_blocks] + ) + accelerator.unwrap_model(nextdit).move_to_device_except_swap_blocks( + accelerator.device + ) # reduce peak memory usage + accelerator.unwrap_model(nextdit).prepare_block_swap_before_forward() + + return nextdit def setup_parser() -> argparse.ArgumentParser: From 7323ee1b9dbfd723ee767b7faeee8833421b832d Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Sat, 15 Feb 2025 17:10:34 +0800 Subject: [PATCH 339/582] update lora_lumina --- networks/lora_lumina.py | 1011 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 1011 insertions(+) create mode 100644 networks/lora_lumina.py diff --git a/networks/lora_lumina.py b/networks/lora_lumina.py new file mode 100644 index 000000000..d554ce13d --- /dev/null +++ b/networks/lora_lumina.py @@ -0,0 +1,1011 @@ +# temporary minimum implementation of LoRA +# FLUX doesn't have Conv2d, so we ignore it +# TODO commonize with the original implementation + +# LoRA network module +# reference: +# https://github.com/microsoft/LoRA/blob/main/loralib/layers.py +# https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py + +import math +import os +from typing import Dict, List, Optional, Tuple, Type, Union +from diffusers import AutoencoderKL +from transformers import CLIPTextModel +import numpy as np +import torch +import re +from library.utils import setup_logging +from library.sdxl_original_unet import SdxlUNet2DConditionModel + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +NUM_DOUBLE_BLOCKS = 19 +NUM_SINGLE_BLOCKS = 38 + + +class LoRAModule(torch.nn.Module): + """ + replaces forward method of the original Linear, instead of replacing the original Linear module. + """ + + def __init__( + self, + lora_name, + org_module: torch.nn.Module, + multiplier=1.0, + lora_dim=4, + alpha=1, + dropout=None, + rank_dropout=None, + module_dropout=None, + split_dims: Optional[List[int]] = None, + ): + """ + if alpha == 0 or None, alpha is rank (no scaling). 
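+        The effective scale applied to the LoRA update is alpha / lora_dim; the wrapped
+        module then returns org_forward(x) + lora_up(lora_down(x)) * multiplier * scale.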
+ + split_dims is used to mimic the split qkv of lumina as same as Diffusers + """ + super().__init__() + self.lora_name = lora_name + + if org_module.__class__.__name__ == "Conv2d": + in_dim = org_module.in_channels + out_dim = org_module.out_channels + else: + in_dim = org_module.in_features + out_dim = org_module.out_features + + self.lora_dim = lora_dim + self.split_dims = split_dims + + if split_dims is None: + if org_module.__class__.__name__ == "Conv2d": + kernel_size = org_module.kernel_size + stride = org_module.stride + padding = org_module.padding + self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) + self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) + else: + self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) + self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) + + torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) + torch.nn.init.zeros_(self.lora_up.weight) + else: + # conv2d not supported + assert sum(split_dims) == out_dim, "sum of split_dims must be equal to out_dim" + assert org_module.__class__.__name__ == "Linear", "split_dims is only supported for Linear" + # print(f"split_dims: {split_dims}") + self.lora_down = torch.nn.ModuleList( + [torch.nn.Linear(in_dim, self.lora_dim, bias=False) for _ in range(len(split_dims))] + ) + self.lora_up = torch.nn.ModuleList([torch.nn.Linear(self.lora_dim, split_dim, bias=False) for split_dim in split_dims]) + for lora_down in self.lora_down: + torch.nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5)) + for lora_up in self.lora_up: + torch.nn.init.zeros_(lora_up.weight) + + if type(alpha) == torch.Tensor: + alpha = alpha.detach().float().numpy() # without casting, bf16 causes error + alpha = self.lora_dim if alpha is None or alpha == 0 else alpha + self.scale = alpha / self.lora_dim + self.register_buffer("alpha", torch.tensor(alpha)) # 定数として扱える + + # same as microsoft's + self.multiplier = multiplier + self.org_module = org_module # remove in applying + self.dropout = dropout + self.rank_dropout = rank_dropout + self.module_dropout = module_dropout + + def apply_to(self): + self.org_forward = self.org_module.forward + self.org_module.forward = self.forward + del self.org_module + + def forward(self, x): + org_forwarded = self.org_forward(x) + + # module dropout + if self.module_dropout is not None and self.training: + if torch.rand(1) < self.module_dropout: + return org_forwarded + + if self.split_dims is None: + lx = self.lora_down(x) + + # normal dropout + if self.dropout is not None and self.training: + lx = torch.nn.functional.dropout(lx, p=self.dropout) + + # rank dropout + if self.rank_dropout is not None and self.training: + mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout + if len(lx.size()) == 3: + mask = mask.unsqueeze(1) # for Text Encoder + elif len(lx.size()) == 4: + mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d + lx = lx * mask + + # scaling for rank dropout: treat as if the rank is changed + # maskから計算することも考えられるが、augmentation的な効果を期待してrank_dropoutを用いる + scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability + else: + scale = self.scale + + lx = self.lora_up(lx) + + return org_forwarded + lx * self.multiplier * scale + else: + lxs = [lora_down(x) for lora_down in self.lora_down] + + # normal dropout + if self.dropout is not None and self.training: + lxs = [torch.nn.functional.dropout(lx, p=self.dropout) for lx in 
lxs] + + # rank dropout + if self.rank_dropout is not None and self.training: + masks = [torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout for lx in lxs] + for i in range(len(lxs)): + if len(lx.size()) == 3: + masks[i] = masks[i].unsqueeze(1) + elif len(lx.size()) == 4: + masks[i] = masks[i].unsqueeze(-1).unsqueeze(-1) + lxs[i] = lxs[i] * masks[i] + + # scaling for rank dropout: treat as if the rank is changed + scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability + else: + scale = self.scale + + lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] + + return org_forwarded + torch.cat(lxs, dim=-1) * self.multiplier * scale + + +class LoRAInfModule(LoRAModule): + def __init__( + self, + lora_name, + org_module: torch.nn.Module, + multiplier=1.0, + lora_dim=4, + alpha=1, + **kwargs, + ): + # no dropout for inference + super().__init__(lora_name, org_module, multiplier, lora_dim, alpha) + + self.org_module_ref = [org_module] # 後から参照できるように + self.enabled = True + self.network: LoRANetwork = None + + def set_network(self, network): + self.network = network + + # freezeしてマージする + def merge_to(self, sd, dtype, device): + # extract weight from org_module + org_sd = self.org_module.state_dict() + weight = org_sd["weight"] + org_dtype = weight.dtype + org_device = weight.device + weight = weight.to(torch.float) # calc in float + + if dtype is None: + dtype = org_dtype + if device is None: + device = org_device + + if self.split_dims is None: + # get up/down weight + down_weight = sd["lora_down.weight"].to(torch.float).to(device) + up_weight = sd["lora_up.weight"].to(torch.float).to(device) + + # merge weight + if len(weight.size()) == 2: + # linear + weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + weight + + self.multiplier + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * self.scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + # logger.info(conved.size(), weight.size(), module.stride, module.padding) + weight = weight + self.multiplier * conved * self.scale + + # set weight to org_module + org_sd["weight"] = weight.to(dtype) + self.org_module.load_state_dict(org_sd) + else: + # split_dims + total_dims = sum(self.split_dims) + for i in range(len(self.split_dims)): + # get up/down weight + down_weight = sd[f"lora_down.{i}.weight"].to(torch.float).to(device) # (rank, in_dim) + up_weight = sd[f"lora_up.{i}.weight"].to(torch.float).to(device) # (split dim, rank) + + # pad up_weight -> (total_dims, rank) + padded_up_weight = torch.zeros((total_dims, up_weight.size(0)), device=device, dtype=torch.float) + padded_up_weight[sum(self.split_dims[:i]) : sum(self.split_dims[: i + 1])] = up_weight + + # merge weight + weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale + + # set weight to org_module + org_sd["weight"] = weight.to(dtype) + self.org_module.load_state_dict(org_sd) + + # 復元できるマージのため、このモジュールのweightを返す + def get_weight(self, multiplier=None): + if multiplier is None: + multiplier = self.multiplier + + # get up/down weight from module + up_weight = self.lora_up.weight.to(torch.float) + down_weight = self.lora_down.weight.to(torch.float) + + # pre-calculated weight + if len(down_weight.size()) == 2: + # linear + weight = self.multiplier * (up_weight @ down_weight) * self.scale + 
elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + self.multiplier + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * self.scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + weight = self.multiplier * conved * self.scale + + return weight + + def set_region(self, region): + self.region = region + self.region_mask = None + + def default_forward(self, x): + # logger.info(f"default_forward {self.lora_name} {x.size()}") + if self.split_dims is None: + lx = self.lora_down(x) + lx = self.lora_up(lx) + return self.org_forward(x) + lx * self.multiplier * self.scale + else: + lxs = [lora_down(x) for lora_down in self.lora_down] + lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] + return self.org_forward(x) + torch.cat(lxs, dim=-1) * self.multiplier * self.scale + + def forward(self, x): + if not self.enabled: + return self.org_forward(x) + return self.default_forward(x) + + +def create_network( + multiplier: float, + network_dim: Optional[int], + network_alpha: Optional[float], + ae: AutoencoderKL, + text_encoders: List[CLIPTextModel], + lumina, + neuron_dropout: Optional[float] = None, + **kwargs, +): + if network_dim is None: + network_dim = 4 # default + if network_alpha is None: + network_alpha = 1.0 + + # extract dim/alpha for conv2d, and block dim + conv_dim = kwargs.get("conv_dim", None) + conv_alpha = kwargs.get("conv_alpha", None) + if conv_dim is not None: + conv_dim = int(conv_dim) + if conv_alpha is None: + conv_alpha = 1.0 + else: + conv_alpha = float(conv_alpha) + + # attn dim, mlp dim for JointTransformerBlock + attn_dim = kwargs.get("attn_dim", None) # attention dimension + mlp_dim = kwargs.get("mlp_dim", None) # MLP dimension + mod_dim = kwargs.get("mod_dim", None) # modulation dimension + refiner_dim = kwargs.get("refiner_dim", None) # refiner blocks dimension + + if attn_dim is not None: + attn_dim = int(attn_dim) + if mlp_dim is not None: + mlp_dim = int(mlp_dim) + if mod_dim is not None: + mod_dim = int(mod_dim) + if refiner_dim is not None: + refiner_dim = int(refiner_dim) + + type_dims = [attn_dim, mlp_dim, mod_dim, refiner_dim] + if all([d is None for d in type_dims]): + type_dims = None + + # in_dims for embedders + in_dims = kwargs.get("in_dims", None) + if in_dims is not None: + in_dims = in_dims.strip() + if in_dims.startswith("[") and in_dims.endswith("]"): + in_dims = in_dims[1:-1] + in_dims = [int(d) for d in in_dims.split(",")] + assert len(in_dims) == 4, f"invalid in_dims: {in_dims}, must be 4 dimensions (x_embedder, t_embedder, cap_embedder, final_layer)" + + # rank/module dropout + rank_dropout = kwargs.get("rank_dropout", None) + if rank_dropout is not None: + rank_dropout = float(rank_dropout) + module_dropout = kwargs.get("module_dropout", None) + if module_dropout is not None: + module_dropout = float(module_dropout) + + # single or double blocks + train_blocks = kwargs.get("train_blocks", None) # None (default), "all" (same as None), "single", "double" + if train_blocks is not None: + assert train_blocks in ["all", "single", "double"], f"invalid train_blocks: {train_blocks}" + + # split qkv + split_qkv = kwargs.get("split_qkv", False) + if split_qkv is not None: + split_qkv = True if split_qkv == "True" else False + + # verbose + verbose = kwargs.get("verbose", False) + if verbose is not None: + verbose = True if verbose == "True" else False + + # すごく引数が多いな ( ^ω^)・・・ + network = LoRANetwork( 
+ text_encoders, + lumina, + multiplier=multiplier, + lora_dim=network_dim, + alpha=network_alpha, + dropout=neuron_dropout, + rank_dropout=rank_dropout, + module_dropout=module_dropout, + conv_lora_dim=conv_dim, + conv_alpha=conv_alpha, + train_blocks=train_blocks, + split_qkv=split_qkv, + type_dims=type_dims, + in_dims=in_dims, + verbose=verbose, + ) + + loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) + loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None) + loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None) + loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None + loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None + loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None + if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None: + network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) + + return network + + +# Create network from weights for inference, weights are not loaded here (because can be merged) +def create_network_from_weights(multiplier, file, ae, text_encoders, lumina, weights_sd=None, for_inference=False, **kwargs): + # if unet is an instance of SdxlUNet2DConditionModel or subclass, set is_sdxl to True + if weights_sd is None: + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file, safe_open + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + # get dim/alpha mapping, and train t5xxl + modules_dim = {} + modules_alpha = {} + for key, value in weights_sd.items(): + if "." 
not in key: + continue + + lora_name = key.split(".")[0] + if "alpha" in key: + modules_alpha[lora_name] = value + elif "lora_down" in key: + dim = value.size()[0] + modules_dim[lora_name] = dim + # logger.info(lora_name, value.size(), dim) + + # # split qkv + # double_qkv_rank = None + # single_qkv_rank = None + # rank = None + # for lora_name, dim in modules_dim.items(): + # if "double" in lora_name and "qkv" in lora_name: + # double_qkv_rank = dim + # elif "single" in lora_name and "linear1" in lora_name: + # single_qkv_rank = dim + # elif rank is None: + # rank = dim + # if double_qkv_rank is not None and single_qkv_rank is not None and rank is not None: + # break + # split_qkv = (double_qkv_rank is not None and double_qkv_rank != rank) or ( + # single_qkv_rank is not None and single_qkv_rank != rank + # ) + split_qkv = False # split_qkv is not needed to care, because state_dict is qkv combined + + module_class = LoRAInfModule if for_inference else LoRAModule + + network = LoRANetwork( + text_encoders, + lumina, + multiplier=multiplier, + modules_dim=modules_dim, + modules_alpha=modules_alpha, + module_class=module_class, + split_qkv=split_qkv, + ) + return network, weights_sd + + +class LoRANetwork(torch.nn.Module): + LUMINA_TARGET_REPLACE_MODULE = ["JointTransformerBlock"] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["GemmaAttention", "GemmaDecoderLayer", "GemmaMLP"] + LORA_PREFIX_LUMINA = "lora_unet" + LORA_PREFIX_TEXT_ENCODER = "lora_te" # Simplified prefix since we only have one text encoder + + def __init__( + self, + text_encoders, # Now this will be a single Gemma2 model + unet, + multiplier: float = 1.0, + lora_dim: int = 4, + alpha: float = 1, + dropout: Optional[float] = None, + rank_dropout: Optional[float] = None, + module_dropout: Optional[float] = None, + conv_lora_dim: Optional[int] = None, + conv_alpha: Optional[float] = None, + module_class: Type[object] = LoRAModule, + modules_dim: Optional[Dict[str, int]] = None, + modules_alpha: Optional[Dict[str, int]] = None, + train_blocks: Optional[str] = None, + split_qkv: bool = False, + type_dims: Optional[List[int]] = None, + in_dims: Optional[List[int]] = None, + verbose: Optional[bool] = False, + ) -> None: + super().__init__() + self.multiplier = multiplier + + self.lora_dim = lora_dim + self.alpha = alpha + self.conv_lora_dim = conv_lora_dim + self.conv_alpha = conv_alpha + self.dropout = dropout + self.rank_dropout = rank_dropout + self.module_dropout = module_dropout + self.train_blocks = train_blocks if train_blocks is not None else "all" + self.split_qkv = split_qkv + + self.type_dims = type_dims + self.in_dims = in_dims + + self.loraplus_lr_ratio = None + self.loraplus_unet_lr_ratio = None + self.loraplus_text_encoder_lr_ratio = None + + if modules_dim is not None: + logger.info(f"create LoRA network from weights") + self.in_dims = [0] * 5 # create in_dims + # verbose = True + else: + logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}") + logger.info( + f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" + ) + # if self.conv_lora_dim is not None: + # logger.info( + # f"apply LoRA to Conv2d with kernel size (3,3). 
dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" + # ) + if self.split_qkv: + logger.info(f"split qkv for LoRA") + if self.train_blocks is not None: + logger.info(f"train {self.train_blocks} blocks only") + + # create module instances + def create_modules( + is_lumina: bool, + root_module: torch.nn.Module, + target_replace_modules: List[str], + filter: Optional[str] = None, + default_dim: Optional[int] = None, + ) -> List[LoRAModule]: + prefix = self.LORA_PREFIX_FLUX if is_lumina else self.LORA_PREFIX_TEXT_ENCODER + + loras = [] + skipped = [] + for name, module in root_module.named_modules(): + if target_replace_modules is None or module.__class__.__name__ in target_replace_modules: + if target_replace_modules is None: # for handling embedders + module = root_module + + for child_name, child_module in module.named_modules(): + is_linear = child_module.__class__.__name__ == "Linear" + is_conv2d = child_module.__class__.__name__ == "Conv2d" + is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) + + if is_linear or is_conv2d: + lora_name = prefix + "." + (name + "." if name else "") + child_name + lora_name = lora_name.replace(".", "_") + + if filter is not None and not filter in lora_name: + continue + + dim = None + alpha = None + + if modules_dim is not None: + # モジュール指定あり + if lora_name in modules_dim: + dim = modules_dim[lora_name] + alpha = modules_alpha[lora_name] + else: + # 通常、すべて対象とする + if is_linear or is_conv2d_1x1: + dim = default_dim if default_dim is not None else self.lora_dim + alpha = self.alpha + + if is_lumina and type_dims is not None: + identifier = [ + ("attention",), # attention layers + ("mlp",), # MLP layers + ("modulation",), # modulation layers + ("refiner",), # refiner blocks + ] + for i, d in enumerate(type_dims): + if d is not None and all([id in lora_name for id in identifier[i]]): + dim = d # may be 0 for skip + break + + elif self.conv_lora_dim is not None: + dim = self.conv_lora_dim + alpha = self.conv_alpha + + if dim is None or dim == 0: + # skipした情報を出力 + if is_linear or is_conv2d_1x1 or (self.conv_lora_dim is not None): + skipped.append(lora_name) + continue + + lora = module_class( + lora_name, + child_module, + self.multiplier, + dim, + alpha, + dropout=dropout, + rank_dropout=rank_dropout, + module_dropout=module_dropout, + ) + loras.append(lora) + + if target_replace_modules is None: + break # all modules are searched + return loras, skipped + + # create LoRA for text encoder (Gemma2) + self.text_encoder_loras: List[Union[LoRAModule, LoRAInfModule]] = [] + skipped_te = [] + + logger.info(f"create LoRA for Gemma2 Text Encoder:") + text_encoder_loras, skipped = create_modules(False, text_encoders, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) + logger.info(f"create LoRA for Gemma2 Text Encoder: {len(text_encoder_loras)} modules.") + self.text_encoder_loras.extend(text_encoder_loras) + skipped_te += skipped + + # create LoRA for U-Net + target_replace_modules = LoRANetwork.LUMINA_TARGET_REPLACE_MODULE + + self.unet_loras: List[Union[LoRAModule, LoRAInfModule]] + self.unet_loras, skipped_un = create_modules(True, unet, target_replace_modules) + + # Handle embedders + if self.in_dims: + for filter, in_dim in zip(["x_embedder", "t_embedder", "cap_embedder", "final_layer"], self.in_dims): + loras, _ = create_modules(True, unet, None, filter=filter, default_dim=in_dim) + self.unet_loras.extend(loras) + + logger.info(f"create LoRA for Lumina blocks: {len(self.unet_loras)} modules.") + if verbose: + for lora in self.unet_loras: + 
logger.info(f"\t{lora.lora_name:50} {lora.lora_dim}, {lora.alpha}") + + skipped = skipped_te + skipped_un + if verbose and len(skipped) > 0: + logger.warning( + f"because dim (rank) is 0, {len(skipped)} LoRA modules are skipped / dim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:" + ) + for name in skipped: + logger.info(f"\t{name}") + + # assertion + names = set() + for lora in self.text_encoder_loras + self.unet_loras: + assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" + names.add(lora.lora_name) + + def set_multiplier(self, multiplier): + self.multiplier = multiplier + for lora in self.text_encoder_loras + self.unet_loras: + lora.multiplier = self.multiplier + + def set_enabled(self, is_enabled): + for lora in self.text_encoder_loras + self.unet_loras: + lora.enabled = is_enabled + + def load_weights(self, file): + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + info = self.load_state_dict(weights_sd, False) + return info + + def load_state_dict(self, state_dict, strict=True): + # override to convert original weight to split qkv + if not self.split_qkv: + return super().load_state_dict(state_dict, strict) + + # # split qkv + # for key in list(state_dict.keys()): + # if "double" in key and "qkv" in key: + # split_dims = [3072] * 3 + # elif "single" in key and "linear1" in key: + # split_dims = [3072] * 3 + [12288] + # else: + # continue + + # weight = state_dict[key] + # lora_name = key.split(".")[0] + + # if key not in state_dict: + # continue # already merged + + # # (rank, in_dim) * 3 + # down_weights = [state_dict.pop(f"{lora_name}.lora_down.{i}.weight") for i in range(len(split_dims))] + # # (split dim, rank) * 3 + # up_weights = [state_dict.pop(f"{lora_name}.lora_up.{i}.weight") for i in range(len(split_dims))] + + # alpha = state_dict.pop(f"{lora_name}.alpha") + + # # merge down weight + # down_weight = torch.cat(down_weights, dim=0) # (rank, split_dim) * 3 -> (rank*3, sum of split_dim) + + # # merge up weight (sum of split_dim, rank*3) + # rank = up_weights[0].size(1) + # up_weight = torch.zeros((sum(split_dims), down_weight.size(0)), device=down_weight.device, dtype=down_weight.dtype) + # i = 0 + # for j in range(len(split_dims)): + # up_weight[i : i + split_dims[j], j * rank : (j + 1) * rank] = up_weights[j] + # i += split_dims[j] + + # state_dict[f"{lora_name}.lora_down.weight"] = down_weight + # state_dict[f"{lora_name}.lora_up.weight"] = up_weight + # state_dict[f"{lora_name}.alpha"] = alpha + + # # print( + # # f"merged {lora_name}: {lora_name}, {[w.shape for w in down_weights]}, {[w.shape for w in up_weights]} to {down_weight.shape}, {up_weight.shape}" + # # ) + # print(f"new key: {lora_name}.lora_down.weight, {lora_name}.lora_up.weight, {lora_name}.alpha") + + return super().load_state_dict(state_dict, strict) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + if not self.split_qkv: + return super().state_dict(destination, prefix, keep_vars) + + # merge qkv + state_dict = super().state_dict(destination, prefix, keep_vars) + new_state_dict = {} + for key in list(state_dict.keys()): + if "double" in key and "qkv" in key: + split_dims = [3072] * 3 + elif "single" in key and "linear1" in key: + split_dims = [3072] * 3 + [12288] + else: + new_state_dict[key] = state_dict[key] + continue + + if key not in state_dict: + continue # already merged + + lora_name = key.split(".")[0] + + # 
(rank, in_dim) * 3 + down_weights = [state_dict.pop(f"{lora_name}.lora_down.{i}.weight") for i in range(len(split_dims))] + # (split dim, rank) * 3 + up_weights = [state_dict.pop(f"{lora_name}.lora_up.{i}.weight") for i in range(len(split_dims))] + + alpha = state_dict.pop(f"{lora_name}.alpha") + + # merge down weight + down_weight = torch.cat(down_weights, dim=0) # (rank, split_dim) * 3 -> (rank*3, sum of split_dim) + + # merge up weight (sum of split_dim, rank*3) + rank = up_weights[0].size(1) + up_weight = torch.zeros((sum(split_dims), down_weight.size(0)), device=down_weight.device, dtype=down_weight.dtype) + i = 0 + for j in range(len(split_dims)): + up_weight[i : i + split_dims[j], j * rank : (j + 1) * rank] = up_weights[j] + i += split_dims[j] + + new_state_dict[f"{lora_name}.lora_down.weight"] = down_weight + new_state_dict[f"{lora_name}.lora_up.weight"] = up_weight + new_state_dict[f"{lora_name}.alpha"] = alpha + + # print( + # f"merged {lora_name}: {lora_name}, {[w.shape for w in down_weights]}, {[w.shape for w in up_weights]} to {down_weight.shape}, {up_weight.shape}" + # ) + print(f"new key: {lora_name}.lora_down.weight, {lora_name}.lora_up.weight, {lora_name}.alpha") + + return new_state_dict + + def apply_to(self, text_encoders, flux, apply_text_encoder=True, apply_unet=True): + if apply_text_encoder: + logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules") + else: + self.text_encoder_loras = [] + + if apply_unet: + logger.info(f"enable LoRA for U-Net: {len(self.unet_loras)} modules") + else: + self.unet_loras = [] + + for lora in self.text_encoder_loras + self.unet_loras: + lora.apply_to() + self.add_module(lora.lora_name, lora) + + # マージできるかどうかを返す + def is_mergeable(self): + return True + + # TODO refactor to common function with apply_to + def merge_to(self, text_encoders, flux, weights_sd, dtype=None, device=None): + apply_text_encoder = apply_unet = False + for key in weights_sd.keys(): + if key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER): + apply_text_encoder = True + elif key.startswith(LoRANetwork.LORA_PREFIX_LUMINA): + apply_unet = True + + if apply_text_encoder: + logger.info("enable LoRA for text encoder") + else: + self.text_encoder_loras = [] + + if apply_unet: + logger.info("enable LoRA for U-Net") + else: + self.unet_loras = [] + + for lora in self.text_encoder_loras + self.unet_loras: + sd_for_lora = {} + for key in weights_sd.keys(): + if key.startswith(lora.lora_name): + sd_for_lora[key[len(lora.lora_name) + 1 :]] = weights_sd[key] + lora.merge_to(sd_for_lora, dtype, device) + + logger.info(f"weights are merged") + + def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): + self.loraplus_lr_ratio = loraplus_lr_ratio + self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio + self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio + + logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}") + logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}") + + def prepare_optimizer_params_with_multiple_te_lrs(self, text_encoder_lr, unet_lr, default_lr): + # make sure text_encoder_lr as list of two elements + # if float, use the same value for both text encoders + if text_encoder_lr is None or (isinstance(text_encoder_lr, list) and len(text_encoder_lr) == 0): + text_encoder_lr = [default_lr, default_lr] + elif isinstance(text_encoder_lr, float) or isinstance(text_encoder_lr, int): + 
text_encoder_lr = [float(text_encoder_lr), float(text_encoder_lr)] + elif len(text_encoder_lr) == 1: + text_encoder_lr = [text_encoder_lr[0], text_encoder_lr[0]] + + self.requires_grad_(True) + + all_params = [] + lr_descriptions = [] + + def assemble_params(loras, lr, loraplus_ratio): + param_groups = {"lora": {}, "plus": {}} + for lora in loras: + for name, param in lora.named_parameters(): + if loraplus_ratio is not None and "lora_up" in name: + param_groups["plus"][f"{lora.lora_name}.{name}"] = param + else: + param_groups["lora"][f"{lora.lora_name}.{name}"] = param + + params = [] + descriptions = [] + for key in param_groups.keys(): + param_data = {"params": param_groups[key].values()} + + if len(param_data["params"]) == 0: + continue + + if lr is not None: + if key == "plus": + param_data["lr"] = lr * loraplus_ratio + else: + param_data["lr"] = lr + + if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: + logger.info("NO LR skipping!") + continue + + params.append(param_data) + descriptions.append("plus" if key == "plus" else "") + + return params, descriptions + + if self.text_encoder_loras: + loraplus_lr_ratio = self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio + + # split text encoder loras for te1 and te3 + te_loras = [lora for lora in self.text_encoder_loras] + if len(te_loras) > 0: + logger.info(f"Text Encoder: {len(te_loras)} modules, LR {text_encoder_lr[0]}") + params, descriptions = assemble_params(te_loras, text_encoder_lr[0], loraplus_lr_ratio) + all_params.extend(params) + lr_descriptions.extend(["textencoder " + (" " + d if d else "") for d in descriptions]) + + if self.unet_loras: + params, descriptions = assemble_params( + self.unet_loras, + unet_lr if unet_lr is not None else default_lr, + self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio, + ) + all_params.extend(params) + lr_descriptions.extend(["unet" + (" " + d if d else "") for d in descriptions]) + + return all_params, lr_descriptions + + def enable_gradient_checkpointing(self): + # not supported + pass + + def prepare_grad_etc(self, text_encoder, unet): + self.requires_grad_(True) + + def on_epoch_start(self, text_encoder, unet): + self.train() + + def get_trainable_params(self): + return self.parameters() + + def save_weights(self, file, dtype, metadata): + if metadata is not None and len(metadata) == 0: + metadata = None + + state_dict = self.state_dict() + + if dtype is not None: + for key in list(state_dict.keys()): + v = state_dict[key] + v = v.detach().clone().to("cpu").to(dtype) + state_dict[key] = v + + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import save_file + from library import train_util + + # Precalculate model hashes to save time on indexing + if metadata is None: + metadata = {} + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + save_file(state_dict, file, metadata) + else: + torch.save(state_dict, file) + + def backup_weights(self): + # 重みのバックアップを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + if not hasattr(org_module, "_lora_org_weight"): + sd = org_module.state_dict() + org_module._lora_org_weight = sd["weight"].detach().clone() + org_module._lora_restored = True + + def restore_weights(self): + # 重みのリストアを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + 
org_module = lora.org_module_ref[0] + if not org_module._lora_restored: + sd = org_module.state_dict() + sd["weight"] = org_module._lora_org_weight + org_module.load_state_dict(sd) + org_module._lora_restored = True + + def pre_calculation(self): + # 事前計算を行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + sd = org_module.state_dict() + + org_weight = sd["weight"] + lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype) + sd["weight"] = org_weight + lora_weight + assert sd["weight"].shape == org_weight.shape + org_module.load_state_dict(sd) + + org_module._lora_restored = False + lora.enabled = False + + def apply_max_norm_regularization(self, max_norm_value, device): + downkeys = [] + upkeys = [] + alphakeys = [] + norms = [] + keys_scaled = 0 + + state_dict = self.state_dict() + for key in state_dict.keys(): + if "lora_down" in key and "weight" in key: + downkeys.append(key) + upkeys.append(key.replace("lora_down", "lora_up")) + alphakeys.append(key.replace("lora_down.weight", "alpha")) + + for i in range(len(downkeys)): + down = state_dict[downkeys[i]].to(device) + up = state_dict[upkeys[i]].to(device) + alpha = state_dict[alphakeys[i]].to(device) + dim = down.shape[0] + scale = alpha / dim + + if up.shape[2:] == (1, 1) and down.shape[2:] == (1, 1): + updown = (up.squeeze(2).squeeze(2) @ down.squeeze(2).squeeze(2)).unsqueeze(2).unsqueeze(3) + elif up.shape[2:] == (3, 3) or down.shape[2:] == (3, 3): + updown = torch.nn.functional.conv2d(down.permute(1, 0, 2, 3), up).permute(1, 0, 2, 3) + else: + updown = up @ down + + updown *= scale + + norm = updown.norm().clamp(min=max_norm_value / 2) + desired = torch.clamp(norm, max=max_norm_value) + ratio = desired.cpu() / norm.cpu() + sqrt_ratio = ratio**0.5 + if ratio != 1: + keys_scaled += 1 + state_dict[upkeys[i]] *= sqrt_ratio + state_dict[downkeys[i]] *= sqrt_ratio + scalednorm = updown.norm() * ratio + norms.append(scalednorm.item()) + + return keys_scaled, sum(norms) / len(norms), max(norms) From a00b06bc978c80502850a869c845877aeb451003 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sat, 15 Feb 2025 14:56:11 -0500 Subject: [PATCH 340/582] Lumina 2 and Gemma 2 model loading --- library/lumina_models.py | 35 ++++++++++++-------- library/lumina_util.py | 66 +++++++++++++++++++++++--------------- library/strategy_lumina.py | 2 ++ lumina_train_network.py | 2 +- 4 files changed, 65 insertions(+), 40 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index 43b1e9c64..3f2e854e6 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -21,7 +21,8 @@ try: from apex.normalization import FusedRMSNorm as RMSNorm -except ImportError: +except ModuleNotFoundError: + import warnings warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation") memory_efficient_attention = None @@ -39,17 +40,20 @@ class LuminaParams: """Parameters for Lumina model configuration""" patch_size: int = 2 - dim: int = 2592 + in_channels: int = 4 + dim: int = 4096 n_layers: int = 30 + n_refiner_layers: int = 2 n_heads: int = 24 n_kv_heads: int = 8 + multiple_of: int = 256 axes_dims: List[int] = None axes_lens: List[int] = None - qk_norm: bool = False, - ffn_dim_multiplier: Optional[float] = None, - norm_eps: float = 1e-5, - scaling_factor: float = 1.0, - cap_feat_dim: int = 32, + qk_norm: bool = False + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + scaling_factor: float = 1.0 + cap_feat_dim: 
int = 32 def __post_init__(self): if self.axes_dims is None: @@ -62,12 +66,15 @@ def get_2b_config(cls) -> "LuminaParams": """Returns the configuration for the 2B parameter model""" return cls( patch_size=2, - dim=2592, - n_layers=30, + in_channels=16, + dim=2304, + n_layers=26, n_heads=24, n_kv_heads=8, - axes_dims=[36, 36, 36], - axes_lens=[300, 512, 512] + axes_dims=[32, 32, 32], + axes_lens=[300, 512, 512], + qk_norm=True, + cap_feat_dim=2304 ) @classmethod @@ -696,8 +703,8 @@ def __init__( norm_eps: float = 1e-5, qk_norm: bool = False, cap_feat_dim: int = 5120, - axes_dims: List[int] = (16, 56, 56), - axes_lens: List[int] = (1, 512, 512), + axes_dims: List[int] = [16, 56, 56], + axes_lens: List[int] = [1, 512, 512], ) -> None: super().__init__() self.in_channels = in_channels @@ -1090,6 +1097,7 @@ def NextDiT_2B_GQA_patch2_Adaln_Refiner(params: Optional[LuminaParams] = None, * return NextDiT( patch_size=params.patch_size, + in_channels=params.in_channels, dim=params.dim, n_layers=params.n_layers, n_heads=params.n_heads, @@ -1099,7 +1107,6 @@ def NextDiT_2B_GQA_patch2_Adaln_Refiner(params: Optional[LuminaParams] = None, * qk_norm=params.qk_norm, ffn_dim_multiplier=params.ffn_dim_multiplier, norm_eps=params.norm_eps, - scaling_factor=params.scaling_factor, cap_feat_dim=params.cap_feat_dim, **kwargs, ) diff --git a/library/lumina_util.py b/library/lumina_util.py index b47e057a9..f8e3f7dbc 100644 --- a/library/lumina_util.py +++ b/library/lumina_util.py @@ -27,14 +27,14 @@ def load_lumina_model( dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False, -) -> lumina_models.Lumina: +): logger.info("Building Lumina") with torch.device("meta"): model = lumina_models.NextDiT_2B_GQA_patch2_Adaln_Refiner().to(dtype) logger.info(f"Loading state dict from {ckpt_path}") state_dict = load_safetensors( - ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype + ckpt_path, device=device, disable_mmap=disable_mmap, dtype=dtype ) info = model.load_state_dict(state_dict, strict=False, assign=True) logger.info(f"Loaded Lumina: {info}") @@ -69,30 +69,39 @@ def load_gemma2( ) -> Gemma2Model: logger.info("Building Gemma2") GEMMA2_CONFIG = { - "_name_or_path": "google/gemma-2b", - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 2, - "eos_token_id": 1, - "head_dim": 256, - "hidden_act": "gelu", - "hidden_size": 2048, - "initializer_range": 0.02, - "intermediate_size": 16384, - "max_position_embeddings": 8192, - "model_type": "gemma", - "num_attention_heads": 8, - "num_hidden_layers": 18, - "num_key_value_heads": 1, - "pad_token_id": 0, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000.0, - "torch_dtype": "bfloat16", - "transformers_version": "4.38.0.dev0", - "use_cache": true, - "vocab_size": 256000 + "_name_or_path": "google/gemma-2-2b", + "architectures": [ + "Gemma2Model" + ], + "attention_bias": False, + "attention_dropout": 0.0, + "attn_logit_softcapping": 50.0, + "bos_token_id": 2, + "cache_implementation": "hybrid", + "eos_token_id": 1, + "final_logit_softcapping": 30.0, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 2304, + "initializer_range": 0.02, + "intermediate_size": 9216, + "max_position_embeddings": 8192, + "model_type": "gemma2", + "num_attention_heads": 8, + "num_hidden_layers": 26, + "num_key_value_heads": 4, + "pad_token_id": 0, + "query_pre_attn_scalar": 256, + "rms_norm_eps": 1e-06, + "rope_theta": 10000.0, + "sliding_window": 4096, + 
"torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": True, + "vocab_size": 256000 } + config = Gemma2Config(**GEMMA2_CONFIG) with init_empty_weights(): gemma2 = Gemma2Model._from_config(config) @@ -104,6 +113,13 @@ def load_gemma2( sd = load_safetensors( ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype ) + + for key in list(sd.keys()): + new_key = key.replace("model.", "") + if new_key == key: + break # the model doesn't have annoying prefix + sd[new_key] = sd.pop(key) + info = gemma2.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded Gemma2: {info}") return gemma2 diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index 622c019a4..615f6e00c 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -9,7 +9,9 @@ LatentsCachingStrategy, TokenizeStrategy, TextEncodingStrategy, + TextEncoderOutputsCachingStrategy ) +import numpy as np from library.utils import setup_logging setup_logging() diff --git a/lumina_train_network.py b/lumina_train_network.py index db329a9b1..1f8ba613e 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -345,7 +345,7 @@ def prepare_unet_with_accelerator( def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() train_util.add_dit_training_arguments(parser) - lumina_train_utils.add_lumina_train_arguments(parser) + lumina_train_util.add_lumina_train_arguments(parser) return parser From 60a76ebb72772327fcb7b2a10c87ad8f7b09f56f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 16 Feb 2025 01:06:34 -0500 Subject: [PATCH 341/582] Add caching gemma2, add gradient checkpointing, refactor lumina model code --- library/lumina_models.py | 300 +++++++++++++++++++------------------ library/strategy_lumina.py | 110 +++++++------- lumina_train_network.py | 113 ++++++++++---- networks/lora_lumina.py | 10 +- 4 files changed, 306 insertions(+), 227 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index 3f2e854e6..27194e2f5 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -16,6 +16,8 @@ from flash_attn import flash_attn_varlen_func from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa import torch +from torch import Tensor +from torch.utils.checkpoint import checkpoint import torch.nn as nn import torch.nn.functional as F @@ -91,6 +93,25 @@ def get_7b_config(cls) -> "LuminaParams": ) +class GradientCheckpointMixin(nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False + + def enable_gradient_checkpointing(self, cpu_offload: bool = False): + self.gradient_checkpointing = True + + def disable_gradient_checkpointing(self, cpu_offload: bool = False): + self.gradient_checkpointing = False + + def forward(self, *args, **kwargs): + if self.training and self.gradient_checkpointing: + return checkpoint(self._forward, *args, use_reentrant=False, **kwargs) + else: + return self._forward(*args, **kwargs) + ############################################################################# # RMSNorm # ############################################################################# @@ -114,7 +135,7 @@ def __init__(self, dim: int, eps: float = 1e-6): self.eps = eps self.weight = nn.Parameter(torch.ones(dim)) - def _norm(self, x): + def _norm(self, x) -> Tensor: """ Apply the RMSNorm normalization to the input tensor. 
@@ -125,21 +146,14 @@ def _norm(self, x): torch.Tensor: The normalized tensor. """ - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + return x * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps) - def forward(self, x): - """ - Forward pass through the RMSNorm layer. - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The output tensor after applying RMSNorm. - - """ - output = self._norm(x.float()).type_as(x) - return output * self.weight + def forward(self, x: Tensor): + x_dtype = x.dtype + x = x.float() + rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) + return ((x * rrms) * self.weight.float()).to(dtype=x_dtype) def modulate(x, scale): @@ -151,7 +165,7 @@ def modulate(x, scale): ############################################################################# -class TimestepEmbedder(nn.Module): +class TimestepEmbedder(GradientCheckpointMixin): """ Embeds scalar timesteps into vector representations. """ @@ -203,11 +217,32 @@ def timestep_embedding(t, dim, max_period=10000): ) return embedding - def forward(self, t): + def _forward(self, t): t_freq = self.timestep_embedding(t, self.frequency_embedding_size) t_emb = self.mlp(t_freq.to(self.mlp[0].weight.dtype)) return t_emb +def to_cuda(x): + if isinstance(x, torch.Tensor): + return x.cuda() + elif isinstance(x, (list, tuple)): + return [to_cuda(elem) for elem in x] + elif isinstance(x, dict): + return {k: to_cuda(v) for k, v in x.items()} + else: + return x + + +def to_cpu(x): + if isinstance(x, torch.Tensor): + return x.cpu() + elif isinstance(x, (list, tuple)): + return [to_cpu(elem) for elem in x] + elif isinstance(x, dict): + return {k: to_cpu(v) for k, v in x.items()} + else: + return x + ############################################################################# # Core NextDiT Model # @@ -284,7 +319,7 @@ def apply_rotary_emb( Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. """ - with torch.amp.autocast("cuda",enabled=False): + with torch.autocast("cuda", enabled=False): x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2)) freqs_cis = freqs_cis.unsqueeze(2) x_out = torch.view_as_real(x * freqs_cis).flatten(3) @@ -496,15 +531,15 @@ def forward(self, x): return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x))) -class JointTransformerBlock(nn.Module): +class JointTransformerBlock(GradientCheckpointMixin): def __init__( self, layer_id: int, dim: int, n_heads: int, - n_kv_heads: int, + n_kv_heads: Optional[int], multiple_of: int, - ffn_dim_multiplier: float, + ffn_dim_multiplier: Optional[float], norm_eps: float, qk_norm: bool, modulation=True, @@ -520,7 +555,7 @@ def __init__( value features (if using GQA), or set to None for the same as query. multiple_of (int): - ffn_dim_multiplier (float): + ffn_dim_multiplier (Optional[float]): norm_eps (float): """ @@ -554,7 +589,7 @@ def __init__( nn.init.zeros_(self.adaLN_modulation[1].weight) nn.init.zeros_(self.adaLN_modulation[1].bias) - def forward( + def _forward( self, x: torch.Tensor, x_mask: torch.Tensor, @@ -608,7 +643,7 @@ def forward( return x -class FinalLayer(nn.Module): +class FinalLayer(GradientCheckpointMixin): """ The final layer of NextDiT. 
""" @@ -661,22 +696,21 @@ def __init__( self.axes_dims, self.axes_lens, theta=self.theta ) - def __call__(self, ids: torch.Tensor): + def get_freqs_cis(self, ids: torch.Tensor): self.freqs_cis = [freqs_cis.to(ids.device) for freqs_cis in self.freqs_cis] result = [] for i in range(len(self.axes_dims)): - # import torch.distributed as dist - # if not dist.is_initialized() or dist.get_rank() == 0: - # import pdb - # pdb.set_trace() index = ( ids[:, :, i : i + 1] .repeat(1, 1, self.freqs_cis[i].shape[-1]) .to(torch.int64) ) + + axes = self.freqs_cis[i].unsqueeze(0).repeat(index.shape[0], 1, 1) + result.append( torch.gather( - self.freqs_cis[i].unsqueeze(0).repeat(index.shape[0], 1, 1), + axes, dim=1, index=index, ) @@ -790,76 +824,98 @@ def __init__( self.dim = dim self.n_heads = n_heads + self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False + self.blocks_to_swap = None + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + def enable_gradient_checkpointing(self, cpu_offload: bool = False): + self.gradient_checkpointing = True + self.cpu_offload_checkpointing = cpu_offload + + self.t_embedder.enable_gradient_checkpointing() + + for block in self.layers + self.context_refiner + self.noise_refiner: + block.enable_gradient_checkpointing(cpu_offload=cpu_offload) + + self.final_layer.enable_gradient_checkpointing() + + print(f"Lumina: Gradient checkpointing enabled. CPU offload: {cpu_offload}") + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False + + self.t_embedder.disable_gradient_checkpointing() + + for block in self.layers + self.context_refiner + self.noise_refiner: + block.disable_gradient_checkpointing() + + self.final_layer.disable_gradient_checkpointing() + + print("Lumina: Gradient checkpointing disabled.") + def unpatchify( self, x: torch.Tensor, - img_size: List[Tuple[int, int]], - cap_size: List[int], - return_tensor=False, - ) -> List[torch.Tensor]: + width: int, + height: int, + encoder_seq_lengths: List[int], + seq_lengths: List[int], + ) -> torch.Tensor: """ x: (N, T, patch_size**2 * C) imgs: (N, H, W, C) """ pH = pW = self.patch_size - imgs = [] - for i in range(x.size(0)): - H, W = img_size[i] - begin = cap_size[i] - end = begin + (H // pH) * (W // pW) - imgs.append( - x[i][begin:end] - .view(H // pH, W // pW, pH, pW, self.out_channels) + + output = [] + for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)): + output.append( + x[i][encoder_seq_len:seq_len] + .view(height // pH, width // pW, pH, pW, self.out_channels) .permute(4, 0, 2, 1, 3) .flatten(3, 4) .flatten(1, 2) ) + output = torch.stack(output, dim=0) - if return_tensor: - imgs = torch.stack(imgs, dim=0) - return imgs + return output def patchify_and_embed( self, - x: List[torch.Tensor] | torch.Tensor, + x: torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, ) -> Tuple[ - torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor + torch.Tensor, torch.Tensor, torch.Tensor, List[int], List[int] ]: - bsz = len(x) + bsz, channels, height, width = x.shape pH = pW = self.patch_size - device = x[0].device + device = x.device l_effective_cap_len = cap_mask.sum(dim=1).tolist() - img_sizes = [(img.size(1), img.size(2)) for img in x] - l_effective_img_len = [(H // pH) * (W // pW) for (H, W) in img_sizes] + encoder_seq_len = cap_mask.shape[1] - max_seq_len = max( - ( - 
cap_len + img_len - for cap_len, img_len in zip(l_effective_cap_len, l_effective_img_len) - ) - ) - max_cap_len = max(l_effective_cap_len) - max_img_len = max(l_effective_img_len) + image_seq_len = (height // self.patch_size) * (width // self.patch_size) + seq_lengths = [cap_seq_len + image_seq_len for cap_seq_len in l_effective_cap_len] + max_seq_len = max(seq_lengths) - position_ids = torch.zeros( - bsz, max_seq_len, 3, dtype=torch.int32, device=device - ) + position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.int32, device=device) - for i in range(bsz): - cap_len = l_effective_cap_len[i] - img_len = l_effective_img_len[i] - H, W = img_sizes[i] - H_tokens, W_tokens = H // pH, W // pW - assert H_tokens * W_tokens == img_len + for i, (cap_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)): + H_tokens, W_tokens = height // pH, width // pW - position_ids[i, :cap_len, 0] = torch.arange( - cap_len, dtype=torch.int32, device=device - ) - position_ids[i, cap_len : cap_len + img_len, 0] = cap_len + position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.int32, device=device) + position_ids[i, cap_len : cap_len + seq_len, 0] = cap_len row_ids = ( torch.arange(H_tokens, dtype=torch.int32, device=device) .view(-1, 1) @@ -872,77 +928,40 @@ def patchify_and_embed( .repeat(H_tokens, 1) .flatten() ) - position_ids[i, cap_len : cap_len + img_len, 1] = row_ids - position_ids[i, cap_len : cap_len + img_len, 2] = col_ids + position_ids[i, cap_len : cap_len + seq_len, 1] = row_ids + position_ids[i, cap_len : cap_len + seq_len, 2] = col_ids - freqs_cis = self.rope_embedder(position_ids) + freqs_cis = self.rope_embedder.get_freqs_cis(position_ids) - # build freqs_cis for cap and image individually - cap_freqs_cis_shape = list(freqs_cis.shape) - # cap_freqs_cis_shape[1] = max_cap_len - cap_freqs_cis_shape[1] = cap_feats.shape[1] - cap_freqs_cis = torch.zeros( - *cap_freqs_cis_shape, device=device, dtype=freqs_cis.dtype - ) + cap_freqs_cis = torch.zeros(bsz, encoder_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype) + img_freqs_cis = torch.zeros(bsz, image_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype) - img_freqs_cis_shape = list(freqs_cis.shape) - img_freqs_cis_shape[1] = max_img_len - img_freqs_cis = torch.zeros( - *img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype - ) - - for i in range(bsz): - cap_len = l_effective_cap_len[i] - img_len = l_effective_img_len[i] + for i, (cap_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)): cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len] - img_freqs_cis[i, :img_len] = freqs_cis[i, cap_len : cap_len + img_len] + img_freqs_cis[i, :seq_len] = freqs_cis[i, cap_len : cap_len + seq_len] + + x = x.view(bsz, channels, height // pH, pH, width // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2) + x_mask = torch.zeros(bsz, max_seq_len, dtype=torch.bool, device=device) # refine context for layer in self.context_refiner: cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis) - # refine image - flat_x = [] - for i in range(bsz): - img = x[i] - C, H, W = img.size() - img = ( - img.view(C, H // pH, pH, W // pW, pW) - .permute(1, 3, 2, 4, 0) - .flatten(2) - .flatten(0, 1) - ) - flat_x.append(img) - x = flat_x - padded_img_embed = torch.zeros( - bsz, max_img_len, x[0].shape[-1], device=device, dtype=x[0].dtype - ) - padded_img_mask = torch.zeros(bsz, max_img_len, dtype=torch.bool, device=device) - for i in range(bsz): - padded_img_embed[i, : l_effective_img_len[i]] = x[i] - 
padded_img_mask[i, : l_effective_img_len[i]] = True + x = self.x_embedder(x) - padded_img_embed = self.x_embedder(padded_img_embed) for layer in self.noise_refiner: - padded_img_embed = layer( - padded_img_embed, padded_img_mask, img_freqs_cis, t - ) + x = layer(x, x_mask, img_freqs_cis, t) - mask = torch.zeros(bsz, max_seq_len, dtype=torch.bool, device=device) - padded_full_embed = torch.zeros( - bsz, max_seq_len, self.dim, device=device, dtype=x[0].dtype - ) - for i in range(bsz): - cap_len = l_effective_cap_len[i] - img_len = l_effective_img_len[i] - - mask[i, : cap_len + img_len] = True - padded_full_embed[i, :cap_len] = cap_feats[i, :cap_len] - padded_full_embed[i, cap_len : cap_len + img_len] = padded_img_embed[ - i, :img_len - ] + joint_hidden_states = torch.zeros(bsz, max_seq_len, self.dim, device=device, dtype=x.dtype) + attention_mask = torch.zeros(bsz, max_seq_len, dtype=torch.bool, device=device) + for i, (cap_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)): + attention_mask[i, :seq_len] = True + joint_hidden_states[i, :cap_len] = cap_feats[i, :cap_len] + joint_hidden_states[i, cap_len:seq_len] = x[i] - return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis + x = joint_hidden_states + + return x, attention_mask, freqs_cis, l_effective_cap_len, seq_lengths def forward(self, x, t, cap_feats, cap_mask): """ @@ -950,30 +969,19 @@ def forward(self, x, t, cap_feats, cap_mask): t: (N,) tensor of diffusion timesteps y: (N,) tensor of text tokens/features """ - - # import torch.distributed as dist - # if not dist.is_initialized() or dist.get_rank() == 0: - # import pdb - # pdb.set_trace() - # torch.save([x, t, cap_feats, cap_mask], "./fake_input.pt") + _, _, height, width = x.shape # B, C, H, W t = self.t_embedder(t) # (N, D) - adaln_input = t - - cap_feats = self.cap_embedder( - cap_feats - ) # (N, L, D) # todo check if able to batchify w.o. redundant compute + cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. 
redundant compute - x_is_tensor = isinstance(x, torch.Tensor) - x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed( + x, mask, freqs_cis, l_effective_cap_len, seq_lengths = self.patchify_and_embed( x, cap_feats, cap_mask, t ) - freqs_cis = freqs_cis.to(x.device) for layer in self.layers: - x = layer(x, mask, freqs_cis, adaln_input) + x = layer(x, mask, freqs_cis, t) - x = self.final_layer(x, adaln_input) - x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor) + x = self.final_layer(x, t) + x = self.unpatchify(x, width, height, l_effective_cap_len, seq_lengths) return x diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index 615f6e00c..6feea387e 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -3,7 +3,7 @@ from typing import Any, List, Optional, Tuple, Union import torch -from transformers import AutoTokenizer, AutoModel +from transformers import AutoTokenizer, AutoModel, GemmaTokenizerFast from library import train_util from library.strategy_base import ( LatentsCachingStrategy, @@ -27,34 +27,35 @@ class LuminaTokenizeStrategy(TokenizeStrategy): def __init__( self, max_length: Optional[int], tokenizer_cache_dir: Optional[str] = None ) -> None: - self.tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer: GemmaTokenizerFast = AutoTokenizer.from_pretrained( GEMMA_ID, cache_dir=tokenizer_cache_dir ) self.tokenizer.padding_side = "right" if max_length is None: - self.max_length = self.tokenizer.model_max_length + self.max_length = 256 else: self.max_length = max_length - def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: + def tokenize(self, text: Union[str, List[str]]) -> Tuple[torch.Tensor, torch.Tensor]: text = [text] if isinstance(text, str) else text encodings = self.tokenizer( text, - padding="max_length", max_length=self.max_length, return_tensors="pt", + padding=True, + pad_to_multiple_of=8, truncation=True, ) - return [encodings.input_ids] + return encodings.input_ids, encodings.attention_mask def tokenize_with_weights( self, text: str | List[str] - ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]: # Gemma doesn't support weighted prompts, return uniform weights - tokens = self.tokenize(text) + tokens, attention_masks = self.tokenize(text) weights = [torch.ones_like(t) for t in tokens] - return tokens, weights + return tokens, attention_masks, weights class LuminaTextEncodingStrategy(TextEncodingStrategy): @@ -66,50 +67,39 @@ def encode_tokens( self, tokenize_strategy: TokenizeStrategy, models: List[Any], - tokens: List[torch.Tensor], + tokens: torch.Tensor, + attention_masks: torch.Tensor, apply_gemma2_attn_mask: Optional[bool] = None, - ) -> List[torch.Tensor]: - + ) -> torch.Tensor: if apply_gemma2_attn_mask is None: apply_gemma2_attn_mask = self.apply_gemma2_attn_mask text_encoder = models[0] - input_ids = tokens[0].to(text_encoder.device) - - attention_mask = None - position_ids = None - if apply_gemma2_attn_mask: - # Create attention mask (1 for non-padding, 0 for padding) - attention_mask = (input_ids != tokenize_strategy.tokenizer.pad_token_id).to( - text_encoder.device - ) - # Create position IDs - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - - with torch.no_grad(): - outputs = text_encoder( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_hidden_states=True, - return_dict=True, - ) - # Get the last hidden 
state - hidden_states = outputs.last_hidden_state + # Create position IDs + position_ids = attention_masks.cumsum(-1) - 1 + position_ids.masked_fill_(attention_masks == 0, 1) - return [hidden_states] + outputs = text_encoder( + input_ids=tokens.to(text_encoder.device), + attention_mask=attention_masks.to(text_encoder.device) if apply_gemma2_attn_mask else None, + position_ids=position_ids.to(text_encoder.device), + output_hidden_states=True, + return_dict=True, + ) + + return outputs.hidden_states[-2] def encode_tokens_with_weights( self, tokenize_strategy: TokenizeStrategy, models: List[Any], - tokens_list: List[torch.Tensor], + tokens: torch.Tensor, weights_list: List[torch.Tensor], - ) -> List[torch.Tensor]: + attention_masks: torch.Tensor + ) -> torch.Tensor: # For simplicity, use uniform weighting - return self.encode_tokens(tokenize_strategy, models, tokens_list) + return self.encode_tokens(tokenize_strategy, models, tokens, attention_masks) class LuminaTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): @@ -149,6 +139,15 @@ def is_disk_cached_outputs_expected(self, npz_path: str): npz = np.load(npz_path) if "hidden_state" not in npz: return False + if "attention_mask" not in npz: + return False + if "input_ids" not in npz: + return False + if "apply_gemma2_attn_mask" not in npz: + return False + npz_apply_gemma2_attn_mask = npz["apply_gemma2_attn_mask"] + if npz_apply_gemma2_attn_mask != self.apply_gemma2_attn_mask: + return False except Exception as e: logger.error(f"Error loading file: {npz_path}") raise e @@ -158,13 +157,15 @@ def is_disk_cached_outputs_expected(self, npz_path: str): def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: data = np.load(npz_path) hidden_state = data["hidden_state"] - return [hidden_state] + attention_mask = data["attention_mask"] + input_ids = data["input_ids"] + return [hidden_state, attention_mask, input_ids] def cache_batch_outputs( self, - tokenize_strategy: TokenizeStrategy, + tokenize_strategy: LuminaTokenizeStrategy, models: List[Any], - text_encoding_strategy: TextEncodingStrategy, + text_encoding_strategy: LuminaTextEncodingStrategy, infos: List, ): lumina_text_encoding_strategy: LuminaTextEncodingStrategy = ( @@ -173,35 +174,44 @@ def cache_batch_outputs( captions = [info.caption for info in infos] if self.is_weighted: - tokens_list, weights_list = tokenize_strategy.tokenize_with_weights( + tokens, attention_masks, weights_list = tokenize_strategy.tokenize_with_weights( captions ) with torch.no_grad(): hidden_state = lumina_text_encoding_strategy.encode_tokens_with_weights( - tokenize_strategy, models, tokens_list, weights_list - )[0] + tokenize_strategy, models, tokens, weights_list, attention_masks + ) else: - tokens = tokenize_strategy.tokenize(captions) + tokens, attention_masks = tokenize_strategy.tokenize(captions) with torch.no_grad(): hidden_state = lumina_text_encoding_strategy.encode_tokens( - tokenize_strategy, models, tokens - )[0] + tokenize_strategy, models, tokens, attention_masks + ) - if hidden_state.dtype == torch.bfloat16: + if hidden_state.dtype != torch.float32: hidden_state = hidden_state.float() hidden_state = hidden_state.cpu().numpy() + attention_mask = attention_masks.cpu().numpy() + input_ids = tokens.cpu().numpy() + for i, info in enumerate(infos): hidden_state_i = hidden_state[i] + attention_mask_i = attention_mask[i] + input_ids_i = input_ids[i] + apply_gemma2_attn_mask_i = self.apply_gemma2_attn_mask if self.cache_to_disk: np.savez( info.text_encoder_outputs_npz, 
hidden_state=hidden_state_i, + attention_mask=attention_mask_i, + input_ids=input_ids_i, + apply_gemma2_attn_mask=apply_gemma2_attn_mask_i, ) else: - info.text_encoder_outputs = [hidden_state_i] + info.text_encoder_outputs = [hidden_state_i, attention_mask_i, input_ids_i] class LuminaLatentsCachingStrategy(LatentsCachingStrategy): diff --git a/lumina_train_network.py b/lumina_train_network.py index 1f8ba613e..3d0c70629 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -62,6 +62,19 @@ def load_target_model(self, args, weight_dtype, accelerator): disable_mmap=args.disable_mmap_load_safetensors, ) + if args.fp8_base: + # check dtype of model + if model.dtype == torch.float8_e4m3fnuz or model.dtype == torch.float8_e5m2 or model.dtype == torch.float8_e5m2fnuz: + raise ValueError(f"Unsupported fp8 model dtype: {model.dtype}") + elif model.dtype == torch.float8_e4m3fn: + logger.info("Loaded fp8 Lumina 2 model") + else: + logger.info( + "Cast Lumina 2 model to fp8. This may take a while. You can reduce the time by using fp8 checkpoint." + " / Lumina 2モデルをfp8に変換しています。これには時間がかかる場合があります。fp8チェックポイントを使用することで時間を短縮できます。" + ) + model.to(torch.float8_e4m3fn) + # if args.blocks_to_swap: # logger.info(f'Enabling block swap: {args.blocks_to_swap}') # model.enable_block_swap(args.blocks_to_swap, accelerator.device) @@ -70,6 +83,7 @@ def load_target_model(self, args, weight_dtype, accelerator): gemma2 = lumina_util.load_gemma2( args.gemma2, weight_dtype, "cpu" ) + gemma2.eval() ae = lumina_util.load_ae( args.ae, weight_dtype, "cpu" ) @@ -118,17 +132,65 @@ def cache_text_encoder_outputs_if_needed( dataset, weight_dtype, ): - for text_encoder in text_encoders: - text_encoder_outputs_caching_strategy = ( - self.get_text_encoder_outputs_caching_strategy(args) - ) - if text_encoder_outputs_caching_strategy is not None: - text_encoder_outputs_caching_strategy.cache_batch_outputs( - self.get_tokenize_strategy(args), - [text_encoder], - self.get_text_encoding_strategy(args), - dataset, - ) + if args.cache_text_encoder_outputs: + if not args.lowram: + # メモリ消費を減らす + logger.info("move vae and unet to cpu to save memory") + org_vae_device = vae.device + org_unet_device = unet.device + vae.to("cpu") + unet.to("cpu") + clean_memory_on_device(accelerator.device) + + # When TE is not be trained, it will not be prepared so we need to use explicit autocast + logger.info("move text encoders to gpu") + text_encoders[0].to(accelerator.device, dtype=weight_dtype) # always not fp8 + + if text_encoders[0].dtype == torch.float8_e4m3fn: + # if we load fp8 weights, the model is already fp8, so we use it as is + self.prepare_text_encoder_fp8(1, text_encoders[1], text_encoders[1].dtype, weight_dtype) + else: + # otherwise, we need to convert it to target dtype + text_encoders[0].to(weight_dtype) + + with accelerator.autocast(): + dataset.new_cache_text_encoder_outputs(text_encoders, accelerator) + + # cache sample prompts + if args.sample_prompts is not None: + logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") + + tokenize_strategy: strategy_lumina.LuminaTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() + text_encoding_strategy: strategy_lumina.LuminaTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + + prompts = train_util.load_prompts(args.sample_prompts) + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + with accelerator.autocast(), torch.no_grad(): + for prompt_dict in prompts: + for p in [prompt_dict.get("prompt", 
""), prompt_dict.get("negative_prompt", "")]: + if p not in sample_prompts_te_outputs: + logger.info(f"cache Text Encoder outputs for prompt: {p}") + tokens_and_masks = tokenize_strategy.tokenize(p) + sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( + tokenize_strategy, text_encoders, tokens_and_masks, args.apply_t5_attn_mask + ) + self.sample_prompts_te_outputs = sample_prompts_te_outputs + + accelerator.wait_for_everyone() + + # move back to cpu + if not self.is_train_text_encoder(args): + logger.info("move Gemma 2 back to cpu") + text_encoders[0].to("cpu") + clean_memory_on_device(accelerator.device) + + if not args.lowram: + logger.info("move vae and unet back to original device") + vae.to(org_vae_device) + unet.to(org_unet_device) + else: + # Text Encoderから毎回出力を取得するので、GPUに乗せておく + text_encoders[0].to(accelerator.device, dtype=weight_dtype) def sample_images( self, @@ -196,12 +258,13 @@ def get_noise_pred_and_target( ) ) + # May not need to pack/unpack? # pack latents and get img_ids - 这部分可以保留因为NextDiT也需要packed格式的输入 - packed_noisy_model_input = lumina_util.pack_latents(noisy_model_input) - packed_latent_height, packed_latent_width = ( - noisy_model_input.shape[2] // 2, - noisy_model_input.shape[3] // 2, - ) + # packed_noisy_model_input = lumina_util.pack_latents(noisy_model_input) + # packed_latent_height, packed_latent_width = ( + # noisy_model_input.shape[2] // 2, + # noisy_model_input.shape[3] // 2, + # ) # ensure the hidden state will require grad if args.gradient_checkpointing: @@ -212,32 +275,30 @@ def get_noise_pred_and_target( # Unpack Gemma2 outputs gemma2_hidden_states, gemma2_attn_mask, input_ids = text_encoder_conds - if not args.apply_gemma2_attn_mask: - gemma2_attn_mask = None - def call_dit(img, gemma2_hidden_states, input_ids, timesteps, gemma2_attn_mask): + def call_dit(img, gemma2_hidden_states, timesteps, gemma2_attn_mask): with torch.set_grad_enabled(is_train), accelerator.autocast(): # NextDiT forward expects (x, t, cap_feats, cap_mask) model_pred = unet( - x=img, # packed latents + x=img, # image latents (B, C, H, W) t=timesteps / 1000, # timesteps需要除以1000来匹配模型预期 cap_feats=gemma2_hidden_states, # Gemma2的hidden states作为caption features - cap_mask=gemma2_attn_mask, # Gemma2的attention mask + cap_mask=gemma2_attn_mask.to(dtype=torch.int32), # Gemma2的attention mask ) return model_pred model_pred = call_dit( - img=packed_noisy_model_input, + img=noisy_model_input, gemma2_hidden_states=gemma2_hidden_states, - input_ids=input_ids, timesteps=timesteps, gemma2_attn_mask=gemma2_attn_mask, ) + # May not need to pack/unpack? 
# unpack latents - model_pred = lumina_util.unpack_latents( - model_pred, packed_latent_height, packed_latent_width - ) + # model_pred = lumina_util.unpack_latents( + # model_pred, packed_latent_height, packed_latent_width + # ) # apply model prediction type model_pred, weighting = flux_train_utils.apply_model_prediction_type( diff --git a/networks/lora_lumina.py b/networks/lora_lumina.py index d554ce13d..3f6c9b417 100644 --- a/networks/lora_lumina.py +++ b/networks/lora_lumina.py @@ -462,7 +462,7 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, lumina, wei class LoRANetwork(torch.nn.Module): LUMINA_TARGET_REPLACE_MODULE = ["JointTransformerBlock"] - TEXT_ENCODER_TARGET_REPLACE_MODULE = ["GemmaAttention", "GemmaDecoderLayer", "GemmaMLP"] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["Gemma2Attention", "Gemma2MLP"] LORA_PREFIX_LUMINA = "lora_unet" LORA_PREFIX_TEXT_ENCODER = "lora_te" # Simplified prefix since we only have one text encoder @@ -533,7 +533,7 @@ def create_modules( filter: Optional[str] = None, default_dim: Optional[int] = None, ) -> List[LoRAModule]: - prefix = self.LORA_PREFIX_FLUX if is_lumina else self.LORA_PREFIX_TEXT_ENCODER + prefix = self.LORA_PREFIX_LUMINA if is_lumina else self.LORA_PREFIX_TEXT_ENCODER loras = [] skipped = [] @@ -611,7 +611,7 @@ def create_modules( skipped_te = [] logger.info(f"create LoRA for Gemma2 Text Encoder:") - text_encoder_loras, skipped = create_modules(False, text_encoders, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) + text_encoder_loras, skipped = create_modules(False, text_encoders[0], LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) logger.info(f"create LoRA for Gemma2 Text Encoder: {len(text_encoder_loras)} modules.") self.text_encoder_loras.extend(text_encoder_loras) skipped_te += skipped @@ -718,10 +718,10 @@ def load_state_dict(self, state_dict, strict=True): def state_dict(self, destination=None, prefix="", keep_vars=False): if not self.split_qkv: - return super().state_dict(destination, prefix, keep_vars) + return super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) # merge qkv - state_dict = super().state_dict(destination, prefix, keep_vars) + state_dict = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) new_state_dict = {} for key in list(state_dict.keys()): if "double" in key and "qkv" in key: From 16015635d24cad3d8e2149907c24715ea0a37d4f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 16 Feb 2025 01:36:29 -0500 Subject: [PATCH 342/582] Update metadata.resolution for Lumina 2 --- library/sai_model_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index 1e97c9cd2..f5343924a 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -237,7 +237,7 @@ def build_metadata( reso = (reso[0], reso[0]) else: # resolution is defined in dataset, so use default - if sdxl or sd3 is not None or flux is not None: + if sdxl or sd3 is not None or flux is not None or lumina is not None: reso = 1024 elif v2 and v_parameterization: reso = 768 From 4671e237781dcfe9a16e90f5343afd57586a1df6 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 16 Feb 2025 01:42:44 -0500 Subject: [PATCH 343/582] Fix validation epoch loss to check epoch average --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index b5f92e06b..674f1cb66 100644 --- a/train_network.py +++ b/train_network.py @@ -1498,7 +1498,7 @@ def 
remove_model(old_ckpt_name): if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average - loss_validation_divergence = val_epoch_loss_recorder.moving_average - avr_loss + loss_validation_divergence = val_epoch_loss_recorder.moving_average - loss_recorder.moving_average logs = { "loss/validation/epoch_average": avr_loss, "loss/validation/epoch_divergence": loss_validation_divergence, From 3c7496ae3f2736a8283a881f49698d3e8f3a4291 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 16 Feb 2025 22:18:14 -0500 Subject: [PATCH 344/582] Fix sizes for validation split --- library/train_util.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 37ed0a994..6c782ea1c 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -148,10 +148,11 @@ def split_train_val( paths: List[str], + sizes: List[Optional[Tuple[int, int]]], is_training_dataset: bool, validation_split: float, validation_seed: int | None -) -> List[str]: +) -> Tuple[List[str], List[Optional[Tuple[int, int]]]]: """ Split the dataset into train and validation @@ -172,10 +173,12 @@ def split_train_val( # Split the dataset between training and validation if is_training_dataset: # Training dataset we split to the first part - return paths[0:math.ceil(len(paths) * (1 - validation_split))] + split = math.ceil(len(paths) * (1 - validation_split)) + return paths[0:split], sizes[0:split] else: # Validation dataset we split to the second part - return paths[len(paths) - round(len(paths) * validation_split):] + split = len(paths) - round(len(paths) * validation_split) + return paths[split:], sizes[split:] class ImageInfo: @@ -1931,12 +1934,12 @@ def load_dreambooth_dir(subset: DreamBoothSubset): with open(info_cache_file, "r", encoding="utf-8") as f: metas = json.load(f) img_paths = list(metas.keys()) - sizes = [meta["resolution"] for meta in metas.values()] + sizes: List[Optional[Tuple[int, int]]] = [meta["resolution"] for meta in metas.values()] # we may need to check image size and existence of image files, but it takes time, so user should check it before training else: img_paths = glob_images(subset.image_dir, "*") - sizes = [None] * len(img_paths) + sizes: List[Optional[Tuple[int, int]]] = [None] * len(img_paths) # new caching: get image size from cache files strategy = LatentsCachingStrategy.get_strategy() @@ -1969,7 +1972,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): w, h = None, None if w is not None and h is not None: - sizes[i] = [w, h] + sizes[i] = (w, h) size_set_count += 1 logger.info(f"set image size from cache files: {size_set_count}/{len(img_paths)}") @@ -1990,8 +1993,9 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # Otherwise the img_paths remain as original img_paths and no split # required for training images dataset of regularization images else: - img_paths = split_train_val( + img_paths, sizes = split_train_val( img_paths, + sizes, self.is_training_dataset, self.validation_split, self.validation_seed From f3a010978c0e4b88c4839b3a81400b8973f52158 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 16 Feb 2025 22:28:34 -0500 Subject: [PATCH 345/582] Clear sizes for validation reg images to be consistent --- library/train_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/library/train_util.py b/library/train_util.py index 6c782ea1c..39b4af856 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1990,6 +1990,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # Skip any validation 
dataset for regularization images if self.is_training_dataset is False: img_paths = [] + sizes = [] # Otherwise the img_paths remain as original img_paths and no split # required for training images dataset of regularization images else: From 733fdc09c63eb2830081c6b531bf1115075c0f7b Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Mon, 17 Feb 2025 14:52:48 +0800 Subject: [PATCH 346/582] update --- library/lumina_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index 43b1e9c64..4daa63428 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -21,7 +21,7 @@ try: from apex.normalization import FusedRMSNorm as RMSNorm -except ImportError: +except: warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation") memory_efficient_attention = None From aa36c48685bad4bcc0fc341fdd516f0ee5c2cf01 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Mon, 17 Feb 2025 19:00:18 +0800 Subject: [PATCH 347/582] update for always use gemma2 mask --- library/lumina_train_util.py | 7 +-- library/strategy_lumina.py | 52 +++++++++------------- lumina_train_network.py | 83 +++++++++++++++++++++--------------- 3 files changed, 68 insertions(+), 74 deletions(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index d3edd262c..7ade6c1bc 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -227,7 +227,7 @@ def sample_image_inference( ) timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) img_ids = lumina_util.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype) - gemma2_attn_mask = gemma2_attn_mask.to(accelerator.device) if args.apply_gemma2_attn_mask else None + gemma2_attn_mask = gemma2_attn_mask.to(accelerator.device) # if controlnet_image is not None: # controlnet_image = Image.open(controlnet_image).convert("RGB") @@ -511,11 +511,6 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): help="maximum token length for Gemma2. 
if omitted, 256 for schnell and 512 for dev" " / Gemma2の最大トークン長。省略された場合、schnellの場合は256、devの場合は512", ) - parser.add_argument( - "--apply_gemma2_attn_mask", - action="store_true", - help="apply attention mask to Gemma2 encode and NextDIT double blocks / Gemma2エンコードとNextDITダブルブロックにアテンションマスクを適用する", - ) parser.add_argument( "--guidance_scale", diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index 6feea387e..209f62a05 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -47,7 +47,7 @@ def tokenize(self, text: Union[str, List[str]]) -> Tuple[torch.Tensor, torch.Ten pad_to_multiple_of=8, truncation=True, ) - return encodings.input_ids, encodings.attention_mask + return [encodings.input_ids, encodings.attention_mask] def tokenize_with_weights( self, text: str | List[str] @@ -59,47 +59,36 @@ def tokenize_with_weights( class LuminaTextEncodingStrategy(TextEncodingStrategy): - def __init__(self, apply_gemma2_attn_mask: Optional[bool] = None) -> None: + def __init__(self) -> None: super().__init__() - self.apply_gemma2_attn_mask = apply_gemma2_attn_mask def encode_tokens( self, tokenize_strategy: TokenizeStrategy, models: List[Any], - tokens: torch.Tensor, - attention_masks: torch.Tensor, - apply_gemma2_attn_mask: Optional[bool] = None, - ) -> torch.Tensor: - if apply_gemma2_attn_mask is None: - apply_gemma2_attn_mask = self.apply_gemma2_attn_mask - + tokens: List[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: text_encoder = models[0] - - # Create position IDs - position_ids = attention_masks.cumsum(-1) - 1 - position_ids.masked_fill_(attention_masks == 0, 1) + input_ids, attention_masks = tokens outputs = text_encoder( - input_ids=tokens.to(text_encoder.device), - attention_mask=attention_masks.to(text_encoder.device) if apply_gemma2_attn_mask else None, - position_ids=position_ids.to(text_encoder.device), + input_ids=input_ids.to(text_encoder.device), + attention_mask=attention_masks.to(text_encoder.device), output_hidden_states=True, return_dict=True, ) - return outputs.hidden_states[-2] + return outputs.hidden_states[-2], input_ids, attention_masks def encode_tokens_with_weights( self, tokenize_strategy: TokenizeStrategy, models: List[Any], - tokens: torch.Tensor, + tokens: List[torch.Tensor], weights_list: List[torch.Tensor], - attention_masks: torch.Tensor - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # For simplicity, use uniform weighting - return self.encode_tokens(tokenize_strategy, models, tokens, attention_masks) + return self.encode_tokens(tokenize_strategy, models, tokens) class LuminaTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): @@ -111,7 +100,6 @@ def __init__( batch_size: int, skip_disk_cache_validity_check: bool, is_partial: bool = False, - apply_gemma2_attn_mask: bool = False, ) -> None: super().__init__( cache_to_disk, @@ -119,7 +107,6 @@ def __init__( skip_disk_cache_validity_check, is_partial, ) - self.apply_gemma2_attn_mask = apply_gemma2_attn_mask def get_outputs_npz_path(self, image_abs_path: str) -> str: return ( @@ -146,7 +133,7 @@ def is_disk_cached_outputs_expected(self, npz_path: str): if "apply_gemma2_attn_mask" not in npz: return False npz_apply_gemma2_attn_mask = npz["apply_gemma2_attn_mask"] - if npz_apply_gemma2_attn_mask != self.apply_gemma2_attn_mask: + if not npz_apply_gemma2_attn_mask: return False except Exception as e: logger.error(f"Error loading file: {npz_path}") @@ -174,18 +161,18 @@ def cache_batch_outputs( captions = [info.caption for 
info in infos] if self.is_weighted: - tokens, attention_masks, weights_list = tokenize_strategy.tokenize_with_weights( + tokens, weights_list = tokenize_strategy.tokenize_with_weights( captions ) with torch.no_grad(): - hidden_state = lumina_text_encoding_strategy.encode_tokens_with_weights( - tokenize_strategy, models, tokens, weights_list, attention_masks + hidden_state, input_ids, attention_masks = lumina_text_encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, models, tokens, weights_list ) else: - tokens, attention_masks = tokenize_strategy.tokenize(captions) + tokens = tokenize_strategy.tokenize(captions) with torch.no_grad(): - hidden_state = lumina_text_encoding_strategy.encode_tokens( - tokenize_strategy, models, tokens, attention_masks + hidden_state, input_ids, attention_masks = lumina_text_encoding_strategy.encode_tokens( + tokenize_strategy, models, tokens ) if hidden_state.dtype != torch.float32: @@ -200,7 +187,6 @@ def cache_batch_outputs( hidden_state_i = hidden_state[i] attention_mask_i = attention_mask[i] input_ids_i = input_ids[i] - apply_gemma2_attn_mask_i = self.apply_gemma2_attn_mask if self.cache_to_disk: np.savez( @@ -208,7 +194,7 @@ def cache_batch_outputs( hidden_state=hidden_state_i, attention_mask=attention_mask_i, input_ids=input_ids_i, - apply_gemma2_attn_mask=apply_gemma2_attn_mask_i, + apply_gemma2_attn_mask=True ) else: info.text_encoder_outputs = [hidden_state_i, attention_mask_i, input_ids_i] diff --git a/lumina_train_network.py b/lumina_train_network.py index 3d0c70629..00c81bceb 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -64,7 +64,11 @@ def load_target_model(self, args, weight_dtype, accelerator): if args.fp8_base: # check dtype of model - if model.dtype == torch.float8_e4m3fnuz or model.dtype == torch.float8_e5m2 or model.dtype == torch.float8_e5m2fnuz: + if ( + model.dtype == torch.float8_e4m3fnuz + or model.dtype == torch.float8_e5m2 + or model.dtype == torch.float8_e5m2fnuz + ): raise ValueError(f"Unsupported fp8 model dtype: {model.dtype}") elif model.dtype == torch.float8_e4m3fn: logger.info("Loaded fp8 Lumina 2 model") @@ -80,13 +84,9 @@ def load_target_model(self, args, weight_dtype, accelerator): # model.enable_block_swap(args.blocks_to_swap, accelerator.device) # self.is_swapping_blocks = True - gemma2 = lumina_util.load_gemma2( - args.gemma2, weight_dtype, "cpu" - ) + gemma2 = lumina_util.load_gemma2(args.gemma2, weight_dtype, "cpu") gemma2.eval() - ae = lumina_util.load_ae( - args.ae, weight_dtype, "cpu" - ) + ae = lumina_util.load_ae(args.ae, weight_dtype, "cpu") return lumina_util.MODEL_VERSION_LUMINA_V2, [gemma2], ae, model @@ -104,7 +104,7 @@ def get_latents_caching_strategy(self, args): ) def get_text_encoding_strategy(self, args): - return strategy_lumina.LuminaTextEncodingStrategy(args.apply_gemma2_attn_mask) + return strategy_lumina.LuminaTextEncodingStrategy() def get_text_encoders_train_flags(self, args, text_encoders): return [self.train_gemma2] @@ -117,7 +117,6 @@ def get_text_encoder_outputs_caching_strategy(self, args): args.text_encoder_batch_size, args.skip_cache_check, is_partial=self.train_gemma2, - apply_gemma2_attn_mask=args.apply_gemma2_attn_mask, ) else: return None @@ -144,11 +143,15 @@ def cache_text_encoder_outputs_if_needed( # When TE is not be trained, it will not be prepared so we need to use explicit autocast logger.info("move text encoders to gpu") - text_encoders[0].to(accelerator.device, dtype=weight_dtype) # always not fp8 + text_encoders[0].to( + accelerator.device, 
dtype=weight_dtype + ) # always not fp8 if text_encoders[0].dtype == torch.float8_e4m3fn: # if we load fp8 weights, the model is already fp8, so we use it as is - self.prepare_text_encoder_fp8(1, text_encoders[1], text_encoders[1].dtype, weight_dtype) + self.prepare_text_encoder_fp8( + 1, text_encoders[1], text_encoders[1].dtype, weight_dtype + ) else: # otherwise, we need to convert it to target dtype text_encoders[0].to(weight_dtype) @@ -158,21 +161,39 @@ def cache_text_encoder_outputs_if_needed( # cache sample prompts if args.sample_prompts is not None: - logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") + logger.info( + f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}" + ) - tokenize_strategy: strategy_lumina.LuminaTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy() - text_encoding_strategy: strategy_lumina.LuminaTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy() + tokenize_strategy: strategy_lumina.LuminaTokenizeStrategy = ( + strategy_base.TokenizeStrategy.get_strategy() + ) + text_encoding_strategy: strategy_lumina.LuminaTextEncodingStrategy = ( + strategy_base.TextEncodingStrategy.get_strategy() + ) prompts = train_util.load_prompts(args.sample_prompts) - sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + sample_prompts_te_outputs = ( + {} + ) # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): for prompt_dict in prompts: - for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: + for p in [ + prompt_dict.get("prompt", ""), + prompt_dict.get("negative_prompt", ""), + ]: if p not in sample_prompts_te_outputs: - logger.info(f"cache Text Encoder outputs for prompt: {p}") + logger.info( + f"cache Text Encoder outputs for prompt: {p}" + ) tokens_and_masks = tokenize_strategy.tokenize(p) - sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( - tokenize_strategy, text_encoders, tokens_and_masks, args.apply_t5_attn_mask + sample_prompts_te_outputs[p] = ( + text_encoding_strategy.encode_tokens( + tokenize_strategy, + text_encoders, + tokens_and_masks, + args.apply_t5_attn_mask, + ) ) self.sample_prompts_te_outputs = sample_prompts_te_outputs @@ -261,10 +282,6 @@ def get_noise_pred_and_target( # May not need to pack/unpack? 
# pack latents and get img_ids - 这部分可以保留因为NextDiT也需要packed格式的输入 # packed_noisy_model_input = lumina_util.pack_latents(noisy_model_input) - # packed_latent_height, packed_latent_width = ( - # noisy_model_input.shape[2] // 2, - # noisy_model_input.shape[3] // 2, - # ) # ensure the hidden state will require grad if args.gradient_checkpointing: @@ -274,16 +291,18 @@ def get_noise_pred_and_target( t.requires_grad_(True) # Unpack Gemma2 outputs - gemma2_hidden_states, gemma2_attn_mask, input_ids = text_encoder_conds + gemma2_hidden_states, input_ids, gemma2_attn_mask = text_encoder_conds def call_dit(img, gemma2_hidden_states, timesteps, gemma2_attn_mask): with torch.set_grad_enabled(is_train), accelerator.autocast(): # NextDiT forward expects (x, t, cap_feats, cap_mask) model_pred = unet( - x=img, # image latents (B, C, H, W) + x=img, # image latents (B, C, H, W) t=timesteps / 1000, # timesteps需要除以1000来匹配模型预期 cap_feats=gemma2_hidden_states, # Gemma2的hidden states作为caption features - cap_mask=gemma2_attn_mask.to(dtype=torch.int32), # Gemma2的attention mask + cap_mask=gemma2_attn_mask.to( + dtype=torch.int32 + ), # Gemma2的attention mask ) return model_pred @@ -326,13 +345,8 @@ def call_dit(img, gemma2_hidden_states, timesteps, gemma2_attn_mask): gemma2_hidden_states=gemma2_hidden_states[ diff_output_pr_indices ], - input_ids=input_ids[diff_output_pr_indices], timesteps=timesteps[diff_output_pr_indices], - gemma2_attn_mask=( - gemma2_attn_mask[diff_output_pr_indices] - if gemma2_attn_mask is not None - else None - ), + gemma2_attn_mask=(gemma2_attn_mask[diff_output_pr_indices]), ) network.set_multiplier(1.0) @@ -358,7 +372,6 @@ def get_sai_model_spec(self, args): ) def update_metadata(self, metadata, args): - metadata["ss_apply_gemma2_attn_mask"] = args.apply_gemma2_attn_mask metadata["ss_weighting_scheme"] = args.weighting_scheme metadata["ss_logit_mean"] = args.logit_mean metadata["ss_logit_std"] = args.logit_std @@ -373,7 +386,7 @@ def is_text_encoder_not_needed_for_training(self, args): return args.cache_text_encoder_outputs and not self.is_train_text_encoder(args) def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): - text_encoder.model.embed_tokens.requires_grad_(True) + text_encoder.embed_tokens.requires_grad_(True) def prepare_text_encoder_fp8( self, index, text_encoder, te_weight_dtype, weight_dtype @@ -382,7 +395,7 @@ def prepare_text_encoder_fp8( f"prepare Gemma2 for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}" ) text_encoder.to(te_weight_dtype) # fp8 - text_encoder.model.embed_tokens.to(dtype=weight_dtype) + text_encoder.embed_tokens.to(dtype=weight_dtype) def prepare_unet_with_accelerator( self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module From 44782dd7905d56fedfcb4cf8e51d162d2f2d3e23 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 14 Feb 2025 11:14:38 -0500 Subject: [PATCH 348/582] Fix validation epoch divergence --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index c3879531d..b5f92e06b 100644 --- a/train_network.py +++ b/train_network.py @@ -1498,7 +1498,7 @@ def remove_model(old_ckpt_name): if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average - loss_validation_divergence = val_step_loss_recorder.moving_average - avr_loss + loss_validation_divergence = val_epoch_loss_recorder.moving_average - avr_loss logs = { "loss/validation/epoch_average": avr_loss, "loss/validation/epoch_divergence": loss_validation_divergence, 
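Taken together with the later epoch-average fix in this series, the intended epoch-end validation logging
looks roughly like the following sketch (a reconstruction from these hunks rather than an exact excerpt;
it assumes loss_recorder tracks the training loss and val_epoch_loss_recorder the validation loss):

    if is_tracking:
        avr_loss: float = val_epoch_loss_recorder.moving_average
        # divergence compares validation against training, not validation against itself
        loss_validation_divergence = val_epoch_loss_recorder.moving_average - loss_recorder.moving_average
        logs = {
            "loss/validation/epoch_average": avr_loss,
            "loss/validation/epoch_divergence": loss_validation_divergence,
        }
        accelerator.log(logs, step=epoch + 1)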
From 3365cfadd7af64c6468210f98801396ffeb4873f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 16 Feb 2025 22:18:14 -0500 Subject: [PATCH 349/582] Fix sizes for validation split --- library/train_util.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 34ffe22b1..f9fe317f6 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -148,10 +148,11 @@ def split_train_val( paths: List[str], + sizes: List[Optional[Tuple[int, int]]], is_training_dataset: bool, validation_split: float, validation_seed: int | None -) -> List[str]: +) -> Tuple[List[str], List[Optional[Tuple[int, int]]]]: """ Split the dataset into train and validation @@ -172,10 +173,12 @@ def split_train_val( # Split the dataset between training and validation if is_training_dataset: # Training dataset we split to the first part - return paths[0:math.ceil(len(paths) * (1 - validation_split))] + split = math.ceil(len(paths) * (1 - validation_split)) + return paths[0:split], sizes[0:split] else: # Validation dataset we split to the second part - return paths[len(paths) - round(len(paths) * validation_split):] + split = len(paths) - round(len(paths) * validation_split) + return paths[split:], sizes[split:] class ImageInfo: @@ -1931,12 +1934,12 @@ def load_dreambooth_dir(subset: DreamBoothSubset): with open(info_cache_file, "r", encoding="utf-8") as f: metas = json.load(f) img_paths = list(metas.keys()) - sizes = [meta["resolution"] for meta in metas.values()] + sizes: List[Optional[Tuple[int, int]]] = [meta["resolution"] for meta in metas.values()] # we may need to check image size and existence of image files, but it takes time, so user should check it before training else: img_paths = glob_images(subset.image_dir, "*") - sizes = [None] * len(img_paths) + sizes: List[Optional[Tuple[int, int]]] = [None] * len(img_paths) # new caching: get image size from cache files strategy = LatentsCachingStrategy.get_strategy() @@ -1969,7 +1972,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): w, h = None, None if w is not None and h is not None: - sizes[i] = [w, h] + sizes[i] = (w, h) size_set_count += 1 logger.info(f"set image size from cache files: {size_set_count}/{len(img_paths)}") @@ -1990,8 +1993,9 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # Otherwise the img_paths remain as original img_paths and no split # required for training images dataset of regularization images else: - img_paths = split_train_val( + img_paths, sizes = split_train_val( img_paths, + sizes, self.is_training_dataset, self.validation_split, self.validation_seed From 3ed7606f8840c166c3d7b8e6daa170070c749b0b Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 16 Feb 2025 22:28:34 -0500 Subject: [PATCH 350/582] Clear sizes for validation reg images to be consistent --- library/train_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/library/train_util.py b/library/train_util.py index f9fe317f6..4eccc4a0b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1990,6 +1990,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # Skip any validation dataset for regularization images if self.is_training_dataset is False: img_paths = [] + sizes = [] # Otherwise the img_paths remain as original img_paths and no split # required for training images dataset of regularization images else: From 1aa2f00e85cf7802007a394e28d52014c776df48 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 16 Feb 2025 01:42:44 -0500 Subject: [PATCH 351/582] Fix 
validation epoch loss to check epoch average --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index b5f92e06b..674f1cb66 100644 --- a/train_network.py +++ b/train_network.py @@ -1498,7 +1498,7 @@ def remove_model(old_ckpt_name): if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average - loss_validation_divergence = val_epoch_loss_recorder.moving_average - avr_loss + loss_validation_divergence = val_epoch_loss_recorder.moving_average - loss_recorder.moving_average logs = { "loss/validation/epoch_average": avr_loss, "loss/validation/epoch_divergence": loss_validation_divergence, From 9436b410617f22716eac64f7c604c8f53fa8c1a8 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 17 Feb 2025 14:28:41 -0500 Subject: [PATCH 352/582] Fix validation split and add test --- library/train_util.py | 8 ++++++-- tests/test_validation.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 tests/test_validation.py diff --git a/library/train_util.py b/library/train_util.py index 39b4af856..b23290663 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -161,15 +161,19 @@ def split_train_val( [0:80] = 80 training images [80:] = 20 validation images """ + dataset = list(zip(paths, sizes)) if validation_seed is not None: logging.info(f"Using validation seed: {validation_seed}") prevstate = random.getstate() random.seed(validation_seed) - random.shuffle(paths) + random.shuffle(dataset) random.setstate(prevstate) else: - random.shuffle(paths) + random.shuffle(dataset) + paths, sizes = zip(*dataset) + paths = list(paths) + sizes = list(sizes) # Split the dataset between training and validation if is_training_dataset: # Training dataset we split to the first part diff --git a/tests/test_validation.py b/tests/test_validation.py new file mode 100644 index 000000000..f80686d8c --- /dev/null +++ b/tests/test_validation.py @@ -0,0 +1,17 @@ +from library.train_util import split_train_val + + +def test_split_train_val(): + paths = ["path1", "path2", "path3", "path4", "path5", "path6", "path7"] + sizes = [(1, 1), (2, 2), None, (4, 4), (5, 5), (6, 6), None] + result_paths, result_sizes = split_train_val(paths, sizes, True, 0.2, 1234) + assert result_paths == ["path2", "path3", "path6", "path5", "path1", "path4"], result_paths + assert result_sizes == [(2, 2), None, (6, 6), (5, 5), (1, 1), (4, 4)], result_sizes + + result_paths, result_sizes = split_train_val(paths, sizes, False, 0.2, 1234) + assert result_paths == ["path7"], result_paths + assert result_sizes == [None], result_sizes + + +if __name__ == "__main__": + test_split_train_val() From 98efbc3bb784d9246a70575349b309faef9e2ecf Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 18 Feb 2025 00:58:53 -0500 Subject: [PATCH 353/582] Add documentation to model, use SDPA attention, sample images --- library/lumina_models.py | 407 ++++++++++++++++++++-------------- library/lumina_train_util.py | 413 ++++++++++++++++++++++++++--------- library/lumina_util.py | 47 +++- library/strategy_lumina.py | 16 +- lumina_train_network.py | 71 +++--- 5 files changed, 632 insertions(+), 322 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index 27194e2f5..e82f3b2c7 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -13,6 +13,7 @@ from typing import List, Optional, Tuple from dataclasses import dataclass +from einops import rearrange from flash_attn import flash_attn_varlen_func from 
flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa import torch @@ -23,24 +24,16 @@ try: from apex.normalization import FusedRMSNorm as RMSNorm -except ModuleNotFoundError: +except: import warnings - warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation") -memory_efficient_attention = None -try: - import xformers -except: - pass + warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation") -try: - from xformers.ops import memory_efficient_attention -except: - memory_efficient_attention = None @dataclass class LuminaParams: """Parameters for Lumina model configuration""" + patch_size: int = 2 in_channels: int = 4 dim: int = 4096 @@ -68,7 +61,7 @@ def get_2b_config(cls) -> "LuminaParams": """Returns the configuration for the 2B parameter model""" return cls( patch_size=2, - in_channels=16, + in_channels=16, # VAE channels dim=2304, n_layers=26, n_heads=24, @@ -76,21 +69,13 @@ def get_2b_config(cls) -> "LuminaParams": axes_dims=[32, 32, 32], axes_lens=[300, 512, 512], qk_norm=True, - cap_feat_dim=2304 + cap_feat_dim=2304, # Gemma 2 hidden_size ) @classmethod def get_7b_config(cls) -> "LuminaParams": """Returns the configuration for the 7B parameter model""" - return cls( - patch_size=2, - dim=4096, - n_layers=32, - n_heads=32, - n_kv_heads=8, - axes_dims=[64, 64, 64], - axes_lens=[300, 512, 512] - ) + return cls(patch_size=2, dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, axes_dims=[64, 64, 64], axes_lens=[300, 512, 512]) class GradientCheckpointMixin(nn.Module): @@ -112,6 +97,7 @@ def forward(self, *args, **kwargs): else: return self._forward(*args, **kwargs) + ############################################################################# # RMSNorm # ############################################################################# @@ -148,9 +134,18 @@ def _norm(self, x) -> Tensor: """ return x * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps) - def forward(self, x: Tensor): + """ + Apply RMSNorm to the input tensor. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The normalized tensor. + """ x_dtype = x.dtype + # To handle float8 we need to convert the tensor to float x = x.float() rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) return ((x * rrms) * self.weight.float()).to(dtype=x_dtype) @@ -204,17 +199,11 @@ def timestep_embedding(t, dim, max_period=10000): """ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py half = dim // 2 - freqs = torch.exp( - -math.log(max_period) - * torch.arange(start=0, end=half, dtype=torch.float32) - / half - ).to(device=t.device) + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(device=t.device) args = t[:, None].float() * freqs[None] embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) if dim % 2: - embedding = torch.cat( - [embedding, torch.zeros_like(embedding[:, :1])], dim=-1 - ) + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) return embedding def _forward(self, t): @@ -222,6 +211,7 @@ def _forward(self, t): t_emb = self.mlp(t_freq.to(self.mlp[0].weight.dtype)) return t_emb + def to_cuda(x): if isinstance(x, torch.Tensor): return x.cuda() @@ -266,6 +256,7 @@ def __init__( dim (int): Number of input dimensions. n_heads (int): Number of heads. n_kv_heads (Optional[int]): Number of kv heads, if using GQA. + qk_norm (bool): Whether to use normalization for queries and keys. 
""" super().__init__() @@ -295,6 +286,14 @@ def __init__( else: self.q_norm = self.k_norm = nn.Identity() + self.flash_attn = False + + # self.attention_processor = xformers.ops.memory_efficient_attention + self.attention_processor = F.scaled_dot_product_attention + + def set_attention_processor(self, attention_processor): + self.attention_processor = attention_processor + @staticmethod def apply_rotary_emb( x_in: torch.Tensor, @@ -326,16 +325,12 @@ def apply_rotary_emb( return x_out.type_as(x_in) # copied from huggingface modeling_llama.py - def _upad_input( - self, query_layer, key_layer, value_layer, attention_mask, query_length - ): + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): def _get_unpad_data(attention_mask): seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad( - torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0) - ) + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) return ( indices, cu_seqlens, @@ -355,9 +350,7 @@ def _get_unpad_data(attention_mask): ) if query_length == kv_seq_len: query_layer = index_first_axis( - query_layer.reshape( - batch_size * kv_seq_len, self.n_local_heads, head_dim - ), + query_layer.reshape(batch_size * kv_seq_len, self.n_local_heads, head_dim), indices_k, ) cu_seqlens_q = cu_seqlens_k @@ -373,9 +366,7 @@ def _get_unpad_data(attention_mask): else: # The -q_len: slice assumes left padding. attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( - query_layer, attention_mask - ) + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) return ( query_layer, @@ -388,10 +379,10 @@ def _get_unpad_data(attention_mask): def forward( self, - x: torch.Tensor, - x_mask: torch.Tensor, - freqs_cis: torch.Tensor, - ) -> torch.Tensor: + x: Tensor, + x_mask: Tensor, + freqs_cis: Tensor, + ) -> Tensor: """ Args: @@ -425,7 +416,7 @@ def forward( softmax_scale = math.sqrt(1 / self.head_dim) - if dtype in [torch.float16, torch.bfloat16]: + if self.flash_attn: # begin var_len flash attn ( query_states, @@ -459,14 +450,13 @@ def forward( if n_rep >= 1: xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) + output = ( - F.scaled_dot_product_attention( + self.attention_processor( xq.permute(0, 2, 1, 3), xk.permute(0, 2, 1, 3), xv.permute(0, 2, 1, 3), - attn_mask=x_mask.bool() - .view(bsz, 1, 1, seqlen) - .expand(-1, self.n_local_heads, seqlen, -1), + attn_mask=x_mask.bool().view(bsz, 1, 1, seqlen).expand(-1, self.n_local_heads, seqlen, -1), scale=softmax_scale, ) .permute(0, 2, 1, 3) @@ -474,10 +464,47 @@ def forward( ) output = output.flatten(-2) - return self.out(output) +def attention(q: Tensor, k: Tensor, v: Tensor, attn_mask: Optional[Tensor] = None) -> Tensor: + x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) + x = rearrange(x, "B H L D -> B L (H D)") + + return x + + +def apply_rope( + x_in: torch.Tensor, + freqs_cis: torch.Tensor, +) -> torch.Tensor: + """ + Apply rotary embeddings to input tensors using the given frequency + tensor. + + This function applies rotary embeddings to the given query 'xq' and + key 'xk' tensors using the provided frequency tensor 'freqs_cis'. 
The + input tensors are reshaped as complex numbers, and the frequency tensor + is reshaped for broadcasting compatibility. The resulting tensors + contain rotary embeddings and are returned as real tensors. + + Args: + x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings. + freqs_cis (torch.Tensor): Precomputed frequency tensor for complex + exponentials. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor + and key tensor with rotary embeddings. + """ + with torch.autocast("cuda", enabled=False): + x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2)) + freqs_cis = freqs_cis.unsqueeze(2) + x_out = torch.view_as_real(x * freqs_cis).flatten(3) + + return x_out.type_as(x_in) + + class FeedForward(nn.Module): def __init__( self, @@ -554,10 +581,13 @@ def __init__( n_kv_heads (Optional[int]): Number of attention heads in key and value features (if using GQA), or set to None for the same as query. - multiple_of (int): - ffn_dim_multiplier (Optional[float]): - norm_eps (float): - + multiple_of (int): Number of multiple of the hidden dimension. + ffn_dim_multiplier (Optional[float]): Dimension multiplier for the + feedforward layer. + norm_eps (float): Epsilon value for normalization. + qk_norm (bool): Whether to use normalization for queries and keys. + modulation (bool): Whether to use modulation for the attention + layer. """ super().__init__() self.dim = dim @@ -593,32 +623,30 @@ def _forward( self, x: torch.Tensor, x_mask: torch.Tensor, - freqs_cis: torch.Tensor, + pe: torch.Tensor, adaln_input: Optional[torch.Tensor] = None, ): """ Perform a forward pass through the TransformerBlock. Args: - x (torch.Tensor): Input tensor. - freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies. + x (Tensor): Input tensor. + pe (Tensor): Rope position embedding. Returns: - torch.Tensor: Output tensor after applying attention and + Tensor: Output tensor after applying attention and feedforward layers. """ if self.modulation: assert adaln_input is not None - scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation( - adaln_input - ).chunk(4, dim=1) + scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1) x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2( self.attention( modulate(self.attention_norm1(x), scale_msa), x_mask, - freqs_cis, + pe, ) ) x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2( @@ -632,7 +660,7 @@ def _forward( self.attention( self.attention_norm1(x), x_mask, - freqs_cis, + pe, ) ) x = x + self.ffn_norm2( @@ -649,6 +677,14 @@ class FinalLayer(GradientCheckpointMixin): """ def __init__(self, hidden_size, patch_size, out_channels): + """ + Initialize the FinalLayer. + + Args: + hidden_size (int): Hidden size of the input features. + patch_size (int): Patch size of the input features. + out_channels (int): Number of output channels. 
+ """ super().__init__() self.norm_final = nn.LayerNorm( hidden_size, @@ -682,39 +718,21 @@ def forward(self, x, c): class RopeEmbedder: - def __init__( - self, - theta: float = 10000.0, - axes_dims: List[int] = (16, 56, 56), - axes_lens: List[int] = (1, 512, 512), - ): + def __init__(self, theta: float = 10000.0, axes_dims: List[int] = [16, 56, 56], axes_lens: List[int] = [1, 512, 512]): super().__init__() self.theta = theta self.axes_dims = axes_dims self.axes_lens = axes_lens - self.freqs_cis = NextDiT.precompute_freqs_cis( - self.axes_dims, self.axes_lens, theta=self.theta - ) + self.freqs_cis = NextDiT.precompute_freqs_cis(self.axes_dims, self.axes_lens, theta=self.theta) - def get_freqs_cis(self, ids: torch.Tensor): + def __call__(self, ids: torch.Tensor): + device = ids.device self.freqs_cis = [freqs_cis.to(ids.device) for freqs_cis in self.freqs_cis] result = [] for i in range(len(self.axes_dims)): - index = ( - ids[:, :, i : i + 1] - .repeat(1, 1, self.freqs_cis[i].shape[-1]) - .to(torch.int64) - ) - - axes = self.freqs_cis[i].unsqueeze(0).repeat(index.shape[0], 1, 1) - - result.append( - torch.gather( - axes, - dim=1, - index=index, - ) - ) + freqs = self.freqs_cis[i].to(ids.device) + index = ids[:, :, i : i + 1].repeat(1, 1, freqs.shape[-1]).to(torch.int64) + result.append(torch.gather(freqs.unsqueeze(0).repeat(index.shape[0], 1, 1), dim=1, index=index)) return torch.cat(result, dim=-1) @@ -740,20 +758,47 @@ def __init__( axes_dims: List[int] = [16, 56, 56], axes_lens: List[int] = [1, 512, 512], ) -> None: + """ + Initialize the NextDiT model. + + Args: + patch_size (int): Patch size of the input features. + in_channels (int): Number of input channels. + dim (int): Hidden size of the input features. + n_layers (int): Number of Transformer layers. + n_refiner_layers (int): Number of refiner layers. + n_heads (int): Number of attention heads. + n_kv_heads (Optional[int]): Number of attention heads in key and + value features (if using GQA), or set to None for the same as + query. + multiple_of (int): Multiple of the hidden size. + ffn_dim_multiplier (Optional[float]): Dimension multiplier for the + feedforward layer. + norm_eps (float): Epsilon value for normalization. + qk_norm (bool): Whether to use query key normalization. + cap_feat_dim (int): Dimension of the caption features. + axes_dims (List[int]): List of dimensions for the axes. + axes_lens (List[int]): List of lengths for the axes. 
+ + Returns: + None + """ super().__init__() self.in_channels = in_channels self.out_channels = in_channels self.patch_size = patch_size - self.x_embedder = nn.Linear( - in_features=patch_size * patch_size * in_channels, - out_features=dim, - bias=True, + self.t_embedder = TimestepEmbedder(min(dim, 1024)) + self.cap_embedder = nn.Sequential( + RMSNorm(cap_feat_dim, eps=norm_eps), + nn.Linear( + cap_feat_dim, + dim, + bias=True, + ), ) - nn.init.xavier_uniform_(self.x_embedder.weight) - nn.init.constant_(self.x_embedder.bias, 0.0) - self.noise_refiner = nn.ModuleList( + self.context_refiner = nn.ModuleList( [ JointTransformerBlock( layer_id, @@ -764,12 +809,21 @@ def __init__( ffn_dim_multiplier, norm_eps, qk_norm, - modulation=True, + modulation=False, ) for layer_id in range(n_refiner_layers) ] ) - self.context_refiner = nn.ModuleList( + + self.x_embedder = nn.Linear( + in_features=patch_size * patch_size * in_channels, + out_features=dim, + bias=True, + ) + nn.init.xavier_uniform_(self.x_embedder.weight) + nn.init.constant_(self.x_embedder.bias, 0.0) + + self.noise_refiner = nn.ModuleList( [ JointTransformerBlock( layer_id, @@ -780,21 +834,12 @@ def __init__( ffn_dim_multiplier, norm_eps, qk_norm, - modulation=False, + modulation=True, ) for layer_id in range(n_refiner_layers) ] ) - self.t_embedder = TimestepEmbedder(min(dim, 1024)) - self.cap_embedder = nn.Sequential( - RMSNorm(cap_feat_dim, eps=norm_eps), - nn.Linear( - cap_feat_dim, - dim, - bias=True, - ), - ) nn.init.trunc_normal_(self.cap_embedder[1].weight, std=0.02) # nn.init.zeros_(self.cap_embedder[1].weight) nn.init.zeros_(self.cap_embedder[1].bias) @@ -864,15 +909,26 @@ def disable_gradient_checkpointing(self): def unpatchify( self, - x: torch.Tensor, + x: Tensor, width: int, height: int, encoder_seq_lengths: List[int], seq_lengths: List[int], - ) -> torch.Tensor: + ) -> Tensor: """ + Unpatchify the input tensor and embed the caption features. x: (N, T, patch_size**2 * C) imgs: (N, H, W, C) + + Args: + x (Tensor): Input tensor. + width (int): Width of the input tensor. + height (int): Height of the input tensor. + encoder_seq_lengths (List[int]): List of encoder sequence lengths. + seq_lengths (List[int]): List of sequence lengths + + Returns: + output: (N, C, H, W) """ pH = pW = self.patch_size @@ -891,13 +947,27 @@ def unpatchify( def patchify_and_embed( self, - x: torch.Tensor, - cap_feats: torch.Tensor, - cap_mask: torch.Tensor, - t: torch.Tensor, - ) -> Tuple[ - torch.Tensor, torch.Tensor, torch.Tensor, List[int], List[int] - ]: + x: Tensor, + cap_feats: Tensor, + cap_mask: Tensor, + t: Tensor, + ) -> Tuple[Tensor, Tensor, Tensor, List[int], List[int]]: + """ + Patchify and embed the input image and caption features. 
+ + Args: + x: (N, C, H, W) image latents + cap_feats: (N, C, D) caption features + cap_mask: (N, C, D) caption attention mask + t: (N), T timesteps + + Returns: + Tuple[Tensor, Tensor, Tensor, List[int], List[int]]: + + return x, attention_mask, freqs_cis, l_effective_cap_len, seq_lengths + + + """ bsz, channels, height, width = x.shape pH = pW = self.patch_size device = x.device @@ -915,40 +985,35 @@ def patchify_and_embed( H_tokens, W_tokens = height // pH, width // pW position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.int32, device=device) - position_ids[i, cap_len : cap_len + seq_len, 0] = cap_len - row_ids = ( - torch.arange(H_tokens, dtype=torch.int32, device=device) - .view(-1, 1) - .repeat(1, W_tokens) - .flatten() - ) - col_ids = ( - torch.arange(W_tokens, dtype=torch.int32, device=device) - .view(1, -1) - .repeat(H_tokens, 1) - .flatten() - ) - position_ids[i, cap_len : cap_len + seq_len, 1] = row_ids - position_ids[i, cap_len : cap_len + seq_len, 2] = col_ids + position_ids[i, cap_len:seq_len, 0] = cap_len - freqs_cis = self.rope_embedder.get_freqs_cis(position_ids) + row_ids = torch.arange(H_tokens, dtype=torch.int32, device=device).view(-1, 1).repeat(1, W_tokens).flatten() + col_ids = torch.arange(W_tokens, dtype=torch.int32, device=device).view(1, -1).repeat(H_tokens, 1).flatten() + position_ids[i, cap_len:seq_len, 1] = row_ids + position_ids[i, cap_len:seq_len, 2] = col_ids + + # Get combinded rotary embeddings + freqs_cis = self.rope_embedder(position_ids) + + # Create separate rotary embeddings for captions and images cap_freqs_cis = torch.zeros(bsz, encoder_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype) img_freqs_cis = torch.zeros(bsz, image_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype) for i, (cap_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)): cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len] - img_freqs_cis[i, :seq_len] = freqs_cis[i, cap_len : cap_len + seq_len] + img_freqs_cis[i, :image_seq_len] = freqs_cis[i, cap_len:seq_len] - x = x.view(bsz, channels, height // pH, pH, width // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2) - x_mask = torch.zeros(bsz, max_seq_len, dtype=torch.bool, device=device) - - # refine context + # Refine caption context for layer in self.context_refiner: cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis) + x = x.view(bsz, channels, height // pH, pH, width // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2) + x_mask = torch.zeros(bsz, image_seq_len, dtype=torch.bool, device=device) + x = self.x_embedder(x) + # Refine image context for layer in self.noise_refiner: x = layer(x, x_mask, img_freqs_cis, t) @@ -963,19 +1028,23 @@ def patchify_and_embed( return x, attention_mask, freqs_cis, l_effective_cap_len, seq_lengths - def forward(self, x, t, cap_feats, cap_mask): + def forward(self, x: Tensor, t: Tensor, cap_feats: Tensor, cap_mask: Tensor) -> Tensor: """ Forward pass of NextDiT. - t: (N,) tensor of diffusion timesteps - y: (N,) tensor of text tokens/features + Args: + x: (N, C, H, W) image latents + t: (N,) tensor of diffusion timesteps + cap_feats: (N, L, D) caption features + cap_mask: (N, L) caption attention mask + + Returns: + x: (N, C, H, W) denoised latents """ - _, _, height, width = x.shape # B, C, H, W + _, _, height, width = x.shape # B, C, H, W t = self.t_embedder(t) # (N, D) cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. 
redundant compute - x, mask, freqs_cis, l_effective_cap_len, seq_lengths = self.patchify_and_embed( - x, cap_feats, cap_mask, t - ) + x, mask, freqs_cis, l_effective_cap_len, seq_lengths = self.patchify_and_embed(x, cap_feats, cap_mask, t) for layer in self.layers: x = layer(x, mask, freqs_cis, t) @@ -986,7 +1055,14 @@ def forward(self, x, t, cap_feats, cap_mask): return x def forward_with_cfg( - self, x, t, cap_feats, cap_mask, cfg_scale, cfg_trunc=100, renorm_cfg=1 + self, + x: Tensor, + t: Tensor, + cap_feats: Tensor, + cap_mask: Tensor, + cfg_scale: float, + cfg_trunc: int = 100, + renorm_cfg: float = 1.0, ): """ Forward pass of NextDiT, but also batches the unconditional forward pass @@ -996,9 +1072,10 @@ def forward_with_cfg( half = x[: len(x) // 2] if t[0] < cfg_trunc: combined = torch.cat([half, half], dim=0) # [2, 16, 128, 128] - model_out = self.forward( - combined, t, cap_feats, cap_mask - ) # [2, 16, 128, 128] + assert ( + cap_mask.shape[0] == combined.shape[0] + ), f"caption attention mask shape: {cap_mask.shape[0]} latents shape: {combined.shape[0]}" + model_out = self.forward(x, t, cap_feats, cap_mask) # [2, 16, 128, 128] # For exact reproducibility reasons, we apply classifier-free guidance on only # three channels by default. The standard approach to cfg applies it to all channels. # This can be done by uncommenting the following line and commenting-out the line following that. @@ -1009,13 +1086,9 @@ def forward_with_cfg( cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0) half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps) if float(renorm_cfg) > 0.0: - ori_pos_norm = torch.linalg.vector_norm( - cond_eps, dim=tuple(range(1, len(cond_eps.shape))), keepdim=True - ) + ori_pos_norm = torch.linalg.vector_norm(cond_eps, dim=tuple(range(1, len(cond_eps.shape))), keepdim=True) max_new_norm = ori_pos_norm * float(renorm_cfg) - new_pos_norm = torch.linalg.vector_norm( - half_eps, dim=tuple(range(1, len(half_eps.shape))), keepdim=True - ) + new_pos_norm = torch.linalg.vector_norm(half_eps, dim=tuple(range(1, len(half_eps.shape))), keepdim=True) if new_pos_norm >= max_new_norm: half_eps = half_eps * (max_new_norm / new_pos_norm) else: @@ -1040,7 +1113,7 @@ def precompute_freqs_cis( dim: List[int], end: List[int], theta: float = 10000.0, - ): + ) -> List[Tensor]: """ Precompute the frequency tensor for complex exponentials (cis) with given dimensions. @@ -1057,19 +1130,17 @@ def precompute_freqs_cis( Defaults to 10000.0. Returns: - torch.Tensor: Precomputed frequency tensor with complex + List[torch.Tensor]: Precomputed frequency tensor with complex exponentials. 
""" freqs_cis = [] + freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64 + for i, (d, e) in enumerate(zip(dim, end)): - freqs = 1.0 / ( - theta ** (torch.arange(0, d, 2, dtype=torch.float64, device="cpu") / d) - ) - timestep = torch.arange(e, device=freqs.device, dtype=torch.float64) - freqs = torch.outer(timestep, freqs).float() - freqs_cis_i = torch.polar(torch.ones_like(freqs), freqs).to( - torch.complex64 - ) # complex64 + pos = torch.arange(e, dtype=freqs_dtype, device="cpu") + freqs = 1.0 / (theta ** (torch.arange(0, d, 2, dtype=freqs_dtype, device="cpu") / d)) + freqs = torch.outer(pos, freqs) + freqs_cis_i = torch.polar(torch.ones_like(freqs), freqs) # [S, D/2] freqs_cis.append(freqs_cis_i) return freqs_cis @@ -1102,7 +1173,7 @@ def get_checkpointing_wrap_module_list(self) -> List[nn.Module]: def NextDiT_2B_GQA_patch2_Adaln_Refiner(params: Optional[LuminaParams] = None, **kwargs): if params is None: params = LuminaParams.get_2b_config() - + return NextDiT( patch_size=params.patch_size, in_channels=params.in_channels, diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 7ade6c1bc..9dac9c9f2 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -2,20 +2,20 @@ import math import os import numpy as np -import toml -import json import time -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Any import torch +from torch import Tensor from accelerate import Accelerator, PartialState -from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import Gemma2Model from tqdm import tqdm from PIL import Image from safetensors.torch import save_file -from library import lumina_models, lumina_util, strategy_base, train_util +from library import lumina_models, lumina_util, strategy_base, strategy_lumina, train_util from library.device_utils import init_ipex, clean_memory_on_device +from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler init_ipex() @@ -30,19 +30,38 @@ # region sample images +@torch.no_grad() def sample_images( accelerator: Accelerator, args: argparse.Namespace, - epoch, - steps, - nextdit, - ae, - gemma2_model, - sample_prompts_gemma2_outputs, - prompt_replacement=None, - controlnet=None + epoch: int, + global_step: int, + nextdit: lumina_models.NextDiT, + vae: torch.nn.Module, + gemma2_model: Gemma2Model, + sample_prompts_gemma2_outputs: List[Tuple[Tensor, Tensor, Tensor]], + prompt_replacement: Optional[Tuple[str, str]] = None, + controlnet=None, ): - if steps == 0: + """ + Generate sample images using the NextDiT model. + + Args: + accelerator (Accelerator): Accelerator instance. + args (argparse.Namespace): Command-line arguments. + epoch (int): Current epoch number. + global_step (int): Current global step number. + nextdit (lumina_models.NextDiT): The NextDiT model instance. + vae (torch.nn.Module): The VAE module. + gemma2_model (Gemma2Model): The Gemma2 model instance. + sample_prompts_gemma2_outputs (List[Tuple[Tensor, Tensor, Tensor]]): List of tuples containing the encoded prompts, text masks, and timestep for each sample. + prompt_replacement (Optional[Tuple[str, str]], optional): Tuple containing the prompt and negative prompt replacements. Defaults to None. 
+ controlnet:: ControlNet model + + Returns: + None + """ + if global_step == 0: if not args.sample_at_first: return else: @@ -53,11 +72,15 @@ def sample_images( if epoch is None or epoch % args.sample_every_n_epochs != 0: return else: - if steps % args.sample_every_n_steps != 0 or epoch is not None: # steps is not divisible or end of epoch + if global_step % args.sample_every_n_steps != 0 or epoch is not None: # steps is not divisible or end of epoch return + assert ( + args.sample_prompts is not None + ), "No sample prompts found. Provide `--sample_prompts` / サンプルプロンプトが見つかりません。`--sample_prompts` を指定してください" + logger.info("") - logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}") + logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {global_step}") if not os.path.isfile(args.sample_prompts) and sample_prompts_gemma2_outputs is None: logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}") return @@ -87,46 +110,44 @@ def sample_images( if distributed_state.num_processes <= 1: # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts. - with torch.no_grad(), accelerator.autocast(): - for prompt_dict in prompts: + for prompt_dict in prompts: + sample_image_inference( + accelerator, + args, + nextdit, + gemma2_model, + vae, + save_dir, + prompt_dict, + epoch, + global_step, + sample_prompts_gemma2_outputs, + prompt_replacement, + controlnet, + ) + else: + # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available) + # prompt_dicts are assigned to lists based on order of processes, to attempt to time the image creation time to match enum order. Probably only works when steps and sampler are identical. + per_process_prompts = [] # list of lists + for i in range(distributed_state.num_processes): + per_process_prompts.append(prompts[i :: distributed_state.num_processes]) + + with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists: + for prompt_dict in prompt_dict_lists[0]: sample_image_inference( accelerator, args, nextdit, gemma2_model, - ae, + vae, save_dir, prompt_dict, epoch, - steps, + global_step, sample_prompts_gemma2_outputs, prompt_replacement, - controlnet + controlnet, ) - else: - # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available) - # prompt_dicts are assigned to lists based on order of processes, to attempt to time the image creation time to match enum order. Probably only works when steps and sampler are identical. 
- per_process_prompts = [] # list of lists - for i in range(distributed_state.num_processes): - per_process_prompts.append(prompts[i :: distributed_state.num_processes]) - - with torch.no_grad(): - with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists: - for prompt_dict in prompt_dict_lists[0]: - sample_image_inference( - accelerator, - args, - nextdit, - gemma2_model, - ae, - save_dir, - prompt_dict, - epoch, - steps, - sample_prompts_gemma2_outputs, - prompt_replacement, - controlnet - ) torch.set_rng_state(rng_state) if cuda_rng_state is not None: @@ -135,43 +156,60 @@ def sample_images( clean_memory_on_device(accelerator.device) +@torch.no_grad() def sample_image_inference( accelerator: Accelerator, args: argparse.Namespace, - nextdit, - gemma2_model, - ae, - save_dir, - prompt_dict, - epoch, - steps, - sample_prompts_gemma2_outputs, - prompt_replacement, - # controlnet + nextdit: lumina_models.NextDiT, + gemma2_model: Gemma2Model, + vae: torch.nn.Module, + save_dir: str, + prompt_dict: Dict[str, str], + epoch: int, + global_step: int, + sample_prompts_gemma2_outputs: List[Tuple[Tensor, Tensor, Tensor]], + prompt_replacement: Optional[Tuple[str, str]] = None, + controlnet=None, ): + """ + Generates sample images + + Args: + accelerator (Accelerator): Accelerator object + args (argparse.Namespace): Arguments object + nextdit (lumina_models.NextDiT): NextDiT model + gemma2_model (Gemma2Model): Gemma2 model + vae (torch.nn.Module): VAE model + save_dir (str): Directory to save images + prompt_dict (Dict[str, str]): Prompt dictionary + epoch (int): Epoch number + steps (int): Number of steps to run + sample_prompts_gemma2_outputs (List[Tuple[Tensor, Tensor, Tensor]]): List of tuples containing gemma2 outputs + prompt_replacement (Optional[Tuple[str, str]], optional): Replacement for positive and negative prompt. Defaults to None. 
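# Illustrative sketch (not part of this patch): the RNG bookkeeping that wraps the
# sampling calls above. The training RNG state is captured before sampling and
# restored afterwards so sample generation does not perturb the training data order.
import torch

rng_state = torch.get_rng_state()
cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
try:
    pass  # ... run sample_image_inference for each prompt here ...
finally:
    torch.set_rng_state(rng_state)
    if cuda_rng_state is not None:
        torch.cuda.set_rng_state(cuda_rng_state)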
+ + Returns: + None + """ assert isinstance(prompt_dict, dict) # negative_prompt = prompt_dict.get("negative_prompt") - sample_steps = prompt_dict.get("sample_steps", 20) - width = prompt_dict.get("width", 512) - height = prompt_dict.get("height", 512) - scale = prompt_dict.get("scale", 3.5) - seed = prompt_dict.get("seed") + sample_steps = prompt_dict.get("sample_steps", 38) + width = prompt_dict.get("width", 1024) + height = prompt_dict.get("height", 1024) + guidance_scale: int = prompt_dict.get("scale", 3.5) + seed: int = prompt_dict.get("seed", None) controlnet_image = prompt_dict.get("controlnet_image") prompt: str = prompt_dict.get("prompt", "") + negative_prompt: str = prompt_dict.get("negative_prompt", "") # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) if prompt_replacement is not None: prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) - # if negative_prompt is not None: - # negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) + if negative_prompt is not None: + negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) + generator = torch.Generator(device=accelerator.device) if seed is not None: - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - else: - # True random sample image generation - torch.seed() - torch.cuda.seed() + generator.manual_seed(seed) # if negative_prompt is None: # negative_prompt = "" @@ -182,7 +220,7 @@ def sample_image_inference( logger.info(f"height: {height}") logger.info(f"width: {width}") logger.info(f"sample_steps: {sample_steps}") - logger.info(f"scale: {scale}") + logger.info(f"scale: {guidance_scale}") # logger.info(f"sample_sampler: {sampler_name}") if seed is not None: logger.info(f"seed: {seed}") @@ -191,14 +229,16 @@ def sample_image_inference( tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() + assert isinstance(tokenize_strategy, strategy_lumina.LuminaTokenizeStrategy) + assert isinstance(encoding_strategy, strategy_lumina.LuminaTextEncodingStrategy) + gemma2_conds = [] if sample_prompts_gemma2_outputs and prompt in sample_prompts_gemma2_outputs: gemma2_conds = sample_prompts_gemma2_outputs[prompt] - print(f"Using cached Gemma2 outputs for prompt: {prompt}") + logger.info(f"Using cached Gemma2 outputs for prompt: {prompt}") if gemma2_model is not None: - print(f"Encoding prompt with Gemma2: {prompt}") + logger.info(f"Encoding prompt with Gemma2: {prompt}") tokens_and_masks = tokenize_strategy.tokenize(prompt) - # strategy has apply_gemma2_attn_mask option encoded_gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) # if gemma2_conds is not cached, use encoded_gemma2_conds @@ -211,22 +251,26 @@ def sample_image_inference( gemma2_conds[i] = encoded_gemma2_conds[i] # Unpack Gemma2 outputs - gemma2_hidden_states, gemma2_attn_mask, input_ids = gemma2_conds + gemma2_hidden_states, input_ids, gemma2_attn_mask = gemma2_conds # sample image - weight_dtype = ae.dtype # TOFO give dtype as argument - packed_latent_height = height // 16 - packed_latent_width = width // 16 + weight_dtype = vae.dtype # TOFO give dtype as argument + latent_height = height // 8 + latent_width = width // 8 noise = torch.randn( 1, - packed_latent_height * packed_latent_width, - 16 * 2 * 2, + 16, + latent_height, + latent_width, device=accelerator.device, dtype=weight_dtype, - 
generator=torch.Generator(device=accelerator.device).manual_seed(seed) if seed is not None else None, + generator=generator, ) + # Prompts are paired positive/negative + noise = noise.repeat(gemma2_attn_mask.shape[0], 1, 1, 1) + timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) - img_ids = lumina_util.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype) + # img_ids = lumina_util.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype) gemma2_attn_mask = gemma2_attn_mask.to(accelerator.device) # if controlnet_image is not None: @@ -235,18 +279,18 @@ def sample_image_inference( # controlnet_image = torch.from_numpy((np.array(controlnet_image) / 127.5) - 1) # controlnet_image = controlnet_image.permute(2, 0, 1).unsqueeze(0).to(weight_dtype).to(accelerator.device) - with accelerator.autocast(), torch.no_grad(): - x = denoise(nextdit, noise, img_ids, gemma2_hidden_states, input_ids, None, timesteps=timesteps, guidance=scale, gemma2_attn_mask=gemma2_attn_mask, controlnet=controlnet, controlnet_img=controlnet_image) + with accelerator.autocast(): + x = denoise(nextdit, noise, gemma2_hidden_states, gemma2_attn_mask, timesteps=timesteps, guidance=guidance_scale) - x = lumina_util.unpack_latents(x, packed_latent_height, packed_latent_width) + # x = lumina_util.unpack_latents(x, packed_latent_height, packed_latent_width) # latent to image clean_memory_on_device(accelerator.device) - org_vae_device = ae.device # will be on cpu - ae.to(accelerator.device) # distributed_state.device is same as accelerator.device - with accelerator.autocast(), torch.no_grad(): - x = ae.decode(x) - ae.to(org_vae_device) + org_vae_device = vae.device # will be on cpu + vae.to(accelerator.device) # distributed_state.device is same as accelerator.device + with accelerator.autocast(): + x = vae.decode(x) + vae.to(org_vae_device) clean_memory_on_device(accelerator.device) x = x.clamp(-1, 1) @@ -257,9 +301,9 @@ def sample_image_inference( # but adding 'enum' to the filename should be enough ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime()) - num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}" + num_suffix = f"e{epoch:06d}" if epoch is not None else f"{global_step:06d}" seed_suffix = "" if seed is None else f"_{seed}" - i: int = prompt_dict["enum"] + i: int = int(prompt_dict.get("enum", 0)) img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png" image.save(os.path.join(save_dir, img_filename)) @@ -273,11 +317,34 @@ def sample_image_inference( wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption -def time_shift(mu: float, sigma: float, t: torch.Tensor): +def time_shift(mu: float, sigma: float, t: Tensor): + """ + Get time shift + + Args: + mu (float): mu value. + sigma (float): sigma value. + t (Tensor): timestep. + + Return: + float: time shift + """ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]: + """ + Get linear function + + Args: + x1 (float, optional): x1 value. Defaults to 256. + y1 (float, optional): y1 value. Defaults to 0.5. + x2 (float, optional): x2 value. Defaults to 4096. + y2 (float, optional): y2 value. Defaults to 1.15. 
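# Illustrative numeric check (not part of this patch) for the helpers above: the
# linear function maps an image sequence length to mu, and time_shift then warps a
# timestep toward 1.0. Values are rounded examples using the default constants.
import math

m = (1.15 - 0.5) / (4096 - 256)
b = 0.5 - m * 256
mu = m * 1024 + b                                             # ~0.63 for a 1024-token image
t = 0.5
shifted = math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** 1.0)  # ~0.65, pulled toward 1.0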
+ + Return: + Callable[[float], float]: linear function + """ m = (y2 - y1) / (x2 - x1) b = y1 - m * x1 return lambda x: m * x + b @@ -290,6 +357,19 @@ def get_schedule( max_shift: float = 1.15, shift: bool = True, ) -> list[float]: + """ + Get timesteps schedule + + Args: + num_steps (int): Number of steps in the schedule. + image_seq_len (int): Sequence length of the image. + base_shift (float, optional): Base shift value. Defaults to 0.5. + max_shift (float, optional): Maximum shift value. Defaults to 1.15. + shift (bool, optional): Whether to shift the schedule. Defaults to True. + + Return: + List[float]: timesteps schedule + """ # extra step for zero timesteps = torch.linspace(1, 0, num_steps + 1) @@ -301,11 +381,63 @@ def get_schedule( return timesteps.tolist() + +def denoise( + model: lumina_models.NextDiT, img: Tensor, txt: Tensor, txt_mask: Tensor, timesteps: List[float], guidance: float = 4.0 +): + """ + Denoise an image using the NextDiT model. + + Args: + model (lumina_models.NextDiT): The NextDiT model instance. + img (Tensor): The input image tensor. + txt (Tensor): The input text tensor. + txt_mask (Tensor): The input text mask tensor. + timesteps (List[float]): A list of timesteps for the denoising process. + guidance (float, optional): The guidance scale for the denoising process. Defaults to 4.0. + + Returns: + img (Tensor): Denoised tensor + """ + for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): + t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) + # model.prepare_block_swap_before_forward() + # block_samples = None + # block_single_samples = None + pred = model.forward_with_cfg( + x=img, # image latents (B, C, H, W) + t=t_vec / 1000, # timesteps需要除以1000来匹配模型预期 + cap_feats=txt, # Gemma2的hidden states作为caption features + cap_mask=txt_mask.to(dtype=torch.int32), # Gemma2的attention mask + cfg_scale=guidance, + ) + + img = img + (t_prev - t_curr) * pred + + # model.prepare_block_swap_before_forward() + return img + + # endregion # region train -def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32): +def get_sigmas( + noise_scheduler: FlowMatchEulerDiscreteScheduler, timesteps: Tensor, device: torch.device, n_dim=4, dtype=torch.float32 +) -> Tensor: + """ + Get sigmas for timesteps + + Args: + noise_scheduler (FlowMatchEulerDiscreteScheduler): The noise scheduler instance. + timesteps (Tensor): A tensor of timesteps for the denoising process. + device (torch.device): The device on which the tensors are stored. + n_dim (int, optional): The number of dimensions for the output tensor. Defaults to 4. + dtype (torch.dtype, optional): The data type for the output tensor. Defaults to torch.float32. + + Returns: + sigmas (Tensor): The sigmas tensor. + """ sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype) schedule_timesteps = noise_scheduler.timesteps.to(device) timesteps = timesteps.to(device) @@ -320,11 +452,22 @@ def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32) def compute_density_for_timestep_sampling( weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None ): - """Compute the density for sampling the timesteps when doing SD3 training. + """ + Compute the density for sampling the timesteps when doing SD3 training. Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. SD3 paper reference: https://arxiv.org/abs/2403.03206v1. 
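# Illustrative sketch (not part of this patch): the Euler update at the heart of
# denoise() above, with the NextDiT forward_with_cfg call replaced by a generic
# `velocity_fn` stand-in so the integration step is easy to see in isolation.
import torch

def euler_integrate(img: torch.Tensor, timesteps: list, velocity_fn) -> torch.Tensor:
    for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
        pred = velocity_fn(img, t_vec)
        img = img + (t_prev - t_curr) * pred  # t_prev < t_curr, so each step moves toward t=0
    return img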
+ + Args: + weighting_scheme (str): The weighting scheme to use. + batch_size (int): The batch size for the sampling process. + logit_mean (float, optional): The mean of the logit distribution. Defaults to None. + logit_std (float, optional): The standard deviation of the logit distribution. Defaults to None. + mode_scale (float, optional): The mode scale for the mode weighting scheme. Defaults to None. + + Returns: + u (Tensor): The sampled timesteps. """ if weighting_scheme == "logit_normal": # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$). @@ -338,12 +481,19 @@ def compute_density_for_timestep_sampling( return u -def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): +def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None) -> Tensor: """Computes loss weighting scheme for SD3 training. Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528. SD3 paper reference: https://arxiv.org/abs/2403.03206v1. + + Args: + weighting_scheme (str): The weighting scheme to use. + sigmas (Tensor, optional): The sigmas tensor. Defaults to None. + + Returns: + u (Tensor): The sampled timesteps. """ if weighting_scheme == "sigma_sqrt": weighting = (sigmas**-2.0).float() @@ -355,9 +505,24 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): return weighting -def get_noisy_model_input_and_timesteps( - args, noise_scheduler, latents, noise, device, dtype -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +def get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) -> Tuple[Tensor, Tensor, Tensor]: + """ + Get noisy model input and timesteps. + + Args: + args (argparse.Namespace): Arguments. + noise_scheduler (noise_scheduler): Noise scheduler. + latents (Tensor): Latents. + noise (Tensor): Latent noise. + device (torch.device): Device. + dtype (torch.dtype): Data type + + Return: + Tuple[Tensor, Tensor, Tensor]: + noisy model input + timesteps + sigmas + """ bsz, _, h, w = latents.shape sigmas = None @@ -412,7 +577,21 @@ def get_noisy_model_input_and_timesteps( return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas -def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas): +def apply_model_prediction_type( + args, model_pred: Tensor, noisy_model_input: Tensor, sigmas: Tensor +) -> Tuple[Tensor, Optional[Tensor]]: + """ + Apply model prediction type to the model prediction and the sigmas. + + Args: + args (argparse.Namespace): Arguments. + model_pred (Tensor): Model prediction. + noisy_model_input (Tensor): Noisy model input. + sigmas (Tensor): Sigmas. + + Return: + Tuple[Tensor, Optional[Tensor]]: + """ weighting = None if args.model_prediction_type == "raw": pass @@ -433,10 +612,22 @@ def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas): def save_models( ckpt_path: str, lumina: lumina_models.NextDiT, - sai_metadata: Optional[dict], + sai_metadata: Dict[str, Any], save_dtype: Optional[torch.dtype] = None, use_mem_eff_save: bool = False, ): + """ + Save the model to the checkpoint path. + + Args: + ckpt_path (str): Path to the checkpoint. + lumina (lumina_models.NextDiT): NextDIT model. + sai_metadata (Optional[dict]): Metadata for the SAI model. 
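# Illustrative sketch (not part of this patch): the "logit_normal" branch of the
# timestep-density sampler documented above. u is drawn from a normal distribution
# and squashed through a sigmoid so the density concentrates around t ~ 0.5.
import torch

batch_size, logit_mean, logit_std = 4, 0.0, 1.0
u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
u = torch.nn.functional.sigmoid(u)  # values in (0, 1), densest near 0.5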
+ save_dtype (Optional[torch.dtype]): Data + + Return: + None + """ state_dict = {} def update_sd(prefix, sd): @@ -458,7 +649,9 @@ def save_lumina_model_on_train_end( args: argparse.Namespace, save_dtype: torch.dtype, epoch: int, global_step: int, lumina: lumina_models.NextDiT ): def sd_saver(ckpt_file, epoch_no, global_step): - sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, lumina="lumina2") + sai_metadata = train_util.get_sai_model_spec( + None, args, False, False, False, is_stable_diffusion_ckpt=True, lumina="lumina2" + ) save_models(ckpt_file, lumina, sai_metadata, save_dtype, args.mem_eff_save) train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None) @@ -469,15 +662,29 @@ def sd_saver(ckpt_file, epoch_no, global_step): def save_lumina_model_on_epoch_end_or_stepwise( args: argparse.Namespace, on_epoch_end: bool, - accelerator, + accelerator: Accelerator, save_dtype: torch.dtype, epoch: int, num_train_epochs: int, global_step: int, lumina: lumina_models.NextDiT, ): - def sd_saver(ckpt_file, epoch_no, global_step): - sai_metadata = train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, lumina="lumina2") + """ + Save the model to the checkpoint path. + + Args: + args (argparse.Namespace): Arguments. + save_dtype (torch.dtype): Data type. + epoch (int): Epoch. + global_step (int): Global step. + lumina (lumina_models.NextDiT): NextDIT model. + + Return: + None + """ + + def sd_saver(ckpt_file: str, epoch_no: int, global_step: int): + sai_metadata = train_util.get_sai_model_spec({}, args, False, False, False, is_stable_diffusion_ckpt=True, lumina="lumina2") save_models(ckpt_file, lumina, sai_metadata, save_dtype, args.mem_eff_save) train_util.save_sd_model_on_epoch_end_or_stepwise_common( diff --git a/library/lumina_util.py b/library/lumina_util.py index f8e3f7dbc..f404e7754 100644 --- a/library/lumina_util.py +++ b/library/lumina_util.py @@ -11,23 +11,33 @@ from transformers import Gemma2Config, Gemma2Model from library.utils import setup_logging - -setup_logging() +from library import lumina_models, flux_models +from library.utils import load_safetensors import logging +setup_logging() logger = logging.getLogger(__name__) -from library import lumina_models, flux_models -from library.utils import load_safetensors - MODEL_VERSION_LUMINA_V2 = "lumina2" def load_lumina_model( ckpt_path: str, dtype: torch.dtype, - device: Union[str, torch.device], + device: torch.device, disable_mmap: bool = False, ): + """ + Load the Lumina model from the checkpoint path. + + Args: + ckpt_path (str): Path to the checkpoint. + dtype (torch.dtype): The data type for the model. + device (torch.device): The device to load the model on. + disable_mmap (bool, optional): Whether to disable mmap. Defaults to False. + + Returns: + model (lumina_models.NextDiT): The loaded model. + """ logger.info("Building Lumina") with torch.device("meta"): model = lumina_models.NextDiT_2B_GQA_patch2_Adaln_Refiner().to(dtype) @@ -46,6 +56,18 @@ def load_ae( device: Union[str, torch.device], disable_mmap: bool = False, ) -> flux_models.AutoEncoder: + """ + Load the AutoEncoder model from the checkpoint path. + + Args: + ckpt_path (str): Path to the checkpoint. + dtype (torch.dtype): The data type for the model. + device (Union[str, torch.device]): The device to load the model on. + disable_mmap (bool, optional): Whether to disable mmap. Defaults to False. 
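# Illustrative sketch (not part of this patch): one plausible reading of the
# prefix-merging pattern that save_models() starts above (its full body is not
# shown here). Sub-model weights are flattened into one dict under a key prefix
# before being written with safetensors; the real function likely also casts to
# save_dtype. The "model." prefix and the `nextdit` name are assumptions.
from safetensors.torch import save_file

def merge_state_dicts(named_models: dict) -> dict:
    merged = {}
    for prefix, model in named_models.items():
        for key, value in model.state_dict().items():
            merged[prefix + key] = value
    return merged

# save_file(merge_state_dicts({"model.": nextdit}), ckpt_path, metadata=sai_metadata)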
+ + Returns: + ae (flux_models.AutoEncoder): The loaded model. + """ logger.info("Building AutoEncoder") with torch.device("meta"): # dev and schnell have the same AE params @@ -67,6 +89,19 @@ def load_gemma2( disable_mmap: bool = False, state_dict: Optional[dict] = None, ) -> Gemma2Model: + """ + Load the Gemma2 model from the checkpoint path. + + Args: + ckpt_path (str): Path to the checkpoint. + dtype (torch.dtype): The data type for the model. + device (Union[str, torch.device]): The device to load the model on. + disable_mmap (bool, optional): Whether to disable mmap. Defaults to False. + state_dict (Optional[dict], optional): The state dict to load. Defaults to None. + + Returns: + gemma2 (Gemma2Model): The loaded model + """ logger.info("Building Gemma2") GEMMA2_CONFIG = { "_name_or_path": "google/gemma-2-2b", diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index 209f62a05..0a6a7f293 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -130,11 +130,6 @@ def is_disk_cached_outputs_expected(self, npz_path: str): return False if "input_ids" not in npz: return False - if "apply_gemma2_attn_mask" not in npz: - return False - npz_apply_gemma2_attn_mask = npz["apply_gemma2_attn_mask"] - if not npz_apply_gemma2_attn_mask: - return False except Exception as e: logger.error(f"Error loading file: {npz_path}") raise e @@ -142,11 +137,17 @@ def is_disk_cached_outputs_expected(self, npz_path: str): return True def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: + """ + Load outputs from a npz file + + Returns: + List[np.ndarray]: hidden_state, input_ids, attention_mask + """ data = np.load(npz_path) hidden_state = data["hidden_state"] attention_mask = data["attention_mask"] input_ids = data["input_ids"] - return [hidden_state, attention_mask, input_ids] + return [hidden_state, input_ids, attention_mask] def cache_batch_outputs( self, @@ -193,8 +194,7 @@ def cache_batch_outputs( info.text_encoder_outputs_npz, hidden_state=hidden_state_i, attention_mask=attention_mask_i, - input_ids=input_ids_i, - apply_gemma2_attn_mask=True + input_ids=input_ids_i ) else: info.text_encoder_outputs = [hidden_state_i, attention_mask_i, input_ids_i] diff --git a/lumina_train_network.py b/lumina_train_network.py index 00c81bceb..81acfb513 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -2,9 +2,10 @@ import copy import math import random -from typing import Any, Optional, Union +from typing import Any, Optional, Union, Tuple import torch +from torch import Tensor from accelerate import Accelerator from library.device_utils import clean_memory_on_device, init_ipex @@ -165,36 +166,31 @@ def cache_text_encoder_outputs_if_needed( f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}" ) - tokenize_strategy: strategy_lumina.LuminaTokenizeStrategy = ( - strategy_base.TokenizeStrategy.get_strategy() - ) - text_encoding_strategy: strategy_lumina.LuminaTextEncodingStrategy = ( - strategy_base.TextEncodingStrategy.get_strategy() - ) + tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() + text_encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() + + assert isinstance(tokenize_strategy, strategy_lumina.LuminaTokenizeStrategy) + assert isinstance(text_encoding_strategy, strategy_lumina.LuminaTextEncodingStrategy) - prompts = train_util.load_prompts(args.sample_prompts) + sample_prompts = train_util.load_prompts(args.sample_prompts) sample_prompts_te_outputs = ( {} ) # key: prompt, value: text encoder outputs 
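# Illustrative sketch (not part of this patch): the npz cache layout implied by the
# strategy changes above. The function names here are illustrative; the return
# order on load matches the new (hidden_state, input_ids, attention_mask) convention.
import numpy as np

def save_outputs_npz(npz_path, hidden_state, input_ids, attention_mask):
    np.savez(npz_path, hidden_state=hidden_state, input_ids=input_ids, attention_mask=attention_mask)

def load_outputs_npz(npz_path):
    data = np.load(npz_path)
    return [data["hidden_state"], data["input_ids"], data["attention_mask"]]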
with accelerator.autocast(), torch.no_grad(): - for prompt_dict in prompts: - for p in [ - prompt_dict.get("prompt", ""), - prompt_dict.get("negative_prompt", ""), - ]: - if p not in sample_prompts_te_outputs: - logger.info( - f"cache Text Encoder outputs for prompt: {p}" - ) - tokens_and_masks = tokenize_strategy.tokenize(p) - sample_prompts_te_outputs[p] = ( - text_encoding_strategy.encode_tokens( - tokenize_strategy, - text_encoders, - tokens_and_masks, - args.apply_t5_attn_mask, - ) - ) + for prompt_dict in sample_prompts: + prompts = [prompt_dict.get("prompt", ""), + prompt_dict.get("negative_prompt", "")] + logger.info( + f"cache Text Encoder outputs for prompt: {prompts[0]}" + ) + tokens_and_masks = tokenize_strategy.tokenize(prompts) + sample_prompts_te_outputs[prompts[0]] = ( + text_encoding_strategy.encode_tokens( + tokenize_strategy, + text_encoders, + tokens_and_masks, + ) + ) self.sample_prompts_te_outputs = sample_prompts_te_outputs accelerator.wait_for_everyone() @@ -220,7 +216,7 @@ def sample_images( epoch, global_step, device, - ae, + vae, tokenizer, text_encoder, lumina, @@ -231,7 +227,7 @@ def sample_images( epoch, global_step, lumina, - ae, + vae, self.get_models_for_text_encoding(args, accelerator, text_encoder), self.sample_prompts_te_outputs, ) @@ -258,12 +254,12 @@ def shift_scale_latents(self, args, latents): def get_noise_pred_and_target( self, args, - accelerator, + accelerator: Accelerator, noise_scheduler, latents, batch, - text_encoder_conds, - unet: lumina_models.NextDiT, + text_encoder_conds: Tuple[Tensor, Tensor, Tensor], # (hidden_states, input_ids, attention_masks) + dit: lumina_models.NextDiT, network, weight_dtype, train_unet, @@ -296,7 +292,7 @@ def get_noise_pred_and_target( def call_dit(img, gemma2_hidden_states, timesteps, gemma2_attn_mask): with torch.set_grad_enabled(is_train), accelerator.autocast(): # NextDiT forward expects (x, t, cap_feats, cap_mask) - model_pred = unet( + model_pred = dit( x=img, # image latents (B, C, H, W) t=timesteps / 1000, # timesteps需要除以1000来匹配模型预期 cap_feats=gemma2_hidden_states, # Gemma2的hidden states作为caption features @@ -341,7 +337,7 @@ def call_dit(img, gemma2_hidden_states, timesteps, gemma2_attn_mask): network.set_multiplier(0.0) with torch.no_grad(): model_pred_prior = call_dit( - img=packed_noisy_model_input[diff_output_pr_indices], + img=noisy_model_input[diff_output_pr_indices], gemma2_hidden_states=gemma2_hidden_states[ diff_output_pr_indices ], @@ -350,9 +346,9 @@ def call_dit(img, gemma2_hidden_states, timesteps, gemma2_attn_mask): ) network.set_multiplier(1.0) - model_pred_prior = lumina_util.unpack_latents( - model_pred_prior, packed_latent_height, packed_latent_width - ) + # model_pred_prior = lumina_util.unpack_latents( + # model_pred_prior, packed_latent_height, packed_latent_width + # ) model_pred_prior, _ = flux_train_utils.apply_model_prediction_type( args, model_pred_prior, @@ -404,7 +400,8 @@ def prepare_unet_with_accelerator( return super().prepare_unet_with_accelerator(args, accelerator, unet) # if we doesn't swap blocks, we can move the model to device - nextdit: lumina_models.Nextdit = unet + nextdit = unet + assert isinstance(nextdit, lumina_models.NextDiT) nextdit = accelerator.prepare( nextdit, device_placement=[not self.is_swapping_blocks] ) From bd16bd13ae97a02ffee34346d254384bc40c7b30 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 18 Feb 2025 01:21:18 -0500 Subject: [PATCH 354/582] Remove unused attention, fix typo --- library/lumina_models.py | 11 +---------- 1 file changed, 1 
insertion(+), 10 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index e82f3b2c7..36c3b9796 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -467,13 +467,6 @@ def forward( return self.out(output) -def attention(q: Tensor, k: Tensor, v: Tensor, attn_mask: Optional[Tensor] = None) -> Tensor: - x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask) - x = rearrange(x, "B H L D -> B L (H D)") - - return x - - def apply_rope( x_in: torch.Tensor, freqs_cis: torch.Tensor, @@ -965,8 +958,6 @@ def patchify_and_embed( Tuple[Tensor, Tensor, Tensor, List[int], List[int]]: return x, attention_mask, freqs_cis, l_effective_cap_len, seq_lengths - - """ bsz, channels, height, width = x.shape pH = pW = self.patch_size @@ -993,7 +984,7 @@ def patchify_and_embed( position_ids[i, cap_len:seq_len, 1] = row_ids position_ids[i, cap_len:seq_len, 2] = col_ids - # Get combinded rotary embeddings + # Get combined rotary embeddings freqs_cis = self.rope_embedder(position_ids) # Create separate rotary embeddings for captions and images From 4a369961346ca153a370728247449978d8a33415 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 18 Feb 2025 22:05:08 +0900 Subject: [PATCH 355/582] modify log step calculation --- train_network.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/train_network.py b/train_network.py index 47c4bb56e..93558da45 100644 --- a/train_network.py +++ b/train_network.py @@ -1464,11 +1464,10 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen ) if is_tracking: - logs = { - "loss/validation/step_current": current_loss, - "val_step": (epoch * validation_total_steps) + val_ts_step, - } - accelerator.log(logs, step=global_step) + logs = {"loss/validation/step_current": current_loss} + accelerator.log( + logs, step=global_step + val_ts_step + ) # a bit weird to log with global_step + val_ts_step self.on_validation_step_end(args, accelerator, network, text_encoders, unet, batch, weight_dtype) val_ts_step += 1 @@ -1545,25 +1544,20 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen ) if is_tracking: - logs = { - "loss/validation/epoch_current": current_loss, - "epoch": epoch + 1, - "val_step": (epoch * validation_total_steps) + val_ts_step, - } - accelerator.log(logs, step=global_step) + logs = {"loss/validation/epoch_current": current_loss} + accelerator.log(logs, step=global_step + val_ts_step) self.on_validation_step_end(args, accelerator, network, text_encoders, unet, batch, weight_dtype) val_ts_step += 1 if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average - loss_validation_divergence = val_epoch_loss_recorder.moving_average - loss_recorder.moving_average + loss_validation_divergence = val_epoch_loss_recorder.moving_average - loss_recorder.moving_average logs = { "loss/validation/epoch_average": avr_loss, "loss/validation/epoch_divergence": loss_validation_divergence, - "epoch": epoch + 1, } - accelerator.log(logs, step=global_step) + accelerator.log(logs, step=epoch + 1) restore_rng_state(rng_states) args.min_timestep = original_args_min_timestep @@ -1574,8 +1568,8 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen # END OF EPOCH if is_tracking: - logs = {"loss/epoch_average": loss_recorder.moving_average, "epoch": epoch + 1} - accelerator.log(logs, step=global_step) + logs = {"loss/epoch_average": loss_recorder.moving_average} + accelerator.log(logs, step=epoch 
+ 1) accelerator.wait_for_everyone() From 58e9e146a3c72716af909191835d4f41521b4c27 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 14 Feb 2025 14:02:21 -0500 Subject: [PATCH 356/582] Add resize interpolation configuration --- library/config_util.py | 7 ++++- library/train_util.py | 70 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 67 insertions(+), 10 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index a2e07dc6c..53727f252 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -75,6 +75,7 @@ class BaseSubsetParams: custom_attributes: Optional[Dict[str, Any]] = None validation_seed: int = 0 validation_split: float = 0.0 + resize_interpolation: Optional[str] = None @dataclass @@ -106,7 +107,7 @@ class BaseDatasetParams: debug_dataset: bool = False validation_seed: Optional[int] = None validation_split: float = 0.0 - + resize_interpolation: Optional[str] = None @dataclass class DreamBoothDatasetParams(BaseDatasetParams): @@ -196,6 +197,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "caption_prefix": str, "caption_suffix": str, "custom_attributes": dict, + "resize_interpolation": str, } # DO means DropOut DO_SUBSET_ASCENDABLE_SCHEMA = { @@ -241,6 +243,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "validation_split": float, "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int), "network_multiplier": float, + "resize_interpolation": str, } # options handled by argparse but not handled by user config @@ -525,6 +528,7 @@ def print_info(_datasets, dataset_type: str): [{dataset_type} {i}] batch_size: {dataset.batch_size} resolution: {(dataset.width, dataset.height)} + resize_interpolation: {dataset.resize_interpolation} enable_bucket: {dataset.enable_bucket} """) @@ -558,6 +562,7 @@ def print_info(_datasets, dataset_type: str): token_warmup_min: {subset.token_warmup_min}, token_warmup_step: {subset.token_warmup_step}, alpha_mask: {subset.alpha_mask} + resize_interpolation: {subset.resize_interpolation} custom_attributes: {subset.custom_attributes} """), " ") diff --git a/library/train_util.py b/library/train_util.py index 39b4af856..a07834adf 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -210,6 +210,7 @@ def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, self.text_encoder_pool2: Optional[torch.Tensor] = None self.alpha_mask: Optional[torch.Tensor] = None # alpha mask can be flipped in runtime + self.resize_interpolation: Optional[str] = None class BucketManager: @@ -434,6 +435,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, + resize_interpolation: Optional[str] = None, ) -> None: self.image_dir = image_dir self.alpha_mask = alpha_mask if alpha_mask is not None else False @@ -464,6 +466,8 @@ def __init__( self.validation_seed = validation_seed self.validation_split = validation_split + self.resize_interpolation = resize_interpolation + class DreamBoothSubset(BaseSubset): def __init__( @@ -495,6 +499,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, + resize_interpolation: Optional[str] = None, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -522,6 +527,7 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, 
validation_split=validation_split, + resize_interpolation=resize_interpolation, ) self.is_reg = is_reg @@ -564,6 +570,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, + resize_interpolation: Optional[str] = None, ) -> None: assert metadata_file is not None, "metadata_file must be specified / metadata_fileは指定が必須です" @@ -591,6 +598,7 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, + resize_interpolation=resize_interpolation, ) self.metadata_file = metadata_file @@ -629,6 +637,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, + resize_interpolation: Optional[str] = None, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -656,6 +665,7 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, + resize_interpolation=resize_interpolation, ) self.conditioning_data_dir = conditioning_data_dir @@ -676,6 +686,7 @@ def __init__( resolution: Optional[Tuple[int, int]], network_multiplier: float, debug_dataset: bool, + resize_interpolation: Optional[str] = None ) -> None: super().__init__() @@ -710,6 +721,10 @@ def __init__( self.image_transforms = IMAGE_TRANSFORMS + if resize_interpolation is not None: + assert validate_interpolation_fn(resize_interpolation), f"Resize interpolation \"{resize_interpolation}\" is not a valid interpolation" + self.resize_interpolation = resize_interpolation + self.image_data: Dict[str, ImageInfo] = {} self.image_to_subset: Dict[str, Union[DreamBoothSubset, FineTuningSubset]] = {} @@ -1499,7 +1514,9 @@ def crop_target(self, subset: BaseSubset, image, face_cx, face_cy, face_w, face_ nh = int(height * scale + 0.5) nw = int(width * scale + 0.5) assert nh >= self.height and nw >= self.width, f"internal error. 
small scale {scale}, {width}*{height}" - image = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_AREA) + interpolation = get_cv2_interpolation(subset.resize_interpolation if subset.resize_interpolation is not None else self.resize_interpolation) + logger.info(f"Interpolation: {interpolation}") + image = cv2.resize(image, (nw, nh), interpolation=interpolation if interpolation is not None else cv2.INTER_AREA) face_cx = int(face_cx * scale + 0.5) face_cy = int(face_cy * scale + 0.5) height, width = nh, nw @@ -1596,7 +1613,7 @@ def __getitem__(self, index): if self.enable_bucket: img, original_size, crop_ltrb = trim_and_resize_if_required( - subset.random_crop, img, image_info.bucket_reso, image_info.resized_size + subset.random_crop, img, image_info.bucket_reso, image_info.resized_size, resize_interpolation=image_info.resize_interpolation ) else: if face_cx > 0: # 顔位置情報あり @@ -1857,8 +1874,9 @@ def __init__( debug_dataset: bool, validation_split: float, validation_seed: Optional[int], + resize_interpolation: Optional[str], ) -> None: - super().__init__(resolution, network_multiplier, debug_dataset) + super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) assert resolution is not None, f"resolution is required / resolution(解像度)指定は必須です" @@ -2087,6 +2105,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): for img_path, caption, size in zip(img_paths, captions, sizes): info = ImageInfo(img_path, num_repeats, caption, subset.is_reg, img_path) + info.resize_interpolation = subset.resize_interpolation if subset.resize_interpolation is not None else self.resize_interpolation if size is not None: info.image_size = size if subset.is_reg: @@ -2370,8 +2389,9 @@ def __init__( debug_dataset: bool, validation_split: float, validation_seed: Optional[int], + resize_interpolation: Optional[str] = None, ) -> None: - super().__init__(resolution, network_multiplier, debug_dataset) + super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) db_subsets = [] for subset in subsets: @@ -2403,6 +2423,7 @@ def __init__( subset.caption_suffix, subset.token_warmup_min, subset.token_warmup_step, + resize_interpolation=subset.resize_interpolation, ) db_subsets.append(db_subset) @@ -2421,6 +2442,7 @@ def __init__( debug_dataset, validation_split, validation_seed, + resize_interpolation, ) # config_util等から参照される値をいれておく(若干微妙なのでなんとかしたい) @@ -2430,6 +2452,7 @@ def __init__( self.num_reg_images = self.dreambooth_dataset_delegate.num_reg_images self.validation_split = validation_split self.validation_seed = validation_seed + self.resize_interpolation = resize_interpolation # assert all conditioning data exists missing_imgs = [] @@ -2517,8 +2540,10 @@ def __getitem__(self, index): assert ( cond_img.shape[0] == original_size_hw[0] and cond_img.shape[1] == original_size_hw[1] ), f"size of conditioning image is not match / 画像サイズが合いません: {image_info.absolute_path}" + + interpolation = get_cv2_interpolation(self.resize_interpolation) cond_img = cv2.resize( - cond_img, image_info.resized_size, interpolation=cv2.INTER_AREA + cond_img, image_info.resized_size, interpolation=interpolation if interpolation is not None else cv2.INTER_AREA ) # INTER_AREAでやりたいのでcv2でリサイズ # TODO support random crop @@ -2930,7 +2955,7 @@ def load_image(image_path, alpha=False): # 画像を読み込む。戻り値はnumpy.ndarray,(original width, original height),(crop left, crop top, crop right, crop bottom) def trim_and_resize_if_required( - random_crop: bool, image: np.ndarray, reso, resized_size: Tuple[int, int] + 
random_crop: bool, image: np.ndarray, reso, resized_size: Tuple[int, int], resize_interpolation: Optional[str] = None ) -> Tuple[np.ndarray, Tuple[int, int], Tuple[int, int, int, int]]: image_height, image_width = image.shape[0:2] original_size = (image_width, image_height) # size before resize @@ -2938,7 +2963,8 @@ def trim_and_resize_if_required( if image_width != resized_size[0] or image_height != resized_size[1]: # リサイズする if image_width > resized_size[0] and image_height > resized_size[1]: - image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ + interpolation = get_cv2_interpolation(resize_interpolation) + image = cv2.resize(image, resized_size, interpolation=interpolation if interpolation is not None else cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ else: image = pil_resize(image, resized_size) @@ -2985,7 +3011,7 @@ def load_images_and_masks_for_caching( for info in image_infos: image = load_image(info.absolute_path, use_alpha_mask) if info.image is None else np.array(info.image, np.uint8) # TODO 画像のメタデータが壊れていて、メタデータから割り当てたbucketと実際の画像サイズが一致しない場合があるのでチェック追加要 - image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size) + image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size, resize_interpolation=info.resize_interpolation) original_sizes.append(original_size) crop_ltrbs.append(crop_ltrb) @@ -3026,7 +3052,7 @@ def cache_batch_latents( for info in image_infos: image = load_image(info.absolute_path, use_alpha_mask) if info.image is None else np.array(info.image, np.uint8) # TODO 画像のメタデータが壊れていて、メタデータから割り当てたbucketと実際の画像サイズが一致しない場合があるのでチェック追加要 - image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size) + image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size, resize_interpolation=info.resize_interpolation) info.latents_original_size = original_size info.latents_crop_ltrb = crop_ltrb @@ -6533,3 +6559,29 @@ def moving_average(self) -> float: if losses == 0: return 0 return self.loss_total / losses + +def get_cv2_interpolation(interpolation: Optional[str]) -> Optional[int]: + """ + Convert interpolation ovalue to cv2 interpolation integer + """ + if interpolation is None: + return None + + if interpolation == "lanczos": + return cv2.INTER_LANCZOS4 + elif interpolation == "nearest": + return cv2.INTER_NEAREST + elif interpolation == "bilinear" or interpolation == "linear": + return cv2.INTER_LINEAR + elif interpolation == "bicubic" or interpolation == "cubic": + return cv2.INTER_CUBIC + elif interpolation == "area": + return cv2.INTER_AREA + else: + return None + +def validate_interpolation_fn(interpolation_str: str) -> bool: + """ + Check if a interpolation function is supported + """ + return interpolation_str in ["lanczos", "nearest", "bilinear", "linear", "bicubic", "cubic", "area"] From d0128d18be009c5e221db96d53b6045bdd5af04f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 14 Feb 2025 14:14:57 -0500 Subject: [PATCH 357/582] Add resize interpolation CLI option --- library/train_util.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index a07834adf..d41d1ff34 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4530,7 +4530,13 @@ def add_dataset_arguments( action="store_true", help="make bucket for each image without 
upscaling / 画像を拡大せずbucketを作成します", ) - + parser.add_argument( + "--resize_interpolation", + type=str, + default=None, + choices=["lanczos", "nearest", "bilinear", "linear", "bicubic", "cubic", "area"], + help="Resize interpolation when required. Default: area Options: lanczos, nearest, bilinear, bicubic, area / 必要に応じてサイズ補間を変更します。デフォルト: area オプション: lanczos, nearest, bilinear, bicubic, area", + ) parser.add_argument( "--token_warmup_min", type=int, From 7729c4c8f962d4f0b5fd73fb86399e73ab9cce8b Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 14 Feb 2025 14:18:53 -0500 Subject: [PATCH 358/582] Add metadata --- train_network.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/train_network.py b/train_network.py index 674f1cb66..16c1774ed 100644 --- a/train_network.py +++ b/train_network.py @@ -973,6 +973,7 @@ def load_model_hook(models, input_dir): "ss_max_validation_steps": args.max_validation_steps, "ss_validate_every_n_epochs": args.validate_every_n_epochs, "ss_validate_every_n_steps": args.validate_every_n_steps, + "ss_resize_interpolation": train_dataset_group.resize_interpolation } self.update_metadata(metadata, args) # architecture specific metadata @@ -998,6 +999,7 @@ def load_model_hook(models, input_dir): "max_bucket_reso": dataset.max_bucket_reso, "tag_frequency": dataset.tag_frequency, "bucket_info": dataset.bucket_info, + "resize_interpolation": dataset.resize_interpolation, } subsets_metadata = [] @@ -1015,6 +1017,7 @@ def load_model_hook(models, input_dir): "enable_wildcard": bool(subset.enable_wildcard), "caption_prefix": subset.caption_prefix, "caption_suffix": subset.caption_suffix, + "resize_interpolation": subset.resize_interpolation, } image_dir_or_metadata_file = None From 545425c13e855838781f0d0af24c4c5df992c87d Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 14 Feb 2025 14:24:25 -0500 Subject: [PATCH 359/582] Typo --- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index d41d1ff34..94145cad7 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -6568,7 +6568,7 @@ def moving_average(self) -> float: def get_cv2_interpolation(interpolation: Optional[str]) -> Optional[int]: """ - Convert interpolation ovalue to cv2 interpolation integer + Convert interpolation value to cv2 interpolation integer """ if interpolation is None: return None From ca1c129ffd2439dec3f00a6a78a5cc5858d08cb5 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 14 Feb 2025 16:18:24 -0500 Subject: [PATCH 360/582] Fix metadata --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 16c1774ed..c625eee6d 100644 --- a/train_network.py +++ b/train_network.py @@ -973,7 +973,7 @@ def load_model_hook(models, input_dir): "ss_max_validation_steps": args.max_validation_steps, "ss_validate_every_n_epochs": args.validate_every_n_epochs, "ss_validate_every_n_steps": args.validate_every_n_steps, - "ss_resize_interpolation": train_dataset_group.resize_interpolation + "ss_resize_interpolation": args.resize_interpolation } self.update_metadata(metadata, args) # architecture specific metadata From 7f2747176bb01757b95e086c548f2bcf8f689005 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 19 Feb 2025 14:20:24 -0500 Subject: [PATCH 361/582] Use resize_image where resizing is required --- finetune/tag_images_by_wd14_tagger.py | 7 +- library/train_util.py | 45 ++---------- library/utils.py | 101 +++++++++++++++++++++++++- 
tools/detect_face_rotate.py | 11 +-- tools/resize_images_to_resolution.py | 20 +---- 5 files changed, 113 insertions(+), 71 deletions(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index cbc3d2d6b..406f12f29 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -11,7 +11,7 @@ from tqdm import tqdm import library.train_util as train_util -from library.utils import setup_logging, pil_resize +from library.utils import setup_logging, resize_image setup_logging() import logging @@ -42,10 +42,7 @@ def preprocess_image(image): pad_t = pad_y // 2 image = np.pad(image, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode="constant", constant_values=255) - if size > IMAGE_SIZE: - image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE), cv2.INTER_AREA) - else: - image = pil_resize(image, (IMAGE_SIZE, IMAGE_SIZE)) + image = resize_image(image, image.shape[0], image.shape[1], IMAGE_SIZE, IMAGE_SIZE) image = image.astype(np.float32) return image diff --git a/library/train_util.py b/library/train_util.py index 94145cad7..46219d4fc 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -84,7 +84,7 @@ import library.huggingface_util as huggingface_util import library.sai_model_spec as sai_model_spec import library.deepspeed_utils as deepspeed_utils -from library.utils import setup_logging, pil_resize +from library.utils import setup_logging, resize_image setup_logging() import logging @@ -1514,9 +1514,7 @@ def crop_target(self, subset: BaseSubset, image, face_cx, face_cy, face_w, face_ nh = int(height * scale + 0.5) nw = int(width * scale + 0.5) assert nh >= self.height and nw >= self.width, f"internal error. small scale {scale}, {width}*{height}" - interpolation = get_cv2_interpolation(subset.resize_interpolation if subset.resize_interpolation is not None else self.resize_interpolation) - logger.info(f"Interpolation: {interpolation}") - image = cv2.resize(image, (nw, nh), interpolation=interpolation if interpolation is not None else cv2.INTER_AREA) + image = resize_image(image, width, height, nw, nh, subset.resize_interpolation) face_cx = int(face_cx * scale + 0.5) face_cy = int(face_cy * scale + 0.5) height, width = nh, nw @@ -2541,10 +2539,7 @@ def __getitem__(self, index): cond_img.shape[0] == original_size_hw[0] and cond_img.shape[1] == original_size_hw[1] ), f"size of conditioning image is not match / 画像サイズが合いません: {image_info.absolute_path}" - interpolation = get_cv2_interpolation(self.resize_interpolation) - cond_img = cv2.resize( - cond_img, image_info.resized_size, interpolation=interpolation if interpolation is not None else cv2.INTER_AREA - ) # INTER_AREAでやりたいのでcv2でリサイズ + cond_img = resize_image(cond_img, original_size_hw[1], original_size_hw[0], target_size_hw[1], target_size_hw[0], self.resize_interpolation) # TODO support random crop # 現在サポートしているcropはrandomではなく中央のみ @@ -2558,7 +2553,7 @@ def __getitem__(self, index): # ), f"image size is small / 画像サイズが小さいようです: {image_info.absolute_path}" # resize to target if cond_img.shape[0] != target_size_hw[0] or cond_img.shape[1] != target_size_hw[1]: - cond_img = pil_resize(cond_img, (int(target_size_hw[1]), int(target_size_hw[0]))) + cond_img = resize_image(cond_img, cond_img.shape[0], cond_img.shape[1], target_size_hw[1], target_size_hw[0], self.resize_interpolation) if flipped: cond_img = cond_img[:, ::-1, :].copy() # copy to avoid negative stride @@ -2961,12 +2956,7 @@ def trim_and_resize_if_required( original_size = (image_width, image_height) 
# size before resize if image_width != resized_size[0] or image_height != resized_size[1]: - # リサイズする - if image_width > resized_size[0] and image_height > resized_size[1]: - interpolation = get_cv2_interpolation(resize_interpolation) - image = cv2.resize(image, resized_size, interpolation=interpolation if interpolation is not None else cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ - else: - image = pil_resize(image, resized_size) + image = resize_image(image, image_width, image_height, resized_size[0], resized_size[1], resize_interpolation) image_height, image_width = image.shape[0:2] @@ -6566,28 +6556,3 @@ def moving_average(self) -> float: return 0 return self.loss_total / losses -def get_cv2_interpolation(interpolation: Optional[str]) -> Optional[int]: - """ - Convert interpolation value to cv2 interpolation integer - """ - if interpolation is None: - return None - - if interpolation == "lanczos": - return cv2.INTER_LANCZOS4 - elif interpolation == "nearest": - return cv2.INTER_NEAREST - elif interpolation == "bilinear" or interpolation == "linear": - return cv2.INTER_LINEAR - elif interpolation == "bicubic" or interpolation == "cubic": - return cv2.INTER_CUBIC - elif interpolation == "area": - return cv2.INTER_AREA - else: - return None - -def validate_interpolation_fn(interpolation_str: str) -> bool: - """ - Check if a interpolation function is supported - """ - return interpolation_str in ["lanczos", "nearest", "bilinear", "linear", "bicubic", "cubic", "area"] diff --git a/library/utils.py b/library/utils.py index 07079c6d9..9156864ee 100644 --- a/library/utils.py +++ b/library/utils.py @@ -16,7 +16,6 @@ import numpy as np from safetensors.torch import load_file - def fire_in_thread(f, *args, **kwargs): threading.Thread(target=f, args=args, kwargs=kwargs).start() @@ -89,6 +88,8 @@ def setup_logging(args=None, log_level=None, reset=False): logger = logging.getLogger(__name__) logger.info(msg_init) +setup_logging() +logger = logging.getLogger(__name__) # endregion @@ -377,7 +378,7 @@ def load_safetensors( # region Image utils -def pil_resize(image, size, interpolation=Image.LANCZOS): +def pil_resize(image, size, interpolation): has_alpha = image.shape[2] == 4 if len(image.shape) == 3 else False if has_alpha: @@ -385,7 +386,7 @@ def pil_resize(image, size, interpolation=Image.LANCZOS): else: pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) - resized_pil = pil_image.resize(size, interpolation) + resized_pil = pil_image.resize(size, resample=interpolation) # Convert back to cv2 format if has_alpha: @@ -396,6 +397,100 @@ def pil_resize(image, size, interpolation=Image.LANCZOS): return resized_cv2 +def resize_image(image: np.ndarray, width: int, height: int, resized_width: int, resized_height: int, resize_interpolation: Optional[str] = None): + """ + Resize image with resize interpolation. 
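# Illustrative usage sketch (not part of this patch) for the resize_image() helper
# introduced above, assuming it is imported from library.utils. The dummy image and
# target sizes are made-up values.
import numpy as np
from library.utils import resize_image

img = np.zeros((768, 1024, 3), dtype=np.uint8)              # H, W, C
down = resize_image(img, 1024, 768, 512, 384)               # no name given -> library default
up = resize_image(img, 1024, 768, 2048, 1536, "lanczos")    # explicit interpolation name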
Default interpolation to AREA if image is smaller, else LANCZOS + + Args: + image: numpy.ndarray + width: int Original image width + height: int Original image height + resized_width: int Resized image width + resized_height: int Resized image height + resize_interpolation: Optional[str] Resize interpolation method "lanczos", "area", "bilinear", "bicubic", "nearest", "box" + + Returns: + image + """ + interpolation = get_cv2_interpolation(resize_interpolation) + resized_size = (resized_width, resized_height) + if width > resized_width and height > resized_width: + image = cv2.resize(image, resized_size, interpolation=interpolation if interpolation is not None else cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ + logger.debug(f"resize image using {resize_interpolation}") + else: + image = cv2.resize(image, resized_size, interpolation=interpolation if interpolation is not None else cv2.INTER_LANCZOS4) # INTER_AREAでやりたいのでcv2でリサイズ + logger.debug(f"resize image using {resize_interpolation}") + + return image + + +def get_cv2_interpolation(interpolation: Optional[str]) -> Optional[int]: + """ + Convert interpolation value to cv2 interpolation integer + + https://docs.opencv.org/3.4/da/d54/group__imgproc__transform.html#ga5bb5a1fea74ea38e1a5445ca803ff121 + """ + if interpolation is None: + return None + + if interpolation == "lanczos" or interpolation == "lanczos4": + # Lanczos interpolation over 8x8 neighborhood + return cv2.INTER_LANCZOS4 + elif interpolation == "nearest": + # Bit exact nearest neighbor interpolation. This will produce same results as the nearest neighbor method in PIL, scikit-image or Matlab. + return cv2.INTER_NEAREST_EXACT + elif interpolation == "bilinear" or interpolation == "linear": + # bilinear interpolation + return cv2.INTER_LINEAR + elif interpolation == "bicubic" or interpolation == "cubic": + # bicubic interpolation + return cv2.INTER_CUBIC + elif interpolation == "area": + # resampling using pixel area relation. It may be a preferred method for image decimation, as it gives moire'-free results. But when the image is zoomed, it is similar to the INTER_NEAREST method. + return cv2.INTER_AREA + elif interpolation == "box": + # resampling using pixel area relation. It may be a preferred method for image decimation, as it gives moire'-free results. But when the image is zoomed, it is similar to the INTER_NEAREST method. + return cv2.INTER_AREA + else: + return None + +def get_pil_interpolation(interpolation: Optional[str]) -> Optional[Image.Resampling]: + """ + Convert interpolation value to PIL interpolation + + https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-filters + """ + if interpolation is None: + return None + + if interpolation == "lanczos": + return Image.Resampling.LANCZOS + elif interpolation == "nearest": + # Pick one nearest pixel from the input image. Ignore all other input pixels. + return Image.Resampling.NEAREST + elif interpolation == "bilinear" or interpolation == "linear": + # For resize calculate the output pixel value using linear interpolation on all pixels that may contribute to the output value. For other transformations linear interpolation over a 2x2 environment in the input image is used. + return Image.Resampling.BILINEAR + elif interpolation == "bicubic" or interpolation == "cubic": + # For resize calculate the output pixel value using cubic interpolation on all pixels that may contribute to the output value. For other transformations cubic interpolation over a 4x4 environment in the input image is used. 
+ return Image.Resampling.BICUBIC + elif interpolation == "area": + # Image.Resampling.BOX may be more appropriate if upscaling + # Area interpolation is related to cv2.INTER_AREA + # Produces a sharper image than Resampling.BILINEAR, doesn’t have dislocations on local level like with Resampling.BOX. + return Image.Resampling.HAMMING + elif interpolation == "box": + # Each pixel of source image contributes to one pixel of the destination image with identical weights. For upscaling is equivalent of Resampling.NEAREST. + return Image.Resampling.BOX + else: + return None + +def validate_interpolation_fn(interpolation_str: str) -> bool: + """ + Check if a interpolation function is supported + """ + return interpolation_str in ["lanczos", "nearest", "bilinear", "linear", "bicubic", "cubic", "area", "box"] + # endregion # TODO make inf_utils.py diff --git a/tools/detect_face_rotate.py b/tools/detect_face_rotate.py index d2a4d9cfb..16fd7d0b7 100644 --- a/tools/detect_face_rotate.py +++ b/tools/detect_face_rotate.py @@ -15,7 +15,7 @@ from anime_face_detector import create_detector from tqdm import tqdm import numpy as np -from library.utils import setup_logging, pil_resize +from library.utils import setup_logging, resize_image setup_logging() import logging logger = logging.getLogger(__name__) @@ -170,12 +170,9 @@ def process(args): scale = max(cur_crop_width / w, cur_crop_height / h) if scale != 1.0: - w = int(w * scale + .5) - h = int(h * scale + .5) - if scale < 1.0: - face_img = cv2.resize(face_img, (w, h), interpolation=cv2.INTER_AREA) - else: - face_img = pil_resize(face_img, (w, h)) + rw = int(w * scale + .5) + rh = int(h * scale + .5) + face_img = resize_image(face_img, w, h, rw, rh) cx = int(cx * scale + .5) cy = int(cy * scale + .5) fw = int(fw * scale + .5) diff --git a/tools/resize_images_to_resolution.py b/tools/resize_images_to_resolution.py index 0f9e00b1e..f5fbae2bb 100644 --- a/tools/resize_images_to_resolution.py +++ b/tools/resize_images_to_resolution.py @@ -6,7 +6,7 @@ import math from PIL import Image import numpy as np -from library.utils import setup_logging, pil_resize +from library.utils import setup_logging, resize_image setup_logging() import logging logger = logging.getLogger(__name__) @@ -22,14 +22,6 @@ def resize_images(src_img_folder, dst_img_folder, max_resolution="512x512", divi if not os.path.exists(dst_img_folder): os.makedirs(dst_img_folder) - # Select interpolation method - if interpolation == 'lanczos4': - pil_interpolation = Image.LANCZOS - elif interpolation == 'cubic': - pil_interpolation = Image.BICUBIC - else: - cv2_interpolation = cv2.INTER_AREA - # Iterate through all files in src_img_folder img_exts = (".png", ".jpg", ".jpeg", ".webp", ".bmp") # copy from train_util.py for filename in os.listdir(src_img_folder): @@ -63,11 +55,7 @@ def resize_images(src_img_folder, dst_img_folder, max_resolution="512x512", divi new_height = int(img.shape[0] * math.sqrt(scale_factor)) new_width = int(img.shape[1] * math.sqrt(scale_factor)) - # Resize image - if cv2_interpolation: - img = cv2.resize(img, (new_width, new_height), interpolation=cv2_interpolation) - else: - img = pil_resize(img, (new_width, new_height), interpolation=pil_interpolation) + img = resize_image(img, img.shape[0], img.shape[1], new_height, new_width, interpolation) else: new_height, new_width = img.shape[0:2] @@ -113,8 +101,8 @@ def setup_parser() -> argparse.ArgumentParser: help='Maximum resolution(s) in the format "512x512,384x384, etc, etc" / 最大画像サイズをカンマ区切りで指定 ("512x512,384x384, etc, etc" など)', 
default="512x512,384x384,256x256,128x128") parser.add_argument('--divisible_by', type=int, help='Ensure new dimensions are divisible by this value / リサイズ後の画像のサイズをこの値で割り切れるようにします', default=1) - parser.add_argument('--interpolation', type=str, choices=['area', 'cubic', 'lanczos4'], - default='area', help='Interpolation method for resizing / リサイズ時の補完方法') + parser.add_argument('--interpolation', type=str, choices=['area', 'cubic', 'lanczos4', 'nearest', 'linear', 'box'], + default=None, help='Interpolation method for resizing. Default to area if smaller, lanczos if larger / サイズ変更の補間方法。小さい場合はデフォルトでエリア、大きい場合はランチョスになります。') parser.add_argument('--save_as_png', action='store_true', help='Save as png format / png形式で保存') parser.add_argument('--copy_associated_files', action='store_true', help='Copy files with same base name to images (captions etc) / 画像と同じファイル名(拡張子を除く)のファイルもコピーする') From efb2a128cd0d2c6340a21bf544e77853a20b3453 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 21 Feb 2025 22:07:35 +0900 Subject: [PATCH 362/582] fix wandb val logging --- library/train_util.py | 57 +++++++++++++++------------------ train_network.py | 73 ++++++++++++++++++++++++++++++++----------- 2 files changed, 80 insertions(+), 50 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 258701982..1f591c422 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -13,17 +13,7 @@ import shutil import time import typing -from typing import ( - Any, - Callable, - Dict, - List, - NamedTuple, - Optional, - Sequence, - Tuple, - Union -) +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs, PartialState import glob import math @@ -146,12 +136,13 @@ TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX = "_te_outputs.npz" TEXT_ENCODER_OUTPUTS_CACHE_SUFFIX_SD3 = "_sd3_te.npz" + def split_train_val( - paths: List[str], + paths: List[str], sizes: List[Optional[Tuple[int, int]]], - is_training_dataset: bool, - validation_split: float, - validation_seed: int | None + is_training_dataset: bool, + validation_split: float, + validation_seed: int | None, ) -> Tuple[List[str], List[Optional[Tuple[int, int]]]]: """ Split the dataset into train and validation @@ -1842,7 +1833,7 @@ def get_item_for_caching(self, bucket, bucket_batch_size, image_index): class DreamBoothDataset(BaseDataset): IMAGE_INFO_CACHE_FILE = "metadata_cache.json" - # The is_training_dataset defines the type of dataset, training or validation + # The is_training_dataset defines the type of dataset, training or validation # if is_training_dataset is True -> training dataset # if is_training_dataset is False -> validation dataset def __init__( @@ -1981,29 +1972,25 @@ def load_dreambooth_dir(subset: DreamBoothSubset): logger.info(f"set image size from cache files: {size_set_count}/{len(img_paths)}") # We want to create a training and validation split. This should be improved in the future - # to allow a clearer distinction between training and validation. This can be seen as a + # to allow a clearer distinction between training and validation. 
This can be seen as a # short-term solution to limit what is necessary to implement validation datasets - # + # # We split the dataset for the subset based on if we are doing a validation split - # The self.is_training_dataset defines the type of dataset, training or validation + # The self.is_training_dataset defines the type of dataset, training or validation # if self.is_training_dataset is True -> training dataset # if self.is_training_dataset is False -> validation dataset if self.validation_split > 0.0: - # For regularization images we do not want to split this dataset. + # For regularization images we do not want to split this dataset. if subset.is_reg is True: # Skip any validation dataset for regularization images if self.is_training_dataset is False: img_paths = [] sizes = [] - # Otherwise the img_paths remain as original img_paths and no split + # Otherwise the img_paths remain as original img_paths and no split # required for training images dataset of regularization images else: img_paths, sizes = split_train_val( - img_paths, - sizes, - self.is_training_dataset, - self.validation_split, - self.validation_seed + img_paths, sizes, self.is_training_dataset, self.validation_split, self.validation_seed ) logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") @@ -2373,7 +2360,7 @@ def __init__( bucket_no_upscale: bool, debug_dataset: bool, validation_split: float, - validation_seed: Optional[int], + validation_seed: Optional[int], ) -> None: super().__init__(resolution, network_multiplier, debug_dataset) @@ -2431,9 +2418,9 @@ def __init__( self.image_data = self.dreambooth_dataset_delegate.image_data self.batch_size = batch_size self.num_train_images = self.dreambooth_dataset_delegate.num_train_images - self.num_reg_images = self.dreambooth_dataset_delegate.num_reg_images + self.num_reg_images = self.dreambooth_dataset_delegate.num_reg_images self.validation_split = validation_split - self.validation_seed = validation_seed + self.validation_seed = validation_seed # assert all conditioning data exists missing_imgs = [] @@ -5952,7 +5939,9 @@ def get_timesteps(min_timestep: int, max_timestep: int, b_size: int, device: tor return timesteps -def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.IntTensor]: +def get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents: torch.FloatTensor +) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.IntTensor]: # Sample noise that we'll add to the latents noise = torch.randn_like(latents, device=latents.device) if args.noise_offset: @@ -6444,7 +6433,7 @@ def sample_image_inference( wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption -def init_trackers(accelerator: Accelerator, args: argparse.Namespace, default_tracker_name: str): +def init_trackers(accelerator: Accelerator, args: argparse.Namespace, default_tracker_name: str): """ Initialize experiment trackers with tracker specific behaviors """ @@ -6461,13 +6450,17 @@ def init_trackers(accelerator: Accelerator, args: argparse.Namespace, default_tr ) if "wandb" in [tracker.name for tracker in accelerator.trackers]: - import wandb + import wandb + wandb_tracker = accelerator.get_tracker("wandb", unwrap=True) # Define specific metrics to handle validation and epochs "steps" wandb_tracker.define_metric("epoch", hidden=True) wandb_tracker.define_metric("val_step", hidden=True) + 
wandb_tracker.define_metric("global_step", hidden=True) + + # endregion diff --git a/train_network.py b/train_network.py index 93558da45..ab5483deb 100644 --- a/train_network.py +++ b/train_network.py @@ -119,6 +119,45 @@ def generate_step_logs( return logs + def step_logging(self, accelerator: Accelerator, logs: dict, global_step: int, epoch: int): + self.accelerator_logging(accelerator, logs, global_step, global_step, epoch) + + def epoch_logging(self, accelerator: Accelerator, logs: dict, global_step: int, epoch: int): + self.accelerator_logging(accelerator, logs, epoch, global_step, epoch) + + def val_logging(self, accelerator: Accelerator, logs: dict, global_step: int, epoch: int, val_step: int): + self.accelerator_logging(accelerator, logs, global_step + val_step, global_step, epoch, val_step) + + def accelerator_logging( + self, accelerator: Accelerator, logs: dict, step_value: int, global_step: int, epoch: int, val_step: Optional[int] = None + ): + """ + step_value is for tensorboard, other values are for wandb + """ + tensorboard_tracker = None + wandb_tracker = None + other_trackers = [] + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + tensorboard_tracker = accelerator.get_tracker("tensorboard") + elif tracker.name == "wandb": + wandb_tracker = accelerator.get_tracker("wandb") + else: + other_trackers.append(accelerator.get_tracker(tracker.name)) + + if tensorboard_tracker is not None: + tensorboard_tracker.log(logs, step=step_value) + + if wandb_tracker is not None: + logs["global_step"] = global_step + logs["epoch"] = epoch + if val_step is not None: + logs["val_step"] = val_step + wandb_tracker.log(logs) + + for tracker in other_trackers: + tracker.log(logs, step=step_value) + def assert_extra_args( self, args, @@ -1412,7 +1451,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen logs = self.generate_step_logs( args, current_loss, avr_loss, lr_scheduler, lr_descriptions, optimizer, keys_scaled, mean_norm, maximum_norm ) - accelerator.log(logs, step=global_step) + self.step_logging(accelerator, logs, global_step, epoch + 1) # VALIDATION PER STEP: global_step is already incremented # for example, if validate_every_n_steps=100, validate at step 100, 200, 300, ... 
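The helpers above give each tracker the step semantics it expects: TensorBoard always receives an explicit scalar step (the global step for step and validation logs, the epoch for per-epoch logs), while wandb keeps its own internal step counter and instead gets "global_step", "epoch" and "val_step" as plain fields, which the define_metric(..., hidden=True) calls in init_trackers keep out of the auto-generated charts. A wandb dashboard can then plot the validation losses against the training step through a custom step metric. A minimal consumer-side sketch, assuming an offline run and an illustrative project name and value:

import wandb

run = wandb.init(project="sd-scripts-val-logging", mode="offline")  # hypothetical project; offline only for the example
run.define_metric("global_step", hidden=True)                       # mirrors the call made in init_trackers
run.define_metric("loss/validation/*", step_metric="global_step")   # plot validation metrics against the training step
run.log({"loss/validation/step_average": 0.123, "global_step": 500})
run.finish()

TensorBoard needs no such mapping because every logging helper already passes an explicit step value.
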
@@ -1428,7 +1467,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen disable=not accelerator.is_local_main_process, desc="validation steps", ) - val_ts_step = 0 + val_timesteps_step = 0 for val_step, batch in enumerate(val_dataloader): if val_step >= validation_steps: break @@ -1457,20 +1496,18 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen ) current_loss = loss.detach().item() - val_step_loss_recorder.add(epoch=epoch, step=val_ts_step, loss=current_loss) + val_step_loss_recorder.add(epoch=epoch, step=val_timesteps_step, loss=current_loss) val_progress_bar.update(1) val_progress_bar.set_postfix( {"val_avg_loss": val_step_loss_recorder.moving_average, "timestep": timestep} ) - if is_tracking: - logs = {"loss/validation/step_current": current_loss} - accelerator.log( - logs, step=global_step + val_ts_step - ) # a bit weird to log with global_step + val_ts_step + # if is_tracking: + # logs = {f"loss/validation/step_current_{timestep}": current_loss} + # self.val_logging(accelerator, logs, global_step, epoch + 1, val_step) self.on_validation_step_end(args, accelerator, network, text_encoders, unet, batch, weight_dtype) - val_ts_step += 1 + val_timesteps_step += 1 if is_tracking: loss_validation_divergence = val_step_loss_recorder.moving_average - loss_recorder.moving_average @@ -1478,7 +1515,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen "loss/validation/step_average": val_step_loss_recorder.moving_average, "loss/validation/step_divergence": loss_validation_divergence, } - accelerator.log(logs, step=global_step) + self.step_logging(accelerator, logs, global_step, epoch=epoch + 1) restore_rng_state(rng_states) args.min_timestep = original_args_min_timestep @@ -1507,7 +1544,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen desc="epoch validation steps", ) - val_ts_step = 0 + val_timesteps_step = 0 for val_step, batch in enumerate(val_dataloader): if val_step >= validation_steps: break @@ -1537,18 +1574,18 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen ) current_loss = loss.detach().item() - val_epoch_loss_recorder.add(epoch=epoch, step=val_ts_step, loss=current_loss) + val_epoch_loss_recorder.add(epoch=epoch, step=val_timesteps_step, loss=current_loss) val_progress_bar.update(1) val_progress_bar.set_postfix( {"val_epoch_avg_loss": val_epoch_loss_recorder.moving_average, "timestep": timestep} ) - if is_tracking: - logs = {"loss/validation/epoch_current": current_loss} - accelerator.log(logs, step=global_step + val_ts_step) + # if is_tracking: + # logs = {f"loss/validation/epoch_current_{timestep}": current_loss} + # self.val_logging(accelerator, logs, global_step, epoch + 1, val_step) self.on_validation_step_end(args, accelerator, network, text_encoders, unet, batch, weight_dtype) - val_ts_step += 1 + val_timesteps_step += 1 if is_tracking: avr_loss: float = val_epoch_loss_recorder.moving_average @@ -1557,7 +1594,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen "loss/validation/epoch_average": avr_loss, "loss/validation/epoch_divergence": loss_validation_divergence, } - accelerator.log(logs, step=epoch + 1) + self.epoch_logging(accelerator, logs, global_step, epoch + 1) restore_rng_state(rng_states) args.min_timestep = original_args_min_timestep @@ -1569,7 +1606,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen # END OF EPOCH if is_tracking: 
logs = {"loss/epoch_average": loss_recorder.moving_average} - accelerator.log(logs, step=epoch + 1) + self.epoch_logging(accelerator, logs, global_step, epoch + 1) accelerator.wait_for_everyone() From 025cca699ba0ee05b91d37e5b7779ec28d076620 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 23 Feb 2025 01:29:18 -0500 Subject: [PATCH 363/582] Fix samples, LoRA training. Add system prompt, use_flash_attn --- library/config_util.py | 6 + library/lumina_models.py | 198 +++++++++++++----------- library/lumina_train_util.py | 289 ++++++++++++++++++++++++++--------- library/lumina_util.py | 86 +++++------ library/sd3_train_utils.py | 259 ++++++++++++++++++++++++++----- library/strategy_base.py | 84 ++++++++-- library/strategy_lumina.py | 153 +++++++++++++++---- library/train_util.py | 21 ++- lumina_train_network.py | 173 +++++++++------------ train_network.py | 5 +- 10 files changed, 888 insertions(+), 386 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index a2e07dc6c..ca14dfb13 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -75,6 +75,7 @@ class BaseSubsetParams: custom_attributes: Optional[Dict[str, Any]] = None validation_seed: int = 0 validation_split: float = 0.0 + system_prompt: Optional[str] = None @dataclass @@ -106,6 +107,7 @@ class BaseDatasetParams: debug_dataset: bool = False validation_seed: Optional[int] = None validation_split: float = 0.0 + system_prompt: Optional[str] = None @dataclass @@ -196,6 +198,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "caption_prefix": str, "caption_suffix": str, "custom_attributes": dict, + "system_prompt": str, } # DO means DropOut DO_SUBSET_ASCENDABLE_SCHEMA = { @@ -241,6 +244,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "validation_split": float, "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int), "network_multiplier": float, + "system_prompt": str, } # options handled by argparse but not handled by user config @@ -526,6 +530,7 @@ def print_info(_datasets, dataset_type: str): batch_size: {dataset.batch_size} resolution: {(dataset.width, dataset.height)} enable_bucket: {dataset.enable_bucket} + system_prompt: {dataset.system_prompt} """) if dataset.enable_bucket: @@ -559,6 +564,7 @@ def print_info(_datasets, dataset_type: str): token_warmup_step: {subset.token_warmup_step}, alpha_mask: {subset.alpha_mask} custom_attributes: {subset.custom_attributes} + system_prompt: {subset.system_prompt} """), " ") if is_dreambooth: diff --git a/library/lumina_models.py b/library/lumina_models.py index 36c3b9796..f819b68fb 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -14,14 +14,19 @@ from dataclasses import dataclass from einops import rearrange -from flash_attn import flash_attn_varlen_func -from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + import torch from torch import Tensor from torch.utils.checkpoint import checkpoint import torch.nn as nn import torch.nn.functional as F +try: + from flash_attn import flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +except: + pass + try: from apex.normalization import FusedRMSNorm as RMSNorm except: @@ -75,7 +80,15 @@ def get_2b_config(cls) -> "LuminaParams": @classmethod def get_7b_config(cls) -> "LuminaParams": """Returns the configuration for the 7B parameter model""" - return cls(patch_size=2, dim=4096, n_layers=32, n_heads=32, 
n_kv_heads=8, axes_dims=[64, 64, 64], axes_lens=[300, 512, 512]) + return cls( + patch_size=2, + dim=4096, + n_layers=32, + n_heads=32, + n_kv_heads=8, + axes_dims=[64, 64, 64], + axes_lens=[300, 512, 512], + ) class GradientCheckpointMixin(nn.Module): @@ -248,6 +261,7 @@ def __init__( n_heads: int, n_kv_heads: Optional[int], qk_norm: bool, + use_flash_attn=False, ): """ Initialize the Attention module. @@ -286,7 +300,7 @@ def __init__( else: self.q_norm = self.k_norm = nn.Identity() - self.flash_attn = False + self.use_flash_attn = use_flash_attn # self.attention_processor = xformers.ops.memory_efficient_attention self.attention_processor = F.scaled_dot_product_attention @@ -294,35 +308,63 @@ def __init__( def set_attention_processor(self, attention_processor): self.attention_processor = attention_processor - @staticmethod - def apply_rotary_emb( - x_in: torch.Tensor, - freqs_cis: torch.Tensor, - ) -> torch.Tensor: + def forward( + self, + x: Tensor, + x_mask: Tensor, + freqs_cis: Tensor, + ) -> Tensor: """ - Apply rotary embeddings to input tensors using the given frequency - tensor. + Args: + x: + x_mask: + freqs_cis: + """ + bsz, seqlen, _ = x.shape + dtype = x.dtype + + xq, xk, xv = torch.split( + self.qkv(x), + [ + self.n_local_heads * self.head_dim, + self.n_local_kv_heads * self.head_dim, + self.n_local_kv_heads * self.head_dim, + ], + dim=-1, + ) + xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + xq = self.q_norm(xq) + xk = self.k_norm(xk) + xq = apply_rope(xq, freqs_cis=freqs_cis) + xk = apply_rope(xk, freqs_cis=freqs_cis) + xq, xk = xq.to(dtype), xk.to(dtype) - This function applies rotary embeddings to the given query 'xq' and - key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The - input tensors are reshaped as complex numbers, and the frequency tensor - is reshaped for broadcasting compatibility. The resulting tensors - contain rotary embeddings and are returned as real tensors. + softmax_scale = math.sqrt(1 / self.head_dim) - Args: - x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings. - freqs_cis (torch.Tensor): Precomputed frequency tensor for complex - exponentials. + if self.use_flash_attn: + output = self.flash_attn(xq, xk, xv, x_mask, softmax_scale) + else: + n_rep = self.n_local_heads // self.n_local_kv_heads + if n_rep >= 1: + xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) + xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) - Returns: - Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor - and key tensor with rotary embeddings. 
- """ - with torch.autocast("cuda", enabled=False): - x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2)) - freqs_cis = freqs_cis.unsqueeze(2) - x_out = torch.view_as_real(x * freqs_cis).flatten(3) - return x_out.type_as(x_in) + output = ( + self.attention_processor( + xq.permute(0, 2, 1, 3), + xk.permute(0, 2, 1, 3), + xv.permute(0, 2, 1, 3), + attn_mask=x_mask.bool().view(bsz, 1, 1, seqlen).expand(-1, self.n_local_heads, seqlen, -1), + scale=softmax_scale, + ) + .permute(0, 2, 1, 3) + .to(dtype) + ) + + output = output.flatten(-2) + return self.out(output) # copied from huggingface modeling_llama.py def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): @@ -377,46 +419,17 @@ def _get_unpad_data(attention_mask): (max_seqlen_in_batch_q, max_seqlen_in_batch_k), ) - def forward( + def flash_attn( self, - x: Tensor, + q: Tensor, + k: Tensor, + v: Tensor, x_mask: Tensor, - freqs_cis: Tensor, + softmax_scale, ) -> Tensor: - """ - - Args: - x: - x_mask: - freqs_cis: - - Returns: - - """ - bsz, seqlen, _ = x.shape - dtype = x.dtype + bsz, seqlen, _, _ = q.shape - xq, xk, xv = torch.split( - self.qkv(x), - [ - self.n_local_heads * self.head_dim, - self.n_local_kv_heads * self.head_dim, - self.n_local_kv_heads * self.head_dim, - ], - dim=-1, - ) - xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) - xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - xq = self.q_norm(xq) - xk = self.k_norm(xk) - xq = JointAttention.apply_rotary_emb(xq, freqs_cis=freqs_cis) - xk = JointAttention.apply_rotary_emb(xk, freqs_cis=freqs_cis) - xq, xk = xq.to(dtype), xk.to(dtype) - - softmax_scale = math.sqrt(1 / self.head_dim) - - if self.flash_attn: + try: # begin var_len flash attn ( query_states, @@ -425,7 +438,7 @@ def forward( indices_q, cu_seq_lens, max_seq_lens, - ) = self._upad_input(xq, xk, xv, x_mask, seqlen) + ) = self._upad_input(q, k, v, x_mask, seqlen) cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens @@ -445,27 +458,12 @@ def forward( output = pad_input(attn_output_unpad, indices_q, bsz, seqlen) # end var_len_flash_attn - else: - n_rep = self.n_local_heads // self.n_local_kv_heads - if n_rep >= 1: - xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) - xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) - - output = ( - self.attention_processor( - xq.permute(0, 2, 1, 3), - xk.permute(0, 2, 1, 3), - xv.permute(0, 2, 1, 3), - attn_mask=x_mask.bool().view(bsz, 1, 1, seqlen).expand(-1, self.n_local_heads, seqlen, -1), - scale=softmax_scale, - ) - .permute(0, 2, 1, 3) - .to(dtype) + return output + except NameError as e: + raise RuntimeError( + f"Could not load flash attention. Please install flash_attn. / フラッシュアテンションを読み込めませんでした。flash_attn をインストールしてください。 / {e}" ) - output = output.flatten(-2) - return self.out(output) - def apply_rope( x_in: torch.Tensor, @@ -563,6 +561,7 @@ def __init__( norm_eps: float, qk_norm: bool, modulation=True, + use_flash_attn=False, ) -> None: """ Initialize a TransformerBlock. 
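The non-flash branch of JointAttention.forward above implements grouped-query attention by hand: the key and value tensors carry only n_local_kv_heads heads, so each is repeated n_rep = n_local_heads // n_local_kv_heads times before scaled_dot_product_attention. A self-contained sketch of that repeat, with shapes chosen to match the 7B config (32 query heads, 8 key/value heads); this is an illustration, not the model code itself:

import torch
import torch.nn.functional as F

bsz, seqlen, n_heads, n_kv_heads, head_dim = 2, 16, 32, 8, 64
n_rep = n_heads // n_kv_heads  # 4 query heads share each key/value head

q = torch.randn(bsz, seqlen, n_heads, head_dim)
k = torch.randn(bsz, seqlen, n_kv_heads, head_dim)
v = torch.randn(bsz, seqlen, n_kv_heads, head_dim)

# (b, s, n_kv, d) -> (b, s, n_kv, n_rep, d) -> (b, s, n_heads, d)
k = k.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
v = v.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)

# SDPA expects (b, heads, s, d); the scale mirrors softmax_scale = sqrt(1 / head_dim)
out = F.scaled_dot_product_attention(
    q.permute(0, 2, 1, 3),
    k.permute(0, 2, 1, 3),
    v.permute(0, 2, 1, 3),
    scale=head_dim**-0.5,
).permute(0, 2, 1, 3)

print(out.shape)  # torch.Size([2, 16, 32, 64])

The flash-attention branch reaches the same result through flash_attn_varlen_func on unpadded sequences, which is why the import is now wrapped in try/except and a RuntimeError is raised only when use_flash_attn is requested without the package installed.
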
@@ -585,7 +584,7 @@ def __init__( super().__init__() self.dim = dim self.head_dim = dim // n_heads - self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm) + self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, use_flash_attn=use_flash_attn) self.feed_forward = FeedForward( dim=dim, hidden_dim=4 * dim, @@ -711,7 +710,12 @@ def forward(self, x, c): class RopeEmbedder: - def __init__(self, theta: float = 10000.0, axes_dims: List[int] = [16, 56, 56], axes_lens: List[int] = [1, 512, 512]): + def __init__( + self, + theta: float = 10000.0, + axes_dims: List[int] = [16, 56, 56], + axes_lens: List[int] = [1, 512, 512], + ): super().__init__() self.theta = theta self.axes_dims = axes_dims @@ -750,6 +754,7 @@ def __init__( cap_feat_dim: int = 5120, axes_dims: List[int] = [16, 56, 56], axes_lens: List[int] = [1, 512, 512], + use_flash_attn=False, ) -> None: """ Initialize the NextDiT model. @@ -803,6 +808,7 @@ def __init__( norm_eps, qk_norm, modulation=False, + use_flash_attn=use_flash_attn, ) for layer_id in range(n_refiner_layers) ] @@ -828,6 +834,7 @@ def __init__( norm_eps, qk_norm, modulation=True, + use_flash_attn=use_flash_attn, ) for layer_id in range(n_refiner_layers) ] @@ -848,6 +855,7 @@ def __init__( ffn_dim_multiplier, norm_eps, qk_norm, + use_flash_attn=use_flash_attn, ) for layer_id in range(n_layers) ] @@ -988,8 +996,20 @@ def patchify_and_embed( freqs_cis = self.rope_embedder(position_ids) # Create separate rotary embeddings for captions and images - cap_freqs_cis = torch.zeros(bsz, encoder_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype) - img_freqs_cis = torch.zeros(bsz, image_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype) + cap_freqs_cis = torch.zeros( + bsz, + encoder_seq_len, + freqs_cis.shape[-1], + device=device, + dtype=freqs_cis.dtype, + ) + img_freqs_cis = torch.zeros( + bsz, + image_seq_len, + freqs_cis.shape[-1], + device=device, + dtype=freqs_cis.dtype, + ) for i, (cap_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)): cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len] diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 9dac9c9f2..414b2849c 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -1,21 +1,28 @@ +import inspect +import enum import argparse import math import os import numpy as np import time -from typing import Callable, Dict, List, Optional, Tuple, Any +from typing import Callable, Dict, List, Optional, Tuple, Any, Union import torch from torch import Tensor +from torchdiffeq import odeint from accelerate import Accelerator, PartialState from transformers import Gemma2Model from tqdm import tqdm from PIL import Image from safetensors.torch import save_file +from diffusers.schedulers.scheduling_heun_discrete import HeunDiscreteScheduler from library import lumina_models, lumina_util, strategy_base, strategy_lumina, train_util +from library.flux_models import AutoEncoder from library.device_utils import init_ipex, clean_memory_on_device from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler +from library.lumina_dpm_solver import NoiseScheduleFlow, DPM_Solver +import library.lumina_path as path init_ipex() @@ -162,12 +169,12 @@ def sample_image_inference( args: argparse.Namespace, nextdit: lumina_models.NextDiT, gemma2_model: Gemma2Model, - vae: torch.nn.Module, + vae: AutoEncoder, save_dir: str, prompt_dict: Dict[str, str], epoch: int, global_step: int, - sample_prompts_gemma2_outputs: List[Tuple[Tensor, 
Tensor, Tensor]], + sample_prompts_gemma2_outputs: dict[str, List[Tuple[Tensor, Tensor, Tensor]]], prompt_replacement: Optional[Tuple[str, str]] = None, controlnet=None, ): @@ -179,12 +186,12 @@ def sample_image_inference( args (argparse.Namespace): Arguments object nextdit (lumina_models.NextDiT): NextDiT model gemma2_model (Gemma2Model): Gemma2 model - vae (torch.nn.Module): VAE model + vae (AutoEncoder): VAE model save_dir (str): Directory to save images prompt_dict (Dict[str, str]): Prompt dictionary epoch (int): Epoch number steps (int): Number of steps to run - sample_prompts_gemma2_outputs (List[Tuple[Tensor, Tensor, Tensor]]): List of tuples containing gemma2 outputs + sample_prompts_gemma2_outputs (List[Tuple[Tensor, Tensor, Tensor]]): List of tuples containing Gemma 2 outputs prompt_replacement (Optional[Tuple[str, str]], optional): Replacement for positive and negative prompt. Defaults to None. Returns: @@ -192,16 +199,19 @@ def sample_image_inference( """ assert isinstance(prompt_dict, dict) # negative_prompt = prompt_dict.get("negative_prompt") - sample_steps = prompt_dict.get("sample_steps", 38) - width = prompt_dict.get("width", 1024) - height = prompt_dict.get("height", 1024) - guidance_scale: int = prompt_dict.get("scale", 3.5) - seed: int = prompt_dict.get("seed", None) + sample_steps = int(prompt_dict.get("sample_steps", 38)) + width = int(prompt_dict.get("width", 1024)) + height = int(prompt_dict.get("height", 1024)) + guidance_scale = float(prompt_dict.get("scale", 3.5)) + seed = prompt_dict.get("seed", None) controlnet_image = prompt_dict.get("controlnet_image") prompt: str = prompt_dict.get("prompt", "") negative_prompt: str = prompt_dict.get("negative_prompt", "") # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) + seed = int(seed) if seed is not None else None + assert seed is None or seed > 0, f"Invalid seed {seed}" + if prompt_replacement is not None: prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) if negative_prompt is not None: @@ -213,10 +223,10 @@ def sample_image_inference( # if negative_prompt is None: # negative_prompt = "" - height = max(64, height - height % 16) # round to divisible by 16 - width = max(64, width - width % 16) # round to divisible by 16 + height = max(64, height - height % 8) # round to divisible by 8 + width = max(64, width - width % 8) # round to divisible by 8 logger.info(f"prompt: {prompt}") - # logger.info(f"negative_prompt: {negative_prompt}") + logger.info(f"negative_prompt: {negative_prompt}") logger.info(f"height: {height}") logger.info(f"width: {width}") logger.info(f"sample_steps: {sample_steps}") @@ -232,46 +242,51 @@ def sample_image_inference( assert isinstance(tokenize_strategy, strategy_lumina.LuminaTokenizeStrategy) assert isinstance(encoding_strategy, strategy_lumina.LuminaTextEncodingStrategy) - gemma2_conds = [] + system_prompt = args.system_prompt or "" + + # Apply system prompt to prompts + prompt = system_prompt + prompt + negative_prompt = system_prompt + negative_prompt + + # Get sample prompts from cache if sample_prompts_gemma2_outputs and prompt in sample_prompts_gemma2_outputs: gemma2_conds = sample_prompts_gemma2_outputs[prompt] logger.info(f"Using cached Gemma2 outputs for prompt: {prompt}") + + if sample_prompts_gemma2_outputs and negative_prompt in sample_prompts_gemma2_outputs: + neg_gemma2_conds = sample_prompts_gemma2_outputs[negative_prompt] + logger.info(f"Using cached Gemma2 outputs for negative prompt: {negative_prompt}") + + # Load sample prompts 
from Gemma 2 if gemma2_model is not None: logger.info(f"Encoding prompt with Gemma2: {prompt}") tokens_and_masks = tokenize_strategy.tokenize(prompt) - encoded_gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) + gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) - # if gemma2_conds is not cached, use encoded_gemma2_conds - if len(gemma2_conds) == 0: - gemma2_conds = encoded_gemma2_conds - else: - # if encoded_gemma2_conds is not None, update cached gemma2_conds - for i in range(len(encoded_gemma2_conds)): - if encoded_gemma2_conds[i] is not None: - gemma2_conds[i] = encoded_gemma2_conds[i] + tokens_and_masks = tokenize_strategy.tokenize(negative_prompt) + neg_gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) # Unpack Gemma2 outputs gemma2_hidden_states, input_ids, gemma2_attn_mask = gemma2_conds + neg_gemma2_hidden_states, neg_input_ids, neg_gemma2_attn_mask = neg_gemma2_conds # sample image weight_dtype = vae.dtype # TOFO give dtype as argument latent_height = height // 8 latent_width = width // 8 + latent_channels = 16 noise = torch.randn( 1, - 16, + latent_channels, latent_height, latent_width, device=accelerator.device, dtype=weight_dtype, generator=generator, ) - # Prompts are paired positive/negative - noise = noise.repeat(gemma2_attn_mask.shape[0], 1, 1, 1) - timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) - # img_ids = lumina_util.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype) - gemma2_attn_mask = gemma2_attn_mask.to(accelerator.device) + scheduler = FlowMatchEulerDiscreteScheduler(shift=6.0, use_karras_sigmas=True) + timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=sample_steps) # if controlnet_image is not None: # controlnet_image = Image.open(controlnet_image).convert("RGB") @@ -280,16 +295,25 @@ def sample_image_inference( # controlnet_image = controlnet_image.permute(2, 0, 1).unsqueeze(0).to(weight_dtype).to(accelerator.device) with accelerator.autocast(): - x = denoise(nextdit, noise, gemma2_hidden_states, gemma2_attn_mask, timesteps=timesteps, guidance=guidance_scale) - - # x = lumina_util.unpack_latents(x, packed_latent_height, packed_latent_width) + x = denoise( + scheduler, + nextdit, + noise, + gemma2_hidden_states, + gemma2_attn_mask.to(accelerator.device), + neg_gemma2_hidden_states, + neg_gemma2_attn_mask.to(accelerator.device), + timesteps=timesteps, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + ) - # latent to image + # Latent to image clean_memory_on_device(accelerator.device) org_vae_device = vae.device # will be on cpu vae.to(accelerator.device) # distributed_state.device is same as accelerator.device with accelerator.autocast(): - x = vae.decode(x) + x = vae.decode((x / vae.scale_factor) + vae.shift_factor) vae.to(org_vae_device) clean_memory_on_device(accelerator.device) @@ -317,30 +341,25 @@ def sample_image_inference( wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption -def time_shift(mu: float, sigma: float, t: Tensor): - """ - Get time shift - - Args: - mu (float): mu value. - sigma (float): sigma value. - t (Tensor): timestep. 
- - Return: - float: time shift - """ - return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) +def time_shift(mu: float, sigma: float, t: torch.Tensor): + # the following implementation was original for t=0: clean / t=1: noise + # Since we adopt the reverse, the 1-t operations are needed + t = 1 - t + t = math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + t = 1 - t + return t -def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]: +def get_lin_function(x1: float = 256, x2: float = 4096, y1: float = 0.5, y2: float = 1.15) -> Callable[[float], float]: """ Get linear function Args: - x1 (float, optional): x1 value. Defaults to 256. - y1 (float, optional): y1 value. Defaults to 0.5. - x2 (float, optional): x2 value. Defaults to 4096. - y2 (float, optional): y2 value. Defaults to 1.15. + image_seq_len, + x1 base_seq_len: int = 256, + y2 max_seq_len: int = 4096, + y1 base_shift: float = 0.5, + y2 max_shift: float = 1.15, Return: Callable[[float], float]: linear function @@ -370,51 +389,164 @@ def get_schedule( Return: List[float]: timesteps schedule """ - # extra step for zero - timesteps = torch.linspace(1, 0, num_steps + 1) + timesteps = torch.linspace(1, 1 / num_steps, num_steps) # shifting the schedule to favor high timesteps for higher signal images if shift: # eastimate mu based on linear estimation between two points - mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len) + mu = get_lin_function(y1=base_shift, y2=max_shift, x1=256, x2=4096)(image_seq_len) timesteps = time_shift(mu, 1.0, timesteps) return timesteps.tolist() +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +) -> Tuple[torch.Tensor, int]: + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + def denoise( - model: lumina_models.NextDiT, img: Tensor, txt: Tensor, txt_mask: Tensor, timesteps: List[float], guidance: float = 4.0 + scheduler, + model: lumina_models.NextDiT, + img: Tensor, + txt: Tensor, + txt_mask: Tensor, + neg_txt: Tensor, + neg_txt_mask: Tensor, + timesteps: Union[List[float], torch.Tensor], + num_inference_steps: int = 38, + guidance_scale: float = 4.0, + cfg_trunc_ratio: float = 1.0, + cfg_normalization: bool = True, ): """ Denoise an image using the NextDiT model. Args: + scheduler (): + Noise scheduler model (lumina_models.NextDiT): The NextDiT model instance. - img (Tensor): The input image tensor. - txt (Tensor): The input text tensor. - txt_mask (Tensor): The input text mask tensor. - timesteps (List[float]): A list of timesteps for the denoising process. - guidance (float, optional): The guidance scale for the denoising process. Defaults to 4.0. + img (Tensor): + The input image latent tensor. + txt (Tensor): + The input text tensor. + txt_mask (Tensor): + The input text mask tensor. + neg_txt (Tensor): + The negative input txt tensor + neg_txt_mask (Tensor): + The negative input text mask tensor. + timesteps (List[Union[float, torch.FloatTensor]]): + A list of timesteps for the denoising process. + guidance_scale (float, optional): + The guidance scale for the denoising process. Defaults to 4.0. + cfg_trunc_ratio (float, optional): + The ratio of the timestep interval to apply normalization-based guidance scale. + cfg_normalization (bool, optional): + Whether to apply normalization-based guidance scale. 
Returns: - img (Tensor): Denoised tensor + img (Tensor): Denoised latent tensor """ - for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): - t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) - # model.prepare_block_swap_before_forward() - # block_samples = None - # block_single_samples = None - pred = model.forward_with_cfg( - x=img, # image latents (B, C, H, W) - t=t_vec / 1000, # timesteps需要除以1000来匹配模型预期 + + for i, t in enumerate(tqdm(timesteps)): + # compute whether apply classifier-free truncation on this timestep + do_classifier_free_truncation = (i + 1) / num_inference_steps > cfg_trunc_ratio + + # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image + current_timestep = 1 - t / scheduler.config.num_train_timesteps + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + current_timestep = current_timestep.expand(img.shape[0]).to(model.device) + + noise_pred_cond = model( + img, + current_timestep, cap_feats=txt, # Gemma2的hidden states作为caption features cap_mask=txt_mask.to(dtype=torch.int32), # Gemma2的attention mask - cfg_scale=guidance, ) - img = img + (t_prev - t_curr) * pred + if not do_classifier_free_truncation: + noise_pred_uncond = model( + img, + current_timestep, + cap_feats=neg_txt, # Gemma2的hidden states作为caption features + cap_mask=neg_txt_mask.to(dtype=torch.int32), # Gemma2的attention mask + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + # apply normalization after classifier-free guidance + if cfg_normalization: + cond_norm = torch.norm(noise_pred_cond, dim=-1, keepdim=True) + noise_norm = torch.norm(noise_pred, dim=-1, keepdim=True) + noise_pred = noise_pred * (cond_norm / noise_norm) + else: + noise_pred = noise_pred_cond + + img_dtype = img.dtype + + if img.dtype != img_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + img = img.to(img_dtype) + + # compute the previous noisy sample x_t -> x_t-1 + noise_pred = -noise_pred + img = scheduler.step(noise_pred, t, img, return_dict=False)[0] - # model.prepare_block_swap_before_forward() return img @@ -754,3 +886,14 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): default=3.0, help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。", ) + parser.add_argument( + "--use_flash_attn", + action="store_true", + help="Use Flash Attention for the model. / モデルにFlash Attentionを使用する。", + ) + parser.add_argument( + "--system_prompt", + type=str, + default="You are an assistant designed to generate high-quality images based on user prompts. ", + help="System prompt to add to the prompt. / プロンプトに追加するシステムプロンプト。", + ) diff --git a/library/lumina_util.py b/library/lumina_util.py index f404e7754..d9c899386 100644 --- a/library/lumina_util.py +++ b/library/lumina_util.py @@ -20,11 +20,13 @@ MODEL_VERSION_LUMINA_V2 = "lumina2" + def load_lumina_model( ckpt_path: str, - dtype: torch.dtype, + dtype: Optional[torch.dtype], device: torch.device, disable_mmap: bool = False, + use_flash_attn: bool = False, ): """ Load the Lumina model from the checkpoint path. @@ -34,22 +36,22 @@ def load_lumina_model( dtype (torch.dtype): The data type for the model. device (torch.device): The device to load the model on. disable_mmap (bool, optional): Whether to disable mmap. Defaults to False. 
+ use_flash_attn (bool, optional): Whether to use flash attention. Defaults to False. Returns: model (lumina_models.NextDiT): The loaded model. """ logger.info("Building Lumina") with torch.device("meta"): - model = lumina_models.NextDiT_2B_GQA_patch2_Adaln_Refiner().to(dtype) + model = lumina_models.NextDiT_2B_GQA_patch2_Adaln_Refiner(use_flash_attn=use_flash_attn).to(dtype) logger.info(f"Loading state dict from {ckpt_path}") - state_dict = load_safetensors( - ckpt_path, device=device, disable_mmap=disable_mmap, dtype=dtype - ) + state_dict = load_safetensors(ckpt_path, device=device, disable_mmap=disable_mmap, dtype=dtype) info = model.load_state_dict(state_dict, strict=False, assign=True) logger.info(f"Loaded Lumina: {info}") return model + def load_ae( ckpt_path: str, dtype: torch.dtype, @@ -74,9 +76,7 @@ def load_ae( ae = flux_models.AutoEncoder(flux_models.configs["schnell"].ae_params).to(dtype) logger.info(f"Loading state dict from {ckpt_path}") - sd = load_safetensors( - ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype - ) + sd = load_safetensors(ckpt_path, device=device, disable_mmap=disable_mmap, dtype=dtype) info = ae.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded AE: {info}") return ae @@ -104,37 +104,35 @@ def load_gemma2( """ logger.info("Building Gemma2") GEMMA2_CONFIG = { - "_name_or_path": "google/gemma-2-2b", - "architectures": [ - "Gemma2Model" - ], - "attention_bias": False, - "attention_dropout": 0.0, - "attn_logit_softcapping": 50.0, - "bos_token_id": 2, - "cache_implementation": "hybrid", - "eos_token_id": 1, - "final_logit_softcapping": 30.0, - "head_dim": 256, - "hidden_act": "gelu_pytorch_tanh", - "hidden_activation": "gelu_pytorch_tanh", - "hidden_size": 2304, - "initializer_range": 0.02, - "intermediate_size": 9216, - "max_position_embeddings": 8192, - "model_type": "gemma2", - "num_attention_heads": 8, - "num_hidden_layers": 26, - "num_key_value_heads": 4, - "pad_token_id": 0, - "query_pre_attn_scalar": 256, - "rms_norm_eps": 1e-06, - "rope_theta": 10000.0, - "sliding_window": 4096, - "torch_dtype": "float32", - "transformers_version": "4.44.2", - "use_cache": True, - "vocab_size": 256000 + "_name_or_path": "google/gemma-2-2b", + "architectures": ["Gemma2Model"], + "attention_bias": False, + "attention_dropout": 0.0, + "attn_logit_softcapping": 50.0, + "bos_token_id": 2, + "cache_implementation": "hybrid", + "eos_token_id": 1, + "final_logit_softcapping": 30.0, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": "gelu_pytorch_tanh", + "hidden_size": 2304, + "initializer_range": 0.02, + "intermediate_size": 9216, + "max_position_embeddings": 8192, + "model_type": "gemma2", + "num_attention_heads": 8, + "num_hidden_layers": 26, + "num_key_value_heads": 4, + "pad_token_id": 0, + "query_pre_attn_scalar": 256, + "rms_norm_eps": 1e-06, + "rope_theta": 10000.0, + "sliding_window": 4096, + "torch_dtype": "float32", + "transformers_version": "4.44.2", + "use_cache": True, + "vocab_size": 256000, } config = Gemma2Config(**GEMMA2_CONFIG) @@ -145,9 +143,7 @@ def load_gemma2( sd = state_dict else: logger.info(f"Loading state dict from {ckpt_path}") - sd = load_safetensors( - ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype - ) + sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) for key in list(sd.keys()): new_key = key.replace("model.", "") @@ -159,6 +155,7 @@ def load_gemma2( logger.info(f"Loaded Gemma2: {info}") return gemma2 + def 
unpack_latents(x: torch.Tensor, packed_latent_height: int, packed_latent_width: int) -> torch.Tensor: """ x: [b (h w) (c ph pw)] -> [b c (h ph) (w pw)], ph=2, pw=2 @@ -174,6 +171,7 @@ def pack_latents(x: torch.Tensor) -> torch.Tensor: x = einops.rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) return x + DIFFUSERS_TO_ALPHA_VLLM_MAP = { # Embedding layers "cap_embedder.0.weight": ["time_caption_embed.caption_embedder.0.weight"], @@ -224,9 +222,7 @@ def convert_diffusers_sd_to_alpha_vllm(sd: dict, num_double_blocks: int) -> dict for block_idx in range(num_double_blocks): if str(block_idx) in key: converted = pattern.replace("()", str(block_idx)) - new_key = key.replace( - converted, replacement.replace("()", str(block_idx)) - ) + new_key = key.replace(converted, replacement.replace("()", str(block_idx))) break if new_key == key: diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index c40798846..6a4b39b3a 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -610,6 +610,21 @@ def encode_prompt(prpt): from diffusers.utils import BaseOutput +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + @dataclass class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): """ @@ -649,22 +664,49 @@ def __init__( self, num_train_timesteps: int = 1000, shift: float = 1.0, + use_dynamic_shifting=False, + base_shift: Optional[float] = 0.5, + max_shift: Optional[float] = 1.15, + base_image_seq_len: Optional[int] = 256, + max_image_seq_len: Optional[int] = 4096, + invert_sigmas: bool = False, + shift_terminal: Optional[float] = None, + use_karras_sigmas: Optional[bool] = False, + use_exponential_sigmas: Optional[bool] = False, + use_beta_sigmas: Optional[bool] = False, ): + if self.config.use_beta_sigmas and not is_scipy_available(): + raise ImportError("Make sure to install scipy if you want to use beta sigmas.") + if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1: + raise ValueError( + "Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used." + ) timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy() timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) sigmas = timesteps / num_train_timesteps - sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) self.timesteps = sigmas * num_train_timesteps self._step_index = None self._begin_index = None + self._shift = shift + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication self.sigma_min = self.sigmas[-1].item() self.sigma_max = self.sigmas[0].item() + @property + def shift(self): + """ + The value used for shifting. 
+ """ + return self._shift + @property def step_index(self): """ @@ -690,6 +732,9 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index + def set_shift(self, shift: float): + self._shift = shift + def scale_noise( self, sample: torch.FloatTensor, @@ -709,10 +754,31 @@ def scale_noise( `torch.FloatTensor`: A scaled input sample. """ - if self.step_index is None: - self._init_step_index(timestep) + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = sigma.unsqueeze(-1) - sigma = self.sigmas[self.step_index] sample = sigma * noise + (1.0 - sigma) * sample return sample @@ -720,7 +786,37 @@ def scale_noise( def _sigma_to_t(self, sigma): return sigma * self.config.num_train_timesteps - def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def stretch_shift_to_terminal(self, t: torch.Tensor) -> torch.Tensor: + r""" + Stretches and shifts the timestep schedule to ensure it terminates at the configured `shift_terminal` config + value. + + Reference: + https://github.com/Lightricks/LTX-Video/blob/a01a171f8fe3d99dce2728d60a73fecf4d4238ae/ltx_video/schedulers/rf.py#L51 + + Args: + t (`torch.Tensor`): + A tensor of timesteps to be stretched and shifted. + + Returns: + `torch.Tensor`: + A tensor of adjusted timesteps such that the final value equals `self.config.shift_terminal`. + """ + one_minus_z = 1 - t + scale_factor = one_minus_z[-1] / (1 - self.config.shift_terminal) + stretched_t = 1 - (one_minus_z / scale_factor) + return stretched_t + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[float] = None, + ): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). @@ -730,18 +826,49 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
""" + if self.config.use_dynamic_shifting and mu is None: + raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + timesteps = np.linspace( + self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps + ) + + sigmas = timesteps / self.config.num_train_timesteps + else: + sigmas = np.array(sigmas).astype(np.float32) + num_inference_steps = len(sigmas) self.num_inference_steps = num_inference_steps - timesteps = np.linspace(self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps) + if self.config.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas) - sigmas = timesteps / self.config.num_train_timesteps - sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) - sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + if self.config.shift_terminal: + sigmas = self.stretch_shift_to_terminal(sigmas) + + if self.config.use_karras_sigmas: + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + + elif self.config.use_exponential_sigmas: + sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + + elif self.config.use_beta_sigmas: + sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) timesteps = sigmas * self.config.num_train_timesteps - self.timesteps = timesteps.to(device=device) - self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + if self.config.invert_sigmas: + sigmas = 1.0 - sigmas + timesteps = sigmas * self.config.num_train_timesteps + sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)]) + else: + sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + + self.timesteps = timesteps.to(device=device) + self.sigmas = sigmas self._step_index = None self._begin_index = None @@ -807,7 +934,11 @@ def step( returned, otherwise a tuple is returned where the first element is the sample tensor. """ - if isinstance(timestep, int) or isinstance(timestep, torch.IntTensor) or isinstance(timestep, torch.LongTensor): + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): raise ValueError( ( "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" @@ -823,30 +954,10 @@ def step( sample = sample.to(torch.float32) sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] - gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 - - noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator) - - eps = noise * s_noise - sigma_hat = sigma * (gamma + 1) - - if gamma > 0: - sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 - - # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise - # NOTE: "original_sample" should not be an expected prediction_type but is left in for - # backwards compatibility - - # if self.config.prediction_type == "vector_field": - - denoised = sample - model_output * sigma - # 2. 
Convert to an ODE derivative - derivative = (sample - denoised) / sigma_hat - - dt = self.sigmas[self.step_index + 1] - sigma_hat + prev_sample = sample + (sigma_next - sigma) * model_output - prev_sample = sample + derivative * dt # Cast sample back to model compatible dtype prev_sample = prev_sample.to(model_output.dtype) @@ -858,6 +969,86 @@ def step( return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential + def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor: + """Constructs an exponential noise schedule.""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + sigmas = np.exp(np.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps)) + return sigmas + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_beta + def _convert_to_beta( + self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6 + ) -> torch.Tensor: + """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. 
al, 2024)""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + sigmas = np.array( + [ + sigma_min + (ppf * (sigma_max - sigma_min)) + for ppf in [ + scipy.stats.beta.ppf(timestep, alpha, beta) + for timestep in 1 - np.linspace(0, 1, num_inference_steps) + ] + ] + ) + return sigmas + def __len__(self): return self.config.num_train_timesteps diff --git a/library/strategy_base.py b/library/strategy_base.py index 358e42f1d..fad79682f 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -2,7 +2,7 @@ import os import re -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union, Callable import numpy as np import torch @@ -430,9 +430,21 @@ def _default_is_disk_cached_latents_expected( bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, - alpha_mask: bool, + apply_alpha_mask: bool, multi_resolution: bool = False, - ): + ) -> bool: + """ + Args: + latents_stride: stride of latents + bucket_reso: resolution of the bucket + npz_path: path to the npz file + flip_aug: whether to flip images + apply_alpha_mask: whether to apply alpha mask + multi_resolution: whether to use multi-resolution latents + + Returns: + bool + """ if not self.cache_to_disk: return False if not os.path.exists(npz_path): @@ -451,7 +463,7 @@ def _default_is_disk_cached_latents_expected( return False if flip_aug and "latents_flipped" + key_reso_suffix not in npz: return False - if alpha_mask and "alpha_mask" + key_reso_suffix not in npz: + if apply_alpha_mask and "alpha_mask" + key_reso_suffix not in npz: return False except Exception as e: logger.error(f"Error loading file: {npz_path}") @@ -462,22 +474,35 @@ def _default_is_disk_cached_latents_expected( # TODO remove circular dependency for ImageInfo def _default_cache_batch_latents( self, - encode_by_vae, - vae_device, - vae_dtype, + encode_by_vae: Callable, + vae_device: torch.device, + vae_dtype: torch.dtype, image_infos: List, flip_aug: bool, - alpha_mask: bool, + apply_alpha_mask: bool, random_crop: bool, multi_resolution: bool = False, ): """ Default implementation for cache_batch_latents. Image loading, VAE, flipping, alpha mask handling are common. + + Args: + encode_by_vae: function to encode images by VAE + vae_device: device to use for VAE + vae_dtype: dtype to use for VAE + image_infos: list of ImageInfo + flip_aug: whether to flip images + apply_alpha_mask: whether to apply alpha mask + random_crop: whether to random crop images + multi_resolution: whether to use multi-resolution latents + + Returns: + None """ from library import train_util # import here to avoid circular import img_tensor, alpha_masks, original_sizes, crop_ltrbs = train_util.load_images_and_masks_for_caching( - image_infos, alpha_mask, random_crop + image_infos, apply_alpha_mask, random_crop ) img_tensor = img_tensor.to(device=vae_device, dtype=vae_dtype) @@ -519,12 +544,40 @@ def load_latents_from_disk( ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: """ for SD/SDXL + + Args: + npz_path (str): Path to the npz file. 
+ bucket_reso (Tuple[int, int]): The resolution of the bucket. + + Returns: + Tuple[ + Optional[np.ndarray], + Optional[List[int]], + Optional[List[int]], + Optional[np.ndarray], + Optional[np.ndarray] + ]: Latent np tensors, original size, crop (left top, right bottom), flipped latents, alpha mask """ return self._default_load_latents_from_disk(None, npz_path, bucket_reso) def _default_load_latents_from_disk( self, latents_stride: Optional[int], npz_path: str, bucket_reso: Tuple[int, int] ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: + """ + Args: + latents_stride (Optional[int]): Stride for latents. If None, load all latents. + npz_path (str): Path to the npz file. + bucket_reso (Tuple[int, int]): The resolution of the bucket. + + Returns: + Tuple[ + Optional[np.ndarray], + Optional[List[int]], + Optional[List[int]], + Optional[np.ndarray], + Optional[np.ndarray] + ]: Latent np tensors, original size, crop (left top, right bottom), flipped latents, alpha mask + """ if latents_stride is None: key_reso_suffix = "" else: @@ -552,6 +605,19 @@ def save_latents_to_disk( alpha_mask=None, key_reso_suffix="", ): + """ + Args: + npz_path (str): Path to the npz file. + latents_tensor (torch.Tensor): Latent tensor + original_size (List[int]): Original size of the image + crop_ltrb (List[int]): Crop left top right bottom + flipped_latents_tensor (Optional[torch.Tensor]): Flipped latent tensor + alpha_mask (Optional[torch.Tensor]): Alpha mask + key_reso_suffix (str): Key resolution suffix + + Returns: + None + """ kwargs = {} if os.path.exists(npz_path): diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index 0a6a7f293..5d6e100fc 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -3,13 +3,13 @@ from typing import Any, List, Optional, Tuple, Union import torch -from transformers import AutoTokenizer, AutoModel, GemmaTokenizerFast +from transformers import AutoTokenizer, AutoModel, Gemma2Model, GemmaTokenizerFast from library import train_util from library.strategy_base import ( LatentsCachingStrategy, TokenizeStrategy, TextEncodingStrategy, - TextEncoderOutputsCachingStrategy + TextEncoderOutputsCachingStrategy, ) import numpy as np from library.utils import setup_logging @@ -37,21 +37,38 @@ def __init__( else: self.max_length = max_length - def tokenize(self, text: Union[str, List[str]]) -> Tuple[torch.Tensor, torch.Tensor]: + def tokenize( + self, text: Union[str, List[str]] + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + text (Union[str, List[str]]): Text to tokenize + + Returns: + Tuple[torch.Tensor, torch.Tensor]: + token input ids, attention_masks + """ text = [text] if isinstance(text, str) else text encodings = self.tokenizer( text, max_length=self.max_length, return_tensors="pt", - padding=True, + padding="max_length", pad_to_multiple_of=8, - truncation=True, ) - return [encodings.input_ids, encodings.attention_mask] + return (encodings.input_ids, encodings.attention_mask) def tokenize_with_weights( self, text: str | List[str] ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]: + """ + Args: + text (Union[str, List[str]]): Text to tokenize + + Returns: + Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]: + token input ids, attention_masks, weights + """ # Gemma doesn't support weighted prompts, return uniform weights tokens, attention_masks = self.tokenize(text) weights = [torch.ones_like(t) for t in tokens] @@ -66,9 +83,20 @@ def encode_tokens( self, 
tokenize_strategy: TokenizeStrategy, models: List[Any], - tokens: List[torch.Tensor], + tokens: Tuple[torch.Tensor, torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Args: + tokenize_strategy (LuminaTokenizeStrategy): Tokenize strategy + models (List[Any]): Text encoders + tokens (Tuple[torch.Tensor, torch.Tensor]): tokens, attention_masks + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + hidden_states, input_ids, attention_masks + """ text_encoder = models[0] + assert isinstance(text_encoder, Gemma2Model) input_ids, attention_masks = tokens outputs = text_encoder( @@ -84,9 +112,20 @@ def encode_tokens_with_weights( self, tokenize_strategy: TokenizeStrategy, models: List[Any], - tokens: List[torch.Tensor], - weights_list: List[torch.Tensor], + tokens: Tuple[torch.Tensor, torch.Tensor], + weights: List[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Args: + tokenize_strategy (LuminaTokenizeStrategy): Tokenize strategy + models (List[Any]): Text encoders + tokens (Tuple[torch.Tensor, torch.Tensor]): tokens, attention_masks + weights_list (List[torch.Tensor]): Currently unused + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + hidden_states, input_ids, attention_masks + """ # For simplicity, use uniform weighting return self.encode_tokens(tokenize_strategy, models, tokens) @@ -114,7 +153,14 @@ def get_outputs_npz_path(self, image_abs_path: str) -> str: + LuminaTextEncoderOutputsCachingStrategy.LUMINA_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX ) - def is_disk_cached_outputs_expected(self, npz_path: str): + def is_disk_cached_outputs_expected(self, npz_path: str) -> bool: + """ + Args: + npz_path (str): Path to the npz file. + + Returns: + bool: True if the npz file is expected to be cached. 
+ """ if not self.cache_to_disk: return False if not os.path.exists(npz_path): @@ -141,7 +187,7 @@ def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: Load outputs from a npz file Returns: - List[np.ndarray]: hidden_state, input_ids, attention_mask + List[np.ndarray]: hidden_state, input_ids, attention_mask """ data = np.load(npz_path) hidden_state = data["hidden_state"] @@ -151,53 +197,75 @@ def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: def cache_batch_outputs( self, - tokenize_strategy: LuminaTokenizeStrategy, + tokenize_strategy: TokenizeStrategy, models: List[Any], - text_encoding_strategy: LuminaTextEncodingStrategy, - infos: List, - ): - lumina_text_encoding_strategy: LuminaTextEncodingStrategy = ( - text_encoding_strategy - ) - captions = [info.caption for info in infos] + text_encoding_strategy: TextEncodingStrategy, + batch: List[train_util.ImageInfo], + ) -> None: + """ + Args: + tokenize_strategy (LuminaTokenizeStrategy): Tokenize strategy + models (List[Any]): Text encoders + text_encoding_strategy (LuminaTextEncodingStrategy): + infos (List): List of image_info + + Returns: + None + """ + assert isinstance(text_encoding_strategy, LuminaTextEncodingStrategy) + assert isinstance(tokenize_strategy, LuminaTokenizeStrategy) + + captions = [info.system_prompt or "" + info.caption for info in batch] if self.is_weighted: - tokens, weights_list = tokenize_strategy.tokenize_with_weights( - captions + tokens, attention_masks, weights_list = ( + tokenize_strategy.tokenize_with_weights(captions) ) with torch.no_grad(): - hidden_state, input_ids, attention_masks = lumina_text_encoding_strategy.encode_tokens_with_weights( - tokenize_strategy, models, tokens, weights_list + hidden_state, input_ids, attention_masks = ( + text_encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, + models, + (tokens, attention_masks), + weights_list, + ) ) else: tokens = tokenize_strategy.tokenize(captions) with torch.no_grad(): - hidden_state, input_ids, attention_masks = lumina_text_encoding_strategy.encode_tokens( - tokenize_strategy, models, tokens + hidden_state, input_ids, attention_masks = ( + text_encoding_strategy.encode_tokens( + tokenize_strategy, models, tokens + ) ) if hidden_state.dtype != torch.float32: hidden_state = hidden_state.float() hidden_state = hidden_state.cpu().numpy() - attention_mask = attention_masks.cpu().numpy() - input_ids = tokens.cpu().numpy() + attention_mask = attention_masks.cpu().numpy() # (B, S) + input_ids = input_ids.cpu().numpy() # (B, S) - - for i, info in enumerate(infos): + for i, info in enumerate(batch): hidden_state_i = hidden_state[i] attention_mask_i = attention_mask[i] input_ids_i = input_ids[i] + assert info.text_encoder_outputs_npz is not None, "Text encoder cache outputs to disk not found for image {info.image_path}" + if self.cache_to_disk: np.savez( info.text_encoder_outputs_npz, hidden_state=hidden_state_i, attention_mask=attention_mask_i, - input_ids=input_ids_i + input_ids=input_ids_i, ) else: - info.text_encoder_outputs = [hidden_state_i, attention_mask_i, input_ids_i] + info.text_encoder_outputs = [ + hidden_state_i, + attention_mask_i, + input_ids_i, + ] class LuminaLatentsCachingStrategy(LatentsCachingStrategy): @@ -227,7 +295,14 @@ def is_disk_cached_latents_expected( npz_path: str, flip_aug: bool, alpha_mask: bool, - ): + ) -> bool: + """ + Args: + bucket_reso (Tuple[int, int]): The resolution of the bucket. + npz_path (str): Path to the npz file. + flip_aug (bool): Whether to flip the image. 
+ alpha_mask (bool): Whether to apply + """ return self._default_is_disk_cached_latents_expected( 8, bucket_reso, npz_path, flip_aug, alpha_mask, multi_resolution=True ) @@ -241,6 +316,20 @@ def load_latents_from_disk( Optional[np.ndarray], Optional[np.ndarray], ]: + """ + Args: + npz_path (str): Path to the npz file. + bucket_reso (Tuple[int, int]): The resolution of the bucket. + + Returns: + Tuple[ + Optional[np.ndarray], + Optional[List[int]], + Optional[List[int]], + Optional[np.ndarray], + Optional[np.ndarray], + ]: Tuple of latent tensors, attention_mask, input_ids, latents, latents_unet + """ return self._default_load_latents_from_disk( 8, npz_path, bucket_reso ) # support multi-resolution diff --git a/library/train_util.py b/library/train_util.py index 4eccc4a0b..230b2c4b7 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -195,7 +195,7 @@ def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, self.latents_flipped: Optional[torch.Tensor] = None self.latents_npz: Optional[str] = None # set in cache_latents self.latents_original_size: Optional[Tuple[int, int]] = None # original image size, not latents size - self.latents_crop_ltrb: Optional[Tuple[int, int]] = ( + self.latents_crop_ltrb: Optional[Tuple[int, int, int, int]] = ( None # crop left top right bottom in original pixel size, not latents size ) self.cond_img_path: Optional[str] = None @@ -211,6 +211,8 @@ def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, self.alpha_mask: Optional[torch.Tensor] = None # alpha mask can be flipped in runtime + self.system_prompt: Optional[str] = None + class BucketManager: def __init__(self, no_upscale, max_reso, min_size, max_size, reso_steps) -> None: @@ -434,6 +436,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, + system_prompt: Optional[str] = None ) -> None: self.image_dir = image_dir self.alpha_mask = alpha_mask if alpha_mask is not None else False @@ -464,6 +467,8 @@ def __init__( self.validation_seed = validation_seed self.validation_split = validation_split + self.system_prompt = system_prompt + class DreamBoothSubset(BaseSubset): def __init__( @@ -495,6 +500,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, + system_prompt: Optional[str] = None ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -522,6 +528,7 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, + system_prompt=system_prompt ) self.is_reg = is_reg @@ -564,6 +571,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, + system_prompt: Optional[str] = None ) -> None: assert metadata_file is not None, "metadata_file must be specified / metadata_fileは指定が必須です" @@ -591,6 +599,7 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, + system_prompt=system_prompt ) self.metadata_file = metadata_file @@ -629,6 +638,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, + system_prompt: Optional[str] = None ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" 
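
The hunks above thread a new per-subset `system_prompt` through `BaseSubset` and its subclasses so it can be prepended to every caption. As a minimal illustrative sketch (not the repository's code; `SubsetLike` and `combine_caption` are invented names), the intended combination is plain string concatenation with an empty-string fallback, and the parenthesization is the easy part to get wrong:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class SubsetLike:
    # Stand-in for BaseSubset; only the field this sketch needs.
    system_prompt: Optional[str] = None


def combine_caption(subset: SubsetLike, caption: str) -> str:
    # (system_prompt or "") + caption -- the parentheses matter, because
    # `subset.system_prompt or "" + caption` parses as
    # `subset.system_prompt or ("" + caption)` and silently drops the
    # caption whenever a system prompt is set.
    return (subset.system_prompt or "") + caption


assert combine_caption(SubsetLike(), "a photo of a cat") == "a photo of a cat"
assert combine_caption(SubsetLike("Describe the image. "), "a photo of a cat") == (
    "Describe the image. a photo of a cat"
)
```

A later hunk in this patch builds the `ImageInfo` caption with the unparenthesized form; patch 365 further down in this series ("Fix system prompt in datasets") reworks it into an explicit `system_prompt = ... or ""` assignment before concatenating.
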
@@ -656,6 +666,7 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, + system_prompt=system_prompt ) self.conditioning_data_dir = conditioning_data_dir @@ -1686,8 +1697,9 @@ def __getitem__(self, index): text_encoder_outputs_list.append(text_encoder_outputs) if tokenization_required: + system_prompt = subset.system_prompt or "" caption = self.process_caption(subset, image_info.caption) - input_ids = [ids[0] for ids in self.tokenize_strategy.tokenize(caption)] # remove batch dimension + input_ids = [ids[0] for ids in self.tokenize_strategy.tokenize(system_prompt + caption)] # remove batch dimension # if self.XTI_layers: # caption_layer = [] # for layer in self.XTI_layers: @@ -2059,6 +2071,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): num_train_images = 0 num_reg_images = 0 reg_infos: List[Tuple[ImageInfo, DreamBoothSubset]] = [] + for subset in subsets: num_repeats = subset.num_repeats if self.is_training_dataset else 1 if num_repeats < 1: @@ -2086,7 +2099,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): num_train_images += num_repeats * len(img_paths) for img_path, caption, size in zip(img_paths, captions, sizes): - info = ImageInfo(img_path, num_repeats, caption, subset.is_reg, img_path) + info = ImageInfo(img_path, num_repeats, subset.system_prompt or "" + caption, subset.is_reg, img_path) if size is not None: info.image_size = size if subset.is_reg: @@ -2967,7 +2980,7 @@ def trim_and_resize_if_required( # for new_cache_latents def load_images_and_masks_for_caching( image_infos: List[ImageInfo], use_alpha_mask: bool, random_crop: bool -) -> Tuple[torch.Tensor, List[np.ndarray], List[Tuple[int, int]], List[Tuple[int, int, int, int]]]: +) -> Tuple[torch.Tensor, List[torch.Tensor], List[Tuple[int, int]], List[Tuple[int, int, int, int]]]: r""" requires image_infos to have: [absolute_path or image], bucket_reso, resized_size diff --git a/lumina_train_network.py b/lumina_train_network.py index 81acfb513..adbf834ca 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -1,17 +1,17 @@ import argparse import copy -import math -import random -from typing import Any, Optional, Union, Tuple +from typing import Any, Tuple import torch -from torch import Tensor -from accelerate import Accelerator from library.device_utils import clean_memory_on_device, init_ipex init_ipex() +from torch import Tensor +from accelerate import Accelerator + + import train_network from library import ( lumina_models, @@ -40,10 +40,7 @@ def __init__(self): def assert_extra_args(self, args, train_dataset_group, val_dataset_group): super().assert_extra_args(args, train_dataset_group, val_dataset_group) - if ( - args.cache_text_encoder_outputs_to_disk - and not args.cache_text_encoder_outputs - ): + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: logger.warning("Enabling cache_text_encoder_outputs due to disk caching") args.cache_text_encoder_outputs = True @@ -59,17 +56,14 @@ def load_target_model(self, args, weight_dtype, accelerator): model = lumina_util.load_lumina_model( args.pretrained_model_name_or_path, loading_dtype, - "cpu", + torch.device("cpu"), disable_mmap=args.disable_mmap_load_safetensors, + use_flash_attn=args.use_flash_attn, ) if args.fp8_base: # check dtype of model - if ( - model.dtype == torch.float8_e4m3fnuz - or model.dtype == torch.float8_e5m2 - or model.dtype == torch.float8_e5m2fnuz - ): + if model.dtype == torch.float8_e4m3fnuz or model.dtype == torch.float8_e5m2 
or model.dtype == torch.float8_e5m2fnuz: raise ValueError(f"Unsupported fp8 model dtype: {model.dtype}") elif model.dtype == torch.float8_e4m3fn: logger.info("Loaded fp8 Lumina 2 model") @@ -92,17 +86,13 @@ def load_target_model(self, args, weight_dtype, accelerator): return lumina_util.MODEL_VERSION_LUMINA_V2, [gemma2], ae, model def get_tokenize_strategy(self, args): - return strategy_lumina.LuminaTokenizeStrategy( - args.gemma2_max_token_length, args.tokenizer_cache_dir - ) + return strategy_lumina.LuminaTokenizeStrategy(args.gemma2_max_token_length, args.tokenizer_cache_dir) def get_tokenizers(self, tokenize_strategy: strategy_lumina.LuminaTokenizeStrategy): return [tokenize_strategy.tokenizer] def get_latents_caching_strategy(self, args): - return strategy_lumina.LuminaLatentsCachingStrategy( - args.cache_latents_to_disk, args.vae_batch_size, False - ) + return strategy_lumina.LuminaLatentsCachingStrategy(args.cache_latents_to_disk, args.vae_batch_size, False) def get_text_encoding_strategy(self, args): return strategy_lumina.LuminaTextEncodingStrategy() @@ -144,15 +134,11 @@ def cache_text_encoder_outputs_if_needed( # When TE is not be trained, it will not be prepared so we need to use explicit autocast logger.info("move text encoders to gpu") - text_encoders[0].to( - accelerator.device, dtype=weight_dtype - ) # always not fp8 + text_encoders[0].to(accelerator.device, dtype=weight_dtype) # always not fp8 if text_encoders[0].dtype == torch.float8_e4m3fn: # if we load fp8 weights, the model is already fp8, so we use it as is - self.prepare_text_encoder_fp8( - 1, text_encoders[1], text_encoders[1].dtype, weight_dtype - ) + self.prepare_text_encoder_fp8(1, text_encoders[1], text_encoders[1].dtype, weight_dtype) else: # otherwise, we need to convert it to target dtype text_encoders[0].to(weight_dtype) @@ -162,35 +148,36 @@ def cache_text_encoder_outputs_if_needed( # cache sample prompts if args.sample_prompts is not None: - logger.info( - f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}" - ) + logger.info(f"cache Text Encoder outputs for sample prompts: {args.sample_prompts}") tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() - text_encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() - + text_encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() + assert isinstance(tokenize_strategy, strategy_lumina.LuminaTokenizeStrategy) assert isinstance(text_encoding_strategy, strategy_lumina.LuminaTextEncodingStrategy) + system_prompt = args.system_prompt or "" sample_prompts = train_util.load_prompts(args.sample_prompts) - sample_prompts_te_outputs = ( - {} - ) # key: prompt, value: text encoder outputs + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): for prompt_dict in sample_prompts: - prompts = [prompt_dict.get("prompt", ""), - prompt_dict.get("negative_prompt", "")] - logger.info( - f"cache Text Encoder outputs for prompt: {prompts[0]}" - ) - tokens_and_masks = tokenize_strategy.tokenize(prompts) - sample_prompts_te_outputs[prompts[0]] = ( - text_encoding_strategy.encode_tokens( + prompts = [ + prompt_dict.get("prompt", ""), + prompt_dict.get("negative_prompt", ""), + ] + for prompt in prompts: + prompt = system_prompt + prompt + if prompt in sample_prompts_te_outputs: + continue + + logger.info(f"cache Text Encoder outputs for prompt: {prompt}") + tokens_and_masks = tokenize_strategy.tokenize(prompt) + sample_prompts_te_outputs[prompt] = 
text_encoding_strategy.encode_tokens( tokenize_strategy, text_encoders, tokens_and_masks, ) - ) + self.sample_prompts_te_outputs = sample_prompts_te_outputs accelerator.wait_for_everyone() @@ -235,12 +222,8 @@ def sample_images( # Remaining methods maintain similar structure to flux implementation # with Lumina-specific model calls and strategies - def get_noise_scheduler( - self, args: argparse.Namespace, device: torch.device - ) -> Any: - noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler( - num_train_timesteps=1000, shift=args.discrete_flow_shift - ) + def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: + noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) return noise_scheduler @@ -258,26 +241,45 @@ def get_noise_pred_and_target( noise_scheduler, latents, batch, - text_encoder_conds: Tuple[Tensor, Tensor, Tensor], # (hidden_states, input_ids, attention_masks) + text_encoder_conds: Tuple[Tensor, Tensor, Tensor], # (hidden_states, input_ids, attention_masks) dit: lumina_models.NextDiT, network, weight_dtype, train_unet, is_train=True, ): + assert isinstance(noise_scheduler, sd3_train_utils.FlowMatchEulerDiscreteScheduler) noise = torch.randn_like(latents) bsz = latents.shape[0] - # get noisy model input and timesteps - noisy_model_input, timesteps, sigmas = ( - flux_train_utils.get_noisy_model_input_and_timesteps( - args, noise_scheduler, latents, noise, accelerator.device, weight_dtype - ) + # Sample a random timestep for each image + # for weighting schemes where we sample timesteps non-uniformly + u = lumina_train_util.compute_density_for_timestep_sampling( + weighting_scheme=args.weighting_scheme, + batch_size=bsz, + logit_mean=args.logit_mean, + logit_std=args.logit_std, + mode_scale=args.mode_scale, ) - - # May not need to pack/unpack? - # pack latents and get img_ids - 这部分可以保留因为NextDiT也需要packed格式的输入 - # packed_noisy_model_input = lumina_util.pack_latents(noisy_model_input) + indices = (u * noise_scheduler.config.num_train_timesteps).long() + timesteps = noise_scheduler.timesteps[indices].to(device=latents.device) + + def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): + sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype) + schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device) + timesteps = timesteps.to(accelerator.device) + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < n_dim: + sigma = sigma.unsqueeze(-1) + return sigma + + # Add noise according to flow matching. 
+ # zt = (1 - texp) * x + texp * z1 + # Lumina2 reverses the lerp i.e., sigma of 1.0 should mean `latents` + sigmas = get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype) + noisy_model_input = (1.0 - sigmas) * noise + sigmas * latents # ensure the hidden state will require grad if args.gradient_checkpointing: @@ -289,48 +291,35 @@ def get_noise_pred_and_target( # Unpack Gemma2 outputs gemma2_hidden_states, input_ids, gemma2_attn_mask = text_encoder_conds - def call_dit(img, gemma2_hidden_states, timesteps, gemma2_attn_mask): + def call_dit(img, gemma2_hidden_states, gemma2_attn_mask, timesteps): with torch.set_grad_enabled(is_train), accelerator.autocast(): # NextDiT forward expects (x, t, cap_feats, cap_mask) model_pred = dit( x=img, # image latents (B, C, H, W) t=timesteps / 1000, # timesteps需要除以1000来匹配模型预期 cap_feats=gemma2_hidden_states, # Gemma2的hidden states作为caption features - cap_mask=gemma2_attn_mask.to( - dtype=torch.int32 - ), # Gemma2的attention mask + cap_mask=gemma2_attn_mask.to(dtype=torch.int32), # Gemma2的attention mask ) return model_pred model_pred = call_dit( img=noisy_model_input, gemma2_hidden_states=gemma2_hidden_states, - timesteps=timesteps, gemma2_attn_mask=gemma2_attn_mask, + timesteps=timesteps, ) - # May not need to pack/unpack? - # unpack latents - # model_pred = lumina_util.unpack_latents( - # model_pred, packed_latent_height, packed_latent_width - # ) - # apply model prediction type - model_pred, weighting = flux_train_utils.apply_model_prediction_type( - args, model_pred, noisy_model_input, sigmas - ) + model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) - # flow matching loss: this is different from SD3 - target = noise - latents + # flow matching loss + target = latents - noise # differential output preservation if "custom_attributes" in batch: diff_output_pr_indices = [] for i, custom_attributes in enumerate(batch["custom_attributes"]): - if ( - "diff_output_preservation" in custom_attributes - and custom_attributes["diff_output_preservation"] - ): + if "diff_output_preservation" in custom_attributes and custom_attributes["diff_output_preservation"]: diff_output_pr_indices.append(i) if len(diff_output_pr_indices) > 0: @@ -338,9 +327,7 @@ def call_dit(img, gemma2_hidden_states, timesteps, gemma2_attn_mask): with torch.no_grad(): model_pred_prior = call_dit( img=noisy_model_input[diff_output_pr_indices], - gemma2_hidden_states=gemma2_hidden_states[ - diff_output_pr_indices - ], + gemma2_hidden_states=gemma2_hidden_states[diff_output_pr_indices], timesteps=timesteps[diff_output_pr_indices], gemma2_attn_mask=(gemma2_attn_mask[diff_output_pr_indices]), ) @@ -363,9 +350,7 @@ def post_process_loss(self, loss, args, timesteps, noise_scheduler): return loss def get_sai_model_spec(self, args): - return train_util.get_sai_model_spec( - None, args, False, True, False, lumina="lumina2" - ) + return train_util.get_sai_model_spec(None, args, False, True, False, lumina="lumina2") def update_metadata(self, metadata, args): metadata["ss_weighting_scheme"] = args.weighting_scheme @@ -384,12 +369,8 @@ def is_text_encoder_not_needed_for_training(self, args): def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): text_encoder.embed_tokens.requires_grad_(True) - def prepare_text_encoder_fp8( - self, index, text_encoder, te_weight_dtype, weight_dtype - ): - logger.info( - f"prepare Gemma2 for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}" - ) + def 
prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): + logger.info(f"prepare Gemma2 for fp8: set to {te_weight_dtype}, set embeddings to {weight_dtype}") text_encoder.to(te_weight_dtype) # fp8 text_encoder.embed_tokens.to(dtype=weight_dtype) @@ -402,12 +383,8 @@ def prepare_unet_with_accelerator( # if we doesn't swap blocks, we can move the model to device nextdit = unet assert isinstance(nextdit, lumina_models.NextDiT) - nextdit = accelerator.prepare( - nextdit, device_placement=[not self.is_swapping_blocks] - ) - accelerator.unwrap_model(nextdit).move_to_device_except_swap_blocks( - accelerator.device - ) # reduce peak memory usage + nextdit = accelerator.prepare(nextdit, device_placement=[not self.is_swapping_blocks]) + accelerator.unwrap_model(nextdit).move_to_device_except_swap_blocks(accelerator.device) # reduce peak memory usage accelerator.unwrap_model(nextdit).prepare_block_swap_before_forward() return nextdit diff --git a/train_network.py b/train_network.py index 674f1cb66..2cf11af73 100644 --- a/train_network.py +++ b/train_network.py @@ -129,7 +129,7 @@ def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetG if val_dataset_group is not None: val_dataset_group.verify_bucket_reso_steps(64) - def load_target_model(self, args, weight_dtype, accelerator): + def load_target_model(self, args, weight_dtype, accelerator) -> tuple: text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) # モデルに xformers とか memory efficient attention を組み込む @@ -354,12 +354,13 @@ def process_batch( if text_encoder_outputs_list is not None: text_encoder_conds = text_encoder_outputs_list # List of text encoder outputs + if len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder: # TODO this does not work if 'some text_encoders are trained' and 'some are not and not cached' with torch.set_grad_enabled(is_train and train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: - input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch["captions"]) + input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch['captions']) encoded_text_encoder_conds = text_encoding_strategy.encode_tokens_with_weights( tokenize_strategy, self.get_models_for_text_encoding(args, accelerator, text_encoders), From 6d7bec8a374c610d31986f049e2296974471f58c Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 23 Feb 2025 01:46:47 -0500 Subject: [PATCH 364/582] Remove non-used code --- library/lumina_train_util.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 414b2849c..db9af2388 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -1,5 +1,4 @@ import inspect -import enum import argparse import math import os @@ -9,20 +8,16 @@ import torch from torch import Tensor -from torchdiffeq import odeint from accelerate import Accelerator, PartialState from transformers import Gemma2Model from tqdm import tqdm from PIL import Image from safetensors.torch import save_file -from diffusers.schedulers.scheduling_heun_discrete import HeunDiscreteScheduler -from library import lumina_models, lumina_util, strategy_base, strategy_lumina, train_util +from library import lumina_models, strategy_base, strategy_lumina, train_util from library.flux_models import AutoEncoder from library.device_utils import init_ipex, clean_memory_on_device 
from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler -from library.lumina_dpm_solver import NoiseScheduleFlow, DPM_Solver -import library.lumina_path as path init_ipex() From 42a801514ccad054ac7c362ff5a9c0aa0e1e79d7 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 23 Feb 2025 13:48:37 -0500 Subject: [PATCH 365/582] Fix system prompt in datasets --- library/lumina_train_util.py | 2 +- library/train_util.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index db9af2388..487ae2f97 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -280,7 +280,7 @@ def sample_image_inference( generator=generator, ) - scheduler = FlowMatchEulerDiscreteScheduler(shift=6.0, use_karras_sigmas=True) + scheduler = FlowMatchEulerDiscreteScheduler(shift=6.0) timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=sample_steps) # if controlnet_image is not None: diff --git a/library/train_util.py b/library/train_util.py index 230b2c4b7..ded23f411 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1869,6 +1869,7 @@ def __init__( debug_dataset: bool, validation_split: float, validation_seed: Optional[int], + system_prompt: Optional[str], ) -> None: super().__init__(resolution, network_multiplier, debug_dataset) @@ -1881,6 +1882,7 @@ def __init__( self.is_training_dataset = is_training_dataset self.validation_seed = validation_seed self.validation_split = validation_split + self.system_prompt = system_prompt self.enable_bucket = enable_bucket if self.enable_bucket: @@ -2098,8 +2100,9 @@ def load_dreambooth_dir(subset: DreamBoothSubset): else: num_train_images += num_repeats * len(img_paths) + system_prompt = self.system_prompt or subset.system_prompt or "" for img_path, caption, size in zip(img_paths, captions, sizes): - info = ImageInfo(img_path, num_repeats, subset.system_prompt or "" + caption, subset.is_reg, img_path) + info = ImageInfo(img_path, num_repeats, system_prompt + caption, subset.is_reg, img_path) if size is not None: info.image_size = size if subset.is_reg: From ba725a84e9511abeb3a31b4bc45cd7eba4c12d65 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 23 Feb 2025 18:01:09 -0500 Subject: [PATCH 366/582] Set default discrete_flow_shift to 6.0. Remove default system prompt. --- library/lumina_models.py | 125 +++++++++++++++++++---------------- library/lumina_train_util.py | 6 +- 2 files changed, 70 insertions(+), 61 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index f819b68fb..365453c1c 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -1,9 +1,19 @@ +# Copyright Alpha VLLM/Lumina Image 2.0 and contributors # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# -------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# # References: # GLIDE: https://github.com/openai/glide-text2im # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py @@ -13,8 +23,6 @@ from typing import List, Optional, Tuple from dataclasses import dataclass -from einops import rearrange - import torch from torch import Tensor from torch.utils.checkpoint import checkpoint @@ -25,6 +33,7 @@ from flash_attn import flash_attn_varlen_func from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa except: + # flash_attn may not be available but it is not required pass try: @@ -34,6 +43,58 @@ warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation") + ############################################################################# + # RMSNorm # + ############################################################################# + + class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + """ + Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x) -> Tensor: + """ + Apply the RMSNorm normalization to the input tensor. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The normalized tensor. + + """ + return x * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x: Tensor): + """ + Apply RMSNorm to the input tensor. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The normalized tensor. + """ + x_dtype = x.dtype + # To handle float8 we need to convert the tensor to float + x = x.float() + rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) + return ((x * rrms) * self.weight.float()).to(dtype=x_dtype) + + @dataclass class LuminaParams: @@ -111,58 +172,6 @@ def forward(self, *args, **kwargs): return self._forward(*args, **kwargs) -############################################################################# -# RMSNorm # -############################################################################# - - -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): - """ - Initialize the RMSNorm normalization layer. - - Args: - dim (int): The dimension of the input tensor. - eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. - - Attributes: - eps (float): A small value added to the denominator for numerical stability. - weight (nn.Parameter): Learnable scaling parameter. - - """ - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x) -> Tensor: - """ - Apply the RMSNorm normalization to the input tensor. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The normalized tensor. - - """ - return x * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x: Tensor): - """ - Apply RMSNorm to the input tensor. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The normalized tensor. 
- """ - x_dtype = x.dtype - # To handle float8 we need to convert the tensor to float - x = x.float() - rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6) - return ((x * rrms) * self.weight.float()).to(dtype=x_dtype) - def modulate(x, scale): return x * (1 + scale.unsqueeze(1)) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 487ae2f97..172d09eac 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -878,8 +878,8 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--discrete_flow_shift", type=float, - default=3.0, - help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。", + default=6.0, + help="Discrete flow shift for the Euler Discrete Scheduler, default is 6.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは6.0。", ) parser.add_argument( "--use_flash_attn", @@ -889,6 +889,6 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--system_prompt", type=str, - default="You are an assistant designed to generate high-quality images based on user prompts. ", + default="", help="System prompt to add to the prompt. / プロンプトに追加するシステムプロンプト。", ) From 48e7da2d4a844d60a4db1ac03b9a4a34a2c57720 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 23 Feb 2025 20:19:24 -0500 Subject: [PATCH 367/582] Add sample batch size for Lumina --- library/lumina_models.py | 6 +- library/lumina_train_util.py | 294 +++++++++++++++++++++++------------ train_network.py | 3 + 3 files changed, 199 insertions(+), 104 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index 365453c1c..d86a9cb2b 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -880,8 +880,8 @@ def __init__( self.n_heads = n_heads self.gradient_checkpointing = False - self.cpu_offload_checkpointing = False - self.blocks_to_swap = None + self.cpu_offload_checkpointing = False # TODO: not yet supported + self.blocks_to_swap = None # TODO: not yet supported @property def device(self): @@ -982,8 +982,8 @@ def patchify_and_embed( l_effective_cap_len = cap_mask.sum(dim=1).tolist() encoder_seq_len = cap_mask.shape[1] - image_seq_len = (height // self.patch_size) * (width // self.patch_size) + seq_lengths = [cap_seq_len + image_seq_len for cap_seq_len in l_effective_cap_len] max_seq_len = max(seq_lengths) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 172d09eac..4aa48e8b2 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -4,7 +4,7 @@ import os import numpy as np import time -from typing import Callable, Dict, List, Optional, Tuple, Any, Union +from typing import Callable, Dict, List, Optional, Tuple, Any, Union, Generator import torch from torch import Tensor @@ -32,6 +32,59 @@ # region sample images +def batchify(prompt_dicts, batch_size=None) -> Generator[list[dict[str, str]], None, None]: + """ + Group prompt dictionaries into batches with configurable batch size. + + Args: + prompt_dicts (list): List of dictionaries containing prompt parameters. + batch_size (int, optional): Number of prompts per batch. Defaults to None. + + Yields: + list[dict[str, str]]: Batch of prompts. 
+ """ + # Validate batch_size + if batch_size is not None: + if not isinstance(batch_size, int) or batch_size <= 0: + raise ValueError("batch_size must be a positive integer or None") + + # Group prompts by their parameters + batches = {} + for prompt_dict in prompt_dicts: + # Extract parameters + width = int(prompt_dict.get("width", 1024)) + height = int(prompt_dict.get("height", 1024)) + height = max(64, height - height % 8) # round to divisible by 8 + width = max(64, width - width % 8) # round to divisible by 8 + guidance_scale = float(prompt_dict.get("scale", 3.5)) + sample_steps = int(prompt_dict.get("sample_steps", 38)) + seed = prompt_dict.get("seed", None) + seed = int(seed) if seed is not None else None + + # Create a key based on the parameters + key = (width, height, guidance_scale, seed, sample_steps) + + # Add the prompt_dict to the corresponding batch + if key not in batches: + batches[key] = [] + batches[key].append(prompt_dict) + + # Yield each batch with its parameters + for key in batches: + prompts = batches[key] + if batch_size is None: + # Yield the entire group as a single batch + yield prompts + else: + # Split the group into batches of size `batch_size` + start = 0 + while start < len(prompts): + end = start + batch_size + batch = prompts[start:end] + yield batch + start = end + + @torch.no_grad() def sample_images( accelerator: Accelerator, @@ -39,9 +92,9 @@ def sample_images( epoch: int, global_step: int, nextdit: lumina_models.NextDiT, - vae: torch.nn.Module, + vae: AutoEncoder, gemma2_model: Gemma2Model, - sample_prompts_gemma2_outputs: List[Tuple[Tensor, Tensor, Tensor]], + sample_prompts_gemma2_outputs: dict[str, Tuple[Tensor, Tensor, Tensor]], prompt_replacement: Optional[Tuple[str, str]] = None, controlnet=None, ): @@ -54,11 +107,13 @@ def sample_images( epoch (int): Current epoch number. global_step (int): Current global step number. nextdit (lumina_models.NextDiT): The NextDiT model instance. - vae (torch.nn.Module): The VAE module. + vae (AutoEncoder): The VAE module. gemma2_model (Gemma2Model): The Gemma2 model instance. - sample_prompts_gemma2_outputs (List[Tuple[Tensor, Tensor, Tensor]]): List of tuples containing the encoded prompts, text masks, and timestep for each sample. - prompt_replacement (Optional[Tuple[str, str]], optional): Tuple containing the prompt and negative prompt replacements. Defaults to None. - controlnet:: ControlNet model + sample_prompts_gemma2_outputs (dict[str, Tuple[Tensor, Tensor, Tensor]]): + Dictionary ist of tuples containing the encoded prompts, text masks, and timestep for each sample. + prompt_replacement (Optional[Tuple[str, str]], optional): + Tuple containing the prompt and negative prompt replacements. Defaults to None. + controlnet (): ControlNet model, not yet supported Returns: None @@ -110,9 +165,12 @@ def sample_images( except Exception: pass + batch_size = args.sample_batch_size or args.train_batch_size or 1 + if distributed_state.num_processes <= 1: # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts. 
- for prompt_dict in prompts: + # TODO: batch prompts together with buckets of image sizes + for prompt_dicts in batchify(prompts, batch_size): sample_image_inference( accelerator, args, @@ -120,7 +178,7 @@ def sample_images( gemma2_model, vae, save_dir, - prompt_dict, + prompt_dicts, epoch, global_step, sample_prompts_gemma2_outputs, @@ -135,7 +193,8 @@ def sample_images( per_process_prompts.append(prompts[i :: distributed_state.num_processes]) with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists: - for prompt_dict in prompt_dict_lists[0]: + # TODO: batch prompts together with buckets of image sizes + for prompt_dicts in batchify(prompt_dict_lists[0], batch_size): sample_image_inference( accelerator, args, @@ -143,7 +202,7 @@ def sample_images( gemma2_model, vae, save_dir, - prompt_dict, + prompt_dicts, epoch, global_step, sample_prompts_gemma2_outputs, @@ -166,10 +225,10 @@ def sample_image_inference( gemma2_model: Gemma2Model, vae: AutoEncoder, save_dir: str, - prompt_dict: Dict[str, str], + prompt_dicts: list[Dict[str, str]], epoch: int, global_step: int, - sample_prompts_gemma2_outputs: dict[str, List[Tuple[Tensor, Tensor, Tensor]]], + sample_prompts_gemma2_outputs: dict[str, Tuple[Tensor, Tensor, Tensor]], prompt_replacement: Optional[Tuple[str, str]] = None, controlnet=None, ): @@ -192,43 +251,6 @@ def sample_image_inference( Returns: None """ - assert isinstance(prompt_dict, dict) - # negative_prompt = prompt_dict.get("negative_prompt") - sample_steps = int(prompt_dict.get("sample_steps", 38)) - width = int(prompt_dict.get("width", 1024)) - height = int(prompt_dict.get("height", 1024)) - guidance_scale = float(prompt_dict.get("scale", 3.5)) - seed = prompt_dict.get("seed", None) - controlnet_image = prompt_dict.get("controlnet_image") - prompt: str = prompt_dict.get("prompt", "") - negative_prompt: str = prompt_dict.get("negative_prompt", "") - # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) - - seed = int(seed) if seed is not None else None - assert seed is None or seed > 0, f"Invalid seed {seed}" - - if prompt_replacement is not None: - prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) - if negative_prompt is not None: - negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) - - generator = torch.Generator(device=accelerator.device) - if seed is not None: - generator.manual_seed(seed) - - # if negative_prompt is None: - # negative_prompt = "" - height = max(64, height - height % 8) # round to divisible by 8 - width = max(64, width - width % 8) # round to divisible by 8 - logger.info(f"prompt: {prompt}") - logger.info(f"negative_prompt: {negative_prompt}") - logger.info(f"height: {height}") - logger.info(f"width: {width}") - logger.info(f"sample_steps: {sample_steps}") - logger.info(f"scale: {guidance_scale}") - # logger.info(f"sample_sampler: {sampler_name}") - if seed is not None: - logger.info(f"seed: {seed}") # encode prompts tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() @@ -237,33 +259,86 @@ def sample_image_inference( assert isinstance(tokenize_strategy, strategy_lumina.LuminaTokenizeStrategy) assert isinstance(encoding_strategy, strategy_lumina.LuminaTextEncodingStrategy) - system_prompt = args.system_prompt or "" - - # Apply system prompt to prompts - prompt = system_prompt + prompt - negative_prompt = system_prompt + negative_prompt - - # Get sample prompts from cache - if sample_prompts_gemma2_outputs and prompt in 
sample_prompts_gemma2_outputs: - gemma2_conds = sample_prompts_gemma2_outputs[prompt] - logger.info(f"Using cached Gemma2 outputs for prompt: {prompt}") + text_conds = [] - if sample_prompts_gemma2_outputs and negative_prompt in sample_prompts_gemma2_outputs: - neg_gemma2_conds = sample_prompts_gemma2_outputs[negative_prompt] - logger.info(f"Using cached Gemma2 outputs for negative prompt: {negative_prompt}") + # assuming seed, width, height, sample steps, guidance are the same + width = int(prompt_dicts[0].get("width", 1024)) + height = int(prompt_dicts[0].get("height", 1024)) + height = max(64, height - height % 8) # round to divisible by 8 + width = max(64, width - width % 8) # round to divisible by 8 - # Load sample prompts from Gemma 2 - if gemma2_model is not None: - logger.info(f"Encoding prompt with Gemma2: {prompt}") - tokens_and_masks = tokenize_strategy.tokenize(prompt) - gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) + guidance_scale = float(prompt_dicts[0].get("scale", 3.5)) + sample_steps = int(prompt_dicts[0].get("sample_steps", 36)) + seed = prompt_dicts[0].get("seed", None) + seed = int(seed) if seed is not None else None + assert seed is None or seed > 0, f"Invalid seed {seed}" + generator = torch.Generator(device=accelerator.device) + if seed is not None: + generator.manual_seed(seed) - tokens_and_masks = tokenize_strategy.tokenize(negative_prompt) - neg_gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) + for prompt_dict in prompt_dicts: + controlnet_image = prompt_dict.get("controlnet_image") + prompt: str = prompt_dict.get("prompt", "") + negative_prompt = prompt_dict.get("negative_prompt", "") + # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) + + if prompt_replacement is not None: + prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) + if negative_prompt is not None: + negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) + + if negative_prompt is None: + negative_prompt = "" + logger.info(f"prompt: {prompt}") + logger.info(f"negative_prompt: {negative_prompt}") + logger.info(f"height: {height}") + logger.info(f"width: {width}") + logger.info(f"sample_steps: {sample_steps}") + logger.info(f"scale: {guidance_scale}") + # logger.info(f"sample_sampler: {sampler_name}") + + system_prompt = args.system_prompt or "" + + # Apply system prompt to prompts + prompt = system_prompt + prompt + negative_prompt = system_prompt + negative_prompt + + # Get sample prompts from cache + if sample_prompts_gemma2_outputs and prompt in sample_prompts_gemma2_outputs: + gemma2_conds = sample_prompts_gemma2_outputs[prompt] + logger.info(f"Using cached Gemma2 outputs for prompt: {prompt}") + + if sample_prompts_gemma2_outputs and negative_prompt in sample_prompts_gemma2_outputs: + neg_gemma2_conds = sample_prompts_gemma2_outputs[negative_prompt] + logger.info(f"Using cached Gemma2 outputs for negative prompt: {negative_prompt}") + + # Load sample prompts from Gemma 2 + if gemma2_model is not None: + logger.info(f"Encoding prompt with Gemma2: {prompt}") + tokens_and_masks = tokenize_strategy.tokenize(prompt) + gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) + + tokens_and_masks = tokenize_strategy.tokenize(negative_prompt) + neg_gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) + + # Unpack Gemma2 outputs + 
gemma2_hidden_states, _, gemma2_attn_mask = gemma2_conds + neg_gemma2_hidden_states, _, neg_gemma2_attn_mask = neg_gemma2_conds + + text_conds.append( + ( + gemma2_hidden_states.squeeze(0), + gemma2_attn_mask.squeeze(0), + neg_gemma2_hidden_states.squeeze(0), + neg_gemma2_attn_mask.squeeze(0), + ) + ) - # Unpack Gemma2 outputs - gemma2_hidden_states, input_ids, gemma2_attn_mask = gemma2_conds - neg_gemma2_hidden_states, neg_input_ids, neg_gemma2_attn_mask = neg_gemma2_conds + # Stack conditioning + cond_hidden_states = torch.stack([text_cond[0] for text_cond in text_conds]).to(accelerator.device) + cond_attn_masks = torch.stack([text_cond[1] for text_cond in text_conds]).to(accelerator.device) + uncond_hidden_states = torch.stack([text_cond[2] for text_cond in text_conds]).to(accelerator.device) + uncond_attn_masks = torch.stack([text_cond[3] for text_cond in text_conds]).to(accelerator.device) # sample image weight_dtype = vae.dtype # TOFO give dtype as argument @@ -279,6 +354,7 @@ def sample_image_inference( dtype=weight_dtype, generator=generator, ) + noise = noise.repeat(cond_hidden_states.shape[0], 1, 1, 1) scheduler = FlowMatchEulerDiscreteScheduler(shift=6.0) timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=sample_steps) @@ -294,10 +370,10 @@ def sample_image_inference( scheduler, nextdit, noise, - gemma2_hidden_states, - gemma2_attn_mask.to(accelerator.device), - neg_gemma2_hidden_states, - neg_gemma2_attn_mask.to(accelerator.device), + cond_hidden_states, + cond_attn_masks, + uncond_hidden_states, + uncond_attn_masks, timesteps=timesteps, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, @@ -307,33 +383,43 @@ def sample_image_inference( clean_memory_on_device(accelerator.device) org_vae_device = vae.device # will be on cpu vae.to(accelerator.device) # distributed_state.device is same as accelerator.device - with accelerator.autocast(): - x = vae.decode((x / vae.scale_factor) + vae.shift_factor) - vae.to(org_vae_device) - clean_memory_on_device(accelerator.device) + for img, prompt_dict in zip(x, prompt_dicts): + + img = (img / vae.scale_factor) + vae.shift_factor + + with accelerator.autocast(): + # Add a single batch image for the VAE to decode + img = vae.decode(img.unsqueeze(0)) - x = x.clamp(-1, 1) - x = x.permute(0, 2, 3, 1) - image = Image.fromarray((127.5 * (x + 1.0)).float().cpu().numpy().astype(np.uint8)[0]) + img = img.clamp(-1, 1) + img = img.permute(0, 2, 3, 1) # B, H, W, C + # Scale images back to 0 to 255 + img = (127.5 * (img + 1.0)).float().cpu().numpy().astype(np.uint8) - # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are saved in the same order as the original prompt list - # but adding 'enum' to the filename should be enough + # Get single image + image = Image.fromarray(img[0]) - ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime()) - num_suffix = f"e{epoch:06d}" if epoch is not None else f"{global_step:06d}" - seed_suffix = "" if seed is None else f"_{seed}" - i: int = int(prompt_dict.get("enum", 0)) - img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png" - image.save(os.path.join(save_dir, img_filename)) + # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are saved in the same order as the original prompt list + # but adding 'enum' to the filename should be enough - # send images to wandb if enabled - if "wandb" in [tracker.name for tracker in 
accelerator.trackers]: - wandb_tracker = accelerator.get_tracker("wandb") + ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime()) + num_suffix = f"e{epoch:06d}" if epoch is not None else f"{global_step:06d}" + seed_suffix = "" if seed is None else f"_{seed}" + i: int = int(prompt_dict.get("enum", 0)) + img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png" + image.save(os.path.join(save_dir, img_filename)) - import wandb + # send images to wandb if enabled + if "wandb" in [tracker.name for tracker in accelerator.trackers]: + wandb_tracker = accelerator.get_tracker("wandb") - # not to commit images to avoid inconsistency between training and logging steps - wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption + import wandb + + # not to commit images to avoid inconsistency between training and logging steps + wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption + + vae.to(org_vae_device) + clean_memory_on_device(accelerator.device) def time_shift(mu: float, sigma: float, t: torch.Tensor): @@ -879,16 +965,22 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): "--discrete_flow_shift", type=float, default=6.0, - help="Discrete flow shift for the Euler Discrete Scheduler, default is 6.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは6.0。", + help="Discrete flow shift for the Euler Discrete Scheduler, default is 6.0 / Euler Discrete Schedulerの離散フローシフト、デフォルトは6.0", ) parser.add_argument( "--use_flash_attn", action="store_true", - help="Use Flash Attention for the model. / モデルにFlash Attentionを使用する。", + help="Use Flash Attention for the model / モデルにFlash Attentionを使用する", ) parser.add_argument( "--system_prompt", type=str, default="", - help="System prompt to add to the prompt. / プロンプトに追加するシステムプロンプト。", + help="System prompt to add to the prompt / プロンプトに追加するシステムプロンプト", + ) + parser.add_argument( + "--sample_batch_size", + type=int, + default=None, + help="Batch size to use for sampling, defaults to --training_batch_size value. 
Sample batches are bucketed by width, height, guidance scale, and seed / サンプリングに使用するバッチサイズ。デフォルトは --training_batch_size の値です。サンプルバッチは、幅、高さ、ガイダンススケール、シードによってバケット化されます", ) diff --git a/train_network.py b/train_network.py index 2cf11af73..07de30b3b 100644 --- a/train_network.py +++ b/train_network.py @@ -1242,6 +1242,7 @@ def remove_model(old_ckpt_name): # For --sample_at_first optimizer_eval_fn() self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) + progress_bar.unpause() # Reset progress bar to before sampling images optimizer_train_fn() is_tracking = len(accelerator.trackers) > 0 if is_tracking: @@ -1344,6 +1345,7 @@ def remove_model(old_ckpt_name): self.sample_images( accelerator, args, None, global_step, accelerator.device, vae, tokenizers, text_encoder, unet ) + progress_bar.unpause() # 指定ステップごとにモデルを保存 if args.save_every_n_steps is not None and global_step % args.save_every_n_steps == 0: @@ -1531,6 +1533,7 @@ def remove_model(old_ckpt_name): train_util.save_and_remove_state_on_epoch_end(args, accelerator, epoch + 1) self.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) + progress_bar.unpause() optimizer_train_fn() # end of epoch From 2c94d17f0554d1f468e1249e24ad8db0ca812f19 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 23 Feb 2025 20:21:06 -0500 Subject: [PATCH 368/582] Fix typo --- library/lumina_train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 4aa48e8b2..87f7ba36b 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -110,7 +110,7 @@ def sample_images( vae (AutoEncoder): The VAE module. gemma2_model (Gemma2Model): The Gemma2 model instance. sample_prompts_gemma2_outputs (dict[str, Tuple[Tensor, Tensor, Tensor]]): - Dictionary ist of tuples containing the encoded prompts, text masks, and timestep for each sample. + Dictionary of tuples containing the encoded prompts, text masks, and timestep for each sample. prompt_replacement (Optional[Tuple[str, str]], optional): Tuple containing the prompt and negative prompt replacements. Defaults to None. controlnet (): ControlNet model, not yet supported From fc772affbe4345c8e0d14eb53ebc883f8c5a576f Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Mon, 24 Feb 2025 14:10:24 +0800 Subject: [PATCH 369/582] =?UTF-8?q?1=E3=80=81Implement=20cfg=5Ftrunc=20cal?= =?UTF-8?q?culation=20directly=20using=20timesteps,=20without=20intermedia?= =?UTF-8?q?te=20steps.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2、Deprecate and remove the guidance_scale parameter because it used in inference not train 3、Add inference command-line arguments --ct for cfg_trunc_ratio and --rc for renorm_cfg to control CFG truncation and renormalization during inference. 
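Example prompt-file line using the new options (illustrative only; the prompt text and values below are placeholders, not part of this patch, and assume the usual sd-scripts sample-prompt syntax where --w/--h/--s/--l/--d set width, height, steps, guidance scale and seed):

    a cat wearing a space suit --w 1024 --h 1024 --s 36 --l 4.0 --d 42 --ct 0.25 --rc 1.0

Here --ct sets cfg_trunc_ratio and --rc sets renorm_cfg for that sample, matching the new parsing added to line_to_prompt_dict in this patch.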
--- library/lumina_models.py | 2 +- library/lumina_train_util.py | 46 +++++++++++++++++------------------- library/train_util.py | 10 ++++++++ lumina_train_network.py | 1 - 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index d86a9cb2b..1a441a69d 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -1081,7 +1081,7 @@ def forward_with_cfg( cap_feats: Tensor, cap_mask: Tensor, cfg_scale: float, - cfg_trunc: int = 100, + cfg_trunc: float = 0.25, renorm_cfg: float = 1.0, ): """ diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 87f7ba36b..f54b202d4 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -58,11 +58,13 @@ def batchify(prompt_dicts, batch_size=None) -> Generator[list[dict[str, str]], N width = max(64, width - width % 8) # round to divisible by 8 guidance_scale = float(prompt_dict.get("scale", 3.5)) sample_steps = int(prompt_dict.get("sample_steps", 38)) + cfg_trunc_ratio = float(prompt_dict.get("cfg_trunc_ratio", 0.25)) + renorm_cfg = float(prompt_dict.get("renorm_cfg", 1.0)) seed = prompt_dict.get("seed", None) seed = int(seed) if seed is not None else None # Create a key based on the parameters - key = (width, height, guidance_scale, seed, sample_steps) + key = (width, height, guidance_scale, seed, sample_steps, cfg_trunc_ratio, renorm_cfg) # Add the prompt_dict to the corresponding batch if key not in batches: @@ -268,6 +270,8 @@ def sample_image_inference( width = max(64, width - width % 8) # round to divisible by 8 guidance_scale = float(prompt_dicts[0].get("scale", 3.5)) + cfg_trunc_ratio = float(prompt_dicts[0].get("cfg_trunc_ratio", 0.25)) + renorm_cfg = float(prompt_dicts[0].get("renorm_cfg", 1.0)) sample_steps = int(prompt_dicts[0].get("sample_steps", 36)) seed = prompt_dicts[0].get("seed", None) seed = int(seed) if seed is not None else None @@ -295,6 +299,8 @@ def sample_image_inference( logger.info(f"width: {width}") logger.info(f"sample_steps: {sample_steps}") logger.info(f"scale: {guidance_scale}") + logger.info(f"trunc: {cfg_trunc_ratio}") + logger.info(f"renorm: {renorm_cfg}") # logger.info(f"sample_sampler: {sampler_name}") system_prompt = args.system_prompt or "" @@ -375,8 +381,9 @@ def sample_image_inference( uncond_hidden_states, uncond_attn_masks, timesteps=timesteps, - num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, + cfg_trunc_ratio=cfg_trunc_ratio, + renorm_cfg=renorm_cfg, ) # Latent to image @@ -550,10 +557,9 @@ def denoise( neg_txt: Tensor, neg_txt_mask: Tensor, timesteps: Union[List[float], torch.Tensor], - num_inference_steps: int = 38, guidance_scale: float = 4.0, - cfg_trunc_ratio: float = 1.0, - cfg_normalization: bool = True, + cfg_trunc_ratio: float = 0.25, + renorm_cfg: float = 1.0, ): """ Denoise an image using the NextDiT model. @@ -578,21 +584,17 @@ def denoise( The guidance scale for the denoising process. Defaults to 4.0. cfg_trunc_ratio (float, optional): The ratio of the timestep interval to apply normalization-based guidance scale. - cfg_normalization (bool, optional): - Whether to apply normalization-based guidance scale. - + renorm_cfg (float, optional): + The factor to limit the maximum norm after guidance. 
Default: 1.0 Returns: img (Tensor): Denoised latent tensor """ for i, t in enumerate(tqdm(timesteps)): - # compute whether apply classifier-free truncation on this timestep - do_classifier_free_truncation = (i + 1) / num_inference_steps > cfg_trunc_ratio - # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image current_timestep = 1 - t / scheduler.config.num_train_timesteps # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - current_timestep = current_timestep.expand(img.shape[0]).to(model.device) + current_timestep = current_timestep * torch.ones(img.shape[0], device=img.device) noise_pred_cond = model( img, @@ -601,7 +603,8 @@ def denoise( cap_mask=txt_mask.to(dtype=torch.int32), # Gemma2的attention mask ) - if not do_classifier_free_truncation: + # compute whether to apply classifier-free guidance based on current timestep + if current_timestep[0] < cfg_trunc_ratio: noise_pred_uncond = model( img, current_timestep, @@ -610,10 +613,12 @@ def denoise( ) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) # apply normalization after classifier-free guidance - if cfg_normalization: - cond_norm = torch.norm(noise_pred_cond, dim=-1, keepdim=True) - noise_norm = torch.norm(noise_pred, dim=-1, keepdim=True) - noise_pred = noise_pred * (cond_norm / noise_norm) + if float(renorm_cfg) > 0.0: + cond_norm = torch.linalg.vector_norm(noise_pred_cond, dim=tuple(range(1, len(noise_pred_cond.shape))), keepdim=True) + max_new_norm = cond_norm * float(renorm_cfg) + noise_norm = torch.linalg.vector_norm(noise_pred, dim=tuple(range(1, len(noise_pred.shape))), keepdim=True) + if noise_norm >= max_new_norm: + noise_pred = noise_pred * (max_new_norm / noise_norm) else: noise_pred = noise_pred_cond @@ -932,13 +937,6 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): " / Gemma2の最大トークン長。省略された場合、schnellの場合は256、devの場合は512", ) - parser.add_argument( - "--guidance_scale", - type=float, - default=3.5, - help="the NextDIT.1 dev variant is a guidance distilled model", - ) - parser.add_argument( "--timestep_sampling", choices=["sigma", "uniform", "sigmoid", "shift", "nextdit_shift"], diff --git a/library/train_util.py b/library/train_util.py index ded23f411..18aceaf7b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -6188,6 +6188,16 @@ def line_to_prompt_dict(line: str) -> dict: prompt_dict["controlnet_image"] = m.group(1) continue + m = re.match(r"ct (.+)", parg, re.IGNORECASE) + if m: + prompt_dict["cfg_trunc_ratio"] = float(m.group(1)) + continue + + m = re.match(r"rc (.+)", parg, re.IGNORECASE) + if m: + prompt_dict["renorm_cfg"] = float(m.group(1)) + continue + except ValueError as ex: logger.error(f"Exception in parsing / 解析エラー: {parg}") logger.error(ex) diff --git a/lumina_train_network.py b/lumina_train_network.py index adbf834ca..0fd4da6b3 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -357,7 +357,6 @@ def update_metadata(self, metadata, args): metadata["ss_logit_mean"] = args.logit_mean metadata["ss_logit_std"] = args.logit_std metadata["ss_mode_scale"] = args.mode_scale - metadata["ss_guidance_scale"] = args.guidance_scale metadata["ss_timestep_sampling"] = args.timestep_sampling metadata["ss_sigmoid_scale"] = args.sigmoid_scale metadata["ss_model_prediction_type"] = args.model_prediction_type From 5f9047c8cf4f28019d1365cbc7e439f5afbdda0a Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Wed, 26 Feb 2025 01:00:35 +0800 Subject: [PATCH 370/582] add truncation 
when > max_length --- library/lumina_train_util.py | 1 - library/strategy_lumina.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index f54b202d4..20df7eef6 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -320,7 +320,6 @@ def sample_image_inference( # Load sample prompts from Gemma 2 if gemma2_model is not None: - logger.info(f"Encoding prompt with Gemma2: {prompt}") tokens_and_masks = tokenize_strategy.tokenize(prompt) gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index 5d6e100fc..c9e654236 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -54,6 +54,7 @@ def tokenize( max_length=self.max_length, return_tensors="pt", padding="max_length", + truncation=True, pad_to_multiple_of=8, ) return (encodings.input_ids, encodings.attention_mask) From ce37c08b9a3b8e6567c70712f9d6899a304e98b6 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Wed, 26 Feb 2025 11:20:03 +0800 Subject: [PATCH 371/582] clean code and add finetune code --- library/lumina_train_util.py | 212 ++++++-- lumina_train.py | 953 +++++++++++++++++++++++++++++++++++ lumina_train_network.py | 37 +- 3 files changed, 1118 insertions(+), 84 deletions(-) create mode 100644 lumina_train.py diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 20df7eef6..ca0391673 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -32,7 +32,9 @@ # region sample images -def batchify(prompt_dicts, batch_size=None) -> Generator[list[dict[str, str]], None, None]: +def batchify( + prompt_dicts, batch_size=None +) -> Generator[list[dict[str, str]], None, None]: """ Group prompt dictionaries into batches with configurable batch size. @@ -64,7 +66,15 @@ def batchify(prompt_dicts, batch_size=None) -> Generator[list[dict[str, str]], N seed = int(seed) if seed is not None else None # Create a key based on the parameters - key = (width, height, guidance_scale, seed, sample_steps, cfg_trunc_ratio, renorm_cfg) + key = ( + width, + height, + guidance_scale, + seed, + sample_steps, + cfg_trunc_ratio, + renorm_cfg, + ) # Add the prompt_dict to the corresponding batch if key not in batches: @@ -131,7 +141,9 @@ def sample_images( if epoch is None or epoch % args.sample_every_n_epochs != 0: return else: - if global_step % args.sample_every_n_steps != 0 or epoch is not None: # steps is not divisible or end of epoch + if ( + global_step % args.sample_every_n_steps != 0 or epoch is not None + ): # steps is not divisible or end of epoch return assert ( @@ -139,12 +151,21 @@ def sample_images( ), "No sample prompts found. Provide `--sample_prompts` / サンプルプロンプトが見つかりません。`--sample_prompts` を指定してください" logger.info("") - logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {global_step}") - if not os.path.isfile(args.sample_prompts) and sample_prompts_gemma2_outputs is None: - logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}") + logger.info( + f"generating sample images at step / サンプル画像生成 ステップ: {global_step}" + ) + if ( + not os.path.isfile(args.sample_prompts) + and sample_prompts_gemma2_outputs is None + ): + logger.error( + f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}" + ) return - distributed_state = PartialState() # for multi gpu distributed inference. 
this is a singleton, so it's safe to use it here + distributed_state = ( + PartialState() + ) # for multi gpu distributed inference. this is a singleton, so it's safe to use it here # unwrap nextdit and gemma2_model nextdit = accelerator.unwrap_model(nextdit) @@ -163,7 +184,9 @@ def sample_images( rng_state = torch.get_rng_state() cuda_rng_state = None try: - cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None + cuda_rng_state = ( + torch.cuda.get_rng_state() if torch.cuda.is_available() else None + ) except Exception: pass @@ -194,7 +217,9 @@ def sample_images( for i in range(distributed_state.num_processes): per_process_prompts.append(prompts[i :: distributed_state.num_processes]) - with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists: + with distributed_state.split_between_processes( + per_process_prompts + ) as prompt_dict_lists: # TODO: batch prompts together with buckets of image sizes for prompt_dicts in batchify(prompt_dict_lists[0], batch_size): sample_image_inference( @@ -289,7 +314,9 @@ def sample_image_inference( if prompt_replacement is not None: prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) if negative_prompt is not None: - negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) + negative_prompt = negative_prompt.replace( + prompt_replacement[0], prompt_replacement[1] + ) if negative_prompt is None: negative_prompt = "" @@ -314,17 +341,26 @@ def sample_image_inference( gemma2_conds = sample_prompts_gemma2_outputs[prompt] logger.info(f"Using cached Gemma2 outputs for prompt: {prompt}") - if sample_prompts_gemma2_outputs and negative_prompt in sample_prompts_gemma2_outputs: + if ( + sample_prompts_gemma2_outputs + and negative_prompt in sample_prompts_gemma2_outputs + ): neg_gemma2_conds = sample_prompts_gemma2_outputs[negative_prompt] - logger.info(f"Using cached Gemma2 outputs for negative prompt: {negative_prompt}") + logger.info( + f"Using cached Gemma2 outputs for negative prompt: {negative_prompt}" + ) # Load sample prompts from Gemma 2 if gemma2_model is not None: tokens_and_masks = tokenize_strategy.tokenize(prompt) - gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) + gemma2_conds = encoding_strategy.encode_tokens( + tokenize_strategy, [gemma2_model], tokens_and_masks + ) tokens_and_masks = tokenize_strategy.tokenize(negative_prompt) - neg_gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2_model], tokens_and_masks) + neg_gemma2_conds = encoding_strategy.encode_tokens( + tokenize_strategy, [gemma2_model], tokens_and_masks + ) # Unpack Gemma2 outputs gemma2_hidden_states, _, gemma2_attn_mask = gemma2_conds @@ -340,10 +376,18 @@ def sample_image_inference( ) # Stack conditioning - cond_hidden_states = torch.stack([text_cond[0] for text_cond in text_conds]).to(accelerator.device) - cond_attn_masks = torch.stack([text_cond[1] for text_cond in text_conds]).to(accelerator.device) - uncond_hidden_states = torch.stack([text_cond[2] for text_cond in text_conds]).to(accelerator.device) - uncond_attn_masks = torch.stack([text_cond[3] for text_cond in text_conds]).to(accelerator.device) + cond_hidden_states = torch.stack([text_cond[0] for text_cond in text_conds]).to( + accelerator.device + ) + cond_attn_masks = torch.stack([text_cond[1] for text_cond in text_conds]).to( + accelerator.device + ) + uncond_hidden_states = torch.stack([text_cond[2] for text_cond in text_conds]).to( + 
accelerator.device + ) + uncond_attn_masks = torch.stack([text_cond[3] for text_cond in text_conds]).to( + accelerator.device + ) # sample image weight_dtype = vae.dtype # TOFO give dtype as argument @@ -362,7 +406,9 @@ def sample_image_inference( noise = noise.repeat(cond_hidden_states.shape[0], 1, 1, 1) scheduler = FlowMatchEulerDiscreteScheduler(shift=6.0) - timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=sample_steps) + timesteps, num_inference_steps = retrieve_timesteps( + scheduler, num_inference_steps=sample_steps + ) # if controlnet_image is not None: # controlnet_image = Image.open(controlnet_image).convert("RGB") @@ -422,7 +468,9 @@ def sample_image_inference( import wandb # not to commit images to avoid inconsistency between training and logging steps - wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False) # positive prompt as a caption + wandb_tracker.log( + {f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False + ) # positive prompt as a caption vae.to(org_vae_device) clean_memory_on_device(accelerator.device) @@ -437,7 +485,9 @@ def time_shift(mu: float, sigma: float, t: torch.Tensor): return t -def get_lin_function(x1: float = 256, x2: float = 4096, y1: float = 0.5, y2: float = 1.15) -> Callable[[float], float]: +def get_lin_function( + x1: float = 256, x2: float = 4096, y1: float = 0.5, y2: float = 1.15 +) -> Callable[[float], float]: """ Get linear function @@ -481,7 +531,9 @@ def get_schedule( # shifting the schedule to favor high timesteps for higher signal images if shift: # eastimate mu based on linear estimation between two points - mu = get_lin_function(y1=base_shift, y2=max_shift, x1=256, x2=4096)(image_seq_len) + mu = get_lin_function(y1=base_shift, y2=max_shift, x1=256, x2=4096)( + image_seq_len + ) timesteps = time_shift(mu, 1.0, timesteps) return timesteps.tolist() @@ -520,9 +572,13 @@ def retrieve_timesteps( second element is the number of inference steps. """ if timesteps is not None and sigmas is not None: - raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + raise ValueError( + "Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values" + ) if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_timesteps = "timesteps" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys() + ) if not accepts_timesteps: raise ValueError( f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" @@ -532,7 +588,9 @@ def retrieve_timesteps( timesteps = scheduler.timesteps num_inference_steps = len(timesteps) elif sigmas is not None: - accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accept_sigmas = "sigmas" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys() + ) if not accept_sigmas: raise ValueError( f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" @@ -593,7 +651,9 @@ def denoise( # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image current_timestep = 1 - t / scheduler.config.num_train_timesteps # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - current_timestep = current_timestep * torch.ones(img.shape[0], device=img.device) + current_timestep = current_timestep * torch.ones( + img.shape[0], device=img.device + ) noise_pred_cond = model( img, @@ -610,12 +670,20 @@ def denoise( cap_feats=neg_txt, # Gemma2的hidden states作为caption features cap_mask=neg_txt_mask.to(dtype=torch.int32), # Gemma2的attention mask ) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_cond - noise_pred_uncond + ) # apply normalization after classifier-free guidance if float(renorm_cfg) > 0.0: - cond_norm = torch.linalg.vector_norm(noise_pred_cond, dim=tuple(range(1, len(noise_pred_cond.shape))), keepdim=True) + cond_norm = torch.linalg.vector_norm( + noise_pred_cond, + dim=tuple(range(1, len(noise_pred_cond.shape))), + keepdim=True, + ) max_new_norm = cond_norm * float(renorm_cfg) - noise_norm = torch.linalg.vector_norm(noise_pred, dim=tuple(range(1, len(noise_pred.shape))), keepdim=True) + noise_norm = torch.linalg.vector_norm( + noise_pred, dim=tuple(range(1, len(noise_pred.shape))), keepdim=True + ) if noise_norm >= max_new_norm: noise_pred = noise_pred * (max_new_norm / noise_norm) else: @@ -640,7 +708,11 @@ def denoise( # region train def get_sigmas( - noise_scheduler: FlowMatchEulerDiscreteScheduler, timesteps: Tensor, device: torch.device, n_dim=4, dtype=torch.float32 + noise_scheduler: FlowMatchEulerDiscreteScheduler, + timesteps: Tensor, + device: torch.device, + n_dim=4, + dtype=torch.float32, ) -> Tensor: """ Get sigmas for timesteps @@ -667,7 +739,11 @@ def get_sigmas( def compute_density_for_timestep_sampling( - weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None + weighting_scheme: str, + batch_size: int, + logit_mean: float = None, + logit_std: float = None, + mode_scale: float = None, ): """ Compute the density for sampling the timesteps when doing SD3 training. @@ -688,7 +764,9 @@ def compute_density_for_timestep_sampling( """ if weighting_scheme == "logit_normal": # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$). 
- u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu") + u = torch.normal( + mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu" + ) u = torch.nn.functional.sigmoid(u) elif weighting_scheme == "mode": u = torch.rand(size=(batch_size,), device="cpu") @@ -722,7 +800,9 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None) -> Tensor return weighting -def get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) -> Tuple[Tensor, Tensor, Tensor]: +def get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, device, dtype +) -> Tuple[Tensor, Tensor, Tensor]: """ Get noisy model input and timesteps. @@ -753,27 +833,27 @@ def get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, d timesteps = t * 1000.0 t = t.view(-1, 1, 1, 1) - noisy_model_input = (1 - t) * latents + t * noise + noisy_model_input = (1 - t) * noise + t * latents elif args.timestep_sampling == "shift": shift = args.discrete_flow_shift logits_norm = torch.randn(bsz, device=device) - logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling + logits_norm = ( + logits_norm * args.sigmoid_scale + ) # larger scale for more uniform sampling timesteps = logits_norm.sigmoid() timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps) t = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * noise + noisy_model_input = (1 - t) * noise + t * latents elif args.timestep_sampling == "nextdit_shift": - logits_norm = torch.randn(bsz, device=device) - logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling - timesteps = logits_norm.sigmoid() - mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2)) - timesteps = time_shift(mu, 1.0, timesteps) + t = torch.rand((bsz,), device=device) + mu = get_lin_function(y1=0.5, y2=1.15)((h // 16) * (w // 16)) # lumina use //16 + t = time_shift(mu, 1.0, t) - t = timesteps.view(-1, 1, 1, 1) - timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * noise + timesteps = t * 1000.0 + t = t.view(-1, 1, 1, 1) + noisy_model_input = (1 - t) * noise + t * latents else: # Sample a random timestep for each image # for weighting schemes where we sample timesteps non-uniformly @@ -788,8 +868,10 @@ def get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, d timesteps = noise_scheduler.timesteps[indices].to(device=device) # Add noise according to flow matching. 
- sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) - noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + sigmas = get_sigmas( + noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype + ) + noisy_model_input = sigmas * latents + (1.0 - sigmas) * noise return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas @@ -821,7 +903,9 @@ def apply_model_prediction_type( # these weighting schemes use a uniform timestep sampling # and instead post-weight the loss - weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas) + weighting = compute_loss_weighting_for_sd3( + weighting_scheme=args.weighting_scheme, sigmas=sigmas + ) return model_pred, weighting @@ -863,15 +947,27 @@ def update_sd(prefix, sd): def save_lumina_model_on_train_end( - args: argparse.Namespace, save_dtype: torch.dtype, epoch: int, global_step: int, lumina: lumina_models.NextDiT + args: argparse.Namespace, + save_dtype: torch.dtype, + epoch: int, + global_step: int, + lumina: lumina_models.NextDiT, ): def sd_saver(ckpt_file, epoch_no, global_step): sai_metadata = train_util.get_sai_model_spec( - None, args, False, False, False, is_stable_diffusion_ckpt=True, lumina="lumina2" + None, + args, + False, + False, + False, + is_stable_diffusion_ckpt=True, + lumina="lumina2", ) save_models(ckpt_file, lumina, sai_metadata, save_dtype, args.mem_eff_save) - train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, sd_saver, None) + train_util.save_sd_model_on_train_end_common( + args, True, True, epoch, global_step, sd_saver, None + ) # epochとstepの保存、メタデータにepoch/stepが含まれ引数が同じになるため、統合してている @@ -901,7 +997,15 @@ def save_lumina_model_on_epoch_end_or_stepwise( """ def sd_saver(ckpt_file: str, epoch_no: int, global_step: int): - sai_metadata = train_util.get_sai_model_spec({}, args, False, False, False, is_stable_diffusion_ckpt=True, lumina="lumina2") + sai_metadata = train_util.get_sai_model_spec( + {}, + args, + False, + False, + False, + is_stable_diffusion_ckpt=True, + lumina="lumina2", + ) save_models(ckpt_file, lumina, sai_metadata, save_dtype, args.mem_eff_save) train_util.save_sd_model_on_epoch_end_or_stepwise_common( @@ -927,7 +1031,11 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): type=str, help="path to gemma2 model (*.sft or *.safetensors), should be float16 / gemma2のパス(*.sftまたは*.safetensors)、float16が前提", ) - parser.add_argument("--ae", type=str, help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)") + parser.add_argument( + "--ae", + type=str, + help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)", + ) parser.add_argument( "--gemma2_max_token_length", type=int, diff --git a/lumina_train.py b/lumina_train.py new file mode 100644 index 000000000..330d0093b --- /dev/null +++ b/lumina_train.py @@ -0,0 +1,953 @@ +# training with captions + +# Swap blocks between CPU and GPU: +# This implementation is inspired by and based on the work of 2kpr. +# Many thanks to 2kpr for the original concept and implementation of memory-efficient offloading. +# The original idea has been adapted and extended to fit the current project's needs. 
+ +# Key features: +# - CPU offloading during forward and backward passes +# - Use of fused optimizer and grad_hook for efficient gradient processing +# - Per-block fused optimizer instances + +import argparse +import copy +import math +import os +from multiprocessing import Value +import toml + +from tqdm import tqdm + +import torch +from library.device_utils import init_ipex, clean_memory_on_device + +init_ipex() + +from accelerate.utils import set_seed +from library import ( + deepspeed_utils, + lumina_train_util, + lumina_util, + strategy_base, + strategy_lumina, +) +from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler + +import library.train_util as train_util + +from library.utils import setup_logging, add_logging_arguments + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + +import library.config_util as config_util + +# import library.sdxl_train_util as sdxl_train_util +from library.config_util import ( + ConfigSanitizer, + BlueprintGenerator, +) +from library.custom_train_functions import apply_masked_loss, add_custom_train_arguments + + +def train(args): + train_util.verify_training_args(args) + train_util.prepare_dataset_args(args, True) + # sdxl_train_util.verify_sdxl_training_args(args) + deepspeed_utils.prepare_deepspeed_args(args) + setup_logging(args, reset=True) + + # temporary: backward compatibility for deprecated options. remove in the future + if not args.skip_cache_check: + args.skip_cache_check = args.skip_latents_validity_check + + # assert ( + # not args.weighted_captions + # ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: + logger.warning( + "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" + ) + args.cache_text_encoder_outputs = True + + if args.cpu_offload_checkpointing and not args.gradient_checkpointing: + logger.warning( + "cpu_offload_checkpointing is enabled, so gradient_checkpointing is also enabled / cpu_offload_checkpointingが有効になっているため、gradient_checkpointingも有効になります" + ) + args.gradient_checkpointing = True + + # assert ( + # args.blocks_to_swap is None or args.blocks_to_swap == 0 + # ) or not args.cpu_offload_checkpointing, "blocks_to_swap is not supported with cpu_offload_checkpointing / blocks_to_swapはcpu_offload_checkpointingと併用できません" + + cache_latents = args.cache_latents + use_dreambooth_method = args.in_json is None + + if args.seed is not None: + set_seed(args.seed) # 乱数系列を初期化する + + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. 
+ if args.cache_latents: + latents_caching_strategy = strategy_lumina.LuminaLatentsCachingStrategy( + args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check + ) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) + + # データセットを準備する + if args.dataset_class is None: + blueprint_generator = BlueprintGenerator( + ConfigSanitizer(True, True, args.masked_loss, True) + ) + if args.dataset_config is not None: + logger.info(f"Load dataset config from {args.dataset_config}") + user_config = config_util.load_user_config(args.dataset_config) + ignored = ["train_data_dir", "in_json"] + if any(getattr(args, attr) is not None for attr in ignored): + logger.warning( + "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + ", ".join(ignored) + ) + ) + else: + if use_dreambooth_method: + logger.info("Using DreamBooth method.") + user_config = { + "datasets": [ + { + "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs( + args.train_data_dir, args.reg_data_dir + ) + } + ] + } + else: + logger.info("Training with captions.") + user_config = { + "datasets": [ + { + "subsets": [ + { + "image_dir": args.train_data_dir, + "metadata_file": args.in_json, + } + ] + } + ] + } + + blueprint = blueprint_generator.generate(user_config, args) + train_dataset_group, val_dataset_group = ( + config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) + ) + else: + train_dataset_group = train_util.load_arbitrary_dataset(args) + val_dataset_group = None + + current_epoch = Value("i", 0) + current_step = Value("i", 0) + ds_for_collator = ( + train_dataset_group if args.max_data_loader_n_workers == 0 else None + ) + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) + + train_dataset_group.verify_bucket_reso_steps(16) # TODO これでいいか確認 + + if args.debug_dataset: + if args.cache_text_encoder_outputs: + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy( + strategy_lumina.LuminaTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, + args.text_encoder_batch_size, + args.skip_cache_check, + False, + ) + ) + strategy_base.TokenizeStrategy.set_strategy( + strategy_lumina.LuminaTokenizeStrategy() + ) + + train_dataset_group.set_current_strategies() + train_util.debug_dataset(train_dataset_group, True) + return + if len(train_dataset_group) == 0: + logger.error( + "No data found. Please verify the metadata file and train_data_dir option. 
/ 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。" + ) + return + + if cache_latents: + assert ( + train_dataset_group.is_latent_cacheable() + ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" + + if args.cache_text_encoder_outputs: + assert ( + train_dataset_group.is_text_encoder_output_cacheable() + ), "when caching text encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / text encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" + + # acceleratorを準備する + logger.info("prepare accelerator") + accelerator = train_util.prepare_accelerator(args) + + # mixed precisionに対応した型を用意しておき適宜castする + weight_dtype, save_dtype = train_util.prepare_dtype(args) + + # モデルを読み込む + + # load VAE for caching latents + ae = None + if cache_latents: + ae = lumina_util.load_ae( + args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors + ) + ae.to(accelerator.device, dtype=weight_dtype) + ae.requires_grad_(False) + ae.eval() + + train_dataset_group.new_cache_latents(ae, accelerator) + + ae.to("cpu") # if no sampling, vae can be deleted + clean_memory_on_device(accelerator.device) + + accelerator.wait_for_everyone() + + # prepare tokenize strategy + if args.gemma2_max_token_length is None: + gemma2_max_token_length = 256 + else: + gemma2_max_token_length = args.gemma2_max_token_length + + lumina_tokenize_strategy = strategy_lumina.LuminaTokenizeStrategy( + gemma2_max_token_length + ) + strategy_base.TokenizeStrategy.set_strategy(lumina_tokenize_strategy) + + # load gemma2 for caching text encoder outputs + gemma2 = lumina_util.load_gemma2( + args.gemma2, weight_dtype, "cpu", args.disable_mmap_load_safetensors + ) + gemma2.eval() + gemma2.requires_grad_(False) + + text_encoding_strategy = strategy_lumina.LuminaTextEncodingStrategy() + strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy) + + # cache text encoder outputs + sample_prompts_te_outputs = None + if args.cache_text_encoder_outputs: + # Text Encodes are eval and no grad here + gemma2.to(accelerator.device) + + text_encoder_caching_strategy = ( + strategy_lumina.LuminaTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, + args.text_encoder_batch_size, + False, + False, + ) + ) + strategy_base.TextEncoderOutputsCachingStrategy.set_strategy( + text_encoder_caching_strategy + ) + + with accelerator.autocast(): + train_dataset_group.new_cache_text_encoder_outputs([gemma2], accelerator) + + # cache sample prompt's embeddings to free text encoder's memory + if args.sample_prompts is not None: + logger.info( + f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}" + ) + + text_encoding_strategy: strategy_lumina.LuminaTextEncodingStrategy = ( + strategy_base.TextEncodingStrategy.get_strategy() + ) + + prompts = train_util.load_prompts(args.sample_prompts) + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + with accelerator.autocast(), torch.no_grad(): + for prompt_dict in prompts: + for p in [ + prompt_dict.get("prompt", ""), + prompt_dict.get("negative_prompt", ""), + ]: + if p not in sample_prompts_te_outputs: + logger.info(f"cache Text Encoder outputs for prompt: {p}") + tokens_and_masks = lumina_tokenize_strategy.tokenize(p) + sample_prompts_te_outputs[p] = ( + text_encoding_strategy.encode_tokens( + lumina_tokenize_strategy, + [gemma2], + tokens_and_masks, + ) + ) + + 
accelerator.wait_for_everyone() + + # now we can delete Text Encoders to free memory + gemma2 = None + clean_memory_on_device(accelerator.device) + + # load lumina + nextdit = lumina_util.load_lumina_model( + args.pretrained_model_name_or_path, + loading_dtype, + torch.device("cpu"), + disable_mmap=args.disable_mmap_load_safetensors, + use_flash_attn=args.use_flash_attn, + ) + + if args.gradient_checkpointing: + nextdit.enable_gradient_checkpointing( + cpu_offload=args.cpu_offload_checkpointing + ) + + nextdit.requires_grad_(True) + + # block swap + + # backward compatibility + # if args.blocks_to_swap is None: + # blocks_to_swap = args.double_blocks_to_swap or 0 + # if args.single_blocks_to_swap is not None: + # blocks_to_swap += args.single_blocks_to_swap // 2 + # if blocks_to_swap > 0: + # logger.warning( + # "double_blocks_to_swap and single_blocks_to_swap are deprecated. Use blocks_to_swap instead." + # " / double_blocks_to_swapとsingle_blocks_to_swapは非推奨です。blocks_to_swapを使ってください。" + # ) + # logger.info( + # f"double_blocks_to_swap={args.double_blocks_to_swap} and single_blocks_to_swap={args.single_blocks_to_swap} are converted to blocks_to_swap={blocks_to_swap}." + # ) + # args.blocks_to_swap = blocks_to_swap + # del blocks_to_swap + + # is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 + # if is_swapping_blocks: + # # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. + # # This idea is based on 2kpr's great work. Thank you! + # logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") + # flux.enable_block_swap(args.blocks_to_swap, accelerator.device) + + if not cache_latents: + # load VAE here if not cached + ae = lumina_util.load_ae(args.ae, weight_dtype, "cpu") + ae.requires_grad_(False) + ae.eval() + ae.to(accelerator.device, dtype=weight_dtype) + + training_models = [] + params_to_optimize = [] + training_models.append(nextdit) + name_and_params = list(nextdit.named_parameters()) + # single param group for now + params_to_optimize.append( + {"params": [p for _, p in name_and_params], "lr": args.learning_rate} + ) + param_names = [[n for n, _ in name_and_params]] + + # calculate number of trainable parameters + n_params = 0 + for group in params_to_optimize: + for p in group["params"]: + n_params += p.numel() + + accelerator.print(f"number of trainable parameters: {n_params}") + + # 学習に必要なクラスを準備する + accelerator.print("prepare optimizer, data loader etc.") + + if args.blockwise_fused_optimizers: + # fused backward pass: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html + # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each block of parameters. + # This balances memory usage and management complexity. + + # split params into groups. 
currently different learning rates are not supported + grouped_params = [] + param_group = {} + for group in params_to_optimize: + named_parameters = list(nextdit.named_parameters()) + assert len(named_parameters) == len( + group["params"] + ), "number of parameters does not match" + for p, np in zip(group["params"], named_parameters): + # determine target layer and block index for each parameter + block_type = "other" # double, single or other + if np[0].startswith("double_blocks"): + block_index = int(np[0].split(".")[1]) + block_type = "double" + elif np[0].startswith("single_blocks"): + block_index = int(np[0].split(".")[1]) + block_type = "single" + else: + block_index = -1 + + param_group_key = (block_type, block_index) + if param_group_key not in param_group: + param_group[param_group_key] = [] + param_group[param_group_key].append(p) + + block_types_and_indices = [] + for param_group_key, param_group in param_group.items(): + block_types_and_indices.append(param_group_key) + grouped_params.append({"params": param_group, "lr": args.learning_rate}) + + num_params = 0 + for p in param_group: + num_params += p.numel() + accelerator.print(f"block {param_group_key}: {num_params} parameters") + + # prepare optimizers for each group + optimizers = [] + for group in grouped_params: + _, _, optimizer = train_util.get_optimizer(args, trainable_params=[group]) + optimizers.append(optimizer) + optimizer = optimizers[0] # avoid error in the following code + + logger.info( + f"using {len(optimizers)} optimizers for blockwise fused optimizers" + ) + + if train_util.is_schedulefree_optimizer(optimizers[0], args): + raise ValueError( + "Schedule-free optimizer is not supported with blockwise fused optimizers" + ) + optimizer_train_fn = lambda: None # dummy function + optimizer_eval_fn = lambda: None # dummy function + else: + _, _, optimizer = train_util.get_optimizer( + args, trainable_params=params_to_optimize + ) + optimizer_train_fn, optimizer_eval_fn = train_util.get_optimizer_train_eval_fn( + optimizer, args + ) + + # prepare dataloader + # strategies are set here because they cannot be referenced in another process. Copy them with the dataset + # some strategies can be None + train_dataset_group.set_current_strategies() + + # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 + n_workers = min( + args.max_data_loader_n_workers, os.cpu_count() + ) # cpu_count or max_data_loader_n_workers + train_dataloader = torch.utils.data.DataLoader( + train_dataset_group, + batch_size=1, + shuffle=True, + collate_fn=collator, + num_workers=n_workers, + persistent_workers=args.persistent_data_loader_workers, + ) + + # 学習ステップ数を計算する + if args.max_train_epochs is not None: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) + / accelerator.num_processes + / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. 
steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + + # データセット側にも学習ステップを送信 + train_dataset_group.set_max_train_steps(args.max_train_steps) + + # lr schedulerを用意する + if args.blockwise_fused_optimizers: + # prepare lr schedulers for each optimizer + lr_schedulers = [ + train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) + for optimizer in optimizers + ] + lr_scheduler = lr_schedulers[0] # avoid error in the following code + else: + lr_scheduler = train_util.get_scheduler_fix( + args, optimizer, accelerator.num_processes + ) + + # 実験的機能:勾配も含めたfp16/bf16学習を行う モデル全体をfp16/bf16にする + if args.full_fp16: + assert ( + args.mixed_precision == "fp16" + ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" + accelerator.print("enable full fp16 training.") + nextdit.to(weight_dtype) + if gemma2 is not None: + gemma2.to(weight_dtype) + elif args.full_bf16: + assert ( + args.mixed_precision == "bf16" + ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。" + accelerator.print("enable full bf16 training.") + nextdit.to(weight_dtype) + if gemma2 is not None: + gemma2.to(weight_dtype) + + # if we don't cache text encoder outputs, move them to device + if not args.cache_text_encoder_outputs: + gemma2.to(accelerator.device) + + clean_memory_on_device(accelerator.device) + + if args.deepspeed: + ds_model = deepspeed_utils.prepare_deepspeed_model(args, nextdit=nextdit) + # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007 + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_models = [ds_model] + + else: + # accelerator does some magic + # if we doesn't swap blocks, we can move the model to device + nextdit = accelerator.prepare( + nextdit, device_placement=[not is_swapping_blocks] + ) + if is_swapping_blocks: + accelerator.unwrap_model(nextdit).move_to_device_except_swap_blocks( + accelerator.device + ) # reduce peak memory usage + optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + optimizer, train_dataloader, lr_scheduler + ) + + # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする + if args.full_fp16: + # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. + # -> But we think it's ok to patch accelerator even if deepspeed is enabled. 
+ train_util.patch_accelerator_for_fp16_training(accelerator) + + # resumeする + train_util.resume_from_local_or_hf_if_specified(accelerator, args) + + if args.fused_backward_pass: + # use fused optimizer for backward pass: other optimizers will be supported in the future + import library.adafactor_fused + + library.adafactor_fused.patch_adafactor_fused(optimizer) + + for param_group, param_name_group in zip(optimizer.param_groups, param_names): + for parameter, param_name in zip(param_group["params"], param_name_group): + if parameter.requires_grad: + + def create_grad_hook(p_name, p_group): + def grad_hook(tensor: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, p_group) + tensor.grad = None + + return grad_hook + + parameter.register_post_accumulate_grad_hook( + create_grad_hook(param_name, param_group) + ) + + elif args.blockwise_fused_optimizers: + # prepare for additional optimizers and lr schedulers + for i in range(1, len(optimizers)): + optimizers[i] = accelerator.prepare(optimizers[i]) + lr_schedulers[i] = accelerator.prepare(lr_schedulers[i]) + + # counters are used to determine when to step the optimizer + global optimizer_hooked_count + global num_parameters_per_group + global parameter_optimizer_map + + optimizer_hooked_count = {} + num_parameters_per_group = [0] * len(optimizers) + parameter_optimizer_map = {} + + for opt_idx, optimizer in enumerate(optimizers): + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + + def grad_hook(parameter: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_( + parameter, args.max_grad_norm + ) + + i = parameter_optimizer_map[parameter] + optimizer_hooked_count[i] += 1 + if optimizer_hooked_count[i] == num_parameters_per_group[i]: + optimizers[i].step() + optimizers[i].zero_grad(set_to_none=True) + + parameter.register_post_accumulate_grad_hook(grad_hook) + parameter_optimizer_map[parameter] = opt_idx + num_parameters_per_group[opt_idx] += 1 + + # epoch数を計算する + num_update_steps_per_epoch = math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0): + args.save_every_n_epochs = ( + math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1 + ) + + # 学習する + # total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + accelerator.print("running training / 学習開始") + accelerator.print( + f" num examples / サンプル数: {train_dataset_group.num_train_images}" + ) + accelerator.print( + f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}" + ) + accelerator.print(f" num epochs / epoch数: {num_train_epochs}") + accelerator.print( + f" batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}" + ) + # accelerator.print( + # f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}" + # ) + accelerator.print( + f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}" + ) + accelerator.print( + f" total optimization steps / 学習ステップ数: {args.max_train_steps}" + ) + + progress_bar = tqdm( + range(args.max_train_steps), + smoothing=0, + disable=not 
accelerator.is_local_main_process, + desc="steps", + ) + global_step = 0 + + noise_scheduler = FlowMatchEulerDiscreteScheduler( + num_train_timesteps=1000, shift=args.discrete_flow_shift + ) + noise_scheduler_copy = copy.deepcopy(noise_scheduler) + + if accelerator.is_main_process: + init_kwargs = {} + if args.wandb_run_name: + init_kwargs["wandb"] = {"name": args.wandb_run_name} + if args.log_tracker_config is not None: + init_kwargs = toml.load(args.log_tracker_config) + accelerator.init_trackers( + "finetuning" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, + ) + + if is_swapping_blocks: + accelerator.unwrap_model(nextdit).prepare_block_swap_before_forward() + + # For --sample_at_first + optimizer_eval_fn() + lumina_train_util.sample_images( + accelerator, + args, + 0, + global_step, + nextdit, + ae, + gemma2, + sample_prompts_te_outputs, + ) + optimizer_train_fn() + if len(accelerator.trackers) > 0: + # log empty object to commit the sample images to wandb + accelerator.log({}, step=0) + + loss_recorder = train_util.LossRecorder() + epoch = 0 # avoid error when max_train_steps is 0 + for epoch in range(num_train_epochs): + accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") + current_epoch.value = epoch + 1 + + for m in training_models: + m.train() + + for step, batch in enumerate(train_dataloader): + current_step.value = global_step + + if args.blockwise_fused_optimizers: + optimizer_hooked_count = { + i: 0 for i in range(len(optimizers)) + } # reset counter for each step + + with accelerator.accumulate(*training_models): + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to( + accelerator.device, dtype=weight_dtype + ) + else: + with torch.no_grad(): + # encode images to latents. 
images are [-1, 1] + latents = ae.encode(batch["images"].to(ae.dtype)).to( + accelerator.device, dtype=weight_dtype + ) + + # NaNが含まれていれば警告を表示し0に置き換える + if torch.any(torch.isnan(latents)): + accelerator.print("NaN found in latents, replacing with zeros") + latents = torch.nan_to_num(latents, 0, out=latents) + + text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None) + if text_encoder_outputs_list is not None: + text_encoder_conds = text_encoder_outputs_list + else: + # not cached or training, so get from text encoders + tokens_and_masks = batch["input_ids_list"] + with torch.no_grad(): + input_ids = [ + ids.to(accelerator.device) + for ids in batch["input_ids_list"] + ] + text_encoder_conds = text_encoding_strategy.encode_tokens( + lumina_tokenize_strategy, + [gemma2], + input_ids, + ) + if args.full_fp16: + text_encoder_conds = [ + c.to(weight_dtype) for c in text_encoder_conds + ] + + # TODO support some features for noise implemented in get_noise_noisy_latents_and_timesteps + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = ( + lumina_train_util.get_noisy_model_input_and_timesteps( + args, + noise_scheduler_copy, + latents, + noise, + accelerator.device, + weight_dtype, + ) + ) + # call model + gemma2_hidden_states, input_ids, gemma2_attn_mask = text_encoder_conds + + with accelerator.autocast(): + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) + model_pred = nextdit( + x=img, # image latents (B, C, H, W) + t=timesteps / 1000, # timesteps需要除以1000来匹配模型预期 + cap_feats=gemma2_hidden_states, # Gemma2的hidden states作为caption features + cap_mask=gemma2_attn_mask.to( + dtype=torch.int32 + ), # Gemma2的attention mask + ) + # apply model prediction type + model_pred, weighting = lumina_train_util.apply_model_prediction_type( + args, model_pred, noisy_model_input, sigmas + ) + + # flow matching loss: this is different from SD3 + target = noise - latents + + # calculate loss + huber_c = train_util.get_huber_threshold_if_needed( + args, timesteps, noise_scheduler + ) + loss = train_util.conditional_loss( + model_pred.float(), target.float(), args.loss_type, "none", huber_c + ) + if weighting is not None: + loss = loss * weighting + if args.masked_loss or ( + "alpha_masks" in batch and batch["alpha_masks"] is not None + ): + loss = apply_masked_loss(loss, batch) + loss = loss.mean([1, 2, 3]) + + loss_weights = batch["loss_weights"] # 各sampleごとのweight + loss = loss * loss_weights + loss = loss.mean() + + # backward + accelerator.backward(loss) + + if not (args.fused_backward_pass or args.blockwise_fused_optimizers): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + params_to_clip = [] + for m in training_models: + params_to_clip.extend(m.parameters()) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=True) + else: + # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook + lr_scheduler.step() + if args.blockwise_fused_optimizers: + for i in range(1, len(optimizers)): + lr_schedulers[i].step() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + optimizer_eval_fn() + lumina_train_util.sample_images( + 
accelerator, + args, + None, + global_step, + nextdit, + ae, + gemma2, + sample_prompts_te_outputs, + ) + + # 指定ステップごとにモデルを保存 + if ( + args.save_every_n_steps is not None + and global_step % args.save_every_n_steps == 0 + ): + accelerator.wait_for_everyone() + if accelerator.is_main_process: + lumina_train_util.save_lumina_model_on_epoch_end_or_stepwise( + args, + False, + accelerator, + save_dtype, + epoch, + num_train_epochs, + global_step, + accelerator.unwrap_model(nextdit), + ) + optimizer_train_fn() + + current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず + if len(accelerator.trackers) > 0: + logs = {"loss": current_loss} + train_util.append_lr_to_logs( + logs, lr_scheduler, args.optimizer_type, including_unet=True + ) + + accelerator.log(logs, step=global_step) + + loss_recorder.add(epoch=epoch, step=step, loss=current_loss) + avr_loss: float = loss_recorder.moving_average + logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + + if global_step >= args.max_train_steps: + break + + if len(accelerator.trackers) > 0: + logs = {"loss/epoch": loss_recorder.moving_average} + accelerator.log(logs, step=epoch + 1) + + accelerator.wait_for_everyone() + + optimizer_eval_fn() + if args.save_every_n_epochs is not None: + if accelerator.is_main_process: + lumina_train_util.save_lumina_model_on_epoch_end_or_stepwise( + args, + True, + accelerator, + save_dtype, + epoch, + num_train_epochs, + global_step, + accelerator.unwrap_model(nextdit), + ) + + lumina_train_util.sample_images( + accelerator, + args, + epoch + 1, + global_step, + nextdit, + ae, + gemma2, + sample_prompts_te_outputs, + ) + optimizer_train_fn() + + is_main_process = accelerator.is_main_process + # if is_main_process: + nextdit = accelerator.unwrap_model(nextdit) + + accelerator.end_training() + optimizer_eval_fn() + + if args.save_state or args.save_state_on_train_end: + train_util.save_state_on_train_end(args, accelerator) + + del accelerator # この後メモリを使うのでこれは消す + + if is_main_process: + lumina_train_util.save_lumina_model_on_train_end( + args, save_dtype, epoch, global_step, nextdit + ) + logger.info("model saved.") + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + + add_logging_arguments(parser) + train_util.add_sd_models_arguments(parser) # TODO split this + train_util.add_dataset_arguments(parser, True, True, True) + train_util.add_training_arguments(parser, False) + train_util.add_masked_loss_arguments(parser) + deepspeed_utils.add_deepspeed_arguments(parser) + train_util.add_sd_saving_arguments(parser) + train_util.add_optimizer_arguments(parser) + config_util.add_config_arguments(parser) + add_custom_train_arguments(parser) # TODO remove this from here + train_util.add_dit_training_arguments(parser) + lumina_train_util.add_lumina_train_arguments(parser) + + parser.add_argument( + "--mem_eff_save", + action="store_true", + help="[EXPERIMENTAL] use memory efficient custom model saving method / メモリ効率の良い独自のモデル保存方法を使う", + ) + + parser.add_argument( + "--fused_optimizer_groups", + type=int, + default=None, + help="**this option is not working** will be removed in the future / このオプションは動作しません。将来削除されます", + ) + parser.add_argument( + "--blockwise_fused_optimizers", + action="store_true", + help="enable blockwise optimizers for fused backward pass and optimizer step / fused backward passとoptimizer step のためブロック単位のoptimizerを有効にする", + ) + parser.add_argument( + "--skip_latents_validity_check", + action="store_true", + 
help="[Deprecated] use 'skip_cache_check' instead / 代わりに 'skip_cache_check' を使用してください", + ) + parser.add_argument( + "--cpu_offload_checkpointing", + action="store_true", + help="[EXPERIMENTAL] enable offloading of tensors to CPU during checkpointing / チェックポイント時にテンソルをCPUにオフロードする", + ) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + train(args) diff --git a/lumina_train_network.py b/lumina_train_network.py index 0fd4da6b3..5f20c0146 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -15,7 +15,6 @@ import train_network from library import ( lumina_models, - flux_train_utils, lumina_util, lumina_train_util, sd3_train_utils, @@ -250,36 +249,10 @@ def get_noise_pred_and_target( ): assert isinstance(noise_scheduler, sd3_train_utils.FlowMatchEulerDiscreteScheduler) noise = torch.randn_like(latents) - bsz = latents.shape[0] - - # Sample a random timestep for each image - # for weighting schemes where we sample timesteps non-uniformly - u = lumina_train_util.compute_density_for_timestep_sampling( - weighting_scheme=args.weighting_scheme, - batch_size=bsz, - logit_mean=args.logit_mean, - logit_std=args.logit_std, - mode_scale=args.mode_scale, + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = lumina_train_util.get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, accelerator.device, weight_dtype ) - indices = (u * noise_scheduler.config.num_train_timesteps).long() - timesteps = noise_scheduler.timesteps[indices].to(device=latents.device) - - def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): - sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype) - schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device) - timesteps = timesteps.to(accelerator.device) - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < n_dim: - sigma = sigma.unsqueeze(-1) - return sigma - - # Add noise according to flow matching. 
- # zt = (1 - texp) * x + texp * z1 - # Lumina2 reverses the lerp i.e., sigma of 1.0 should mean `latents` - sigmas = get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype) - noisy_model_input = (1.0 - sigmas) * noise + sigmas * latents # ensure the hidden state will require grad if args.gradient_checkpointing: @@ -310,7 +283,7 @@ def call_dit(img, gemma2_hidden_states, gemma2_attn_mask, timesteps): ) # apply model prediction type - model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) + model_pred, weighting = lumina_train_util.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) # flow matching loss target = latents - noise @@ -336,7 +309,7 @@ def call_dit(img, gemma2_hidden_states, gemma2_attn_mask, timesteps): # model_pred_prior = lumina_util.unpack_latents( # model_pred_prior, packed_latent_height, packed_latent_width # ) - model_pred_prior, _ = flux_train_utils.apply_model_prediction_type( + model_pred_prior, _ = lumina_train_util.apply_model_prediction_type( args, model_pred_prior, noisy_model_input[diff_output_pr_indices], From a1a5627b13d0ebf182710ea0cea5e97ab2f6d580 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Wed, 26 Feb 2025 11:35:38 +0800 Subject: [PATCH 372/582] fix shift --- library/lumina_train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index ca0391673..11dd3febc 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -848,7 +848,7 @@ def get_noisy_model_input_and_timesteps( noisy_model_input = (1 - t) * noise + t * latents elif args.timestep_sampling == "nextdit_shift": t = torch.rand((bsz,), device=device) - mu = get_lin_function(y1=0.5, y2=1.15)((h // 16) * (w // 16)) # lumina use //16 + mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2)) t = time_shift(mu, 1.0, t) timesteps = t * 1000.0 From f4a004786500d80e1b47728d216aed9d76869a9d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 26 Feb 2025 20:50:44 +0900 Subject: [PATCH 373/582] feat: support metadata loading in MemoryEfficientSafeOpen --- library/utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/library/utils.py b/library/utils.py index 07079c6d9..4df8bd328 100644 --- a/library/utils.py +++ b/library/utils.py @@ -261,11 +261,10 @@ def validate_metadata(metadata: Dict[str, Any]) -> Dict[str, str]: class MemoryEfficientSafeOpen: - # does not support metadata loading def __init__(self, filename): self.filename = filename - self.header, self.header_size = self._read_header() self.file = open(filename, "rb") + self.header, self.header_size = self._read_header() def __enter__(self): return self @@ -276,6 +275,9 @@ def __exit__(self, exc_type, exc_val, exc_tb): def keys(self): return [k for k in self.header.keys() if k != "__metadata__"] + def metadata(self) -> Dict[str, str]: + return self.header.get("__metadata__", {}) + def get_tensor(self, key): if key not in self.header: raise KeyError(f"Tensor '{key}' not found in the file") @@ -293,10 +295,9 @@ def get_tensor(self, key): return self._deserialize_tensor(tensor_bytes, metadata) def _read_header(self): - with open(self.filename, "rb") as f: - header_size = struct.unpack(" Date: Wed, 26 Feb 2025 20:50:58 +0900 Subject: [PATCH 374/582] feat: add script to merge multiple safetensors files into a single file for SD3 --- tools/merge_sd3_safetensors.py | 139 +++++++++++++++++++++++++++++++++ 1 file changed, 139 
insertions(+) create mode 100644 tools/merge_sd3_safetensors.py diff --git a/tools/merge_sd3_safetensors.py b/tools/merge_sd3_safetensors.py new file mode 100644 index 000000000..bef7c9b90 --- /dev/null +++ b/tools/merge_sd3_safetensors.py @@ -0,0 +1,139 @@ +import argparse +import os +import gc +from typing import Dict, Optional, Union +import torch +from safetensors.torch import safe_open + +from library.utils import setup_logging +from library.utils import load_safetensors, mem_eff_save_file + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def merge_safetensors( + dit_path: str, + vae_path: Optional[str] = None, + clip_l_path: Optional[str] = None, + clip_g_path: Optional[str] = None, + t5xxl_path: Optional[str] = None, + output_path: str = "merged_model.safetensors", + device: str = "cpu", +): + """ + Merge multiple safetensors files into a single file + + Args: + dit_path: Path to the DiT/MMDiT model + vae_path: Path to the VAE model + clip_l_path: Path to the CLIP-L model + clip_g_path: Path to the CLIP-G model + t5xxl_path: Path to the T5-XXL model + output_path: Path to save the merged model + device: Device to load tensors to + """ + logger.info("Starting to merge safetensors files...") + + # 1. Get DiT metadata if available + metadata = None + try: + with safe_open(dit_path, framework="pt") as f: + metadata = f.metadata() # may be None + if metadata: + logger.info(f"Found metadata in DiT model: {metadata}") + except Exception as e: + logger.warning(f"Failed to read metadata from DiT model: {e}") + + # 2. Create empty merged state dict + merged_state_dict = {} + + # 3. Load and merge each model with memory management + + # DiT/MMDiT - prefix: model.diffusion_model. + logger.info(f"Loading DiT model from {dit_path}") + dit_state_dict = load_safetensors(dit_path, device=device, disable_mmap=True) + logger.info(f"Adding DiT model with {len(dit_state_dict)} keys") + for key, value in dit_state_dict.items(): + merged_state_dict[f"model.diffusion_model.{key}"] = value + # Free memory + del dit_state_dict + gc.collect() + + # VAE - prefix: first_stage_model. + if vae_path: + logger.info(f"Loading VAE model from {vae_path}") + vae_state_dict = load_safetensors(vae_path, device=device, disable_mmap=True) + logger.info(f"Adding VAE model with {len(vae_state_dict)} keys") + for key, value in vae_state_dict.items(): + merged_state_dict[f"first_stage_model.{key}"] = value + # Free memory + del vae_state_dict + gc.collect() + + # CLIP-L - prefix: text_encoders.clip_l. + if clip_l_path: + logger.info(f"Loading CLIP-L model from {clip_l_path}") + clip_l_state_dict = load_safetensors(clip_l_path, device=device, disable_mmap=True) + logger.info(f"Adding CLIP-L model with {len(clip_l_state_dict)} keys") + for key, value in clip_l_state_dict.items(): + merged_state_dict[f"text_encoders.clip_l.{key}"] = value + # Free memory + del clip_l_state_dict + gc.collect() + + # CLIP-G - prefix: text_encoders.clip_g. + if clip_g_path: + logger.info(f"Loading CLIP-G model from {clip_g_path}") + clip_g_state_dict = load_safetensors(clip_g_path, device=device, disable_mmap=True) + logger.info(f"Adding CLIP-G model with {len(clip_g_state_dict)} keys") + for key, value in clip_g_state_dict.items(): + merged_state_dict[f"text_encoders.clip_g.{key}"] = value + # Free memory + del clip_g_state_dict + gc.collect() + + # T5-XXL - prefix: text_encoders.t5xxl. 
+ if t5xxl_path: + logger.info(f"Loading T5-XXL model from {t5xxl_path}") + t5xxl_state_dict = load_safetensors(t5xxl_path, device=device, disable_mmap=True) + logger.info(f"Adding T5-XXL model with {len(t5xxl_state_dict)} keys") + for key, value in t5xxl_state_dict.items(): + merged_state_dict[f"text_encoders.t5xxl.{key}"] = value + # Free memory + del t5xxl_state_dict + gc.collect() + + # 4. Save merged state dict + logger.info(f"Saving merged model to {output_path} with {len(merged_state_dict)} keys total") + mem_eff_save_file(merged_state_dict, output_path, metadata) + logger.info("Successfully merged safetensors files") + + +def main(): + parser = argparse.ArgumentParser(description="Merge Stable Diffusion 3.5 model components into a single safetensors file") + parser.add_argument("--dit", required=True, help="Path to the DiT/MMDiT model") + parser.add_argument("--vae", help="Path to the VAE model") + parser.add_argument("--clip_l", help="Path to the CLIP-L model") + parser.add_argument("--clip_g", help="Path to the CLIP-G model") + parser.add_argument("--t5xxl", help="Path to the T5-XXL model") + parser.add_argument("--output", default="merged_model.safetensors", help="Path to save the merged model") + parser.add_argument("--device", default="cpu", help="Device to load tensors to") + + args = parser.parse_args() + + merge_safetensors( + dit_path=args.dit, + vae_path=args.vae, + clip_l_path=args.clip_l, + clip_g_path=args.clip_g, + t5xxl_path=args.t5xxl, + output_path=args.output, + device=args.device, + ) + + +if __name__ == "__main__": + main() From ae409e83c939f2c4a997cfb1679bd7cd364baf7e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 26 Feb 2025 20:56:32 +0900 Subject: [PATCH 375/582] fix: FLUX/SD3 network training not working without caching latents closes #1954 --- flux_train_network.py | 11 ++++++++--- sd3_train_network.py | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index ae4b62f5c..26503df1f 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -36,7 +36,12 @@ def __init__(self): self.is_schnell: Optional[bool] = None self.is_swapping_blocks: bool = False - def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): + def assert_extra_args( + self, + args, + train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], + val_dataset_group: Optional[train_util.DatasetGroup], + ): super().assert_extra_args(args, train_dataset_group, val_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) @@ -323,7 +328,7 @@ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) return noise_scheduler - def encode_images_to_latents(self, args, accelerator, vae, images): + def encode_images_to_latents(self, args, vae, images): return vae.encode(images) def shift_scale_latents(self, args, latents): @@ -341,7 +346,7 @@ def get_noise_pred_and_target( network, weight_dtype, train_unet, - is_train=True + is_train=True, ): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) diff --git a/sd3_train_network.py b/sd3_train_network.py index 2f4579492..9438bc7bc 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -26,7 +26,12 @@ def __init__(self): super().__init__() self.sample_prompts_te_outputs = None - def assert_extra_args(self, args, 
train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): + def assert_extra_args( + self, + args, + train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], + val_dataset_group: Optional[train_util.DatasetGroup], + ): # super().assert_extra_args(args, train_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) @@ -299,7 +304,7 @@ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.training_shift) return noise_scheduler - def encode_images_to_latents(self, args, accelerator, vae, images): + def encode_images_to_latents(self, args, vae, images): return vae.encode(images) def shift_scale_latents(self, args, latents): @@ -317,7 +322,7 @@ def get_noise_pred_and_target( network, weight_dtype, train_unet, - is_train=True + is_train=True, ): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) From 3d79239be4b20d67faed67c47f693396342e3af4 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 26 Feb 2025 21:21:04 +0900 Subject: [PATCH 376/582] docs: update README to include recent improvements in validation loss calculation --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 4bbd7617e..3c6993075 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,11 @@ The command to install PyTorch is as follows: ### Recent Updates +Feb 26, 2025: + +- Improve the validation loss calculation in `train_network.py`, `sdxl_train_network.py`, `flux_train_network.py`, and `sd3_train_network.py`. PR [#1903](https://github.com/kohya-ss/sd-scripts/pull/1903) + - The validation loss uses the fixed timestep sampling and the fixed random seed. This is to ensure that the validation loss is not fluctuated by the random values. + Jan 25, 2025: - `train_network.py`, `sdxl_train_network.py`, `flux_train_network.py`, and `sd3_train_network.py` now support validation loss. PR [#1864](https://github.com/kohya-ss/sd-scripts/pull/1864) Thank you to rockerBOO! From 70403f6977471e543f4ffa1b82edc0b0a4d77a3b Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 26 Feb 2025 23:33:50 -0500 Subject: [PATCH 377/582] fix cache text encoder outputs if not using disk. 
small cleanup/alignment --- library/strategy_lumina.py | 43 +++++++++++++++++++------------------- train_network.py | 1 - 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index c9e654236..74f15cec1 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -196,6 +196,7 @@ def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: input_ids = data["input_ids"] return [hidden_state, input_ids, attention_mask] + @torch.no_grad() def cache_batch_outputs( self, tokenize_strategy: TokenizeStrategy, @@ -222,23 +223,21 @@ def cache_batch_outputs( tokens, attention_masks, weights_list = ( tokenize_strategy.tokenize_with_weights(captions) ) - with torch.no_grad(): - hidden_state, input_ids, attention_masks = ( - text_encoding_strategy.encode_tokens_with_weights( - tokenize_strategy, - models, - (tokens, attention_masks), - weights_list, - ) + hidden_state, input_ids, attention_masks = ( + text_encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, + models, + (tokens, attention_masks), + weights_list, ) + ) else: tokens = tokenize_strategy.tokenize(captions) - with torch.no_grad(): - hidden_state, input_ids, attention_masks = ( - text_encoding_strategy.encode_tokens( - tokenize_strategy, models, tokens - ) + hidden_state, input_ids, attention_masks = ( + text_encoding_strategy.encode_tokens( + tokenize_strategy, models, tokens ) + ) if hidden_state.dtype != torch.float32: hidden_state = hidden_state.float() @@ -247,14 +246,14 @@ def cache_batch_outputs( attention_mask = attention_masks.cpu().numpy() # (B, S) input_ids = input_ids.cpu().numpy() # (B, S) + for i, info in enumerate(batch): hidden_state_i = hidden_state[i] attention_mask_i = attention_mask[i] input_ids_i = input_ids[i] - assert info.text_encoder_outputs_npz is not None, "Text encoder cache outputs to disk not found for image {info.image_path}" - if self.cache_to_disk: + assert info.text_encoder_outputs_npz is not None, f"Text encoder cache outputs to disk not found for image {info.image_key}" np.savez( info.text_encoder_outputs_npz, hidden_state=hidden_state_i, @@ -338,21 +337,21 @@ def load_latents_from_disk( # TODO remove circular dependency for ImageInfo def cache_batch_latents( self, - vae, - image_infos: List, + model, + batch: List, flip_aug: bool, alpha_mask: bool, random_crop: bool, ): - encode_by_vae = lambda img_tensor: vae.encode(img_tensor).to("cpu") - vae_device = vae.device - vae_dtype = vae.dtype + encode_by_vae = lambda img_tensor: model.encode(img_tensor).to("cpu") + vae_device = model.device + vae_dtype = model.dtype self._default_cache_batch_latents( encode_by_vae, vae_device, vae_dtype, - image_infos, + batch, flip_aug, alpha_mask, random_crop, @@ -360,4 +359,4 @@ def cache_batch_latents( ) if not train_util.HIGH_VRAM: - train_util.clean_memory_on_device(vae.device) + train_util.clean_memory_on_device(model.device) diff --git a/train_network.py b/train_network.py index ff62f46a3..b4b0d42d2 100644 --- a/train_network.py +++ b/train_network.py @@ -1282,7 +1282,6 @@ def remove_model(old_ckpt_name): # For --sample_at_first optimizer_eval_fn() self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, text_encoder, unet) - progress_bar.unpause() # Reset progress bar to before sampling images optimizer_train_fn() is_tracking = len(accelerator.trackers) > 0 if is_tracking: From 542f980443feadee0cbab2beeae3f9b3891a3058 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 27 Feb 2025 00:00:20 
-0500 Subject: [PATCH 378/582] Fix sample norms in batches --- library/lumina_train_util.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 11dd3febc..a95da382a 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -680,12 +680,14 @@ def denoise( dim=tuple(range(1, len(noise_pred_cond.shape))), keepdim=True, ) - max_new_norm = cond_norm * float(renorm_cfg) - noise_norm = torch.linalg.vector_norm( + max_new_norms = cond_norm * float(renorm_cfg) + noise_norms = torch.linalg.vector_norm( noise_pred, dim=tuple(range(1, len(noise_pred.shape))), keepdim=True ) - if noise_norm >= max_new_norm: - noise_pred = noise_pred * (max_new_norm / noise_norm) + # Iterate through batch + for noise_norm, max_new_norm, noise in zip(noise_norms, max_new_norms, noise_pred): + if noise_norm >= max_new_norm: + noise = noise * (max_new_norm / noise_norm) else: noise_pred = noise_pred_cond From 0886d976f1d3cca531bc068a5b1a0e54555dc20c Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 27 Feb 2025 02:31:50 -0500 Subject: [PATCH 379/582] Add block swap --- library/lumina_models.py | 65 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index 1a441a69d..c00ca88d4 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -29,6 +29,8 @@ import torch.nn as nn import torch.nn.functional as F +from library import custom_offloading_utils + try: from flash_attn import flash_attn_varlen_func from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa @@ -1066,8 +1068,16 @@ def forward(self, x: Tensor, t: Tensor, cap_feats: Tensor, cap_mask: Tensor) -> x, mask, freqs_cis, l_effective_cap_len, seq_lengths = self.patchify_and_embed(x, cap_feats, cap_mask, t) - for layer in self.layers: - x = layer(x, mask, freqs_cis, t) + if not self.blocks_to_swap: + for layer in self.layers: + x = layer(x, mask, freqs_cis, t) + else: + for block_idx, layer in enumerate(self.layers): + self.offloader_main.wait_for_block(block_idx) + + x = layer(x, mask, freqs_cis, t) + + self.offloader_main.submit_move_blocks(self.layers, block_idx) x = self.final_layer(x, t) x = self.unpatchify(x, width, height, l_effective_cap_len, seq_lengths) @@ -1184,6 +1194,57 @@ def get_fsdp_wrap_module_list(self) -> List[nn.Module]: def get_checkpointing_wrap_module_list(self) -> List[nn.Module]: return list(self.layers) + def enable_block_swap(self, num_blocks: int, device: torch.device): + """ + Enable block swapping to reduce memory usage during inference. + + Args: + num_blocks (int): Number of blocks to swap between CPU and device + device (torch.device): Device to use for computation + """ + self.blocks_to_swap = num_blocks + + # Calculate how many blocks to swap from main layers + num_main_blocks_to_swap = min(num_blocks, self.layers) + + assert num_main_blocks_to_swap <= len(self.layers) - 2, ( + f"Cannot swap more than {len(self.layers) - 2} main blocks. " + f"Requested {num_main_blocks_to_swap} blocks." + ) + + self.offloader_main = custom_offloading_utils.ModelOffloader( + self.layers, len(self.layers), num_main_blocks_to_swap, device + ) + + print(f"NextDiT: Block swap enabled. Swapping {num_blocks} blocks.") + + def move_to_device_except_swap_blocks(self, device: torch.device): + """ + Move the model to the device except for blocks that will be swapped. 
+ This reduces temporary memory usage during model loading. + + Args: + device (torch.device): Device to move the model to + """ + if self.blocks_to_swap: + save_layers = self.layers + self.layers = None + + self.to(device) + + self.layers = save_layers + else: + self.to(device) + + def prepare_block_swap_before_forward(self): + """ + Prepare blocks for swapping before forward pass. + """ + if self.blocks_to_swap is None or self.blocks_to_swap == 0: + return + + self.offloader_main.prepare_block_devices_before_forward(self.layers) + ############################################################################# # NextDiT Configs # From ce2610d29b399c8353686f50bf1973457a133153 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 27 Feb 2025 02:47:04 -0500 Subject: [PATCH 380/582] Change system prompt to inject Prompt Start special token --- library/lumina_train_util.py | 5 +++-- library/strategy_lumina.py | 3 ++- library/train_util.py | 6 ++++-- lumina_train_network.py | 9 ++++++--- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 11dd3febc..bfc470a93 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -330,11 +330,12 @@ def sample_image_inference( logger.info(f"renorm: {renorm_cfg}") # logger.info(f"sample_sampler: {sampler_name}") - system_prompt = args.system_prompt or "" + system_prompt_special_token = "" + system_prompt = f"{args.system_prompt} {system_prompt_special_token} " if args.system_prompt else "" # Apply system prompt to prompts prompt = system_prompt + prompt - negative_prompt = system_prompt + negative_prompt + negative_prompt = negative_prompt # Get sample prompts from cache if sample_prompts_gemma2_outputs and prompt in sample_prompts_gemma2_outputs: diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index c9e654236..275e290f6 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -216,7 +216,8 @@ def cache_batch_outputs( assert isinstance(text_encoding_strategy, LuminaTextEncodingStrategy) assert isinstance(tokenize_strategy, LuminaTokenizeStrategy) - captions = [info.system_prompt or "" + info.caption for info in batch] + system_prompt_special_token = "" + captions = [f"{info.system_prompt} {system_prompt_special_token} " if info.system_prompt else "" + info.caption for info in batch] if self.is_weighted: tokens, attention_masks, weights_list = ( diff --git a/library/train_util.py b/library/train_util.py index 0c057bd1a..34b98f89f 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1692,7 +1692,8 @@ def __getitem__(self, index): text_encoder_outputs_list.append(text_encoder_outputs) if tokenization_required: - system_prompt = subset.system_prompt or "" + system_prompt_special_token = "" + system_prompt = f"{subset.system_prompt} {system_prompt_special_token} " if subset.system_prompt else "" caption = self.process_caption(subset, image_info.caption) input_ids = [ids[0] for ids in self.tokenize_strategy.tokenize(system_prompt + caption)] # remove batch dimension # if self.XTI_layers: @@ -2091,7 +2092,8 @@ def load_dreambooth_dir(subset: DreamBoothSubset): else: num_train_images += num_repeats * len(img_paths) - system_prompt = self.system_prompt or subset.system_prompt or "" + system_prompt_special_token = "" + system_prompt = f"{self.system_prompt or subset.system_prompt} {system_prompt_special_token} " if self.system_prompt or subset.system_prompt else "" for img_path, caption, size in zip(img_paths, captions, 
sizes): info = ImageInfo(img_path, num_repeats, system_prompt + caption, subset.is_reg, img_path) if size is not None: diff --git a/lumina_train_network.py b/lumina_train_network.py index 5f20c0146..c9ef5f02c 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -155,7 +155,8 @@ def cache_text_encoder_outputs_if_needed( assert isinstance(tokenize_strategy, strategy_lumina.LuminaTokenizeStrategy) assert isinstance(text_encoding_strategy, strategy_lumina.LuminaTextEncodingStrategy) - system_prompt = args.system_prompt or "" + system_prompt_special_token = "" + system_prompt = f"{args.system_prompt} {system_prompt_special_token} " if args.system_prompt else "" sample_prompts = train_util.load_prompts(args.sample_prompts) sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): @@ -164,8 +165,10 @@ def cache_text_encoder_outputs_if_needed( prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", ""), ] - for prompt in prompts: - prompt = system_prompt + prompt + for i, prompt in enumerate(prompts): + # Add system prompt only to positive prompt + if i == 0: + prompt = system_prompt + prompt if prompt in sample_prompts_te_outputs: continue From 42fe22f5a25e950545b81e53c13b0c1c804d6e46 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 27 Feb 2025 03:21:24 -0500 Subject: [PATCH 381/582] Enable block swap for Lumina --- library/lumina_models.py | 7 +++---- lumina_train_network.py | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index c00ca88d4..020320b01 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -1205,15 +1205,14 @@ def enable_block_swap(self, num_blocks: int, device: torch.device): self.blocks_to_swap = num_blocks # Calculate how many blocks to swap from main layers - num_main_blocks_to_swap = min(num_blocks, self.layers) - assert num_main_blocks_to_swap <= len(self.layers) - 2, ( + assert num_blocks <= len(self.layers) - 2, ( f"Cannot swap more than {len(self.layers) - 2} main blocks. " - f"Requested {num_main_blocks_to_swap} blocks." + f"Requested {num_blocks} blocks." ) self.offloader_main = custom_offloading_utils.ModelOffloader( - self.layers, len(self.layers), num_main_blocks_to_swap, device + self.layers, len(self.layers), num_blocks, device ) print(f"NextDiT: Block swap enabled. Swapping {num_blocks} blocks.") diff --git a/lumina_train_network.py b/lumina_train_network.py index 5f20c0146..44c3f32f0 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -73,10 +73,10 @@ def load_target_model(self, args, weight_dtype, accelerator): ) model.to(torch.float8_e4m3fn) - # if args.blocks_to_swap: - # logger.info(f'Enabling block swap: {args.blocks_to_swap}') - # model.enable_block_swap(args.blocks_to_swap, accelerator.device) - # self.is_swapping_blocks = True + if args.blocks_to_swap: + logger.info(f'Enabling block swap: {args.blocks_to_swap}') + model.enable_block_swap(args.blocks_to_swap, accelerator.device) + self.is_swapping_blocks = True gemma2 = lumina_util.load_gemma2(args.gemma2, weight_dtype, "cpu") gemma2.eval() From 9647f1e32485444facb8a5be5eb77dbac797dc71 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 27 Feb 2025 20:36:36 -0500 Subject: [PATCH 382/582] Fix validation block swap. 
Add custom offloading tests --- library/custom_offloading_utils.py | 30 +- library/flux_models.py | 8 +- library/lumina_models.py | 19 +- library/sd3_models.py | 4 +- library/strategy_lumina.py | 2 +- lumina_train_network.py | 7 +- tests/test_custom_offloading_utils.py | 408 ++++++++++++++++++++++++++ 7 files changed, 446 insertions(+), 32 deletions(-) create mode 100644 tests/test_custom_offloading_utils.py diff --git a/library/custom_offloading_utils.py b/library/custom_offloading_utils.py index 84c2b743e..55ff08b64 100644 --- a/library/custom_offloading_utils.py +++ b/library/custom_offloading_utils.py @@ -1,6 +1,6 @@ from concurrent.futures import ThreadPoolExecutor import time -from typing import Optional +from typing import Optional, Union, Callable, Tuple import torch import torch.nn as nn @@ -19,7 +19,7 @@ def synchronize_device(device: torch.device): def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): assert layer_to_cpu.__class__ == layer_to_cuda.__class__ - weight_swap_jobs = [] + weight_swap_jobs: list[Tuple[nn.Module, nn.Module, torch.Tensor, torch.Tensor]] = [] # This is not working for all cases (e.g. SD3), so we need to find the corresponding modules # for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): @@ -42,7 +42,7 @@ def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, laye torch.cuda.current_stream().synchronize() # this prevents the illegal loss value - stream = torch.cuda.Stream() + stream = torch.Stream(device="cuda") with torch.cuda.stream(stream): # cuda to cpu for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: @@ -66,23 +66,24 @@ def swap_weight_devices_no_cuda(device: torch.device, layer_to_cpu: nn.Module, l """ assert layer_to_cpu.__class__ == layer_to_cuda.__class__ - weight_swap_jobs = [] + weight_swap_jobs: list[Tuple[nn.Module, nn.Module, torch.Tensor, torch.Tensor]] = [] for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()): if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) + # device to cpu for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True) - synchronize_device() + synchronize_device(device) # cpu to device for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) module_to_cuda.weight.data = cuda_data_view - synchronize_device() + synchronize_device(device) def weighs_to_device(layer: nn.Module, device: torch.device): @@ -148,13 +149,16 @@ def _wait_blocks_move(self, block_idx): print(f"Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s") +# Gradient tensors +_grad_t = Union[tuple[torch.Tensor, ...], torch.Tensor] + class ModelOffloader(Offloader): """ supports forward offloading """ - def __init__(self, blocks: list[nn.Module], num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): - super().__init__(num_blocks, blocks_to_swap, device, debug) + def __init__(self, blocks: Union[list[nn.Module], nn.ModuleList], blocks_to_swap: int, device: torch.device, debug: bool = False): + super().__init__(len(blocks), blocks_to_swap, device, debug) # register backward hooks self.remove_handles = [] @@ 
-168,7 +172,7 @@ def __del__(self): for handle in self.remove_handles: handle.remove() - def create_backward_hook(self, blocks: list[nn.Module], block_index: int) -> Optional[callable]: + def create_backward_hook(self, blocks: Union[list[nn.Module], nn.ModuleList], block_index: int) -> Optional[Callable[[nn.Module, _grad_t, _grad_t], Union[None, _grad_t]]]: # -1 for 0-based index num_blocks_propagated = self.num_blocks - block_index - 1 swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap @@ -182,7 +186,7 @@ def create_backward_hook(self, blocks: list[nn.Module], block_index: int) -> Opt block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated block_idx_to_wait = block_index - 1 - def backward_hook(module, grad_input, grad_output): + def backward_hook(module: nn.Module, grad_input: _grad_t, grad_output: _grad_t): if self.debug: print(f"Backward hook for block {block_index}") @@ -194,7 +198,7 @@ def backward_hook(module, grad_input, grad_output): return backward_hook - def prepare_block_devices_before_forward(self, blocks: list[nn.Module]): + def prepare_block_devices_before_forward(self, blocks: Union[list[nn.Module], nn.ModuleList]): if self.blocks_to_swap is None or self.blocks_to_swap == 0: return @@ -207,7 +211,7 @@ def prepare_block_devices_before_forward(self, blocks: list[nn.Module]): for b in blocks[self.num_blocks - self.blocks_to_swap :]: b.to(self.device) # move block to device first - weighs_to_device(b, "cpu") # make sure weights are on cpu + weighs_to_device(b, torch.device("cpu")) # make sure weights are on cpu synchronize_device(self.device) clean_memory_on_device(self.device) @@ -217,7 +221,7 @@ def wait_for_block(self, block_idx: int): return self._wait_blocks_move(block_idx) - def submit_move_blocks(self, blocks: list[nn.Module], block_idx: int): + def submit_move_blocks(self, blocks: Union[list[nn.Module], nn.ModuleList], block_idx: int): if self.blocks_to_swap is None or self.blocks_to_swap == 0: return if block_idx >= self.blocks_to_swap: diff --git a/library/flux_models.py b/library/flux_models.py index 328ad481d..b00bdae23 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1219,10 +1219,10 @@ def enable_block_swap(self, num_blocks: int, device: torch.device): ) self.offloader_double = custom_offloading_utils.ModelOffloader( - self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device # , debug=True + self.double_blocks, double_blocks_to_swap, device # , debug=True ) self.offloader_single = custom_offloading_utils.ModelOffloader( - self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device # , debug=True + self.single_blocks, single_blocks_to_swap, device # , debug=True ) print( f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." 
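For reference, the refactored ModelOffloader shown above is driven per block during the forward pass: wait for a block's weights to arrive on the device, run the block, then queue the swap of the finished block back to CPU, exactly as the NextDiT and FLUX forward loops in this series do. A minimal sketch of that pattern, assuming the library.custom_offloading_utils API as patched here; the nn.Linear stand-in blocks and the swap count of 2 are illustrative only and not part of the patch:

    import torch
    import torch.nn as nn
    from library.custom_offloading_utils import ModelOffloader

    # stand-in "transformer blocks" for illustration; real call sites pass
    # double_blocks / single_blocks / layers from the model
    blocks = nn.ModuleList([nn.Linear(16, 16) for _ in range(8)])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # the constructor now derives the block count from len(blocks)
    offloader = ModelOffloader(blocks, blocks_to_swap=2, device=device)

    # place the first blocks on the device and park the last ones on CPU
    offloader.prepare_block_devices_before_forward(blocks)

    x = torch.randn(1, 16, device=device)
    for idx, block in enumerate(blocks):
        offloader.wait_for_block(idx)              # ensure this block's weights are on the device
        x = block(x)
        offloader.submit_move_blocks(blocks, idx)  # asynchronously swap an early block out, a late block in

Because the block count is taken from len(blocks), call sites such as the flux_models.py hunks above only pass the block list, the number of blocks to swap, and the device.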
@@ -1233,8 +1233,8 @@ def move_to_device_except_swap_blocks(self, device: torch.device): if self.blocks_to_swap: save_double_blocks = self.double_blocks save_single_blocks = self.single_blocks - self.double_blocks = None - self.single_blocks = None + self.double_blocks = nn.ModuleList() + self.single_blocks = nn.ModuleList() self.to(device) diff --git a/library/lumina_models.py b/library/lumina_models.py index 020320b01..2d4c65271 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -1194,7 +1194,7 @@ def get_fsdp_wrap_module_list(self) -> List[nn.Module]: def get_checkpointing_wrap_module_list(self) -> List[nn.Module]: return list(self.layers) - def enable_block_swap(self, num_blocks: int, device: torch.device): + def enable_block_swap(self, blocks_to_swap: int, device: torch.device): """ Enable block swapping to reduce memory usage during inference. @@ -1202,20 +1202,18 @@ def enable_block_swap(self, num_blocks: int, device: torch.device): num_blocks (int): Number of blocks to swap between CPU and device device (torch.device): Device to use for computation """ - self.blocks_to_swap = num_blocks + self.blocks_to_swap = blocks_to_swap # Calculate how many blocks to swap from main layers - assert num_blocks <= len(self.layers) - 2, ( + assert blocks_to_swap <= len(self.layers) - 2, ( f"Cannot swap more than {len(self.layers) - 2} main blocks. " - f"Requested {num_blocks} blocks." + f"Requested {blocks_to_swap} blocks." ) self.offloader_main = custom_offloading_utils.ModelOffloader( - self.layers, len(self.layers), num_blocks, device + self.layers, blocks_to_swap, device, debug=False ) - - print(f"NextDiT: Block swap enabled. Swapping {num_blocks} blocks.") def move_to_device_except_swap_blocks(self, device: torch.device): """ @@ -1227,13 +1225,12 @@ def move_to_device_except_swap_blocks(self, device: torch.device): """ if self.blocks_to_swap: save_layers = self.layers - self.layers = None + self.layers = nn.ModuleList([]) - self.to(device) + self.to(device) + if self.blocks_to_swap: self.layers = save_layers - else: - self.to(device) def prepare_block_swap_before_forward(self): """ diff --git a/library/sd3_models.py b/library/sd3_models.py index e4a931861..996f81920 100644 --- a/library/sd3_models.py +++ b/library/sd3_models.py @@ -1080,7 +1080,7 @@ def enable_block_swap(self, num_blocks: int, device: torch.device): ), f"Cannot swap more than {self.num_blocks - 2} blocks. Requested: {self.blocks_to_swap} blocks." self.offloader = custom_offloading_utils.ModelOffloader( - self.joint_blocks, self.num_blocks, self.blocks_to_swap, device # , debug=True + self.joint_blocks, self.blocks_to_swap, device # , debug=True ) print(f"SD3: Block swap enabled. Swapping {num_blocks} blocks, total blocks: {self.num_blocks}, device: {device}.") @@ -1088,7 +1088,7 @@ def move_to_device_except_swap_blocks(self, device: torch.device): # assume model is on cpu. 
do not move blocks to device to reduce temporary memory usage if self.blocks_to_swap: save_blocks = self.joint_blocks - self.joint_blocks = None + self.joint_blocks = nn.ModuleList() self.to(device) diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index c9e654236..714326ad2 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -208,7 +208,7 @@ def cache_batch_outputs( tokenize_strategy (LuminaTokenizeStrategy): Tokenize strategy models (List[Any]): Text encoders text_encoding_strategy (LuminaTextEncodingStrategy): - infos (List): List of image_info + infos (List): List of ImageInfo Returns: None diff --git a/lumina_train_network.py b/lumina_train_network.py index 44c3f32f0..3e003a921 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -74,7 +74,7 @@ def load_target_model(self, args, weight_dtype, accelerator): model.to(torch.float8_e4m3fn) if args.blocks_to_swap: - logger.info(f'Enabling block swap: {args.blocks_to_swap}') + logger.info(f'Lumina 2: Enabling block swap: {args.blocks_to_swap}') model.enable_block_swap(args.blocks_to_swap, accelerator.device) self.is_swapping_blocks = True @@ -361,6 +361,11 @@ def prepare_unet_with_accelerator( return nextdit + def on_validation_step_end(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype): + if self.is_swapping_blocks: + # prepare for next forward: because backward pass is not called, we need to prepare it here + accelerator.unwrap_model(unet).prepare_block_swap_before_forward() + def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() diff --git a/tests/test_custom_offloading_utils.py b/tests/test_custom_offloading_utils.py new file mode 100644 index 000000000..5fa40b768 --- /dev/null +++ b/tests/test_custom_offloading_utils.py @@ -0,0 +1,408 @@ +import pytest +import torch +import torch.nn as nn +from unittest.mock import patch, MagicMock + +from library.custom_offloading_utils import ( + synchronize_device, + swap_weight_devices_cuda, + swap_weight_devices_no_cuda, + weighs_to_device, + Offloader, + ModelOffloader +) + +class TransformerBlock(nn.Module): + def __init__(self, block_idx: int): + super().__init__() + self.block_idx = block_idx + self.linear1 = nn.Linear(10, 5) + self.linear2 = nn.Linear(5, 10) + self.seq = nn.Sequential(nn.SiLU(), nn.Linear(10, 10)) + + def forward(self, x): + x = self.linear1(x) + x = torch.relu(x) + x = self.linear2(x) + x = self.seq(x) + return x + + +class SimpleModel(nn.Module): + def __init__(self, num_blocks=16): + super().__init__() + self.blocks = nn.ModuleList([ + TransformerBlock(i) + for i in range(num_blocks)]) + + def forward(self, x): + for block in self.blocks: + x = block(x) + return x + + @property + def device(self): + return next(self.parameters()).device + + +# Device Synchronization Tests +@patch('torch.cuda.synchronize') +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_cuda_synchronize(mock_cuda_sync): + device = torch.device('cuda') + synchronize_device(device) + mock_cuda_sync.assert_called_once() + +@patch('torch.xpu.synchronize') +@pytest.mark.skipif(not torch.xpu.is_available(), reason="XPU not available") +def test_xpu_synchronize(mock_xpu_sync): + device = torch.device('xpu') + synchronize_device(device) + mock_xpu_sync.assert_called_once() + +@patch('torch.mps.synchronize') +@pytest.mark.skipif(not torch.xpu.is_available(), reason="MPS not available") +def test_mps_synchronize(mock_mps_sync): + device = torch.device('mps') + 
synchronize_device(device) + mock_mps_sync.assert_called_once() + + +# Weights to Device Tests +def test_weights_to_device(): + # Create a simple model with weights + model = nn.Sequential( + nn.Linear(10, 5), + nn.ReLU(), + nn.Linear(5, 2) + ) + + # Start with CPU tensors + device = torch.device('cpu') + for module in model.modules(): + if hasattr(module, "weight") and module.weight is not None: + assert module.weight.device == device + + # Move to mock CUDA device + mock_device = torch.device('cuda') + with patch('torch.Tensor.to', return_value=torch.zeros(1).to(device)): + weighs_to_device(model, mock_device) + + # Since we mocked the to() function, we can only verify modules were processed + # but can't check actual device movement + + +# Swap Weight Devices Tests +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_swap_weight_devices_cuda(): + device = torch.device('cuda') + layer_to_cpu = SimpleModel() + layer_to_cuda = SimpleModel() + + # Move layer to CUDA to move to CPU + layer_to_cpu.to(device) + + with patch('torch.Tensor.to', return_value=torch.zeros(1)): + with patch('torch.Tensor.copy_'): + swap_weight_devices_cuda(device, layer_to_cpu, layer_to_cuda) + + assert layer_to_cpu.device.type == 'cpu' + assert layer_to_cuda.device.type == 'cuda' + + + +@patch('library.custom_offloading_utils.synchronize_device') +def test_swap_weight_devices_no_cuda(mock_sync_device): + device = torch.device('cpu') + layer_to_cpu = SimpleModel() + layer_to_cuda = SimpleModel() + + with patch('torch.Tensor.to', return_value=torch.zeros(1)): + with patch('torch.Tensor.copy_'): + swap_weight_devices_no_cuda(device, layer_to_cpu, layer_to_cuda) + + # Verify synchronize_device was called twice + assert mock_sync_device.call_count == 2 + + +# Offloader Tests +@pytest.fixture +def offloader(): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + return Offloader( + num_blocks=4, + blocks_to_swap=2, + device=device, + debug=False + ) + + +def test_offloader_init(offloader): + assert offloader.num_blocks == 4 + assert offloader.blocks_to_swap == 2 + assert hasattr(offloader, 'thread_pool') + assert offloader.futures == {} + assert offloader.cuda_available == (offloader.device.type == 'cuda') + + +@patch('library.custom_offloading_utils.swap_weight_devices_cuda') +@patch('library.custom_offloading_utils.swap_weight_devices_no_cuda') +def test_swap_weight_devices(mock_no_cuda, mock_cuda, offloader: Offloader): + block_to_cpu = SimpleModel() + block_to_cuda = SimpleModel() + + # Force test for CUDA device + offloader.cuda_available = True + offloader.swap_weight_devices(block_to_cpu, block_to_cuda) + mock_cuda.assert_called_once_with(offloader.device, block_to_cpu, block_to_cuda) + mock_no_cuda.assert_not_called() + + # Reset mocks + mock_cuda.reset_mock() + mock_no_cuda.reset_mock() + + # Force test for non-CUDA device + offloader.cuda_available = False + offloader.swap_weight_devices(block_to_cpu, block_to_cuda) + mock_no_cuda.assert_called_once_with(offloader.device, block_to_cpu, block_to_cuda) + mock_cuda.assert_not_called() + + +@patch('library.custom_offloading_utils.Offloader.swap_weight_devices') +def test_submit_move_blocks(mock_swap, offloader): + blocks = [SimpleModel() for _ in range(4)] + block_idx_to_cpu = 0 + block_idx_to_cuda = 2 + + # Mock the thread pool to execute synchronously + future = MagicMock() + future.result.return_value = (block_idx_to_cpu, block_idx_to_cuda) + offloader.thread_pool.submit = MagicMock(return_value=future) 
+ + offloader._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda) + + # Check that the future is stored with the correct key + assert block_idx_to_cuda in offloader.futures + + +def test_wait_blocks_move(offloader): + block_idx = 2 + + # Test with no future for the block + offloader._wait_blocks_move(block_idx) # Should not raise + + # Create a fake future and test waiting + future = MagicMock() + future.result.return_value = (0, block_idx) + offloader.futures[block_idx] = future + + offloader._wait_blocks_move(block_idx) + + # Check that the future was removed + assert block_idx not in offloader.futures + future.result.assert_called_once() + + +# ModelOffloader Tests +@pytest.fixture +def model_offloader(): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + blocks_to_swap = 2 + blocks = SimpleModel(4).blocks + return ModelOffloader( + blocks=blocks, + blocks_to_swap=blocks_to_swap, + device=device, + debug=False + ) + + +def test_model_offloader_init(model_offloader): + assert model_offloader.num_blocks == 4 + assert model_offloader.blocks_to_swap == 2 + assert hasattr(model_offloader, 'thread_pool') + assert model_offloader.futures == {} + assert len(model_offloader.remove_handles) > 0 # Should have registered hooks + + +def test_create_backward_hook(): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + blocks_to_swap = 2 + blocks = SimpleModel(4).blocks + model_offloader = ModelOffloader( + blocks=blocks, + blocks_to_swap=blocks_to_swap, + device=device, + debug=False + ) + + # Test hook creation for swapping case (block 0) + hook_swap = model_offloader.create_backward_hook(blocks, 0) + assert hook_swap is None + + # Test hook creation for waiting case (block 1) + hook_wait = model_offloader.create_backward_hook(blocks, 1) + assert hook_wait is not None + + # Test hook creation for no action case (block 3) + hook_none = model_offloader.create_backward_hook(blocks, 3) + assert hook_none is None + + +@patch('library.custom_offloading_utils.ModelOffloader._submit_move_blocks') +@patch('library.custom_offloading_utils.ModelOffloader._wait_blocks_move') +def test_backward_hook_execution(mock_wait, mock_submit): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + blocks_to_swap = 2 + model = SimpleModel(4) + blocks = model.blocks + model_offloader = ModelOffloader( + blocks=blocks, + blocks_to_swap=blocks_to_swap, + device=device, + debug=False + ) + + # Test swapping hook (block 1) + hook_swap = model_offloader.create_backward_hook(blocks, 1) + assert hook_swap is not None + hook_swap(model, torch.zeros(1), torch.zeros(1)) + mock_submit.assert_called_once() + + mock_submit.reset_mock() + + # Test waiting hook (block 2) + hook_wait = model_offloader.create_backward_hook(blocks, 2) + assert hook_wait is not None + hook_wait(model, torch.zeros(1), torch.zeros(1)) + assert mock_wait.call_count == 2 + + +@patch('library.custom_offloading_utils.weighs_to_device') +@patch('library.custom_offloading_utils.synchronize_device') +@patch('library.custom_offloading_utils.clean_memory_on_device') +def test_prepare_block_devices_before_forward(mock_clean, mock_sync, mock_weights_to_device, model_offloader): + model = SimpleModel(4) + blocks = model.blocks + + with patch.object(nn.Module, 'to'): + model_offloader.prepare_block_devices_before_forward(blocks) + + # Check that weighs_to_device was called for each block + assert mock_weights_to_device.call_count == 4 + + # Check that synchronize_device and clean_memory_on_device 
were called + mock_sync.assert_called_once_with(model_offloader.device) + mock_clean.assert_called_once_with(model_offloader.device) + + +@patch('library.custom_offloading_utils.ModelOffloader._wait_blocks_move') +def test_wait_for_block(mock_wait, model_offloader): + # Test with blocks_to_swap=0 + model_offloader.blocks_to_swap = 0 + model_offloader.wait_for_block(1) + mock_wait.assert_not_called() + + # Test with blocks_to_swap=2 + model_offloader.blocks_to_swap = 2 + block_idx = 1 + model_offloader.wait_for_block(block_idx) + mock_wait.assert_called_once_with(block_idx) + + +@patch('library.custom_offloading_utils.ModelOffloader._submit_move_blocks') +def test_submit_move_blocks(mock_submit, model_offloader): + model = SimpleModel() + blocks = model.blocks + + # Test with blocks_to_swap=0 + model_offloader.blocks_to_swap = 0 + model_offloader.submit_move_blocks(blocks, 1) + mock_submit.assert_not_called() + + mock_submit.reset_mock() + model_offloader.blocks_to_swap = 2 + + # Test within swap range + block_idx = 1 + model_offloader.submit_move_blocks(blocks, block_idx) + mock_submit.assert_called_once() + + mock_submit.reset_mock() + + # Test outside swap range + block_idx = 3 + model_offloader.submit_move_blocks(blocks, block_idx) + mock_submit.assert_not_called() + + +# Integration test for offloading in a realistic scenario +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_offloading_integration(): + device = torch.device('cuda') + # Create a mini model with 4 blocks + model = SimpleModel(5) + model.to(device) + blocks = model.blocks + + # Initialize model offloader + offloader = ModelOffloader( + blocks=blocks, + blocks_to_swap=2, + device=device, + debug=True + ) + + # Prepare blocks for forward pass + offloader.prepare_block_devices_before_forward(blocks) + + # Simulate forward pass with offloading + input_tensor = torch.randn(1, 10, device=device) + x = input_tensor + + for i, block in enumerate(blocks): + # Wait for the current block to be ready + offloader.wait_for_block(i) + + # Process through the block + x = block(x) + + # Schedule moving weights for future blocks + offloader.submit_move_blocks(blocks, i) + + # Verify we get a valid output + assert x.shape == (1, 10) + assert not torch.isnan(x).any() + + +# Error handling tests +def test_offloader_assertion_error(): + with pytest.raises(AssertionError): + device = torch.device('cpu') + layer_to_cpu = SimpleModel() + layer_to_cuda = nn.Linear(10, 5) # Different class + swap_weight_devices_cuda(device, layer_to_cpu, layer_to_cuda) + +if __name__ == "__main__": + # Run all tests when file is executed directly + import sys + + # Configure pytest command line arguments + pytest_args = [ + "-v", # Verbose output + "--color=yes", # Colored output + __file__, # Run tests in this file + ] + + # Add optional arguments from command line + if len(sys.argv) > 1: + pytest_args.extend(sys.argv[1:]) + + # Print info about test execution + print(f"Running tests with PyTorch {torch.__version__}") + print(f"CUDA available: {torch.cuda.is_available()}") + if torch.cuda.is_available(): + print(f"CUDA device: {torch.cuda.get_device_name(0)}") + + # Run the tests + sys.exit(pytest.main(pytest_args)) From 734333d0c9eec3f20582c9c16f6d148cb1ec2596 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 28 Feb 2025 23:52:29 +0900 Subject: [PATCH 383/582] feat: enhance merging logic for safetensors models to handle key prefixes correctly --- tools/merge_sd3_safetensors.py | 29 +++++++++++++++++++++++------ 1 file 
changed, 23 insertions(+), 6 deletions(-) diff --git a/tools/merge_sd3_safetensors.py b/tools/merge_sd3_safetensors.py index bef7c9b90..960cf6e77 100644 --- a/tools/merge_sd3_safetensors.py +++ b/tools/merge_sd3_safetensors.py @@ -53,22 +53,30 @@ def merge_safetensors( # 3. Load and merge each model with memory management # DiT/MMDiT - prefix: model.diffusion_model. + # This state dict may have VAE keys. logger.info(f"Loading DiT model from {dit_path}") dit_state_dict = load_safetensors(dit_path, device=device, disable_mmap=True) logger.info(f"Adding DiT model with {len(dit_state_dict)} keys") for key, value in dit_state_dict.items(): - merged_state_dict[f"model.diffusion_model.{key}"] = value + if key.startswith("model.diffusion_model.") or key.startswith("first_stage_model."): + merged_state_dict[key] = value + else: + merged_state_dict[f"model.diffusion_model.{key}"] = value # Free memory del dit_state_dict gc.collect() # VAE - prefix: first_stage_model. + # May be omitted if VAE is already included in DiT model. if vae_path: logger.info(f"Loading VAE model from {vae_path}") vae_state_dict = load_safetensors(vae_path, device=device, disable_mmap=True) logger.info(f"Adding VAE model with {len(vae_state_dict)} keys") for key, value in vae_state_dict.items(): - merged_state_dict[f"first_stage_model.{key}"] = value + if key.startswith("first_stage_model."): + merged_state_dict[key] = value + else: + merged_state_dict[f"first_stage_model.{key}"] = value # Free memory del vae_state_dict gc.collect() @@ -79,7 +87,10 @@ def merge_safetensors( clip_l_state_dict = load_safetensors(clip_l_path, device=device, disable_mmap=True) logger.info(f"Adding CLIP-L model with {len(clip_l_state_dict)} keys") for key, value in clip_l_state_dict.items(): - merged_state_dict[f"text_encoders.clip_l.{key}"] = value + if key.startswith("text_encoders.clip_l.transformer."): + merged_state_dict[key] = value + else: + merged_state_dict[f"text_encoders.clip_l.transformer.{key}"] = value # Free memory del clip_l_state_dict gc.collect() @@ -90,7 +101,10 @@ def merge_safetensors( clip_g_state_dict = load_safetensors(clip_g_path, device=device, disable_mmap=True) logger.info(f"Adding CLIP-G model with {len(clip_g_state_dict)} keys") for key, value in clip_g_state_dict.items(): - merged_state_dict[f"text_encoders.clip_g.{key}"] = value + if key.startswith("text_encoders.clip_g.transformer."): + merged_state_dict[key] = value + else: + merged_state_dict[f"text_encoders.clip_g.transformer.{key}"] = value # Free memory del clip_g_state_dict gc.collect() @@ -101,7 +115,10 @@ def merge_safetensors( t5xxl_state_dict = load_safetensors(t5xxl_path, device=device, disable_mmap=True) logger.info(f"Adding T5-XXL model with {len(t5xxl_state_dict)} keys") for key, value in t5xxl_state_dict.items(): - merged_state_dict[f"text_encoders.t5xxl.{key}"] = value + if key.startswith("text_encoders.t5xxl.transformer."): + merged_state_dict[key] = value + else: + merged_state_dict[f"text_encoders.t5xxl.transformer.{key}"] = value # Free memory del t5xxl_state_dict gc.collect() @@ -115,7 +132,7 @@ def merge_safetensors( def main(): parser = argparse.ArgumentParser(description="Merge Stable Diffusion 3.5 model components into a single safetensors file") parser.add_argument("--dit", required=True, help="Path to the DiT/MMDiT model") - parser.add_argument("--vae", help="Path to the VAE model") + parser.add_argument("--vae", help="Path to the VAE model. 
May be omitted if VAE is included in DiT model") parser.add_argument("--clip_l", help="Path to the CLIP-L model") parser.add_argument("--clip_g", help="Path to the CLIP-G model") parser.add_argument("--t5xxl", help="Path to the T5-XXL model") From d6f7e2e20cfe91eb0c7a5f4c277107f7b699d97f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 28 Feb 2025 14:08:27 -0500 Subject: [PATCH 384/582] Fix block swap for sample images --- library/flux_train_utils.py | 1 - library/lumina_train_util.py | 3 ++- lumina_train_network.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index f7f06c5cf..c6d2baeb0 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -317,7 +317,6 @@ def denoise( # this is ignored for schnell guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) - for t_curr, t_prev in zip(tqdm(timesteps[:-1]), timesteps[1:]): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) model.prepare_block_swap_before_forward() diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 11dd3febc..e008b3ce3 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -604,7 +604,6 @@ def retrieve_timesteps( timesteps = scheduler.timesteps return timesteps, num_inference_steps - def denoise( scheduler, model: lumina_models.NextDiT, @@ -648,6 +647,7 @@ def denoise( """ for i, t in enumerate(tqdm(timesteps)): + # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image current_timestep = 1 - t / scheduler.config.num_train_timesteps # broadcast to batch dimension in a way that's compatible with ONNX/Core ML @@ -700,6 +700,7 @@ def denoise( noise_pred = -noise_pred img = scheduler.step(noise_pred, t, img, return_dict=False)[0] + model.prepare_block_swap_before_forward() return img diff --git a/lumina_train_network.py b/lumina_train_network.py index 3e003a921..60c39c20b 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -367,6 +367,7 @@ def on_validation_step_end(self, args, accelerator, network, text_encoders, unet accelerator.unwrap_model(unet).prepare_block_swap_before_forward() + def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() train_util.add_dit_training_arguments(parser) From 1bba7acd9ac42ef5a654cadf47356d20d407ce82 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 28 Feb 2025 14:11:53 -0500 Subject: [PATCH 385/582] Add block swap in sample image timestep loop --- library/lumina_train_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index e008b3ce3..0be81df98 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -647,6 +647,7 @@ def denoise( """ for i, t in enumerate(tqdm(timesteps)): + model.prepare_block_swap_before_forward() # reverse the timestep since Lumina uses t=0 as the noise and t=1 as the image current_timestep = 1 - t / scheduler.config.num_train_timesteps From a2daa870074310ba2415da993016f0779c8b56e2 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 28 Feb 2025 14:22:39 -0500 Subject: [PATCH 386/582] Add block swap for uncond (neg) for sample images --- library/lumina_train_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 0be81df98..933a4eda6 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -665,6 +665,7 @@ def 
denoise( # compute whether to apply classifier-free guidance based on current timestep if current_timestep[0] < cfg_trunc_ratio: + model.prepare_block_swap_before_forward() noise_pred_uncond = model( img, current_timestep, From cad182d29a2f3ad3ed7550b258025f3243981464 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 28 Feb 2025 18:30:16 -0500 Subject: [PATCH 387/582] fix torch compile/dynamo for Gemma2 --- library/strategy_lumina.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index c9e654236..b4c941064 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -97,7 +97,8 @@ def encode_tokens( hidden_states, input_ids, attention_masks """ text_encoder = models[0] - assert isinstance(text_encoder, Gemma2Model) + # Check model or torch dynamo OptimizedModule + assert isinstance(text_encoder, Gemma2Model) or isinstance(text_encoder._orig_mod, Gemma2Model), f"text encoder is not Gemma2Model {text_encoder.__class__.__name__}" input_ids, attention_masks = tokens outputs = text_encoder( From ba5251168a91f608de9fe9e365a2f889e4bb6cf8 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 1 Mar 2025 10:31:39 +0900 Subject: [PATCH 388/582] fix: save tensors as is dtype, add save_precision option --- tools/merge_sd3_safetensors.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tools/merge_sd3_safetensors.py b/tools/merge_sd3_safetensors.py index 960cf6e77..6bc1003ec 100644 --- a/tools/merge_sd3_safetensors.py +++ b/tools/merge_sd3_safetensors.py @@ -6,7 +6,7 @@ from safetensors.torch import safe_open from library.utils import setup_logging -from library.utils import load_safetensors, mem_eff_save_file +from library.utils import load_safetensors, mem_eff_save_file, str_to_dtype setup_logging() import logging @@ -22,6 +22,7 @@ def merge_safetensors( t5xxl_path: Optional[str] = None, output_path: str = "merged_model.safetensors", device: str = "cpu", + save_precision: Optional[str] = None, ): """ Merge multiple safetensors files into a single file @@ -34,9 +35,16 @@ def merge_safetensors( t5xxl_path: Path to the T5-XXL model output_path: Path to save the merged model device: Device to load tensors to + save_precision: Target dtype for model weights (e.g. 'fp16', 'bf16') """ logger.info("Starting to merge safetensors files...") + # Convert save_precision string to torch dtype if specified + if save_precision: + target_dtype = str_to_dtype(save_precision) + else: + target_dtype = None + # 1. Get DiT metadata if available metadata = None try: @@ -55,7 +63,7 @@ def merge_safetensors( # DiT/MMDiT - prefix: model.diffusion_model. # This state dict may have VAE keys. logger.info(f"Loading DiT model from {dit_path}") - dit_state_dict = load_safetensors(dit_path, device=device, disable_mmap=True) + dit_state_dict = load_safetensors(dit_path, device=device, disable_mmap=True, dtype=target_dtype) logger.info(f"Adding DiT model with {len(dit_state_dict)} keys") for key, value in dit_state_dict.items(): if key.startswith("model.diffusion_model.") or key.startswith("first_stage_model."): @@ -70,7 +78,7 @@ def merge_safetensors( # May be omitted if VAE is already included in DiT model. 
if vae_path: logger.info(f"Loading VAE model from {vae_path}") - vae_state_dict = load_safetensors(vae_path, device=device, disable_mmap=True) + vae_state_dict = load_safetensors(vae_path, device=device, disable_mmap=True, dtype=target_dtype) logger.info(f"Adding VAE model with {len(vae_state_dict)} keys") for key, value in vae_state_dict.items(): if key.startswith("first_stage_model."): @@ -84,7 +92,7 @@ def merge_safetensors( # CLIP-L - prefix: text_encoders.clip_l. if clip_l_path: logger.info(f"Loading CLIP-L model from {clip_l_path}") - clip_l_state_dict = load_safetensors(clip_l_path, device=device, disable_mmap=True) + clip_l_state_dict = load_safetensors(clip_l_path, device=device, disable_mmap=True, dtype=target_dtype) logger.info(f"Adding CLIP-L model with {len(clip_l_state_dict)} keys") for key, value in clip_l_state_dict.items(): if key.startswith("text_encoders.clip_l.transformer."): @@ -98,7 +106,7 @@ def merge_safetensors( # CLIP-G - prefix: text_encoders.clip_g. if clip_g_path: logger.info(f"Loading CLIP-G model from {clip_g_path}") - clip_g_state_dict = load_safetensors(clip_g_path, device=device, disable_mmap=True) + clip_g_state_dict = load_safetensors(clip_g_path, device=device, disable_mmap=True, dtype=target_dtype) logger.info(f"Adding CLIP-G model with {len(clip_g_state_dict)} keys") for key, value in clip_g_state_dict.items(): if key.startswith("text_encoders.clip_g.transformer."): @@ -112,7 +120,7 @@ def merge_safetensors( # T5-XXL - prefix: text_encoders.t5xxl. if t5xxl_path: logger.info(f"Loading T5-XXL model from {t5xxl_path}") - t5xxl_state_dict = load_safetensors(t5xxl_path, device=device, disable_mmap=True) + t5xxl_state_dict = load_safetensors(t5xxl_path, device=device, disable_mmap=True, dtype=target_dtype) logger.info(f"Adding T5-XXL model with {len(t5xxl_state_dict)} keys") for key, value in t5xxl_state_dict.items(): if key.startswith("text_encoders.t5xxl.transformer."): @@ -138,6 +146,7 @@ def main(): parser.add_argument("--t5xxl", help="Path to the T5-XXL model") parser.add_argument("--output", default="merged_model.safetensors", help="Path to save the merged model") parser.add_argument("--device", default="cpu", help="Device to load tensors to") + parser.add_argument("--save_precision", type=str, help="Precision to save the model in (e.g., 'fp16', 'bf16', 'float16', etc.)") args = parser.parse_args() @@ -149,6 +158,7 @@ def main(): t5xxl_path=args.t5xxl, output_path=args.output, device=args.device, + save_precision=args.save_precision, ) From a69884a2090076a4bf7f4dedf4cc6aa82789e3bc Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sat, 1 Mar 2025 20:37:45 -0500 Subject: [PATCH 389/582] Add Sage Attention for Lumina --- library/lumina_models.py | 91 +++++++++++++++++++++++++++++++++--- library/lumina_train_util.py | 5 ++ library/lumina_util.py | 3 +- lumina_train_network.py | 1 + 4 files changed, 93 insertions(+), 7 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index 1a441a69d..00ac16d53 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -36,6 +36,11 @@ # flash_attn may not be available but it is not required pass +try: + from sageattention import sageattn +except: + pass + try: from apex.normalization import FusedRMSNorm as RMSNorm except: @@ -271,6 +276,7 @@ def __init__( n_kv_heads: Optional[int], qk_norm: bool, use_flash_attn=False, + use_sage_attn=False, ): """ Initialize the Attention module. 
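
For reference, a minimal sketch of driving the merge helper patched above (tools/merge_sd3_safetensors.py) with the new save_precision option. The paths below are hypothetical and only the DiT weights are required; the script is normally run from the command line (see --help), but the underlying function can also be called directly, assuming the repository root is on sys.path:

from tools.merge_sd3_safetensors import merge_safetensors

merge_safetensors(
    dit_path="sd3.5_large.safetensors",    # required DiT/MMDiT weights (hypothetical filename)
    vae_path=None,                         # may be omitted if the VAE is already in the DiT file
    clip_l_path="clip_l.safetensors",
    clip_g_path="clip_g.safetensors",
    t5xxl_path="t5xxl_fp16.safetensors",
    output_path="merged_model.safetensors",
    device="cpu",
    save_precision="bf16",                 # added in this patch; None keeps each tensor's original dtype
)
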
@@ -310,13 +316,20 @@ def __init__( self.q_norm = self.k_norm = nn.Identity() self.use_flash_attn = use_flash_attn + self.use_sage_attn = use_sage_attn - # self.attention_processor = xformers.ops.memory_efficient_attention - self.attention_processor = F.scaled_dot_product_attention + if use_sage_attn : + self.attention_processor = self.sage_attn + else: + # self.attention_processor = xformers.ops.memory_efficient_attention + self.attention_processor = F.scaled_dot_product_attention def set_attention_processor(self, attention_processor): self.attention_processor = attention_processor + def get_attention_processor(self): + return self.attention_processor + def forward( self, x: Tensor, @@ -352,7 +365,15 @@ def forward( softmax_scale = math.sqrt(1 / self.head_dim) - if self.use_flash_attn: + if self.use_sage_attn: + # Handle GQA (Grouped Query Attention) if needed + n_rep = self.n_local_heads // self.n_local_kv_heads + if n_rep >= 1: + xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) + xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) + + output = self.sage_attn(xq, xk, xv, x_mask, softmax_scale) + elif self.use_flash_attn: output = self.flash_attn(xq, xk, xv, x_mask, softmax_scale) else: n_rep = self.n_local_heads // self.n_local_kv_heads @@ -428,6 +449,63 @@ def _get_unpad_data(attention_mask): (max_seqlen_in_batch_q, max_seqlen_in_batch_k), ) + def sage_attn(self, q: Tensor, k: Tensor, v: Tensor, x_mask: Tensor, softmax_scale: float): + try: + bsz = q.shape[0] + seqlen = q.shape[1] + + # Transpose tensors to match SageAttention's expected format (HND layout) + q_transposed = q.permute(0, 2, 1, 3) # [batch, heads, seq_len, head_dim] + k_transposed = k.permute(0, 2, 1, 3) # [batch, heads, seq_len, head_dim] + v_transposed = v.permute(0, 2, 1, 3) # [batch, heads, seq_len, head_dim] + + # Handle masking for SageAttention + # We need to filter out masked positions - this approach handles variable sequence lengths + outputs = [] + for b in range(bsz): + # Find valid token positions from the mask + valid_indices = torch.nonzero(x_mask[b], as_tuple=False).squeeze(-1) + if valid_indices.numel() == 0: + # If all tokens are masked, create a zero output + batch_output = torch.zeros( + seqlen, self.n_local_heads, self.head_dim, + device=q.device, dtype=q.dtype + ) + else: + # Extract only valid tokens for this batch + batch_q = q_transposed[b, :, valid_indices, :] + batch_k = k_transposed[b, :, valid_indices, :] + batch_v = v_transposed[b, :, valid_indices, :] + + # Run SageAttention on valid tokens only + batch_output_valid = sageattn( + batch_q.unsqueeze(0), # Add batch dimension back + batch_k.unsqueeze(0), + batch_v.unsqueeze(0), + tensor_layout="HND", + is_causal=False, + sm_scale=softmax_scale + ) + + # Create output tensor with zeros for masked positions + batch_output = torch.zeros( + seqlen, self.n_local_heads, self.head_dim, + device=q.device, dtype=q.dtype + ) + # Place valid outputs back in the right positions + batch_output[valid_indices] = batch_output_valid.squeeze(0).permute(1, 0, 2) + + outputs.append(batch_output) + + # Stack batch outputs and reshape to expected format + output = torch.stack(outputs, dim=0) # [batch, seq_len, heads, head_dim] + except NameError as e: + raise RuntimeError( + f"Could not load Sage Attention. Please install https://github.com/thu-ml/SageAttention. 
/ Sage Attention を読み込めませんでした。https://github.com/thu-ml/SageAttention をインストールしてください。 / {e}" + ) + + return output + def flash_attn( self, q: Tensor, @@ -571,6 +649,7 @@ def __init__( qk_norm: bool, modulation=True, use_flash_attn=False, + use_sage_attn=False, ) -> None: """ Initialize a TransformerBlock. @@ -593,7 +672,7 @@ def __init__( super().__init__() self.dim = dim self.head_dim = dim // n_heads - self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, use_flash_attn=use_flash_attn) + self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, use_flash_attn=use_flash_attn, use_sage_attn=use_sage_attn) self.feed_forward = FeedForward( dim=dim, hidden_dim=4 * dim, @@ -764,6 +843,7 @@ def __init__( axes_dims: List[int] = [16, 56, 56], axes_lens: List[int] = [1, 512, 512], use_flash_attn=False, + use_sage_attn=False, ) -> None: """ Initialize the NextDiT model. @@ -817,7 +897,6 @@ def __init__( norm_eps, qk_norm, modulation=False, - use_flash_attn=use_flash_attn, ) for layer_id in range(n_refiner_layers) ] @@ -843,7 +922,6 @@ def __init__( norm_eps, qk_norm, modulation=True, - use_flash_attn=use_flash_attn, ) for layer_id in range(n_refiner_layers) ] @@ -865,6 +943,7 @@ def __init__( norm_eps, qk_norm, use_flash_attn=use_flash_attn, + use_sage_attn=use_sage_attn, ) for layer_id in range(n_layers) ] diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 11dd3febc..d3a54a743 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -1077,6 +1077,11 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): action="store_true", help="Use Flash Attention for the model / モデルにFlash Attentionを使用する", ) + parser.add_argument( + "--use_sage_attn", + action="store_true", + help="Use Sage Attention for the model / モデルにSage Attentionを使用する", + ) parser.add_argument( "--system_prompt", type=str, diff --git a/library/lumina_util.py b/library/lumina_util.py index d9c899386..06f089d4a 100644 --- a/library/lumina_util.py +++ b/library/lumina_util.py @@ -27,6 +27,7 @@ def load_lumina_model( device: torch.device, disable_mmap: bool = False, use_flash_attn: bool = False, + use_sage_attn: bool = False, ): """ Load the Lumina model from the checkpoint path. 
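
As a side note on the sage_attn path added above: sageattn expects the "HND" layout ([batch, heads, seq, head_dim]) while JointAttention.forward holds q/k/v as [batch, seq, heads, head_dim], hence the permutes before and after the call. A small stand-in sketch of that round trip, using torch's built-in scaled_dot_product_attention in place of sageattn (which may not be installed) and hypothetical shapes:

import torch
import torch.nn.functional as F

bsz, seqlen, n_heads, head_dim = 2, 8, 4, 16
q = torch.randn(bsz, seqlen, n_heads, head_dim)  # [batch, seq, heads, head_dim], as in forward()
k = torch.randn(bsz, seqlen, n_heads, head_dim)
v = torch.randn(bsz, seqlen, n_heads, head_dim)

q_hnd, k_hnd, v_hnd = (x.permute(0, 2, 1, 3) for x in (q, k, v))  # "HND": [batch, heads, seq, head_dim]
out = F.scaled_dot_product_attention(q_hnd, k_hnd, v_hnd)         # stand-in for sageattn(..., tensor_layout="HND")
out = out.permute(0, 2, 1, 3)                                     # back to [batch, seq, heads, head_dim]
print(out.shape)  # torch.Size([2, 8, 4, 16])
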
@@ -43,7 +44,7 @@ def load_lumina_model( """ logger.info("Building Lumina") with torch.device("meta"): - model = lumina_models.NextDiT_2B_GQA_patch2_Adaln_Refiner(use_flash_attn=use_flash_attn).to(dtype) + model = lumina_models.NextDiT_2B_GQA_patch2_Adaln_Refiner(use_flash_attn=use_flash_attn, use_sage_attn=use_sage_attn).to(dtype) logger.info(f"Loading state dict from {ckpt_path}") state_dict = load_safetensors(ckpt_path, device=device, disable_mmap=disable_mmap, dtype=dtype) diff --git a/lumina_train_network.py b/lumina_train_network.py index 5f20c0146..ed1f3aaec 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -58,6 +58,7 @@ def load_target_model(self, args, weight_dtype, accelerator): torch.device("cpu"), disable_mmap=args.disable_mmap_load_safetensors, use_flash_attn=args.use_flash_attn, + use_sage_attn=args.use_sage_attn ) if args.fp8_base: From 5e45df722d434bd64b230f462cac632d5ea68c96 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Tue, 4 Mar 2025 08:07:33 +0800 Subject: [PATCH 390/582] update gemma2 train attention layer --- networks/lora_lumina.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/networks/lora_lumina.py b/networks/lora_lumina.py index 3f6c9b417..431c183d8 100644 --- a/networks/lora_lumina.py +++ b/networks/lora_lumina.py @@ -462,7 +462,7 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, lumina, wei class LoRANetwork(torch.nn.Module): LUMINA_TARGET_REPLACE_MODULE = ["JointTransformerBlock"] - TEXT_ENCODER_TARGET_REPLACE_MODULE = ["Gemma2Attention", "Gemma2MLP"] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["Gemma2Attention", "Gemma2FlashAttention2", "Gemma2SdpaAttention", "Gemma2MLP"] LORA_PREFIX_LUMINA = "lora_unet" LORA_PREFIX_TEXT_ENCODER = "lora_te" # Simplified prefix since we only have one text encoder From 1f22a94cfe55491cc708adfa881953db423a886f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 4 Mar 2025 02:21:05 -0500 Subject: [PATCH 391/582] Update embedder_dims, add more flexible caption extension --- library/lumina_models.py | 6 +- library/train_util.py | 39 ++++--- networks/lora_lumina.py | 245 ++++++++++++++++++++++----------------- 3 files changed, 164 insertions(+), 126 deletions(-) diff --git a/library/lumina_models.py b/library/lumina_models.py index e00dcf967..2508cc7df 100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -887,6 +887,9 @@ def __init__( ), ) + nn.init.trunc_normal_(self.cap_embedder[1].weight, std=0.02) + nn.init.zeros_(self.cap_embedder[1].bias) + self.context_refiner = nn.ModuleList( [ JointTransformerBlock( @@ -929,9 +932,6 @@ def __init__( ] ) - nn.init.trunc_normal_(self.cap_embedder[1].weight, std=0.02) - # nn.init.zeros_(self.cap_embedder[1].weight) - nn.init.zeros_(self.cap_embedder[1].bias) self.layers = nn.ModuleList( [ diff --git a/library/train_util.py b/library/train_util.py index 34b98f89f..c07a4a739 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -529,8 +529,8 @@ def __init__( self.is_reg = is_reg self.class_tokens = class_tokens self.caption_extension = caption_extension - if self.caption_extension and not self.caption_extension.startswith("."): - self.caption_extension = "." + self.caption_extension + # if self.caption_extension and not self.caption_extension.startswith("."): + # self.caption_extension = "." 
+ self.caption_extension self.cache_info = cache_info def __eq__(self, other) -> bool: @@ -1895,30 +1895,33 @@ def __init__( self.bucket_reso_steps = None # この情報は使われない self.bucket_no_upscale = False - def read_caption(img_path, caption_extension, enable_wildcard): + def read_caption(img_path: str, caption_extension: str, enable_wildcard: bool): # captionの候補ファイル名を作る base_name = os.path.splitext(img_path)[0] base_name_face_det = base_name tokens = base_name.split("_") if len(tokens) >= 5: base_name_face_det = "_".join(tokens[:-4]) - cap_paths = [base_name + caption_extension, base_name_face_det + caption_extension] + cap_paths = [(base_name, caption_extension), (base_name_face_det, caption_extension)] caption = None - for cap_path in cap_paths: - if os.path.isfile(cap_path): - with open(cap_path, "rt", encoding="utf-8") as f: - try: - lines = f.readlines() - except UnicodeDecodeError as e: - logger.error(f"illegal char in file (not UTF-8) / ファイルにUTF-8以外の文字があります: {cap_path}") - raise e - assert len(lines) > 0, f"caption file is empty / キャプションファイルが空です: {cap_path}" - if enable_wildcard: - caption = "\n".join([line.strip() for line in lines if line.strip() != ""]) # 空行を除く、改行で連結 - else: - caption = lines[0].strip() - break + for base, cap_extension in cap_paths: + # check with and without . to allow for extension flexibility (img_var.txt, img.txt, img + txt) + for cap_path in [base + cap_extension, base + "." + cap_extension]: + if os.path.isfile(cap_path): + with open(cap_path, "rt", encoding="utf-8") as f: + try: + lines = f.readlines() + except UnicodeDecodeError as e: + logger.error(f"illegal char in file (not UTF-8) / ファイルにUTF-8以外の文字があります: {cap_path}") + raise e + assert len(lines) > 0, f"caption file is empty / キャプションファイルが空です: {cap_path}" + if enable_wildcard: + caption = "\n".join([line.strip() for line in lines if line.strip() != ""]) # 空行を除く、改行で連結 + else: + caption = lines[0].strip() + break + break return caption def load_dreambooth_dir(subset: DreamBoothSubset): diff --git a/networks/lora_lumina.py b/networks/lora_lumina.py index 431c183d8..f856d4e7b 100644 --- a/networks/lora_lumina.py +++ b/networks/lora_lumina.py @@ -1,5 +1,5 @@ # temporary minimum implementation of LoRA -# FLUX doesn't have Conv2d, so we ignore it +# Lumina 2 does not have Conv2d, so ignore # TODO commonize with the original implementation # LoRA network module @@ -10,13 +10,11 @@ import math import os from typing import Dict, List, Optional, Tuple, Type, Union -from diffusers import AutoencoderKL +from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL from transformers import CLIPTextModel -import numpy as np import torch -import re +from torch import Tensor, nn from library.utils import setup_logging -from library.sdxl_original_unet import SdxlUNet2DConditionModel setup_logging() import logging @@ -35,14 +33,14 @@ class LoRAModule(torch.nn.Module): def __init__( self, - lora_name, - org_module: torch.nn.Module, - multiplier=1.0, - lora_dim=4, - alpha=1, - dropout=None, - rank_dropout=None, - module_dropout=None, + lora_name: str, + org_module: nn.Module, + multiplier: float =1.0, + lora_dim: int = 4, + alpha: Optional[float | int | Tensor] = 1, + dropout: Optional[float] = None, + rank_dropout: Optional[float] = None, + module_dropout: Optional[float] = None, split_dims: Optional[List[int]] = None, ): """ @@ -60,6 +58,9 @@ def __init__( in_dim = org_module.in_features out_dim = org_module.out_features + assert isinstance(in_dim, int) + assert isinstance(out_dim, int) + self.lora_dim = lora_dim 
self.split_dims = split_dims @@ -68,30 +69,31 @@ def __init__( kernel_size = org_module.kernel_size stride = org_module.stride padding = org_module.padding - self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) - self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) + self.lora_down = nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) + self.lora_up = nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) else: - self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) - self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) + self.lora_down = nn.Linear(in_dim, self.lora_dim, bias=False) + self.lora_up = nn.Linear(self.lora_dim, out_dim, bias=False) - torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) - torch.nn.init.zeros_(self.lora_up.weight) + nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) + nn.init.zeros_(self.lora_up.weight) else: # conv2d not supported assert sum(split_dims) == out_dim, "sum of split_dims must be equal to out_dim" assert org_module.__class__.__name__ == "Linear", "split_dims is only supported for Linear" # print(f"split_dims: {split_dims}") - self.lora_down = torch.nn.ModuleList( - [torch.nn.Linear(in_dim, self.lora_dim, bias=False) for _ in range(len(split_dims))] + self.lora_down = nn.ModuleList( + [nn.Linear(in_dim, self.lora_dim, bias=False) for _ in range(len(split_dims))] ) - self.lora_up = torch.nn.ModuleList([torch.nn.Linear(self.lora_dim, split_dim, bias=False) for split_dim in split_dims]) + self.lora_up = nn.ModuleList([torch.nn.Linear(self.lora_dim, split_dim, bias=False) for split_dim in split_dims]) + for lora_down in self.lora_down: - torch.nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5)) + nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5)) for lora_up in self.lora_up: - torch.nn.init.zeros_(lora_up.weight) + nn.init.zeros_(lora_up.weight) - if type(alpha) == torch.Tensor: - alpha = alpha.detach().float().numpy() # without casting, bf16 causes error + if isinstance(alpha, Tensor): + alpha = alpha.detach().cpu().float().item() # without casting, bf16 causes error alpha = self.lora_dim if alpha is None or alpha == 0 else alpha self.scale = alpha / self.lora_dim self.register_buffer("alpha", torch.tensor(alpha)) # 定数として扱える @@ -140,6 +142,9 @@ def forward(self, x): lx = self.lora_up(lx) + if self.dropout is not None and self.training: + lx = torch.nn.functional.dropout(lx, p=self.dropout) + return org_forwarded + lx * self.multiplier * scale else: lxs = [lora_down(x) for lora_down in self.lora_down] @@ -152,9 +157,9 @@ def forward(self, x): if self.rank_dropout is not None and self.training: masks = [torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout for lx in lxs] for i in range(len(lxs)): - if len(lx.size()) == 3: + if len(lxs[i].size()) == 3: masks[i] = masks[i].unsqueeze(1) - elif len(lx.size()) == 4: + elif len(lxs[i].size()) == 4: masks[i] = masks[i].unsqueeze(-1).unsqueeze(-1) lxs[i] = lxs[i] * masks[i] @@ -165,6 +170,9 @@ def forward(self, x): lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] + if self.dropout is not None and self.training: + lxs = [torch.nn.functional.dropout(lx, p=self.dropout) for lx in lxs] + return org_forwarded + torch.cat(lxs, dim=-1) * self.multiplier * scale @@ -339,14 +347,14 @@ def create_network( if all([d is None for d in type_dims]): type_dims = None - # in_dims for embedders - in_dims = 
kwargs.get("in_dims", None) - if in_dims is not None: - in_dims = in_dims.strip() - if in_dims.startswith("[") and in_dims.endswith("]"): - in_dims = in_dims[1:-1] - in_dims = [int(d) for d in in_dims.split(",")] - assert len(in_dims) == 4, f"invalid in_dims: {in_dims}, must be 4 dimensions (x_embedder, t_embedder, cap_embedder, final_layer)" + # embedder_dims for embedders + embedder_dims = kwargs.get("embedder_dims", None) + if embedder_dims is not None: + embedder_dims = embedder_dims.strip() + if embedder_dims.startswith("[") and embedder_dims.endswith("]"): + embedder_dims = embedder_dims[1:-1] + embedder_dims = [int(d) for d in embedder_dims.split(",")] + assert len(embedder_dims) == 3, f"invalid embedder_dims: {embedder_dims}, must be 4 dimensions (x_embedder, t_embedder, cap_embedder)" # rank/module dropout rank_dropout = kwargs.get("rank_dropout", None) @@ -357,9 +365,9 @@ def create_network( module_dropout = float(module_dropout) # single or double blocks - train_blocks = kwargs.get("train_blocks", None) # None (default), "all" (same as None), "single", "double" + train_blocks = kwargs.get("train_blocks", None) # None (default), "all" (same as None), "transformer", "refiners", "noise_refiner", "context_refiner" if train_blocks is not None: - assert train_blocks in ["all", "single", "double"], f"invalid train_blocks: {train_blocks}" + assert train_blocks in ["all", "transformer", "refiners", "noise_refiner", "context_refiner"], f"invalid train_blocks: {train_blocks}" # split qkv split_qkv = kwargs.get("split_qkv", False) @@ -386,7 +394,7 @@ def create_network( train_blocks=train_blocks, split_qkv=split_qkv, type_dims=type_dims, - in_dims=in_dims, + embedder_dims=embedder_dims, verbose=verbose, ) @@ -461,7 +469,7 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, lumina, wei class LoRANetwork(torch.nn.Module): - LUMINA_TARGET_REPLACE_MODULE = ["JointTransformerBlock"] + LUMINA_TARGET_REPLACE_MODULE = ["JointTransformerBlock", "FinalLayer"] TEXT_ENCODER_TARGET_REPLACE_MODULE = ["Gemma2Attention", "Gemma2FlashAttention2", "Gemma2SdpaAttention", "Gemma2MLP"] LORA_PREFIX_LUMINA = "lora_unet" LORA_PREFIX_TEXT_ENCODER = "lora_te" # Simplified prefix since we only have one text encoder @@ -478,13 +486,14 @@ def __init__( module_dropout: Optional[float] = None, conv_lora_dim: Optional[int] = None, conv_alpha: Optional[float] = None, - module_class: Type[object] = LoRAModule, + module_class: Type[LoRAModule] = LoRAModule, modules_dim: Optional[Dict[str, int]] = None, modules_alpha: Optional[Dict[str, int]] = None, train_blocks: Optional[str] = None, split_qkv: bool = False, type_dims: Optional[List[int]] = None, - in_dims: Optional[List[int]] = None, + embedder_dims: Optional[List[int]] = None, + train_block_indices: Optional[List[bool]] = None, verbose: Optional[bool] = False, ) -> None: super().__init__() @@ -501,7 +510,9 @@ def __init__( self.split_qkv = split_qkv self.type_dims = type_dims - self.in_dims = in_dims + self.embedder_dims = embedder_dims + + self.train_block_indices = train_block_indices self.loraplus_lr_ratio = None self.loraplus_unet_lr_ratio = None @@ -509,7 +520,7 @@ def __init__( if modules_dim is not None: logger.info(f"create LoRA network from weights") - self.in_dims = [0] * 5 # create in_dims + self.embedder_dims = [0] * 5 # create embedder_dims # verbose = True else: logger.info(f"create LoRA network. 
base dim (rank): {lora_dim}, alpha: {alpha}") @@ -529,7 +540,7 @@ def __init__( def create_modules( is_lumina: bool, root_module: torch.nn.Module, - target_replace_modules: List[str], + target_replace_modules: Optional[List[str]], filter: Optional[str] = None, default_dim: Optional[int] = None, ) -> List[LoRAModule]: @@ -544,63 +555,77 @@ def create_modules( for child_name, child_module in module.named_modules(): is_linear = child_module.__class__.__name__ == "Linear" - is_conv2d = child_module.__class__.__name__ == "Conv2d" - is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) - - if is_linear or is_conv2d: - lora_name = prefix + "." + (name + "." if name else "") + child_name - lora_name = lora_name.replace(".", "_") - - if filter is not None and not filter in lora_name: - continue - - dim = None - alpha = None - - if modules_dim is not None: - # モジュール指定あり - if lora_name in modules_dim: - dim = modules_dim[lora_name] - alpha = modules_alpha[lora_name] - else: - # 通常、すべて対象とする - if is_linear or is_conv2d_1x1: - dim = default_dim if default_dim is not None else self.lora_dim - alpha = self.alpha - - if is_lumina and type_dims is not None: - identifier = [ - ("attention",), # attention layers - ("mlp",), # MLP layers - ("modulation",), # modulation layers - ("refiner",), # refiner blocks - ] - for i, d in enumerate(type_dims): - if d is not None and all([id in lora_name for id in identifier[i]]): - dim = d # may be 0 for skip - break - - elif self.conv_lora_dim is not None: - dim = self.conv_lora_dim - alpha = self.conv_alpha - - if dim is None or dim == 0: - # skipした情報を出力 - if is_linear or is_conv2d_1x1 or (self.conv_lora_dim is not None): - skipped.append(lora_name) - continue - - lora = module_class( - lora_name, - child_module, - self.multiplier, - dim, - alpha, - dropout=dropout, - rank_dropout=rank_dropout, - module_dropout=module_dropout, + + lora_name = prefix + "." + (name + "." if name else "") + child_name + lora_name = lora_name.replace(".", "_") + + # Only Linear is supported + if not is_linear: + skipped.append(lora_name) + continue + + if filter is not None and filter not in lora_name: + continue + + dim = default_dim if default_dim is not None else self.lora_dim + alpha = self.alpha + + # Set dim/alpha to modules dim/alpha + if modules_dim is not None and modules_alpha is not None: + # モジュール指定あり + if lora_name in modules_dim: + dim = modules_dim[lora_name] + alpha = modules_alpha[lora_name] + + # Set dims to type_dims + if is_lumina and type_dims is not None: + identifier = [ + ("attention",), # attention layers + ("mlp",), # MLP layers + ("modulation",), # modulation layers + ("refiner",), # refiner blocks + ] + for i, d in enumerate(type_dims): + if d is not None and all([id in lora_name for id in identifier[i]]): + dim = d # may be 0 for skip + break + + # Drop blocks if we are only training some blocks + if ( + is_lumina + and dim + and ( + self.train_block_indices is not None ) - loras.append(lora) + and ("layer" in lora_name) + ): + # "lora_unet_layers_0_..." or "lora_unet_cap_refiner_0_..." or or "lora_unet_noise_refiner_0_..." 
+ block_index = int(lora_name.split("_")[3]) # bit dirty + if ( + "layer" in lora_name + and self.train_block_indices is not None + and not self.train_block_indices[block_index] + ): + dim = 0 + + + if dim is None or dim == 0: + # skipした情報を出力 + skipped.append(lora_name) + continue + + lora = module_class( + lora_name, + child_module, + self.multiplier, + dim, + alpha, + dropout=dropout, + rank_dropout=rank_dropout, + module_dropout=module_dropout, + ) + logger.info(f"Add LoRA module: {lora_name}") + loras.append(lora) if target_replace_modules is None: break # all modules are searched @@ -617,15 +642,25 @@ def create_modules( skipped_te += skipped # create LoRA for U-Net - target_replace_modules = LoRANetwork.LUMINA_TARGET_REPLACE_MODULE + if self.train_blocks == "all": + target_replace_modules = LoRANetwork.LUMINA_TARGET_REPLACE_MODULE + # TODO: limit different blocks + elif self.train_blocks == "transformer": + target_replace_modules = LoRANetwork.LUMINA_TARGET_REPLACE_MODULE + elif self.train_blocks == "refiners": + target_replace_modules = LoRANetwork.LUMINA_TARGET_REPLACE_MODULE + elif self.train_blocks == "noise_refiner": + target_replace_modules = LoRANetwork.LUMINA_TARGET_REPLACE_MODULE + elif self.train_blocks == "cap_refiner": + target_replace_modules = LoRANetwork.LUMINA_TARGET_REPLACE_MODULE self.unet_loras: List[Union[LoRAModule, LoRAInfModule]] self.unet_loras, skipped_un = create_modules(True, unet, target_replace_modules) # Handle embedders - if self.in_dims: - for filter, in_dim in zip(["x_embedder", "t_embedder", "cap_embedder", "final_layer"], self.in_dims): - loras, _ = create_modules(True, unet, None, filter=filter, default_dim=in_dim) + if self.embedder_dims: + for filter, embedder_dim in zip(["x_embedder", "t_embedder", "cap_embedder"], self.embedder_dims): + loras, _ = create_modules(True, unet, None, filter=filter, default_dim=embedder_dim) self.unet_loras.extend(loras) logger.info(f"create LoRA for Lumina blocks: {len(self.unet_loras)} modules.") From 9fe8a470800e70a6d899dd63d09e1d63954d67fb Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 4 Mar 2025 02:28:56 -0500 Subject: [PATCH 392/582] Undo dropout after up --- networks/lora_lumina.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/networks/lora_lumina.py b/networks/lora_lumina.py index f856d4e7b..03d130396 100644 --- a/networks/lora_lumina.py +++ b/networks/lora_lumina.py @@ -22,10 +22,6 @@ logger = logging.getLogger(__name__) -NUM_DOUBLE_BLOCKS = 19 -NUM_SINGLE_BLOCKS = 38 - - class LoRAModule(torch.nn.Module): """ replaces forward method of the original Linear, instead of replacing the original Linear module. 
@@ -142,9 +138,6 @@ def forward(self, x): lx = self.lora_up(lx) - if self.dropout is not None and self.training: - lx = torch.nn.functional.dropout(lx, p=self.dropout) - return org_forwarded + lx * self.multiplier * scale else: lxs = [lora_down(x) for lora_down in self.lora_down] @@ -170,9 +163,6 @@ def forward(self, x): lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] - if self.dropout is not None and self.training: - lxs = [torch.nn.functional.dropout(lx, p=self.dropout) for lx in lxs] - return org_forwarded + torch.cat(lxs, dim=-1) * self.multiplier * scale From e8c15c716789c5b50a10190871145db2a2aad9f9 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 4 Mar 2025 02:30:08 -0500 Subject: [PATCH 393/582] Remove log --- networks/lora_lumina.py | 1 - 1 file changed, 1 deletion(-) diff --git a/networks/lora_lumina.py b/networks/lora_lumina.py index 03d130396..15c35f441 100644 --- a/networks/lora_lumina.py +++ b/networks/lora_lumina.py @@ -614,7 +614,6 @@ def create_modules( rank_dropout=rank_dropout, module_dropout=module_dropout, ) - logger.info(f"Add LoRA module: {lora_name}") loras.append(lora) if target_replace_modules is None: From aa2bde7ece17be16083acfe9645bb4e21718fb2c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 5 Mar 2025 23:24:52 +0900 Subject: [PATCH 394/582] docs: add utility script for merging SD3 weights into a single .safetensors file --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 3c6993075..426eaed82 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Mar 6, 2025: + +- Added a utility script to merge the weights of SD3's DiT, VAE (optional), CLIP-L, CLIP-G, and T5XXL into a single .safetensors file. Run `tools/merge_sd3_safetensors.py`. See `--help` for usage. PR [#1960](https://github.com/kohya-ss/sd-scripts/pull/1960) + Feb 26, 2025: - Improve the validation loss calculation in `train_network.py`, `sdxl_train_network.py`, `flux_train_network.py`, and `sd3_train_network.py`. PR [#1903](https://github.com/kohya-ss/sd-scripts/pull/1903) From ea53290f625b29c2cfc1c63cc83d6dcd1492731c Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 6 Mar 2025 00:00:38 -0500 Subject: [PATCH 395/582] Add LoRA-GGPO for Flux --- networks/lora_flux.py | 134 +++++++++++++++++++++++++++++++++++++++++- train_network.py | 4 ++ 2 files changed, 137 insertions(+), 1 deletion(-) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 91e9cd77f..98cf8c55d 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -9,6 +9,7 @@ import math import os +from contextlib import contextmanager from typing import Dict, List, Optional, Tuple, Type, Union from diffusers import AutoencoderKL from transformers import CLIPTextModel @@ -27,6 +28,42 @@ NUM_DOUBLE_BLOCKS = 19 NUM_SINGLE_BLOCKS = 38 +@contextmanager +def temp_random_seed(seed, device=None): + """ + Context manager that temporarily sets a specific random seed and then + restores the original RNG state afterward. + + Args: + seed (int): The random seed to set temporarily + device (torch.device, optional): The device to set the seed for. + If None, will detect from the current context. 
+ """ + # Save original RNG states + original_cpu_rng_state = torch.get_rng_state() + original_cuda_rng_states = None + if torch.cuda.is_available(): + original_cuda_rng_states = torch.cuda.get_rng_state_all() + + # Determine if we need to set CUDA seed + set_cuda = False + if device is not None: + set_cuda = device.type == 'cuda' + elif torch.cuda.is_available(): + set_cuda = True + + try: + # Set the temporary seed + torch.manual_seed(seed) + if set_cuda: + torch.cuda.manual_seed_all(seed) + yield + finally: + # Restore original RNG states + torch.set_rng_state(original_cpu_rng_state) + if torch.cuda.is_available() and original_cuda_rng_states is not None: + torch.cuda.set_rng_state_all(original_cuda_rng_states) + class LoRAModule(torch.nn.Module): """ @@ -44,6 +81,8 @@ def __init__( rank_dropout=None, module_dropout=None, split_dims: Optional[List[int]] = None, + ggpo_beta: Optional[float] = None, + ggpo_sigma: Optional[float] = None, ): """ if alpha == 0 or None, alpha is rank (no scaling). @@ -103,9 +142,16 @@ def __init__( self.rank_dropout = rank_dropout self.module_dropout = module_dropout + self.ggpo_sigma = ggpo_sigma + self.ggpo_beta = ggpo_beta + + self.perturbation_norm_factor = 1.0 / math.sqrt(org_module.weight.shape[0]) + self._org_module_weight = self.org_module.weight.detach() + def apply_to(self): self.org_forward = self.org_module.forward self.org_module.forward = self.forward + del self.org_module def forward(self, x): @@ -140,7 +186,15 @@ def forward(self, x): lx = self.lora_up(lx) - return org_forwarded + lx * self.multiplier * scale + # LoRA Gradient-Guided Perturbation Optimization + if self.training and hasattr(self, 'perturbation_seed') and self.ggpo_sigma is not None and self.ggpo_beta is not None: + with torch.no_grad(), torch.autocast(self.device.type), temp_random_seed(self.perturbation_seed): + perturbation = torch.randn_like(self._org_module_weight, dtype=self.dtype, device=self.device) + perturbation.mul_(self.perturbation_scale_factor) + perturbation_output = x @ perturbation.T # Result: (batch × n) + return org_forwarded + (self.multiplier * scale * lx) + perturbation_output + else: + return org_forwarded + lx * self.multiplier * scale else: lxs = [lora_down(x) for lora_down in self.lora_down] @@ -167,6 +221,58 @@ def forward(self, x): return org_forwarded + torch.cat(lxs, dim=-1) * self.multiplier * scale + @torch.no_grad() + def update_norms(self): + # Not running GGPO so not currently running update norms + if self.ggpo_beta is None or self.ggpo_sigma is None: + return + + # only update norms when we are training + if self.lora_down.weight.requires_grad is not True: + print(f"skipping update_norms for {self.lora_name}") + return + + lora_down_grad = None + lora_up_grad = None + + for name, param in self.named_parameters(): + if name == "lora_down.weight": + lora_down_grad = param.grad + elif name == "lora_up.weight": + lora_up_grad = param.grad + + with torch.autocast(self.device.type): + module_weights = self.scale * (self.lora_up.weight @ self.lora_down.weight) + org_device = self._org_module_weight.device + org_dtype = self._org_module_weight.dtype + org_weight = self._org_module_weight.to(device=self.device, dtype=self.dtype) + combined_weight = org_weight + module_weights + + self.combined_weight_norms = torch.norm(combined_weight, dim=1, keepdim=True) + + self._org_module_weight.to(device=org_device, dtype=org_dtype) + + + # Calculate gradient norms if we have both gradients + if lora_down_grad is not None and lora_up_grad is not None: + with 
torch.autocast(self.device.type): + approx_grad = self.scale * ((self.lora_up.weight @ lora_down_grad) + (lora_up_grad @ self.lora_down.weight)) + self.grad_norms = torch.norm(approx_grad, dim=1, keepdim=True) + + self.perturbation_scale = (self.ggpo_sigma * torch.sqrt(self.combined_weight_norms ** 2)) + (self.ggpo_beta * (self.grad_norms ** 2)) + self.perturbation_scale_factor = (self.perturbation_scale * self.perturbation_norm_factor).to(self.device) + + # LoRA Gradient-Guided Perturbation Optimization + self.perturbation_seed = torch.randint(0, 2**32 - 1, (1,)).detach().item() + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + class LoRAInfModule(LoRAModule): def __init__( @@ -420,6 +526,16 @@ def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: if split_qkv is not None: split_qkv = True if split_qkv == "True" else False + ggpo_beta = kwargs.get("ggpo_beta", None) + ggpo_sigma = kwargs.get("ggpo_sigma", None) + + if ggpo_beta is not None: + ggpo_beta = float(ggpo_beta) + + if ggpo_sigma is not None: + ggpo_sigma = float(ggpo_sigma) + + # train T5XXL train_t5xxl = kwargs.get("train_t5xxl", False) if train_t5xxl is not None: @@ -449,6 +565,8 @@ def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: in_dims=in_dims, train_double_block_indices=train_double_block_indices, train_single_block_indices=train_single_block_indices, + ggpo_beta=ggpo_beta, + ggpo_sigma=ggpo_sigma, verbose=verbose, ) @@ -561,6 +679,8 @@ def __init__( in_dims: Optional[List[int]] = None, train_double_block_indices: Optional[List[bool]] = None, train_single_block_indices: Optional[List[bool]] = None, + ggpo_beta: Optional[float] = None, + ggpo_sigma: Optional[float] = None, verbose: Optional[bool] = False, ) -> None: super().__init__() @@ -599,10 +719,16 @@ def __init__( # logger.info( # f"apply LoRA to Conv2d with kernel size (3,3). 
dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" # ) + + if ggpo_beta is not None and ggpo_sigma is not None: + logger.info(f"LoRA-GGPO training sigma: {ggpo_sigma} beta: {ggpo_beta}") + if self.split_qkv: logger.info(f"split qkv for LoRA") if self.train_blocks is not None: logger.info(f"train {self.train_blocks} blocks only") + + if train_t5xxl: logger.info(f"train T5XXL as well") @@ -722,6 +848,8 @@ def create_modules( rank_dropout=rank_dropout, module_dropout=module_dropout, split_dims=split_dims, + ggpo_beta=ggpo_beta, + ggpo_sigma=ggpo_sigma, ) loras.append(lora) @@ -790,6 +918,10 @@ def set_enabled(self, is_enabled): for lora in self.text_encoder_loras + self.unet_loras: lora.enabled = is_enabled + def update_norms(self): + for lora in self.text_encoder_loras + self.unet_loras: + lora.update_norms() + def load_weights(self, file): if os.path.splitext(file)[1] == ".safetensors": from safetensors.torch import load_file diff --git a/train_network.py b/train_network.py index 2d279b3bf..9db335b04 100644 --- a/train_network.py +++ b/train_network.py @@ -1400,6 +1400,10 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen params_to_clip = accelerator.unwrap_model(network).get_trainable_params() accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + if global_step % 5 == 0: + if hasattr(network, "update_norms"): + network.update_norms() + optimizer.step() lr_scheduler.step() optimizer.zero_grad(set_to_none=True) From e5b5c7e1db5a5c8d7e0628cd565e9619f9564adb Mon Sep 17 00:00:00 2001 From: gesen2egee Date: Sat, 15 Mar 2025 13:29:32 +0800 Subject: [PATCH 396/582] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index de39f5887..52c3b8c74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,4 +43,5 @@ rich==13.7.0 # for T5XXL tokenizer (SD3/FLUX) sentencepiece==0.2.0 # for kohya_ss library +pytorch-optimizer -e . From 3647d065b50d74ade3642edd0ec99a2ce1041edf Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 18 Mar 2025 14:25:09 -0400 Subject: [PATCH 397/582] Cache weight norms estimate on initialization. 
Move to update norms every step --- networks/lora_flux.py | 142 ++++++++++++++++++++++++++++++++++-------- train_network.py | 36 ++++++++--- 2 files changed, 145 insertions(+), 33 deletions(-) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 98cf8c55d..9f5f1916a 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -15,6 +15,7 @@ from transformers import CLIPTextModel import numpy as np import torch +from torch import Tensor import re from library.utils import setup_logging from library.sdxl_original_unet import SdxlUNet2DConditionModel @@ -145,8 +146,13 @@ def __init__( self.ggpo_sigma = ggpo_sigma self.ggpo_beta = ggpo_beta - self.perturbation_norm_factor = 1.0 / math.sqrt(org_module.weight.shape[0]) - self._org_module_weight = self.org_module.weight.detach() + if self.ggpo_beta is not None and self.ggpo_sigma is not None: + self.combined_weight_norms = None + self.grad_norms = None + self.perturbation_norm_factor = 1.0 / math.sqrt(org_module.weight.shape[0]) + self.perturbation_seed = torch.randint(0, 2**32 - 1, (1,)).detach().item() + self.initialize_norm_cache(org_module.weight) + self.org_module_shape: tuple[int] = org_module.weight.shape def apply_to(self): self.org_forward = self.org_module.forward @@ -187,10 +193,12 @@ def forward(self, x): lx = self.lora_up(lx) # LoRA Gradient-Guided Perturbation Optimization - if self.training and hasattr(self, 'perturbation_seed') and self.ggpo_sigma is not None and self.ggpo_beta is not None: - with torch.no_grad(), torch.autocast(self.device.type), temp_random_seed(self.perturbation_seed): - perturbation = torch.randn_like(self._org_module_weight, dtype=self.dtype, device=self.device) - perturbation.mul_(self.perturbation_scale_factor) + if self.training and hasattr(self, 'perturbation_seed') and self.ggpo_sigma is not None and self.ggpo_beta is not None and self.combined_weight_norms is not None and self.grad_norms is not None: + with torch.no_grad(), temp_random_seed(self.perturbation_seed): + perturbation_scale = (self.ggpo_sigma * torch.sqrt(self.combined_weight_norms ** 2)) + (self.ggpo_beta * (self.grad_norms ** 2)) + perturbation_scale_factor = (perturbation_scale * self.perturbation_norm_factor).to(self.device) + perturbation = torch.randn(self.org_module_shape, dtype=self.dtype, device=self.device) + perturbation.mul_(perturbation_scale_factor) perturbation_output = x @ perturbation.T # Result: (batch × n) return org_forwarded + (self.multiplier * scale * lx) + perturbation_output else: @@ -221,6 +229,69 @@ def forward(self, x): return org_forwarded + torch.cat(lxs, dim=-1) * self.multiplier * scale + @torch.no_grad() + def initialize_norm_cache(self, org_module_weight: Tensor): + # Choose a reasonable sample size + n_rows = org_module_weight.shape[0] + sample_size = min(1000, n_rows) # Cap at 1000 samples or use all if smaller + + # Sample random indices across all rows + indices = torch.randperm(n_rows)[:sample_size] + + # Convert to a supported data type first, then index + # Use float32 for indexing operations + weights_float32 = org_module_weight.to(dtype=torch.float32) + sampled_weights = weights_float32[indices].to(device=self.device) + + # Calculate sampled norms + sampled_norms = torch.norm(sampled_weights, dim=1, keepdim=True) + + # Store the mean norm as our estimate + self.org_weight_norm_estimate = sampled_norms.mean() + + # Optional: store standard deviation for confidence intervals + self.org_weight_norm_std = sampled_norms.std() + + # Free memory + del sampled_weights, weights_float32 + + 
@torch.no_grad() + def validate_norm_approximation(self, org_module_weight: Tensor, verbose=True): + # Calculate the true norm (this will be slow but it's just for validation) + true_norms = [] + chunk_size = 1024 # Process in chunks to avoid OOM + + for i in range(0, org_module_weight.shape[0], chunk_size): + end_idx = min(i + chunk_size, org_module_weight.shape[0]) + chunk = org_module_weight[i:end_idx].to(device=self.device, dtype=self.dtype) + chunk_norms = torch.norm(chunk, dim=1, keepdim=True) + true_norms.append(chunk_norms.cpu()) + del chunk + + true_norms = torch.cat(true_norms, dim=0) + true_mean_norm = true_norms.mean().item() + + # Compare with our estimate + estimated_norm = self.org_weight_norm_estimate.item() + + # Calculate error metrics + absolute_error = abs(true_mean_norm - estimated_norm) + relative_error = absolute_error / true_mean_norm * 100 # as percentage + + if verbose: + logger.info(f"True mean norm: {true_mean_norm:.6f}") + logger.info(f"Estimated norm: {estimated_norm:.6f}") + logger.info(f"Absolute error: {absolute_error:.6f}") + logger.info(f"Relative error: {relative_error:.2f}%") + + return { + 'true_mean_norm': true_mean_norm, + 'estimated_norm': estimated_norm, + 'absolute_error': absolute_error, + 'relative_error': relative_error + } + + @torch.no_grad() def update_norms(self): # Not running GGPO so not currently running update norms @@ -228,8 +299,20 @@ def update_norms(self): return # only update norms when we are training - if self.lora_down.weight.requires_grad is not True: - print(f"skipping update_norms for {self.lora_name}") + if self.training is False: + return + + module_weights = self.lora_up.weight @ self.lora_down.weight + module_weights.mul(self.scale) + + self.weight_norms = torch.norm(module_weights, dim=1, keepdim=True) + self.combined_weight_norms = torch.sqrt((self.org_weight_norm_estimate**2) + + torch.sum(module_weights**2, dim=1, keepdim=True)) + + @torch.no_grad() + def update_grad_norms(self): + if self.training is False: + print(f"skipping update_grad_norms for {self.lora_name}") return lora_down_grad = None @@ -241,29 +324,12 @@ def update_norms(self): elif name == "lora_up.weight": lora_up_grad = param.grad - with torch.autocast(self.device.type): - module_weights = self.scale * (self.lora_up.weight @ self.lora_down.weight) - org_device = self._org_module_weight.device - org_dtype = self._org_module_weight.dtype - org_weight = self._org_module_weight.to(device=self.device, dtype=self.dtype) - combined_weight = org_weight + module_weights - - self.combined_weight_norms = torch.norm(combined_weight, dim=1, keepdim=True) - - self._org_module_weight.to(device=org_device, dtype=org_dtype) - - # Calculate gradient norms if we have both gradients if lora_down_grad is not None and lora_up_grad is not None: with torch.autocast(self.device.type): approx_grad = self.scale * ((self.lora_up.weight @ lora_down_grad) + (lora_up_grad @ self.lora_down.weight)) self.grad_norms = torch.norm(approx_grad, dim=1, keepdim=True) - self.perturbation_scale = (self.ggpo_sigma * torch.sqrt(self.combined_weight_norms ** 2)) + (self.ggpo_beta * (self.grad_norms ** 2)) - self.perturbation_scale_factor = (self.perturbation_scale * self.perturbation_norm_factor).to(self.device) - - # LoRA Gradient-Guided Perturbation Optimization - self.perturbation_seed = torch.randint(0, 2**32 - 1, (1,)).detach().item() @property def device(self): @@ -922,6 +988,32 @@ def update_norms(self): for lora in self.text_encoder_loras + self.unet_loras: lora.update_norms() + def 
update_grad_norms(self): + for lora in self.text_encoder_loras + self.unet_loras: + lora.update_grad_norms() + + def grad_norms(self) -> Tensor: + grad_norms = [] + for lora in self.text_encoder_loras + self.unet_loras: + if hasattr(lora, "grad_norms") and lora.grad_norms is not None: + grad_norms.append(lora.grad_norms.mean(dim=0)) + return torch.stack(grad_norms) if len(grad_norms) > 0 else torch.tensor([]) + + def weight_norms(self) -> Tensor: + weight_norms = [] + for lora in self.text_encoder_loras + self.unet_loras: + if hasattr(lora, "weight_norms") and lora.weight_norms is not None: + weight_norms.append(lora.weight_norms.mean(dim=0)) + return torch.stack(weight_norms) if len(weight_norms) > 0 else torch.tensor([]) + + def combined_weight_norms(self) -> Tensor: + combined_weight_norms = [] + for lora in self.text_encoder_loras + self.unet_loras: + if hasattr(lora, "combined_weight_norms") and lora.combined_weight_norms is not None: + combined_weight_norms.append(lora.combined_weight_norms.mean(dim=0)) + return torch.stack(combined_weight_norms) if len(combined_weight_norms) > 0 else torch.tensor([]) + + def load_weights(self, file): if os.path.splitext(file)[1] == ".safetensors": from safetensors.torch import load_file diff --git a/train_network.py b/train_network.py index 9db335b04..4898e7985 100644 --- a/train_network.py +++ b/train_network.py @@ -69,13 +69,20 @@ def generate_step_logs( keys_scaled=None, mean_norm=None, maximum_norm=None, + mean_grad_norm=None, + mean_combined_norm=None ): logs = {"loss/current": current_loss, "loss/average": avr_loss} if keys_scaled is not None: logs["max_norm/keys_scaled"] = keys_scaled - logs["max_norm/average_key_norm"] = mean_norm logs["max_norm/max_key_norm"] = maximum_norm + if mean_norm is not None: + logs["norm/avg_key_norm"] = mean_norm + if mean_grad_norm is not None: + logs["norm/avg_grad_norm"] = mean_grad_norm + if mean_combined_norm is not None: + logs["norm/avg_combined_norm"] = mean_combined_norm lrs = lr_scheduler.get_last_lr() for i, lr in enumerate(lrs): @@ -1400,10 +1407,12 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen params_to_clip = accelerator.unwrap_model(network).get_trainable_params() accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) - if global_step % 5 == 0: + if hasattr(network, "update_grad_norms"): + network.update_grad_norms() if hasattr(network, "update_norms"): network.update_norms() + optimizer.step() lr_scheduler.step() optimizer.zero_grad(set_to_none=True) @@ -1412,9 +1421,23 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen keys_scaled, mean_norm, maximum_norm = accelerator.unwrap_model(network).apply_max_norm_regularization( args.scale_weight_norms, accelerator.device ) + mean_grad_norm = None + mean_combined_norm = None max_mean_logs = {"Keys Scaled": keys_scaled, "Average key norm": mean_norm} else: - keys_scaled, mean_norm, maximum_norm = None, None, None + if hasattr(network, "weight_norms"): + mean_norm = network.weight_norms().mean().item() + mean_grad_norm = network.grad_norms().mean().item() + mean_combined_norm = network.combined_weight_norms().mean().item() + weight_norms = network.weight_norms() + maximum_norm = weight_norms.max().item() if weight_norms.numel() > 0 else None + keys_scaled = None + max_mean_logs = {"avg weight norm": mean_norm, "avg grad norm": mean_grad_norm, "avg comb norm": mean_combined_norm} + else: + keys_scaled, mean_norm, maximum_norm = None, None, None + mean_grad_norm = None + 
mean_combined_norm = None + max_mean_logs = {} # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: @@ -1446,14 +1469,11 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen loss_recorder.add(epoch=epoch, step=step, loss=current_loss) avr_loss: float = loss_recorder.moving_average logs = {"avr_loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} - progress_bar.set_postfix(**logs) - - if args.scale_weight_norms: - progress_bar.set_postfix(**{**max_mean_logs, **logs}) + progress_bar.set_postfix(**{**max_mean_logs, **logs}) if is_tracking: logs = self.generate_step_logs( - args, current_loss, avr_loss, lr_scheduler, lr_descriptions, optimizer, keys_scaled, mean_norm, maximum_norm + args, current_loss, avr_loss, lr_scheduler, lr_descriptions, optimizer, keys_scaled, mean_norm, maximum_norm, mean_grad_norm, mean_combined_norm ) self.step_logging(accelerator, logs, global_step, epoch + 1) From 0b25a05e3c0b983d7a4fa74f40798705a00992e3 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 18 Mar 2025 15:40:40 -0400 Subject: [PATCH 398/582] Add IP noise gamma for Flux --- library/flux_train_utils.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index f7f06c5cf..f866fd4ac 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -415,6 +415,16 @@ def get_noisy_model_input_and_timesteps( bsz, _, h, w = latents.shape sigmas = None + ip_noise_gamma = 0.0 + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + if args.ip_noise_gamma: + if args.ip_noise_gamma_random_strength: + ip_noise_gamma = torch.rand(1, device=latents.device) * args.ip_noise_gamma + else: + ip_noise_gamma = args.ip_noise_gamma + if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": # Simple random t-based noise sampling if args.timestep_sampling == "sigmoid": @@ -425,7 +435,7 @@ def get_noisy_model_input_and_timesteps( timesteps = t * 1000.0 t = t.view(-1, 1, 1, 1) - noisy_model_input = (1 - t) * latents + t * noise + noisy_model_input = (1 - t) * latents + t * noise + ip_noise_gamma elif args.timestep_sampling == "shift": shift = args.discrete_flow_shift logits_norm = torch.randn(bsz, device=device) @@ -435,7 +445,7 @@ def get_noisy_model_input_and_timesteps( t = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * noise + noisy_model_input = (1 - t) * latents + t * noise + ip_noise_gamma elif args.timestep_sampling == "flux_shift": logits_norm = torch.randn(bsz, device=device) logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling @@ -445,7 +455,7 @@ def get_noisy_model_input_and_timesteps( t = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * noise + noisy_model_input = (1 - t) * latents + t * noise + ip_noise_gamma else: # Sample a random timestep for each image # for weighting schemes where we sample timesteps non-uniformly @@ -461,7 +471,8 @@ def get_noisy_model_input_and_timesteps( # Add noise according to flow matching. 
sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) - noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + ip_noise_gamma + return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas From c8be141ae0576119ecd8ae329f00700098ee83a2 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 18 Mar 2025 15:42:18 -0400 Subject: [PATCH 399/582] Apply IP gamma to noise fix --- library/flux_train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index f866fd4ac..557f61e7a 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -471,7 +471,7 @@ def get_noisy_model_input_and_timesteps( # Add noise according to flow matching. sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) - noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + ip_noise_gamma + noisy_model_input = sigmas * noise + ip_noise_gamma + (1.0 - sigmas) * latents return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas From b425466e7be64e12238b267862468dc9f0b0bb6e Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 18 Mar 2025 18:42:35 -0400 Subject: [PATCH 400/582] Fix IP noise gamma to use random values --- library/flux_train_utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 557f61e7a..f07447476 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -415,15 +415,15 @@ def get_noisy_model_input_and_timesteps( bsz, _, h, w = latents.shape sigmas = None - ip_noise_gamma = 0.0 - # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) if args.ip_noise_gamma: if args.ip_noise_gamma_random_strength: - ip_noise_gamma = torch.rand(1, device=latents.device) * args.ip_noise_gamma + ip_noise = (torch.rand(1, device=latents.device) * args.ip_noise_gamma) * torch.randn_like(latents) else: - ip_noise_gamma = args.ip_noise_gamma + ip_noise = args.ip_noise_gamma * torch.randn_like(latents) + else: + ip_noise = torch.zeros_like(latents) if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": # Simple random t-based noise sampling @@ -435,7 +435,7 @@ def get_noisy_model_input_and_timesteps( timesteps = t * 1000.0 t = t.view(-1, 1, 1, 1) - noisy_model_input = (1 - t) * latents + t * noise + ip_noise_gamma + noisy_model_input = (1 - t) * latents + t * (noise + ip_noise) elif args.timestep_sampling == "shift": shift = args.discrete_flow_shift logits_norm = torch.randn(bsz, device=device) @@ -445,7 +445,7 @@ def get_noisy_model_input_and_timesteps( t = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * noise + ip_noise_gamma + noisy_model_input = (1 - t) * latents + t * (noise + ip_noise) elif args.timestep_sampling == "flux_shift": logits_norm = torch.randn(bsz, device=device) logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling @@ -455,7 +455,7 @@ def get_noisy_model_input_and_timesteps( t = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * noise + ip_noise_gamma + noisy_model_input = (1 - t) * latents + t * (noise + ip_noise) else: # Sample a random timestep for each image # for weighting schemes where we sample timesteps non-uniformly @@ 
-471,7 +471,7 @@ def get_noisy_model_input_and_timesteps( # Add noise according to flow matching. sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) - noisy_model_input = sigmas * noise + ip_noise_gamma + (1.0 - sigmas) * latents + noisy_model_input = sigmas * (noise + ip_noise) + (1.0 - sigmas) * latents return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas From a4f3a9fc1a4f4f964a6971bc4b0ae15c94f0d672 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 18 Mar 2025 18:44:21 -0400 Subject: [PATCH 401/582] Use ones_like --- library/flux_train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index f07447476..8cf958580 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -423,7 +423,7 @@ def get_noisy_model_input_and_timesteps( else: ip_noise = args.ip_noise_gamma * torch.randn_like(latents) else: - ip_noise = torch.zeros_like(latents) + ip_noise = torch.ones_like(latents) if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": # Simple random t-based noise sampling From 6f4d3657756a9d679dfa76f7c6c7bd1c957130ca Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 18 Mar 2025 18:53:34 -0400 Subject: [PATCH 402/582] zeros_like because we are adding --- library/flux_train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 8cf958580..f07447476 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -423,7 +423,7 @@ def get_noisy_model_input_and_timesteps( else: ip_noise = args.ip_noise_gamma * torch.randn_like(latents) else: - ip_noise = torch.ones_like(latents) + ip_noise = torch.zeros_like(latents) if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": # Simple random t-based noise sampling From b81bcd0b01aa81bf616b6125ca1da4d6d3c9dd82 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 18 Mar 2025 21:36:55 -0400 Subject: [PATCH 403/582] Move IP noise gamma to noise creation to remove complexity and align noise for target loss --- flux_train_network.py | 9 +++++++++ library/flux_train_utils.py | 19 ++++--------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index def441559..d85584f5d 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -350,6 +350,15 @@ def get_noise_pred_and_target( ): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + if args.ip_noise_gamma: + if args.ip_noise_gamma_random_strength: + noise = noise + (torch.rand(1, device=latents.device) * args.ip_noise_gamma) * torch.randn_like(latents) + else: + noise = noise + args.ip_noise_gamma * torch.randn_like(latents) + bsz = latents.shape[0] # get noisy model input and timesteps diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index f07447476..f7f06c5cf 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -415,16 +415,6 @@ def get_noisy_model_input_and_timesteps( bsz, _, h, w = latents.shape sigmas = None - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - if args.ip_noise_gamma: - if args.ip_noise_gamma_random_strength: - ip_noise = (torch.rand(1, device=latents.device) * 
args.ip_noise_gamma) * torch.randn_like(latents) - else: - ip_noise = args.ip_noise_gamma * torch.randn_like(latents) - else: - ip_noise = torch.zeros_like(latents) - if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": # Simple random t-based noise sampling if args.timestep_sampling == "sigmoid": @@ -435,7 +425,7 @@ def get_noisy_model_input_and_timesteps( timesteps = t * 1000.0 t = t.view(-1, 1, 1, 1) - noisy_model_input = (1 - t) * latents + t * (noise + ip_noise) + noisy_model_input = (1 - t) * latents + t * noise elif args.timestep_sampling == "shift": shift = args.discrete_flow_shift logits_norm = torch.randn(bsz, device=device) @@ -445,7 +435,7 @@ def get_noisy_model_input_and_timesteps( t = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * (noise + ip_noise) + noisy_model_input = (1 - t) * latents + t * noise elif args.timestep_sampling == "flux_shift": logits_norm = torch.randn(bsz, device=device) logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling @@ -455,7 +445,7 @@ def get_noisy_model_input_and_timesteps( t = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * (noise + ip_noise) + noisy_model_input = (1 - t) * latents + t * noise else: # Sample a random timestep for each image # for weighting schemes where we sample timesteps non-uniformly @@ -471,8 +461,7 @@ def get_noisy_model_input_and_timesteps( # Add noise according to flow matching. sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) - noisy_model_input = sigmas * (noise + ip_noise) + (1.0 - sigmas) * latents - + noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas From 5b210ad7178c0b88c214686389b0afb03ba3813c Mon Sep 17 00:00:00 2001 From: gesen2egee Date: Wed, 19 Mar 2025 10:49:06 +0800 Subject: [PATCH 404/582] update prodigyopt and prodigy-plus-schedule-free --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 52c3b8c74..7348647f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,6 @@ opencv-python==4.8.1.78 einops==0.7.0 pytorch-lightning==1.9.0 bitsandbytes==0.44.0 -prodigyopt==1.0 lion-pytorch==0.0.6 schedulefree==1.4 tensorboard @@ -44,4 +43,6 @@ rich==13.7.0 sentencepiece==0.2.0 # for kohya_ss library pytorch-optimizer +prodigy-plus-schedule-free==1.9.0 +prodigyopt==1.1.2 -e . 
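Editor's note: the `ip_noise_gamma` patches above (and the follow-ups below) iterate toward one behavior — perturb only the noise that is mixed into the model input, while the unperturbed noise remains the regression target. The following standalone sketch illustrates that idea for the flow-matching forward process; the function name and the simplified argument list are illustrative assumptions, not the repository's actual API.

```python
import torch

def make_noisy_input_with_ip_noise(latents, noise, sigmas, ip_noise_gamma=0.0, random_strength=False):
    # Optionally perturb the noise used for the forward interpolation.
    # The original `noise` stays the training target, so only the model input shifts.
    if ip_noise_gamma:
        if random_strength:
            strength = torch.rand(1, device=latents.device) * ip_noise_gamma
        else:
            strength = ip_noise_gamma
        input_noise = noise + strength * torch.randn_like(latents)
    else:
        input_noise = noise
    # Flow-matching forward process: interpolate between clean latents and noise.
    return (1.0 - sigmas) * latents + sigmas * input_noise

# Usage sketch
latents = torch.randn(2, 4, 8, 8)
noise = torch.randn_like(latents)
sigmas = torch.rand(2).view(-1, 1, 1, 1)
noisy_model_input = make_noisy_input_with_ip_noise(latents, noise, sigmas, ip_noise_gamma=0.1)
```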
From 7197266703d8ac9219dda8b5a58bbd60d029d597 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 19 Mar 2025 00:25:51 -0400 Subject: [PATCH 405/582] Perturbed noise should be separate of input noise --- flux_train_network.py | 9 --------- library/flux_train_utils.py | 13 ++++++++++++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/flux_train_network.py b/flux_train_network.py index d85584f5d..def441559 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -350,15 +350,6 @@ def get_noise_pred_and_target( ): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - if args.ip_noise_gamma: - if args.ip_noise_gamma_random_strength: - noise = noise + (torch.rand(1, device=latents.device) * args.ip_noise_gamma) * torch.randn_like(latents) - else: - noise = noise + args.ip_noise_gamma * torch.randn_like(latents) - bsz = latents.shape[0] # get noisy model input and timesteps diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index f7f06c5cf..775e0c33a 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -410,11 +410,22 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): def get_noisy_model_input_and_timesteps( - args, noise_scheduler, latents, noise, device, dtype + args, noise_scheduler, latents: torch.Tensor, input_noise: torch.Tensor, device, dtype ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: bsz, _, h, w = latents.shape sigmas = None + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + if args.ip_noise_gamma: + if args.ip_noise_gamma_random_strength: + noise = input_noise.detach().clone() + (torch.rand(1, device=latents.device) * args.ip_noise_gamma) * torch.randn_like(latents) + else: + noise = input_noise.detach().clone() + args.ip_noise_gamma * torch.randn_like(latents) + else: + noise = input_noise + + if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": # Simple random t-based noise sampling if args.timestep_sampling == "sigmoid": From d93ad90a717beb2fd322d2fae73992e9ea5213ea Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 19 Mar 2025 00:37:27 -0400 Subject: [PATCH 406/582] Add perturbation on noisy_model_input if needed --- library/flux_train_utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 775e0c33a..0fe81da7a 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -410,20 +410,11 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None): def get_noisy_model_input_and_timesteps( - args, noise_scheduler, latents: torch.Tensor, input_noise: torch.Tensor, device, dtype + args, noise_scheduler, latents: torch.Tensor, noise: torch.Tensor, device, dtype ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: bsz, _, h, w = latents.shape sigmas = None - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - if args.ip_noise_gamma: - if args.ip_noise_gamma_random_strength: - noise = input_noise.detach().clone() + (torch.rand(1, device=latents.device) * args.ip_noise_gamma) * torch.randn_like(latents) - else: - noise = input_noise.detach().clone() + args.ip_noise_gamma * torch.randn_like(latents) - else: - noise = input_noise if 
args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": @@ -474,6 +465,15 @@ def get_noisy_model_input_and_timesteps( sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + if args.ip_noise_gamma: + if args.ip_noise_gamma_random_strength: + xi = noise.detach().clone() + (torch.rand(1, device=latents.device) * args.ip_noise_gamma) * torch.randn_like(latents) + else: + xi = noise.detach().clone() + args.ip_noise_gamma * torch.randn_like(latents) + noisy_model_input += xi + return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas From 8e6817b0c2d6e312b8da0d84baa2ecc72c83767f Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 19 Mar 2025 00:45:13 -0400 Subject: [PATCH 407/582] Remove double noise --- library/flux_train_utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 0fe81da7a..9808ad0a7 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -415,8 +415,6 @@ def get_noisy_model_input_and_timesteps( bsz, _, h, w = latents.shape sigmas = None - - if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": # Simple random t-based noise sampling if args.timestep_sampling == "sigmoid": @@ -469,10 +467,10 @@ def get_noisy_model_input_and_timesteps( # (this is the forward diffusion process) if args.ip_noise_gamma: if args.ip_noise_gamma_random_strength: - xi = noise.detach().clone() + (torch.rand(1, device=latents.device) * args.ip_noise_gamma) * torch.randn_like(latents) + noise_perturbation = (torch.rand(1, device=latents.device) * args.ip_noise_gamma) * torch.randn_like(noise) else: - xi = noise.detach().clone() + args.ip_noise_gamma * torch.randn_like(latents) - noisy_model_input += xi + noise_perturbation = args.ip_noise_gamma * torch.randn_like(noise) + noisy_model_input += noise_perturbation return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas From 1eddac26b010d23ce5f0eb6a8ac12fbca66ee50b Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 19 Mar 2025 00:49:42 -0400 Subject: [PATCH 408/582] Separate random to a variable, and make sure on device --- library/flux_train_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 9808ad0a7..107f351f7 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -466,11 +466,12 @@ def get_noisy_model_input_and_timesteps( # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) if args.ip_noise_gamma: + xi = torch.randn_like(latents, device=latents.device, dtype=dtype) if args.ip_noise_gamma_random_strength: - noise_perturbation = (torch.rand(1, device=latents.device) * args.ip_noise_gamma) * torch.randn_like(noise) + ip_noise_gamma = (torch.rand(1, device=latents.device, dtype=dtype) * args.ip_noise_gamma) else: - noise_perturbation = args.ip_noise_gamma * torch.randn_like(noise) - noisy_model_input += noise_perturbation + ip_noise_gamma = args.ip_noise_gamma + noisy_model_input += ip_noise_gamma * xi return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas From 5d5a7d2acf884077b6a24db269c8f4facb5b7487 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 19 Mar 2025 13:50:04 -0400 Subject: 
[PATCH 409/582] Fix IP noise calculation --- library/flux_train_utils.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 107f351f7..0cb07e3d3 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -423,29 +423,24 @@ def get_noisy_model_input_and_timesteps( else: t = torch.rand((bsz,), device=device) + sigmas = t.view(-1, 1, 1, 1) timesteps = t * 1000.0 - t = t.view(-1, 1, 1, 1) - noisy_model_input = (1 - t) * latents + t * noise elif args.timestep_sampling == "shift": shift = args.discrete_flow_shift logits_norm = torch.randn(bsz, device=device) logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling timesteps = logits_norm.sigmoid() timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps) - - t = timesteps.view(-1, 1, 1, 1) + sigmas = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * noise elif args.timestep_sampling == "flux_shift": logits_norm = torch.randn(bsz, device=device) logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling timesteps = logits_norm.sigmoid() mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2)) timesteps = time_shift(mu, 1.0, timesteps) - - t = timesteps.view(-1, 1, 1, 1) + sigmas = timesteps.view(-1, 1, 1, 1) timesteps = timesteps * 1000.0 - noisy_model_input = (1 - t) * latents + t * noise else: # Sample a random timestep for each image # for weighting schemes where we sample timesteps non-uniformly @@ -458,10 +453,7 @@ def get_noisy_model_input_and_timesteps( ) indices = (u * noise_scheduler.config.num_train_timesteps).long() timesteps = noise_scheduler.timesteps[indices].to(device=device) - - # Add noise according to flow matching. 
sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) - noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) @@ -471,7 +463,9 @@ def get_noisy_model_input_and_timesteps( ip_noise_gamma = (torch.rand(1, device=latents.device, dtype=dtype) * args.ip_noise_gamma) else: ip_noise_gamma = args.ip_noise_gamma - noisy_model_input += ip_noise_gamma * xi + noisy_model_input = sigmas * (noise + ip_noise_gamma * xi) + (1.0 - sigmas) * latents + else: + noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas From f974c6b2577348acbe948bcc668dd7b061feb73e Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 19 Mar 2025 14:27:43 -0400 Subject: [PATCH 410/582] change order to match upstream --- library/flux_train_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 0cb07e3d3..7bf2faf07 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -413,8 +413,6 @@ def get_noisy_model_input_and_timesteps( args, noise_scheduler, latents: torch.Tensor, noise: torch.Tensor, device, dtype ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: bsz, _, h, w = latents.shape - sigmas = None - if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": # Simple random t-based noise sampling if args.timestep_sampling == "sigmoid": @@ -463,9 +461,9 @@ def get_noisy_model_input_and_timesteps( ip_noise_gamma = (torch.rand(1, device=latents.device, dtype=dtype) * args.ip_noise_gamma) else: ip_noise_gamma = args.ip_noise_gamma - noisy_model_input = sigmas * (noise + ip_noise_gamma * xi) + (1.0 - sigmas) * latents + noisy_model_input = (1.0 - sigmas) * latents + sigmas * (noise + ip_noise_gamma * xi) else: - noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents + noisy_model_input = (1.0 - sigmas) * latents + sigmas * noise return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas From d151833526f5f79414a995cbb416de8a31e000cb Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 20 Mar 2025 22:05:29 +0900 Subject: [PATCH 411/582] docs: update README with recent changes and specify version for pytorch-optimizer --- README.md | 4 ++++ requirements.txt | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 426eaed82..59b0e6766 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Mar 20, 2025: +- `pytorch-optimizer` is added to requirements.txt. Thank you to gesen2egee for PR [#1985](https://github.com/kohya-ss/sd-scripts/pull/1985). + - For example, you can use CAME optimizer with `--optimizer_type "pytorch_optimizer.CAME" --optimizer_args "weight_decay=0.01"`. + Mar 6, 2025: - Added a utility script to merge the weights of SD3's DiT, VAE (optional), CLIP-L, CLIP-G, and T5XXL into a single .safetensors file. Run `tools/merge_sd3_safetensors.py`. See `--help` for usage. 
PR [#1960](https://github.com/kohya-ss/sd-scripts/pull/1960) diff --git a/requirements.txt b/requirements.txt index 7348647f8..767d9e8eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,9 @@ pytorch-lightning==1.9.0 bitsandbytes==0.44.0 lion-pytorch==0.0.6 schedulefree==1.4 +pytorch-optimizer==3.5.0 +prodigy-plus-schedule-free==1.9.0 +prodigyopt==1.1.2 tensorboard safetensors==0.4.4 # gradio==3.16.2 @@ -42,7 +45,4 @@ rich==13.7.0 # for T5XXL tokenizer (SD3/FLUX) sentencepiece==0.2.0 # for kohya_ss library -pytorch-optimizer -prodigy-plus-schedule-free==1.9.0 -prodigyopt==1.1.2 -e . From 16cef81aeaec1ebc07de30c7a1448982a61167e1 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 20 Mar 2025 14:32:56 -0400 Subject: [PATCH 412/582] Refactor sigmas and timesteps --- library/flux_train_utils.py | 41 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 7bf2faf07..9110da896 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -366,8 +366,6 @@ def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32) step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < n_dim: - sigma = sigma.unsqueeze(-1) return sigma @@ -413,32 +411,30 @@ def get_noisy_model_input_and_timesteps( args, noise_scheduler, latents: torch.Tensor, noise: torch.Tensor, device, dtype ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: bsz, _, h, w = latents.shape + num_timesteps = noise_scheduler.config.num_train_timesteps if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": - # Simple random t-based noise sampling + # Simple random sigma-based noise sampling if args.timestep_sampling == "sigmoid": # https://github.com/XLabs-AI/x-flux/tree/main - t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=device)) + sigmas = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=device)) else: - t = torch.rand((bsz,), device=device) + sigmas = torch.rand((bsz,), device=device) - sigmas = t.view(-1, 1, 1, 1) - timesteps = t * 1000.0 + timesteps = sigmas * num_timesteps elif args.timestep_sampling == "shift": shift = args.discrete_flow_shift - logits_norm = torch.randn(bsz, device=device) - logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling - timesteps = logits_norm.sigmoid() - timesteps = (timesteps * shift) / (1 + (shift - 1) * timesteps) - sigmas = timesteps.view(-1, 1, 1, 1) - timesteps = timesteps * 1000.0 + sigmas = torch.randn(bsz, device=device) + sigmas = sigmas * args.sigmoid_scale # larger scale for more uniform sampling + sigmas = sigmas.sigmoid() + sigmas = (sigmas * shift) / (1 + (shift - 1) * sigmas) + timesteps = sigmas * num_timesteps elif args.timestep_sampling == "flux_shift": - logits_norm = torch.randn(bsz, device=device) - logits_norm = logits_norm * args.sigmoid_scale # larger scale for more uniform sampling - timesteps = logits_norm.sigmoid() - mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2)) - timesteps = time_shift(mu, 1.0, timesteps) - sigmas = timesteps.view(-1, 1, 1, 1) - timesteps = timesteps * 1000.0 + sigmas = torch.randn(bsz, device=device) + sigmas = sigmas * args.sigmoid_scale # larger scale for more uniform sampling + sigmas = sigmas.sigmoid() + mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2)) # we are pre-packed so must adjust for packed size + 
sigmas = time_shift(mu, 1.0, sigmas) + timesteps = sigmas * num_timesteps else: # Sample a random timestep for each image # for weighting schemes where we sample timesteps non-uniformly @@ -449,10 +445,13 @@ def get_noisy_model_input_and_timesteps( logit_std=args.logit_std, mode_scale=args.mode_scale, ) - indices = (u * noise_scheduler.config.num_train_timesteps).long() + indices = (u * num_timesteps).long() timesteps = noise_scheduler.timesteps[indices].to(device=device) sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype) + # Broadcast sigmas to latent shape + sigmas = sigmas.view(-1, 1, 1, 1) + # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) if args.ip_noise_gamma: From e8b32548580ebf0001cd457d7b6f796e2eb169ff Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 20 Mar 2025 15:01:15 -0400 Subject: [PATCH 413/582] Add flux_train_utils tests for get get_noisy_model_input_and_timesteps --- library/flux_train_utils.py | 1 + tests/library/test_flux_train_utils.py | 220 +++++++++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 tests/library/test_flux_train_utils.py diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 9110da896..0e73a01d4 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -411,6 +411,7 @@ def get_noisy_model_input_and_timesteps( args, noise_scheduler, latents: torch.Tensor, noise: torch.Tensor, device, dtype ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: bsz, _, h, w = latents.shape + assert bsz > 0, "Batch size not large enough" num_timesteps = noise_scheduler.config.num_train_timesteps if args.timestep_sampling == "uniform" or args.timestep_sampling == "sigmoid": # Simple random sigma-based noise sampling diff --git a/tests/library/test_flux_train_utils.py b/tests/library/test_flux_train_utils.py new file mode 100644 index 000000000..a4c7ba3b6 --- /dev/null +++ b/tests/library/test_flux_train_utils.py @@ -0,0 +1,220 @@ +import pytest +import torch +from unittest.mock import MagicMock, patch +from library.flux_train_utils import ( + get_noisy_model_input_and_timesteps, +) + +# Mock classes and functions +class MockNoiseScheduler: + def __init__(self, num_train_timesteps=1000): + self.config = MagicMock() + self.config.num_train_timesteps = num_train_timesteps + self.timesteps = torch.arange(num_train_timesteps, dtype=torch.long) + + +# Create fixtures for commonly used objects +@pytest.fixture +def args(): + args = MagicMock() + args.timestep_sampling = "uniform" + args.weighting_scheme = "uniform" + args.logit_mean = 0.0 + args.logit_std = 1.0 + args.mode_scale = 1.0 + args.sigmoid_scale = 1.0 + args.discrete_flow_shift = 3.1582 + args.ip_noise_gamma = None + args.ip_noise_gamma_random_strength = False + return args + + +@pytest.fixture +def noise_scheduler(): + return MockNoiseScheduler(num_train_timesteps=1000) + + +@pytest.fixture +def latents(): + return torch.randn(2, 4, 8, 8) + + +@pytest.fixture +def noise(): + return torch.randn(2, 4, 8, 8) + + +@pytest.fixture +def device(): + # return "cuda" if torch.cuda.is_available() else "cpu" + return "cpu" + + +# Mock the required functions +@pytest.fixture(autouse=True) +def mock_functions(): + with ( + patch("torch.sigmoid", side_effect=torch.sigmoid), + patch("torch.rand", side_effect=torch.rand), + patch("torch.randn", side_effect=torch.randn), + ): + yield + + +# Test different timestep sampling methods +def test_uniform_sampling(args, 
noise_scheduler, latents, noise, device): + args.timestep_sampling = "uniform" + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == (latents.shape[0],) + assert sigmas.shape == (latents.shape[0], 1, 1, 1) + assert noisy_input.dtype == dtype + assert timesteps.dtype == dtype + + +def test_sigmoid_sampling(args, noise_scheduler, latents, noise, device): + args.timestep_sampling = "sigmoid" + args.sigmoid_scale = 10.0 + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == (latents.shape[0],) + assert sigmas.shape == (latents.shape[0], 1, 1, 1) + + +def test_shift_sampling(args, noise_scheduler, latents, noise, device): + args.timestep_sampling = "shift" + args.sigmoid_scale = 1.0 + args.discrete_flow_shift = 3.1582 + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == (latents.shape[0],) + assert sigmas.shape == (latents.shape[0], 1, 1, 1) + + +def test_flux_shift_sampling(args, noise_scheduler, latents, noise, device): + args.timestep_sampling = "flux_shift" + args.sigmoid_scale = 10.0 + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == (latents.shape[0],) + assert sigmas.shape == (latents.shape[0], 1, 1, 1) + + +def test_weighting_scheme(args, noise_scheduler, latents, noise, device): + # Mock the necessary functions for this specific test + with patch("library.flux_train_utils.compute_density_for_timestep_sampling", + return_value=torch.tensor([0.3, 0.7], device=device)), \ + patch("library.flux_train_utils.get_sigmas", + return_value=torch.tensor([[0.3], [0.7]], device=device).view(-1, 1, 1, 1)): + + args.timestep_sampling = "other" # Will trigger the weighting scheme path + args.weighting_scheme = "uniform" + args.logit_mean = 0.0 + args.logit_std = 1.0 + args.mode_scale = 1.0 + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, device, dtype + ) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == (latents.shape[0],) + assert sigmas.shape == (latents.shape[0], 1, 1, 1) + + +# Test IP noise options +def test_with_ip_noise(args, noise_scheduler, latents, noise, device): + args.ip_noise_gamma = 0.5 + args.ip_noise_gamma_random_strength = False + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == (latents.shape[0],) + assert sigmas.shape == (latents.shape[0], 1, 1, 1) + + +def test_with_random_ip_noise(args, noise_scheduler, latents, noise, device): + args.ip_noise_gamma = 0.1 + args.ip_noise_gamma_random_strength = True + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == 
(latents.shape[0],) + assert sigmas.shape == (latents.shape[0], 1, 1, 1) + + +# Test different data types +def test_float16_dtype(args, noise_scheduler, latents, noise, device): + dtype = torch.float16 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.dtype == dtype + assert timesteps.dtype == dtype + + +# Test different batch sizes +def test_different_batch_size(args, noise_scheduler, device): + latents = torch.randn(5, 4, 8, 8) # batch size of 5 + noise = torch.randn(5, 4, 8, 8) + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == (5,) + assert sigmas.shape == (5, 1, 1, 1) + + +# Test different image sizes +def test_different_image_size(args, noise_scheduler, device): + latents = torch.randn(2, 4, 16, 16) # larger image size + noise = torch.randn(2, 4, 16, 16) + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == (2,) + assert sigmas.shape == (2, 1, 1, 1) + + +# Test edge cases +def test_zero_batch_size(args, noise_scheduler, device): + with pytest.raises(AssertionError): # expecting an error with zero batch size + latents = torch.randn(0, 4, 8, 8) + noise = torch.randn(0, 4, 8, 8) + dtype = torch.float32 + + get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + +def test_different_timestep_count(args, device): + noise_scheduler = MockNoiseScheduler(num_train_timesteps=500) # different timestep count + latents = torch.randn(2, 4, 8, 8) + noise = torch.randn(2, 4, 8, 8) + dtype = torch.float32 + + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) + + assert noisy_input.shape == latents.shape + assert timesteps.shape == (2,) + # Check that timesteps are within the proper range + assert torch.all(timesteps < 500) From 8aa126582efbdf0472b0b8db800d50860870f3cd Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 20 Mar 2025 15:09:11 -0400 Subject: [PATCH 414/582] Scale sigmoid to default 1.0 --- pytest.ini | 1 + requirements.txt | 2 +- tests/library/test_flux_train_utils.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pytest.ini b/pytest.ini index 484d3aef6..34b7e9c1f 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,3 +6,4 @@ filterwarnings = ignore::DeprecationWarning ignore::UserWarning ignore::FutureWarning +pythonpath = . 
diff --git a/requirements.txt b/requirements.txt index de39f5887..8fe8c7620 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ opencv-python==4.8.1.78 einops==0.7.0 pytorch-lightning==1.9.0 bitsandbytes==0.44.0 -prodigyopt==1.0 +prodigyopt>=1.0 lion-pytorch==0.0.6 schedulefree==1.4 tensorboard diff --git a/tests/library/test_flux_train_utils.py b/tests/library/test_flux_train_utils.py index a4c7ba3b6..2ad7ce4ee 100644 --- a/tests/library/test_flux_train_utils.py +++ b/tests/library/test_flux_train_utils.py @@ -77,7 +77,7 @@ def test_uniform_sampling(args, noise_scheduler, latents, noise, device): def test_sigmoid_sampling(args, noise_scheduler, latents, noise, device): args.timestep_sampling = "sigmoid" - args.sigmoid_scale = 10.0 + args.sigmoid_scale = 1.0 dtype = torch.float32 noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) @@ -102,7 +102,7 @@ def test_shift_sampling(args, noise_scheduler, latents, noise, device): def test_flux_shift_sampling(args, noise_scheduler, latents, noise, device): args.timestep_sampling = "flux_shift" - args.sigmoid_scale = 10.0 + args.sigmoid_scale = 1.0 dtype = torch.float32 noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) From d40f5b1e4ef5e7e6b51df26914be3a661b006d34 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 20 Mar 2025 15:09:50 -0400 Subject: [PATCH 415/582] Revert "Scale sigmoid to default 1.0" This reverts commit 8aa126582efbdf0472b0b8db800d50860870f3cd. --- pytest.ini | 1 - requirements.txt | 2 +- tests/library/test_flux_train_utils.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pytest.ini b/pytest.ini index 34b7e9c1f..484d3aef6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,4 +6,3 @@ filterwarnings = ignore::DeprecationWarning ignore::UserWarning ignore::FutureWarning -pythonpath = . 
diff --git a/requirements.txt b/requirements.txt index 8fe8c7620..de39f5887 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ opencv-python==4.8.1.78 einops==0.7.0 pytorch-lightning==1.9.0 bitsandbytes==0.44.0 -prodigyopt>=1.0 +prodigyopt==1.0 lion-pytorch==0.0.6 schedulefree==1.4 tensorboard diff --git a/tests/library/test_flux_train_utils.py b/tests/library/test_flux_train_utils.py index 2ad7ce4ee..a4c7ba3b6 100644 --- a/tests/library/test_flux_train_utils.py +++ b/tests/library/test_flux_train_utils.py @@ -77,7 +77,7 @@ def test_uniform_sampling(args, noise_scheduler, latents, noise, device): def test_sigmoid_sampling(args, noise_scheduler, latents, noise, device): args.timestep_sampling = "sigmoid" - args.sigmoid_scale = 1.0 + args.sigmoid_scale = 10.0 dtype = torch.float32 noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) @@ -102,7 +102,7 @@ def test_shift_sampling(args, noise_scheduler, latents, noise, device): def test_flux_shift_sampling(args, noise_scheduler, latents, noise, device): args.timestep_sampling = "flux_shift" - args.sigmoid_scale = 1.0 + args.sigmoid_scale = 10.0 dtype = torch.float32 noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) From 89f0d27a5930ae0a355caacfedc546fb04a7345d Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 20 Mar 2025 15:10:33 -0400 Subject: [PATCH 416/582] Set sigmoid_scale to default 1.0 --- tests/library/test_flux_train_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/library/test_flux_train_utils.py b/tests/library/test_flux_train_utils.py index a4c7ba3b6..2ad7ce4ee 100644 --- a/tests/library/test_flux_train_utils.py +++ b/tests/library/test_flux_train_utils.py @@ -77,7 +77,7 @@ def test_uniform_sampling(args, noise_scheduler, latents, noise, device): def test_sigmoid_sampling(args, noise_scheduler, latents, noise, device): args.timestep_sampling = "sigmoid" - args.sigmoid_scale = 10.0 + args.sigmoid_scale = 1.0 dtype = torch.float32 noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) @@ -102,7 +102,7 @@ def test_shift_sampling(args, noise_scheduler, latents, noise, device): def test_flux_shift_sampling(args, noise_scheduler, latents, noise, device): args.timestep_sampling = "flux_shift" - args.sigmoid_scale = 10.0 + args.sigmoid_scale = 1.0 dtype = torch.float32 noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps(args, noise_scheduler, latents, noise, device, dtype) From 2ba1cc7791a5438448b99d70929c6c9a54c70e73 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 21 Mar 2025 20:17:22 -0400 Subject: [PATCH 417/582] Fix max norms not applying to noise --- library/lumina_train_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index f224e86cf..14a79bb2e 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -688,9 +688,9 @@ def denoise( noise_pred, dim=tuple(range(1, len(noise_pred.shape))), keepdim=True ) # Iterate through batch - for noise_norm, max_new_norm, noise in zip(noise_norms, max_new_norms, noise_pred): + for i, (noise_norm, max_new_norm) in enumerate(zip(noise_norms, max_new_norms)): if noise_norm >= max_new_norm: - noise = noise * (max_new_norm / noise_norm) + noise_pred[i] = noise_pred[i] * (max_new_norm / noise_norm) else: noise_pred = 
noise_pred_cond From 61f7283167b2f4002b78ad4487041c10cfc2134a Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Fri, 21 Mar 2025 20:38:43 -0400 Subject: [PATCH 418/582] Fix non-cache vae encode --- lumina_train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lumina_train_network.py b/lumina_train_network.py index 6b7e7d22e..e1b45ac70 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -230,7 +230,7 @@ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) return noise_scheduler - def encode_images_to_latents(self, args, accelerator, vae, images): + def encode_images_to_latents(self, args, vae, images): return vae.encode(images) # not sure, they use same flux vae From e64dc05c2a704a3400e6f969c0b6ff9914d226dd Mon Sep 17 00:00:00 2001 From: laolongboy <675077044@qq.com> Date: Mon, 24 Mar 2025 23:33:25 +0800 Subject: [PATCH 419/582] Supplement the input parameters to correctly convert the flux model to BFL format; fixes #1996 --- tools/convert_diffusers_to_flux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/convert_diffusers_to_flux.py b/tools/convert_diffusers_to_flux.py index 65ba7321a..fdfc45925 100644 --- a/tools/convert_diffusers_to_flux.py +++ b/tools/convert_diffusers_to_flux.py @@ -56,7 +56,7 @@ def convert(args): save_dtype = str_to_dtype(args.save_precision) if args.save_precision is not None else None # make reverse map from diffusers map - diffusers_to_bfl_map = flux_utils.make_diffusers_to_bfl_map() + diffusers_to_bfl_map = flux_utils.make_diffusers_to_bfl_map(19, 38) # iterate over three safetensors files to reduce memory usage flux_sd = {} From 182544dcce383a433527e446bfc7fa8374e375a8 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Wed, 26 Mar 2025 14:23:04 -0400 Subject: [PATCH 420/582] Remove pertubation seed --- networks/lora_flux.py | 41 ++--------------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 9f5f1916a..92b3979ae 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -29,42 +29,6 @@ NUM_DOUBLE_BLOCKS = 19 NUM_SINGLE_BLOCKS = 38 -@contextmanager -def temp_random_seed(seed, device=None): - """ - Context manager that temporarily sets a specific random seed and then - restores the original RNG state afterward. - - Args: - seed (int): The random seed to set temporarily - device (torch.device, optional): The device to set the seed for. - If None, will detect from the current context. 
- """ - # Save original RNG states - original_cpu_rng_state = torch.get_rng_state() - original_cuda_rng_states = None - if torch.cuda.is_available(): - original_cuda_rng_states = torch.cuda.get_rng_state_all() - - # Determine if we need to set CUDA seed - set_cuda = False - if device is not None: - set_cuda = device.type == 'cuda' - elif torch.cuda.is_available(): - set_cuda = True - - try: - # Set the temporary seed - torch.manual_seed(seed) - if set_cuda: - torch.cuda.manual_seed_all(seed) - yield - finally: - # Restore original RNG states - torch.set_rng_state(original_cpu_rng_state) - if torch.cuda.is_available() and original_cuda_rng_states is not None: - torch.cuda.set_rng_state_all(original_cuda_rng_states) - class LoRAModule(torch.nn.Module): """ @@ -150,7 +114,6 @@ def __init__( self.combined_weight_norms = None self.grad_norms = None self.perturbation_norm_factor = 1.0 / math.sqrt(org_module.weight.shape[0]) - self.perturbation_seed = torch.randint(0, 2**32 - 1, (1,)).detach().item() self.initialize_norm_cache(org_module.weight) self.org_module_shape: tuple[int] = org_module.weight.shape @@ -193,8 +156,8 @@ def forward(self, x): lx = self.lora_up(lx) # LoRA Gradient-Guided Perturbation Optimization - if self.training and hasattr(self, 'perturbation_seed') and self.ggpo_sigma is not None and self.ggpo_beta is not None and self.combined_weight_norms is not None and self.grad_norms is not None: - with torch.no_grad(), temp_random_seed(self.perturbation_seed): + if self.training and self.ggpo_sigma is not None and self.ggpo_beta is not None and self.combined_weight_norms is not None and self.grad_norms is not None: + with torch.no_grad(): perturbation_scale = (self.ggpo_sigma * torch.sqrt(self.combined_weight_norms ** 2)) + (self.ggpo_beta * (self.grad_norms ** 2)) perturbation_scale_factor = (perturbation_scale * self.perturbation_norm_factor).to(self.device) perturbation = torch.randn(self.org_module_shape, dtype=self.dtype, device=self.device) From 0181b7a0425fd58012f7e3ece10345c86d9b6fc8 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 27 Mar 2025 03:28:33 -0400 Subject: [PATCH 421/582] Remove progress bar avg norms --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 4898e7985..5b2f377a3 100644 --- a/train_network.py +++ b/train_network.py @@ -1432,7 +1432,7 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen weight_norms = network.weight_norms() maximum_norm = weight_norms.max().item() if weight_norms.numel() > 0 else None keys_scaled = None - max_mean_logs = {"avg weight norm": mean_norm, "avg grad norm": mean_grad_norm, "avg comb norm": mean_combined_norm} + max_mean_logs = {} else: keys_scaled, mean_norm, maximum_norm = None, None, None mean_grad_norm = None From 1f432e2c0e5b583c09100c68ce59f30c9d39ecf6 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 30 Mar 2025 20:40:29 +0900 Subject: [PATCH 422/582] use PIL for lanczos and box --- docs/config_README-en.md | 3 +++ docs/config_README-ja.md | 4 ++++ library/train_util.py | 2 +- library/utils.py | 21 ++++++++++++++------- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/docs/config_README-en.md b/docs/config_README-en.md index 66a50dc09..8c55903d0 100644 --- a/docs/config_README-en.md +++ b/docs/config_README-en.md @@ -152,6 +152,7 @@ These options are related to subset configuration. 
| `keep_tokens_separator` | `“|||”` | o | o | o | | `secondary_separator` | `“;;;”` | o | o | o | | `enable_wildcard` | `true` | o | o | o | +| `resize_interpolation` | (not specified) | o | o | o | * `num_repeats` * Specifies the number of repeats for images in a subset. This is equivalent to `--dataset_repeats` in fine-tuning but can be specified for any training method. @@ -165,6 +166,8 @@ These options are related to subset configuration. * Specifies an additional separator. The part separated by this separator is treated as one tag and is shuffled and dropped. It is then replaced by `caption_separator`. For example, if you specify `aaa;;;bbb;;;ccc`, it will be replaced by `aaa,bbb,ccc` or dropped together. * `enable_wildcard` * Enables wildcard notation. This will be explained later. +* `resize_interpolation` + * Specifies the interpolation method used when resizing images. Normally, there is no need to specify this. The following options can be specified: `lanczos`, `nearest`, `bilinear`, `linear`, `bicubic`, `cubic`, `area`, `box`. By default (when not specified), `area` is used for downscaling, and `lanczos` is used for upscaling. If this option is specified, the same interpolation method will be used for both upscaling and downscaling. When `lanczos` or `box` is specified, PIL is used; for other options, OpenCV is used. ### DreamBooth-specific options diff --git a/docs/config_README-ja.md b/docs/config_README-ja.md index 0ed95e0eb..aec0eca5d 100644 --- a/docs/config_README-ja.md +++ b/docs/config_README-ja.md @@ -144,6 +144,7 @@ DreamBooth の手法と fine tuning の手法の両方とも利用可能な学 | `keep_tokens_separator` | `“|||”` | o | o | o | | `secondary_separator` | `“;;;”` | o | o | o | | `enable_wildcard` | `true` | o | o | o | +| `resize_interpolation` |(通常は設定しません) | o | o | o | * `num_repeats` * サブセットの画像の繰り返し回数を指定します。fine tuning における `--dataset_repeats` に相当しますが、`num_repeats` はどの学習方法でも指定可能です。 @@ -162,6 +163,9 @@ DreamBooth の手法と fine tuning の手法の両方とも利用可能な学 * `enable_wildcard` * ワイルドカード記法および複数行キャプションを有効にします。ワイルドカード記法、複数行キャプションについては後述します。 +* `resize_interpolation` + * 画像のリサイズ時に使用する補間方法を指定します。通常は指定しなくて構いません。`lanczos`, `nearest`, `bilinear`, `linear`, `bicubic`, `cubic`, `area`, `box` が指定可能です。デフォルト(未指定時)は、縮小時は `area`、拡大時は `lanczos` になります。このオプションを指定すると、拡大時・縮小時とも同じ補間方法が使用されます。`lanczos`、`box`を指定するとPILが、それ以外を指定するとOpenCVが使用されます。 + ### DreamBooth 方式専用のオプション DreamBooth 方式のオプションは、サブセット向けオプションのみ存在します。 diff --git a/library/train_util.py b/library/train_util.py index e9c506883..1ed1d3c23 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -74,7 +74,7 @@ import library.huggingface_util as huggingface_util import library.sai_model_spec as sai_model_spec import library.deepspeed_utils as deepspeed_utils -from library.utils import setup_logging, resize_image +from library.utils import setup_logging, resize_image, validate_interpolation_fn setup_logging() import logging diff --git a/library/utils.py b/library/utils.py index 4fbc26270..0f535a87b 100644 --- a/library/utils.py +++ b/library/utils.py @@ -400,7 +400,7 @@ def pil_resize(image, size, interpolation): def resize_image(image: np.ndarray, width: int, height: int, resized_width: int, resized_height: int, resize_interpolation: Optional[str] = None): """ - Resize image with resize interpolation. Default interpolation to AREA if image is smaller, else LANCZOS + Resize image with resize interpolation. Default interpolation to AREA if image is smaller, else LANCZOS. 
Args: image: numpy.ndarray @@ -413,14 +413,21 @@ def resize_image(image: np.ndarray, width: int, height: int, resized_width: int, Returns: image """ - interpolation = get_cv2_interpolation(resize_interpolation) + if resize_interpolation is None: + resize_interpolation = "lanczos" if width > resized_width and height > resized_height else "area" + + # we use PIL for lanczos (for backward compatibility) and box, cv2 for others + use_pil = resize_interpolation in ["lanczos", "lanczos4", "box"] + resized_size = (resized_width, resized_height) - if width > resized_width and height > resized_width: - image = cv2.resize(image, resized_size, interpolation=interpolation if interpolation is not None else cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ - logger.debug(f"resize image using {resize_interpolation}") + if use_pil: + interpolation = get_pil_interpolation(resize_interpolation) + image = pil_resize(image, resized_size, interpolation=interpolation) + logger.debug(f"resize image using {resize_interpolation} (PIL)") else: - image = cv2.resize(image, resized_size, interpolation=interpolation if interpolation is not None else cv2.INTER_LANCZOS4) # INTER_AREAでやりたいのでcv2でリサイズ - logger.debug(f"resize image using {resize_interpolation}") + interpolation = get_cv2_interpolation(resize_interpolation) + image = cv2.resize(image, resized_size, interpolation=interpolation) + logger.debug(f"resize image using {resize_interpolation} (cv2)") return image From 96a133c99850fe19544b62fbfde55a7d149802dd Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 30 Mar 2025 20:45:06 +0900 Subject: [PATCH 423/582] README.md: update recent updates section to include new interpolation method for resizing images --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 7620e4073..6e28b2121 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,9 @@ The command to install PyTorch is as follows: ### Recent Updates +Mar 30, 2025: +- The interpolation method for resizing the original image to the training size can now be specified. Thank you to rockerBOO for PR [#1939](https://github.com/kohya-ss/sd-scripts/pull/1939). + Mar 20, 2025: - `pytorch-optimizer` is added to requirements.txt. Thank you to gesen2egee for PR [#1985](https://github.com/kohya-ss/sd-scripts/pull/1985). - For example, you can use CAME optimizer with `--optimizer_type "pytorch_optimizer.CAME" --optimizer_args "weight_decay=0.01"`. 
From d0b5c0e5cfabf65a64d9d60712dc67bd8057336b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 30 Mar 2025 21:15:37 +0900 Subject: [PATCH 424/582] chore: formatting, add TODO comment --- train_network.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/train_network.py b/train_network.py index 3bab0cadb..f66cdeb42 100644 --- a/train_network.py +++ b/train_network.py @@ -70,7 +70,7 @@ def generate_step_logs( mean_norm=None, maximum_norm=None, mean_grad_norm=None, - mean_combined_norm=None + mean_combined_norm=None, ): logs = {"loss/current": current_loss, "loss/average": avr_loss} @@ -658,6 +658,10 @@ def train(self, args): return network_has_multiplier = hasattr(network, "set_multiplier") + # TODO remove `hasattr`s by setting up methods if not defined in the network like (hacky but works): + # if not hasattr(network, "prepare_network"): + # network.prepare_network = lambda args: None + if hasattr(network, "prepare_network"): network.prepare_network(args) if args.scale_weight_norms and not hasattr(network, "apply_max_norm_regularization"): @@ -1019,12 +1023,12 @@ def load_model_hook(models, input_dir): "ss_huber_c": args.huber_c, "ss_fp8_base": bool(args.fp8_base), "ss_fp8_base_unet": bool(args.fp8_base_unet), - "ss_validation_seed": args.validation_seed, - "ss_validation_split": args.validation_split, - "ss_max_validation_steps": args.max_validation_steps, - "ss_validate_every_n_epochs": args.validate_every_n_epochs, - "ss_validate_every_n_steps": args.validate_every_n_steps, - "ss_resize_interpolation": args.resize_interpolation + "ss_validation_seed": args.validation_seed, + "ss_validation_split": args.validation_split, + "ss_max_validation_steps": args.max_validation_steps, + "ss_validate_every_n_epochs": args.validate_every_n_epochs, + "ss_validate_every_n_steps": args.validate_every_n_steps, + "ss_resize_interpolation": args.resize_interpolation, } self.update_metadata(metadata, args) # architecture specific metadata @@ -1415,7 +1419,6 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen if hasattr(network, "update_norms"): network.update_norms() - optimizer.step() lr_scheduler.step() optimizer.zero_grad(set_to_none=True) @@ -1476,7 +1479,17 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen if is_tracking: logs = self.generate_step_logs( - args, current_loss, avr_loss, lr_scheduler, lr_descriptions, optimizer, keys_scaled, mean_norm, maximum_norm, mean_grad_norm, mean_combined_norm + args, + current_loss, + avr_loss, + lr_scheduler, + lr_descriptions, + optimizer, + keys_scaled, + mean_norm, + maximum_norm, + mean_grad_norm, + mean_combined_norm, ) self.step_logging(accelerator, logs, global_step, epoch + 1) From aaa26bb882ff23e1e35c84fed6e4f7a12ec420d4 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 30 Mar 2025 21:18:05 +0900 Subject: [PATCH 425/582] docs: update README to include LoRA-GGPO details for FLUX.1 training --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6e28b2121..4bc0c2b5b 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ The command to install PyTorch is as follows: ### Recent Updates Mar 30, 2025: +- LoRA-GGPO is added for FLUX.1 LoRA training. Thank you to rockerBOO for PR [#1974](https://github.com/kohya-ss/sd-scripts/pull/1974). + - Specify `--network_args ggpo_sigma=0.03 ggpo_beta=0.01` in the command line or `network_args = ["ggpo_sigma=0.03", "ggpo_beta=0.01"]` in .toml file. See PR for details. 
- The interpolation method for resizing the original image to the training size can now be specified. Thank you to rockerBOO for PR [#1939](https://github.com/kohya-ss/sd-scripts/pull/1939). Mar 20, 2025: From ede34702609c59ba2256cf7e330bad67ce9c77d3 Mon Sep 17 00:00:00 2001 From: Lex Song Date: Wed, 2 Apr 2025 03:28:58 +0800 Subject: [PATCH 426/582] Ensure all size parameters are integers to prevent type errors --- library/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/library/utils.py b/library/utils.py index 0f535a87b..767de472c 100644 --- a/library/utils.py +++ b/library/utils.py @@ -413,6 +413,13 @@ def resize_image(image: np.ndarray, width: int, height: int, resized_width: int, Returns: image """ + + # Ensure all size parameters are actual integers + width = int(width) + height = int(height) + resized_width = int(resized_width) + resized_height = int(resized_height) + if resize_interpolation is None: resize_interpolation = "lanczos" if width > resized_width and height > resized_height else "area" From b822b7e60b84a4fb32a8e1ffa966054f8fe96209 Mon Sep 17 00:00:00 2001 From: Lex Song Date: Wed, 2 Apr 2025 03:32:36 +0800 Subject: [PATCH 427/582] Fix the interpolation logic error in resize_image() The original code had a mistake. It used 'lanczos' when the image got smaller (width > resized_width and height > resized_height) and 'area' when it stayed the same or got bigger. This was the wrong way. 'area' is better for big shrinking. --- library/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/library/utils.py b/library/utils.py index 767de472c..d0586b84a 100644 --- a/library/utils.py +++ b/library/utils.py @@ -421,8 +421,11 @@ def resize_image(image: np.ndarray, width: int, height: int, resized_width: int, resized_height = int(resized_height) if resize_interpolation is None: - resize_interpolation = "lanczos" if width > resized_width and height > resized_height else "area" - + if width >= resized_width and height >= resized_height: + resize_interpolation = "area" + else: + resize_interpolation = "lanczos" + # we use PIL for lanczos (for backward compatibility) and box, cv2 for others use_pil = resize_interpolation in ["lanczos", "lanczos4", "box"] From f1423a72298a12110192f59cebe26b39206268e5 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 3 Apr 2025 21:48:51 +0900 Subject: [PATCH 428/582] fix: add resize_interpolation parameter to FineTuningDataset constructor --- library/train_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 537990816..6c39f8d98 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2154,8 +2154,9 @@ def __init__( debug_dataset: bool, validation_seed: int, validation_split: float, + resize_interpolation: Optional[str], ) -> None: - super().__init__(resolution, network_multiplier, debug_dataset) + super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) self.batch_size = batch_size From fd36fd1aa91a5c24bea820fae245b1cea7ac2b44 Mon Sep 17 00:00:00 2001 From: Dave Lage Date: Thu, 3 Apr 2025 16:09:45 -0400 Subject: [PATCH 429/582] Fix resize PR link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4bc0c2b5b..ae417d056 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ The command to install PyTorch is as follows: Mar 30, 2025: - LoRA-GGPO is added for FLUX.1 LoRA training. 
Thank you to rockerBOO for PR [#1974](https://github.com/kohya-ss/sd-scripts/pull/1974). - Specify `--network_args ggpo_sigma=0.03 ggpo_beta=0.01` in the command line or `network_args = ["ggpo_sigma=0.03", "ggpo_beta=0.01"]` in .toml file. See PR for details. -- The interpolation method for resizing the original image to the training size can now be specified. Thank you to rockerBOO for PR [#1939](https://github.com/kohya-ss/sd-scripts/pull/1939). +- The interpolation method for resizing the original image to the training size can now be specified. Thank you to rockerBOO for PR [#1936](https://github.com/kohya-ss/sd-scripts/pull/1936). Mar 20, 2025: - `pytorch-optimizer` is added to requirements.txt. Thank you to gesen2egee for PR [#1985](https://github.com/kohya-ss/sd-scripts/pull/1985). From 00e12eed657423c6e0c86a4b2134cb04aceac42c Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Sun, 6 Apr 2025 16:09:29 +0800 Subject: [PATCH 430/582] update for lost change --- library/flux_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/flux_models.py b/library/flux_models.py index b00bdae23..a945a1cbd 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -977,10 +977,10 @@ def enable_block_swap(self, num_blocks: int, device: torch.device): ) self.offloader_double = custom_offloading_utils.ModelOffloader( - self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device # , debug=True + self.double_blocks, double_blocks_to_swap, device # , debug=True ) self.offloader_single = custom_offloading_utils.ModelOffloader( - self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device # , debug=True + self.single_blocks, single_blocks_to_swap, device # , debug=True ) print( f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." 
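Editor's note: to summarize the `resize_interpolation` patches above in one place — when no interpolation is specified, downscaling now defaults to `area` (OpenCV) and upscaling to `lanczos` (PIL), and an explicit `lanczos` or `box` always goes through PIL for backward compatibility. The sketch below is a simplified illustration of that selection logic, not the repository's exact `resize_image`; the helper name and the reduced set of OpenCV modes are assumptions.

```python
from typing import Optional

import cv2
import numpy as np
from PIL import Image

def resize_image_sketch(image: np.ndarray, resized_width: int, resized_height: int,
                        resize_interpolation: Optional[str] = None) -> np.ndarray:
    height, width = image.shape[:2]
    if resize_interpolation is None:
        # area for shrinking, lanczos for enlarging (the corrected default)
        shrinking = width >= resized_width and height >= resized_height
        resize_interpolation = "area" if shrinking else "lanczos"

    if resize_interpolation in ("lanczos", "box"):
        # PIL path, kept for backward compatibility with earlier behavior
        resample = Image.Resampling.LANCZOS if resize_interpolation == "lanczos" else Image.Resampling.BOX
        resized = Image.fromarray(image).resize((resized_width, resized_height), resample=resample)
        return np.array(resized)

    # OpenCV path for the remaining modes (only a subset shown here)
    cv2_modes = {"area": cv2.INTER_AREA, "bilinear": cv2.INTER_LINEAR,
                 "bicubic": cv2.INTER_CUBIC, "nearest": cv2.INTER_NEAREST}
    return cv2.resize(image, (resized_width, resized_height),
                      interpolation=cv2_modes.get(resize_interpolation, cv2.INTER_AREA))
```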
From 7f93e21f30a0964fd6bdbe5a84d8d6af6d2f4081 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Sun, 6 Apr 2025 16:21:48 +0800 Subject: [PATCH 431/582] fix typo --- library/train_util.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 43a8a0fe9..ba6e4cb9b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -434,7 +434,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, - system_prompt: Optional[str] = None + system_prompt: Optional[str] = None, resize_interpolation: Optional[str] = None, ) -> None: self.image_dir = image_dir @@ -500,7 +500,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, - system_prompt: Optional[str] = None + system_prompt: Optional[str] = None, resize_interpolation: Optional[str] = None, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -529,7 +529,7 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, - system_prompt=system_prompt + system_prompt=system_prompt, resize_interpolation=resize_interpolation, ) @@ -573,7 +573,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, - system_prompt: Optional[str] = None + system_prompt: Optional[str] = None, resize_interpolation: Optional[str] = None, ) -> None: assert metadata_file is not None, "metadata_file must be specified / metadata_fileは指定が必須です" @@ -602,7 +602,7 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, - system_prompt=system_prompt + system_prompt=system_prompt, resize_interpolation=resize_interpolation, ) @@ -642,7 +642,7 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, - system_prompt: Optional[str] = None + system_prompt: Optional[str] = None, resize_interpolation: Optional[str] = None, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -671,7 +671,7 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, - system_prompt=system_prompt + system_prompt=system_prompt, resize_interpolation=resize_interpolation, ) From 4589262f8f35914b592992f4144bde5e746a6e36 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 6 Apr 2025 21:34:27 +0900 Subject: [PATCH 432/582] README.md: Update recent updates section to include IP noise gamma feature for FLUX.1 --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index ae417d056..2e80a6974 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Apr 6, 2025: +- IP noise gamma has been enabled in FLUX.1. Thanks to rockerBOO for PR [#1992](https://github.com/kohya-ss/sd-scripts/pull/1992). See the PR for details. + - `--ip_noise_gamma` and `--ip_noise_gamma_random_strength` are available. + Mar 30, 2025: - LoRA-GGPO is added for FLUX.1 LoRA training. Thank you to rockerBOO for PR [#1974](https://github.com/kohya-ss/sd-scripts/pull/1974). 
- Specify `--network_args ggpo_sigma=0.03 ggpo_beta=0.01` in the command line or `network_args = ["ggpo_sigma=0.03", "ggpo_beta=0.01"]` in .toml file. See PR for details. From 8f5a2eba3db83b6651fb99745368551132782816 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 11 Apr 2025 08:07:24 +0900 Subject: [PATCH 433/582] Add documentation for LoRA training scripts for SD1/2, SDXL, FLUX.1 and SD3/3.5 models --- docs/flux_train_network.md | 127 +++++++++++++++ docs/sd3_train_network.md | 122 ++++++++++++++ docs/sdxl_train_network.md | 160 +++++++++++++++++++ docs/train_network.md | 314 +++++++++++++++++++++++++++++++++++++ 4 files changed, 723 insertions(+) create mode 100644 docs/flux_train_network.md create mode 100644 docs/sd3_train_network.md create mode 100644 docs/sdxl_train_network.md create mode 100644 docs/train_network.md diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md new file mode 100644 index 000000000..d28d58778 --- /dev/null +++ b/docs/flux_train_network.md @@ -0,0 +1,127 @@ +# `flux_train_network.py` を用いたFLUX.1モデルのLoRA学習ガイド + +このドキュメントでは、`sd-scripts`リポジトリに含まれる`flux_train_network.py`を使用して、FLUX.1モデルに対するLoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 + +## 1. はじめに + +`flux_train_network.py`は、FLUX.1モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。FLUX.1はStable Diffusionとは異なるアーキテクチャを持つ画像生成モデルであり、このスクリプトを使用することで、特定のキャラクターや画風を再現するLoRAモデルを作成できます。 + +このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象とし、`train_network.py`での学習経験があることを前提としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](how_to_use_train_network.md)を参照してください。 + +**前提条件:** + +* `sd-scripts`リポジトリのクローンとPython環境のセットアップが完了していること。 +* 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](link/to/dataset/config/doc)を参照してください) + +## 2. `train_network.py` との違い + +`flux_train_network.py`は`train_network.py`をベースに、FLUX.1モデルに対応するための変更が加えられています。主な違いは以下の通りです。 + +* **対象モデル:** FLUX.1モデル(dev版またはschnell版)を対象とします。 +* **モデル構造:** Stable Diffusionとは異なり、FLUX.1はTransformerベースのアーキテクチャを持ちます。Text EncoderとしてCLIP-LとT5-XXLの二つを使用し、VAEの代わりに専用のAutoEncoder (AE) を使用します。 +* **必須の引数:** FLUX.1モデル、CLIP-L、T5-XXL、AEの各モデルファイルを指定する引数が追加されています。 +* **一部引数の非互換性:** Stable Diffusion向けの引数の一部(例: `--v2`, `--clip_skip`, `--max_token_length`)はFLUX.1の学習では使用されません。 +* **FLUX.1特有の引数:** タイムステップのサンプリング方法やガイダンススケールなど、FLUX.1特有の学習パラメータを指定する引数が追加されています。 + +## 3. 準備 + +学習を開始する前に、以下のファイルが必要です。 + +1. **学習スクリプト:** `flux_train_network.py` +2. **FLUX.1モデルファイル:** 学習のベースとなるFLUX.1モデルの`.safetensors`ファイル(例: `flux1-dev.safetensors`)。 +3. **Text Encoderモデルファイル:** + * CLIP-Lモデルの`.safetensors`ファイル。 + * T5-XXLモデルの`.safetensors`ファイル。 +4. **AutoEncoderモデルファイル:** FLUX.1に対応するAEモデルの`.safetensors`ファイル。 +5. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。 + + * 例として`my_flux_dataset_config.toml`を使用します。 + +## 4. 
学習の実行 + +学習は、ターミナルから`flux_train_network.py`を実行することで開始します。基本的なコマンドラインの構造は`train_network.py`と同様ですが、FLUX.1特有の引数を指定する必要があります。 + +以下に、基本的なコマンドライン実行例を示します。 + +```bash +accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py + --pretrained_model_name_or_path="" + --clip_l="" + --t5xxl="" + --ae="" + --dataset_config="my_flux_dataset_config.toml" + --output_dir="" + --output_name="my_flux_lora" + --save_model_as=safetensors + --network_module=networks.lora + --network_dim=16 + --network_alpha=1 + --learning_rate=1e-4 + --optimizer_type="AdamW8bit" + --lr_scheduler="constant" + --sdpa + --max_train_epochs=10 + --save_every_n_epochs=1 + --mixed_precision="fp16" + --gradient_checkpointing + --apply_t5_attn_mask + --blocks_to_swap=18 +``` + +※実際には1行で書くか、適切な改行文字(`\` または `^`)を使用してください。 + +### 4.1. 主要なコマンドライン引数の解説(`train_network.py`からの追加・変更点) + +[`train_network.py`のガイド](how_to_use_train_network.md)で説明されている引数に加え、以下のFLUX.1特有の引数を指定します。共通の引数(`--output_dir`, `--output_name`, `--network_module`, `--network_dim`, `--network_alpha`, `--learning_rate`など)については、上記ガイドを参照してください。 + +#### モデル関連 [必須] + +* `--pretrained_model_name_or_path=""` **[必須]** + * 学習のベースとなるFLUX.1モデル(dev版またはschnell版)の`.safetensors`ファイルのパスを指定します。Diffusers形式のディレクトリは現在サポートされていません。 +* `--clip_l=""` **[必須]** + * CLIP-L Text Encoderモデルの`.safetensors`ファイルのパスを指定します。 +* `--t5xxl=""` **[必須]** + * T5-XXL Text Encoderモデルの`.safetensors`ファイルのパスを指定します。 +* `--ae=""` **[必須]** + * FLUX.1に対応するAutoEncoderモデルの`.safetensors`ファイルのパスを指定します。 + +#### FLUX.1 学習パラメータ + +* `--t5xxl_max_token_length=` + * T5-XXL Text Encoderで使用するトークンの最大長を指定します。省略した場合、モデルがschnell版なら256、dev版なら512が自動的に設定されます。データセットのキャプション長に合わせて調整が必要な場合があります。 +* `--apply_t5_attn_mask` + * T5-XXLの出力とFLUXモデル内部(Double Block)のアテンション計算時に、パディングトークンに対応するアテンションマスクを適用します。精度向上が期待できる場合がありますが、わずかに計算コストが増加します。 +* `--guidance_scale=` + * FLUX.1 dev版は特定のガイダンススケール値で蒸留されているため、学習時にもその値を指定します。デフォルトは`3.5`です。schnell版では通常無視されます。 +* `--timestep_sampling=` + * 学習時に使用するタイムステップ(ノイズレベル)のサンプリング方法を指定します。`sigma`, `uniform`, `sigmoid`, `shift`, `flux_shift` から選択します。デフォルトは `sigma` です。 +* `--sigmoid_scale=` + * `timestep_sampling` に `sigmoid` または `shift`, `flux_shift` を指定した場合のスケール係数です。デフォルトは`1.0`です。 +* `--model_prediction_type=` + * モデルが何を予測するかを指定します。`raw` (予測値をそのまま使用), `additive` (ノイズ入力に加算), `sigma_scaled` (シグマスケーリングを適用) から選択します。デフォルトは `sigma_scaled` です。 +* `--discrete_flow_shift=` + * Flow Matchingで使用されるスケジューラのシフト値を指定します。デフォルトは`3.0`です。 + +#### メモリ・速度関連 + +* `--blocks_to_swap=` **[実験的機能]** + * VRAM使用量を削減するために、モデルの一部(Transformerブロック)をCPUとGPU間でスワップする設定です。スワップするブロック数を整数で指定します(例: `18`)。値を大きくするとVRAM使用量は減りますが、学習速度は低下します。GPUのVRAM容量に応じて調整してください。`gradient_checkpointing`と併用可能です。 + * `--cpu_offload_checkpointing`とは併用できません。 + +#### 非互換・非推奨の引数 + +* `--v2`, `--v_parameterization`, `--clip_skip`: Stable Diffusion特有の引数のため、FLUX.1学習では使用されません。 +* `--max_token_length`: Stable Diffusion v1/v2向けの引数です。FLUX.1では`--t5xxl_max_token_length`を使用してください。 +* `--split_mode`: 非推奨の引数です。代わりに`--blocks_to_swap`を使用してください。 + +### 4.2. 学習の開始 + +必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](how_to_use_train_network.md#32-starting-the-training--学習の開始)と同様です。 + +## 5. 学習済みモデルの利用 + +学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_flux_lora.safetensors`)が保存されます。このファイルは、FLUX.1モデルに対応した推論環境(例: ComfyUI + ComfyUI-FluxNodes)で使用できます。 + +## 6. 
その他 + +`flux_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](how_to_use_train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python flux_train_network.py --help`) を参照してください。 diff --git a/docs/sd3_train_network.md b/docs/sd3_train_network.md new file mode 100644 index 000000000..d5cc5a75a --- /dev/null +++ b/docs/sd3_train_network.md @@ -0,0 +1,122 @@ +# `sd3_train_network.py` を用いたStable Diffusion 3/3.5モデルのLoRA学習ガイド + +このドキュメントでは、`sd-scripts`リポジトリに含まれる`sd3_train_network.py`を使用して、Stable Diffusion 3 (SD3) および Stable Diffusion 3.5 (SD3.5) モデルに対するLoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 + +## 1. はじめに + +`sd3_train_network.py`は、Stable Diffusion 3/3.5モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。SD3は、MMDiT (Multi-Modal Diffusion Transformer) と呼ばれる新しいアーキテクチャを採用しており、従来のStable Diffusionモデルとは構造が異なります。このスクリプトを使用することで、SD3/3.5モデルに特化したLoRAモデルを作成できます。 + +このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象とし、`train_network.py`での学習経験があることを前提としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](how_to_use_train_network.md)を参照してください。 + +**前提条件:** + +* `sd-scripts`リポジトリのクローンとPython環境のセットアップが完了していること。 +* 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](link/to/dataset/config/doc)を参照してください) +* 学習対象のSD3/3.5モデルファイルが準備できていること。 + +## 2. `train_network.py` との違い + +`sd3_train_network.py`は`train_network.py`をベースに、SD3/3.5モデルに対応するための変更が加えられています。主な違いは以下の通りです。 + +* **対象モデル:** Stable Diffusion 3 Medium / Large (3.0 / 3.5) モデルを対象とします。 +* **モデル構造:** U-Netの代わりにMMDiT (Transformerベース) を使用します。Text EncoderとしてCLIP-L, CLIP-G, T5-XXLの三つを使用します。VAEはSDXLと互換性がありますが、入力のスケール処理が異なります。 +* **引数:** SD3/3.5モデル、Text Encoder群、VAEを指定する引数があります。ただし、単一ファイルの`.safetensors`形式であれば、内部で自動的に分離されるため、個別のパス指定は必須ではありません。 +* **一部引数の非互換性:** Stable Diffusion v1/v2向けの引数(例: `--v2`, `--v_parameterization`, `--clip_skip`)はSD3/3.5の学習では使用されません。 +* **SD3特有の引数:** Text Encoderのアテンションマスクやドロップアウト率、Positional Embeddingの調整(SD3.5向け)、タイムステップのサンプリングや損失の重み付けに関する引数が追加されています。 + +## 3. 準備 + +学習を開始する前に、以下のファイルが必要です。 + +1. **学習スクリプト:** `sd3_train_network.py` +2. **SD3/3.5モデルファイル:** 学習のベースとなるSD3/3.5モデルの`.safetensors`ファイル。単一ファイル形式(Diffusers/ComfyUI/AUTOMATIC1111形式)を推奨します。 + * Text EncoderやVAEが別ファイルになっている場合は、対応する引数でパスを指定します。 +3. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。 + * 例として`my_sd3_dataset_config.toml`を使用します。 + +## 4. 学習の実行 + +学習は、ターミナルから`sd3_train_network.py`を実行することで開始します。基本的なコマンドラインの構造は`train_network.py`と同様ですが、SD3/3.5特有の引数を指定する必要があります。 + +以下に、基本的なコマンドライン実行例を示します。 + +```bash +accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py + --pretrained_model_name_or_path="" + --dataset_config="my_sd3_dataset_config.toml" + --output_dir="" + --output_name="my_sd3_lora" + --save_model_as=safetensors + --network_module=networks.lora + --network_dim=16 + --network_alpha=1 + --learning_rate=1e-4 + --optimizer_type="AdamW8bit" + --lr_scheduler="constant" + --sdpa + --max_train_epochs=10 + --save_every_n_epochs=1 + --mixed_precision="fp16" + --gradient_checkpointing + --apply_lg_attn_mask + --apply_t5_attn_mask + --weighting_scheme="sigma_sqrt" + --blocks_to_swap=32 +``` + +※実際には1行で書くか、適切な改行文字(`\` または `^`)を使用してください。 + +### 4.1. 
主要なコマンドライン引数の解説(`train_network.py`からの追加・変更点) + +[`train_network.py`のガイド](how_to_use_train_network.md)で説明されている引数に加え、以下のSD3/3.5特有の引数を指定します。共通の引数(`--output_dir`, `--output_name`, `--network_module`, `--network_dim`, `--network_alpha`, `--learning_rate`など)については、上記ガイドを参照してください。 + +#### モデル関連 + +* `--pretrained_model_name_or_path=""` **[必須]** + * 学習のベースとなるSD3/3.5モデルの`.safetensors`ファイルのパスを指定します。単一ファイル形式(Diffusers/ComfyUI/AUTOMATIC1111形式)を想定しています。 +* `--clip_l`, `--clip_g`, `--t5xxl`, `--vae`: + * ベースモデルが単一ファイル形式の場合、通常これらの指定は不要です(自動的にモデル内部から読み込まれます)。 + * もしText EncoderやVAEが別ファイルとして提供されている場合は、それぞれの`.safetensors`ファイルのパスを指定します。 + +#### SD3/3.5 学習パラメータ + +* `--t5xxl_max_token_length=` + * T5-XXL Text Encoderで使用するトークンの最大長を指定します。SD3のデフォルトは`256`です。データセットのキャプション長に合わせて調整が必要な場合があります。 +* `--apply_lg_attn_mask` + * CLIP-LおよびCLIP-Gの出力に対して、パディングトークンに対応するアテンションマスク(ゼロ埋め)を適用します。 +* `--apply_t5_attn_mask` + * T5-XXLの出力に対して、パディングトークンに対応するアテンションマスク(ゼロ埋め)を適用します。 +* `--clip_l_dropout_rate`, `--clip_g_dropout_rate`, `--t5_dropout_rate`: + * 各Text Encoderの出力に対して、指定した確率でドロップアウト(出力をゼロにする)を適用します。過学習の抑制に役立つ場合があります。デフォルトは`0.0`(ドロップアウトなし)です。 +* `--pos_emb_random_crop_rate=` **[SD3.5向け]** + * MMDiTのPositional Embeddingに対してランダムクロップを適用する確率を指定します。SD3 5M (3.5) モデルで学習された機能であり、他のモデルでの効果は限定的です。デフォルトは`0.0`です。 +* `--enable_scaled_pos_embed` **[SD3.5向け]** + * マルチ解像度学習時に、解像度に応じてPositional Embeddingをスケーリングします。SD3 5M (3.5) モデルで学習された機能であり、他のモデルでの効果は限定的です。 +* `--training_shift=` + * 学習時のタイムステップ(ノイズレベル)の分布を調整するためのシフト値です。`weighting_scheme`に加えて適用されます。`1.0`より大きい値はノイズの大きい(構造寄り)領域を、小さい値はノイズの小さい(詳細寄り)領域を重視する傾向になります。デフォルトは`1.0`です。 +* `--weighting_scheme=` + * 損失計算時のタイムステップ(ノイズレベル)に応じた重み付け方法を指定します。`sigma_sqrt`, `logit_normal`, `mode`, `cosmap`, `uniform` (または`none`) から選択します。SD3の論文では`sigma_sqrt`が使用されています。デフォルトは`uniform`です。 +* `--logit_mean`, `--logit_std`, `--mode_scale`: + * `weighting_scheme`で`logit_normal`または`mode`を選択した場合に、その分布を制御するためのパラメータです。通常はデフォルト値で問題ありません。 + +#### メモリ・速度関連 + +* `--blocks_to_swap=` **[実験的機能]** + * VRAM使用量を削減するために、モデルの一部(MMDiTのTransformerブロック)をCPUとGPU間でスワップする設定です。スワップするブロック数を整数で指定します(例: `32`)。値を大きくするとVRAM使用量は減りますが、学習速度は低下します。GPUのVRAM容量に応じて調整してください。`gradient_checkpointing`と併用可能です。 + * `--cpu_offload_checkpointing`とは併用できません。 + +#### 非互換・非推奨の引数 + +* `--v2`, `--v_parameterization`, `--clip_skip`: Stable Diffusion v1/v2特有の引数のため、SD3/3.5学習では使用されません。 + +### 4.2. 学習の開始 + +必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](how_to_use_train_network.md#32-starting-the-training--学習の開始)と同様です。 + +## 5. 学習済みモデルの利用 + +学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_sd3_lora.safetensors`)が保存されます。このファイルは、SD3/3.5モデルに対応した推論環境(例: ComfyUIなど)で使用できます。 + +## 6. その他 + +`sd3_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](how_to_use_train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python sd3_train_network.py --help`) を参照してください。 diff --git a/docs/sdxl_train_network.md b/docs/sdxl_train_network.md new file mode 100644 index 000000000..8a19f7aed --- /dev/null +++ b/docs/sdxl_train_network.md @@ -0,0 +1,160 @@ +はい、承知いたしました。`sd-scripts` リポジトリに含まれる `sdxl_train_network.py` を使用した SDXL LoRA 学習に関するドキュメントを作成します。`how_to_use_train_network.md` との差分を中心に、初心者ユーザー向けに解説します。 + +--- + +# SDXL LoRA学習スクリプト `sdxl_train_network.py` の使い方 + +このドキュメントでは、`sd-scripts` リポジトリに含まれる `sdxl_train_network.py` を使用して、SDXL (Stable Diffusion XL) モデルに対する LoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 + +## 1. 
はじめに + +`sdxl_train_network.py` は、SDXL モデルに対して LoRA などの追加ネットワークを学習させるためのスクリプトです。基本的な使い方は `train_network.py` ([LoRA学習スクリプト `train_network.py` の使い方](how_to_use_train_network.md) 参照) と共通ですが、SDXL モデル特有の設定が必要となります。 + +このガイドでは、SDXL LoRA 学習に焦点を当て、`train_network.py` との主な違いや SDXL 特有の設定項目を中心に説明します。 + +**前提条件:** + +* `sd-scripts` リポジトリのクローンと Python 環境のセットアップが完了していること。 +* 学習用データセットの準備が完了していること。(データセットの準備については[データセット準備ガイド](link/to/dataset/doc)を参照してください) +* [LoRA学習スクリプト `train_network.py` の使い方](how_to_use_train_network.md) を一読していること。 + +## 2. 準備 + +学習を開始する前に、以下のファイルが必要です。 + +1. **学習スクリプト:** `sdxl_train_network.py` +2. **データセット定義ファイル (.toml):** 学習データセットの設定を記述した TOML 形式のファイル。 + +### データセット定義ファイルについて + +データセット定義ファイル (`.toml`) の基本的な書き方は `train_network.py` と共通です。[データセット設定ガイド](link/to/dataset/config/doc) および [LoRA学習スクリプト `train_network.py` の使い方](how_to_use_train_network.md#データセット定義ファイルについて) を参照してください。 + +SDXL では、高解像度のデータセットや、アスペクト比バケツ機能 (`enable_bucket = true`) の利用が一般的です。 + +ここでは、例として `my_sdxl_dataset_config.toml` という名前のファイルを使用することにします。 + +## 3. 学習の実行 + +学習は、ターミナルから `sdxl_train_network.py` を実行することで開始します。 + +以下に、SDXL LoRA 学習における基本的なコマンドライン実行例を示します。 + +```bash +accelerate launch --num_cpu_threads_per_process 1 sdxl_train_network.py + --pretrained_model_name_or_path="" + --dataset_config="my_sdxl_dataset_config.toml" + --output_dir="<学習結果の出力先ディレクトリ>" + --output_name="my_sdxl_lora" + --save_model_as=safetensors + --network_module=networks.lora + --network_dim=32 + --network_alpha=16 + --learning_rate=1e-4 + --unet_lr=1e-4 + --text_encoder_lr1=1e-5 + --text_encoder_lr2=1e-5 + --optimizer_type="AdamW8bit" + --lr_scheduler="constant" + --max_train_epochs=10 + --save_every_n_epochs=1 + --mixed_precision="bf16" + --gradient_checkpointing + --cache_text_encoder_outputs + --cache_latents +``` + +`train_network.py` の実行例と比較すると、以下の点が異なります。 + +* 実行するスクリプトが `sdxl_train_network.py` になります。 +* `--pretrained_model_name_or_path` には SDXL のベースモデルを指定します。 +* `--text_encoder_lr` が `--text_encoder_lr1` と `--text_encoder_lr2` に分かれています(SDXL は2つの Text Encoder を持つため)。 +* `--mixed_precision` は `bf16` または `fp16` が推奨されます。 +* `--cache_text_encoder_outputs` や `--cache_latents` は VRAM 使用量を削減するために推奨されます。 + +次に、`train_network.py` との差分となる主要なコマンドライン引数について解説します。共通の引数については、[LoRA学習スクリプト `train_network.py` の使い方](how_to_use_train_network.md#31-主要なコマンドライン引数) を参照してください。 + +### 3.1. 
主要なコマンドライン引数(差分) + +#### モデル関連 + +* `--pretrained_model_name_or_path="<モデルのパス>"` **[必須]** + * 学習のベースとなる **SDXL モデル**を指定します。Hugging Face Hub のモデル ID (例: `"stabilityai/stable-diffusion-xl-base-1.0"`) や、ローカルの Diffusers 形式モデルのディレクトリ、`.safetensors` ファイルのパスを指定できます。 +* `--v2`, `--v_parameterization` + * これらの引数は SD1.x/2.x 用です。`sdxl_train_network.py` を使用する場合、SDXL モデルであることが前提となるため、通常は**指定する必要はありません**。 + +#### データセット関連 + +* `--dataset_config="<設定ファイルのパス>"` + * `train_network.py` と共通です。 + * SDXL では高解像度データやバケツ機能 (`.toml` で `enable_bucket = true` を指定) の利用が一般的です。 + +#### 出力・保存関連 + +* `train_network.py` と共通です。 + +#### LoRA パラメータ + +* `train_network.py` と共通です。 + +#### 学習パラメータ + +* `--learning_rate=1e-4` + * 全体の学習率。`unet_lr`, `text_encoder_lr1`, `text_encoder_lr2` が指定されない場合のデフォルト値となります。 +* `--unet_lr=1e-4` + * U-Net 部分の LoRA モジュールに対する学習率。指定しない場合は `--learning_rate` の値が使用されます。 +* `--text_encoder_lr1=1e-5` + * **Text Encoder 1 (OpenCLIP ViT-G/14) の LoRA モジュール**に対する学習率。指定しない場合は `--learning_rate` の値が使用されます。U-Net より小さめの値が推奨されます。 +* `--text_encoder_lr2=1e-5` + * **Text Encoder 2 (CLIP ViT-L/14) の LoRA モジュール**に対する学習率。指定しない場合は `--learning_rate` の値が使用されます。U-Net より小さめの値が推奨されます。 +* `--optimizer_type="AdamW8bit"` + * `train_network.py` と共通です。 +* `--lr_scheduler="constant"` + * `train_network.py` と共通です。 +* `--lr_warmup_steps` + * `train_network.py` と共通です。 +* `--max_train_steps`, `--max_train_epochs` + * `train_network.py` と共通です。 +* `--mixed_precision="bf16"` + * 混合精度学習の設定。SDXL では `bf16` または `fp16` の使用が推奨されます。GPU が対応している方を選択してください。VRAM 使用量を削減し、学習速度を向上させます。 +* `--gradient_accumulation_steps=1` + * `train_network.py` と共通です。 +* `--gradient_checkpointing` + * `train_network.py` と共通です。SDXL はメモリ消費が大きいため、有効にすることが推奨されます。 +* `--cache_latents` + * VAE の出力をメモリ(または `--cache_latents_to_disk` 指定時はディスク)にキャッシュします。VAE の計算を省略できるため、VRAM 使用量を削減し、学習を高速化できます。画像に対する Augmentation (`--color_aug`, `--flip_aug`, `--random_crop` 等) が無効になります。SDXL 学習では推奨されるオプションです。 +* `--cache_latents_to_disk` + * `--cache_latents` と併用し、キャッシュ先をディスクにします。データセットを最初に読み込む際に、VAE の出力をディスクにキャッシュします。二回目以降の学習で VAE の計算を省略できるため、学習データの枚数が多い場合に推奨されます。 +* `--cache_text_encoder_outputs` + * Text Encoder の出力をメモリ(または `--cache_text_encoder_outputs_to_disk` 指定時はディスク)にキャッシュします。Text Encoder の計算を省略できるため、VRAM 使用量を削減し、学習を高速化できます。キャプションに対する Augmentation (`--shuffle_caption`, `--caption_dropout_rate` 等) が無効になります。 + * **注意:** このオプションを使用する場合、Text Encoder の LoRA モジュールは学習できません (`--network_train_unet_only` の指定が必須です)。 +* `--cache_text_encoder_outputs_to_disk` + * `--cache_text_encoder_outputs` と併用し、キャッシュ先をディスクにします。 +* `--no_half_vae` + * 混合精度 (`fp16`/`bf16`) 使用時でも VAE を `float32` で動作させます。SDXL の VAE は `float16` で不安定になることがあるため、`fp16` 指定時には有効にしてくだ +* `--clip_skip` + * SDXL では通常使用しません。指定は不要です。 +* `--fused_backward_pass` + * 勾配計算とオプティマイザのステップを融合し、VRAM使用量を削減します。SDXLで利用可能です。(現在 `Adafactor` オプティマイザのみ対応) + +#### その他 + +* `--seed`, `--logging_dir`, `--log_prefix` などは `train_network.py` と共通です。 + +### 3.2. 学習の開始 + +必要な引数を設定し、コマンドを実行すると学習が開始されます。学習の進行状況はコンソールに出力されます。基本的な流れは `train_network.py` と同じです。 + +## 4. 学習済みモデルの利用 + +学習が完了すると、`output_dir` で指定したディレクトリに、`output_name` で指定した名前の LoRA モデルファイル (`.safetensors` など) が保存されます。 + +このファイルは、AUTOMATIC1111/stable-diffusion-webui 、ComfyUI などの SDXL に対応した GUI ツールで利用できます。 + +## 5. 
補足: `train_network.py` との主な違い + +* **対象モデル:** `sdxl_train_network.py` は SDXL モデル専用です。 +* **Text Encoder:** SDXL は 2 つの Text Encoder を持つため、学習率の指定 (`--text_encoder_lr1`, `--text_encoder_lr2`) などが異なります。 +* **キャッシュ機能:** `--cache_text_encoder_outputs` は SDXL で特に効果が高く、推奨されます。 +* **推奨設定:** VRAM 使用量が大きいため、`bf16` または `fp16` の混合精度、`gradient_checkpointing`、キャッシュ機能 (`--cache_latents`, `--cache_text_encoder_outputs`) の利用が推奨されます。`fp16` 指定時は、VAE は `--no_half_vae` で `float32` 動作を推奨します。 + +その他の詳細なオプションについては、スクリプトのヘルプ (`python sdxl_train_network.py --help`) やリポジトリ内の他のドキュメントを参照してください。 \ No newline at end of file diff --git a/docs/train_network.md b/docs/train_network.md new file mode 100644 index 000000000..06c08a424 --- /dev/null +++ b/docs/train_network.md @@ -0,0 +1,314 @@ +# How to use the LoRA training script `train_network.py` / LoRA学習スクリプト `train_network.py` の使い方 + +This document explains the basic procedures for training LoRA (Low-Rank Adaptation) models using `train_network.py` included in the `sd-scripts` repository. + +
+日本語 +このドキュメントでは、`sd-scripts` リポジトリに含まれる `train_network.py` を使用して LoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 +
+ +## 1. Introduction / はじめに + +`train_network.py` is a script for training additional networks such as LoRA on Stable Diffusion models (v1.x, v2.x). It allows for additional training on the original model with a low computational cost, enabling the creation of models that reproduce specific characters or art styles. + +This guide focuses on LoRA training and explains the basic configuration items. + +**Prerequisites:** + +* The `sd-scripts` repository has been cloned and the Python environment has been set up. +* The training dataset has been prepared. (For dataset preparation, please refer to [this guide](link/to/dataset/doc)) + +
+日本語 + +`train_network.py` は、Stable Diffusion モデル(v1.x, v2.x)に対して、LoRA などの追加ネットワークを学習させるためのスクリプトです。少ない計算コストで元のモデルに追加学習を行い、特定のキャラクターや画風を再現するモデルを作成できます。 + +このガイドでは、LoRA 学習に焦点を当て、基本的な設定項目を中心に説明します。 + +**前提条件:** + +* `sd-scripts` リポジトリのクローンと Python 環境のセットアップが完了していること。 +* 学習用データセットの準備が完了していること。(データセットの準備については[こちら](link/to/dataset/doc)を参照してください) +
+ +## 2. Preparation / 準備 + +Before starting training, you will need the following files: + +1. **Training script:** `train_network.py` +2. **Dataset definition file (.toml):** A file in TOML format that describes the configuration of the training dataset. + +### About the Dataset Definition File / データセット定義ファイルについて + +The dataset definition file (`.toml`) contains detailed settings such as the directory of images to use, repetition count, caption settings, resolution buckets (optional), etc. + +For more details on how to write the dataset definition file, please refer to the [Dataset Configuration Guide](link/to/dataset/config/doc). + +In this guide, we will use a file named `my_dataset_config.toml` as an example. + +
+日本語 + +学習を開始する前に、以下のファイルが必要です。 + +1. **学習スクリプト:** `train_network.py` +2. **データセット定義ファイル (.toml):** 学習データセットの設定を記述した TOML 形式のファイル。 + +**データセット定義ファイルについて** + +データセット定義ファイル (`.toml`) には、使用する画像のディレクトリ、繰り返し回数、キャプションの設定、解像度バケツ(任意)などの詳細な設定を記述します。 + +データセット定義ファイルの詳しい書き方については、[データセット設定ガイド](link/to/dataset/config/doc)を参照してください。 + +ここでは、例として `my_dataset_config.toml` という名前のファイルを使用することにします。 +
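As a quick sanity check of the dataset definition file, it can be loaded and summarized with Python's standard `tomllib` module before training is started. The sketch below is illustrative only: the keys it reads (`datasets`, `subsets`, `image_dir`, `num_repeats`, `resolution`, `batch_size`) follow the `[[datasets]]` / `[[datasets.subsets]]` structure described in the dataset configuration guide, and `my_dataset_config.toml` is simply the example filename used above.

```python
import tomllib  # standard library in Python 3.11+; the "tomli" package offers the same API for older versions

# Load the example dataset definition file used in this guide (assumed filename).
with open("my_dataset_config.toml", "rb") as f:
    config = tomllib.load(f)

# Summarize each dataset and its subsets so image directories and repeat counts
# can be checked before starting a long run.
for i, dataset in enumerate(config.get("datasets", [])):
    print(f"dataset {i}: resolution={dataset.get('resolution')}, batch_size={dataset.get('batch_size')}")
    for subset in dataset.get("subsets", []):
        print(f"  image_dir={subset.get('image_dir')}  num_repeats={subset.get('num_repeats', 1)}")
```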
+ +## 3. Running the Training / 学習の実行 + +Training is started by executing `train_network.py` from the terminal. When executing, various training settings are specified as command-line arguments. + +Below is a basic command-line execution example: + +```bash +accelerate launch --num_cpu_threads_per_process 1 train_network.py + --pretrained_model_name_or_path="" + --dataset_config="my_dataset_config.toml" + --output_dir="" + --output_name="my_lora" + --save_model_as=safetensors + --network_module=networks.lora + --network_dim=16 + --network_alpha=1 + --learning_rate=1e-4 + --optimizer_type="AdamW8bit" + --lr_scheduler="constant" + --sdpa + --max_train_epochs=10 + --save_every_n_epochs=1 + --mixed_precision="fp16" + --gradient_checkpointing +``` + +In reality, you need to write this in a single line, but it's shown with line breaks for readability (on Linux or Mac, you can add `\` at the end of each line to break lines). For Windows, either write it in a single line without breaks or add `^` at the end of each line. + +Next, we'll explain the main command-line arguments. + +
+日本語 + +学習は、ターミナルから `train_network.py` を実行することで開始します。実行時には、学習に関する様々な設定をコマンドライン引数として指定します。 + +以下に、基本的なコマンドライン実行例を示します。 + +実際には1行で書く必要がありますが、見やすさのために改行しています(Linux や Mac では `\` を行末に追加することで改行できます)。Windows の場合は、改行せずに1行で書くか、`^` を行末に追加してください。 + +次に、主要なコマンドライン引数について解説します。 +
+ +### 3.1. Main Command-Line Arguments / 主要なコマンドライン引数 + +#### Model Related / モデル関連 + +* `--pretrained_model_name_or_path=""` **[Required]** + * Specifies the Stable Diffusion model to be used as the base for training. You can specify the path to a local `.ckpt` or `.safetensors` file, or a directory containing a Diffusers format model. You can also specify a Hugging Face Hub model ID (e.g., `"stabilityai/stable-diffusion-2-1-base"`). +* `--v2` + * Specify this when the base model is Stable Diffusion v2.x. +* `--v_parameterization` + * Specify this when training with a v-prediction model (such as v2.x 768px models). + +#### Dataset Related / データセット関連 + +* `--dataset_config=""` + * Specifies the path to a `.toml` file describing the dataset configuration. (For details on dataset configuration, see [here](link/to/dataset/config/doc)) + * It's also possible to specify dataset settings from the command line, but using a `.toml` file is recommended as it becomes lengthy. + +#### Output and Save Related / 出力・保存関連 + +* `--output_dir=""` **[Required]** + * Specifies the directory where trained LoRA models, sample images, logs, etc. will be output. +* `--output_name=""` **[Required]** + * Specifies the filename of the trained LoRA model (excluding the extension). +* `--save_model_as="safetensors"` + * Specifies the format for saving the model. You can choose from `safetensors` (recommended), `ckpt`, or `pt`. The default is `safetensors`. +* `--save_every_n_epochs=1` + * Saves the model every specified number of epochs. If not specified, only the final model will be saved. +* `--save_every_n_steps=1000` + * Saves the model every specified number of steps. If both epoch and step saving are specified, both will be saved. + +#### LoRA Parameters / LoRA パラメータ + +* `--network_module=networks.lora` **[Required]** + * Specifies the type of network to train. For LoRA, specify `networks.lora`. +* `--network_dim=16` **[Required]** + * Specifies the rank (dimension) of LoRA. Higher values increase expressiveness but also increase file size and computational cost. Values between 4 and 128 are commonly used. There is no default (module dependent). +* `--network_alpha=1` + * Specifies the alpha value for LoRA. This parameter is related to learning rate scaling. It is generally recommended to set it to about half the value of `network_dim`, but it can also be the same value as `network_dim`. The default is 1. Setting it to the same value as `network_dim` will result in behavior similar to older versions. + +#### Training Parameters / 学習パラメータ + +* `--learning_rate=1e-4` + * Specifies the learning rate. For LoRA training (when alpha value is 1), relatively higher values (e.g., from `1e-4` to `1e-3`) are often used. +* `--unet_lr=1e-4` + * Used to specify a separate learning rate for the LoRA modules in the U-Net part. If not specified, the value of `--learning_rate` is used. +* `--text_encoder_lr=1e-5` + * Used to specify a separate learning rate for the LoRA modules in the Text Encoder part. If not specified, the value of `--learning_rate` is used. A smaller value than that for U-Net is recommended. +* `--optimizer_type="AdamW8bit"` + * Specifies the optimizer to use for training. Options include `AdamW8bit` (requires `bitsandbytes`), `AdamW`, `Lion` (requires `lion-pytorch`), `DAdaptation` (requires `dadaptation`), and `Adafactor`. `AdamW8bit` is memory-efficient and widely used. +* `--lr_scheduler="constant"` + * Specifies the learning rate scheduler. 
This is the method for changing the learning rate as training progresses. Options include `constant` (no change), `cosine` (cosine curve), `linear` (linear decay), `constant_with_warmup` (constant with warmup), and `cosine_with_restarts`. `constant`, `cosine`, and `constant_with_warmup` are commonly used. +* `--lr_warmup_steps=500` + * Specifies the number of warmup steps for the learning rate scheduler. This is the period during which the learning rate gradually increases at the start of training. Valid when the `lr_scheduler` supports warmup. +* `--max_train_steps=10000` + * Specifies the total number of training steps. If `max_train_epochs` is specified, that takes precedence. +* `--max_train_epochs=12` + * Specifies the number of training epochs. If this is specified, `max_train_steps` is ignored. +* `--sdpa` + * Uses Scaled Dot-Product Attention. This can reduce memory usage and improve training speed for LoRA training. +* `--mixed_precision="fp16"` + * Specifies the mixed precision training setting. Options are `no` (disabled), `fp16` (half precision), and `bf16` (bfloat16). If your GPU supports it, specifying `fp16` or `bf16` can improve training speed and reduce memory usage. +* `--gradient_accumulation_steps=1` + * Specifies the number of steps to accumulate gradients. This effectively increases the batch size to `train_batch_size * gradient_accumulation_steps`. Set a larger value if GPU memory is insufficient. Usually `1` is fine. + +#### Others / その他 + +* `--seed=42` + * Specifies the random seed. Set this if you want to ensure reproducibility of the training. +* `--logging_dir=""` + * Specifies the directory to output logs for TensorBoard, etc. If not specified, logs will not be output. +* `--log_prefix=""` + * Specifies the prefix for the subdirectory name created within `logging_dir`. +* `--gradient_checkpointing` + * Enables Gradient Checkpointing. This can significantly reduce memory usage but slightly decreases training speed. Useful when memory is limited. +* `--clip_skip=1` + * Specifies how many layers to skip from the last layer of the Text Encoder. Specifying `2` will use the output from the second-to-last layer. `None` or `1` means no skip (uses the last layer). Check the recommended value for the model you are training. + +
+日本語 + +#### モデル関連 + +* `--pretrained_model_name_or_path="<モデルのパス>"` **[必須]** + * 学習のベースとなる Stable Diffusion モデルを指定します。ローカルの `.ckpt` または `.safetensors` ファイルのパス、あるいは Diffusers 形式モデルのディレクトリを指定できます。Hugging Face Hub のモデル ID (例: `"stabilityai/stable-diffusion-2-1-base"`) も指定可能です。 +* `--v2` + * ベースモデルが Stable Diffusion v2.x の場合に指定します。 +* `--v_parameterization` + * v-prediction モデル(v2.x の 768px モデルなど)で学習する場合に指定します。 + +#### データセット関連 + +* `--dataset_config="<設定ファイルのパス>"` + * データセット設定を記述した `.toml` ファイルのパスを指定します。(データセット設定の詳細は[こちら](link/to/dataset/config/doc)) + * コマンドラインからデータセット設定を指定することも可能ですが、長くなるため `.toml` ファイルを使用することを推奨します。 + +#### 出力・保存関連 + +* `--output_dir="<出力先ディレクトリ>"` **[必須]** + * 学習済み LoRA モデルやサンプル画像、ログなどが出力されるディレクトリを指定します。 +* `--output_name="<出力ファイル名>"` **[必須]** + * 学習済み LoRA モデルのファイル名(拡張子を除く)を指定します。 +* `--save_model_as="safetensors"` + * モデルの保存形式を指定します。`safetensors` (推奨), `ckpt`, `pt` から選択できます。デフォルトは `safetensors` です。 +* `--save_every_n_epochs=1` + * 指定したエポックごとにモデルを保存します。省略するとエポックごとの保存は行われません(最終モデルのみ保存)。 +* `--save_every_n_steps=1000` + * 指定したステップごとにモデルを保存します。エポック指定 (`save_every_n_epochs`) と同時に指定された場合、両方とも保存されます。 + +#### LoRA パラメータ + +* `--network_module=networks.lora` **[必須]** + * 学習するネットワークの種別を指定します。LoRA の場合は `networks.lora` を指定します。 +* `--network_dim=16` **[必須]** + * LoRA のランク (rank / 次元数) を指定します。値が大きいほど表現力は増しますが、ファイルサイズと計算コストが増加します。一般的には 4〜128 程度の値が使われます。デフォルトは指定されていません(モジュール依存)。 +* `--network_alpha=1` + * LoRA のアルファ値 (alpha) を指定します。学習率のスケーリングに関係するパラメータで、一般的には `network_dim` の半分程度の値を指定することが推奨されますが、`network_dim` と同じ値を指定する場合もあります。デフォルトは 1 です。`network_dim` と同じ値に設定すると、旧バージョンと同様の挙動になります。 + +#### 学習パラメータ + +* `--learning_rate=1e-4` + * 学習率を指定します。LoRA 学習では(アルファ値が1の場合)比較的高めの値(例: `1e-4`から`1e-3`)が使われることが多いです。 +* `--unet_lr=1e-4` + * U-Net 部分の LoRA モジュールに対する学習率を個別に指定する場合に使用します。指定しない場合は `--learning_rate` の値が使用されます。 +* `--text_encoder_lr=1e-5` + * Text Encoder 部分の LoRA モジュールに対する学習率を個別に指定する場合に使用します。指定しない場合は `--learning_rate` の値が使用されます。U-Net よりも小さめの値が推奨されます。 +* `--optimizer_type="AdamW8bit"` + * 学習に使用するオプティマイザを指定します。`AdamW8bit` (要 `bitsandbytes`), `AdamW`, `Lion` (要 `lion-pytorch`), `DAdaptation` (要 `dadaptation`), `Adafactor` などが選択可能です。`AdamW8bit` はメモリ効率が良く、広く使われています。 +* `--lr_scheduler="constant"` + * 学習率スケジューラを指定します。学習の進行に合わせて学習率を変化させる方法です。`constant` (変化なし), `cosine` (コサインカーブ), `linear` (線形減衰), `constant_with_warmup` (ウォームアップ付き定数), `cosine_with_restarts` などが選択可能です。`constant`や`cosine` 、 `constant_with_warmup` がよく使われます。 +* `--lr_warmup_steps=500` + * 学習率スケジューラのウォームアップステップ数を指定します。学習開始時に学習率を徐々に上げていく期間です。`lr_scheduler` がウォームアップをサポートする場合に有効です。 +* `--max_train_steps=10000` + * 学習の総ステップ数を指定します。`max_train_epochs` が指定されている場合はそちらが優先されます。 +* `--max_train_epochs=12` + * 学習のエポック数を指定します。これを指定すると `max_train_steps` は無視されます。 +* `--sdpa` + * Scaled Dot-Product Attention を使用します。LoRA の学習において、メモリ使用量を削減し、学習速度を向上させることができます。 +* `--mixed_precision="fp16"` + * 混合精度学習の設定を指定します。`no` (無効), `fp16` (半精度), `bf16` (bfloat16) から選択できます。GPU が対応している場合は `fp16` または `bf16` を指定することで、学習速度の向上とメモリ使用量の削減が期待できます。 +* `--gradient_accumulation_steps=1` + * 勾配を累積するステップ数を指定します。実質的なバッチサイズを `train_batch_size * gradient_accumulation_steps` に増やす効果があります。GPU メモリが足りない場合に大きな値を設定します。通常は `1` で問題ありません。 + +#### その他 + +* `--seed=42` + * 乱数シードを指定します。学習の再現性を確保したい場合に設定します。 +* `--logging_dir="<ログディレクトリ>"` + * TensorBoard などのログを出力するディレクトリを指定します。指定しない場合、ログは出力されません。 +* `--log_prefix="<プレフィックス>"` + * `logging_dir` 内に作成されるサブディレクトリ名の接頭辞を指定します。 +* `--gradient_checkpointing` + * Gradient Checkpointing を有効にします。メモリ使用量を大幅に削減できますが、学習速度は若干低下します。メモリが厳しい場合に有効です。 +* `--clip_skip=1` 
+ * Text Encoder の最後の層から数えて何層スキップするかを指定します。`2` を指定すると最後から 2 層目の出力を使用します。`None` または `1` はスキップなし(最後の層を使用)を意味します。学習対象のモデルの推奨する値を確認してください。 +
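To make the relationship between `network_dim` and `network_alpha` described above more concrete, the following is a minimal sketch of the usual LoRA formulation, in which the low-rank update is scaled by `alpha / dim`. It is written in plain PyTorch purely as an illustration of the scaling, not as the actual module implementation in `networks/lora.py`.

```python
import torch

def lora_linear(x, base_weight, lora_down, lora_up, alpha, dim):
    """Sketch of a LoRA-augmented linear layer.

    x:           input activations, shape (batch, in_features)
    base_weight: frozen weight of the original layer, shape (out_features, in_features)
    lora_down:   trainable low-rank matrix, shape (dim, in_features)
    lora_up:     trainable low-rank matrix, shape (out_features, dim)
    """
    scale = alpha / dim  # e.g. alpha=8, dim=16 gives a scale of 0.5
    base_out = x @ base_weight.T
    lora_out = (x @ lora_down.T) @ lora_up.T
    return base_out + lora_out * scale
```

Because the trained update is multiplied by `alpha / dim`, choosing `network_alpha` equal to `network_dim` gives a scale of 1.0 (the older behaviour mentioned above), while a smaller alpha shrinks the effective update, which is why alpha and the learning rate need to be considered together.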
+ +### 3.2. Starting the Training / 学習の開始 + +After setting the necessary arguments and executing the command, training will begin. The progress of the training will be output to the console. If `logging_dir` is specified, you can visually check the training status (loss, learning rate, etc.) with TensorBoard. + +```bash +tensorboard --logdir +``` + +
+日本語 + +必要な引数を設定し、コマンドを実行すると学習が開始されます。学習の進行状況はコンソールに出力されます。`logging_dir` を指定した場合は、TensorBoard などで学習状況(損失や学習率など)を視覚的に確認できます。 +
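Before launching a long run it can be useful to estimate how many optimizer steps the settings from section 3.1 will produce. The function below only illustrates the general relationship between dataset size, repeat count, batch size, gradient accumulation and epochs; the step count actually computed by the script can differ slightly, for example because of aspect ratio bucketing and rounding.

```python
import math

def estimate_train_steps(num_images, num_repeats, batch_size,
                         gradient_accumulation_steps, max_train_epochs):
    """Rough estimate of the total number of optimizer steps for an epoch-based run."""
    samples_per_epoch = num_images * num_repeats
    batches_per_epoch = math.ceil(samples_per_epoch / batch_size)
    steps_per_epoch = math.ceil(batches_per_epoch / gradient_accumulation_steps)
    return steps_per_epoch * max_train_epochs

# Example: 200 images repeated 5 times, batch size 2, no accumulation, 10 epochs
print(estimate_train_steps(200, 5, 2, 1, 10))  # -> 5000
```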
+ +## 4. Using the Trained Model / 学習済みモデルの利用 + +Once training is complete, a LoRA model file (`.safetensors` or `.ckpt`) with the name specified by `output_name` will be saved in the directory specified by `output_dir`. + +This file can be used with GUI tools such as AUTOMATIC1111/stable-diffusion-webui, ComfyUI, etc. + +
+日本語 + +学習が完了すると、`output_dir` で指定したディレクトリに、`output_name` で指定した名前の LoRA モデルファイル (`.safetensors` または `.ckpt`) が保存されます。 + +このファイルは、AUTOMATIC1111/stable-diffusion-webui 、ComfyUI などの GUI ツールで利用できます。 +
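To confirm that the saved file contains what you expect before loading it into a GUI tool, the `safetensors` library (already a dependency of this repository) can be used to list its contents. This is only a quick inspection sketch; the exact tensor key names depend on the network module that was trained, and `my_lora.safetensors` is the output name used in the example command above.

```python
from safetensors import safe_open

path = "my_lora.safetensors"  # file produced with --output_name="my_lora"

with safe_open(path, framework="pt") as f:
    print("metadata:", f.metadata())  # training metadata saved with the model, if any
    for key in list(f.keys())[:10]:   # show only the first few entries
        tensor = f.get_tensor(key)
        print(key, tuple(tensor.shape), tensor.dtype)
```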
+ +## 5. Other Features / その他の機能 + +`train_network.py` has many other options not introduced here. + +* Sample image generation (`--sample_prompts`, `--sample_every_n_steps`, etc.) +* More detailed optimizer settings (`--optimizer_args`, etc.) +* Caption preprocessing (`--shuffle_caption`, `--keep_tokens`, etc.) +* Additional network settings (`--network_args`, etc.) + +For these features, please refer to the script's help (`python train_network.py --help`) or other documents in the repository. + +
+日本語 + +`train_network.py` には、ここで紹介した以外にも多くのオプションがあります。 + +* サンプル画像の生成 (`--sample_prompts`, `--sample_every_n_steps` など) +* より詳細なオプティマイザ設定 (`--optimizer_args` など) +* キャプションの前処理 (`--shuffle_caption`, `--keep_tokens` など) +* ネットワークの追加設定 (`--network_args` など) + +これらの機能については、スクリプトのヘルプ (`python train_network.py --help`) やリポジトリ内の他のドキュメントを参照してください。 +
\ No newline at end of file From ceb19bebf849df1d9d6d5928eba777c95bfda8c4 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 13 Apr 2025 22:06:58 +0900 Subject: [PATCH 434/582] update docs. sdxl is transltaed, flux.1 is corrected --- docs/flux_train_network.md | 224 ++++++++++++++++++++++++++++++++++--- docs/sd3_train_network.md | 8 +- docs/sdxl_train_network.md | 199 ++++++++++++++++++++++++++++---- 3 files changed, 390 insertions(+), 41 deletions(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index d28d58778..46eee3e7e 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -6,7 +6,7 @@ `flux_train_network.py`は、FLUX.1モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。FLUX.1はStable Diffusionとは異なるアーキテクチャを持つ画像生成モデルであり、このスクリプトを使用することで、特定のキャラクターや画風を再現するLoRAモデルを作成できます。 -このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象とし、`train_network.py`での学習経験があることを前提としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](how_to_use_train_network.md)を参照してください。 +このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。また一部のパラメータは [`sdxl_train_network.py`](sdxl_train_network.md) と同様のものがあるため、そちらも参考にしてください。 **前提条件:** @@ -30,9 +30,9 @@ 1. **学習スクリプト:** `flux_train_network.py` 2. **FLUX.1モデルファイル:** 学習のベースとなるFLUX.1モデルの`.safetensors`ファイル(例: `flux1-dev.safetensors`)。 3. **Text Encoderモデルファイル:** - * CLIP-Lモデルの`.safetensors`ファイル。 - * T5-XXLモデルの`.safetensors`ファイル。 -4. **AutoEncoderモデルファイル:** FLUX.1に対応するAEモデルの`.safetensors`ファイル。 + * CLIP-Lモデルの`.safetensors`ファイル。例として`clip_l.safetensors`を使用します。 + * T5-XXLモデルの`.safetensors`ファイル。例として`t5xxl.safetensors`を使用します。 +4. **AutoEncoderモデルファイル:** FLUX.1に対応するAEモデルの`.safetensors`ファイル。例として`ae.safetensors`を使用します。 5. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。 * 例として`my_flux_dataset_config.toml`を使用します。 @@ -53,7 +53,7 @@ accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py --output_dir="" --output_name="my_flux_lora" --save_model_as=safetensors - --network_module=networks.lora + --network_module=networks.lora_flux --network_dim=16 --network_alpha=1 --learning_rate=1e-4 @@ -64,15 +64,18 @@ accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py --save_every_n_epochs=1 --mixed_precision="fp16" --gradient_checkpointing - --apply_t5_attn_mask + --guidance_scale=1.0 + --timestep_sampling="flux_shift" --blocks_to_swap=18 + --cache_text_encoder_outputs + --cache_latents ``` ※実際には1行で書くか、適切な改行文字(`\` または `^`)を使用してください。 ### 4.1. 
主要なコマンドライン引数の解説(`train_network.py`からの追加・変更点) -[`train_network.py`のガイド](how_to_use_train_network.md)で説明されている引数に加え、以下のFLUX.1特有の引数を指定します。共通の引数(`--output_dir`, `--output_name`, `--network_module`, `--network_dim`, `--network_alpha`, `--learning_rate`など)については、上記ガイドを参照してください。 +[`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のFLUX.1特有の引数を指定します。共通の引数(`--output_dir`, `--output_name`, `--network_module`, `--network_dim`, `--network_alpha`, `--learning_rate`など)については、上記ガイドを参照してください。 #### モデル関連 [必須] @@ -87,26 +90,26 @@ accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py #### FLUX.1 学習パラメータ -* `--t5xxl_max_token_length=` - * T5-XXL Text Encoderで使用するトークンの最大長を指定します。省略した場合、モデルがschnell版なら256、dev版なら512が自動的に設定されます。データセットのキャプション長に合わせて調整が必要な場合があります。 -* `--apply_t5_attn_mask` - * T5-XXLの出力とFLUXモデル内部(Double Block)のアテンション計算時に、パディングトークンに対応するアテンションマスクを適用します。精度向上が期待できる場合がありますが、わずかに計算コストが増加します。 * `--guidance_scale=` - * FLUX.1 dev版は特定のガイダンススケール値で蒸留されているため、学習時にもその値を指定します。デフォルトは`3.5`です。schnell版では通常無視されます。 + * FLUX.1 dev版は特定のガイダンススケール値で蒸留されていますが、学習時には `1.0` を指定してガイダンススケールを無効化します。デフォルトは`3.5`ですので、必ず指定してください。schnell版では通常無視されます。 * `--timestep_sampling=` - * 学習時に使用するタイムステップ(ノイズレベル)のサンプリング方法を指定します。`sigma`, `uniform`, `sigmoid`, `shift`, `flux_shift` から選択します。デフォルトは `sigma` です。 + * 学習時に使用するタイムステップ(ノイズレベル)のサンプリング方法を指定します。`sigma`, `uniform`, `sigmoid`, `shift`, `flux_shift` から選択します。デフォルトは `sigma` です。推奨は `flux_shift` です。 * `--sigmoid_scale=` - * `timestep_sampling` に `sigmoid` または `shift`, `flux_shift` を指定した場合のスケール係数です。デフォルトは`1.0`です。 + * `timestep_sampling` に `sigmoid` または `shift`, `flux_shift` を指定した場合のスケール係数です。デフォルトおよび推奨値は`1.0`です。 * `--model_prediction_type=` - * モデルが何を予測するかを指定します。`raw` (予測値をそのまま使用), `additive` (ノイズ入力に加算), `sigma_scaled` (シグマスケーリングを適用) から選択します。デフォルトは `sigma_scaled` です。 + * モデルが何を予測するかを指定します。`raw` (予測値をそのまま使用), `additive` (ノイズ入力に加算), `sigma_scaled` (シグマスケーリングを適用) から選択します。デフォルトは `sigma_scaled` です。推奨は `raw` です。 * `--discrete_flow_shift=` - * Flow Matchingで使用されるスケジューラのシフト値を指定します。デフォルトは`3.0`です。 + * Flow Matchingで使用されるスケジューラのシフト値を指定します。デフォルトは`3.0`です。`timestep_sampling`に`flux_shift`を指定した場合は、この値は無視されます。 #### メモリ・速度関連 * `--blocks_to_swap=` **[実験的機能]** * VRAM使用量を削減するために、モデルの一部(Transformerブロック)をCPUとGPU間でスワップする設定です。スワップするブロック数を整数で指定します(例: `18`)。値を大きくするとVRAM使用量は減りますが、学習速度は低下します。GPUのVRAM容量に応じて調整してください。`gradient_checkpointing`と併用可能です。 * `--cpu_offload_checkpointing`とは併用できません。 +* `--cache_text_encoder_outputs` + * CLIP-LおよびT5-XXLの出力をキャッシュします。これにより、メモリ使用量が削減されます。 +* `--cache_latents`, `--cache_latents_to_disk` + * AEの出力をキャッシュします。[sdxl_train_network.py](sdxl_train_network.md)と同様の機能です。 #### 非互換・非推奨の引数 @@ -116,7 +119,7 @@ accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py ### 4.2. 学習の開始 -必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](how_to_use_train_network.md#32-starting-the-training--学習の開始)と同様です。 +必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。 ## 5. 学習済みモデルの利用 @@ -124,4 +127,189 @@ accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py ## 6. 
その他 -`flux_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](how_to_use_train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python flux_train_network.py --help`) を参照してください。 +`flux_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python flux_train_network.py --help`) を参照してください。 + +# FLUX.1 LoRA学習の補足説明 + +以下は、以上の基本的なFLUX.1 LoRAの学習手順を補足するものです。より詳細な設定オプションなどについて説明します。 + +## 1. VRAM使用量の最適化 + +FLUX.1モデルは比較的大きなモデルであるため、十分なVRAMを持たないGPUでは工夫が必要です。以下に、VRAM使用量を削減するための設定を紹介します。 + +### 1.1 メモリ使用量別の推奨設定 + +| GPUメモリ | 推奨設定 | +|----------|----------| +| 24GB VRAM | 基本設定で問題なく動作します(バッチサイズ2) | +| 16GB VRAM | バッチサイズ1に設定し、`--blocks_to_swap`を使用 | +| 12GB VRAM | `--blocks_to_swap 16`と8bit AdamWを使用 | +| 10GB VRAM | `--blocks_to_swap 22`を使用、T5XXLはfp8形式を推奨 | +| 8GB VRAM | `--blocks_to_swap 28`を使用、T5XXLはfp8形式を推奨 | + +### 1.2 主要なVRAM削減オプション + +- **`--blocks_to_swap <数値>`**: + CPUとGPU間でブロックをスワップしてVRAM使用量を削減します。数値が大きいほど多くのブロックをスワップし、より多くのVRAMを節約できますが、学習速度は低下します。FLUX.1では最大35ブロックまでスワップ可能です。 + +- **`--cpu_offload_checkpointing`**: + 勾配チェックポイントをCPUにオフロードします。これにより最大1GBのVRAM使用量を削減できますが、学習速度は約15%低下します。`--blocks_to_swap`とは併用できません。 + +- **`--cache_text_encoder_outputs` / `--cache_text_encoder_outputs_to_disk`**: + CLIP-LとT5-XXLの出力をキャッシュします。これによりメモリ使用量を削減できます。 + +- **`--cache_latents` / `--cache_latents_to_disk`**: + AEの出力をキャッシュします。メモリ使用量を削減できます。 + +- **Adafactor オプティマイザの使用**: + 8bit AdamWよりもVRAM使用量を削減できます。以下の設定を使用してください: + ``` + --optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 + ``` + +- **T5XXLのfp8形式の使用**: + 10GB未満のVRAMを持つGPUでは、T5XXLのfp8形式チェックポイントの使用を推奨します。[comfyanonymous/flux_text_encoders](https://huggingface.co/comfyanonymous/flux_text_encoders)から`t5xxl_fp8_e4m3fn.safetensors`をダウンロードできます(`scaled`なしで使用してください)。 + +## 2. FLUX.1 LoRA学習の重要な設定オプション + +FLUX.1の学習には多くの未知の点があり、いくつかの設定は引数で指定できます。以下に重要な引数とその説明を示します。 + +### 2.1 タイムステップのサンプリング方法 + +`--timestep_sampling`オプションで、タイムステップ(0-1)のサンプリング方法を指定できます: + +- `sigma`:SD3と同様のシグマベース +- `uniform`:一様ランダム +- `sigmoid`:正規分布乱数のシグモイド(x-flux、AI-toolkitなどと同様) +- `shift`:正規分布乱数のシグモイド値をシフト +- `flux_shift`:解像度に応じて正規分布乱数のシグモイド値をシフト(FLUX.1 dev推論と同様)。この設定では`--discrete_flow_shift`は無視されます。 + +### 2.2 モデル予測の処理方法 + +`--model_prediction_type`オプションで、モデルの予測をどのように解釈し処理するかを指定できます: + +- `raw`:そのまま使用(x-fluxと同様)【推奨】 +- `additive`:ノイズ入力に加算 +- `sigma_scaled`:シグマスケーリングを適用(SD3と同様) + +### 2.3 推奨設定 + +実験の結果、以下の設定が良好に動作することが確認されています: +``` +--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 +``` + +ガイダンススケールについて:FLUX.1 dev版は特定のガイダンススケール値で蒸留されていますが、学習時には`--guidance_scale 1.0`を指定してガイダンススケールを無効化することを推奨します。 + +## 3. 
各層に対するランク指定 + +FLUX.1の各層に対して異なるランク(network_dim)を指定できます。これにより、特定の層に対してLoRAの効果を強調したり、無効化したりできます。 + +以下のnetwork_argsを指定することで、各層のランクを指定できます。0を指定するとその層にはLoRAが適用されません。 + +| network_args | 対象レイヤー | +|--------------|--------------| +| img_attn_dim | DoubleStreamBlockのimg_attn | +| txt_attn_dim | DoubleStreamBlockのtxt_attn | +| img_mlp_dim | DoubleStreamBlockのimg_mlp | +| txt_mlp_dim | DoubleStreamBlockのtxt_mlp | +| img_mod_dim | DoubleStreamBlockのimg_mod | +| txt_mod_dim | DoubleStreamBlockのtxt_mod | +| single_dim | SingleStreamBlockのlinear1とlinear2 | +| single_mod_dim | SingleStreamBlockのmodulation | + +使用例: +``` +--network_args "img_attn_dim=4" "img_mlp_dim=8" "txt_attn_dim=2" "txt_mlp_dim=2" "img_mod_dim=2" "txt_mod_dim=2" "single_dim=4" "single_mod_dim=2" +``` + +さらに、FLUXの条件付けレイヤーにLoRAを適用するには、network_argsに`in_dims`を指定します。5つの数値をカンマ区切りのリストとして指定する必要があります。 + +例: +``` +--network_args "in_dims=[4,2,2,2,4]" +``` + +各数値は、`img_in`、`time_in`、`vector_in`、`guidance_in`、`txt_in`に対応します。上記の例では、すべての条件付けレイヤーにLoRAを適用し、`img_in`と`txt_in`のランクを4、その他のランクを2に設定しています。 + +0を指定するとそのレイヤーにはLoRAが適用されません。例えば、`[4,0,0,0,4]`は`img_in`と`txt_in`にのみLoRAを適用します。 + +## 4. 学習するブロックの指定 + +FLUX.1 LoRA学習では、network_argsの`train_double_block_indices`と`train_single_block_indices`を指定することで、学習するブロックを指定できます。インデックスは0ベースです。省略した場合のデフォルトはすべてのブロックを学習することです。 + +インデックスは、`0,1,5,8`のような整数のリストや、`0,1,4-5,7`のような整数の範囲として指定します。 +- double blocksの数は19なので、有効な範囲は0-18です +- single blocksの数は38なので、有効な範囲は0-37です +- `all`を指定するとすべてのブロックを学習します +- `none`を指定するとブロックを学習しません + +使用例: +``` +--network_args "train_double_block_indices=0,1,8-12,18" "train_single_block_indices=3,10,20-25,37" +``` + +または: +``` +--network_args "train_double_block_indices=none" "train_single_block_indices=10-15" +``` + +`train_double_block_indices`または`train_single_block_indices`のどちらか一方だけを指定した場合、もう一方は通常通り学習されます。 + +## 5. Text Encoder LoRAのサポート + +FLUX.1 LoRA学習は、CLIP-LとT5XXL LoRAのトレーニングもサポートしています。 + +- FLUX.1のみをトレーニングする場合は、`--network_train_unet_only`を指定します +- FLUX.1とCLIP-Lをトレーニングする場合は、`--network_train_unet_only`を省略します +- FLUX.1、CLIP-L、T5XXLすべてをトレーニングする場合は、`--network_train_unet_only`を省略し、`--network_args "train_t5xxl=True"`を追加します + +CLIP-LとT5XXLの学習率は、`--text_encoder_lr`で個別に指定できます。例えば、`--text_encoder_lr 1e-4 1e-5`とすると、最初の値はCLIP-Lの学習率、2番目の値はT5XXLの学習率になります。1つだけ指定すると、CLIP-LとT5XXLの学習率は同じになります。`--text_encoder_lr`を指定しない場合、デフォルトの学習率`--learning_rate`が両方に使用されます。 + +## 6. 
マルチ解像度トレーニング + +データセット設定ファイルで複数の解像度を定義できます。各解像度に対して異なるバッチサイズを指定することができます。 + +設定ファイルの例: +```toml +[general] +# 共通設定をここで定義 +flip_aug = true +color_aug = false +keep_tokens_separator= "|||" +shuffle_caption = false +caption_tag_dropout_rate = 0 +caption_extension = ".txt" + +[[datasets]] +# 最初の解像度の設定 +batch_size = 2 +enable_bucket = true +resolution = [1024, 1024] + + [[datasets.subsets]] + image_dir = "画像ディレクトリへのパス" + num_repeats = 1 + +[[datasets]] +# 2番目の解像度の設定 +batch_size = 3 +enable_bucket = true +resolution = [768, 768] + + [[datasets.subsets]] + image_dir = "画像ディレクトリへのパス" + num_repeats = 1 + +[[datasets]] +# 3番目の解像度の設定 +batch_size = 4 +enable_bucket = true +resolution = [512, 512] + + [[datasets.subsets]] + image_dir = "画像ディレクトリへのパス" + num_repeats = 1 +``` + +各解像度セクションの`[[datasets.subsets]]`部分は、データセットディレクトリを定義します。各解像度に対して同じディレクトリを指定してください。 \ No newline at end of file diff --git a/docs/sd3_train_network.md b/docs/sd3_train_network.md index d5cc5a75a..a5b7a82fe 100644 --- a/docs/sd3_train_network.md +++ b/docs/sd3_train_network.md @@ -6,7 +6,7 @@ `sd3_train_network.py`は、Stable Diffusion 3/3.5モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。SD3は、MMDiT (Multi-Modal Diffusion Transformer) と呼ばれる新しいアーキテクチャを採用しており、従来のStable Diffusionモデルとは構造が異なります。このスクリプトを使用することで、SD3/3.5モデルに特化したLoRAモデルを作成できます。 -このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象とし、`train_network.py`での学習経験があることを前提としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](how_to_use_train_network.md)を参照してください。 +このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象とし、`train_network.py`での学習経験があることを前提としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。 **前提条件:** @@ -68,7 +68,7 @@ accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py ### 4.1. 主要なコマンドライン引数の解説(`train_network.py`からの追加・変更点) -[`train_network.py`のガイド](how_to_use_train_network.md)で説明されている引数に加え、以下のSD3/3.5特有の引数を指定します。共通の引数(`--output_dir`, `--output_name`, `--network_module`, `--network_dim`, `--network_alpha`, `--learning_rate`など)については、上記ガイドを参照してください。 +[`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のSD3/3.5特有の引数を指定します。共通の引数(`--output_dir`, `--output_name`, `--network_module`, `--network_dim`, `--network_alpha`, `--learning_rate`など)については、上記ガイドを参照してください。 #### モデル関連 @@ -111,7 +111,7 @@ accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py ### 4.2. 学習の開始 -必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](how_to_use_train_network.md#32-starting-the-training--学習の開始)と同様です。 +必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。 ## 5. 学習済みモデルの利用 @@ -119,4 +119,4 @@ accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py ## 6. 
その他 -`sd3_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](how_to_use_train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python sd3_train_network.py --help`) を参照してください。 +`sd3_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python sd3_train_network.py --help`) を参照してください。 diff --git a/docs/sdxl_train_network.md b/docs/sdxl_train_network.md index 8a19f7aed..e1f6e9b9b 100644 --- a/docs/sdxl_train_network.md +++ b/docs/sdxl_train_network.md @@ -1,14 +1,27 @@ -はい、承知いたしました。`sd-scripts` リポジトリに含まれる `sdxl_train_network.py` を使用した SDXL LoRA 学習に関するドキュメントを作成します。`how_to_use_train_network.md` との差分を中心に、初心者ユーザー向けに解説します。 +# How to Use the SDXL LoRA Training Script `sdxl_train_network.py` / SDXL LoRA学習スクリプト `sdxl_train_network.py` の使い方 ---- - -# SDXL LoRA学習スクリプト `sdxl_train_network.py` の使い方 +This document explains the basic procedure for training a LoRA (Low-Rank Adaptation) model for SDXL (Stable Diffusion XL) using `sdxl_train_network.py` included in the `sd-scripts` repository. +
+日本語 このドキュメントでは、`sd-scripts` リポジトリに含まれる `sdxl_train_network.py` を使用して、SDXL (Stable Diffusion XL) モデルに対する LoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 +
+ +## 1. Introduction / はじめに + +`sdxl_train_network.py` is a script for training additional networks such as LoRA for SDXL models. The basic usage is common with `train_network.py` (see [How to Use the LoRA Training Script `train_network.py`](train_network.md)), but SDXL model-specific settings are required. -## 1. はじめに +This guide focuses on SDXL LoRA training, explaining the main differences from `train_network.py` and SDXL-specific configuration items. -`sdxl_train_network.py` は、SDXL モデルに対して LoRA などの追加ネットワークを学習させるためのスクリプトです。基本的な使い方は `train_network.py` ([LoRA学習スクリプト `train_network.py` の使い方](how_to_use_train_network.md) 参照) と共通ですが、SDXL モデル特有の設定が必要となります。 +**Prerequisites:** + +* You have cloned the `sd-scripts` repository and set up the Python environment. +* Your training dataset is ready. (Please refer to the [Dataset Preparation Guide](link/to/dataset/doc) for dataset preparation) +* You have read [How to Use the LoRA Training Script `train_network.py`](train_network.md). + +
+日本語 +`sdxl_train_network.py` は、SDXL モデルに対して LoRA などの追加ネットワークを学習させるためのスクリプトです。基本的な使い方は `train_network.py` ([LoRA学習スクリプト `train_network.py` の使い方](train_network.md) 参照) と共通ですが、SDXL モデル特有の設定が必要となります。 このガイドでは、SDXL LoRA 学習に焦点を当て、`train_network.py` との主な違いや SDXL 特有の設定項目を中心に説明します。 @@ -16,10 +29,26 @@ * `sd-scripts` リポジトリのクローンと Python 環境のセットアップが完了していること。 * 学習用データセットの準備が完了していること。(データセットの準備については[データセット準備ガイド](link/to/dataset/doc)を参照してください) -* [LoRA学習スクリプト `train_network.py` の使い方](how_to_use_train_network.md) を一読していること。 +* [LoRA学習スクリプト `train_network.py` の使い方](train_network.md) を一読していること。 +
+ +## 2. Preparation / 準備 -## 2. 準備 +Before starting training, you need the following files: +1. **Training Script:** `sdxl_train_network.py` +2. **Dataset Definition File (.toml):** A TOML format file describing the training dataset configuration. + +### About the Dataset Definition File + +The basic format of the dataset definition file (`.toml`) is the same as for `train_network.py`. Please refer to the [Dataset Configuration Guide](link/to/dataset/config/doc) and [How to Use the LoRA Training Script `train_network.py`](train_network.md#about-the-dataset-definition-file). + +For SDXL, it is common to use high-resolution datasets and the aspect ratio bucketing feature (`enable_bucket = true`). + +In this example, we'll use a file named `my_sdxl_dataset_config.toml`. + +
+日本語 学習を開始する前に、以下のファイルが必要です。 1. **学習スクリプト:** `sdxl_train_network.py` @@ -27,14 +56,55 @@ ### データセット定義ファイルについて -データセット定義ファイル (`.toml`) の基本的な書き方は `train_network.py` と共通です。[データセット設定ガイド](link/to/dataset/config/doc) および [LoRA学習スクリプト `train_network.py` の使い方](how_to_use_train_network.md#データセット定義ファイルについて) を参照してください。 +データセット定義ファイル (`.toml`) の基本的な書き方は `train_network.py` と共通です。[データセット設定ガイド](link/to/dataset/config/doc) および [LoRA学習スクリプト `train_network.py` の使い方](train_network.md#データセット定義ファイルについて) を参照してください。 SDXL では、高解像度のデータセットや、アスペクト比バケツ機能 (`enable_bucket = true`) の利用が一般的です。 ここでは、例として `my_sdxl_dataset_config.toml` という名前のファイルを使用することにします。 +
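The aspect ratio bucketing mentioned above (`enable_bucket = true`) groups images into a set of resolutions that share roughly the same pixel area, so that each batch only contains images of one shape. The following is a rough sketch of the idea only; the actual bucket generation and assignment logic in `library/train_util.py` differs in its details (step size, bucket limits, upscaling rules and so on), and the numbers used here are just example parameters.

```python
def make_buckets(max_area=1024 * 1024, step=64, max_side=2048):
    """Enumerate (width, height) pairs whose pixel area stays close to max_area."""
    buckets = set()
    width = step
    while width <= max_side:
        height = int(max_area / width) // step * step
        if step <= height <= max_side:
            buckets.add((width, height))
            buckets.add((height, width))  # also keep the rotated bucket
        width += step
    return sorted(buckets)

def assign_bucket(image_width, image_height, buckets):
    """Pick the bucket whose aspect ratio is closest to the image's aspect ratio."""
    aspect = image_width / image_height
    return min(buckets, key=lambda wh: abs(wh[0] / wh[1] - aspect))

buckets = make_buckets()
print(assign_bucket(4000, 3000, buckets))  # a 4:3 photo lands in a bucket such as (1152, 896)
```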
+ +## 3. Running the Training / 学習の実行 -## 3. 学習の実行 +Training starts by running `sdxl_train_network.py` from the terminal. +Here's a basic command line execution example for SDXL LoRA training: + +```bash +accelerate launch --num_cpu_threads_per_process 1 sdxl_train_network.py + --pretrained_model_name_or_path="" + --dataset_config="my_sdxl_dataset_config.toml" + --output_dir="" + --output_name="my_sdxl_lora" + --save_model_as=safetensors + --network_module=networks.lora + --network_dim=32 + --network_alpha=16 + --learning_rate=1e-4 + --unet_lr=1e-4 + --text_encoder_lr1=1e-5 + --text_encoder_lr2=1e-5 + --optimizer_type="AdamW8bit" + --lr_scheduler="constant" + --max_train_epochs=10 + --save_every_n_epochs=1 + --mixed_precision="bf16" + --gradient_checkpointing + --cache_text_encoder_outputs + --cache_latents +``` + +Comparing with the execution example of `train_network.py`, the following points are different: + +* The script to execute is `sdxl_train_network.py`. +* You specify an SDXL base model for `--pretrained_model_name_or_path`. +* `--text_encoder_lr` is split into `--text_encoder_lr1` and `--text_encoder_lr2` (since SDXL has two Text Encoders). +* `--mixed_precision` is recommended to be `bf16` or `fp16`. +* `--cache_text_encoder_outputs` and `--cache_latents` are recommended to reduce VRAM usage. + +Next, we'll explain the main command line arguments that differ from `train_network.py`. For common arguments, please refer to [How to Use the LoRA Training Script `train_network.py`](train_network.md#31-main-command-line-arguments). + +
+日本語 学習は、ターミナルから `sdxl_train_network.py` を実行することで開始します。 以下に、SDXL LoRA 学習における基本的なコマンドライン実行例を示します。 @@ -71,10 +141,78 @@ accelerate launch --num_cpu_threads_per_process 1 sdxl_train_network.py * `--mixed_precision` は `bf16` または `fp16` が推奨されます。 * `--cache_text_encoder_outputs` や `--cache_latents` は VRAM 使用量を削減するために推奨されます。 -次に、`train_network.py` との差分となる主要なコマンドライン引数について解説します。共通の引数については、[LoRA学習スクリプト `train_network.py` の使い方](how_to_use_train_network.md#31-主要なコマンドライン引数) を参照してください。 - -### 3.1. 主要なコマンドライン引数(差分) - +次に、`train_network.py` との差分となる主要なコマンドライン引数について解説します。共通の引数については、[LoRA学習スクリプト `train_network.py` の使い方](train_network.md#31-主要なコマンドライン引数) を参照してください。 +
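The same invocation can also be driven from a small Python script, which is convenient when sweeping settings. The sketch below mirrors the example command above with placeholder paths; because `--cache_text_encoder_outputs` is used, `--network_train_unet_only` is added (see the note in section 3.1) and the Text Encoder learning rates are omitted.

```python
import subprocess

# Placeholder paths: replace with your SDXL base model, dataset config and output directory.
cmd = [
    "accelerate", "launch", "--num_cpu_threads_per_process", "1", "sdxl_train_network.py",
    "--pretrained_model_name_or_path", "/path/to/sd_xl_base_1.0.safetensors",
    "--dataset_config", "my_sdxl_dataset_config.toml",
    "--output_dir", "/path/to/output",
    "--output_name", "my_sdxl_lora",
    "--save_model_as", "safetensors",
    "--network_module", "networks.lora",
    "--network_dim", "32",
    "--network_alpha", "16",
    "--learning_rate", "1e-4",
    "--unet_lr", "1e-4",
    "--optimizer_type", "AdamW8bit",
    "--lr_scheduler", "constant",
    "--max_train_epochs", "10",
    "--save_every_n_epochs", "1",
    "--mixed_precision", "bf16",
    "--gradient_checkpointing",
    "--cache_latents",
    "--cache_text_encoder_outputs",
    "--network_train_unet_only",  # required when caching Text Encoder outputs
]
subprocess.run(cmd, check=True)
```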
+
+### 3.1. Main Command Line Arguments (Differences) / 主要なコマンドライン引数(差分)
+
+#### Model Related / モデル関連
+
+* `--pretrained_model_name_or_path=""` **[Required]**
+    * Specifies the **SDXL model** to be used as the base for training. You can specify a Hugging Face Hub model ID (e.g., `"stabilityai/stable-diffusion-xl-base-1.0"`), a local Diffusers format model directory, or a path to a `.safetensors` file.
+* `--v2`, `--v_parameterization`
+    * These arguments are for SD1.x/2.x. When using `sdxl_train_network.py`, since an SDXL model is assumed, these **typically do not need to be specified**.
+
+#### Dataset Related / データセット関連
+
+* `--dataset_config=""`
+    * This is common with `train_network.py`.
+    * For SDXL, it is common to use high-resolution data and the bucketing feature (specify `enable_bucket = true` in the `.toml` file).
+
+#### Output & Save Related / 出力・保存関連
+
+* These are common with `train_network.py`.
+
+#### LoRA Parameters / LoRA パラメータ
+
+* These are common with `train_network.py`.
+
+#### Training Parameters / 学習パラメータ
+
+* `--learning_rate=1e-4`
+    * Overall learning rate. This becomes the default value if `unet_lr`, `text_encoder_lr1`, and `text_encoder_lr2` are not specified.
+* `--unet_lr=1e-4`
+    * Learning rate for LoRA modules in the U-Net part. If not specified, the value of `--learning_rate` is used.
+* `--text_encoder_lr1=1e-5`
+    * Learning rate for LoRA modules in **Text Encoder 1 (CLIP ViT-L/14)**. If not specified, the value of `--learning_rate` is used. A smaller value than the U-Net learning rate is recommended.
+* `--text_encoder_lr2=1e-5`
+    * Learning rate for LoRA modules in **Text Encoder 2 (OpenCLIP ViT-bigG/14)**. If not specified, the value of `--learning_rate` is used. A smaller value than the U-Net learning rate is recommended.
+* `--optimizer_type="AdamW8bit"`
+    * Common with `train_network.py`.
+* `--lr_scheduler="constant"`
+    * Common with `train_network.py`.
+* `--lr_warmup_steps`
+    * Common with `train_network.py`.
+* `--max_train_steps`, `--max_train_epochs`
+    * Common with `train_network.py`.
+* `--mixed_precision="bf16"`
+    * Mixed precision training setting. For SDXL, `bf16` or `fp16` is recommended. Choose the one supported by your GPU. This reduces VRAM usage and improves training speed.
+* `--gradient_accumulation_steps=1`
+    * Common with `train_network.py`.
+* `--gradient_checkpointing`
+    * Common with `train_network.py`. Recommended to enable for SDXL due to its high memory consumption.
+* `--cache_latents`
+    * Caches VAE outputs in memory (or on disk when `--cache_latents_to_disk` is specified). By skipping VAE computation, this reduces VRAM usage and speeds up training. Image augmentations (`--color_aug`, `--flip_aug`, `--random_crop`, etc.) are disabled. This option is recommended for SDXL training.
+* `--cache_latents_to_disk`
+    * Used with `--cache_latents`, caches to disk. When loading the dataset for the first time, VAE outputs are cached to disk. This is recommended when you have a large number of training images, as it allows you to skip VAE computation on subsequent training runs.
+* `--cache_text_encoder_outputs`
+    * Caches Text Encoder outputs in memory (or on disk when `--cache_text_encoder_outputs_to_disk` is specified). By skipping Text Encoder computation, this reduces VRAM usage and speeds up training. Caption augmentations (`--shuffle_caption`, `--caption_dropout_rate`, etc.) are disabled.
+    * **Note:** When using this option, LoRA modules for Text Encoder cannot be trained (`--network_train_unet_only` must be specified). 
+* `--cache_text_encoder_outputs_to_disk` + * Used with `--cache_text_encoder_outputs`, caches to disk. +* `--no_half_vae` + * Runs VAE in `float32` even when using mixed precision (`fp16`/`bf16`). Since SDXL's VAE can be unstable in `float16`, enable this when using `fp16`. +* `--clip_skip` + * Not normally used for SDXL. No need to specify. +* `--fused_backward_pass` + * Fuses gradient computation and optimizer steps to reduce VRAM usage. Available for SDXL. (Currently only supports the `Adafactor` optimizer) + +#### Others / その他 + +* `--seed`, `--logging_dir`, `--log_prefix`, etc. are common with `train_network.py`. + +
+日本語 #### モデル関連 * `--pretrained_model_name_or_path="<モデルのパス>"` **[必須]** @@ -130,7 +268,7 @@ accelerate launch --num_cpu_threads_per_process 1 sdxl_train_network.py * `--cache_text_encoder_outputs_to_disk` * `--cache_text_encoder_outputs` と併用し、キャッシュ先をディスクにします。 * `--no_half_vae` - * 混合精度 (`fp16`/`bf16`) 使用時でも VAE を `float32` で動作させます。SDXL の VAE は `float16` で不安定になることがあるため、`fp16` 指定時には有効にしてくだ + * 混合精度 (`fp16`/`bf16`) 使用時でも VAE を `float32` で動作させます。SDXL の VAE は `float16` で不安定になることがあるため、`fp16` 指定時には有効にしてください。 * `--clip_skip` * SDXL では通常使用しません。指定は不要です。 * `--fused_backward_pass` @@ -139,22 +277,45 @@ accelerate launch --num_cpu_threads_per_process 1 sdxl_train_network.py #### その他 * `--seed`, `--logging_dir`, `--log_prefix` などは `train_network.py` と共通です。 +
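A few of the options above interact, and it is easy to launch a long run with a combination that silently does something other than intended. A small pre-flight check along these lines simply restates the notes above; it is a sketch, not an exhaustive validation of the script's arguments.

```python
from argparse import Namespace

def preflight_check(args: Namespace) -> list[str]:
    """Warnings derived from the notes in section 3.1 (not exhaustive)."""
    warnings = []
    if args.cache_text_encoder_outputs and not args.network_train_unet_only:
        warnings.append(
            "--cache_text_encoder_outputs requires --network_train_unet_only; "
            "Text Encoder LoRA modules cannot be trained with cached outputs."
        )
    if args.mixed_precision == "fp16" and not args.no_half_vae:
        warnings.append(
            "--no_half_vae is recommended with fp16, because the SDXL VAE can be unstable in float16."
        )
    if args.cache_latents and (args.color_aug or args.flip_aug or args.random_crop):
        warnings.append(
            "--cache_latents disables image augmentations such as --color_aug, --flip_aug and --random_crop."
        )
    return warnings

# Example usage with a few illustrative settings
example = Namespace(
    cache_text_encoder_outputs=True,
    network_train_unet_only=False,
    mixed_precision="fp16",
    no_half_vae=False,
    cache_latents=True,
    color_aug=False,
    flip_aug=True,
    random_crop=False,
)
for w in preflight_check(example):
    print("WARNING:", w)
```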
-### 3.2. 学習の開始 +### 3.2. Starting the Training / 学習の開始 +After setting the necessary arguments, execute the command to start training. The training progress will be displayed on the console. The basic flow is the same as with `train_network.py`. + +
+日本語 必要な引数を設定し、コマンドを実行すると学習が開始されます。学習の進行状況はコンソールに出力されます。基本的な流れは `train_network.py` と同じです。 +
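Before starting a long run, it can be useful to estimate how many optimizer steps `--max_train_epochs` corresponds to for your dataset. The helper below is a rough estimate that ignores aspect ratio bucketing and multi-GPU setups; the actual value is computed by the script itself at startup.

```python
import math

def estimate_total_steps(
    num_images: int,
    num_repeats: int = 1,
    batch_size: int = 2,
    gradient_accumulation_steps: int = 1,
    max_train_epochs: int = 10,
) -> int:
    """Rough estimate of optimizer steps for a simple single-GPU setup."""
    steps_per_epoch = math.ceil(num_images * num_repeats / batch_size)
    steps_per_epoch = math.ceil(steps_per_epoch / gradient_accumulation_steps)
    return steps_per_epoch * max_train_epochs

# e.g. 400 images, 1 repeat, batch size 2, 10 epochs -> about 2000 steps
print(estimate_total_steps(num_images=400))
```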
+ +## 4. Using the Trained Model / 学習済みモデルの利用 + +When training is complete, a LoRA model file (`.safetensors`, etc.) with the name specified by `output_name` will be saved in the directory specified by `output_dir`. -## 4. 学習済みモデルの利用 +This file can be used with GUI tools that support SDXL, such as AUTOMATIC1111/stable-diffusion-webui and ComfyUI. +
+日本語 学習が完了すると、`output_dir` で指定したディレクトリに、`output_name` で指定した名前の LoRA モデルファイル (`.safetensors` など) が保存されます。 このファイルは、AUTOMATIC1111/stable-diffusion-webui 、ComfyUI などの SDXL に対応した GUI ツールで利用できます。 +
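Besides loading the file in a GUI tool, you can quickly check what was saved with the `safetensors` library: the stored training metadata (absent if `--no_metadata` was specified) and the LoRA tensor keys are visible directly. The file name below is a placeholder for the file written to `--output_dir`.

```python
from safetensors import safe_open

lora_path = "my_sdxl_lora.safetensors"  # placeholder: file written to --output_dir

with safe_open(lora_path, framework="pt") as f:
    metadata = f.metadata() or {}
    keys = list(f.keys())

# Training settings are stored as string metadata unless --no_metadata was used
for k in sorted(metadata)[:10]:
    print(f"{k}: {metadata[k]}")

print(f"{len(keys)} tensors, e.g. {keys[:3]}")
```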
+ +## 5. Supplement: Main Differences from `train_network.py` / 補足: `train_network.py` との主な違い + +* **Target Model:** `sdxl_train_network.py` is exclusively for SDXL models. +* **Text Encoder:** Since SDXL has two Text Encoders, there are differences in learning rate specifications (`--text_encoder_lr1`, `--text_encoder_lr2`), etc. +* **Caching Features:** `--cache_text_encoder_outputs` is particularly effective for SDXL and is recommended. +* **Recommended Settings:** Due to high VRAM usage, mixed precision (`bf16` or `fp16`), `gradient_checkpointing`, and caching features (`--cache_latents`, `--cache_text_encoder_outputs`) are recommended. When using `fp16`, it is recommended to run the VAE in `float32` with `--no_half_vae`. -## 5. 補足: `train_network.py` との主な違い +For other detailed options, please refer to the script's help (`python sdxl_train_network.py --help`) and other documents in the repository. +
+日本語 * **対象モデル:** `sdxl_train_network.py` は SDXL モデル専用です。 * **Text Encoder:** SDXL は 2 つの Text Encoder を持つため、学習率の指定 (`--text_encoder_lr1`, `--text_encoder_lr2`) などが異なります。 * **キャッシュ機能:** `--cache_text_encoder_outputs` は SDXL で特に効果が高く、推奨されます。 * **推奨設定:** VRAM 使用量が大きいため、`bf16` または `fp16` の混合精度、`gradient_checkpointing`、キャッシュ機能 (`--cache_latents`, `--cache_text_encoder_outputs`) の利用が推奨されます。`fp16` 指定時は、VAE は `--no_half_vae` で `float32` 動作を推奨します。 -その他の詳細なオプションについては、スクリプトのヘルプ (`python sdxl_train_network.py --help`) やリポジトリ内の他のドキュメントを参照してください。 \ No newline at end of file +その他の詳細なオプションについては、スクリプトのヘルプ (`python sdxl_train_network.py --help`) やリポジトリ内の他のドキュメントを参照してください。 +
\ No newline at end of file From b1bbd4576cd454073a0059c96555367af5f41d1f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 14 Apr 2025 21:53:21 +0900 Subject: [PATCH 435/582] doc: update sd3 LoRA, sdxl LoRA advanced --- docs/sd3_train_network.md | 31 ++-- docs/sdxl_train_network_advanced.md | 260 ++++++++++++++++++++++++++++ 2 files changed, 276 insertions(+), 15 deletions(-) create mode 100644 docs/sdxl_train_network_advanced.md diff --git a/docs/sd3_train_network.md b/docs/sd3_train_network.md index a5b7a82fe..95f7ce621 100644 --- a/docs/sd3_train_network.md +++ b/docs/sd3_train_network.md @@ -6,7 +6,7 @@ `sd3_train_network.py`は、Stable Diffusion 3/3.5モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。SD3は、MMDiT (Multi-Modal Diffusion Transformer) と呼ばれる新しいアーキテクチャを採用しており、従来のStable Diffusionモデルとは構造が異なります。このスクリプトを使用することで、SD3/3.5モデルに特化したLoRAモデルを作成できます。 -このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象とし、`train_network.py`での学習経験があることを前提としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。 +このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。また一部のパラメータは [`sdxl_train_network.py`](sdxl_train_network.md) と同様のものがあるため、そちらも参考にしてください。 **前提条件:** @@ -18,8 +18,8 @@ `sd3_train_network.py`は`train_network.py`をベースに、SD3/3.5モデルに対応するための変更が加えられています。主な違いは以下の通りです。 -* **対象モデル:** Stable Diffusion 3 Medium / Large (3.0 / 3.5) モデルを対象とします。 -* **モデル構造:** U-Netの代わりにMMDiT (Transformerベース) を使用します。Text EncoderとしてCLIP-L, CLIP-G, T5-XXLの三つを使用します。VAEはSDXLと互換性がありますが、入力のスケール処理が異なります。 +* **対象モデル:** Stable Diffusion 3, 3.5 Medium / Large モデルを対象とします。 +* **モデル構造:** U-Netの代わりにMMDiT (Transformerベース) を使用します。Text EncoderとしてCLIP-L, CLIP-G, T5-XXLの三つを使用します。VAEはSDXLと互換性がありません。 * **引数:** SD3/3.5モデル、Text Encoder群、VAEを指定する引数があります。ただし、単一ファイルの`.safetensors`形式であれば、内部で自動的に分離されるため、個別のパス指定は必須ではありません。 * **一部引数の非互換性:** Stable Diffusion v1/v2向けの引数(例: `--v2`, `--v_parameterization`, `--clip_skip`)はSD3/3.5の学習では使用されません。 * **SD3特有の引数:** Text Encoderのアテンションマスクやドロップアウト率、Positional Embeddingの調整(SD3.5向け)、タイムステップのサンプリングや損失の重み付けに関する引数が追加されています。 @@ -29,8 +29,8 @@ 学習を開始する前に、以下のファイルが必要です。 1. **学習スクリプト:** `sd3_train_network.py` -2. **SD3/3.5モデルファイル:** 学習のベースとなるSD3/3.5モデルの`.safetensors`ファイル。単一ファイル形式(Diffusers/ComfyUI/AUTOMATIC1111形式)を推奨します。 - * Text EncoderやVAEが別ファイルになっている場合は、対応する引数でパスを指定します。 +2. **SD3/3.5モデルファイル:** 学習のベースとなるSD3/3.5モデルの`.safetensors`ファイル。またText Encoderをそれぞれ対応する引数でパスを指定します。 + * 単一ファイル形式も使用可能です。 3. 
**データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。 * 例として`my_sd3_dataset_config.toml`を使用します。 @@ -43,6 +43,9 @@ ```bash accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py --pretrained_model_name_or_path="" + --clip_l="" + --clip_g="" + --t5xxl="" --dataset_config="my_sd3_dataset_config.toml" --output_dir="" --output_name="my_sd3_lora" @@ -58,8 +61,6 @@ accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py --save_every_n_epochs=1 --mixed_precision="fp16" --gradient_checkpointing - --apply_lg_attn_mask - --apply_t5_attn_mask --weighting_scheme="sigma_sqrt" --blocks_to_swap=32 ``` @@ -73,10 +74,10 @@ accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py #### モデル関連 * `--pretrained_model_name_or_path=""` **[必須]** - * 学習のベースとなるSD3/3.5モデルの`.safetensors`ファイルのパスを指定します。単一ファイル形式(Diffusers/ComfyUI/AUTOMATIC1111形式)を想定しています。 + * 学習のベースとなるSD3/3.5モデルの`.safetensors`ファイルのパスを指定します。 * `--clip_l`, `--clip_g`, `--t5xxl`, `--vae`: - * ベースモデルが単一ファイル形式の場合、通常これらの指定は不要です(自動的にモデル内部から読み込まれます)。 - * もしText EncoderやVAEが別ファイルとして提供されている場合は、それぞれの`.safetensors`ファイルのパスを指定します。 + * ベースモデルが単一ファイル形式の場合、これらの指定は不要です(自動的にモデル内部から読み込まれます)。 + * Text Encoderが別ファイルとして提供されている場合は、それぞれの`.safetensors`ファイルのパスを指定します。`--vae` はベースモデルに含まれているため、通常は指定する必要はありません(明示的に異なるVAEを使用する場合のみ指定)。 #### SD3/3.5 学習パラメータ @@ -89,13 +90,13 @@ accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py * `--clip_l_dropout_rate`, `--clip_g_dropout_rate`, `--t5_dropout_rate`: * 各Text Encoderの出力に対して、指定した確率でドロップアウト(出力をゼロにする)を適用します。過学習の抑制に役立つ場合があります。デフォルトは`0.0`(ドロップアウトなし)です。 * `--pos_emb_random_crop_rate=` **[SD3.5向け]** - * MMDiTのPositional Embeddingに対してランダムクロップを適用する確率を指定します。SD3 5M (3.5) モデルで学習された機能であり、他のモデルでの効果は限定的です。デフォルトは`0.0`です。 -* `--enable_scaled_pos_embed` **[SD3.5向け]** - * マルチ解像度学習時に、解像度に応じてPositional Embeddingをスケーリングします。SD3 5M (3.5) モデルで学習された機能であり、他のモデルでの効果は限定的です。 + * MMDiTのPositional Embeddingに対してランダムクロップを適用する確率を指定します。[SD3.5M model card](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) で説明されています。デフォルトは`0.0`です。 +* `--enable_scaled_pos_embed` **[SD3.5向け]** **[実験的機能]** + * マルチ解像度学習時に、解像度に応じてPositional Embeddingをスケーリングします。デフォルトは`False`です。通常は指定不要です。 * `--training_shift=` - * 学習時のタイムステップ(ノイズレベル)の分布を調整するためのシフト値です。`weighting_scheme`に加えて適用されます。`1.0`より大きい値はノイズの大きい(構造寄り)領域を、小さい値はノイズの小さい(詳細寄り)領域を重視する傾向になります。デフォルトは`1.0`です。 + * 学習時のタイムステップ(ノイズレベル)の分布を調整するためのシフト値です。`weighting_scheme`に加えて適用されます。`1.0`より大きい値はノイズの大きい(構造寄り)領域を、小さい値はノイズの小さい(詳細寄り)領域を重視する傾向になります。デフォルトは`1.0`です。通常はデフォルト値で問題ありません。 * `--weighting_scheme=` - * 損失計算時のタイムステップ(ノイズレベル)に応じた重み付け方法を指定します。`sigma_sqrt`, `logit_normal`, `mode`, `cosmap`, `uniform` (または`none`) から選択します。SD3の論文では`sigma_sqrt`が使用されています。デフォルトは`uniform`です。 + * 損失計算時のタイムステップ(ノイズレベル)に応じた重み付け方法を指定します。`sigma_sqrt`, `logit_normal`, `mode`, `cosmap`, `uniform` (または`none`) から選択します。SD3の論文では`sigma_sqrt`が使用されています。デフォルトは`uniform`です。通常はデフォルト値で問題ありません。 * `--logit_mean`, `--logit_std`, `--mode_scale`: * `weighting_scheme`で`logit_normal`または`mode`を選択した場合に、その分布を制御するためのパラメータです。通常はデフォルト値で問題ありません。 diff --git a/docs/sdxl_train_network_advanced.md b/docs/sdxl_train_network_advanced.md new file mode 100644 index 000000000..ca718ad06 --- /dev/null +++ b/docs/sdxl_train_network_advanced.md @@ -0,0 +1,260 @@ +はい、承知いたしました。SDXL LoRA学習スクリプト `sdxl_train_network.py` の熟練した利用者向けの、機能全体の詳細を説明したドキュメントを作成します。 + +--- + +# 高度な設定: SDXL LoRA学習スクリプト `sdxl_train_network.py` 詳細ガイド + +このドキュメントでは、`sd-scripts` リポジトリに含まれる `sdxl_train_network.py` を使用した、SDXL (Stable 
Diffusion XL) モデルに対する LoRA (Low-Rank Adaptation) モデル学習の高度な設定オプションについて解説します。 + +基本的な使い方については、以下のドキュメントを参照してください。 + +* [LoRA学習スクリプト `train_network.py` の使い方](train_network.md) +* [SDXL LoRA学習スクリプト `sdxl_train_network.py` の使い方](sdxl_train_network.md) + +このガイドは、基本的なLoRA学習の経験があり、より詳細な設定や高度な機能を試したい熟練した利用者を対象としています。 + +**前提条件:** + +* `sd-scripts` リポジトリのクローンと Python 環境のセットアップが完了していること。 +* 学習用データセットの準備と設定(`.toml`ファイル)が完了していること。([データセット設定ガイド](link/to/dataset/config/doc)参照) +* 基本的なLoRA学習のコマンドライン実行経験があること。 + +## 1. コマンドライン引数 詳細解説 + +`sdxl_train_network.py` は `train_network.py` の機能を継承しつつ、SDXL特有の機能を追加しています。ここでは、SDXL LoRA学習に関連する主要なコマンドライン引数について、機能別に分類して詳細に解説します。 + +基本的な引数については、[LoRA学習スクリプト `train_network.py` の使い方](train_network.md#31-主要なコマンドライン引数) および [SDXL LoRA学習スクリプト `sdxl_train_network.py` の使い方](sdxl_train_network.md#31-主要なコマンドライン引数(差分)) を参照してください。 + +### 1.1. モデル読み込み関連 + +* `--pretrained_model_name_or_path="<モデルパス>"` **[必須]** + * 学習のベースとなる **SDXLモデル** を指定します。Hugging Face HubのモデルID、ローカルのDiffusers形式モデルディレクトリ、または`.safetensors`ファイルを指定できます。 + * 詳細は[基本ガイド](sdxl_train_network.md#モデル関連)を参照してください。 +* `--vae=""` + * オプションで、学習に使用するVAEを指定します。SDXLモデルに含まれるVAE以外を使用する場合に指定します。`.ckpt`または`.safetensors`ファイルを指定できます。 +* `--no_half_vae` + * 混合精度(`fp16`/`bf16`)使用時でもVAEを`float32`で動作させます。SDXLのVAEは`float16`で不安定になることがあるため、`fp16`指定時には有効にすることが推奨されます。`bf16`では通常不要です。 +* `--fp8_base` / `--fp8_base_unet` + * **実験的機能:** ベースモデル(U-Net, Text Encoder)またはU-NetのみをFP8で読み込み、VRAM使用量を削減します。PyTorch 2.1以上が必要です。詳細は[README](README.md#sd3-lora-training)の関連セクションを参照してください (SD3の説明ですがSDXLにも適用されます)。 + +### 1.2. データセット設定関連 + +* `--dataset_config="<設定ファイルのパス>"` **[必須]** + * データセットの設定を記述した`.toml`ファイルを指定します。SDXLでは高解像度データとバケツ機能(`.toml` で `enable_bucket = true` を指定)の利用が一般的です。 + * `.toml`ファイルの書き方の詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください。 + * アスペクト比バケツの解像度ステップ(`bucket_reso_steps`)は、SDXLでは8の倍数(例: 64)が一般的です。 + +### 1.3. 出力・保存関連 + +基本的なオプションは `train_network.py` と共通です。 + +* `--output_dir="<出力先ディレクトリ>"` **[必須]** +* `--output_name="<出力ファイル名>"` **[必須]** +* `--save_model_as="safetensors"` (推奨), `ckpt`, `pt`, `diffusers`, `diffusers_safetensors` +* `--save_precision="fp16"`, `"bf16"`, `"float"` + * モデルの保存精度を指定します。未指定時は学習時の精度(`fp16`, `bf16`等)で保存されます。 +* `--save_every_n_epochs=N` / `--save_every_n_steps=N` + * Nエポック/ステップごとにモデルを保存します。 +* `--save_last_n_epochs=M` / `--save_last_n_steps=M` + * エポック/ステップごとに保存する際、最新のM個のみを保持し、古いものは削除します。 +* `--save_state` / `--save_state_on_train_end` + * モデル保存時/学習終了時に、Optimizerの状態などを含む学習状態(`state`)を保存します。`--resume`オプションでの学習再開に必要です。 +* `--save_last_n_epochs_state=M` / `--save_last_n_steps_state=M` + * `state`の保存数をM個に制限します。`--save_last_n_epochs/steps`の指定を上書きします。 +* `--no_metadata` + * 出力モデルにメタデータを保存しません。 +* `--save_state_to_huggingface` / `--huggingface_repo_id` など + * Hugging Face Hubへのモデルやstateのアップロード関連オプション。詳細は[train\_util.py](train_util.py)や[huggingface\_util.py](huggingface_util.py)を参照してください。 + +### 1.4. 
ネットワークパラメータ (LoRA) + +基本的なオプションは `train_network.py` と共通です。 + +* `--network_module=networks.lora` **[必須]** +* `--network_dim=N` **[必須]** + * LoRAのランク (次元数) を指定します。SDXLでは32や64などが試されることが多いですが、データセットや目的に応じて調整が必要です。 +* `--network_alpha=M` + * LoRAのアルファ値。`network_dim`の半分程度、または`network_dim`と同じ値などが一般的です。デフォルトは1。 +* `--network_dropout=P` + * LoRAモジュール内のドロップアウト率 (0.0~1.0)。過学習抑制の効果が期待できます。デフォルトはNone (ドロップアウトなし)。 +* `--network_args ...` + * ネットワークモジュールへの追加引数を `key=value` 形式で指定します。LoRAでは以下の高度な設定が可能です。 + * **階層別 (Block-wise) 次元数/アルファ:** + * U-Netの各ブロックごとに異なる`dim`と`alpha`を指定できます。これにより、特定の層の影響を強めたり弱めたりする調整が可能です。 + * `block_dims`: U-NetのLinear層およびConv2d 1x1層に対するブロックごとのdimをカンマ区切りで指定します (SDXLでは23個の数値)。 + * `block_alphas`: 上記に対応するalpha値をカンマ区切りで指定します。 + * `conv_block_dims`: U-NetのConv2d 3x3層に対するブロックごとのdimをカンマ区切りで指定します。 + * `conv_block_alphas`: 上記に対応するalpha値をカンマ区切りで指定します。 + * 指定しないブロックは `--network_dim`/`--network_alpha` または `--conv_dim`/`--conv_alpha` (存在する場合) の値が使用されます。 + * 詳細は[LoRA の階層別学習率](train_network.md#lora-の階層別学習率) (train\_network.md内、SDXLでも同様に適用可能) や実装 ([lora.py](lora.py)) を参照してください。 + * **LoRA+:** + * `loraplus_lr_ratio=R`: LoRAの上向き重み(UP)の学習率を、下向き重み(DOWN)の学習率のR倍にします。学習速度の向上が期待できます。論文推奨は16。 + * `loraplus_unet_lr_ratio=RU`: U-Net部分のLoRA+学習率比を個別に指定します。 + * `loraplus_text_encoder_lr_ratio=RT`: Text Encoder部分のLoRA+学習率比を個別に指定します。(`--text_encoder_lr1`, `--text_encoder_lr2`で指定した学習率に乗算されます) + * 詳細は[README](../README.md#jan-17-2025--2025-01-17-version-090)や実装 ([lora.py](lora.py)) を参照してください。 +* `--network_train_unet_only` + * U-NetのLoRAモジュールのみを学習します。Text Encoderの学習を行わない場合に指定します。`--cache_text_encoder_outputs` を使用する場合は必須です。 +* `--network_train_text_encoder_only` + * Text EncoderのLoRAモジュールのみを学習します。U-Netの学習を行わない場合に指定します。 +* `--network_weights="<重みファイル>"` + * 学習済みのLoRA重みを読み込んで学習を開始します。ファインチューニングや学習再開に使用します。 +* `--dim_from_weights` + * `--network_weights` で指定した重みファイルからLoRAの次元数 (`dim`) を自動的に読み込みます。`--network_dim` の指定は不要になります。 + +### 1.5. 学習パラメータ + +* `--learning_rate=LR` + * 全体の学習率。各モジュール(`unet_lr`, `text_encoder_lr1`, `text_encoder_lr2`)のデフォルト値となります。`1e-4` や `4e-5` などが試されることが多いです。 +* `--unet_lr=LR_U` + * U-Net部分のLoRAモジュールの学習率。 +* `--text_encoder_lr1=LR_TE1` + * Text Encoder 1 (OpenCLIP ViT-G/14) のLoRAモジュールの学習率。通常、U-Netより小さい値 (例: `1e-5`, `2e-5`) が推奨されます。 +* `--text_encoder_lr2=LR_TE2` + * Text Encoder 2 (CLIP ViT-L/14) のLoRAモジュールの学習率。通常、U-Netより小さい値 (例: `1e-5`, `2e-5`) が推奨されます。 +* `--optimizer_type="..."` + * 使用するOptimizerを指定します。`AdamW8bit` (省メモリ、一般的), `Adafactor` (さらに省メモリ、SDXLフルモデル学習で実績あり), `Lion`, `DAdaptation`, `Prodigy`などが選択可能です。各Optimizerには追加の引数が必要な場合があります (`--optimizer_args`参照)。 + * `AdamW8bit` や `PagedAdamW8bit` (要 `bitsandbytes`) が一般的です。 + * `Adafactor` はメモリ効率が良いですが、設定がやや複雑です (相対ステップ(`relative_step=True`)推奨、学習率スケジューラは`adafactor`推奨)。 + * `DAdaptation`, `Prodigy` は学習率の自動調整機能がありますが、LoRA+との併用はできません。学習率は`1.0`程度を指定します。 + * 詳細は[train\_util.py](train_util.py)の`get_optimizer`関数を参照してください。 +* `--optimizer_args ...` + * Optimizerへの追加引数を `key=value` 形式で指定します (例: `"weight_decay=0.01"` `"betas=0.9,0.999"`). 
+* `--lr_scheduler="..."` + * 学習率スケジューラを指定します。`constant` (変化なし), `cosine` (コサインカーブ), `linear` (線形減衰), `constant_with_warmup` (ウォームアップ付き定数), `cosine_with_restarts` など。`cosine` や `constant_with_warmup` がよく使われます。 + * スケジューラによっては追加の引数が必要です (`--lr_scheduler_args`参照)。 + * `DAdaptation` や `Prodigy` などの自己学習率調整機能付きOptimizerを使用する場合、スケジューラは不要です (`constant` を指定)。 +* `--lr_warmup_steps=N` + * 学習率スケジューラのウォームアップステップ数。学習開始時に学習率を徐々に上げていく期間です。N < 1 の場合は全ステップ数に対する割合と解釈されます。 +* `--lr_scheduler_num_cycles=N` / `--lr_scheduler_power=P` + * 特定のスケジューラ (`cosine_with_restarts`, `polynomial`) のためのパラメータ。 +* `--max_train_steps=N` / `--max_train_epochs=N` + * 学習の総ステップ数またはエポック数を指定します。エポック指定が優先されます。 +* `--mixed_precision="bf16"` / `"fp16"` / `"no"` + * 混合精度学習の設定。SDXLでは `bf16` (対応GPUの場合) または `fp16` の使用が強く推奨されます。VRAM使用量を削減し、学習速度を向上させます。 +* `--full_fp16` / `--full_bf16` + * 勾配計算も含めて完全に半精度/bf16で行います。VRAM使用量をさらに削減できますが、学習の安定性に影響する可能性があります。 +* `--gradient_accumulation_steps=N` + * 勾配をNステップ分蓄積してからOptimizerを更新します。実質的なバッチサイズを `train_batch_size * N` に増やし、少ないVRAMで大きなバッチサイズ相当の効果を得られます。デフォルトは1。 +* `--max_grad_norm=N` + * 勾配クリッピングの閾値。勾配のノルムがNを超える場合にクリッピングします。デフォルトは1.0。`0`で無効。 +* `--gradient_checkpointing` + * メモリ使用量を大幅に削減しますが、学習速度は若干低下します。SDXLではメモリ消費が大きいため、有効にすることが推奨されます。 +* `--fused_backward_pass` + * **実験的機能:** 勾配計算とOptimizerのステップを融合し、VRAM使用量を削減します。SDXLで利用可能です。現在 `Adafactor` Optimizerのみ対応。Gradient Accumulationとは併用できません。 +* `--resume=""` + * `--save_state`で保存された学習状態から学習を再開します。Optimizerの状態や学習ステップ数などが復元されます。 + +### 1.6. キャッシュ機能関連 + +SDXLは計算コストが高いため、キャッシュ機能が効果的です。 + +* `--cache_latents` + * VAEの出力(Latent)をメモリにキャッシュします。VAEの計算を省略でき、VRAM使用量を削減し、学習を高速化します。**注意:** 画像に対するAugmentation (`color_aug`, `flip_aug`, `random_crop` 等) は無効になります。 +* `--cache_latents_to_disk` + * `--cache_latents` と併用し、キャッシュ先をディスクにします。大量のデータセットや複数回の学習で特に有効です。初回実行時にディスクにキャッシュが生成され、2回目以降はそれを読み込みます。 +* `--cache_text_encoder_outputs` + * Text Encoderの出力をメモリにキャッシュします。Text Encoderの計算を省略でき、VRAM使用量を削減し、学習を高速化します。**注意:** キャプションに対するAugmentation (`shuffle_caption`, `caption_dropout_rate` 等) は無効になります。**また、このオプションを使用する場合、Text EncoderのLoRAモジュールは学習できません (`--network_train_unet_only` の指定が必須です)。** +* `--cache_text_encoder_outputs_to_disk` + * `--cache_text_encoder_outputs` と併用し、キャッシュ先をディスクにします。 +* `--skip_cache_check` + * キャッシュファイルの内容の検証をスキップします。ファイルの存在確認は行われ、存在しない場合はキャッシュが生成されます。デバッグ等で意図的に再キャッシュしたい場合を除き、通常は指定不要です。 + +### 1.7. サンプル画像生成関連 + +基本的なオプションは `train_network.py` と共通です。 + +* `--sample_every_n_steps=N` / `--sample_every_n_epochs=N` + * Nステップ/エポックごとにサンプル画像を生成します。 +* `--sample_at_first` + * 学習開始前にサンプル画像を生成します。 +* `--sample_prompts="<プロンプトファイル>"` + * サンプル画像生成に使用するプロンプトを記述したファイル (`.txt`, `.toml`, `.json`) を指定します。書式は[gen\_img\_diffusers.py](gen_img_diffusers.py)に準じます。詳細は[ドキュメント](gen_img_README-ja.md)を参照してください。 +* `--sample_sampler="..."` + * サンプル画像生成時のサンプラー(スケジューラ)を指定します。`euler_a`, `dpm++_2m_karras` などが一般的です。選択肢は `--help` を参照してください。 + +### 1.8. Logging & Tracking 関連 + +* `--logging_dir="<ログディレクトリ>"` + * TensorBoardなどのログを出力するディレクトリを指定します。指定しない場合、ログは出力されません。 +* `--log_with="tensorboard"` / `"wandb"` / `"all"` + * 使用するログツールを指定します。`wandb`を使用する場合、`pip install wandb`が必要です。 +* `--log_prefix="<プレフィックス>"` + * `logging_dir` 内に作成されるサブディレクトリ名の接頭辞を指定します。 +* `--wandb_api_key=""` / `--wandb_run_name="<実行名>"` + * Weights & Biases (wandb) 使用時のオプション。 +* `--log_tracker_name` / `--log_tracker_config` + * 高度なトラッカー設定用オプション。通常は指定不要。 +* `--log_config` + * 学習開始時に、使用された学習設定(一部の機密情報を除く)をログに出力します。再現性の確保に役立ちます。 + +### 1.9. 
正則化・高度な学習テクニック関連 + +* `--noise_offset=N` + * ノイズオフセットを有効にし、その値を指定します。画像の明るさやコントラストの偏りを改善する効果が期待できます。SDXLのベースモデルはこの値で学習されているため、有効にすることが推奨されます (例: 0.0357)。 +* `--noise_offset_random_strength` + * ノイズオフセットの強度を0から指定値の間でランダムに変動させます。 +* `--adaptive_noise_scale=N` + * Latentの平均絶対値に応じてノイズオフセットを調整します。`--noise_offset`と併用します。 +* `--multires_noise_iterations=N` / `--multires_noise_discount=D` + * 複数解像度ノイズを有効にします。異なる周波数成分のノイズを加えることで、ディテールの再現性を向上させる効果が期待できます。イテレーション回数N (6-10程度) と割引率D (0.3程度) を指定します。 +* `--ip_noise_gamma=G` / `--ip_noise_gamma_random_strength` + * Input Perturbation Noiseを有効にします。入力(Latent)に微小なノイズを加えて正則化を行います。Gamma値 (0.1程度) を指定します。`random_strength`で強度をランダム化できます。 +* `--min_snr_gamma=N` + * Min-SNR Weighting Strategy を適用します。学習初期のノイズが大きいタイムステップでのLossの重みを調整し、学習を安定させます。`N=5` などが使用されます。 +* `--scale_v_pred_loss_like_noise_pred` + * v-predictionモデルにおいて、vの予測ロスをノイズ予測ロスと同様のスケールに調整します。SDXLはv-predictionではないため、**通常は使用しません**。 +* `--v_pred_like_loss=N` + * ノイズ予測モデルにv予測ライクなロスを追加します。`N`でその重みを指定します。SDXLでは**通常は使用しません**。 +* `--debiased_estimation_loss` + * Debiased EstimationによるLoss計算を行います。Min-SNRと類似の目的を持ちますが、異なるアプローチです。 +* `--loss_type="l1"` / `"l2"` / `"huber"` / `"smooth_l1"` + * 損失関数を指定します。デフォルトは`l2` (MSE)。`huber`や`smooth_l1`は外れ値に頑健な損失関数です。 +* `--huber_schedule="constant"` / `"exponential"` / `"snr"` + * `huber`または`smooth_l1`損失使用時のスケジューリング方法。`snr`が推奨されています。 +* `--huber_c=C` / `--huber_scale=S` + * `huber`または`smooth_l1`損失のパラメータ。 +* `--masked_loss` + * マスク画像に基づいてLoss計算領域を限定します。データセット設定で`conditioning_data_dir`にマスク画像(白黒)を指定する必要があります。詳細は[マスクロスについて](masked_loss_README.md)を参照してください。 + +### 1.10. 分散学習・その他 + +* `--seed=N` + * 乱数シードを指定します。学習の再現性を確保したい場合に設定します。 +* `--max_token_length=N` (`75`, `150`, `225`) + * Text Encoderが処理するトークンの最大長。SDXLでは通常`75` (デフォルト) または `150`, `225`。長くするとより複雑なプロンプトを扱えますが、VRAM使用量が増加します。 +* `--clip_skip=N` + * Text Encoderの最終層からN層スキップした層の出力を使用します。SDXLでは**通常使用しません**。 +* `--lowram` / `--highvram` + * メモリ使用量の最適化に関するオプション。`--lowram`はColabなどRAM < VRAM環境向け、`--highvram`はVRAM潤沢な環境向け。 +* `--persistent_data_loader_workers` / `--max_data_loader_n_workers=N` + * DataLoaderのワーカプロセスに関する設定。エポック間の待ち時間やメモリ使用量に影響します。 +* `--config_file="<設定ファイル>"` / `--output_config` + * コマンドライン引数の代わりに`.toml`ファイルを使用/出力するオプション。 +* **Accelerate/DeepSpeed関連:** (`--ddp_timeout`, `--ddp_gradient_as_bucket_view`, `--ddp_static_graph`) + * 分散学習時の詳細設定。通常はAccelerateの設定 (`accelerate config`) で十分です。DeepSpeedを使用する場合は、別途設定が必要です。 + +## 2. その他のTips + +* **VRAM使用量:** SDXL LoRA学習は多くのVRAMを必要とします。24GB VRAMでも設定によってはメモリ不足になることがあります。以下の設定でVRAM使用量を削減できます。 + * `--mixed_precision="bf16"` または `"fp16"` (必須級) + * `--gradient_checkpointing` (強く推奨) + * `--cache_latents` / `--cache_text_encoder_outputs` (効果大、制約あり) + * `--optimizer_type="AdamW8bit"` または `"Adafactor"` + * `--gradient_accumulation_steps` の値を増やす (バッチサイズを小さくする) + * `--full_fp16` / `--full_bf16` (安定性に注意) + * `--fp8_base` / `--fp8_base_unet` (実験的) + * `--fused_backward_pass` (Adafactor限定、実験的) +* **学習率:** SDXL LoRAの適切な学習率はデータセットや`network_dim`/`alpha`に依存します。`1e-4` ~ `4e-5` (U-Net), `1e-5` ~ `2e-5` (Text Encoders) あたりから試すのが一般的です。 +* **学習時間:** 高解像度データとSDXLモデルのサイズのため、学習には時間がかかります。キャッシュ機能や適切なハードウェアの利用が重要です。 +* **トラブルシューティング:** + * **NaN Loss:** 学習率が高すぎる、混合精度の設定が不適切 (`fp16`時の`--no_half_vae`未指定など)、データセットの問題などが考えられます。 + * **VRAM不足 (OOM):** 上記のVRAM削減策を試してください。 + * **学習が進まない:** 学習率が低すぎる、Optimizer/Schedulerの設定が不適切、データセットの問題などが考えられます。 + +## 3. 
おわりに + +`sdxl_train_network.py` は非常に多くのオプションを提供しており、SDXL LoRA学習の様々な側面をカスタマイズできます。このドキュメントが、より高度な設定やチューニングを行う際の助けとなれば幸いです。 + +不明な点や詳細については、各スクリプトの `--help` オプションや、リポジトリ内の他のドキュメント、実装コード自体を参照してください。 + +--- \ No newline at end of file From 176baa6b95ae1f9eb6fcc02665f4f1966ae47df9 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 16 Apr 2025 12:32:43 +0900 Subject: [PATCH 436/582] doc: update sd3 and sdxl training guides --- docs/sd3_train_network.md | 2 ++ docs/sdxl_train_network_advanced.md | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/sd3_train_network.md b/docs/sd3_train_network.md index 95f7ce621..2911fdf2c 100644 --- a/docs/sd3_train_network.md +++ b/docs/sd3_train_network.md @@ -1,3 +1,5 @@ +ステータス:内容を一通り確認した + # `sd3_train_network.py` を用いたStable Diffusion 3/3.5モデルのLoRA学習ガイド このドキュメントでは、`sd-scripts`リポジトリに含まれる`sd3_train_network.py`を使用して、Stable Diffusion 3 (SD3) および Stable Diffusion 3.5 (SD3.5) モデルに対するLoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 diff --git a/docs/sdxl_train_network_advanced.md b/docs/sdxl_train_network_advanced.md index ca718ad06..a736f0b36 100644 --- a/docs/sdxl_train_network_advanced.md +++ b/docs/sdxl_train_network_advanced.md @@ -1,4 +1,4 @@ -はい、承知いたしました。SDXL LoRA学習スクリプト `sdxl_train_network.py` の熟練した利用者向けの、機能全体の詳細を説明したドキュメントを作成します。 +ステータス:確認中 --- @@ -35,14 +35,14 @@ * `--no_half_vae` * 混合精度(`fp16`/`bf16`)使用時でもVAEを`float32`で動作させます。SDXLのVAEは`float16`で不安定になることがあるため、`fp16`指定時には有効にすることが推奨されます。`bf16`では通常不要です。 * `--fp8_base` / `--fp8_base_unet` - * **実験的機能:** ベースモデル(U-Net, Text Encoder)またはU-NetのみをFP8で読み込み、VRAM使用量を削減します。PyTorch 2.1以上が必要です。詳細は[README](README.md#sd3-lora-training)の関連セクションを参照してください (SD3の説明ですがSDXLにも適用されます)。 + * **実験的機能:** ベースモデル(U-Net, Text Encoder)またはU-NetのみをFP8で読み込み、VRAM使用量を削減します。PyTorch 2.1以上が必要です。詳細は TODO 後でドキュメントを追加 の関連セクションを参照してください (SD3の説明ですがSDXLにも適用されます)。 ### 1.2. データセット設定関連 -* `--dataset_config="<設定ファイルのパス>"` **[必須]** +* `--dataset_config="<設定ファイルのパス>"` * データセットの設定を記述した`.toml`ファイルを指定します。SDXLでは高解像度データとバケツ機能(`.toml` で `enable_bucket = true` を指定)の利用が一般的です。 * `.toml`ファイルの書き方の詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください。 - * アスペクト比バケツの解像度ステップ(`bucket_reso_steps`)は、SDXLでは8の倍数(例: 64)が一般的です。 + * アスペクト比バケツの解像度ステップ(`bucket_reso_steps`)は、SDXLでは32の倍数とする必要があります。 ### 1.3. 出力・保存関連 @@ -64,7 +64,7 @@ * `--no_metadata` * 出力モデルにメタデータを保存しません。 * `--save_state_to_huggingface` / `--huggingface_repo_id` など - * Hugging Face Hubへのモデルやstateのアップロード関連オプション。詳細は[train\_util.py](train_util.py)や[huggingface\_util.py](huggingface_util.py)を参照してください。 + * Hugging Face Hubへのモデルやstateのアップロード関連オプション。詳細は TODO ドキュメントを追加 を参照してください。 ### 1.4. ネットワークパラメータ (LoRA) @@ -97,14 +97,14 @@ * `--network_train_text_encoder_only` * Text EncoderのLoRAモジュールのみを学習します。U-Netの学習を行わない場合に指定します。 * `--network_weights="<重みファイル>"` - * 学習済みのLoRA重みを読み込んで学習を開始します。ファインチューニングや学習再開に使用します。 + * 学習済みのLoRA重みを読み込んで学習を開始します。ファインチューニングや学習再開に使用します。`--resume` との違いは、このオプションはLoRAモジュールの重みのみを読み込み、`--resume` はOptimizerの状態や学習ステップ数なども復元します。 * `--dim_from_weights` * `--network_weights` で指定した重みファイルからLoRAの次元数 (`dim`) を自動的に読み込みます。`--network_dim` の指定は不要になります。 ### 1.5. 
学習パラメータ * `--learning_rate=LR` - * 全体の学習率。各モジュール(`unet_lr`, `text_encoder_lr1`, `text_encoder_lr2`)のデフォルト値となります。`1e-4` や `4e-5` などが試されることが多いです。 + * 全体の学習率。各モジュール(`unet_lr`, `text_encoder_lr1`, `text_encoder_lr2`)のデフォルト値となります。`1e-3` や `1e-4` などが試されることが多いです。 * `--unet_lr=LR_U` * U-Net部分のLoRAモジュールの学習率。 * `--text_encoder_lr1=LR_TE1` @@ -120,7 +120,7 @@ * `--optimizer_args ...` * Optimizerへの追加引数を `key=value` 形式で指定します (例: `"weight_decay=0.01"` `"betas=0.9,0.999"`). * `--lr_scheduler="..."` - * 学習率スケジューラを指定します。`constant` (変化なし), `cosine` (コサインカーブ), `linear` (線形減衰), `constant_with_warmup` (ウォームアップ付き定数), `cosine_with_restarts` など。`cosine` や `constant_with_warmup` がよく使われます。 + * 学習率スケジューラを指定します。`constant` (変化なし), `cosine` (コサインカーブ), `linear` (線形減衰), `constant_with_warmup` (ウォームアップ付き定数), `cosine_with_restarts` など。`constant` や `cosine` 、 `constant_with_warmup` がよく使われます。 * スケジューラによっては追加の引数が必要です (`--lr_scheduler_args`参照)。 * `DAdaptation` や `Prodigy` などの自己学習率調整機能付きOptimizerを使用する場合、スケジューラは不要です (`constant` を指定)。 * `--lr_warmup_steps=N` @@ -132,7 +132,7 @@ * `--mixed_precision="bf16"` / `"fp16"` / `"no"` * 混合精度学習の設定。SDXLでは `bf16` (対応GPUの場合) または `fp16` の使用が強く推奨されます。VRAM使用量を削減し、学習速度を向上させます。 * `--full_fp16` / `--full_bf16` - * 勾配計算も含めて完全に半精度/bf16で行います。VRAM使用量をさらに削減できますが、学習の安定性に影響する可能性があります。 + * 勾配計算も含めて完全に半精度/bf16で行います。VRAM使用量をさらに削減できますが、学習の安定性に影響する可能性があります。VRAMがどうしても足りない場合に使用します。 * `--gradient_accumulation_steps=N` * 勾配をNステップ分蓄積してからOptimizerを更新します。実質的なバッチサイズを `train_batch_size * N` に増やし、少ないVRAMで大きなバッチサイズ相当の効果を得られます。デフォルトは1。 * `--max_grad_norm=N` @@ -190,13 +190,13 @@ SDXLは計算コストが高いため、キャッシュ機能が効果的です ### 1.9. 正則化・高度な学習テクニック関連 * `--noise_offset=N` - * ノイズオフセットを有効にし、その値を指定します。画像の明るさやコントラストの偏りを改善する効果が期待できます。SDXLのベースモデルはこの値で学習されているため、有効にすることが推奨されます (例: 0.0357)。 + * ノイズオフセットを有効にし、その値を指定します。画像の明るさやコントラストの偏りを改善する効果が期待できます。SDXLのベースモデルはこの値で学習されているため、有効にすることが推奨されます (例: 0.0357)。元々の技術解説は[こちら](https://www.crosslabs.org/blog/diffusion-with-offset-noise)。 * `--noise_offset_random_strength` * ノイズオフセットの強度を0から指定値の間でランダムに変動させます。 * `--adaptive_noise_scale=N` * Latentの平均絶対値に応じてノイズオフセットを調整します。`--noise_offset`と併用します。 * `--multires_noise_iterations=N` / `--multires_noise_discount=D` - * 複数解像度ノイズを有効にします。異なる周波数成分のノイズを加えることで、ディテールの再現性を向上させる効果が期待できます。イテレーション回数N (6-10程度) と割引率D (0.3程度) を指定します。 + * 複数解像度ノイズを有効にします。異なる周波数成分のノイズを加えることで、ディテールの再現性を向上させる効果が期待できます。イテレーション回数N (6-10程度) と割引率D (0.3程度) を指定します。技術解説は[こちら](https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2)。 * `--ip_noise_gamma=G` / `--ip_noise_gamma_random_strength` * Input Perturbation Noiseを有効にします。入力(Latent)に微小なノイズを加えて正則化を行います。Gamma値 (0.1程度) を指定します。`random_strength`で強度をランダム化できます。 * `--min_snr_gamma=N` From 629073cd9dd21296ca8aa97a5267d4dc7f6e5fdb Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 16 Apr 2025 21:50:36 +0900 Subject: [PATCH 437/582] Add guidance scale for prompt param and flux sampling --- library/flux_train_utils.py | 10 +++++++--- library/train_util.py | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index ce3818292..d2ff347da 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -154,6 +154,7 @@ def sample_image_inference( sample_steps = prompt_dict.get("sample_steps", 20) width = prompt_dict.get("width", 512) height = prompt_dict.get("height", 512) + guidance_scale = prompt_dict.get("guidance_scale", args.guidance_scale) scale = prompt_dict.get("scale", 1.0) # 1.0 means no guidance seed = 
prompt_dict.get("seed") controlnet_image = prompt_dict.get("controlnet_image") @@ -180,9 +181,12 @@ def sample_image_inference( logger.info(f"prompt: {prompt}") if scale != 1.0: logger.info(f"negative_prompt: {negative_prompt}") + elif negative_prompt != "": + logger.info(f"negative prompt is ignored because scale is 1.0") logger.info(f"height: {height}") logger.info(f"width: {width}") logger.info(f"sample_steps: {sample_steps}") + logger.info(f"guidance_scale: {guidance_scale}") if scale != 1.0: logger.info(f"scale: {scale}") # logger.info(f"sample_sampler: {sampler_name}") @@ -256,7 +260,7 @@ def encode_prompt(prpt): txt_ids, l_pooled, timesteps=timesteps, - guidance=scale, + guidance=guidance_scale, t5_attn_mask=t5_attn_mask, controlnet=controlnet, controlnet_img=controlnet_image, @@ -489,7 +493,7 @@ def get_noisy_model_input_and_timesteps( sigmas = torch.randn(bsz, device=device) sigmas = sigmas * args.sigmoid_scale # larger scale for more uniform sampling sigmas = sigmas.sigmoid() - mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2)) # we are pre-packed so must adjust for packed size + mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2)) # we are pre-packed so must adjust for packed size sigmas = time_shift(mu, 1.0, sigmas) timesteps = sigmas * num_timesteps else: @@ -514,7 +518,7 @@ def get_noisy_model_input_and_timesteps( if args.ip_noise_gamma: xi = torch.randn_like(latents, device=latents.device, dtype=dtype) if args.ip_noise_gamma_random_strength: - ip_noise_gamma = (torch.rand(1, device=latents.device, dtype=dtype) * args.ip_noise_gamma) + ip_noise_gamma = torch.rand(1, device=latents.device, dtype=dtype) * args.ip_noise_gamma else: ip_noise_gamma = args.ip_noise_gamma noisy_model_input = (1.0 - sigmas) * latents + sigmas * (noise + ip_noise_gamma * xi) diff --git a/library/train_util.py b/library/train_util.py index 6c39f8d98..e152f30f7 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -6178,6 +6178,11 @@ def line_to_prompt_dict(line: str) -> dict: prompt_dict["scale"] = float(m.group(1)) continue + m = re.match(r"g ([\d\.]+)", parg, re.IGNORECASE) + if m: # guidance scale + prompt_dict["guidance_scale"] = float(m.group(1)) + continue + m = re.match(r"n (.+)", parg, re.IGNORECASE) if m: # negative prompt prompt_dict["negative_prompt"] = m.group(1) From 7c61c0dfe0e879fd6b66ccb70273e4b99deaf1c5 Mon Sep 17 00:00:00 2001 From: saibit Date: Tue, 22 Apr 2025 16:06:55 +0800 Subject: [PATCH 438/582] Add autocast warpper for forward functions in deepspeed_utils.py to try aligning precision when using mixed precision in training process --- library/deepspeed_utils.py | 32 ++++++++++++++++++++++++++++++++ library/flux_models.py | 2 +- library/train_util.py | 5 +++++ requirements.txt | 3 ++- 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/library/deepspeed_utils.py b/library/deepspeed_utils.py index 99a7b2b3b..3018def74 100644 --- a/library/deepspeed_utils.py +++ b/library/deepspeed_utils.py @@ -94,6 +94,7 @@ def prepare_deepspeed_plugin(args: argparse.Namespace): deepspeed_plugin.deepspeed_config["train_batch_size"] = ( args.train_batch_size * args.gradient_accumulation_steps * int(os.environ["WORLD_SIZE"]) ) + deepspeed_plugin.set_mixed_precision(args.mixed_precision) if args.mixed_precision.lower() == "fp16": deepspeed_plugin.deepspeed_config["fp16"]["initial_scale_power"] = 0 # preventing overflow. 
@@ -122,18 +123,49 @@ def prepare_deepspeed_model(args: argparse.Namespace, **models): class DeepSpeedWrapper(torch.nn.Module): def __init__(self, **kw_models) -> None: super().__init__() + self.models = torch.nn.ModuleDict() + + warp_model_forward_with_torch_autocast = args.mixed_precision is not "no" for key, model in kw_models.items(): if isinstance(model, list): model = torch.nn.ModuleList(model) + assert isinstance( model, torch.nn.Module ), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}" + + if warp_model_forward_with_torch_autocast: + model = self.__warp_with_torch_autocast(model) + self.models.update(torch.nn.ModuleDict({key: model})) + def __warp_with_torch_autocast(self, model): + if isinstance(model, torch.nn.ModuleList): + for i in range(len(model)): + model[i] = self.__warp_model_forward_with_torch_autocast(model[i]) + else: + model = self.__warp_model_forward_with_torch_autocast(model) + return model + + def __warp_model_forward_with_torch_autocast(self, model): + + assert hasattr(model, "forward"), f"model must have a forward method." + + forward_fn = model.forward + + def forward(*args, **kwargs): + device_type= "cuda" if torch.cuda.is_available() else "cpu" + with torch.autocast(device_type=device_type): + return forward_fn(*args, **kwargs) + model.forward = forward + + return model + def get_models(self): return self.models + ds_model = DeepSpeedWrapper(**models) return ds_model diff --git a/library/flux_models.py b/library/flux_models.py index 328ad481d..12151ee86 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1005,7 +1005,7 @@ def prepare_block_swap_before_forward(self): return self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) - + def forward( self, img: Tensor, diff --git a/library/train_util.py b/library/train_util.py index 6c39f8d98..dbbfda3ec 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5495,6 +5495,11 @@ def load_target_model(args, weight_dtype, accelerator, unet_use_linear_projectio def patch_accelerator_for_fp16_training(accelerator): + + from accelerate import DistributedType + if accelerator.distributed_type == DistributedType.DEEPSPEED: + return + org_unscale_grads = accelerator.scaler._unscale_grads_ def _unscale_grads_replacer(optimizer, inv_scale, found_inf, allow_fp16): diff --git a/requirements.txt b/requirements.txt index 767d9e8eb..bead3f90c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ accelerate==0.33.0 transformers==4.44.0 -diffusers[torch]==0.25.0 +diffusers==0.25.0 +deepspeed==0.16.7 ftfy==6.1.1 # albumentations==1.3.0 opencv-python==4.8.1.78 From d33d5eccd16970e489359ee02b89a6259559e4b9 Mon Sep 17 00:00:00 2001 From: saibit Date: Tue, 22 Apr 2025 16:12:06 +0800 Subject: [PATCH 439/582] # --- library/flux_models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/library/flux_models.py b/library/flux_models.py index 12151ee86..d7840d51c 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1004,8 +1004,7 @@ def prepare_block_swap_before_forward(self): if self.blocks_to_swap is None or self.blocks_to_swap == 0: return self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) - self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) - + self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) def forward( self, img: Tensor, From 
7f984f47758f9e17f4a82b92cb9dbc97b3ba982f Mon Sep 17 00:00:00 2001 From: saibit Date: Tue, 22 Apr 2025 16:15:12 +0800 Subject: [PATCH 440/582] # --- library/flux_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/library/flux_models.py b/library/flux_models.py index d7840d51c..328ad481d 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1004,7 +1004,8 @@ def prepare_block_swap_before_forward(self): if self.blocks_to_swap is None or self.blocks_to_swap == 0: return self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) - self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) + self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) + def forward( self, img: Tensor, From c8af252a44a7dbc54a0c1622946faedef4e7c52b Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 22 Apr 2025 16:19:14 +0800 Subject: [PATCH 441/582] refactor --- library/deepspeed_utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/library/deepspeed_utils.py b/library/deepspeed_utils.py index 3018def74..f6eac3679 100644 --- a/library/deepspeed_utils.py +++ b/library/deepspeed_utils.py @@ -126,7 +126,7 @@ def __init__(self, **kw_models) -> None: self.models = torch.nn.ModuleDict() - warp_model_forward_with_torch_autocast = args.mixed_precision is not "no" + wrap_model_forward_with_torch_autocast = args.mixed_precision is not "no" for key, model in kw_models.items(): if isinstance(model, list): @@ -136,31 +136,30 @@ def __init__(self, **kw_models) -> None: model, torch.nn.Module ), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}" - if warp_model_forward_with_torch_autocast: - model = self.__warp_with_torch_autocast(model) + if wrap_model_forward_with_torch_autocast: + model = self.__wrap_model_with_torch_autocast(model) self.models.update(torch.nn.ModuleDict({key: model})) - def __warp_with_torch_autocast(self, model): + def __wrap_model_with_torch_autocast(self, model): if isinstance(model, torch.nn.ModuleList): - for i in range(len(model)): - model[i] = self.__warp_model_forward_with_torch_autocast(model[i]) + model = [self.__wrap_model_forward_with_torch_autocast(m) for m in model] else: - model = self.__warp_model_forward_with_torch_autocast(model) + model = self.__wrap_model_forward_with_torch_autocast(model) return model - def __warp_model_forward_with_torch_autocast(self, model): + def __wrap_model_forward_with_torch_autocast(self, model): assert hasattr(model, "forward"), f"model must have a forward method." 
forward_fn = model.forward def forward(*args, **kwargs): - device_type= "cuda" if torch.cuda.is_available() else "cpu" + device_type = "cuda" if torch.cuda.is_available() else "cpu" with torch.autocast(device_type=device_type): return forward_fn(*args, **kwargs) + model.forward = forward - return model def get_models(self): From 899f3454b6f92b48a4d5780549edd92a6bc9db49 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Wed, 23 Apr 2025 15:47:12 +0800 Subject: [PATCH 442/582] update for init problem --- library/train_util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index ba6e4cb9b..4babb8db8 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2174,7 +2174,8 @@ def __init__( debug_dataset: bool, validation_seed: int, validation_split: float, - resize_interpolation: Optional[str], + system_prompt: Optional[str] = None, + resize_interpolation: Optional[str] = None, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) @@ -2402,7 +2403,8 @@ def __init__( bucket_no_upscale: bool, debug_dataset: bool, validation_split: float, - validation_seed: Optional[int], + validation_seed: Optional[int], + system_prompt: Optional[str] = None, resize_interpolation: Optional[str] = None, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) From 4fc917821ac014972538888f5cf59d9dd1df502b Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Wed, 23 Apr 2025 16:16:36 +0800 Subject: [PATCH 443/582] fix bugs --- library/train_util.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 4babb8db8..e2d0d1750 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1883,8 +1883,8 @@ def __init__( debug_dataset: bool, validation_split: float, validation_seed: Optional[int], - system_prompt: Optional[str], - resize_interpolation: Optional[str], + system_prompt: Optional[str] = None, + resize_interpolation: Optional[str] = None, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) @@ -2458,6 +2458,7 @@ def __init__( debug_dataset, validation_split, validation_seed, + system_prompt, resize_interpolation, ) From adb775c6165d93a856e33d0d9058efd629cf2a2d Mon Sep 17 00:00:00 2001 From: saibit Date: Wed, 23 Apr 2025 17:05:20 +0800 Subject: [PATCH 444/582] Update: requirement diffusers[torch]==0.25.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bead3f90c..9e97eed3e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ accelerate==0.33.0 transformers==4.44.0 -diffusers==0.25.0 +diffusers[torch]==0.25.0 deepspeed==0.16.7 ftfy==6.1.1 # albumentations==1.3.0 From abf2c44bc5650afef8bebbb1ef278c66f44c4dda Mon Sep 17 00:00:00 2001 From: sharlynxy Date: Wed, 23 Apr 2025 18:57:19 +0800 Subject: [PATCH 445/582] Dynamically set device in deepspeed wrapper (#2) * get device type from model * add logger warning * format * format * format --- library/deepspeed_utils.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/library/deepspeed_utils.py b/library/deepspeed_utils.py index f6eac3679..09c6f7b99 100644 --- a/library/deepspeed_utils.py +++ b/library/deepspeed_utils.py @@ -5,6 +5,8 @@ from .utils import setup_logging +from .device_utils import get_preferred_device + setup_logging() import 
logging @@ -153,13 +155,21 @@ def __wrap_model_forward_with_torch_autocast(self, model): assert hasattr(model, "forward"), f"model must have a forward method." forward_fn = model.forward - + def forward(*args, **kwargs): - device_type = "cuda" if torch.cuda.is_available() else "cpu" - with torch.autocast(device_type=device_type): + try: + device_type = model.device.type + except AttributeError: + logger.warning( + "[DeepSpeed] model.device is not available. Using get_preferred_device() " + "to determine the device_type for torch.autocast()." + ) + device_type = get_preferred_device().type + + with torch.autocast(device_type = device_type): return forward_fn(*args, **kwargs) - model.forward = forward + model.forward = forward return model def get_models(self): From 46ad3be0593df1df9d485c3ac2efb5aebd87730c Mon Sep 17 00:00:00 2001 From: saibit Date: Thu, 24 Apr 2025 11:26:36 +0800 Subject: [PATCH 446/582] update deepspeed wrapper --- library/deepspeed_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/library/deepspeed_utils.py b/library/deepspeed_utils.py index 09c6f7b99..a8a05c3a1 100644 --- a/library/deepspeed_utils.py +++ b/library/deepspeed_utils.py @@ -134,18 +134,18 @@ def __init__(self, **kw_models) -> None: if isinstance(model, list): model = torch.nn.ModuleList(model) + if wrap_model_forward_with_torch_autocast: + model = self.__wrap_model_with_torch_autocast(model) + assert isinstance( model, torch.nn.Module ), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}" - if wrap_model_forward_with_torch_autocast: - model = self.__wrap_model_with_torch_autocast(model) - self.models.update(torch.nn.ModuleDict({key: model})) def __wrap_model_with_torch_autocast(self, model): if isinstance(model, torch.nn.ModuleList): - model = [self.__wrap_model_forward_with_torch_autocast(m) for m in model] + model = torch.nn.ModuleList([self.__wrap_model_forward_with_torch_autocast(m) for m in model]) else: model = self.__wrap_model_forward_with_torch_autocast(model) return model From 8387e0b95c1067e919f91a2abec11ddcd5ed15cb Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 27 Apr 2025 18:25:59 +0900 Subject: [PATCH 447/582] docs: update README to include CFG scale support in FLUX.1 training --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e80a6974..f9831aeee 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ The command to install PyTorch is as follows: ### Recent Updates +Apr 27, 2025: +- FLUX.1 training now supports CFG scale in the sample generation during training. Please use `--g` option, to specify the CFG scale (note that `--l` is used as the embedded guidance scale.) PR [#2064](https://github.com/kohya-ss/sd-scripts/pull/2064). + - See [here](#sample-image-generation-during-training) for details. + Apr 6, 2025: - IP noise gamma has been enabled in FLUX.1. Thanks to rockerBOO for PR [#1992](https://github.com/kohya-ss/sd-scripts/pull/1992). See the PR for details. - `--ip_noise_gamma` and `--ip_noise_gamma_random_strength` are available. @@ -1344,11 +1348,13 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b Lines beginning with `#` are comments. You can specify options for the generated image with options like `--n` after the prompt. The following can be used. - * `--n` Negative prompt up to the next option. + * `--n` Negative prompt up to the next option. Ignored when CFG scale is `1.0`. 
* `--w` Specifies the width of the generated image. * `--h` Specifies the height of the generated image. * `--d` Specifies the seed of the generated image. * `--l` Specifies the CFG scale of the generated image. + * In guidance distillation models like FLUX.1, this value is used as the embedded guidance scale for backward compatibility. + * `--g` Specifies the CFG scale for the models with embedded guidance scale. The default is `1.0`, `1.0` means no CFG. In general, should not be changed unless you train the un-distilled FLUX.1 models. * `--s` Specifies the number of steps in the generation. The prompt weighting such as `( )` and `[ ]` are working. From fd3a445769910ddc0c8c02d13e535cac37b85d2e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 27 Apr 2025 22:50:27 +0900 Subject: [PATCH 448/582] fix: revert default emb guidance scale and CFG scale for FLUX.1 sampling --- library/flux_train_utils.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index d2ff347da..5f6867a81 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -154,8 +154,9 @@ def sample_image_inference( sample_steps = prompt_dict.get("sample_steps", 20) width = prompt_dict.get("width", 512) height = prompt_dict.get("height", 512) - guidance_scale = prompt_dict.get("guidance_scale", args.guidance_scale) - scale = prompt_dict.get("scale", 1.0) # 1.0 means no guidance + # TODO refactor variable names + cfg_scale = prompt_dict.get("guidance_scale", 1.0) + emb_guidance_scale = prompt_dict.get("scale", 3.5) seed = prompt_dict.get("seed") controlnet_image = prompt_dict.get("controlnet_image") prompt: str = prompt_dict.get("prompt", "") @@ -179,16 +180,16 @@ def sample_image_inference( height = max(64, height - height % 16) # round to divisible by 16 width = max(64, width - width % 16) # round to divisible by 16 logger.info(f"prompt: {prompt}") - if scale != 1.0: + if cfg_scale != 1.0: logger.info(f"negative_prompt: {negative_prompt}") elif negative_prompt != "": logger.info(f"negative prompt is ignored because scale is 1.0") logger.info(f"height: {height}") logger.info(f"width: {width}") logger.info(f"sample_steps: {sample_steps}") - logger.info(f"guidance_scale: {guidance_scale}") - if scale != 1.0: - logger.info(f"scale: {scale}") + logger.info(f"embedded guidance scale: {emb_guidance_scale}") + if cfg_scale != 1.0: + logger.info(f"CFG scale: {cfg_scale}") # logger.info(f"sample_sampler: {sampler_name}") if seed is not None: logger.info(f"seed: {seed}") @@ -220,12 +221,12 @@ def encode_prompt(prpt): l_pooled, t5_out, txt_ids, t5_attn_mask = encode_prompt(prompt) # encode negative prompts - if scale != 1.0: + if cfg_scale != 1.0: neg_l_pooled, neg_t5_out, _, neg_t5_attn_mask = encode_prompt(negative_prompt) neg_t5_attn_mask = ( neg_t5_attn_mask.to(accelerator.device) if args.apply_t5_attn_mask and neg_t5_attn_mask is not None else None ) - neg_cond = (scale, neg_l_pooled, neg_t5_out, neg_t5_attn_mask) + neg_cond = (cfg_scale, neg_l_pooled, neg_t5_out, neg_t5_attn_mask) else: neg_cond = None @@ -260,7 +261,7 @@ def encode_prompt(prpt): txt_ids, l_pooled, timesteps=timesteps, - guidance=guidance_scale, + guidance=emb_guidance_scale, t5_attn_mask=t5_attn_mask, controlnet=controlnet, controlnet_img=controlnet_image, From 29523c9b68bd56cdb1cce3f4985f2e45cefb1f2b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 27 Apr 2025 23:34:37 +0900 Subject: [PATCH 449/582] docs: add note for user feedback on CFG scale in FLUX.1 
training --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f9831aeee..18e8e6591 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ The command to install PyTorch is as follows: Apr 27, 2025: - FLUX.1 training now supports CFG scale in the sample generation during training. Please use `--g` option, to specify the CFG scale (note that `--l` is used as the embedded guidance scale.) PR [#2064](https://github.com/kohya-ss/sd-scripts/pull/2064). - See [here](#sample-image-generation-during-training) for details. + - If you have any issues with this, please let us know. Apr 6, 2025: - IP noise gamma has been enabled in FLUX.1. Thanks to rockerBOO for PR [#1992](https://github.com/kohya-ss/sd-scripts/pull/1992). See the PR for details. From 1684ababcd7fc4259c77f1471ef41d10e612a721 Mon Sep 17 00:00:00 2001 From: sharlynxy Date: Wed, 30 Apr 2025 19:51:09 +0800 Subject: [PATCH 450/582] remove deepspeed from requirements.txt --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9e97eed3e..767d9e8eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ accelerate==0.33.0 transformers==4.44.0 diffusers[torch]==0.25.0 -deepspeed==0.16.7 ftfy==6.1.1 # albumentations==1.3.0 opencv-python==4.8.1.78 From a4fae93dce5b78a0c92ee328d6b2dd96be944a7d Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 1 May 2025 00:55:10 -0400 Subject: [PATCH 451/582] Add pythonpath to pytest.ini --- pytest.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/pytest.ini b/pytest.ini index 484d3aef6..34b7e9c1f 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,3 +6,4 @@ filterwarnings = ignore::DeprecationWarning ignore::UserWarning ignore::FutureWarning +pythonpath = . From f62c68df3c96639e83dcb7f5062d48c3067055e1 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 1 May 2025 01:37:57 -0400 Subject: [PATCH 452/582] Make grad_norm and combined_grad_norm None is not recording --- networks/lora_flux.py | 12 ++++++------ train_network.py | 6 ++++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 92b3979ae..0b30f1b8a 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -955,26 +955,26 @@ def update_grad_norms(self): for lora in self.text_encoder_loras + self.unet_loras: lora.update_grad_norms() - def grad_norms(self) -> Tensor: + def grad_norms(self) -> Tensor | None: grad_norms = [] for lora in self.text_encoder_loras + self.unet_loras: if hasattr(lora, "grad_norms") and lora.grad_norms is not None: grad_norms.append(lora.grad_norms.mean(dim=0)) - return torch.stack(grad_norms) if len(grad_norms) > 0 else torch.tensor([]) + return torch.stack(grad_norms) if len(grad_norms) > 0 else None - def weight_norms(self) -> Tensor: + def weight_norms(self) -> Tensor | None: weight_norms = [] for lora in self.text_encoder_loras + self.unet_loras: if hasattr(lora, "weight_norms") and lora.weight_norms is not None: weight_norms.append(lora.weight_norms.mean(dim=0)) - return torch.stack(weight_norms) if len(weight_norms) > 0 else torch.tensor([]) + return torch.stack(weight_norms) if len(weight_norms) > 0 else None - def combined_weight_norms(self) -> Tensor: + def combined_weight_norms(self) -> Tensor | None: combined_weight_norms = [] for lora in self.text_encoder_loras + self.unet_loras: if hasattr(lora, "combined_weight_norms") and lora.combined_weight_norms is not None: combined_weight_norms.append(lora.combined_weight_norms.mean(dim=0)) - return 
torch.stack(combined_weight_norms) if len(combined_weight_norms) > 0 else torch.tensor([]) + return torch.stack(combined_weight_norms) if len(combined_weight_norms) > 0 else None def load_weights(self, file): diff --git a/train_network.py b/train_network.py index d6bc66ed8..2b4e6d3fd 100644 --- a/train_network.py +++ b/train_network.py @@ -1444,8 +1444,10 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen else: if hasattr(network, "weight_norms"): mean_norm = network.weight_norms().mean().item() - mean_grad_norm = network.grad_norms().mean().item() - mean_combined_norm = network.combined_weight_norms().mean().item() + grad_norms = network.grad_norms() + mean_grad_norm = grad_norms.mean().item() if grad_norms is not None else None + combined_weight_norms = network.combined_weight_norms() + mean_combined_norm = combined_weight_norms.mean().item() if combined_weight_norms is not None else None weight_norms = network.weight_norms() maximum_norm = weight_norms.max().item() if weight_norms.numel() > 0 else None keys_scaled = None From b4a89c3cdf7319b6840f1e4a28a5a1001643bc22 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 1 May 2025 02:03:22 -0400 Subject: [PATCH 453/582] Fix None --- train_network.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/train_network.py b/train_network.py index 2b4e6d3fd..1336a0b19 100644 --- a/train_network.py +++ b/train_network.py @@ -1443,13 +1443,13 @@ def restore_rng_state(rng_states: tuple[torch.ByteTensor, Optional[torch.ByteTen max_mean_logs = {"Keys Scaled": keys_scaled, "Average key norm": mean_norm} else: if hasattr(network, "weight_norms"): - mean_norm = network.weight_norms().mean().item() + weight_norms = network.weight_norms() + mean_norm = weight_norms.mean().item() if weight_norms is not None else None grad_norms = network.grad_norms() mean_grad_norm = grad_norms.mean().item() if grad_norms is not None else None combined_weight_norms = network.combined_weight_norms() mean_combined_norm = combined_weight_norms.mean().item() if combined_weight_norms is not None else None - weight_norms = network.weight_norms() - maximum_norm = weight_norms.max().item() if weight_norms.numel() > 0 else None + maximum_norm = weight_norms.max().item() if weight_norms is not None else None keys_scaled = None max_mean_logs = {} else: From 865c8d55e2b8cd9f0b6008a6d4ee4a07949d9acc Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 1 May 2025 23:29:19 +0900 Subject: [PATCH 454/582] README.md: Update recent updates and add DeepSpeed installation instructions --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 18e8e6591..13c2320cc 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ The command to install PyTorch is as follows: ### Recent Updates +May 1, 2025: +- The error when training FLUX.1 with mixed precision in flux_train.py with DeepSpeed enabled has been resolved. Thanks to sharlynxy for PR [#2060](https://github.com/kohya-ss/sd-scripts/pull/2060). Please refer to the PR for details. + - If you enable DeepSpeed, please install deepseed with `pip install deepspeed==0.16.7`. + Apr 27, 2025: - FLUX.1 training now supports CFG scale in the sample generation during training. Please use `--g` option, to specify the CFG scale (note that `--l` is used as the embedded guidance scale.) PR [#2064](https://github.com/kohya-ss/sd-scripts/pull/2064). - See [here](#sample-image-generation-during-training) for details. 
@@ -875,6 +879,14 @@ Note: Some user reports ``ValueError: fp16 mixed precision requires a GPU`` is o (Single GPU with id `0` will be used.) +## DeepSpeed installation (experimental, Linux or WSL2 only) + +To install DeepSpeed, run the following command in your activated virtual environment: + +```bash +pip install deepspeed==0.16.7 +``` + ## Upgrade When a new release comes out you can upgrade your repo with the following command: From a27ace74d96d9519629283f4ff3d207c1ad8d98e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 1 May 2025 23:31:23 +0900 Subject: [PATCH 455/582] doc: add DeepSpeed installation in header section --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 13c2320cc..497969ab4 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,8 @@ __Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchv The command to install PyTorch is as follows: `pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed==0.16.7`. + - [FLUX.1 training](#flux1-training) - [SD3 training](#sd3-training) @@ -16,7 +18,7 @@ The command to install PyTorch is as follows: May 1, 2025: - The error when training FLUX.1 with mixed precision in flux_train.py with DeepSpeed enabled has been resolved. Thanks to sharlynxy for PR [#2060](https://github.com/kohya-ss/sd-scripts/pull/2060). Please refer to the PR for details. - - If you enable DeepSpeed, please install deepseed with `pip install deepspeed==0.16.7`. + - If you enable DeepSpeed, please install DeepSpeed with `pip install deepspeed==0.16.7`. Apr 27, 2025: - FLUX.1 training now supports CFG scale in the sample generation during training. Please use `--g` option, to specify the CFG scale (note that `--l` is used as the embedded guidance scale.) PR [#2064](https://github.com/kohya-ss/sd-scripts/pull/2064). From 19a180ff909e5f6deb46d2cac5b22615df6ad9b1 Mon Sep 17 00:00:00 2001 From: "Kohya S." <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 17 May 2025 14:28:26 +0900 Subject: [PATCH 456/582] Add English versions with Japanese in details --- docs/flux_train_network.md | 84 ++++++++++- docs/sd3_train_network.md | 209 ++++++++++++++++++++-------- docs/sdxl_train_network_advanced.md | 89 +++++++++++- 3 files changed, 323 insertions(+), 59 deletions(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 46eee3e7e..4d7c2d46f 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -1,3 +1,85 @@ +Status: reviewed + +# LoRA Training Guide for FLUX.1 using `flux_train_network.py` / `flux_train_network.py` を用いたFLUX.1モデルのLoRA学習ガイド + +This document explains how to train LoRA models for the FLUX.1 model using `flux_train_network.py` included in the `sd-scripts` repository. + +## 1. Introduction / はじめに + +`flux_train_network.py` trains additional networks such as LoRA on the FLUX.1 model, which uses a transformer-based architecture different from Stable Diffusion. Two text encoders, CLIP-L and T5-XXL, and a dedicated AutoEncoder are used. + +This guide assumes you know the basics of LoRA training. For common options see [train_network.py](train_network.md) and [sdxl_train_network.py](sdxl_train_network.md). + +**Prerequisites:** + +* The repository is cloned and the Python environment is ready. +* A training dataset is prepared. See the dataset configuration guide. + +## 2. 
Differences from `train_network.py` / `train_network.py` との違い + +`flux_train_network.py` is based on `train_network.py` but adapted for FLUX.1. Main differences include required arguments for the FLUX.1 model, CLIP-L, T5-XXL and AE, different model structure, and some incompatible options from Stable Diffusion. + +## 3. Preparation / 準備 + +Before starting training you need: + +1. **Training script:** `flux_train_network.py` +2. **FLUX.1 model file** and text encoder files (`clip_l`, `t5xxl`) and AE file. +3. **Dataset definition file (.toml)** such as `my_flux_dataset_config.toml`. + +## 4. Running the Training / 学習の実行 + +Run `flux_train_network.py` from the terminal with FLUX.1 specific arguments. Example: + +```bash +accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py \ + --pretrained_model_name_or_path="" \ + --clip_l="" \ + --t5xxl="" \ + --ae="" \ + --dataset_config="my_flux_dataset_config.toml" \ + --output_dir="" \ + --output_name="my_flux_lora" \ + --save_model_as=safetensors \ + --network_module=networks.lora_flux \ + --network_dim=16 \ + --network_alpha=1 \ + --learning_rate=1e-4 \ + --optimizer_type="AdamW8bit" \ + --lr_scheduler="constant" \ + --sdpa \ + --max_train_epochs=10 \ + --save_every_n_epochs=1 \ + --mixed_precision="fp16" \ + --gradient_checkpointing \ + --guidance_scale=1.0 \ + --timestep_sampling="flux_shift" \ + --blocks_to_swap=18 \ + --cache_text_encoder_outputs \ + --cache_latents +``` + +### 4.1. Explanation of Key Options / 主要なコマンドライン引数の解説 + +The script adds FLUX.1 specific arguments such as guidance scale, timestep sampling, block swapping, and options for training CLIP-L and T5-XXL LoRA modules. Some Stable Diffusion options like `--v2` and `--clip_skip` are not used. + +### 4.2. Starting Training / 学習の開始 + +Training begins once you run the command with the required options. Log checking is the same as in `train_network.py`. + +## 5. Using the Trained Model / 学習済みモデルの利用 + +After training, a LoRA model file is saved in `output_dir` and can be used in inference environments supporting FLUX.1 (e.g. ComfyUI + Flux nodes). + +## 6. Others / その他 + +Additional notes on VRAM optimization, training options, multi-resolution datasets, block selection and text encoder LoRA are provided in the Japanese section. + +
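For reference, the dataset definition file used in sections 3 and 4 above (`my_flux_dataset_config.toml`) could be a minimal sketch along the following lines. The image directory, resolution and caption extension are placeholders to adapt to your own data; see the dataset configuration guide for the full set of available keys.

```toml
# Minimal illustrative dataset config (values are placeholders)
[general]
enable_bucket = true            # use aspect ratio bucketing
caption_extension = ".txt"      # caption files stored next to the images

[[datasets]]
resolution = [1024, 1024]       # training resolution
batch_size = 1

  [[datasets.subsets]]
  image_dir = "path/to/train/images"   # directory containing images and captions
  num_repeats = 1
```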
+日本語 + + + # `flux_train_network.py` を用いたFLUX.1モデルのLoRA学習ガイド このドキュメントでは、`sd-scripts`リポジトリに含まれる`flux_train_network.py`を使用して、FLUX.1モデルに対するLoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 @@ -312,4 +394,4 @@ resolution = [512, 512] num_repeats = 1 ``` -各解像度セクションの`[[datasets.subsets]]`部分は、データセットディレクトリを定義します。各解像度に対して同じディレクトリを指定してください。 \ No newline at end of file +各解像度セクションの`[[datasets.subsets]]`部分は、データセットディレクトリを定義します。各解像度に対して同じディレクトリを指定してください。
diff --git a/docs/sd3_train_network.md b/docs/sd3_train_network.md index 2911fdf2c..e10829aae 100644 --- a/docs/sd3_train_network.md +++ b/docs/sd3_train_network.md @@ -1,10 +1,24 @@ -ステータス:内容を一通り確認した +Status: reviewed + +# LoRA Training Guide for Stable Diffusion 3/3.5 using `sd3_train_network.py` / `sd3_train_network.py` を用いたStable Diffusion 3/3.5モデルのLoRA学習ガイド + +This document explains how to train LoRA (Low-Rank Adaptation) models for Stable Diffusion 3 (SD3) and Stable Diffusion 3.5 (SD3.5) using `sd3_train_network.py` in the `sd-scripts` repository. + +## 1. Introduction / はじめに + +`sd3_train_network.py` trains additional networks such as LoRA for SD3/3.5 models. SD3 adopts a new architecture called MMDiT (Multi-Modal Diffusion Transformer), so its structure differs from previous Stable Diffusion models. With this script you can create LoRA models specialized for SD3/3.5. -# `sd3_train_network.py` を用いたStable Diffusion 3/3.5モデルのLoRA学習ガイド +This guide assumes you already understand the basics of LoRA training. For common usage and options, see the [train_network.py guide](train_network.md). Some parameters are the same as those in [`sdxl_train_network.py`](sdxl_train_network.md). -このドキュメントでは、`sd-scripts`リポジトリに含まれる`sd3_train_network.py`を使用して、Stable Diffusion 3 (SD3) および Stable Diffusion 3.5 (SD3.5) モデルに対するLoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 +**Prerequisites:** -## 1. はじめに +* The `sd-scripts` repository has been cloned and the Python environment is ready. +* A training dataset has been prepared. See the [Dataset Configuration Guide](link/to/dataset/config/doc). +* SD3/3.5 model files for training are available. + +
+日本語 +ステータス:内容を一通り確認した `sd3_train_network.py`は、Stable Diffusion 3/3.5モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。SD3は、MMDiT (Multi-Modal Diffusion Transformer) と呼ばれる新しいアーキテクチャを採用しており、従来のStable Diffusionモデルとは構造が異なります。このスクリプトを使用することで、SD3/3.5モデルに特化したLoRAモデルを作成できます。 @@ -15,9 +29,20 @@ * `sd-scripts`リポジトリのクローンとPython環境のセットアップが完了していること。 * 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](link/to/dataset/config/doc)を参照してください) * 学習対象のSD3/3.5モデルファイルが準備できていること。 +
+ +## 2. Differences from `train_network.py` / `train_network.py` との違い + +`sd3_train_network.py` is based on `train_network.py` but modified for SD3/3.5. Main differences are: -## 2. `train_network.py` との違い +* **Target models:** Stable Diffusion 3 and 3.5 Medium/Large. +* **Model structure:** Uses MMDiT (Transformer based) instead of U-Net and employs three text encoders: CLIP-L, CLIP-G and T5-XXL. The VAE is not compatible with SDXL. +* **Arguments:** Options exist to specify the SD3/3.5 model, text encoders and VAE. With a single `.safetensors` file, these paths are detected automatically, so separate paths are optional. +* **Incompatible arguments:** Stable Diffusion v1/v2 options such as `--v2`, `--v_parameterization` and `--clip_skip` are not used. +* **SD3 specific options:** Additional parameters for attention masks, dropout rates, positional embedding adjustments (for SD3.5), timestep sampling and loss weighting. +
+日本語 `sd3_train_network.py`は`train_network.py`をベースに、SD3/3.5モデルに対応するための変更が加えられています。主な違いは以下の通りです。 * **対象モデル:** Stable Diffusion 3, 3.5 Medium / Large モデルを対象とします。 @@ -25,9 +50,18 @@ * **引数:** SD3/3.5モデル、Text Encoder群、VAEを指定する引数があります。ただし、単一ファイルの`.safetensors`形式であれば、内部で自動的に分離されるため、個別のパス指定は必須ではありません。 * **一部引数の非互換性:** Stable Diffusion v1/v2向けの引数(例: `--v2`, `--v_parameterization`, `--clip_skip`)はSD3/3.5の学習では使用されません。 * **SD3特有の引数:** Text Encoderのアテンションマスクやドロップアウト率、Positional Embeddingの調整(SD3.5向け)、タイムステップのサンプリングや損失の重み付けに関する引数が追加されています。 +
-## 3. 準備 +## 3. Preparation / 準備 +The following files are required before starting training: + +1. **Training script:** `sd3_train_network.py` +2. **SD3/3.5 model file:** `.safetensors` file for the base model and paths to each text encoder. Single-file format can also be used. +3. **Dataset definition file (.toml):** Dataset settings in TOML format. (See the [Dataset Configuration Guide](link/to/dataset/config/doc).) In this document we use `my_sd3_dataset_config.toml` as an example. + +
+日本語 学習を開始する前に、以下のファイルが必要です。 1. **学習スクリプト:** `sd3_train_network.py` @@ -35,43 +69,107 @@ * 単一ファイル形式も使用可能です。 3. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。 * 例として`my_sd3_dataset_config.toml`を使用します。 +
+ +## 4. Running the Training / 学習の実行 + +Execute `sd3_train_network.py` from the terminal to start training. The overall command-line format is the same as `train_network.py`, but SD3/3.5 specific options must be supplied. + +Example command: + +```bash +accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py \ + --pretrained_model_name_or_path="" \ + --clip_l="" \ + --clip_g="" \ + --t5xxl="" \ + --dataset_config="my_sd3_dataset_config.toml" \ + --output_dir="" \ + --output_name="my_sd3_lora" \ + --save_model_as=safetensors \ + --network_module=networks.lora \ + --network_dim=16 \ + --network_alpha=1 \ + --learning_rate=1e-4 \ + --optimizer_type="AdamW8bit" \ + --lr_scheduler="constant" \ + --sdpa \ + --max_train_epochs=10 \ + --save_every_n_epochs=1 \ + --mixed_precision="fp16" \ + --gradient_checkpointing \ + --weighting_scheme="sigma_sqrt" \ + --blocks_to_swap=32 +``` -## 4. 学習の実行 +*(Write the command on one line or use `\` or `^` for line breaks.)* +
+日本語 学習は、ターミナルから`sd3_train_network.py`を実行することで開始します。基本的なコマンドラインの構造は`train_network.py`と同様ですが、SD3/3.5特有の引数を指定する必要があります。 以下に、基本的なコマンドライン実行例を示します。 ```bash -accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py - --pretrained_model_name_or_path="" +accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py + --pretrained_model_name_or_path="" --clip_l="" - --clip_g="" - --t5xxl="" - --dataset_config="my_sd3_dataset_config.toml" - --output_dir="" - --output_name="my_sd3_lora" - --save_model_as=safetensors - --network_module=networks.lora - --network_dim=16 - --network_alpha=1 - --learning_rate=1e-4 - --optimizer_type="AdamW8bit" - --lr_scheduler="constant" - --sdpa - --max_train_epochs=10 - --save_every_n_epochs=1 - --mixed_precision="fp16" - --gradient_checkpointing - --weighting_scheme="sigma_sqrt" + --clip_g="" + --t5xxl="" + --dataset_config="my_sd3_dataset_config.toml" + --output_dir="" + --output_name="my_sd3_lora" + --save_model_as=safetensors + --network_module=networks.lora + --network_dim=16 + --network_alpha=1 + --learning_rate=1e-4 + --optimizer_type="AdamW8bit" + --lr_scheduler="constant" + --sdpa + --max_train_epochs=10 + --save_every_n_epochs=1 + --mixed_precision="fp16" + --gradient_checkpointing + --weighting_scheme="sigma_sqrt" --blocks_to_swap=32 ``` ※実際には1行で書くか、適切な改行文字(`\` または `^`)を使用してください。 +
-### 4.1. 主要なコマンドライン引数の解説(`train_network.py`からの追加・変更点) +### 4.1. Explanation of Key Options / 主要なコマンドライン引数の解説 -[`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のSD3/3.5特有の引数を指定します。共通の引数(`--output_dir`, `--output_name`, `--network_module`, `--network_dim`, `--network_alpha`, `--learning_rate`など)については、上記ガイドを参照してください。 +Besides the arguments explained in the [train_network.py guide](train_network.md), specify the following SD3/3.5 options. For shared options (`--output_dir`, `--output_name`, etc.), see that guide. + +#### Model Options / モデル関連 + +* `--pretrained_model_name_or_path=""` **required** – Path to the SD3/3.5 model. +* `--clip_l`, `--clip_g`, `--t5xxl`, `--vae` – Skip these if the base model is a single file; otherwise specify each `.safetensors` path. `--vae` is usually unnecessary unless you use a different VAE. + +#### SD3/3.5 Training Parameters / SD3/3.5 学習パラメータ + +* `--t5xxl_max_token_length=` – Max token length for T5-XXL. Default `256`. +* `--apply_lg_attn_mask` – Apply an attention mask to CLIP-L/CLIP-G outputs. +* `--apply_t5_attn_mask` – Apply an attention mask to T5-XXL outputs. +* `--clip_l_dropout_rate`, `--clip_g_dropout_rate`, `--t5_dropout_rate` – Dropout rates for the text encoders. Default `0.0`. +* `--pos_emb_random_crop_rate=` **[SD3.5]** – Probability of randomly cropping the positional embedding. +* `--enable_scaled_pos_embed` **[SD3.5][experimental]** – Scale positional embeddings when training with multiple resolutions. +* `--training_shift=` – Shift applied to the timestep distribution. Default `1.0`. +* `--weighting_scheme=` – Weighting method for loss by timestep. Default `uniform`. +* `--logit_mean`, `--logit_std`, `--mode_scale` – Parameters for `logit_normal` or `mode` weighting. + +#### Memory and Speed / メモリ・速度関連 + +* `--blocks_to_swap=` **[experimental]** – Swap a number of Transformer blocks between CPU and GPU. More blocks reduce VRAM but slow training. Cannot be used with `--cpu_offload_checkpointing`. + +#### Incompatible or Deprecated Options / 非互換・非推奨の引数 + +* `--v2`, `--v_parameterization`, `--clip_skip` – Options for Stable Diffusion v1/v2 that are not used for SD3/3.5. + +
+日本語 +[`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のSD3/3.5特有の引数を指定します。共通の引数については、上記ガイドを参照してください。 #### モデル関連 @@ -83,43 +181,42 @@ accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py #### SD3/3.5 学習パラメータ -* `--t5xxl_max_token_length=` - * T5-XXL Text Encoderで使用するトークンの最大長を指定します。SD3のデフォルトは`256`です。データセットのキャプション長に合わせて調整が必要な場合があります。 -* `--apply_lg_attn_mask` - * CLIP-LおよびCLIP-Gの出力に対して、パディングトークンに対応するアテンションマスク(ゼロ埋め)を適用します。 -* `--apply_t5_attn_mask` - * T5-XXLの出力に対して、パディングトークンに対応するアテンションマスク(ゼロ埋め)を適用します。 -* `--clip_l_dropout_rate`, `--clip_g_dropout_rate`, `--t5_dropout_rate`: - * 各Text Encoderの出力に対して、指定した確率でドロップアウト(出力をゼロにする)を適用します。過学習の抑制に役立つ場合があります。デフォルトは`0.0`(ドロップアウトなし)です。 -* `--pos_emb_random_crop_rate=` **[SD3.5向け]** - * MMDiTのPositional Embeddingに対してランダムクロップを適用する確率を指定します。[SD3.5M model card](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) で説明されています。デフォルトは`0.0`です。 -* `--enable_scaled_pos_embed` **[SD3.5向け]** **[実験的機能]** - * マルチ解像度学習時に、解像度に応じてPositional Embeddingをスケーリングします。デフォルトは`False`です。通常は指定不要です。 -* `--training_shift=` - * 学習時のタイムステップ(ノイズレベル)の分布を調整するためのシフト値です。`weighting_scheme`に加えて適用されます。`1.0`より大きい値はノイズの大きい(構造寄り)領域を、小さい値はノイズの小さい(詳細寄り)領域を重視する傾向になります。デフォルトは`1.0`です。通常はデフォルト値で問題ありません。 -* `--weighting_scheme=` - * 損失計算時のタイムステップ(ノイズレベル)に応じた重み付け方法を指定します。`sigma_sqrt`, `logit_normal`, `mode`, `cosmap`, `uniform` (または`none`) から選択します。SD3の論文では`sigma_sqrt`が使用されています。デフォルトは`uniform`です。通常はデフォルト値で問題ありません。 -* `--logit_mean`, `--logit_std`, `--mode_scale`: - * `weighting_scheme`で`logit_normal`または`mode`を選択した場合に、その分布を制御するためのパラメータです。通常はデフォルト値で問題ありません。 +* `--t5xxl_max_token_length=` – T5-XXLで使用するトークンの最大長を指定します。デフォルトは`256`です。 +* `--apply_lg_attn_mask` – CLIP-L/CLIP-Gの出力にパディング用のマスクを適用します。 +* `--apply_t5_attn_mask` – T5-XXLの出力にパディング用のマスクを適用します。 +* `--clip_l_dropout_rate`, `--clip_g_dropout_rate`, `--t5_dropout_rate` – 各Text Encoderのドロップアウト率を指定します。デフォルトは`0.0`です。 +* `--pos_emb_random_crop_rate=` **[SD3.5向け]** – Positional Embeddingにランダムクロップを適用する確率を指定します。 +* `--enable_scaled_pos_embed` **[SD3.5向け][実験的機能]** – マルチ解像度学習時に解像度に応じてPositional Embeddingをスケーリングします。 +* `--training_shift=` – タイムステップ分布を調整するためのシフト値です。デフォルトは`1.0`です。 +* `--weighting_scheme=` – タイムステップに応じた損失の重み付け方法を指定します。デフォルトは`uniform`です。 +* `--logit_mean`, `--logit_std`, `--mode_scale` – `logit_normal`または`mode`使用時のパラメータです。 #### メモリ・速度関連 -* `--blocks_to_swap=` **[実験的機能]** - * VRAM使用量を削減するために、モデルの一部(MMDiTのTransformerブロック)をCPUとGPU間でスワップする設定です。スワップするブロック数を整数で指定します(例: `32`)。値を大きくするとVRAM使用量は減りますが、学習速度は低下します。GPUのVRAM容量に応じて調整してください。`gradient_checkpointing`と併用可能です。 - * `--cpu_offload_checkpointing`とは併用できません。 +* `--blocks_to_swap=` **[実験的機能]** – TransformerブロックをCPUとGPUでスワップしてVRAMを節約します。`--cpu_offload_checkpointing`とは併用できません。 #### 非互換・非推奨の引数 -* `--v2`, `--v_parameterization`, `--clip_skip`: Stable Diffusion v1/v2特有の引数のため、SD3/3.5学習では使用されません。 +* `--v2`, `--v_parameterization`, `--clip_skip` – Stable Diffusion v1/v2向けの引数のため、SD3/3.5学習では使用されません。 +
-### 4.2. 学習の開始 +### 4.2. Starting Training / 学習の開始 -必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。 +After setting the required arguments, run the command to begin training. The overall flow and how to check logs are the same as in the [train_network.py guide](train_network.md#32-starting-the-training--学習の開始). -## 5. 学習済みモデルの利用 +## 5. Using the Trained Model / 学習済みモデルの利用 -学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_sd3_lora.safetensors`)が保存されます。このファイルは、SD3/3.5モデルに対応した推論環境(例: ComfyUIなど)で使用できます。 +When training finishes, a LoRA model file (e.g. `my_sd3_lora.safetensors`) is saved in the directory specified by `output_dir`. Use this file with inference environments that support SD3/3.5, such as ComfyUI. + +## 6. Others / その他 -## 6. その他 +`sd3_train_network.py` shares many features with `train_network.py`, such as sample image generation (`--sample_prompts`, etc.) and detailed optimizer settings. For these, see the [train_network.py guide](train_network.md#5-other-features--その他の機能) or run `python sd3_train_network.py --help`. + +
+日本語 +必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。 + +学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_sd3_lora.safetensors`)が保存されます。このファイルは、SD3/3.5モデルに対応した推論環境(例: ComfyUIなど)で使用できます。 `sd3_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python sd3_train_network.py --help`) を参照してください。 +
diff --git a/docs/sdxl_train_network_advanced.md b/docs/sdxl_train_network_advanced.md index a736f0b36..0d4747b6b 100644 --- a/docs/sdxl_train_network_advanced.md +++ b/docs/sdxl_train_network_advanced.md @@ -1,4 +1,86 @@ -ステータス:確認中 +Status: under review + +# Advanced Settings: Detailed Guide for SDXL LoRA Training Script `sdxl_train_network.py` / 高度な設定: SDXL LoRA学習スクリプト `sdxl_train_network.py` 詳細ガイド + +This document describes the advanced options available when training LoRA models for SDXL (Stable Diffusion XL) with `sdxl_train_network.py` in the `sd-scripts` repository. For the basics, please read [How to Use the LoRA Training Script `train_network.py`](train_network.md) and [How to Use the SDXL LoRA Training Script `sdxl_train_network.py`](sdxl_train_network.md). + +This guide targets experienced users who want to fine tune settings in detail. + +**Prerequisites:** + +* You have cloned the `sd-scripts` repository and prepared a Python environment. +* A training dataset and its `.toml` configuration are ready (see the dataset configuration guide). +* You are familiar with running basic LoRA training commands. + +## 1. Command Line Options / コマンドライン引数 詳細解説 + +`sdxl_train_network.py` inherits the functionality of `train_network.py` and adds SDXL-specific features. Major options are grouped and explained below. For common arguments, see the other guides mentioned above. + +### 1.1. Model Loading + +* `--pretrained_model_name_or_path=""` (required): specify the base SDXL model. Supports a Hugging Face model ID, a local Diffusers directory or a `.safetensors` file. +* `--vae=""`: optionally use a different VAE. +* `--no_half_vae`: keep the VAE in float32 even with fp16/bf16 training. +* `--fp8_base` / `--fp8_base_unet`: **experimental** load the base model or just the U-Net in FP8 to reduce VRAM (requires PyTorch 2.1+). + +### 1.2. Dataset Settings + +* `--dataset_config=""`: specify a `.toml` dataset config. High resolution data and aspect ratio buckets are common for SDXL. Bucket resolution steps must be multiples of 32. + +### 1.3. Output and Saving + +Options match `train_network.py`: + +* `--output_dir`, `--output_name` (both required) +* `--save_model_as` (recommended `safetensors`) +* `--save_precision`, `--save_every_n_epochs`, `--save_every_n_steps` +* `--save_last_n_epochs`, `--save_last_n_steps` +* `--save_state`, `--save_state_on_train_end`, `--save_last_n_epochs_state`, `--save_last_n_steps_state` +* `--no_metadata` +* `--save_state_to_huggingface` and related options + +### 1.4. Network Parameters (LoRA) + +* `--network_module=networks.lora` and `--network_dim` (required) +* `--network_alpha`, `--network_dropout` +* `--network_args` allows advanced settings such as block-wise dims/alphas and LoRA+ options +* `--network_train_unet_only` / `--network_train_text_encoder_only` +* `--network_weights` and `--dim_from_weights` + +### 1.5. Training Parameters + +Includes options for learning rate, optimizer, scheduler, mixed precision, gradient accumulation, gradient checkpointing, fused backward pass, resume, and more. See `--help` for details. + +### 1.6. Caching + +Options to cache latents or text encoder outputs in memory or on disk to speed up training. + +### 1.7. Sample Image Generation + +Options to generate sample images periodically during training. + +### 1.8. Logging & Tracking + +TensorBoard and wandb logging related options. + +### 1.9. 
Regularization and Advanced Techniques + +Various options such as noise offset, multires noise, input perturbation, min-SNR weighting, loss type selection, and masked loss. + +### 1.10. Distributed Training and Others + +General options like random seed, max token length, clip skip, lowram/highvram, data loader workers, config files, and Accelerate/DeepSpeed settings. + +## 2. Other Tips / その他のTips + +Hints on reducing VRAM usage, appropriate learning rates, training time considerations and troubleshooting. + +## 3. Conclusion / おわりに + +`sdxl_train_network.py` offers many options to customize SDXL LoRA training. Refer to `--help`, other documents and the source code for further details. + +
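As a reference point, the options discussed above might be combined into a command like the sketch below. Model and dataset paths, output names, ranks and learning rates are placeholders to adjust for your data and hardware; this is an illustration, not a recommended recipe.

```bash
accelerate launch --num_cpu_threads_per_process 1 sdxl_train_network.py \
  --pretrained_model_name_or_path="path/to/sdxl_base.safetensors" \
  --dataset_config="my_sdxl_dataset_config.toml" \
  --output_dir="output" --output_name="my_sdxl_lora" \
  --save_model_as=safetensors \
  --network_module=networks.lora --network_dim=32 --network_alpha=16 \
  --learning_rate=1e-4 --text_encoder_lr1=2e-5 --text_encoder_lr2=2e-5 \
  --optimizer_type="AdamW8bit" --lr_scheduler="constant" \
  --max_train_epochs=10 --save_every_n_epochs=1 \
  --mixed_precision="bf16" --gradient_checkpointing \
  --cache_latents --sdpa
```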
+日本語 --- @@ -257,4 +339,7 @@ SDXLは計算コストが高いため、キャッシュ機能が効果的です 不明な点や詳細については、各スクリプトの `--help` オプションや、リポジトリ内の他のドキュメント、実装コード自体を参照してください。 ---- \ No newline at end of file +--- + + +
From 08aed008eb947ca33a53007758ca619aec82fed3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 17 May 2025 14:42:19 +0900 Subject: [PATCH 457/582] doc: update FLUX.1 for newer features from README.md --- docs/flux_train_network.md | 112 +++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 4d7c2d46f..2b7ff7499 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -252,6 +252,15 @@ FLUX.1モデルは比較的大きなモデルであるため、十分なVRAMを - **T5XXLのfp8形式の使用**: 10GB未満のVRAMを持つGPUでは、T5XXLのfp8形式チェックポイントの使用を推奨します。[comfyanonymous/flux_text_encoders](https://huggingface.co/comfyanonymous/flux_text_encoders)から`t5xxl_fp8_e4m3fn.safetensors`をダウンロードできます(`scaled`なしで使用してください)。 +- **FP8/FP16 混合学習 [実験的機能]**: + `--fp8_base_unet` オプションを指定すると、FLUX.1モデル本体をFP8形式で学習し、Text Encoder (CLIP-L/T5XXL) をBF16/FP16形式で学習できます。これにより、さらにVRAM使用量を削減できる可能性があります。このオプションを指定すると、`--fp8_base` オプションも自動的に有効になります。 + +- **`pytorch-optimizer` の利用**: + `pytorch-optimizer` ライブラリに含まれる様々なオプティマイザを使用できます。`requirements.txt` に追加されているため、別途インストールは不要です。 + 例えば、CAME オプティマイザを使用する場合は以下のように指定します。 + ```bash + --optimizer_type "pytorch_optimizer.CAME" --optimizer_args "weight_decay=0.01" + ## 2. FLUX.1 LoRA学習の重要な設定オプション FLUX.1の学習には多くの未知の点があり、いくつかの設定は引数で指定できます。以下に重要な引数とその説明を示します。 @@ -266,6 +275,27 @@ FLUX.1の学習には多くの未知の点があり、いくつかの設定は - `shift`:正規分布乱数のシグモイド値をシフト - `flux_shift`:解像度に応じて正規分布乱数のシグモイド値をシフト(FLUX.1 dev推論と同様)。この設定では`--discrete_flow_shift`は無視されます。 + +#### タイムステップ分布の可視化 + +`--timestep_sampling`, `--sigmoid_scale`, `--discrete_flow_shift` の組み合わせによって、学習中にサンプリングされるタイムステップの分布が変化します。以下にいくつかの例を示します。 + +* `--timestep_sampling shift` と `--discrete_flow_shift` の効果 (`--sigmoid_scale` はデフォルトの1.0): + ![Figure_2](https://github.com/user-attachments/assets/d9de42f9-f17d-40da-b88d-d964402569c6) + +* `--timestep_sampling sigmoid` と `--timestep_sampling uniform` の比較 (`--discrete_flow_shift` は無視される): + ![Figure_3](https://github.com/user-attachments/assets/27029009-1f5d-4dc0-bb24-13d02ac4fdad) + +* `--timestep_sampling sigmoid` と `--sigmoid_scale` の効果 (`--discrete_flow_shift` は無視される): + ![Figure_4](https://github.com/user-attachments/assets/08a2267c-e47e-48b7-826e-f9a080787cdc) + +#### AI Toolkit 設定との比較 + +[Ostris氏のAI Toolkit](https://github.com/ostris/ai-toolkit) で使用されている設定は、概ね以下のオプションに相当すると考えられます。 +``` +--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0 +``` + ### 2.2 モデル予測の処理方法 `--model_prediction_type`オプションで、モデルの予測をどのように解釈し処理するかを指定できます: @@ -283,6 +313,37 @@ FLUX.1の学習には多くの未知の点があり、いくつかの設定は ガイダンススケールについて:FLUX.1 dev版は特定のガイダンススケール値で蒸留されていますが、学習時には`--guidance_scale 1.0`を指定してガイダンススケールを無効化することを推奨します。 + +### 2.4 T5 Attention Mask の適用 + +`--apply_t5_attn_mask` オプションを指定すると、T5XXL Text Encoder の学習および推論時に Attention Mask が適用されます。 + +Attention Maskに対応した推論環境が限られるため、このオプションは推奨されません。 + +### 2.5 IP ノイズガンマ + +`--ip_noise_gamma` および `--ip_noise_gamma_random_strength` オプションを使用することで、学習時に Input Perturbation ノイズのガンマ値を調整できます。詳細は Stable Diffusion 3 の学習オプションを参照してください。 + +### 2.6 LoRA-GGPO サポート + +LoRA-GGPO (Gradient Group Proportion Optimizer) を使用できます。これは LoRA の学習を安定化させるための手法です。以下の `network_args` を指定して有効化します。ハイパーパラメータ (`ggpo_sigma`, `ggpo_beta`) は調整が必要です。 + +```bash +--network_args "ggpo_sigma=0.03" "ggpo_beta=0.01" +``` +TOMLファイルで指定する場合: +```toml +network_args = ["ggpo_sigma=0.03", "ggpo_beta=0.01"] +``` + +### 2.7 Q/K/V 射影層の分割 [実験的機能] + +`--network_args "split_qkv=True"` を指定することで、Attention層内の Q/K/V (および SingleStreamBlock の Text) 射影層を個別に分割し、それぞれに LoRA を適用できます。 + +**技術的詳細:** 
+FLUX.1 の元々の実装では、Q/K/V (および Text) の射影層は一つに結合されています。ここに LoRA を適用すると、一つの大きな LoRA モジュールが適用されます。一方、Diffusers の実装ではこれらの射影層は分離されており、それぞれに小さな LoRA モジュールが適用されます。このオプションは後者の挙動を模倣します。 +保存される LoRA モデルの互換性は維持されますが、内部的には分割された LoRA の重みを結合して保存するため、ゼロ要素が多くなりモデルサイズが大きくなる可能性があります。`convert_flux_lora.py` スクリプトを使用して Diffusers (AI-Toolkit) 形式に変換すると、サイズが削減されます。 + ## 3. 各層に対するランク指定 FLUX.1の各層に対して異なるランク(network_dim)を指定できます。これにより、特定の層に対してLoRAの効果を強調したり、無効化したりできます。 @@ -395,3 +456,54 @@ resolution = [512, 512] ``` 各解像度セクションの`[[datasets.subsets]]`部分は、データセットディレクトリを定義します。各解像度に対して同じディレクトリを指定してください。 + +## 7. 検証 (Validation) + +学習中に検証データセットを使用して損失 (Validation Loss) を計算し、モデルの汎化性能を評価できます。 + +検証を設定するには、データセット設定 TOML ファイルに `[validation]` セクションを追加します。設定方法は学習データセットと同様ですが、`num_repeats` は通常 1 に設定します。 + +```toml +# ... (学習データセットの設定) ... + +[validation] +batch_size = 1 +enable_bucket = true +resolution = [1024, 1024] # 検証に使用する解像度 + + [[validation.subsets]] + image_dir = "検証用画像ディレクトリへのパス" + num_repeats = 1 + caption_extension = ".txt" + # ... 他の検証データセット固有の設定 ... +``` + +**注意点:** + +* 検証損失の計算は、固定されたタイムステップサンプリングと乱数シードで行われます。これにより、ランダム性による損失の変動を抑え、より安定した評価が可能になります。 +* 現在のところ、`--blocks_to_swap` オプションを使用している場合、または Schedule-Free オプティマイザ (`AdamWScheduleFree`, `RAdamScheduleFree`, `ProdigyScheduleFree`) を使用している場合は、検証損失はサポートされていません。 + +## 8. データセット関連の追加オプション + +### 8.1 リサイズ時の補間方法指定 + +データセットの画像を学習解像度にリサイズする際の補間方法を指定できます。データセット設定 TOML ファイルの `[[datasets]]` セクションまたは `[general]` セクションで `interpolation_type` を指定します。 + +利用可能な値: `bicubic` (デフォルト), `bilinear`, `lanczos`, `nearest`, `area` + +```toml +[[datasets]] +resolution = [1024, 1024] +enable_bucket = true +interpolation_type = "lanczos" # 例: Lanczos補間を使用 +# ... +``` + +## 9. 関連ツール + +`flux_train_network.py` で学習したモデルや、学習プロセスに役立つ関連スクリプトが提供されています。 + +* **`networks/flux_extract_lora.py`**: 学習済みモデルとベースモデルの差分から LoRA モデルを抽出します。 +* **`convert_flux_lora.py`**: 学習した LoRA モデルを Diffusers (AI-Toolkit) 形式など、他の形式に変換します。Q/K/V分割オプションで学習した場合、このスクリプトで変換するとモデルサイズを削減できます。 +* **`networks/flux_merge_lora.py`**: 学習した LoRA モデルを FLUX.1 ベースモデルにマージします。 +* **`flux_minimal_inference.py`**: 学習した LoRA モデルを適用して画像を生成するためのシンプルな推論スクリプトです。 From e7e371c9ce0d764d8761f0cbc8aa9a0ebd180d72 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 17 May 2025 15:06:00 +0900 Subject: [PATCH 458/582] doc: update English translation for advanced SDXL LoRA training --- docs/sdxl_train_network_advanced.md | 129 +++++++++++++++++++++++----- 1 file changed, 106 insertions(+), 23 deletions(-) diff --git a/docs/sdxl_train_network_advanced.md b/docs/sdxl_train_network_advanced.md index 0d4747b6b..39844c98b 100644 --- a/docs/sdxl_train_network_advanced.md +++ b/docs/sdxl_train_network_advanced.md @@ -18,62 +18,145 @@ This guide targets experienced users who want to fine tune settings in detail. ### 1.1. Model Loading -* `--pretrained_model_name_or_path=""` (required): specify the base SDXL model. Supports a Hugging Face model ID, a local Diffusers directory or a `.safetensors` file. -* `--vae=""`: optionally use a different VAE. -* `--no_half_vae`: keep the VAE in float32 even with fp16/bf16 training. -* `--fp8_base` / `--fp8_base_unet`: **experimental** load the base model or just the U-Net in FP8 to reduce VRAM (requires PyTorch 2.1+). +* `--pretrained_model_name_or_path=\"\"` **[Required]**: specify the base SDXL model. Supports a Hugging Face model ID, a local Diffusers directory or a `.safetensors` file. +* `--vae=\"\"`: optionally use a different VAE. Specify when using a VAE other than the one included in the SDXL model. 
Can specify `.ckpt` or `.safetensors` files. +* `--no_half_vae`: keep the VAE in float32 even with fp16/bf16 training. The VAE for SDXL can become unstable with `float16`, so it is recommended to enable this when `fp16` is specified. Usually unnecessary for `bf16`. +* `--fp8_base` / `--fp8_base_unet`: **Experimental**: load the base model (U-Net, Text Encoder) or just the U-Net in FP8 to reduce VRAM (requires PyTorch 2.1+). For details, refer to the relevant section in TODO add document later (this is an SD3 explanation but also applies to SDXL). ### 1.2. Dataset Settings -* `--dataset_config=""`: specify a `.toml` dataset config. High resolution data and aspect ratio buckets are common for SDXL. Bucket resolution steps must be multiples of 32. +* `--dataset_config=\"\"`: specify a `.toml` dataset config. High resolution data and aspect ratio buckets (specify `enable_bucket = true` in `.toml`) are common for SDXL. The resolution steps for aspect ratio buckets (`bucket_reso_steps`) must be multiples of 32 for SDXL. For details on writing `.toml` files, refer to the [Dataset Configuration Guide](link/to/dataset/config/doc). ### 1.3. Output and Saving Options match `train_network.py`: * `--output_dir`, `--output_name` (both required) -* `--save_model_as` (recommended `safetensors`) -* `--save_precision`, `--save_every_n_epochs`, `--save_every_n_steps` -* `--save_last_n_epochs`, `--save_last_n_steps` -* `--save_state`, `--save_state_on_train_end`, `--save_last_n_epochs_state`, `--save_last_n_steps_state` -* `--no_metadata` -* `--save_state_to_huggingface` and related options +* `--save_model_as` (recommended `safetensors`), `ckpt`, `pt`, `diffusers`, `diffusers_safetensors` +* `--save_precision=\"fp16\"`, `\"bf16\"`, `\"float\"`: Specifies the precision for saving the model. If not specified, the model is saved with the training precision (`fp16`, `bf16`, etc.). +* `--save_every_n_epochs=N`, `--save_every_n_steps=N`: Saves the model every N epochs/steps. +* `--save_last_n_epochs=M`, `--save_last_n_steps=M`: When saving at every epoch/step, only the latest M files are kept, and older ones are deleted. +* `--save_state`, `--save_state_on_train_end`: Saves the training state (`state`), including Optimizer status, etc., when saving the model or at the end of training. Required for resuming training with the `--resume` option. +* `--save_last_n_epochs_state=M`, `--save_last_n_steps_state=M`: Limits the number of saved `state` files to M. Overrides the `--save_last_n_epochs/steps` specification. +* `--no_metadata`: Does not save metadata to the output model. +* `--save_state_to_huggingface` and related options (e.g., `--huggingface_repo_id`): Options related to uploading models and states to Hugging Face Hub. See TODO add document for details. ### 1.4. Network Parameters (LoRA) -* `--network_module=networks.lora` and `--network_dim` (required) -* `--network_alpha`, `--network_dropout` -* `--network_args` allows advanced settings such as block-wise dims/alphas and LoRA+ options -* `--network_train_unet_only` / `--network_train_text_encoder_only` -* `--network_weights` and `--dim_from_weights` +* `--network_module=networks.lora` **[Required]** +* `--network_dim=N` **[Required]**: Specifies the rank (dimensionality) of LoRA. For SDXL, values like 32 or 64 are often tried, but adjustment is necessary depending on the dataset and purpose. +* `--network_alpha=M`: LoRA alpha value. Generally around half of `network_dim` or the same value as `network_dim`. Default is 1. 
+* `--network_dropout=P`: Dropout rate (0.0-1.0) within LoRA modules. Can be effective in suppressing overfitting. Default is None (no dropout). +* `--network_args ...`: Allows advanced settings by specifying additional arguments to the network module in `key=value` format. For LoRA, the following advanced settings are available: + * **Block-wise dimensions/alphas:** + * Allows specifying different `dim` and `alpha` for each block of the U-Net. This enables adjustments to strengthen or weaken the influence of specific layers. + * `block_dims`: Comma-separated dims for Linear and Conv2d 1x1 layers in U-Net (23 values for SDXL). + * `block_alphas`: Comma-separated alpha values corresponding to the above. + * `conv_block_dims`: Comma-separated dims for Conv2d 3x3 layers in U-Net. + * `conv_block_alphas`: Comma-separated alpha values corresponding to the above. + * Blocks not specified will use values from `--network_dim`/`--network_alpha` or `--conv_dim`/`--conv_alpha` (if they exist). + * For details, refer to [Block-wise learning rate for LoRA](train_network.md#lora-の階層別学習率) (in train_network.md, applicable to SDXL) and the implementation ([lora.py](lora.py)). + * **LoRA+:** + * `loraplus_lr_ratio=R`: Sets the learning rate of LoRA's upward weights (UP) to R times the learning rate of downward weights (DOWN). Expected to improve learning speed. Paper recommends 16. + * `loraplus_unet_lr_ratio=RU`: Specifies the LoRA+ learning rate ratio for the U-Net part individually. + * `loraplus_text_encoder_lr_ratio=RT`: Specifies the LoRA+ learning rate ratio for the Text Encoder part individually (multiplied by the learning rates specified with `--text_encoder_lr1`, `--text_encoder_lr2`). + * For details, refer to [README](../README.md#jan-17-2025--2025-01-17-version-090) and the implementation ([lora.py](lora.py)). +* `--network_train_unet_only`: Trains only the LoRA modules of the U-Net. Specify this if not training Text Encoders. Required when using `--cache_text_encoder_outputs`. +* `--network_train_text_encoder_only`: Trains only the LoRA modules of the Text Encoders. Specify this if not training the U-Net. +* `--network_weights=\"\"`: Starts training by loading pre-trained LoRA weights. Used for fine-tuning or resuming training. The difference from `--resume` is that this option only loads LoRA module weights, while `--resume` also restores Optimizer state, step count, etc. +* `--dim_from_weights`: Automatically reads the LoRA dimension (`dim`) from the weight file specified by `--network_weights`. Specification of `--network_dim` becomes unnecessary. ### 1.5. Training Parameters -Includes options for learning rate, optimizer, scheduler, mixed precision, gradient accumulation, gradient checkpointing, fused backward pass, resume, and more. See `--help` for details. +* `--learning_rate=LR`: Sets the overall learning rate. This becomes the default value for each module (`unet_lr`, `text_encoder_lr1`, `text_encoder_lr2`). Values like `1e-3` or `1e-4` are often tried. +* `--unet_lr=LR_U`: Learning rate for the LoRA module of the U-Net part. +* `--text_encoder_lr1=LR_TE1`: Learning rate for the LoRA module of Text Encoder 1 (OpenCLIP ViT-G/14). Usually, a smaller value than U-Net (e.g., `1e-5`, `2e-5`) is recommended. +* `--text_encoder_lr2=LR_TE2`: Learning rate for the LoRA module of Text Encoder 2 (CLIP ViT-L/14). Usually, a smaller value than U-Net (e.g., `1e-5`, `2e-5`) is recommended. +* `--optimizer_type=\"...\"`: Specifies the optimizer to use. 
Options include `AdamW8bit` (memory-efficient, common), `Adafactor` (even more memory-efficient, proven in SDXL full model training), `Lion`, `DAdaptation`, `Prodigy`, etc. Each optimizer may require additional arguments (see `--optimizer_args`). `AdamW8bit` or `PagedAdamW8bit` (requires `bitsandbytes`) are common. `Adafactor` is memory-efficient but slightly complex to configure (relative step (`relative_step=True`) recommended, `adafactor` learning rate scheduler recommended). `DAdaptation`, `Prodigy` have automatic learning rate adjustment but cannot be used with LoRA+. Specify a learning rate around `1.0`. For details, see the `get_optimizer` function in [train_util.py](train_util.py). +* `--optimizer_args ...`: Specifies additional arguments to the optimizer in `key=value` format (e.g., `\"weight_decay=0.01\"` `\"betas=0.9,0.999\"`). +* `--lr_scheduler=\"...\"`: Specifies the learning rate scheduler. Options include `constant` (no change), `cosine` (cosine curve), `linear` (linear decay), `constant_with_warmup` (constant with warmup), `cosine_with_restarts`, etc. `constant`, `cosine`, and `constant_with_warmup` are commonly used. Some schedulers require additional arguments (see `--lr_scheduler_args`). If using optimizers with auto LR adjustment like `DAdaptation` or `Prodigy`, a scheduler is not needed (`constant` should be specified). +* `--lr_warmup_steps=N`: Number of warmup steps for the learning rate scheduler. The learning rate gradually increases during this period at the start of training. If N < 1, it's interpreted as a fraction of total steps. +* `--lr_scheduler_num_cycles=N` / `--lr_scheduler_power=P`: Parameters for specific schedulers (`cosine_with_restarts`, `polynomial`). +* `--max_train_steps=N` / `--max_train_epochs=N`: Specifies the total number of training steps or epochs. Epoch specification takes precedence. +* `--mixed_precision=\"bf16\"` / `\"fp16\"` / `\"no\"`: Mixed precision training settings. For SDXL, using `bf16` (if GPU supports it) or `fp16` is strongly recommended. Reduces VRAM usage and improves training speed. +* `--full_fp16` / `--full_bf16`: Performs gradient calculations entirely in half-precision/bf16. Can further reduce VRAM usage but may affect training stability. Use if VRAM is critically low. +* `--gradient_accumulation_steps=N`: Accumulates gradients for N steps before updating the optimizer. Effectively increases the batch size to `train_batch_size * N`, achieving the effect of a larger batch size with less VRAM. Default is 1. +* `--max_grad_norm=N`: Gradient clipping threshold. Clips gradients if their norm exceeds N. Default is 1.0. `0` disables it. +* `--gradient_checkpointing`: Significantly reduces memory usage but slightly decreases training speed. Recommended for SDXL due to high memory consumption. +* `--fused_backward_pass`: **Experimental**: Fuses gradient calculation and optimizer steps to reduce VRAM usage. Available for SDXL. Currently only supports `Adafactor` optimizer. Cannot be used with Gradient Accumulation. +* `--resume=\"\"`: Resumes training from a saved state (saved with `--save_state`). Restores optimizer state, step count, etc. ### 1.6. Caching -Options to cache latents or text encoder outputs in memory or on disk to speed up training. +Caching is effective for SDXL due to its high computational cost. + +* `--cache_latents`: Caches VAE outputs (latents) in memory. Skips VAE computation, reducing VRAM usage and speeding up training. **Note:** Image augmentations (`color_aug`, `flip_aug`, `random_crop`, etc.) 
will be disabled. +* `--cache_latents_to_disk`: Used with `--cache_latents` to cache to disk. Particularly effective for large datasets or multiple training runs. Caches are generated on disk during the first run and loaded from there on subsequent runs. +* `--cache_text_encoder_outputs`: Caches Text Encoder outputs in memory. Skips Text Encoder computation, reducing VRAM usage and speeding up training. **Note:** Caption augmentations (`shuffle_caption`, `caption_dropout_rate`, etc.) will be disabled. **Also, when using this option, Text Encoder LoRA modules cannot be trained (requires `--network_train_unet_only`).** +* `--cache_text_encoder_outputs_to_disk`: Used with `--cache_text_encoder_outputs` to cache to disk. +* `--skip_cache_check`: Skips validation of cache file contents. File existence is checked, and if not found, caches are generated. Usually not needed unless intentionally re-caching for debugging, etc. ### 1.7. Sample Image Generation -Options to generate sample images periodically during training. +Basic options are common with `train_network.py`. + +* `--sample_every_n_steps=N` / `--sample_every_n_epochs=N`: Generates sample images every N steps/epochs. +* `--sample_at_first`: Generates sample images before training starts. +* `--sample_prompts=\"\"`: Specifies a file (`.txt`, `.toml`, `.json`) containing prompts for sample image generation. Format follows [gen_img_diffusers.py](gen_img_diffusers.py). See [documentation](gen_img_README-ja.md) for details. +* `--sample_sampler=\"...\"`: Specifies the sampler (scheduler) for sample image generation. `euler_a`, `dpm++_2m_karras`, etc., are common. See `--help` for choices. ### 1.8. Logging & Tracking -TensorBoard and wandb logging related options. +* `--logging_dir=\"\"`: Specifies the directory for TensorBoard and other logs. If not specified, logs are not output. +* `--log_with=\"tensorboard\"` / `\"wandb\"` / `\"all\"`: Specifies the logging tool to use. If using `wandb`, `pip install wandb` is required. +* `--log_prefix=\"\"`: Specifies the prefix for subdirectory names created within `logging_dir`. +* `--wandb_api_key=\"\"` / `--wandb_run_name=\"\"`: Options for Weights & Biases (wandb). +* `--log_tracker_name` / `--log_tracker_config`: Advanced tracker configuration options. Usually not needed. +* `--log_config`: Logs the training configuration used (excluding some sensitive information) at the start of training. Helps ensure reproducibility. ### 1.9. Regularization and Advanced Techniques -Various options such as noise offset, multires noise, input perturbation, min-SNR weighting, loss type selection, and masked loss. +* `--noise_offset=N`: Enables noise offset and specifies its value. Expected to improve bias in image brightness and contrast. Recommended to enable as SDXL base models are trained with this (e.g., 0.0357). Original technical explanation [here](https://www.crosslabs.org/blog/diffusion-with-offset-noise). +* `--noise_offset_random_strength`: Randomly varies noise offset strength between 0 and the specified value. +* `--adaptive_noise_scale=N`: Adjusts noise offset based on the mean absolute value of latents. Used with `--noise_offset`. +* `--multires_noise_iterations=N` / `--multires_noise_discount=D`: Enables multi-resolution noise. Adding noise of different frequency components is expected to improve detail reproduction. Specify iteration count N (around 6-10) and discount rate D (around 0.3). 
Technical explanation [here](https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2). +* `--ip_noise_gamma=G` / `--ip_noise_gamma_random_strength`: Enables Input Perturbation Noise. Adds small noise to input (latents) for regularization. Specify Gamma value (around 0.1). Strength can be randomized with `random_strength`. +* `--min_snr_gamma=N`: Applies Min-SNR Weighting Strategy. Adjusts loss weights for timesteps with high noise in early training to stabilize learning. `N=5` etc. are used. +* `--scale_v_pred_loss_like_noise_pred`: In v-prediction models, scales v-prediction loss similarly to noise prediction loss. **Not typically used for SDXL** as it's not a v-prediction model. +* `--v_pred_like_loss=N`: Adds v-prediction-like loss to noise prediction models. `N` specifies its weight. **Not typically used for SDXL**. +* `--debiased_estimation_loss`: Calculates loss using Debiased Estimation. Similar purpose to Min-SNR but a different approach. +* `--loss_type=\"l1\"` / `\"l2\"` / `\"huber\"` / `\"smooth_l1\"`: Specifies the loss function. Default is `l2` (MSE). `huber` and `smooth_l1` are robust to outliers. +* `--huber_schedule=\"constant\"` / `\"exponential\"` / `\"snr\"`: Scheduling method when using `huber` or `smooth_l1` loss. `snr` is recommended. +* `--huber_c=C` / `--huber_scale=S`: Parameters for `huber` or `smooth_l1` loss. +* `--masked_loss`: Limits loss calculation area based on a mask image. Requires specifying mask images (black and white) in `conditioning_data_dir` in dataset settings. See [About Masked Loss](masked_loss_README.md) for details. ### 1.10. Distributed Training and Others -General options like random seed, max token length, clip skip, lowram/highvram, data loader workers, config files, and Accelerate/DeepSpeed settings. +* `--seed=N`: Specifies the random seed. Set this to ensure training reproducibility. +* `--max_token_length=N` (`75`, `150`, `225`): Maximum token length processed by Text Encoders. For SDXL, typically `75` (default), `150`, or `225`. Longer lengths can handle more complex prompts but increase VRAM usage. +* `--clip_skip=N`: Uses the output from N layers skipped from the final layer of Text Encoders. **Not typically used for SDXL**. +* `--lowram` / `--highvram`: Options for memory usage optimization. `--lowram` is for environments like Colab where RAM < VRAM, `--highvram` is for environments with ample VRAM. +* `--persistent_data_loader_workers` / `--max_data_loader_n_workers=N`: Settings for DataLoader worker processes. Affects wait time between epochs and memory usage. +* `--config_file=\"\"` / `--output_config`: Options to use/output a `.toml` file instead of command line arguments. +* **Accelerate/DeepSpeed related:** (`--ddp_timeout`, `--ddp_gradient_as_bucket_view`, `--ddp_static_graph`): Detailed settings for distributed training. Accelerate settings (`accelerate config`) are usually sufficient. DeepSpeed requires separate configuration. ## 2. Other Tips / その他のTips -Hints on reducing VRAM usage, appropriate learning rates, training time considerations and troubleshooting. +* **VRAM Usage:** SDXL LoRA training requires a lot of VRAM. Even with 24GB VRAM, you might run out of memory depending on settings. 
Reduce VRAM usage with these settings: + * `--mixed_precision=\"bf16\"` or `\"fp16\"` (essential) + * `--gradient_checkpointing` (strongly recommended) + * `--cache_latents` / `--cache_text_encoder_outputs` (highly effective, with limitations) + * `--optimizer_type=\"AdamW8bit\"` or `\"Adafactor\"` + * Increase `--gradient_accumulation_steps` (reduce batch size) + * `--full_fp16` / `--full_bf16` (be mindful of stability) + * `--fp8_base` / `--fp8_base_unet` (experimental) + * `--fused_backward_pass` (Adafactor only, experimental) +* **Learning Rate:** Appropriate learning rates for SDXL LoRA depend on the dataset and `network_dim`/`alpha`. Starting around `1e-4` ~ `4e-5` (U-Net), `1e-5` ~ `2e-5` (Text Encoders) is common. +* **Training Time:** Training takes time due to high-resolution data and the size of the SDXL model. Using caching features and appropriate hardware is important. +* **Troubleshooting:** + * **NaN Loss:** Learning rate might be too high, mixed precision settings incorrect (e.g., `--no_half_vae` not specified with `fp16`), or dataset issues. + * **Out of Memory (OOM):** Try the VRAM reduction measures listed above. + * **Training not progressing:** Learning rate might be too low, optimizer/scheduler settings incorrect, or dataset issues. ## 3. Conclusion / おわりに From 2bfda1271bcdbe13e823579fc406f3eaa229573b Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 19 May 2025 20:25:42 -0400 Subject: [PATCH 459/582] Update workflows to read-all instead of write-all --- .github/workflows/tests.yml | 5 ++++- .github/workflows/typos.yml | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2eddedc7b..9e037e539 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,6 +12,9 @@ on: - dev - sd3 +# CKV2_GHA_1: "Ensure top-level permissions are not set to write-all" +permissions: read-all + jobs: build: runs-on: ${{ matrix.os }} @@ -40,7 +43,7 @@ jobs: - name: Install dependencies run: | # Pre-install torch to pin version (requirements.txt has dependencies like transformers which requires pytorch) - pip install dadaptation==3.2 torch==${{ matrix.pytorch-version }} torchvision==0.19.0 pytest==8.3.4 + pip install dadaptation==3.2 torch==${{ matrix.pytorch-version }} torchvision pytest==8.3.4 pip install -r requirements.txt - name: Test with pytest diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml index f53cda218..b9d6acc98 100644 --- a/.github/workflows/typos.yml +++ b/.github/workflows/typos.yml @@ -12,6 +12,9 @@ on: - synchronize - reopened +# CKV2_GHA_1: "Ensure top-level permissions are not set to write-all" +permissions: read-all + jobs: build: runs-on: ubuntu-latest From a376fec79caf6b352f4ae6144ce8b4bb42ccd8a8 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 24 May 2025 18:48:54 +0900 Subject: [PATCH 460/582] doc: add comprehensive README for image generation script with usage examples and options --- docs/gen_img_README.md | 560 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 560 insertions(+) create mode 100644 docs/gen_img_README.md diff --git a/docs/gen_img_README.md b/docs/gen_img_README.md new file mode 100644 index 000000000..fd4a82905 --- /dev/null +++ b/docs/gen_img_README.md @@ -0,0 +1,560 @@ + +This is an inference (image generation) script that supports SD 1.x and 2.x models, LoRA trained with this repository, ControlNet (only v1.0 has been confirmed to work), etc. It is used from the command line. 
+ +# Overview + +* Inference (image generation) script. +* Supports SD 1.x and 2.x (base/v-parameterization) models. +* Supports txt2img, img2img, and inpainting. +* Supports interactive mode, prompt reading from files, and continuous generation. +* The number of images generated per prompt line can be specified. +* The total number of repetitions can be specified. +* Supports not only `fp16` but also `bf16`. +* Supports xformers for high-speed generation. + * Although xformers are used for memory-saving generation, it is not as optimized as Automatic 1111's Web UI, so it uses about 6GB of VRAM for 512*512 image generation. +* Extension of prompts to 225 tokens. Supports negative prompts and weighting. +* Supports various samplers from Diffusers (fewer samplers than Web UI). +* Supports clip skip (uses the output of the nth layer from the end) of Text Encoder. +* Separate loading of VAE. +* Supports CLIP Guided Stable Diffusion, VGG16 Guided Stable Diffusion, Highres. fix, and upscale. + * Highres. fix is an original implementation that has not confirmed the Web UI implementation at all, so the output results may differ. +* LoRA support. Supports application rate specification, simultaneous use of multiple LoRAs, and weight merging. + * It is not possible to specify different application rates for Text Encoder and U-Net. +* Supports Attention Couple. +* Supports ControlNet v1.0. +* Supports Deep Shrink for optimizing generation at different depths. +* Supports Gradual Latent for progressive upscaling during generation. +* Supports CLIP Vision Conditioning for img2img. +* It is not possible to switch models midway, but it can be handled by creating a batch file. +* Various personally desired features have been added. + +Since not all tests are performed when adding features, it is possible that previous features may be affected and some features may not work. Please let us know if you have any problems. + +# Basic Usage + +## Image Generation in Interactive Mode + +Enter as follows: + +```batchfile +python gen_img.py --ckpt --outdir --xformers --fp16 --interactive +``` + +Specify the model (Stable Diffusion checkpoint file or Diffusers model folder) in the `--ckpt` option and the image output destination folder in the `--outdir` option. + +Specify the use of xformers with the `--xformers` option (remove it if you do not use xformers). The `--fp16` option performs inference in fp16 (single precision). For RTX 30 series GPUs, you can also perform inference in bf16 (bfloat16) with the `--bf16` option. + +The `--interactive` option specifies interactive mode. + +If you are using Stable Diffusion 2.0 (or a model with additional training from it), add the `--v2` option. If you are using a model that uses v-parameterization (`768-v-ema.ckpt` and models with additional training from it), add `--v_parameterization` as well. + +If the `--v2` specification is incorrect, an error will occur when loading the model. If the `--v_parameterization` specification is incorrect, a brown image will be displayed. + +When `Type prompt:` is displayed, enter the prompt. + +![image](https://user-images.githubusercontent.com/52813779/235343115-f3b8ac82-456d-4aab-9724-0cc73c4534aa.png) + +*If the image is not displayed and an error occurs, headless (no screen display function) OpenCV may be installed. Install normal OpenCV with `pip install opencv-python`. Alternatively, stop image display with the `--no_preview` option. + +Select the image window and press any key to close the window and enter the next prompt. 
Press Ctrl+Z and then Enter in the prompt to close the script.
+
+## Batch Generation of Images with a Single Prompt
+
+Enter as follows (actually entered on one line):
+
+```batchfile
+python gen_img.py --ckpt <model name> --outdir <image output destination> \
+    --xformers --fp16 --images_per_prompt <number of images> --prompt "<prompt>"
+```
+
+Specify the number of images to generate per prompt with the `--images_per_prompt` option. Specify the prompt with the `--prompt` option. If it contains spaces, enclose it in double quotes.
+
+You can specify the batch size with the `--batch_size` option (described later).
+
+## Batch Generation by Reading Prompts from a File
+
+Enter as follows:
+
+```batchfile
+python gen_img.py --ckpt <model name> --outdir <image output destination> \
+    --xformers --fp16 --from_file <prompt file>
+```
+
+Specify the file containing the prompts with the `--from_file` option. Write one prompt per line. You can specify the number of images to generate per line with the `--images_per_prompt` option.
+
+## Using Negative Prompts and Weighting
+
+If you write `--n` in the prompt (one of the prompt options, which are specified like `--x` inside the prompt and described later), everything after it becomes the negative prompt.
+
+Weighting with `()` and `[]`, `(xxx:1.3)`, etc., similar to AUTOMATIC1111's Web UI, is also possible (the implementation is copied from Diffusers' [Long Prompt Weighting Stable Diffusion](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#long-prompt-weighting-stable-diffusion)).
+
+Both can be used when specifying the prompt from the command line and when reading prompts from a file.
+
+![image](https://user-images.githubusercontent.com/52813779/235343128-e79cd768-ec59-46f5-8395-fce9bdc46208.png)
+
+# Main Options
+
+Specify these from the command line.
+
+## Model Specification
+
+- `--ckpt <model name>`: Specifies the model name. The `--ckpt` option is mandatory. You can specify a Stable Diffusion checkpoint file, a Diffusers model folder, or a Hugging Face model ID.
+
+- `--v2`: Specify when using Stable Diffusion 2.x series models. Not required for 1.x series.
+
+- `--v_parameterization`: Specify when using models that use v-parameterization (`768-v-ema.ckpt` and models with additional training from it, Waifu Diffusion v1.5, etc.).
+
+    If the `--v2` specification is incorrect, an error will occur when loading the model. If the `--v_parameterization` specification is incorrect, a brown image will be displayed.
+
+- `--vae`: Specifies the VAE to use. If not specified, the VAE in the model will be used.
+
+## Image Generation and Output
+
+- `--interactive`: Operates in interactive mode. Images are generated when prompts are entered.
+
+- `--prompt <prompt>`: Specifies the prompt. If it contains spaces, enclose it in double quotes.
+
+- `--from_file <file>`: Specifies the file containing the prompts. Write one prompt per line. Image size and guidance scale can be specified with prompt options (described later).
+
+- `--from_module <module>`: Loads prompts from a Python module. The module should implement a `get_prompter(args, pipe, networks)` function.
+
+- `--W <width>`: Specifies the width of the image. The default is `512`.
+
+- `--H <height>`: Specifies the height of the image. The default is `512`.
+
+- `--steps <steps>`: Specifies the number of sampling steps. The default is `50`.
+
+- `--scale <scale>`: Specifies the unconditional guidance scale. The default is `7.5`.
+
+- `--sampler <sampler name>`: Specifies the sampler. The default is `ddim`. ddim, pndm, dpmsolver, dpmsolver++, lms, euler, and euler_a provided by Diffusers can be specified (the last three can also be specified as k_lms, k_euler, k_euler_a).
+
+- `--outdir <folder>`: Specifies the output destination for images.
+
+- `--images_per_prompt <number>`: Specifies the number of images to generate per prompt. The default is `1`.
+
+- `--clip_skip <number>`: Specifies which layer from the end of CLIP to use. If omitted, the last layer is used.
+
+- `--max_embeddings_multiples <number>`: Specifies how many times the CLIP input/output length should be multiplied relative to the default (75). If not specified, it remains 75. For example, specifying 3 makes the input/output length 225.
+
+- `--negative_scale`: Specifies a separate guidance scale for the unconditional (negative) prompt. Implemented with reference to [this article by gcem156](https://note.com/gcem156/n/ne9a53e4a6f43).
+
+- `--emb_normalize_mode`: Specifies the embedding normalization mode. Options are "original" (default), "abs", and "none". This affects how prompt weights are normalized.
+
+## Adjusting Memory Usage and Generation Speed
+
+- `--batch_size <size>`: Specifies the batch size. The default is `1`. A larger batch size consumes more memory but speeds up generation.
+
+- `--vae_batch_size <size>`: Specifies the VAE batch size. The default is the same as the batch size.
+    Since the VAE consumes more memory, memory shortages may occur after denoising (after the step count reaches 100%). In such cases, reduce the VAE batch size.
+
+- `--vae_slices <number>`: Splits the image into slices for VAE processing to reduce VRAM usage. None (default) for no splitting. Values like 16 or 32 are recommended. Enabling this is slower but uses less VRAM.
+
+- `--no_half_vae`: Prevents using fp16/bf16 precision for VAE processing. Uses fp32 instead.
+
+- `--xformers`: Specify when using xformers.
+
+- `--sdpa`: Uses scaled dot-product attention in PyTorch 2 for optimization.
+
+- `--fp16`: Performs inference in fp16 (half precision). If neither `fp16` nor `bf16` is specified, inference is performed in fp32 (single precision).
+
+- `--bf16`: Performs inference in bf16 (bfloat16). Can only be specified for RTX 30 series GPUs. The `--bf16` option will cause an error on GPUs other than the RTX 30 series. `bf16` seems less likely to produce NaN (black image) inference results than `fp16`.
+
+## Using Additional Networks (LoRA, etc.)
+
+- `--network_module`: Specifies the additional network to use. For LoRA, specify `--network_module networks.lora`. To use multiple LoRAs, specify like `--network_module networks.lora networks.lora networks.lora`.
+
+- `--network_weights`: Specifies the weight file of the additional network to use. Specify like `--network_weights model.safetensors`. To use multiple LoRAs, specify like `--network_weights model1.safetensors model2.safetensors model3.safetensors`. The number of arguments should be the same as the number specified in `--network_module`.
+
+- `--network_mul`: Specifies how many times to multiply the weight of the additional network to use. The default is `1`. Specify like `--network_mul 0.8`. To use multiple LoRAs, specify like `--network_mul 0.4 0.5 0.7`. The number of arguments should be the same as the number specified in `--network_module`.
+
+- `--network_merge`: Merges the weights of the additional networks in advance, using the weights specified in `--network_mul`. Cannot be used simultaneously with `--network_pre_calc`. The prompt option `--am` and Regional LoRA can no longer be used, but generation is accelerated to roughly the same speed as when LoRA is not used.
+
+- `--network_pre_calc`: Calculates the weights of the additional network in advance for each generation.
The prompt option `--am` can be used. Generation is accelerated to the same extent as when LoRA is not used, but time is required to calculate the weights before generation, and memory usage also increases slightly. It is disabled when Regional LoRA is used. + +- `--network_regional_mask_max_color_codes`: Specifies the maximum number of color codes to use for regional masks. If not specified, masks are applied by channel. Used with Regional LoRA to control the number of regions that can be defined by colors in the mask. + +# Examples of Main Option Specifications + +The following is an example of batch generating 64 images with the same prompt and a batch size of 4. + +```batchfile +python gen_img.py --ckpt model.ckpt --outdir outputs \ + --xformers --fp16 --W 512 --H 704 --scale 12.5 --sampler k_euler_a \ + --steps 32 --batch_size 4 --images_per_prompt 64 \ + --prompt "beautiful flowers --n monochrome" +``` + +The following is an example of batch generating 10 images each for prompts written in a file, with a batch size of 4. + +```batchfile +python gen_img.py --ckpt model.ckpt --outdir outputs \ + --xformers --fp16 --W 512 --H 704 --scale 12.5 --sampler k_euler_a \ + --steps 32 --batch_size 4 --images_per_prompt 10 \ + --from_file prompts.txt +``` + +Example of using Textual Inversion (described later) and LoRA. + +```batchfile +python gen_img.py --ckpt model.safetensors \ + --scale 8 --steps 48 --outdir txt2img --xformers \ + --W 512 --H 768 --fp16 --sampler k_euler_a \ + --textual_inversion_embeddings goodembed.safetensors negprompt.pt \ + --network_module networks.lora networks.lora \ + --network_weights model1.safetensors model2.safetensors \ + --network_mul 0.4 0.8 \ + --clip_skip 2 --max_embeddings_multiples 1 \ + --batch_size 8 --images_per_prompt 1 --interactive +``` + +# Prompt Options + +In the prompt, you can specify various options from the prompt with "two hyphens + n alphabetic characters" like `--n`. It is valid whether specifying the prompt from interactive mode, command line, or file. + +Please put spaces before and after the prompt option specification `--n`. + +- `--n`: Specifies a negative prompt. + +- `--w`: Specifies the image width. Overrides the command line specification. + +- `--h`: Specifies the image height. Overrides the command line specification. + +- `--s`: Specifies the number of steps. Overrides the command line specification. + +- `--d`: Specifies the random seed for this image. If `--images_per_prompt` is specified, specify multiple seeds separated by commas, like "--d 1,2,3,4". + *For various reasons, the generated image may differ from the Web UI even with the same random seed. + +- `--l`: Specifies the guidance scale. Overrides the command line specification. + +- `--t`: Specifies the strength of img2img (described later). Overrides the command line specification. + +- `--nl`: Specifies the guidance scale for negative prompts (described later). Overrides the command line specification. + +- `--am`: Specifies the weight of the additional network. Overrides the command line specification. If using multiple additional networks, specify them separated by __commas__, like `--am 0.8,0.5,0.3`. + +- `--glt`: Specifies the timestep to start increasing the size of the latent for Gradual Latent. Overrides the command line specification. + +- `--glr`: Specifies the initial size of the latent for Gradual Latent as a ratio. Overrides the command line specification. + +- `--gls`: Specifies the ratio to increase the size of the latent for Gradual Latent. 
Overrides the command line specification. + +- `--gle`: Specifies the interval to increase the size of the latent for Gradual Latent. Overrides the command line specification. + +*Specifying these options may cause the batch to be executed with a size smaller than the batch size (because they cannot be generated collectively if these values are different). (You don't have to worry too much, but when reading prompts from a file and generating, arranging prompts with the same values for these options will improve efficiency.) + +Example: +``` +(masterpiece, best quality), 1girl, in shirt and plated skirt, standing at street under cherry blossoms, upper body, [from below], kind smile, looking at another, [goodembed] --n realistic, real life, (negprompt), (lowres:1.1), (worst quality:1.2), (low quality:1.1), bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, normal quality, jpeg artifacts, signature, watermark, username, blurry --w 960 --h 640 --s 28 --d 1 +``` + +![image](https://user-images.githubusercontent.com/52813779/235343446-25654172-fff4-4aaf-977a-20d262b51676.png) + +# img2img + +## Options + +- `--image_path`: Specifies the image to use for img2img. Specify like `--image_path template.png`. If a folder is specified, images in that folder will be used sequentially. + +- `--strength`: Specifies the strength of img2img. Specify like `--strength 0.8`. The default is `0.8`. + +- `--sequential_file_name`: Specifies whether to make file names sequential. If specified, the generated file names will be sequential starting from `im_000001.png`. + +- `--use_original_file_name`: If specified, the generated file name will be the same as the original file name. + +- `--clip_vision_strength`: Enables CLIP Vision Conditioning for img2img with the specified strength. Uses the CLIP Vision model to enhance conditioning from the input image. + +## Command Line Execution Example + +```batchfile +python gen_img.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt \ + --outdir outputs --xformers --fp16 --scale 12.5 --sampler k_euler --steps 32 \ + --image_path template.png --strength 0.8 \ + --prompt "1girl, cowboy shot, brown hair, pony tail, brown eyes, \ + sailor school uniform, outdoors \ + --n lowres, bad anatomy, bad hands, error, missing fingers, cropped, \ + worst quality, low quality, normal quality, jpeg artifacts, (blurry), \ + hair ornament, glasses" \ + --batch_size 8 --images_per_prompt 32 +``` + +If a folder is specified in the `--image_path` option, images in that folder will be read sequentially. The number of images generated will be the number of prompts, not the number of images, so please match the number of images to img2img and the number of prompts by specifying the `--images_per_prompt` option. + +Files are read sorted by file name. Note that the sort order is string order (not `1.jpg -> 2.jpg -> 10.jpg` but `1.jpg -> 10.jpg -> 2.jpg`), so please pad the beginning with zeros (e.g., `01.jpg -> 02.jpg -> 10.jpg`). + +## Upscale using img2img + +If you specify the generated image size with the `--W` and `--H` command line options during img2img, the original image will be resized to that size before img2img. + +Also, if the original image for img2img was generated by this script, omitting the prompt will retrieve the prompt from the original image's metadata and use it as is. This allows you to perform only the 2nd stage operation of Highres. fix. 
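+
+For example, a 2nd-stage-only upscale pass over previously generated images might look like the following sketch; the folder names, resolution, and strength value are placeholder choices, and because `--prompt` is omitted, the prompt stored in each source image's metadata is reused as described above:
+
+```batchfile
+python gen_img.py --ckpt model.ckpt --outdir upscaled --xformers --fp16 \
+    --sampler k_euler_a --steps 48 --W 1024 --H 1536 \
+    --image_path outputs --strength 0.55 --use_original_file_name
+```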
+ +## Inpainting during img2img + +You can specify an image and a mask image for inpainting (inpainting models are not supported, it simply performs img2img on the mask area). + +The options are as follows: + +- `--mask_image`: Specifies the mask image. Similar to `--img_path`, if a folder is specified, images in that folder will be used sequentially. + +The mask image is a grayscale image, and the white parts will be inpainted. It is recommended to gradient the boundaries to make it somewhat smooth. + +![image](https://user-images.githubusercontent.com/52813779/235343795-9eaa6d98-02ff-4f32-b089-80d1fc482453.png) + +# Other Features + +## Textual Inversion + +Specify the embeddings to use with the `--textual_inversion_embeddings` option (multiple specifications possible). By using the file name without the extension in the prompt, that embedding will be used (same usage as Web UI). It can also be used in negative prompts. + +As models, you can use Textual Inversion models trained with this repository and Textual Inversion models trained with Web UI (image embedding is not supported). + +## Extended Textual Inversion + +Specify the `--XTI_embeddings` option instead of `--textual_inversion_embeddings`. Usage is the same as `--textual_inversion_embeddings`. + +## Highres. fix + +This is a similar feature to the one in AUTOMATIC1111's Web UI (it may differ in various ways as it is an original implementation). It first generates a smaller image and then uses that image as a base for img2img to generate a large resolution image while preventing the entire image from collapsing. + +The number of steps for the 2nd stage is calculated from the values of the `--steps` and `--strength` options (`steps*strength`). + +Cannot be used with img2img. + +The following options are available: + +- `--highres_fix_scale`: Enables Highres. fix and specifies the size of the image generated in the 1st stage as a magnification. If the final output is 1024x1024 and you want to generate a 512x512 image first, specify like `--highres_fix_scale 0.5`. Please note that this is the reciprocal of the specification in Web UI. + +- `--highres_fix_steps`: Specifies the number of steps for the 1st stage image. The default is `28`. + +- `--highres_fix_save_1st`: Specifies whether to save the 1st stage image. + +- `--highres_fix_latents_upscaling`: If specified, the 1st stage image will be upscaled on a latent basis during 2nd stage image generation (only bilinear is supported). If not specified, the image will be upscaled with LANCZOS4. + +- `--highres_fix_upscaler`: Uses an arbitrary upscaler for the 2nd stage. Currently, only `--highres_fix_upscaler tools.latent_upscaler` is supported. + +- `--highres_fix_upscaler_args`: Specifies the arguments to pass to the upscaler specified with `--highres_fix_upscaler`. + For `tools.latent_upscaler`, specify the weight file like `--highres_fix_upscaler_args "weights=D:\\Work\\SD\\Models\\others\\etc\\upscaler-v1-e100-220.safetensors"`. + +- `--highres_fix_disable_control_net`: Disables ControlNet for the 2nd stage of Highres fix. By default, ControlNet is used in both stages. 
+ +Command line example: + +```batchfile +python gen_img.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt\ + --n_iter 1 --scale 7.5 --W 1024 --H 1024 --batch_size 1 --outdir ../txt2img \ + --steps 48 --sampler ddim --fp16 \ + --xformers \ + --images_per_prompt 1 --interactive \ + --highres_fix_scale 0.5 --highres_fix_steps 28 --strength 0.5 +``` + +## Deep Shrink + +Deep Shrink is a technique that optimizes the generation process by using different depths of the UNet at different timesteps. It can improve generation quality and efficiency. + +The following options are available: + +- `--ds_depth_1`: Enables Deep Shrink with this depth for the first phase. Valid values are 0 to 8. + +- `--ds_timesteps_1`: Applies Deep Shrink depth 1 until this timestep. Default is 650. + +- `--ds_depth_2`: Specifies the depth for the second phase of Deep Shrink. + +- `--ds_timesteps_2`: Applies Deep Shrink depth 2 until this timestep. Default is 650. + +- `--ds_ratio`: Specifies the ratio for downsampling in Deep Shrink. Default is 0.5. + +These parameters can also be specified through prompt options: + +- `--dsd1`: Specifies Deep Shrink depth 1 from the prompt. + +- `--dst1`: Specifies Deep Shrink timestep 1 from the prompt. + +- `--dsd2`: Specifies Deep Shrink depth 2 from the prompt. + +- `--dst2`: Specifies Deep Shrink timestep 2 from the prompt. + +- `--dsr`: Specifies Deep Shrink ratio from the prompt. + +## ControlNet + +Currently, only ControlNet 1.0 has been confirmed to work. Only Canny is supported for preprocessing. + +The following options are available: + +- `--control_net_models`: Specifies the ControlNet model file. + If multiple are specified, they will be switched and used for each step (differs from the implementation of the ControlNet extension in Web UI). Supports both diff and normal. + +- `--guide_image_path`: Specifies the hint image to use for ControlNet. Similar to `--img_path`, if a folder is specified, images in that folder will be used sequentially. For models other than Canny, please perform preprocessing beforehand. + +- `--control_net_preps`: Specifies the preprocessing for ControlNet. Multiple specifications are possible, similar to `--control_net_models`. Currently, only canny is supported. If preprocessing is not used for the target model, specify `none`. + For canny, you can specify thresholds 1 and 2 separated by `_`, like `--control_net_preps canny_63_191`. + +- `--control_net_weights`: Specifies the weight when applying ControlNet (`1.0` for normal, `0.5` for half influence). Multiple specifications are possible, similar to `--control_net_models`. + +- `--control_net_ratios`: Specifies the range of steps to apply ControlNet. If `0.5`, ControlNet is applied up to half the number of steps. Multiple specifications are possible, similar to `--control_net_models`. + +Command line example: + +```batchfile +python gen_img.py --ckpt model_ckpt --scale 8 --steps 48 --outdir txt2img --xformers \ + --W 512 --H 768 --bf16 --sampler k_euler_a \ + --control_net_models diff_control_sd15_canny.safetensors --control_net_weights 1.0 \ + --guide_image_path guide.png --control_net_ratios 1.0 --interactive +``` + +## ControlNet-LLLite + +ControlNet-LLLite is a lightweight alternative to ControlNet that can be used for similar guidance purposes. + +The following options are available: + +- `--control_net_lllite_models`: Specifies the ControlNet-LLLite model files. + +- `--control_net_multipliers`: Specifies the multiplier for ControlNet-LLLite (similar to weights). 
+ +- `--control_net_ratios`: Specifies the ratio of steps to apply ControlNet-LLLite. + +Note that ControlNet and ControlNet-LLLite cannot be used at the same time. + +## Attention Couple + Regional LoRA + +This is a feature that allows you to divide the prompt into several parts and specify which region in the image each prompt should be applied to. There are no individual options, but it is specified with `mask_path` and the prompt. + +First, define multiple parts using ` AND ` in the prompt. Region specification can be done for the first three parts, and subsequent parts are applied to the entire image. Negative prompts are applied to the entire image. + +In the following, three parts are defined with AND. + +``` +shs 2girls, looking at viewer, smile AND bsb 2girls, looking back AND 2girls --n bad quality, worst quality +``` + +Next, prepare a mask image. The mask image is a color image, and each RGB channel corresponds to the part separated by AND in the prompt. Also, if the value of a certain channel is all 0, it is applied to the entire image. + +In the example above, the R channel corresponds to `shs 2girls, looking at viewer, smile`, the G channel to `bsb 2girls, looking back`, and the B channel to `2girls`. If you use a mask image like the following, since there is no specification for the B channel, `2girls` will be applied to the entire image. + +![image](https://user-images.githubusercontent.com/52813779/235343061-b4dc9392-3dae-4831-8347-1e9ae5054251.png) + +The mask image is specified with `--mask_path`. Currently, only one image is supported. It is automatically resized and applied to the specified image size. + +It can also be combined with ControlNet (combination with ControlNet is recommended for detailed position specification). + +If LoRA is specified, multiple LoRAs specified with `--network_weights` will correspond to each part of AND. As a current constraint, the number of LoRAs must be the same as the number of AND parts. + +## CLIP Guided Stable Diffusion + +The source code is copied and modified from [this custom pipeline](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#clip-guided-stable-diffusion) in Diffusers' Community Examples. + +In addition to the normal prompt-based generation specification, it additionally acquires the text features of the prompt with a larger CLIP and controls the generated image so that the features of the image being generated approach those text features (this is my rough understanding). Since a larger CLIP is used, VRAM usage increases considerably (it may be difficult even for 512*512 with 8GB of VRAM), and generation time also increases. + +Note that the selectable samplers are DDIM, PNDM, and LMS only. + +Specify how much to reflect the CLIP features numerically with the `--clip_guidance_scale` option. In the previous sample, it is 100, so it seems good to start around there and increase or decrease it. + +By default, the first 75 tokens of the prompt (excluding special weighting characters) are passed to CLIP. With the `--c` option in the prompt, you can specify the text to be passed to CLIP separately from the normal prompt (for example, it is thought that CLIP cannot recognize DreamBooth identifiers or model-specific words like "1girl", so text excluding them is considered good). 
+ +Command line example: + +```batchfile +python gen_img.py --ckpt v1-5-pruned-emaonly.ckpt --n_iter 1 \ + --scale 2.5 --W 512 --H 512 --batch_size 1 --outdir ../txt2img --steps 36 \ + --sampler ddim --fp16 --opt_channels_last --xformers --images_per_prompt 1 \ + --interactive --clip_guidance_scale 100 +``` + +## CLIP Image Guided Stable Diffusion + +This is a feature that passes another image to CLIP instead of text and controls generation to approach its features. Specify the numerical value of the application amount with the `--clip_image_guidance_scale` option and the image (file or folder) to use for guidance with the `--guide_image_path` option. + +Command line example: + +```batchfile +python gen_img.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt\ + --n_iter 1 --scale 7.5 --W 512 --H 512 --batch_size 1 --outdir ../txt2img \ + --steps 80 --sampler ddim --fp16 --opt_channels_last --xformers \ + --images_per_prompt 1 --interactive --clip_image_guidance_scale 100 \ + --guide_image_path YUKA160113420I9A4104_TP_V.jpg +``` + +### VGG16 Guided Stable Diffusion + +This is a feature that generates images to approach a specified image. In addition to the normal prompt-based generation specification, it additionally acquires the features of VGG16 and controls the generated image so that the image being generated approaches the specified guide image. It is recommended to use it with img2img (images tend to be blurred in normal generation). This is an original feature that reuses the mechanism of CLIP Guided Stable Diffusion. The idea is also borrowed from style transfer using VGG. + +Note that the selectable samplers are DDIM, PNDM, and LMS only. + +Specify how much to reflect the VGG16 features numerically with the `--vgg16_guidance_scale` option. From what I've tried, it seems good to start around 100 and increase or decrease it. Specify the image (file or folder) to use for guidance with the `--guide_image_path` option. + +When batch converting multiple images with img2img and using the original images as guide images, it is OK to specify the same value for `--guide_image_path` and `--image_path`. + +Command line example: + +```batchfile +python gen_img.py --ckpt wd-v1-3-full-pruned-half.ckpt \ + --n_iter 1 --scale 5.5 --steps 60 --outdir ../txt2img \ + --xformers --sampler ddim --fp16 --W 512 --H 704 \ + --batch_size 1 --images_per_prompt 1 \ + --prompt "picturesque, 1girl, solo, anime face, skirt, beautiful face \ + --n lowres, bad anatomy, bad hands, error, missing fingers, \ + cropped, worst quality, low quality, normal quality, \ + jpeg artifacts, blurry, 3d, bad face, monochrome --d 1" \ + --strength 0.8 --image_path ..\\src_image\ + --vgg16_guidance_scale 100 --guide_image_path ..\\src_image \ +``` + +You can specify the VGG16 layer number used for feature acquisition with `--vgg16_guidance_layerP` (default is 20, which is ReLU of conv4-2). It is said that upper layers express style and lower layers express content. + +![image](https://user-images.githubusercontent.com/52813779/235343813-3c1f0d7a-4fb3-4274-98e4-b92d76b551df.png) + +# Other Options + +- `--no_preview`: Does not display preview images in interactive mode. Specify this if OpenCV is not installed or if you want to check the output files directly. + +- `--n_iter`: Specifies the number of times to repeat generation. The default is 1. Specify this when you want to perform generation multiple times when reading prompts from a file. + +- `--tokenizer_cache_dir`: Specifies the cache directory for the tokenizer. 
(Work in progress)
+
+- `--seed`: Specifies the random seed. When generating one image, it is the seed for that image. When generating multiple images, it is the seed for the random numbers used to generate the seeds for each image (when generating multiple images with `--from_file`, specifying the `--seed` option makes each image use the same seed across repeated runs).
+
+- `--iter_same_seed`: When there is no random seed specification in the prompt, the same seed is used for all repetitions of `--n_iter`. Used to unify and compare seeds between multiple prompts specified with `--from_file`.
+
+- `--shuffle_prompts`: Shuffles the order of prompts on each iteration. Useful when using `--from_file` with multiple prompts.
+
+- `--diffusers_xformers`: Uses Diffusers' xformers.
+
+- `--opt_channels_last`: Arranges tensor channels last during inference. May speed things up in some cases.
+
+- `--network_show_meta`: Displays the metadata of the additional network.
+
+
+---
+
+# About Gradual Latent
+
+Gradual Latent is a Hires fix that gradually increases the size of the latent. `gen_img.py` and `sdxl_gen_img.py` have the following options.
+
+- `--gradual_latent_timesteps`: Specifies the timestep at which to start increasing the size of the latent. The default is None, which means Gradual Latent is not used. Try around 750 at first.
+- `--gradual_latent_ratio`: Specifies the initial size of the latent. The default is 0.5, which means it starts at half the default latent size.
+- `--gradual_latent_ratio_step`: Specifies the ratio by which to increase the size of the latent. The default is 0.125, which means the latent size is gradually increased to 0.625, 0.75, 0.875, 1.0.
+- `--gradual_latent_ratio_every_n_steps`: Specifies the interval at which to increase the size of the latent. The default is 3, which means the latent size is increased every 3 steps.
+- `--gradual_latent_s_noise`: Specifies the s_noise parameter for Gradual Latent. Default is 1.0.
+- `--gradual_latent_unsharp_params`: Specifies unsharp mask parameters for Gradual Latent: ksize, sigma, strength, target-x (1 means True). Values like `3,0.5,0.5,1` or `3,1.0,1.0,0` are recommended.
+
+Each option can also be specified with the prompt options `--glt`, `--glr`, `--gls`, and `--gle`.
+
+__Please specify `euler_a` for the sampler.__ The sampler's source code has been modified, so it will not work with other samplers.
+
+It is more effective with SD 1.5. The effect is quite subtle with SDXL.
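+
+A Gradual Latent run using the default-like values mentioned above might look like the following sketch; the checkpoint, output folder, resolution, and prompt are placeholders, and `euler_a` is specified because the feature only works with that sampler:
+
+```batchfile
+python gen_img.py --ckpt model.safetensors --outdir outputs --xformers --fp16 \
+    --sampler euler_a --steps 40 --W 512 --H 768 \
+    --gradual_latent_timesteps 750 --gradual_latent_ratio 0.5 \
+    --gradual_latent_ratio_step 0.125 --gradual_latent_ratio_every_n_steps 3 \
+    --prompt "a detailed landscape painting --n lowres, worst quality"
+```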
+ +# Gradual Latent について (Japanese section - kept for reference) + +latentのサイズを徐々に大きくしていくHires fixです。`gen_img.py` 、``sdxl_gen_img.py` 、`gen_img.py` に以下のオプションが追加されています。 + +- `--gradual_latent_timesteps` : latentのサイズを大きくし始めるタイムステップを指定します。デフォルトは None で、Gradual Latentを使用しません。750 くらいから始めてみてください。 +- `--gradual_latent_ratio` : latentの初期サイズを指定します。デフォルトは 0.5 で、デフォルトの latent サイズの半分のサイズから始めます。 +- `--gradual_latent_ratio_step`: latentのサイズを大きくする割合を指定します。デフォルトは 0.125 で、latentのサイズを 0.625, 0.75, 0.875, 1.0 と徐々に大きくします。 +- `--gradual_latent_ratio_every_n_steps`: latentのサイズを大きくする間隔を指定します。デフォルトは 3 で、3ステップごとに latent のサイズを大きくします。 + +それぞれのオプションは、プロンプトオプション、`--glt`、`--glr`、`--gls`、`--gle` でも指定できます。 + +サンプラーに手を加えているため、__サンプラーに `euler_a` を指定してください。__ 他のサンプラーでは動作しません。 + +SD 1.5 のほうが効果があります。SDXL ではかなり微妙です。 From e4d6923409e6406436a4e9f98ae3f74a07a8dd8d Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 3 Jun 2025 16:12:02 -0400 Subject: [PATCH 461/582] Add tests for syntax checking training scripts --- tests/test_fine_tune.py | 6 ++++++ tests/test_flux_train.py | 6 ++++++ tests/test_flux_train_network.py | 5 +++++ tests/test_sd3_train.py | 6 ++++++ tests/test_sd3_train_network.py | 5 +++++ tests/test_sdxl_train.py | 6 ++++++ tests/test_sdxl_train_network.py | 6 ++++++ tests/test_train.py | 6 ++++++ tests/test_train_network.py | 5 +++++ tests/test_train_textual_inversion.py | 5 +++++ 10 files changed, 56 insertions(+) create mode 100644 tests/test_fine_tune.py create mode 100644 tests/test_flux_train.py create mode 100644 tests/test_flux_train_network.py create mode 100644 tests/test_sd3_train.py create mode 100644 tests/test_sd3_train_network.py create mode 100644 tests/test_sdxl_train.py create mode 100644 tests/test_sdxl_train_network.py create mode 100644 tests/test_train.py create mode 100644 tests/test_train_network.py create mode 100644 tests/test_train_textual_inversion.py diff --git a/tests/test_fine_tune.py b/tests/test_fine_tune.py new file mode 100644 index 000000000..fd39ce612 --- /dev/null +++ b/tests/test_fine_tune.py @@ -0,0 +1,6 @@ +import fine_tune + + +def test_syntax(): + # Very simply testing that the train_network imports without syntax errors + assert True diff --git a/tests/test_flux_train.py b/tests/test_flux_train.py new file mode 100644 index 000000000..2b8739cfc --- /dev/null +++ b/tests/test_flux_train.py @@ -0,0 +1,6 @@ +import flux_train + + +def test_syntax(): + # Very simply testing that the train_network imports without syntax errors + assert True diff --git a/tests/test_flux_train_network.py b/tests/test_flux_train_network.py new file mode 100644 index 000000000..aaff89624 --- /dev/null +++ b/tests/test_flux_train_network.py @@ -0,0 +1,5 @@ +import flux_train_network + +def test_syntax(): + # Very simply testing that the flux_train_network imports without syntax errors + assert True diff --git a/tests/test_sd3_train.py b/tests/test_sd3_train.py new file mode 100644 index 000000000..a7c5d27a2 --- /dev/null +++ b/tests/test_sd3_train.py @@ -0,0 +1,6 @@ +import sd3_train + + +def test_syntax(): + # Very simply testing that the train_network imports without syntax errors + assert True diff --git a/tests/test_sd3_train_network.py b/tests/test_sd3_train_network.py new file mode 100644 index 000000000..10c0795cb --- /dev/null +++ b/tests/test_sd3_train_network.py @@ -0,0 +1,5 @@ +import sd3_train_network + +def test_syntax(): + # Very simply testing that the flux_train_network imports without syntax errors + assert True diff --git a/tests/test_sdxl_train.py b/tests/test_sdxl_train.py new 
file mode 100644 index 000000000..1c0e85799 --- /dev/null +++ b/tests/test_sdxl_train.py @@ -0,0 +1,6 @@ +import sdxl_train + + +def test_syntax(): + # Very simply testing that the train_network imports without syntax errors + assert True diff --git a/tests/test_sdxl_train_network.py b/tests/test_sdxl_train_network.py new file mode 100644 index 000000000..58300ae7d --- /dev/null +++ b/tests/test_sdxl_train_network.py @@ -0,0 +1,6 @@ +import sdxl_train_network + + +def test_syntax(): + # Very simply testing that the train_network imports without syntax errors + assert True diff --git a/tests/test_train.py b/tests/test_train.py new file mode 100644 index 000000000..51c794924 --- /dev/null +++ b/tests/test_train.py @@ -0,0 +1,6 @@ +import train_db + + +def test_syntax(): + # Very simply testing that the train_network imports without syntax errors + assert True diff --git a/tests/test_train_network.py b/tests/test_train_network.py new file mode 100644 index 000000000..fe17263c6 --- /dev/null +++ b/tests/test_train_network.py @@ -0,0 +1,5 @@ +import train_network + +def test_syntax(): + # Very simply testing that the train_network imports without syntax errors + assert True diff --git a/tests/test_train_textual_inversion.py b/tests/test_train_textual_inversion.py new file mode 100644 index 000000000..ab6a93425 --- /dev/null +++ b/tests/test_train_textual_inversion.py @@ -0,0 +1,5 @@ +import train_textual_inversion + +def test_syntax(): + # Very simply testing that the train_network imports without syntax errors + assert True From bb47f1ea893bc1f12ceaa3856cc03c0b14ec559b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 8 Jun 2025 18:00:24 +0900 Subject: [PATCH 462/582] Fix unwrap_model handling for None text_encoders in sample_images function --- library/flux_train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 5f6867a81..8392e5592 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -67,7 +67,7 @@ def sample_images( # unwrap unet and text_encoder(s) flux = accelerator.unwrap_model(flux) if text_encoders is not None: - text_encoders = [accelerator.unwrap_model(te) for te in text_encoders] + text_encoders = [(accelerator.unwrap_model(te) if te is not None else None) for te in text_encoders] if controlnet is not None: controlnet = accelerator.unwrap_model(controlnet) # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) From d94bed645a4d899cffd0bce5804fcf32c4500ad3 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 9 Jun 2025 21:14:51 -0400 Subject: [PATCH 463/582] Add lumina tests and fix image masks --- library/lumina_models.py | 6 + library/lumina_util.py | 85 ++++--- library/sd3_train_utils.py | 259 +++------------------ tests/library/test_lumina_models.py | 295 ++++++++++++++++++++++++ tests/library/test_lumina_train_util.py | 241 +++++++++++++++++++ tests/library/test_lumina_util.py | 112 +++++++++ tests/library/test_strategy_lumina.py | 227 ++++++++++++++++++ tests/test_lumina_train_network.py | 173 ++++++++++++++ 8 files changed, 1130 insertions(+), 268 deletions(-) create mode 100644 tests/library/test_lumina_models.py create mode 100644 tests/library/test_lumina_train_util.py create mode 100644 tests/library/test_lumina_util.py create mode 100644 tests/library/test_strategy_lumina.py create mode 100644 tests/test_lumina_train_network.py diff --git a/library/lumina_models.py b/library/lumina_models.py index 2508cc7df..7e9253525 
100644 --- a/library/lumina_models.py +++ b/library/lumina_models.py @@ -868,6 +868,8 @@ def __init__( cap_feat_dim (int): Dimension of the caption features. axes_dims (List[int]): List of dimensions for the axes. axes_lens (List[int]): List of lengths for the axes. + use_flash_attn (bool): Whether to use Flash Attention. + use_sage_attn (bool): Whether to use Sage Attention. Sage Attention only supports inference. Returns: None @@ -1110,7 +1112,11 @@ def patchify_and_embed( cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis) x = x.view(bsz, channels, height // pH, pH, width // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2) + x_mask = torch.zeros(bsz, image_seq_len, dtype=torch.bool, device=device) + for i in range(bsz): + x[i, :image_seq_len] = x[i] + x_mask[i, :image_seq_len] = True x = self.x_embedder(x) diff --git a/library/lumina_util.py b/library/lumina_util.py index 06f089d4a..452b242fd 100644 --- a/library/lumina_util.py +++ b/library/lumina_util.py @@ -173,62 +173,61 @@ def pack_latents(x: torch.Tensor) -> torch.Tensor: return x -DIFFUSERS_TO_ALPHA_VLLM_MAP = { + +DIFFUSERS_TO_ALPHA_VLLM_MAP: dict[str, str] = { # Embedding layers - "cap_embedder.0.weight": ["time_caption_embed.caption_embedder.0.weight"], - "cap_embedder.1.weight": "time_caption_embed.caption_embedder.1.weight", - "cap_embedder.1.bias": "text_embedder.1.bias", - "x_embedder.weight": "patch_embedder.proj.weight", - "x_embedder.bias": "patch_embedder.proj.bias", + "time_caption_embed.caption_embedder.0.weight": "cap_embedder.0.weight", + "time_caption_embed.caption_embedder.1.weight": "cap_embedder.1.weight", + "text_embedder.1.bias": "cap_embedder.1.bias", + "patch_embedder.proj.weight": "x_embedder.weight", + "patch_embedder.proj.bias": "x_embedder.bias", # Attention modulation - "layers.().adaLN_modulation.1.weight": "transformer_blocks.().adaln_modulation.1.weight", - "layers.().adaLN_modulation.1.bias": "transformer_blocks.().adaln_modulation.1.bias", + "transformer_blocks.().adaln_modulation.1.weight": "layers.().adaLN_modulation.1.weight", + "transformer_blocks.().adaln_modulation.1.bias": "layers.().adaLN_modulation.1.bias", # Final layers - "final_layer.adaLN_modulation.1.weight": "final_adaln_modulation.1.weight", - "final_layer.adaLN_modulation.1.bias": "final_adaln_modulation.1.bias", - "final_layer.linear.weight": "final_linear.weight", - "final_layer.linear.bias": "final_linear.bias", + "final_adaln_modulation.1.weight": "final_layer.adaLN_modulation.1.weight", + "final_adaln_modulation.1.bias": "final_layer.adaLN_modulation.1.bias", + "final_linear.weight": "final_layer.linear.weight", + "final_linear.bias": "final_layer.linear.bias", # Noise refiner - "noise_refiner.().adaLN_modulation.1.weight": "single_transformer_blocks.().adaln_modulation.1.weight", - "noise_refiner.().adaLN_modulation.1.bias": "single_transformer_blocks.().adaln_modulation.1.bias", - "noise_refiner.().attention.qkv.weight": "single_transformer_blocks.().attn.to_qkv.weight", - "noise_refiner.().attention.out.weight": "single_transformer_blocks.().attn.to_out.0.weight", - # Time embedding - "t_embedder.mlp.0.weight": "time_embedder.0.weight", - "t_embedder.mlp.0.bias": "time_embedder.0.bias", - "t_embedder.mlp.2.weight": "time_embedder.2.weight", - "t_embedder.mlp.2.bias": "time_embedder.2.bias", - # Context attention - "context_refiner.().attention.qkv.weight": "transformer_blocks.().attn2.to_qkv.weight", - "context_refiner.().attention.out.weight": "transformer_blocks.().attn2.to_out.0.weight", + 
"single_transformer_blocks.().adaln_modulation.1.weight": "noise_refiner.().adaLN_modulation.1.weight", + "single_transformer_blocks.().adaln_modulation.1.bias": "noise_refiner.().adaLN_modulation.1.bias", + "single_transformer_blocks.().attn.to_qkv.weight": "noise_refiner.().attention.qkv.weight", + "single_transformer_blocks.().attn.to_out.0.weight": "noise_refiner.().attention.out.weight", # Normalization - "layers.().attention_norm1.weight": "transformer_blocks.().norm1.weight", - "layers.().attention_norm2.weight": "transformer_blocks.().norm2.weight", + "transformer_blocks.().norm1.weight": "layers.().attention_norm1.weight", + "transformer_blocks.().norm2.weight": "layers.().attention_norm2.weight", # FFN - "layers.().feed_forward.w1.weight": "transformer_blocks.().ff.net.0.proj.weight", - "layers.().feed_forward.w2.weight": "transformer_blocks.().ff.net.2.weight", - "layers.().feed_forward.w3.weight": "transformer_blocks.().ff.net.4.weight", + "transformer_blocks.().ff.net.0.proj.weight": "layers.().feed_forward.w1.weight", + "transformer_blocks.().ff.net.2.weight": "layers.().feed_forward.w2.weight", + "transformer_blocks.().ff.net.4.weight": "layers.().feed_forward.w3.weight", } def convert_diffusers_sd_to_alpha_vllm(sd: dict, num_double_blocks: int) -> dict: """Convert Diffusers checkpoint to Alpha-VLLM format""" logger.info("Converting Diffusers checkpoint to Alpha-VLLM format") - new_sd = {} - - for key, value in sd.items(): - new_key = key - for pattern, replacement in DIFFUSERS_TO_ALPHA_VLLM_MAP.items(): - if "()." in pattern: - for block_idx in range(num_double_blocks): - if str(block_idx) in key: - converted = pattern.replace("()", str(block_idx)) - new_key = key.replace(converted, replacement.replace("()", str(block_idx))) - break + new_sd = sd.copy() # Preserve original keys + + for diff_key, alpha_key in DIFFUSERS_TO_ALPHA_VLLM_MAP.items(): + # Handle block-specific patterns + if '().' in diff_key: + for block_idx in range(num_double_blocks): + block_alpha_key = alpha_key.replace('().', f'{block_idx}.') + block_diff_key = diff_key.replace('().', f'{block_idx}.') + + # Search for and convert block-specific keys + for input_key, value in list(sd.items()): + if input_key == block_diff_key: + new_sd[block_alpha_key] = value + else: + # Handle static keys + if diff_key in sd: + print(f"Replacing {diff_key} with {alpha_key}") + new_sd[alpha_key] = sd[diff_key] + else: + print(f"Not found: {diff_key}") - if new_key == key: - logger.debug(f"Unmatched key in conversion: {key}") - new_sd[new_key] = value logger.info(f"Converted {len(new_sd)} keys to Alpha-VLLM format") return new_sd diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py index 6a4b39b3a..c40798846 100644 --- a/library/sd3_train_utils.py +++ b/library/sd3_train_utils.py @@ -610,21 +610,6 @@ def encode_prompt(prpt): from diffusers.utils import BaseOutput -# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - @dataclass class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): """ @@ -664,49 +649,22 @@ def __init__( self, num_train_timesteps: int = 1000, shift: float = 1.0, - use_dynamic_shifting=False, - base_shift: Optional[float] = 0.5, - max_shift: Optional[float] = 1.15, - base_image_seq_len: Optional[int] = 256, - max_image_seq_len: Optional[int] = 4096, - invert_sigmas: bool = False, - shift_terminal: Optional[float] = None, - use_karras_sigmas: Optional[bool] = False, - use_exponential_sigmas: Optional[bool] = False, - use_beta_sigmas: Optional[bool] = False, ): - if self.config.use_beta_sigmas and not is_scipy_available(): - raise ImportError("Make sure to install scipy if you want to use beta sigmas.") - if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1: - raise ValueError( - "Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used." - ) timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy() timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) sigmas = timesteps / num_train_timesteps - if not use_dynamic_shifting: - # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution - sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) self.timesteps = sigmas * num_train_timesteps self._step_index = None self._begin_index = None - self._shift = shift - self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication self.sigma_min = self.sigmas[-1].item() self.sigma_max = self.sigmas[0].item() - @property - def shift(self): - """ - The value used for shifting. - """ - return self._shift - @property def step_index(self): """ @@ -732,9 +690,6 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def set_shift(self, shift: float): - self._shift = shift - def scale_noise( self, sample: torch.FloatTensor, @@ -754,31 +709,10 @@ def scale_noise( `torch.FloatTensor`: A scaled input sample. 
""" - # Make sure sigmas and timesteps have the same device and dtype as original_samples - sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) - - if sample.device.type == "mps" and torch.is_floating_point(timestep): - # mps does not support float64 - schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) - timestep = timestep.to(sample.device, dtype=torch.float32) - else: - schedule_timesteps = self.timesteps.to(sample.device) - timestep = timestep.to(sample.device) - - # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index - if self.begin_index is None: - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] - elif self.step_index is not None: - # add_noise is called after first denoising step (for inpainting) - step_indices = [self.step_index] * timestep.shape[0] - else: - # add noise is called before first denoising step to create initial latent(img2img) - step_indices = [self.begin_index] * timestep.shape[0] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(sample.shape): - sigma = sigma.unsqueeze(-1) + if self.step_index is None: + self._init_step_index(timestep) + sigma = self.sigmas[self.step_index] sample = sigma * noise + (1.0 - sigma) * sample return sample @@ -786,37 +720,7 @@ def scale_noise( def _sigma_to_t(self, sigma): return sigma * self.config.num_train_timesteps - def time_shift(self, mu: float, sigma: float, t: torch.Tensor): - return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) - - def stretch_shift_to_terminal(self, t: torch.Tensor) -> torch.Tensor: - r""" - Stretches and shifts the timestep schedule to ensure it terminates at the configured `shift_terminal` config - value. - - Reference: - https://github.com/Lightricks/LTX-Video/blob/a01a171f8fe3d99dce2728d60a73fecf4d4238ae/ltx_video/schedulers/rf.py#L51 - - Args: - t (`torch.Tensor`): - A tensor of timesteps to be stretched and shifted. - - Returns: - `torch.Tensor`: - A tensor of adjusted timesteps such that the final value equals `self.config.shift_terminal`. - """ - one_minus_z = 1 - t - scale_factor = one_minus_z[-1] / (1 - self.config.shift_terminal) - stretched_t = 1 - (one_minus_z / scale_factor) - return stretched_t - - def set_timesteps( - self, - num_inference_steps: int = None, - device: Union[str, torch.device] = None, - sigmas: Optional[List[float]] = None, - mu: Optional[float] = None, - ): + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). @@ -826,49 +730,18 @@ def set_timesteps( device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
""" - if self.config.use_dynamic_shifting and mu is None: - raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") - - if sigmas is None: - timesteps = np.linspace( - self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps - ) - - sigmas = timesteps / self.config.num_train_timesteps - else: - sigmas = np.array(sigmas).astype(np.float32) - num_inference_steps = len(sigmas) self.num_inference_steps = num_inference_steps - if self.config.use_dynamic_shifting: - sigmas = self.time_shift(mu, 1.0, sigmas) - else: - sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas) - - if self.config.shift_terminal: - sigmas = self.stretch_shift_to_terminal(sigmas) - - if self.config.use_karras_sigmas: - sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) - - elif self.config.use_exponential_sigmas: - sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=num_inference_steps) - - elif self.config.use_beta_sigmas: - sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.linspace(self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps) + sigmas = timesteps / self.config.num_train_timesteps + sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) - timesteps = sigmas * self.config.num_train_timesteps - - if self.config.invert_sigmas: - sigmas = 1.0 - sigmas - timesteps = sigmas * self.config.num_train_timesteps - sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)]) - else: - sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + timesteps = sigmas * self.config.num_train_timesteps self.timesteps = timesteps.to(device=device) - self.sigmas = sigmas + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + self._step_index = None self._begin_index = None @@ -934,11 +807,7 @@ def step( returned, otherwise a tuple is returned where the first element is the sample tensor. """ - if ( - isinstance(timestep, int) - or isinstance(timestep, torch.IntTensor) - or isinstance(timestep, torch.LongTensor) - ): + if isinstance(timestep, int) or isinstance(timestep, torch.IntTensor) or isinstance(timestep, torch.LongTensor): raise ValueError( ( "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" @@ -954,100 +823,40 @@ def step( sample = sample.to(torch.float32) sigma = self.sigmas[self.step_index] - sigma_next = self.sigmas[self.step_index + 1] - prev_sample = sample + (sigma_next - sigma) * model_output + gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 - # Cast sample back to model compatible dtype - prev_sample = prev_sample.to(model_output.dtype) + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator) - # upon completion increase step index by one - self._step_index += 1 + eps = noise * s_noise + sigma_hat = sigma * (gamma + 1) - if not return_dict: - return (prev_sample,) + if gamma > 0: + sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 - return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) + # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise + # NOTE: "original_sample" should not be an expected prediction_type but is left in for + # backwards compatibility - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: - """Constructs the noise schedule of Karras et al. (2022).""" + # if self.config.prediction_type == "vector_field": - # Hack to make sure that other schedulers which copy this function don't break - # TODO: Add this logic to the other schedulers - if hasattr(self.config, "sigma_min"): - sigma_min = self.config.sigma_min - else: - sigma_min = None - - if hasattr(self.config, "sigma_max"): - sigma_max = self.config.sigma_max - else: - sigma_max = None - - sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() - sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() - - rho = 7.0 # 7.0 is the value used in the paper - ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min ** (1 / rho) - max_inv_rho = sigma_max ** (1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho - return sigmas - - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential - def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor: - """Constructs an exponential noise schedule.""" - - # Hack to make sure that other schedulers which copy this function don't break - # TODO: Add this logic to the other schedulers - if hasattr(self.config, "sigma_min"): - sigma_min = self.config.sigma_min - else: - sigma_min = None + denoised = sample - model_output * sigma + # 2. Convert to an ODE derivative + derivative = (sample - denoised) / sigma_hat - if hasattr(self.config, "sigma_max"): - sigma_max = self.config.sigma_max - else: - sigma_max = None - - sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() - sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + dt = self.sigmas[self.step_index + 1] - sigma_hat - sigmas = np.exp(np.linspace(math.log(sigma_max), math.log(sigma_min), num_inference_steps)) - return sigmas + prev_sample = sample + derivative * dt + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) - # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_beta - def _convert_to_beta( - self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6 - ) -> torch.Tensor: - """From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. 
al, 2024)""" + # upon completion increase step index by one + self._step_index += 1 - # Hack to make sure that other schedulers which copy this function don't break - # TODO: Add this logic to the other schedulers - if hasattr(self.config, "sigma_min"): - sigma_min = self.config.sigma_min - else: - sigma_min = None + if not return_dict: + return (prev_sample,) - if hasattr(self.config, "sigma_max"): - sigma_max = self.config.sigma_max - else: - sigma_max = None - - sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() - sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() - - sigmas = np.array( - [ - sigma_min + (ppf * (sigma_max - sigma_min)) - for ppf in [ - scipy.stats.beta.ppf(timestep, alpha, beta) - for timestep in 1 - np.linspace(0, 1, num_inference_steps) - ] - ] - ) - return sigmas + return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) def __len__(self): return self.config.num_train_timesteps diff --git a/tests/library/test_lumina_models.py b/tests/library/test_lumina_models.py new file mode 100644 index 000000000..ba063688c --- /dev/null +++ b/tests/library/test_lumina_models.py @@ -0,0 +1,295 @@ +import pytest +import torch + +from library.lumina_models import ( + LuminaParams, + to_cuda, + to_cpu, + RopeEmbedder, + TimestepEmbedder, + modulate, + NextDiT, +) + +cuda_required = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + + +def test_lumina_params(): + # Test default configuration + default_params = LuminaParams() + assert default_params.patch_size == 2 + assert default_params.in_channels == 4 + assert default_params.axes_dims == [36, 36, 36] + assert default_params.axes_lens == [300, 512, 512] + + # Test 2B config + config_2b = LuminaParams.get_2b_config() + assert config_2b.dim == 2304 + assert config_2b.in_channels == 16 + assert config_2b.n_layers == 26 + assert config_2b.n_heads == 24 + assert config_2b.cap_feat_dim == 2304 + + # Test 7B config + config_7b = LuminaParams.get_7b_config() + assert config_7b.dim == 4096 + assert config_7b.n_layers == 32 + assert config_7b.n_heads == 32 + assert config_7b.axes_dims == [64, 64, 64] + + +@cuda_required +def test_to_cuda_to_cpu(): + # Test tensor conversion + x = torch.tensor([1, 2, 3]) + x_cuda = to_cuda(x) + x_cpu = to_cpu(x_cuda) + assert x.cpu().tolist() == x_cpu.tolist() + + # Test list conversion + list_data = [torch.tensor([1]), torch.tensor([2])] + list_cuda = to_cuda(list_data) + assert all(tensor.device.type == "cuda" for tensor in list_cuda) + + list_cpu = to_cpu(list_cuda) + assert all(not tensor.device.type == "cuda" for tensor in list_cpu) + + # Test dict conversion + dict_data = {"a": torch.tensor([1]), "b": torch.tensor([2])} + dict_cuda = to_cuda(dict_data) + assert all(tensor.device.type == "cuda" for tensor in dict_cuda.values()) + + dict_cpu = to_cpu(dict_cuda) + assert all(not tensor.device.type == "cuda" for tensor in dict_cpu.values()) + + +def test_timestep_embedder(): + # Test initialization + hidden_size = 256 + freq_emb_size = 128 + embedder = TimestepEmbedder(hidden_size, freq_emb_size) + assert embedder.frequency_embedding_size == freq_emb_size + + # Test timestep embedding + t = torch.tensor([0.5, 1.0, 2.0]) + emb_dim = freq_emb_size + embeddings = TimestepEmbedder.timestep_embedding(t, emb_dim) + + assert embeddings.shape == (3, emb_dim) + assert embeddings.dtype == torch.float32 + + # Ensure embeddings are unique for different input times + assert not torch.allclose(embeddings[0], embeddings[1]) + + # Test 
forward pass + t_emb = embedder(t) + assert t_emb.shape == (3, hidden_size) + + +def test_rope_embedder_simple(): + rope_embedder = RopeEmbedder() + batch_size, seq_len = 2, 10 + + # Create position_ids with valid ranges for each axis + position_ids = torch.stack( + [ + torch.zeros(batch_size, seq_len, dtype=torch.int64), # First axis: only 0 is valid + torch.randint(0, 512, (batch_size, seq_len), dtype=torch.int64), # Second axis: 0-511 + torch.randint(0, 512, (batch_size, seq_len), dtype=torch.int64), # Third axis: 0-511 + ], + dim=-1, + ) + + freqs_cis = rope_embedder(position_ids) + # RoPE embeddings work in pairs, so output dimension is half of total axes_dims + expected_dim = sum(rope_embedder.axes_dims) // 2 # 128 // 2 = 64 + assert freqs_cis.shape == (batch_size, seq_len, expected_dim) + + +def test_modulate(): + # Test modulation with different scales + x = torch.tensor([[1.0, 2.0], [3.0, 4.0]]) + scale = torch.tensor([1.5, 2.0]) + + modulated_x = modulate(x, scale) + + # Check that modulation scales correctly + # The function does x * (1 + scale), so: + # For scale [1.5, 2.0], (1 + scale) = [2.5, 3.0] + expected_x = torch.tensor([[2.5 * 1.0, 2.5 * 2.0], [3.0 * 3.0, 3.0 * 4.0]]) + # Which equals: [[2.5, 5.0], [9.0, 12.0]] + + assert torch.allclose(modulated_x, expected_x) + + +def test_nextdit_parameter_count_optimized(): + # The constraint is: (dim // n_heads) == sum(axes_dims) + # So for dim=120, n_heads=4: 120//4 = 30, so sum(axes_dims) must = 30 + model_small = NextDiT( + patch_size=2, + in_channels=4, # Smaller + dim=120, # 120 // 4 = 30 + n_layers=2, # Much fewer layers + n_heads=4, # Fewer heads + n_kv_heads=2, + axes_dims=[10, 10, 10], # sum = 30 + axes_lens=[10, 32, 32], # Smaller + ) + param_count_small = model_small.parameter_count() + assert param_count_small > 0 + + # For dim=192, n_heads=6: 192//6 = 32, so sum(axes_dims) must = 32 + model_medium = NextDiT( + patch_size=2, + in_channels=4, + dim=192, # 192 // 6 = 32 + n_layers=4, # More layers + n_heads=6, + n_kv_heads=3, + axes_dims=[10, 11, 11], # sum = 32 + axes_lens=[10, 32, 32], + ) + param_count_medium = model_medium.parameter_count() + assert param_count_medium > param_count_small + print(f"Small model: {param_count_small:,} parameters") + print(f"Medium model: {param_count_medium:,} parameters") + + +@torch.no_grad() +def test_precompute_freqs_cis(): + # Test precompute_freqs_cis + dim = [16, 56, 56] + end = [1, 512, 512] + theta = 10000.0 + + freqs_cis = NextDiT.precompute_freqs_cis(dim, end, theta) + + # Check number of frequency tensors + assert len(freqs_cis) == len(dim) + + # Check each frequency tensor + for i, (d, e) in enumerate(zip(dim, end)): + assert freqs_cis[i].shape == (e, d // 2) + assert freqs_cis[i].dtype == torch.complex128 + + +@torch.no_grad() +def test_nextdit_patchify_and_embed(): + """Test the patchify_and_embed method which is crucial for training""" + # Create a small NextDiT model for testing + # The constraint is: (dim // n_heads) == sum(axes_dims) + # For dim=120, n_heads=4: 120//4 = 30, so sum(axes_dims) must = 30 + model = NextDiT( + patch_size=2, + in_channels=4, + dim=120, # 120 // 4 = 30 + n_layers=1, # Minimal layers for faster testing + n_refiner_layers=1, # Minimal refiner layers + n_heads=4, + n_kv_heads=2, + axes_dims=[10, 10, 10], # sum = 30 + axes_lens=[10, 32, 32], + cap_feat_dim=120, # Match dim for consistency + ) + + # Prepare test inputs + batch_size = 2 + height, width = 64, 64 # Must be divisible by patch_size (2) + caption_seq_len = 8 + + # Create mock inputs + x 
= torch.randn(batch_size, 4, height, width) # Image latents + cap_feats = torch.randn(batch_size, caption_seq_len, 120) # Caption features + cap_mask = torch.ones(batch_size, caption_seq_len, dtype=torch.bool) # All valid tokens + # Make second batch have shorter caption + cap_mask[1, 6:] = False # Only first 6 tokens are valid for second batch + t = torch.randn(batch_size, 120) # Timestep embeddings + + # Call patchify_and_embed + joint_hidden_states, attention_mask, freqs_cis, l_effective_cap_len, seq_lengths = model.patchify_and_embed( + x, cap_feats, cap_mask, t + ) + + # Validate outputs + image_seq_len = (height // 2) * (width // 2) # patch_size = 2 + expected_seq_lengths = [caption_seq_len + image_seq_len, 6 + image_seq_len] # Second batch has shorter caption + max_seq_len = max(expected_seq_lengths) + + # Check joint hidden states shape + assert joint_hidden_states.shape == (batch_size, max_seq_len, 120) + assert joint_hidden_states.dtype == torch.float32 + + # Check attention mask shape and values + assert attention_mask.shape == (batch_size, max_seq_len) + assert attention_mask.dtype == torch.bool + # First batch should have all positions valid up to its sequence length + assert torch.all(attention_mask[0, : expected_seq_lengths[0]]) + assert torch.all(~attention_mask[0, expected_seq_lengths[0] :]) + # Second batch should have all positions valid up to its sequence length + assert torch.all(attention_mask[1, : expected_seq_lengths[1]]) + assert torch.all(~attention_mask[1, expected_seq_lengths[1] :]) + + # Check freqs_cis shape + assert freqs_cis.shape == (batch_size, max_seq_len, sum(model.axes_dims) // 2) + + # Check effective caption lengths + assert l_effective_cap_len == [caption_seq_len, 6] + + # Check sequence lengths + assert seq_lengths == expected_seq_lengths + + # Validate that the joint hidden states contain non-zero values where attention mask is True + for i in range(batch_size): + valid_positions = attention_mask[i] + # Check that valid positions have meaningful data (not all zeros) + valid_data = joint_hidden_states[i][valid_positions] + assert not torch.allclose(valid_data, torch.zeros_like(valid_data)) + + # Check that invalid positions are zeros + if valid_positions.sum() < max_seq_len: + invalid_data = joint_hidden_states[i][~valid_positions] + assert torch.allclose(invalid_data, torch.zeros_like(invalid_data)) + + +@torch.no_grad() +def test_nextdit_patchify_and_embed_edge_cases(): + """Test edge cases for patchify_and_embed""" + # Create minimal model + model = NextDiT( + patch_size=2, + in_channels=4, + dim=60, # 60 // 3 = 20 + n_layers=1, + n_refiner_layers=1, + n_heads=3, + n_kv_heads=1, + axes_dims=[8, 6, 6], # sum = 20 + axes_lens=[10, 16, 16], + cap_feat_dim=60, + ) + + # Test with empty captions (all masked) + batch_size = 1 + height, width = 32, 32 + caption_seq_len = 4 + + x = torch.randn(batch_size, 4, height, width) + cap_feats = torch.randn(batch_size, caption_seq_len, 60) + cap_mask = torch.zeros(batch_size, caption_seq_len, dtype=torch.bool) # All tokens masked + t = torch.randn(batch_size, 60) + + joint_hidden_states, attention_mask, freqs_cis, l_effective_cap_len, seq_lengths = model.patchify_and_embed( + x, cap_feats, cap_mask, t + ) + + # With all captions masked, effective length should be 0 + assert l_effective_cap_len == [0] + + # Sequence length should just be the image sequence length + image_seq_len = (height // 2) * (width // 2) + assert seq_lengths == [image_seq_len] + + # Joint hidden states should only contain image data + assert 
joint_hidden_states.shape == (batch_size, image_seq_len, 60) + assert attention_mask.shape == (batch_size, image_seq_len) + assert torch.all(attention_mask[0]) # All image positions should be valid diff --git a/tests/library/test_lumina_train_util.py b/tests/library/test_lumina_train_util.py new file mode 100644 index 000000000..bcf448c89 --- /dev/null +++ b/tests/library/test_lumina_train_util.py @@ -0,0 +1,241 @@ +import pytest +import torch +import math + +from library.lumina_train_util import ( + batchify, + time_shift, + get_lin_function, + get_schedule, + compute_density_for_timestep_sampling, + get_sigmas, + compute_loss_weighting_for_sd3, + get_noisy_model_input_and_timesteps, + apply_model_prediction_type, + retrieve_timesteps, +) +from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler + + +def test_batchify(): + # Test case with no batch size specified + prompts = [ + {"prompt": "test1"}, + {"prompt": "test2"}, + {"prompt": "test3"} + ] + batchified = list(batchify(prompts)) + assert len(batchified) == 1 + assert len(batchified[0]) == 3 + + # Test case with batch size specified + batchified_sized = list(batchify(prompts, batch_size=2)) + assert len(batchified_sized) == 2 + assert len(batchified_sized[0]) == 2 + assert len(batchified_sized[1]) == 1 + + # Test batching with prompts having same parameters + prompts_with_params = [ + {"prompt": "test1", "width": 512, "height": 512}, + {"prompt": "test2", "width": 512, "height": 512}, + {"prompt": "test3", "width": 1024, "height": 1024} + ] + batchified_params = list(batchify(prompts_with_params)) + assert len(batchified_params) == 2 + + # Test invalid batch size + with pytest.raises(ValueError): + list(batchify(prompts, batch_size=0)) + with pytest.raises(ValueError): + list(batchify(prompts, batch_size=-1)) + + +def test_time_shift(): + # Test standard parameters + t = torch.tensor([0.5]) + mu = 1.0 + sigma = 1.0 + result = time_shift(mu, sigma, t) + assert 0 <= result <= 1 + + # Test with edge cases + t_edges = torch.tensor([0.0, 1.0]) + result_edges = time_shift(1.0, 1.0, t_edges) + + # Check that results are bounded within [0, 1] + assert torch.all(result_edges >= 0) + assert torch.all(result_edges <= 1) + + +def test_get_lin_function(): + # Default parameters + func = get_lin_function() + assert func(256) == 0.5 + assert func(4096) == 1.15 + + # Custom parameters + custom_func = get_lin_function(x1=100, x2=1000, y1=0.1, y2=0.9) + assert custom_func(100) == 0.1 + assert custom_func(1000) == 0.9 + + +def test_get_schedule(): + # Basic schedule + schedule = get_schedule(num_steps=10, image_seq_len=256) + assert len(schedule) == 10 + assert all(0 <= x <= 1 for x in schedule) + + # Test different sequence lengths + short_schedule = get_schedule(num_steps=5, image_seq_len=128) + long_schedule = get_schedule(num_steps=15, image_seq_len=1024) + assert len(short_schedule) == 5 + assert len(long_schedule) == 15 + + # Test with shift disabled + unshifted_schedule = get_schedule(num_steps=10, image_seq_len=256, shift=False) + assert torch.allclose( + torch.tensor(unshifted_schedule), + torch.linspace(1, 1/10, 10) + ) + + +def test_compute_density_for_timestep_sampling(): + # Test uniform sampling + uniform_samples = compute_density_for_timestep_sampling("uniform", batch_size=100) + assert len(uniform_samples) == 100 + assert torch.all((uniform_samples >= 0) & (uniform_samples <= 1)) + + # Test logit normal sampling + logit_normal_samples = compute_density_for_timestep_sampling( + "logit_normal", batch_size=100, logit_mean=0.0, 
logit_std=1.0 + ) + assert len(logit_normal_samples) == 100 + assert torch.all((logit_normal_samples >= 0) & (logit_normal_samples <= 1)) + + # Test mode sampling + mode_samples = compute_density_for_timestep_sampling( + "mode", batch_size=100, mode_scale=0.5 + ) + assert len(mode_samples) == 100 + assert torch.all((mode_samples >= 0) & (mode_samples <= 1)) + + +def test_get_sigmas(): + # Create a mock noise scheduler + scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000) + device = torch.device('cpu') + + # Test with default parameters + timesteps = torch.tensor([100, 500, 900]) + sigmas = get_sigmas(scheduler, timesteps, device) + + # Check shape and basic properties + assert sigmas.shape[0] == 3 + assert torch.all(sigmas >= 0) + + # Test with different n_dim + sigmas_4d = get_sigmas(scheduler, timesteps, device, n_dim=4) + assert sigmas_4d.ndim == 4 + + # Test with different dtype + sigmas_float16 = get_sigmas(scheduler, timesteps, device, dtype=torch.float16) + assert sigmas_float16.dtype == torch.float16 + + +def test_compute_loss_weighting_for_sd3(): + # Prepare some mock sigmas + sigmas = torch.tensor([0.1, 0.5, 1.0]) + + # Test sigma_sqrt weighting + sqrt_weighting = compute_loss_weighting_for_sd3("sigma_sqrt", sigmas) + assert torch.allclose(sqrt_weighting, 1 / (sigmas**2), rtol=1e-5) + + # Test cosmap weighting + cosmap_weighting = compute_loss_weighting_for_sd3("cosmap", sigmas) + bot = 1 - 2 * sigmas + 2 * sigmas**2 + expected_cosmap = 2 / (math.pi * bot) + assert torch.allclose(cosmap_weighting, expected_cosmap, rtol=1e-5) + + # Test default weighting + default_weighting = compute_loss_weighting_for_sd3("unknown", sigmas) + assert torch.all(default_weighting == 1) + + +def test_apply_model_prediction_type(): + # Create mock args and tensors + class MockArgs: + model_prediction_type = "raw" + weighting_scheme = "sigma_sqrt" + + args = MockArgs() + model_pred = torch.tensor([1.0, 2.0, 3.0]) + noisy_model_input = torch.tensor([0.5, 1.0, 1.5]) + sigmas = torch.tensor([0.1, 0.5, 1.0]) + + # Test raw prediction type + raw_pred, raw_weighting = apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) + assert torch.all(raw_pred == model_pred) + assert raw_weighting is None + + # Test additive prediction type + args.model_prediction_type = "additive" + additive_pred, _ = apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) + assert torch.all(additive_pred == model_pred + noisy_model_input) + + # Test sigma scaled prediction type + args.model_prediction_type = "sigma_scaled" + sigma_scaled_pred, sigma_weighting = apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) + assert torch.all(sigma_scaled_pred == model_pred * (-sigmas) + noisy_model_input) + assert sigma_weighting is not None + + +def test_retrieve_timesteps(): + # Create a mock scheduler + scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000) + + # Test with num_inference_steps + timesteps, n_steps = retrieve_timesteps(scheduler, num_inference_steps=50) + assert len(timesteps) == 50 + assert n_steps == 50 + + # Test error handling with simultaneous timesteps and sigmas + with pytest.raises(ValueError): + retrieve_timesteps(scheduler, timesteps=[1, 2, 3], sigmas=[0.1, 0.2, 0.3]) + + +def test_get_noisy_model_input_and_timesteps(): + # Create a mock args and setup + class MockArgs: + timestep_sampling = "uniform" + weighting_scheme = "sigma_sqrt" + sigmoid_scale = 1.0 + discrete_flow_shift = 6.0 + + args = MockArgs() + scheduler = 
FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000) + device = torch.device('cpu') + + # Prepare mock latents and noise + latents = torch.randn(4, 16, 64, 64) + noise = torch.randn_like(latents) + + # Test uniform sampling + noisy_input, timesteps, sigmas = get_noisy_model_input_and_timesteps( + args, scheduler, latents, noise, device, torch.float32 + ) + + # Validate output shapes and types + assert noisy_input.shape == latents.shape + assert timesteps.shape[0] == latents.shape[0] + assert noisy_input.dtype == torch.float32 + assert timesteps.dtype == torch.float32 + + # Test different sampling methods + sampling_methods = ["sigmoid", "shift", "nextdit_shift"] + for method in sampling_methods: + args.timestep_sampling = method + noisy_input, timesteps, _ = get_noisy_model_input_and_timesteps( + args, scheduler, latents, noise, device, torch.float32 + ) + assert noisy_input.shape == latents.shape + assert timesteps.shape[0] == latents.shape[0] diff --git a/tests/library/test_lumina_util.py b/tests/library/test_lumina_util.py new file mode 100644 index 000000000..397bab5a9 --- /dev/null +++ b/tests/library/test_lumina_util.py @@ -0,0 +1,112 @@ +import torch +from torch.nn.modules import conv + +from library import lumina_util + + +def test_unpack_latents(): + # Create a test tensor + # Shape: [batch, height*width, channels*patch_height*patch_width] + x = torch.randn(2, 4, 16) # 2 batches, 4 tokens, 16 channels + packed_latent_height = 2 + packed_latent_width = 2 + + # Unpack the latents + unpacked = lumina_util.unpack_latents(x, packed_latent_height, packed_latent_width) + + # Check output shape + # Expected shape: [batch, channels, height*patch_height, width*patch_width] + assert unpacked.shape == (2, 4, 4, 4) + + +def test_pack_latents(): + # Create a test tensor + # Shape: [batch, channels, height*patch_height, width*patch_width] + x = torch.randn(2, 4, 4, 4) + + # Pack the latents + packed = lumina_util.pack_latents(x) + + # Check output shape + # Expected shape: [batch, height*width, channels*patch_height*patch_width] + assert packed.shape == (2, 4, 16) + + +def test_convert_diffusers_sd_to_alpha_vllm(): + num_double_blocks = 2 + # Predefined test cases based on the actual conversion map + test_cases = [ + # Static key conversions with possible list mappings + { + "original_keys": ["time_caption_embed.caption_embedder.0.weight"], + "original_pattern": ["time_caption_embed.caption_embedder.0.weight"], + "expected_converted_keys": ["cap_embedder.0.weight"], + }, + { + "original_keys": ["patch_embedder.proj.weight"], + "original_pattern": ["patch_embedder.proj.weight"], + "expected_converted_keys": ["x_embedder.weight"], + }, + { + "original_keys": ["transformer_blocks.0.norm1.weight"], + "original_pattern": ["transformer_blocks.().norm1.weight"], + "expected_converted_keys": ["layers.0.attention_norm1.weight"], + }, + ] + + + for test_case in test_cases: + for original_key, original_pattern, expected_converted_key in zip( + test_case["original_keys"], test_case["original_pattern"], test_case["expected_converted_keys"] + ): + # Create test state dict + test_sd = {original_key: torch.randn(10, 10)} + + # Convert the state dict + converted_sd = lumina_util.convert_diffusers_sd_to_alpha_vllm(test_sd, num_double_blocks) + + # Verify conversion (handle both string and list keys) + # Find the correct converted key + match_found = False + if expected_converted_key in converted_sd: + # Verify tensor preservation + assert torch.allclose(converted_sd[expected_converted_key], 
test_sd[original_key], atol=1e-6), ( + f"Tensor mismatch for {original_key}" + ) + match_found = True + break + + assert match_found, f"Failed to convert {original_key}" + + # Ensure original key is also present + assert original_key in converted_sd + + # Test with block-specific keys + block_specific_cases = [ + { + "original_pattern": "transformer_blocks.().norm1.weight", + "converted_pattern": "layers.().attention_norm1.weight", + } + ] + + for case in block_specific_cases: + for block_idx in range(2): # Test multiple block indices + # Prepare block-specific keys + block_original_key = case["original_pattern"].replace("()", str(block_idx)) + block_converted_key = case["converted_pattern"].replace("()", str(block_idx)) + print(block_original_key, block_converted_key) + + # Create test state dict + test_sd = {block_original_key: torch.randn(10, 10)} + + # Convert the state dict + converted_sd = lumina_util.convert_diffusers_sd_to_alpha_vllm(test_sd, num_double_blocks) + + # Verify conversion + # assert block_converted_key in converted_sd, f"Failed to convert block key {block_original_key}" + assert torch.allclose(converted_sd[block_converted_key], test_sd[block_original_key], atol=1e-6), ( + f"Tensor mismatch for block key {block_original_key}" + ) + + # Ensure original key is also present + assert block_original_key in converted_sd diff --git a/tests/library/test_strategy_lumina.py b/tests/library/test_strategy_lumina.py new file mode 100644 index 000000000..18e196bf9 --- /dev/null +++ b/tests/library/test_strategy_lumina.py @@ -0,0 +1,227 @@ +import os +import tempfile +import torch +import numpy as np +from unittest.mock import patch +from transformers import Gemma2Model + +from library.strategy_lumina import ( + LuminaTokenizeStrategy, + LuminaTextEncodingStrategy, + LuminaTextEncoderOutputsCachingStrategy, + LuminaLatentsCachingStrategy, +) + + +class SimpleMockGemma2Model: + """Lightweight mock that avoids initializing the actual Gemma2Model""" + + def __init__(self, hidden_size=2304): + self.device = torch.device("cpu") + self._hidden_size = hidden_size + self._orig_mod = self # For dynamic compilation compatibility + + def __call__(self, input_ids, attention_mask, output_hidden_states=False, return_dict=False): + # Create a mock output object with hidden states + batch_size, seq_len = input_ids.shape + hidden_size = self._hidden_size + + class MockOutput: + def __init__(self, hidden_states): + self.hidden_states = hidden_states + + mock_hidden_states = [ + torch.randn(batch_size, seq_len, hidden_size, device=input_ids.device) + for _ in range(3) # Mimic multiple layers of hidden states + ] + + return MockOutput(mock_hidden_states) + + +def test_lumina_tokenize_strategy(): + # Test default initialization + tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + assert tokenize_strategy.max_length == 256 + assert tokenize_strategy.tokenizer.padding_side == "right" + + # Test tokenization of a single string + text = "Hello" + tokens, attention_mask = tokenize_strategy.tokenize(text) + + assert tokens.ndim == 2 + assert attention_mask.ndim == 2 + assert tokens.shape == attention_mask.shape + assert tokens.shape[1] == 256 # max_length + + # Test tokenize_with_weights + tokens, attention_mask, weights = tokenize_strategy.tokenize_with_weights(text) + assert len(weights) == 1 + assert torch.all(weights[0] == 1) + + +def test_lumina_text_encoding_strategy(): + # Create strategies + tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + encoding_strategy = 
LuminaTextEncodingStrategy() + + # Create a mock model + mock_model = SimpleMockGemma2Model() + + # Patch the isinstance check to accept our simple mock + original_isinstance = isinstance + with patch("library.strategy_lumina.isinstance") as mock_isinstance: + + def custom_isinstance(obj, class_or_tuple): + if obj is mock_model and class_or_tuple is Gemma2Model: + return True + if hasattr(obj, "_orig_mod") and obj._orig_mod is mock_model and class_or_tuple is Gemma2Model: + return True + return original_isinstance(obj, class_or_tuple) + + mock_isinstance.side_effect = custom_isinstance + + # Prepare sample text + text = "Test encoding strategy" + tokens, attention_mask = tokenize_strategy.tokenize(text) + + # Perform encoding + hidden_states, input_ids, attention_masks = encoding_strategy.encode_tokens( + tokenize_strategy, [mock_model], (tokens, attention_mask) + ) + + # Validate outputs + assert original_isinstance(hidden_states, torch.Tensor) + assert original_isinstance(input_ids, torch.Tensor) + assert original_isinstance(attention_masks, torch.Tensor) + + # Check the shape of the second-to-last hidden state + assert hidden_states.ndim == 3 + + # Test weighted encoding (which falls back to standard encoding for Lumina) + weights = [torch.ones_like(tokens)] + hidden_states_w, input_ids_w, attention_masks_w = encoding_strategy.encode_tokens_with_weights( + tokenize_strategy, [mock_model], (tokens, attention_mask), weights + ) + + # For the mock, we can't guarantee identical outputs since each call returns random tensors + # Instead, check that the outputs have the same shape and are tensors + assert hidden_states_w.shape == hidden_states.shape + assert original_isinstance(hidden_states_w, torch.Tensor) + assert torch.allclose(input_ids, input_ids_w) # Input IDs should be the same + assert torch.allclose(attention_masks, attention_masks_w) # Attention masks should be the same + + +def test_lumina_text_encoder_outputs_caching_strategy(): + # Create a temporary directory for caching + with tempfile.TemporaryDirectory() as tmpdir: + # Create a cache file path + cache_file = os.path.join(tmpdir, "test_outputs.npz") + + # Create the caching strategy + caching_strategy = LuminaTextEncoderOutputsCachingStrategy( + cache_to_disk=True, + batch_size=1, + skip_disk_cache_validity_check=False, + ) + + # Create a mock class for ImageInfo + class MockImageInfo: + def __init__(self, caption, system_prompt, cache_path): + self.caption = caption + self.system_prompt = system_prompt + self.text_encoder_outputs_npz = cache_path + + # Create a sample input info + image_info = MockImageInfo("Test caption", "", cache_file) + + # Simulate a batch + batch = [image_info] + + # Create mock strategies and model + tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + encoding_strategy = LuminaTextEncodingStrategy() + mock_model = SimpleMockGemma2Model() + + # Patch the isinstance check to accept our simple mock + original_isinstance = isinstance + with patch("library.strategy_lumina.isinstance") as mock_isinstance: + + def custom_isinstance(obj, class_or_tuple): + if obj is mock_model and class_or_tuple is Gemma2Model: + return True + if hasattr(obj, "_orig_mod") and obj._orig_mod is mock_model and class_or_tuple is Gemma2Model: + return True + return original_isinstance(obj, class_or_tuple) + + mock_isinstance.side_effect = custom_isinstance + + # Call cache_batch_outputs + caching_strategy.cache_batch_outputs(tokenize_strategy, [mock_model], encoding_strategy, batch) + + # Verify the npz file was created 
+ assert os.path.exists(cache_file), f"Cache file not created at {cache_file}" + + # Verify the is_disk_cached_outputs_expected method + assert caching_strategy.is_disk_cached_outputs_expected(cache_file) + + # Test loading from npz + loaded_data = caching_strategy.load_outputs_npz(cache_file) + assert len(loaded_data) == 3 # hidden_state, input_ids, attention_mask + + +def test_lumina_latents_caching_strategy(): + # Create a temporary directory for caching + with tempfile.TemporaryDirectory() as tmpdir: + # Prepare a mock absolute path + abs_path = os.path.join(tmpdir, "test_image.png") + + # Use smaller image size for faster testing + image_size = (64, 64) + + # Create a smaller dummy image for testing + test_image = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8) + + # Create the caching strategy + caching_strategy = LuminaLatentsCachingStrategy(cache_to_disk=True, batch_size=1, skip_disk_cache_validity_check=False) + + # Create a simple mock VAE + class MockVAE: + def __init__(self): + self.device = torch.device("cpu") + self.dtype = torch.float32 + + def encode(self, x): + # Return smaller encoded tensor for faster processing + encoded = torch.randn(1, 4, 8, 8, device=x.device) + return type("EncodedLatents", (), {"to": lambda *args, **kwargs: encoded}) + + # Prepare a mock batch + class MockImageInfo: + def __init__(self, path, image): + self.absolute_path = path + self.image = image + self.image_path = path + self.bucket_reso = image_size + self.resized_size = image_size + self.resize_interpolation = "lanczos" + # Specify full path to the latents npz file + self.latents_npz = os.path.join(tmpdir, f"{os.path.splitext(os.path.basename(path))[0]}_0064x0064_lumina.npz") + + batch = [MockImageInfo(abs_path, test_image)] + + # Call cache_batch_latents + mock_vae = MockVAE() + caching_strategy.cache_batch_latents(mock_vae, batch, flip_aug=False, alpha_mask=False, random_crop=False) + + # Generate the expected npz path + npz_path = caching_strategy.get_latents_npz_path(abs_path, image_size) + + # Verify the file was created + assert os.path.exists(npz_path), f"NPZ file not created at {npz_path}" + + # Verify is_disk_cached_latents_expected + assert caching_strategy.is_disk_cached_latents_expected(image_size, npz_path, False, False) + + # Test loading from disk + loaded_data = caching_strategy.load_latents_from_disk(npz_path, image_size) + assert len(loaded_data) == 5 # Check for 5 expected elements diff --git a/tests/test_lumina_train_network.py b/tests/test_lumina_train_network.py new file mode 100644 index 000000000..353a742f4 --- /dev/null +++ b/tests/test_lumina_train_network.py @@ -0,0 +1,173 @@ +import pytest +import torch +from unittest.mock import MagicMock, patch +import argparse + +from library import lumina_models, lumina_util +from lumina_train_network import LuminaNetworkTrainer + + +@pytest.fixture +def lumina_trainer(): + return LuminaNetworkTrainer() + + +@pytest.fixture +def mock_args(): + args = MagicMock() + args.pretrained_model_name_or_path = "test_path" + args.disable_mmap_load_safetensors = False + args.use_flash_attn = False + args.use_sage_attn = False + args.fp8_base = False + args.blocks_to_swap = None + args.gemma2 = "test_gemma2_path" + args.ae = "test_ae_path" + args.cache_text_encoder_outputs = True + args.cache_text_encoder_outputs_to_disk = False + args.network_train_unet_only = False + return args + + +@pytest.fixture +def mock_accelerator(): + accelerator = MagicMock() + accelerator.device = torch.device("cpu") + accelerator.prepare.side_effect = 
lambda x, **kwargs: x + accelerator.unwrap_model.side_effect = lambda x: x + return accelerator + + +def test_assert_extra_args(lumina_trainer, mock_args): + train_dataset_group = MagicMock() + train_dataset_group.verify_bucket_reso_steps = MagicMock() + val_dataset_group = MagicMock() + val_dataset_group.verify_bucket_reso_steps = MagicMock() + + # Test with default settings + lumina_trainer.assert_extra_args(mock_args, train_dataset_group, val_dataset_group) + + # Verify verify_bucket_reso_steps was called for both groups + assert train_dataset_group.verify_bucket_reso_steps.call_count > 0 + assert val_dataset_group.verify_bucket_reso_steps.call_count > 0 + + # Check text encoder output caching + assert lumina_trainer.train_gemma2 is (not mock_args.network_train_unet_only) + assert mock_args.cache_text_encoder_outputs is True + + +def test_load_target_model(lumina_trainer, mock_args, mock_accelerator): + # Patch lumina_util methods + with ( + patch("library.lumina_util.load_lumina_model") as mock_load_lumina_model, + patch("library.lumina_util.load_gemma2") as mock_load_gemma2, + patch("library.lumina_util.load_ae") as mock_load_ae + ): + # Create mock models + mock_model = MagicMock(spec=lumina_models.NextDiT) + mock_model.dtype = torch.float32 + mock_gemma2 = MagicMock() + mock_ae = MagicMock() + + mock_load_lumina_model.return_value = mock_model + mock_load_gemma2.return_value = mock_gemma2 + mock_load_ae.return_value = mock_ae + + # Test load_target_model + version, gemma2_list, ae, model = lumina_trainer.load_target_model(mock_args, torch.float32, mock_accelerator) + + # Verify calls and return values + assert version == lumina_util.MODEL_VERSION_LUMINA_V2 + assert gemma2_list == [mock_gemma2] + assert ae == mock_ae + assert model == mock_model + + # Verify load calls + mock_load_lumina_model.assert_called_once() + mock_load_gemma2.assert_called_once() + mock_load_ae.assert_called_once() + + +def test_get_strategies(lumina_trainer, mock_args): + # Test tokenize strategy + tokenize_strategy = lumina_trainer.get_tokenize_strategy(mock_args) + assert tokenize_strategy.__class__.__name__ == "LuminaTokenizeStrategy" + + # Test latents caching strategy + latents_strategy = lumina_trainer.get_latents_caching_strategy(mock_args) + assert latents_strategy.__class__.__name__ == "LuminaLatentsCachingStrategy" + + # Test text encoding strategy + text_encoding_strategy = lumina_trainer.get_text_encoding_strategy(mock_args) + assert text_encoding_strategy.__class__.__name__ == "LuminaTextEncodingStrategy" + + +def test_text_encoder_output_caching_strategy(lumina_trainer, mock_args): + # Call assert_extra_args to set train_gemma2 + train_dataset_group = MagicMock() + train_dataset_group.verify_bucket_reso_steps = MagicMock() + val_dataset_group = MagicMock() + val_dataset_group.verify_bucket_reso_steps = MagicMock() + lumina_trainer.assert_extra_args(mock_args, train_dataset_group, val_dataset_group) + + # With text encoder caching enabled + mock_args.skip_cache_check = False + mock_args.text_encoder_batch_size = 16 + strategy = lumina_trainer.get_text_encoder_outputs_caching_strategy(mock_args) + + assert strategy.__class__.__name__ == "LuminaTextEncoderOutputsCachingStrategy" + assert strategy.cache_to_disk is False # based on mock_args + + # With text encoder caching disabled + mock_args.cache_text_encoder_outputs = False + strategy = lumina_trainer.get_text_encoder_outputs_caching_strategy(mock_args) + assert strategy is None + + +def test_noise_scheduler(lumina_trainer, mock_args): + device = 
torch.device("cpu") + noise_scheduler = lumina_trainer.get_noise_scheduler(mock_args, device) + + assert noise_scheduler.__class__.__name__ == "FlowMatchEulerDiscreteScheduler" + assert noise_scheduler.num_train_timesteps == 1000 + assert hasattr(lumina_trainer, "noise_scheduler_copy") + + +def test_sai_model_spec(lumina_trainer, mock_args): + with patch("library.train_util.get_sai_model_spec") as mock_get_spec: + mock_get_spec.return_value = "test_spec" + spec = lumina_trainer.get_sai_model_spec(mock_args) + assert spec == "test_spec" + mock_get_spec.assert_called_once_with(None, mock_args, False, True, False, lumina="lumina2") + + +def test_update_metadata(lumina_trainer, mock_args): + metadata = {} + lumina_trainer.update_metadata(metadata, mock_args) + + assert "ss_weighting_scheme" in metadata + assert "ss_logit_mean" in metadata + assert "ss_logit_std" in metadata + assert "ss_mode_scale" in metadata + assert "ss_timestep_sampling" in metadata + assert "ss_sigmoid_scale" in metadata + assert "ss_model_prediction_type" in metadata + assert "ss_discrete_flow_shift" in metadata + + +def test_is_text_encoder_not_needed_for_training(lumina_trainer, mock_args): + # Test with text encoder output caching, but not training text encoder + mock_args.cache_text_encoder_outputs = True + with patch.object(lumina_trainer, 'is_train_text_encoder', return_value=False): + result = lumina_trainer.is_text_encoder_not_needed_for_training(mock_args) + assert result is True + + # Test with text encoder output caching and training text encoder + with patch.object(lumina_trainer, 'is_train_text_encoder', return_value=True): + result = lumina_trainer.is_text_encoder_not_needed_for_training(mock_args) + assert result is False + + # Test with no text encoder output caching + mock_args.cache_text_encoder_outputs = False + result = lumina_trainer.is_text_encoder_not_needed_for_training(mock_args) + assert result is False \ No newline at end of file From 0e929f97b9dfc488a454d62a3e27696c167a3936 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 16 Jun 2025 16:50:18 -0400 Subject: [PATCH 464/582] Revert system_prompt for dataset config --- library/train_util.py | 74 +++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 48 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 68019e21b..1d80bcd85 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -192,7 +192,7 @@ def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, self.latents_flipped: Optional[torch.Tensor] = None self.latents_npz: Optional[str] = None # set in cache_latents self.latents_original_size: Optional[Tuple[int, int]] = None # original image size, not latents size - self.latents_crop_ltrb: Optional[Tuple[int, int, int, int]] = ( + self.latents_crop_ltrb: Optional[Tuple[int, int]] = ( None # crop left top right bottom in original pixel size, not latents size ) self.cond_img_path: Optional[str] = None @@ -209,8 +209,6 @@ def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, self.alpha_mask: Optional[torch.Tensor] = None # alpha mask can be flipped in runtime self.resize_interpolation: Optional[str] = None - self.system_prompt: Optional[str] = None - class BucketManager: def __init__(self, no_upscale, max_reso, min_size, max_size, reso_steps) -> None: @@ -434,7 +432,6 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, - system_prompt: 
Optional[str] = None, resize_interpolation: Optional[str] = None, ) -> None: self.image_dir = image_dir @@ -466,7 +463,6 @@ def __init__( self.validation_seed = validation_seed self.validation_split = validation_split - self.system_prompt = system_prompt self.resize_interpolation = resize_interpolation @@ -500,7 +496,6 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, - system_prompt: Optional[str] = None, resize_interpolation: Optional[str] = None, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -529,15 +524,14 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, - system_prompt=system_prompt, resize_interpolation=resize_interpolation, ) self.is_reg = is_reg self.class_tokens = class_tokens self.caption_extension = caption_extension - # if self.caption_extension and not self.caption_extension.startswith("."): - # self.caption_extension = "." + self.caption_extension + if self.caption_extension and not self.caption_extension.startswith("."): + self.caption_extension = "." + self.caption_extension self.cache_info = cache_info def __eq__(self, other) -> bool: @@ -573,7 +567,6 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, - system_prompt: Optional[str] = None, resize_interpolation: Optional[str] = None, ) -> None: assert metadata_file is not None, "metadata_file must be specified / metadata_fileは指定が必須です" @@ -602,7 +595,6 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, - system_prompt=system_prompt, resize_interpolation=resize_interpolation, ) @@ -642,7 +634,6 @@ def __init__( custom_attributes: Optional[Dict[str, Any]] = None, validation_seed: Optional[int] = None, validation_split: Optional[float] = 0.0, - system_prompt: Optional[str] = None, resize_interpolation: Optional[str] = None, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" @@ -671,7 +662,6 @@ def __init__( custom_attributes=custom_attributes, validation_seed=validation_seed, validation_split=validation_split, - system_prompt=system_prompt, resize_interpolation=resize_interpolation, ) @@ -1713,10 +1703,8 @@ def __getitem__(self, index): text_encoder_outputs_list.append(text_encoder_outputs) if tokenization_required: - system_prompt_special_token = "" - system_prompt = f"{subset.system_prompt} {system_prompt_special_token} " if subset.system_prompt else "" caption = self.process_caption(subset, image_info.caption) - input_ids = [ids[0] for ids in self.tokenize_strategy.tokenize(system_prompt + caption)] # remove batch dimension + input_ids = [ids[0] for ids in self.tokenize_strategy.tokenize(caption)] # remove batch dimension # if self.XTI_layers: # caption_layer = [] # for layer in self.XTI_layers: @@ -1886,8 +1874,7 @@ def __init__( debug_dataset: bool, validation_split: float, validation_seed: Optional[int], - system_prompt: Optional[str] = None, - resize_interpolation: Optional[str] = None, + resize_interpolation: Optional[str], ) -> None: super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) @@ -1900,7 +1887,6 @@ def __init__( self.is_training_dataset = is_training_dataset self.validation_seed = validation_seed self.validation_split = validation_split - self.system_prompt 
= system_prompt self.enable_bucket = enable_bucket if self.enable_bucket: @@ -1917,33 +1903,30 @@ def __init__( self.bucket_reso_steps = None # この情報は使われない self.bucket_no_upscale = False - def read_caption(img_path: str, caption_extension: str, enable_wildcard: bool): + def read_caption(img_path, caption_extension, enable_wildcard): # captionの候補ファイル名を作る base_name = os.path.splitext(img_path)[0] base_name_face_det = base_name tokens = base_name.split("_") if len(tokens) >= 5: base_name_face_det = "_".join(tokens[:-4]) - cap_paths = [(base_name, caption_extension), (base_name_face_det, caption_extension)] + cap_paths = [base_name + caption_extension, base_name_face_det + caption_extension] caption = None - for base, cap_extension in cap_paths: - # check with and without . to allow for extension flexibility (img_var.txt, img.txt, img + txt) - for cap_path in [base + cap_extension, base + "." + cap_extension]: - if os.path.isfile(cap_path): - with open(cap_path, "rt", encoding="utf-8") as f: - try: - lines = f.readlines() - except UnicodeDecodeError as e: - logger.error(f"illegal char in file (not UTF-8) / ファイルにUTF-8以外の文字があります: {cap_path}") - raise e - assert len(lines) > 0, f"caption file is empty / キャプションファイルが空です: {cap_path}" - if enable_wildcard: - caption = "\n".join([line.strip() for line in lines if line.strip() != ""]) # 空行を除く、改行で連結 - else: - caption = lines[0].strip() - break - break + for cap_path in cap_paths: + if os.path.isfile(cap_path): + with open(cap_path, "rt", encoding="utf-8") as f: + try: + lines = f.readlines() + except UnicodeDecodeError as e: + logger.error(f"illegal char in file (not UTF-8) / ファイルにUTF-8以外の文字があります: {cap_path}") + raise e + assert len(lines) > 0, f"caption file is empty / キャプションファイルが空です: {cap_path}" + if enable_wildcard: + caption = "\n".join([line.strip() for line in lines if line.strip() != ""]) # 空行を除く、改行で連結 + else: + caption = lines[0].strip() + break return caption def load_dreambooth_dir(subset: DreamBoothSubset): @@ -2090,7 +2073,6 @@ def load_dreambooth_dir(subset: DreamBoothSubset): num_train_images = 0 num_reg_images = 0 reg_infos: List[Tuple[ImageInfo, DreamBoothSubset]] = [] - for subset in subsets: num_repeats = subset.num_repeats if self.is_training_dataset else 1 if num_repeats < 1: @@ -2117,10 +2099,8 @@ def load_dreambooth_dir(subset: DreamBoothSubset): else: num_train_images += num_repeats * len(img_paths) - system_prompt_special_token = "" - system_prompt = f"{self.system_prompt or subset.system_prompt} {system_prompt_special_token} " if self.system_prompt or subset.system_prompt else "" for img_path, caption, size in zip(img_paths, captions, sizes): - info = ImageInfo(img_path, num_repeats, system_prompt + caption, subset.is_reg, img_path) + info = ImageInfo(img_path, num_repeats, caption, subset.is_reg, img_path) info.resize_interpolation = subset.resize_interpolation if subset.resize_interpolation is not None else self.resize_interpolation if size is not None: info.image_size = size @@ -2177,8 +2157,7 @@ def __init__( debug_dataset: bool, validation_seed: int, validation_split: float, - system_prompt: Optional[str] = None, - resize_interpolation: Optional[str] = None, + resize_interpolation: Optional[str], ) -> None: super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) @@ -2406,8 +2385,7 @@ def __init__( bucket_no_upscale: bool, debug_dataset: bool, validation_split: float, - validation_seed: Optional[int], - system_prompt: Optional[str] = None, + validation_seed: Optional[int], resize_interpolation: 
Optional[str] = None, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) @@ -2461,7 +2439,6 @@ def __init__( debug_dataset, validation_split, validation_seed, - system_prompt, resize_interpolation, ) @@ -3005,7 +2982,7 @@ def trim_and_resize_if_required( # for new_cache_latents def load_images_and_masks_for_caching( image_infos: List[ImageInfo], use_alpha_mask: bool, random_crop: bool -) -> Tuple[torch.Tensor, List[torch.Tensor], List[Tuple[int, int]], List[Tuple[int, int, int, int]]]: +) -> Tuple[torch.Tensor, List[np.ndarray], List[Tuple[int, int]], List[Tuple[int, int, int, int]]]: r""" requires image_infos to have: [absolute_path or image], bucket_reso, resized_size @@ -6241,6 +6218,7 @@ def line_to_prompt_dict(line: str) -> dict: prompt_dict["renorm_cfg"] = float(m.group(1)) continue + except ValueError as ex: logger.error(f"Exception in parsing / 解析エラー: {parg}") logger.error(ex) From 935e0037dc7d520f87e2d05dd0a306bfe26c60bc Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 29 Jun 2025 21:33:09 +0900 Subject: [PATCH 465/582] feat: update lumina system prompt handling --- .gitignore | 1 + library/config_util.py | 6 ------ library/strategy_lumina.py | 3 +-- lumina_train.py | 4 +++- lumina_train_network.py | 9 ++++----- tests/library/test_strategy_lumina.py | 5 ++--- 6 files changed, 11 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index e492b1add..4fcf07f6c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ venv build .vscode wandb +MagicMock \ No newline at end of file diff --git a/library/config_util.py b/library/config_util.py index ac726e4fc..53727f252 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -75,7 +75,6 @@ class BaseSubsetParams: custom_attributes: Optional[Dict[str, Any]] = None validation_seed: int = 0 validation_split: float = 0.0 - system_prompt: Optional[str] = None resize_interpolation: Optional[str] = None @@ -108,7 +107,6 @@ class BaseDatasetParams: debug_dataset: bool = False validation_seed: Optional[int] = None validation_split: float = 0.0 - system_prompt: Optional[str] = None resize_interpolation: Optional[str] = None @dataclass @@ -199,7 +197,6 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "caption_prefix": str, "caption_suffix": str, "custom_attributes": dict, - "system_prompt": str, "resize_interpolation": str, } # DO means DropOut @@ -246,7 +243,6 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "validation_split": float, "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int), "network_multiplier": float, - "system_prompt": str, "resize_interpolation": str, } @@ -534,7 +530,6 @@ def print_info(_datasets, dataset_type: str): resolution: {(dataset.width, dataset.height)} resize_interpolation: {dataset.resize_interpolation} enable_bucket: {dataset.enable_bucket} - system_prompt: {dataset.system_prompt} """) if dataset.enable_bucket: @@ -569,7 +564,6 @@ def print_info(_datasets, dataset_type: str): alpha_mask: {subset.alpha_mask} resize_interpolation: {subset.resize_interpolation} custom_attributes: {subset.custom_attributes} - system_prompt: {subset.system_prompt} """), " ") if is_dreambooth: diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index d9e93f538..3d86dbef4 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -218,8 +218,7 @@ def cache_batch_outputs( assert isinstance(text_encoding_strategy, 
LuminaTextEncodingStrategy) assert isinstance(tokenize_strategy, LuminaTokenizeStrategy) - system_prompt_special_token = "" - captions = [f"{info.system_prompt} {system_prompt_special_token} " if info.system_prompt else "" + info.caption for info in batch] + captions = [info.caption for info in batch] if self.is_weighted: tokens, attention_masks, weights_list = ( diff --git a/lumina_train.py b/lumina_train.py index 330d0093b..4b733c9e8 100644 --- a/lumina_train.py +++ b/lumina_train.py @@ -266,12 +266,14 @@ def train(args): strategy_base.TextEncodingStrategy.get_strategy() ) + system_prompt_special_token = "" + system_prompt = f"{args.system_prompt} {system_prompt_special_token} " if args.system_prompt else "" prompts = train_util.load_prompts(args.sample_prompts) sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): for prompt_dict in prompts: for p in [ - prompt_dict.get("prompt", ""), + system_prompt + prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", ""), ]: if p not in sample_prompts_te_outputs: diff --git a/lumina_train_network.py b/lumina_train_network.py index e1b45ac70..037ddac6b 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -58,7 +58,7 @@ def load_target_model(self, args, weight_dtype, accelerator): torch.device("cpu"), disable_mmap=args.disable_mmap_load_safetensors, use_flash_attn=args.use_flash_attn, - use_sage_attn=args.use_sage_attn + use_sage_attn=args.use_sage_attn, ) if args.fp8_base: @@ -75,7 +75,7 @@ def load_target_model(self, args, weight_dtype, accelerator): model.to(torch.float8_e4m3fn) if args.blocks_to_swap: - logger.info(f'Lumina 2: Enabling block swap: {args.blocks_to_swap}') + logger.info(f"Lumina 2: Enabling block swap: {args.blocks_to_swap}") model.enable_block_swap(args.blocks_to_swap, accelerator.device) self.is_swapping_blocks = True @@ -157,13 +157,13 @@ def cache_text_encoder_outputs_if_needed( assert isinstance(text_encoding_strategy, strategy_lumina.LuminaTextEncodingStrategy) system_prompt_special_token = "" - system_prompt = f"{args.system_prompt} {system_prompt_special_token} " if args.system_prompt else "" + system_prompt = f"{args.system_prompt} {system_prompt_special_token} " if args.system_prompt else "" sample_prompts = train_util.load_prompts(args.sample_prompts) sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): for prompt_dict in sample_prompts: prompts = [ - prompt_dict.get("prompt", ""), + system_prompt + prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", ""), ] for i, prompt in enumerate(prompts): @@ -371,7 +371,6 @@ def on_validation_step_end(self, args, accelerator, network, text_encoders, unet accelerator.unwrap_model(unet).prepare_block_swap_before_forward() - def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() train_util.add_dit_training_arguments(parser) diff --git a/tests/library/test_strategy_lumina.py b/tests/library/test_strategy_lumina.py index 18e196bf9..aca163478 100644 --- a/tests/library/test_strategy_lumina.py +++ b/tests/library/test_strategy_lumina.py @@ -126,13 +126,12 @@ def test_lumina_text_encoder_outputs_caching_strategy(): # Create a mock class for ImageInfo class MockImageInfo: - def __init__(self, caption, system_prompt, cache_path): + def __init__(self, caption, cache_path): self.caption = caption - self.system_prompt = system_prompt self.text_encoder_outputs_npz = cache_path # Create a sample 
input info - image_info = MockImageInfo("Test caption", "", cache_file) + image_info = MockImageInfo("Test caption", cache_file) # Simulate a batch batch = [image_info] From 884c1f37c4c16fa83ed14f46f6e209770fbed4d8 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 29 Jun 2025 21:58:43 +0900 Subject: [PATCH 466/582] fix: update to work with cache text encoder outputs (without disk) --- library/strategy_lumina.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index 3d86dbef4..392d6594f 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -264,8 +264,8 @@ def cache_batch_outputs( else: info.text_encoder_outputs = [ hidden_state_i, - attention_mask_i, input_ids_i, + attention_mask_i, ] From 5034c6f813a39c1db9c2b0a5f8140f6364ca984d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 29 Jun 2025 22:00:58 +0900 Subject: [PATCH 467/582] feat: add workaround for 'gated repo' error on github actions --- tests/library/test_strategy_lumina.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/library/test_strategy_lumina.py b/tests/library/test_strategy_lumina.py index aca163478..9bb0edf76 100644 --- a/tests/library/test_strategy_lumina.py +++ b/tests/library/test_strategy_lumina.py @@ -40,7 +40,12 @@ def __init__(self, hidden_states): def test_lumina_tokenize_strategy(): # Test default initialization - tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + try: + tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + except OSError as e: + # If the tokenizer is not found (due to gated repo), we can skip the test + print(f"Skipping LuminaTokenizeStrategy test due to OSError: {e}") + return assert tokenize_strategy.max_length == 256 assert tokenize_strategy.tokenizer.padding_side == "right" @@ -61,7 +66,12 @@ def test_lumina_tokenize_strategy(): def test_lumina_text_encoding_strategy(): # Create strategies - tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + try: + tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + except OSError as e: + # If the tokenizer is not found (due to gated repo), we can skip the test + print(f"Skipping LuminaTokenizeStrategy test due to OSError: {e}") + return encoding_strategy = LuminaTextEncodingStrategy() # Create a mock model @@ -137,7 +147,12 @@ def __init__(self, caption, cache_path): batch = [image_info] # Create mock strategies and model - tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + try: + tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + except OSError as e: + # If the tokenizer is not found (due to gated repo), we can skip the test + print(f"Skipping LuminaTokenizeStrategy test due to OSError: {e}") + return encoding_strategy = LuminaTextEncodingStrategy() mock_model = SimpleMockGemma2Model() From 078ee28a949b65d16ade97824d8273bd8bbd6598 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 29 Jun 2025 22:06:19 +0900 Subject: [PATCH 468/582] feat: add more workaround for 'gated repo' error on github actions --- tests/test_lumina_train_network.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/test_lumina_train_network.py b/tests/test_lumina_train_network.py index 353a742f4..2b8fe21d4 100644 --- a/tests/test_lumina_train_network.py +++ b/tests/test_lumina_train_network.py @@ -61,7 +61,7 @@ def test_load_target_model(lumina_trainer, mock_args, mock_accelerator): with ( patch("library.lumina_util.load_lumina_model") as 
mock_load_lumina_model, patch("library.lumina_util.load_gemma2") as mock_load_gemma2, - patch("library.lumina_util.load_ae") as mock_load_ae + patch("library.lumina_util.load_ae") as mock_load_ae, ): # Create mock models mock_model = MagicMock(spec=lumina_models.NextDiT) @@ -90,8 +90,12 @@ def test_load_target_model(lumina_trainer, mock_args, mock_accelerator): def test_get_strategies(lumina_trainer, mock_args): # Test tokenize strategy - tokenize_strategy = lumina_trainer.get_tokenize_strategy(mock_args) - assert tokenize_strategy.__class__.__name__ == "LuminaTokenizeStrategy" + try: + tokenize_strategy = lumina_trainer.get_tokenize_strategy(mock_args) + assert tokenize_strategy.__class__.__name__ == "LuminaTokenizeStrategy" + except OSError as e: + # If the tokenizer is not found (due to gated repo), we can skip the test + print(f"Skipping LuminaTokenizeStrategy test due to OSError: {e}") # Test latents caching strategy latents_strategy = lumina_trainer.get_latents_caching_strategy(mock_args) @@ -114,10 +118,10 @@ def test_text_encoder_output_caching_strategy(lumina_trainer, mock_args): mock_args.skip_cache_check = False mock_args.text_encoder_batch_size = 16 strategy = lumina_trainer.get_text_encoder_outputs_caching_strategy(mock_args) - + assert strategy.__class__.__name__ == "LuminaTextEncoderOutputsCachingStrategy" assert strategy.cache_to_disk is False # based on mock_args - + # With text encoder caching disabled mock_args.cache_text_encoder_outputs = False strategy = lumina_trainer.get_text_encoder_outputs_caching_strategy(mock_args) @@ -158,16 +162,16 @@ def test_update_metadata(lumina_trainer, mock_args): def test_is_text_encoder_not_needed_for_training(lumina_trainer, mock_args): # Test with text encoder output caching, but not training text encoder mock_args.cache_text_encoder_outputs = True - with patch.object(lumina_trainer, 'is_train_text_encoder', return_value=False): + with patch.object(lumina_trainer, "is_train_text_encoder", return_value=False): result = lumina_trainer.is_text_encoder_not_needed_for_training(mock_args) assert result is True # Test with text encoder output caching and training text encoder - with patch.object(lumina_trainer, 'is_train_text_encoder', return_value=True): + with patch.object(lumina_trainer, "is_train_text_encoder", return_value=True): result = lumina_trainer.is_text_encoder_not_needed_for_training(mock_args) assert result is False # Test with no text encoder output caching mock_args.cache_text_encoder_outputs = False result = lumina_trainer.is_text_encoder_not_needed_for_training(mock_args) - assert result is False \ No newline at end of file + assert result is False From 6731d8a57fb9a31c37dfaf926c5d70af0dc69b24 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 29 Jun 2025 22:21:48 +0900 Subject: [PATCH 469/582] fix: update system prompt handling --- library/strategy_lumina.py | 16 ++++++++++++++-- lumina_train.py | 14 ++++++-------- lumina_train_network.py | 11 +++-------- tests/library/test_strategy_lumina.py | 6 +++--- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/library/strategy_lumina.py b/library/strategy_lumina.py index 392d6594f..964d9f7a4 100644 --- a/library/strategy_lumina.py +++ b/library/strategy_lumina.py @@ -25,20 +25,26 @@ class LuminaTokenizeStrategy(TokenizeStrategy): def __init__( - self, max_length: Optional[int], tokenizer_cache_dir: Optional[str] = None + self, system_prompt:str, max_length: Optional[int], tokenizer_cache_dir: Optional[str] = None ) -> None: self.tokenizer: GemmaTokenizerFast = 
AutoTokenizer.from_pretrained( GEMMA_ID, cache_dir=tokenizer_cache_dir ) self.tokenizer.padding_side = "right" + if system_prompt is None: + system_prompt = "" + system_prompt_special_token = "" + system_prompt = f"{system_prompt} {system_prompt_special_token} " if system_prompt else "" + self.system_prompt = system_prompt + if max_length is None: self.max_length = 256 else: self.max_length = max_length def tokenize( - self, text: Union[str, List[str]] + self, text: Union[str, List[str]], is_negative: bool = False ) -> Tuple[torch.Tensor, torch.Tensor]: """ Args: @@ -49,6 +55,12 @@ def tokenize( token input ids, attention_masks """ text = [text] if isinstance(text, str) else text + + # In training, we always add system prompt (is_negative=False) + if not is_negative: + # Add system prompt to the beginning of each text + text = [self.system_prompt + t for t in text] + encodings = self.tokenizer( text, max_length=self.max_length, diff --git a/lumina_train.py b/lumina_train.py index 4b733c9e8..0a91f4a0a 100644 --- a/lumina_train.py +++ b/lumina_train.py @@ -166,7 +166,7 @@ def train(args): ) ) strategy_base.TokenizeStrategy.set_strategy( - strategy_lumina.LuminaTokenizeStrategy() + strategy_lumina.LuminaTokenizeStrategy(args.system_prompt) ) train_dataset_group.set_current_strategies() @@ -221,7 +221,7 @@ def train(args): gemma2_max_token_length = args.gemma2_max_token_length lumina_tokenize_strategy = strategy_lumina.LuminaTokenizeStrategy( - gemma2_max_token_length + args.system_prompt, gemma2_max_token_length ) strategy_base.TokenizeStrategy.set_strategy(lumina_tokenize_strategy) @@ -266,19 +266,17 @@ def train(args): strategy_base.TextEncodingStrategy.get_strategy() ) - system_prompt_special_token = "" - system_prompt = f"{args.system_prompt} {system_prompt_special_token} " if args.system_prompt else "" prompts = train_util.load_prompts(args.sample_prompts) sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): for prompt_dict in prompts: - for p in [ - system_prompt + prompt_dict.get("prompt", ""), + for i, p in enumerate([ + prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", ""), - ]: + ]): if p not in sample_prompts_te_outputs: logger.info(f"cache Text Encoder outputs for prompt: {p}") - tokens_and_masks = lumina_tokenize_strategy.tokenize(p) + tokens_and_masks = lumina_tokenize_strategy.tokenize(p, i == 1) # i == 1 means negative prompt sample_prompts_te_outputs[p] = ( text_encoding_strategy.encode_tokens( lumina_tokenize_strategy, diff --git a/lumina_train_network.py b/lumina_train_network.py index 037ddac6b..b08e31432 100644 --- a/lumina_train_network.py +++ b/lumina_train_network.py @@ -86,7 +86,7 @@ def load_target_model(self, args, weight_dtype, accelerator): return lumina_util.MODEL_VERSION_LUMINA_V2, [gemma2], ae, model def get_tokenize_strategy(self, args): - return strategy_lumina.LuminaTokenizeStrategy(args.gemma2_max_token_length, args.tokenizer_cache_dir) + return strategy_lumina.LuminaTokenizeStrategy(args.system_prompt, args.gemma2_max_token_length, args.tokenizer_cache_dir) def get_tokenizers(self, tokenize_strategy: strategy_lumina.LuminaTokenizeStrategy): return [tokenize_strategy.tokenizer] @@ -156,25 +156,20 @@ def cache_text_encoder_outputs_if_needed( assert isinstance(tokenize_strategy, strategy_lumina.LuminaTokenizeStrategy) assert isinstance(text_encoding_strategy, strategy_lumina.LuminaTextEncodingStrategy) - system_prompt_special_token = "" - system_prompt = f"{args.system_prompt} 
{system_prompt_special_token} " if args.system_prompt else "" sample_prompts = train_util.load_prompts(args.sample_prompts) sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs with accelerator.autocast(), torch.no_grad(): for prompt_dict in sample_prompts: prompts = [ - system_prompt + prompt_dict.get("prompt", ""), + prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", ""), ] for i, prompt in enumerate(prompts): - # Add system prompt only to positive prompt - if i == 0: - prompt = system_prompt + prompt if prompt in sample_prompts_te_outputs: continue logger.info(f"cache Text Encoder outputs for prompt: {prompt}") - tokens_and_masks = tokenize_strategy.tokenize(prompt) + tokens_and_masks = tokenize_strategy.tokenize(prompt, i == 1) # i == 1 means negative prompt sample_prompts_te_outputs[prompt] = text_encoding_strategy.encode_tokens( tokenize_strategy, text_encoders, diff --git a/tests/library/test_strategy_lumina.py b/tests/library/test_strategy_lumina.py index 9bb0edf76..d77d27383 100644 --- a/tests/library/test_strategy_lumina.py +++ b/tests/library/test_strategy_lumina.py @@ -41,7 +41,7 @@ def __init__(self, hidden_states): def test_lumina_tokenize_strategy(): # Test default initialization try: - tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + tokenize_strategy = LuminaTokenizeStrategy("dummy system prompt", max_length=None) except OSError as e: # If the tokenizer is not found (due to gated repo), we can skip the test print(f"Skipping LuminaTokenizeStrategy test due to OSError: {e}") @@ -67,7 +67,7 @@ def test_lumina_tokenize_strategy(): def test_lumina_text_encoding_strategy(): # Create strategies try: - tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + tokenize_strategy = LuminaTokenizeStrategy("dummy system prompt", max_length=None) except OSError as e: # If the tokenizer is not found (due to gated repo), we can skip the test print(f"Skipping LuminaTokenizeStrategy test due to OSError: {e}") @@ -148,7 +148,7 @@ def __init__(self, caption, cache_path): # Create mock strategies and model try: - tokenize_strategy = LuminaTokenizeStrategy(max_length=None) + tokenize_strategy = LuminaTokenizeStrategy("dummy system prompt", max_length=None) except OSError as e: # If the tokenizer is not found (due to gated repo), we can skip the test print(f"Skipping LuminaTokenizeStrategy test due to OSError: {e}") From 05f392fa27371291b26c0ca5b751a3b829cd52d2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 3 Jul 2025 21:47:15 +0900 Subject: [PATCH 470/582] feat: add minimum inference code for Lumina with image generation capabilities --- lumina_minimal_inference.py | 295 ++++++++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 lumina_minimal_inference.py diff --git a/lumina_minimal_inference.py b/lumina_minimal_inference.py new file mode 100644 index 000000000..ff7c21df7 --- /dev/null +++ b/lumina_minimal_inference.py @@ -0,0 +1,295 @@ +# Minimum Inference Code for Lumina +# Based on flux_minimal_inference.py + +import logging +import argparse +import math +import os +import random +import time +from typing import Optional + +import einops +import numpy as np +import torch +from accelerate import Accelerator +from PIL import Image +from safetensors.torch import load_file +from tqdm import tqdm +from transformers import Gemma2Model +from library.flux_models import AutoEncoder + +from library import ( + device_utils, + lumina_models, + lumina_train_util, + lumina_util, + sd3_train_utils, + strategy_lumina, +) 
+from library.device_utils import get_preferred_device, init_ipex +from library.utils import setup_logging, str_to_dtype + +init_ipex() +setup_logging() +logger = logging.getLogger(__name__) + + +def generate_image( + model: lumina_models.NextDiT, + gemma2: Gemma2Model, + ae: AutoEncoder, + prompt: str, + system_prompt: str, + seed: Optional[int], + image_width: int, + image_height: int, + steps: int, + guidance_scale: float, + negative_prompt: Optional[str], + args, + cfg_trunc_ratio: float = 0.25, + renorm_cfg: float = 1.0, +): + # + # 0. Prepare arguments + # + device = get_preferred_device() + if args.device: + device = torch.device(args.device) + + dtype = str_to_dtype(args.dtype) + ae_dtype = str_to_dtype(args.ae_dtype) + gemma2_dtype = str_to_dtype(args.gemma2_dtype) + + # + # 1. Prepare models + # + # model.to(device, dtype=dtype) + model.to(dtype) + model.eval() + + gemma2.to(device, dtype=gemma2_dtype) + gemma2.eval() + + ae.to(ae_dtype) + ae.eval() + + # + # 2. Encode prompts + # + logger.info("Encoding prompts...") + + tokenize_strategy = strategy_lumina.LuminaTokenizeStrategy(system_prompt, args.gemma2_max_token_length) + encoding_strategy = strategy_lumina.LuminaTextEncodingStrategy() + + tokens_and_masks = tokenize_strategy.tokenize(prompt) + with torch.no_grad(): + gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2], tokens_and_masks) + + tokens_and_masks = tokenize_strategy.tokenize(negative_prompt, is_negative=True) + with torch.no_grad(): + neg_gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2], tokens_and_masks) + + # Unpack Gemma2 outputs + prompt_hidden_states, _, prompt_attention_mask = gemma2_conds + uncond_hidden_states, _, uncond_attention_mask = neg_gemma2_conds + + if args.offload: + print("Offloading models to CPU to save VRAM...") + gemma2.to("cpu") + device_utils.clean_memory() + + model.to(device) + + # + # 3. Prepare latents + # + seed = seed if seed is not None else random.randint(0, 2**32 - 1) + logger.info(f"Seed: {seed}") + torch.manual_seed(seed) + + latent_height = image_height // 8 + latent_width = image_width // 8 + latent_channels = 16 + + latents = torch.randn( + (1, latent_channels, latent_height, latent_width), + device=device, + dtype=dtype, + generator=torch.Generator(device=device).manual_seed(seed), + ) + + # + # 4. Denoise + # + logger.info("Denoising...") + scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) + scheduler.set_timesteps(steps, device=device) + timesteps = scheduler.timesteps + + # # compare with lumina_train_util.retrieve_timesteps + # lumina_timestep = lumina_train_util.retrieve_timesteps(scheduler, num_inference_steps=steps) + # print(f"Using timesteps: {timesteps}") + # print(f"vs Lumina timesteps: {lumina_timestep}") # should be the same + + with torch.autocast(device_type=device.type, dtype=dtype), torch.no_grad(): + latents = lumina_train_util.denoise( + scheduler, + model, + latents.to(device), + prompt_hidden_states.to(device), + prompt_attention_mask.to(device), + uncond_hidden_states.to(device), + uncond_attention_mask.to(device), + timesteps, + guidance_scale, + cfg_trunc_ratio, + renorm_cfg, + ) + + if args.offload: + model.to("cpu") + device_utils.clean_memory() + ae.to(device) + + # + # 5. 
Decode latents + # + logger.info("Decoding image...") + latents = latents / ae.scale_factor + ae.shift_factor + with torch.no_grad(): + image = ae.decode(latents.to(ae_dtype)) + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + image = (image * 255).round().astype("uint8") + + # + # 6. Save image + # + pil_image = Image.fromarray(image[0]) + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime()) + seed_suffix = f"_{seed}" + output_path = os.path.join(output_dir, f"image_{ts_str}{seed_suffix}.png") + pil_image.save(output_path) + logger.info(f"Image saved to {output_path}") + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Lumina DiT model path / Lumina DiTモデルのパス", + ) + parser.add_argument( + "--gemma2_path", + type=str, + default=None, + required=True, + help="Gemma2 model path / Gemma2モデルのパス", + ) + parser.add_argument( + "--ae_path", + type=str, + default=None, + required=True, + help="Autoencoder model path / Autoencoderモデルのパス", + ) + parser.add_argument("--prompt", type=str, default="A beautiful sunset over the mountains", help="Prompt for image generation") + parser.add_argument("--negative_prompt", type=str, default="", help="Negative prompt for image generation, default is empty") + parser.add_argument("--output_dir", type=str, default="outputs", help="Output directory for generated images") + parser.add_argument("--seed", type=int, default=None, help="Random seed") + parser.add_argument("--steps", type=int, default=30, help="Number of inference steps") + parser.add_argument("--guidance_scale", type=float, default=4.0, help="Guidance scale for classifier-free guidance") + parser.add_argument("--image_width", type=int, default=1024, help="Image width") + parser.add_argument("--image_height", type=int, default=1024, help="Image height") + parser.add_argument("--dtype", type=str, default="bf16", help="Data type for model (bf16, fp16, float)") + parser.add_argument("--gemma2_dtype", type=str, default="bf16", help="Data type for Gemma2 (bf16, fp16, float)") + parser.add_argument("--ae_dtype", type=str, default="bf16", help="Data type for Autoencoder (bf16, fp16, float)") + parser.add_argument("--device", type=str, default=None, help="Device to use (e.g., 'cuda:0')") + parser.add_argument("--offload", action="store_true", help="Offload models to CPU to save VRAM") + parser.add_argument("--system_prompt", type=str, default="", help="System prompt for Gemma2 model") + parser.add_argument( + "--gemma2_max_token_length", + type=int, + default=256, + help="Max token length for Gemma2 tokenizer", + ) + parser.add_argument( + "--discrete_flow_shift", + type=float, + default=1.0, + help="Shift value for FlowMatchEulerDiscreteScheduler", + ) + parser.add_argument( + "--cfg_trunc_ratio", + type=float, + default=0.25, + help="TBD", + ) + parser.add_argument( + "--renorm_cfg", + type=float, + default=1.0, + help="TBD", + ) + parser.add_argument( + "--use_flash_attn", + action="store_true", + help="Use flash attention for Lumina model", + ) + parser.add_argument( + "--use_sage_attn", + action="store_true", + help="Use sage attention for Lumina model", + ) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + args = parser.parse_args() + + logger.info("Loading models...") + device = get_preferred_device() + if 
args.device: + device = torch.device(args.device) + + # Load Lumina DiT model + model = lumina_util.load_lumina_model( + args.pretrained_model_name_or_path, + dtype=None, # Load in fp32 and then convert + device="cpu", + use_flash_attn=args.use_flash_attn, + use_sage_attn=args.use_sage_attn, + ) + + # Load Gemma2 + gemma2 = lumina_util.load_gemma2(args.gemma2_path, dtype=None, device="cpu") + + # Load Autoencoder + ae = lumina_util.load_ae(args.ae_path, dtype=None, device="cpu") + + generate_image( + model, + gemma2, + ae, + args.prompt, + args.system_prompt, + args.seed, + args.image_width, + args.image_height, + args.steps, + args.guidance_scale, + args.negative_prompt, + args, + args.cfg_trunc_ratio, + args.renorm_cfg, + ) + + logger.info("Done.") From a87e9997861c58df7148705be12dae17114615de Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 7 Jul 2025 17:12:07 -0400 Subject: [PATCH 471/582] Change to 3 --- networks/lora_lumina.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/networks/lora_lumina.py b/networks/lora_lumina.py index 15c35f441..e4149b4ab 100644 --- a/networks/lora_lumina.py +++ b/networks/lora_lumina.py @@ -344,7 +344,7 @@ def create_network( if embedder_dims.startswith("[") and embedder_dims.endswith("]"): embedder_dims = embedder_dims[1:-1] embedder_dims = [int(d) for d in embedder_dims.split(",")] - assert len(embedder_dims) == 3, f"invalid embedder_dims: {embedder_dims}, must be 4 dimensions (x_embedder, t_embedder, cap_embedder)" + assert len(embedder_dims) == 3, f"invalid embedder_dims: {embedder_dims}, must be 3 dimensions (x_embedder, t_embedder, cap_embedder)" # rank/module dropout rank_dropout = kwargs.get("rank_dropout", None) From b4d11522939ce65aef46d835c00969a25bb485c5 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 9 Jul 2025 21:55:36 +0900 Subject: [PATCH 472/582] fix: sample generation with system prompt, without TE output caching --- library/lumina_train_util.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 14a79bb2e..45f22bc47 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -249,7 +249,7 @@ def sample_image_inference( accelerator: Accelerator, args: argparse.Namespace, nextdit: lumina_models.NextDiT, - gemma2_model: Gemma2Model, + gemma2_model: list[Gemma2Model], vae: AutoEncoder, save_dir: str, prompt_dicts: list[Dict[str, str]], @@ -266,7 +266,7 @@ def sample_image_inference( accelerator (Accelerator): Accelerator object args (argparse.Namespace): Arguments object nextdit (lumina_models.NextDiT): NextDiT model - gemma2_model (Gemma2Model): Gemma2 model + gemma2_model (list[Gemma2Model]): Gemma2 model vae (AutoEncoder): VAE model save_dir (str): Directory to save images prompt_dict (Dict[str, str]): Prompt dictionary @@ -330,12 +330,8 @@ def sample_image_inference( logger.info(f"renorm: {renorm_cfg}") # logger.info(f"sample_sampler: {sampler_name}") - system_prompt_special_token = "" - system_prompt = f"{args.system_prompt} {system_prompt_special_token} " if args.system_prompt else "" - # Apply system prompt to prompts - prompt = system_prompt + prompt - negative_prompt = negative_prompt + # No need to add system prompt here, as it has been handled in the tokenize_strategy # Get sample prompts from cache if sample_prompts_gemma2_outputs and prompt in sample_prompts_gemma2_outputs: @@ -355,12 +351,12 @@ def sample_image_inference( if gemma2_model is not None: tokens_and_masks = 
tokenize_strategy.tokenize(prompt) gemma2_conds = encoding_strategy.encode_tokens( - tokenize_strategy, [gemma2_model], tokens_and_masks + tokenize_strategy, gemma2_model, tokens_and_masks ) - tokens_and_masks = tokenize_strategy.tokenize(negative_prompt) + tokens_and_masks = tokenize_strategy.tokenize(negative_prompt, is_negative=True) neg_gemma2_conds = encoding_strategy.encode_tokens( - tokenize_strategy, [gemma2_model], tokens_and_masks + tokenize_strategy, gemma2_model, tokens_and_masks ) # Unpack Gemma2 outputs From 7fb0d30feba5f1112ad28099ac79b6109e98ec57 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 9 Jul 2025 23:28:55 +0900 Subject: [PATCH 473/582] feat: add LoRA support for lumina minimal inference --- lumina_minimal_inference.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/lumina_minimal_inference.py b/lumina_minimal_inference.py index ff7c21df7..ba305f6ff 100644 --- a/lumina_minimal_inference.py +++ b/lumina_minimal_inference.py @@ -27,6 +27,7 @@ sd3_train_utils, strategy_lumina, ) +import networks.lora_lumina as lora_lumina from library.device_utils import get_preferred_device, init_ipex from library.utils import setup_logging, str_to_dtype @@ -248,6 +249,14 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="Use sage attention for Lumina model", ) + parser.add_argument( + "--lora_weights", + type=str, + nargs="*", + default=[], + help="LoRA weights, each argument is a `path;multiplier` (semi-colon separated)", + ) + parser.add_argument("--merge_lora_weights", action="store_true", help="Merge LoRA weights to model") return parser @@ -275,6 +284,30 @@ def setup_parser() -> argparse.ArgumentParser: # Load Autoencoder ae = lumina_util.load_ae(args.ae_path, dtype=None, device="cpu") + # LoRA + lora_models = [] + for weights_file in args.lora_weights: + if ";" in weights_file: + weights_file, multiplier = weights_file.split(";") + multiplier = float(multiplier) + else: + multiplier = 1.0 + + weights_sd = load_file(weights_file) + lora_model, _ = lora_lumina.create_network_from_weights( + multiplier, None, ae, [gemma2], model, weights_sd, True + ) + + if args.merge_lora_weights: + lora_model.merge_to([gemma2], model, weights_sd) + else: + lora_model.apply_to([gemma2], model) + info = lora_model.load_state_dict(weights_sd, strict=True) + logger.info(f"Loaded LoRA weights from {weights_file}: {info}") + lora_model.eval() + + lora_models.append(lora_model) + generate_image( model, gemma2, From 3f9eab49467ba2d224d48464aac11cb07b85dbb1 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 9 Jul 2025 23:33:50 +0900 Subject: [PATCH 474/582] fix: update default values in lumina minimal inference as same as sample generation --- lumina_minimal_inference.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lumina_minimal_inference.py b/lumina_minimal_inference.py index ba305f6ff..4f9151792 100644 --- a/lumina_minimal_inference.py +++ b/lumina_minimal_inference.py @@ -205,8 +205,8 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument("--negative_prompt", type=str, default="", help="Negative prompt for image generation, default is empty") parser.add_argument("--output_dir", type=str, default="outputs", help="Output directory for generated images") parser.add_argument("--seed", type=int, default=None, help="Random seed") - parser.add_argument("--steps", type=int, default=30, help="Number of inference steps") - parser.add_argument("--guidance_scale", type=float, default=4.0, help="Guidance scale 
for classifier-free guidance") + parser.add_argument("--steps", type=int, default=36, help="Number of inference steps") + parser.add_argument("--guidance_scale", type=float, default=3.5, help="Guidance scale for classifier-free guidance") parser.add_argument("--image_width", type=int, default=1024, help="Image width") parser.add_argument("--image_height", type=int, default=1024, help="Image height") parser.add_argument("--dtype", type=str, default="bf16", help="Data type for model (bf16, fp16, float)") @@ -224,7 +224,7 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--discrete_flow_shift", type=float, - default=1.0, + default=6.0, help="Shift value for FlowMatchEulerDiscreteScheduler", ) parser.add_argument( From 7bd9a6b19ee3d44c298d9fa9e7b63176f16155ab Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Thu, 10 Jul 2025 19:16:05 +0900 Subject: [PATCH 475/582] Add prompt guidance files for Claude and Gemini, and update README for AI coding agents --- .ai/claude.prompt.md | 9 ++++ .ai/context/01-overview.md | 101 +++++++++++++++++++++++++++++++++++++ .ai/gemini.prompt.md | 9 ++++ .gitignore | 2 + README.md | 49 +++++++----------- 5 files changed, 139 insertions(+), 31 deletions(-) create mode 100644 .ai/claude.prompt.md create mode 100644 .ai/context/01-overview.md create mode 100644 .ai/gemini.prompt.md diff --git a/.ai/claude.prompt.md b/.ai/claude.prompt.md new file mode 100644 index 000000000..7f38f5752 --- /dev/null +++ b/.ai/claude.prompt.md @@ -0,0 +1,9 @@ +## About This File + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## 1. Project Context +Here is the essential context for our project. Please read and understand it thoroughly. + +### Project Overview +@./context/01-overview.md diff --git a/.ai/context/01-overview.md b/.ai/context/01-overview.md new file mode 100644 index 000000000..41133e983 --- /dev/null +++ b/.ai/context/01-overview.md @@ -0,0 +1,101 @@ +This file provides the overview and guidance for developers working with the codebase, including setup instructions, architecture details, and common commands. + +## Project Architecture + +### Core Training Framework +The codebase is built around a **strategy pattern architecture** that supports multiple diffusion model families: + +- **`library/strategy_base.py`**: Base classes for tokenization, text encoding, latent caching, and training strategies +- **`library/strategy_*.py`**: Model-specific implementations for SD, SDXL, SD3, FLUX, etc. +- **`library/train_util.py`**: Core training utilities shared across all model types +- **`library/config_util.py`**: Configuration management with TOML support + +### Model Support Structure +Each supported model family has a consistent structure: +- **Training script**: `{model}_train.py` (full fine-tuning), `{model}_train_network.py` (LoRA/network training) +- **Model utilities**: `library/{model}_models.py`, `library/{model}_train_utils.py`, `library/{model}_utils.py` +- **Networks**: `networks/lora_{model}.py`, `networks/oft_{model}.py` for adapter training + +### Supported Models +- **Stable Diffusion 1.x**: `train*.py`, `library/train_util.py`, `train_db.py` (for DreamBooth) +- **SDXL**: `sdxl_train*.py`, `library/sdxl_*` +- **SD3**: `sd3_train*.py`, `library/sd3_*` +- **FLUX.1**: `flux_train*.py`, `library/flux_*` + +### Key Components + +#### Memory Management +- **Block swapping**: CPU-GPU memory optimization via `--blocks_to_swap` parameter, works with custom offloading. 
Only available for models with transformer architectures like SD3 and FLUX.1. +- **Custom offloading**: `library/custom_offloading_utils.py` for advanced memory management +- **Gradient checkpointing**: Memory reduction during training + +#### Training Features +- **LoRA training**: Low-rank adaptation networks in `networks/lora*.py` +- **ControlNet training**: Conditional generation control +- **Textual Inversion**: Custom embedding training +- **Multi-resolution training**: Bucket-based aspect ratio handling +- **Validation loss**: Real-time training monitoring, only for LoRA training + +#### Configuration System +Dataset configuration uses TOML files with structured validation: +```toml +[datasets.sample_dataset] + resolution = 1024 + batch_size = 2 + + [[datasets.sample_dataset.subsets]] + image_dir = "path/to/images" + caption_extension = ".txt" +``` + +## Common Development Commands + +### Training Commands Pattern +All training scripts follow this general pattern: +```bash +accelerate launch --mixed_precision bf16 {script_name}.py \ + --pretrained_model_name_or_path model.safetensors \ + --dataset_config config.toml \ + --output_dir output \ + --output_name model_name \ + [model-specific options] +``` + +### Memory Optimization +For low VRAM environments, use block swapping: +```bash +# Add to any training command for memory reduction +--blocks_to_swap 10 # Swap 10 blocks to CPU (adjust number as needed) +``` + +### Utility Scripts +Located in `tools/` directory: +- `tools/merge_lora.py`: Merge LoRA weights into base models +- `tools/cache_latents.py`: Pre-cache VAE latents for faster training +- `tools/cache_text_encoder_outputs.py`: Pre-cache text encoder outputs + +## Development Notes + +### Strategy Pattern Implementation +When adding support for new models, implement the four core strategies: +1. `TokenizeStrategy`: Text tokenization handling +2. `TextEncodingStrategy`: Text encoder forward pass +3. `LatentsCachingStrategy`: VAE encoding/caching +4. `TextEncoderOutputsCachingStrategy`: Text encoder output caching + +### Testing Approach +- Unit tests focus on utility functions and model loading +- Integration tests validate training script syntax and basic execution +- Most tests use mocks to avoid requiring actual model files +- Add tests for new model support in `tests/test_{model}_*.py` + +### Configuration System +- Use `config_util.py` dataclasses for type-safe configuration +- Support both command-line arguments and TOML file configuration +- Validate configuration early in training scripts to prevent runtime errors + +### Memory Management +- Always consider VRAM limitations when implementing features +- Use gradient checkpointing for large models +- Implement block swapping for models with transformer architectures +- Cache intermediate results (latents, text embeddings) when possible \ No newline at end of file diff --git a/.ai/gemini.prompt.md b/.ai/gemini.prompt.md new file mode 100644 index 000000000..6047390bc --- /dev/null +++ b/.ai/gemini.prompt.md @@ -0,0 +1,9 @@ +## About This File + +This file provides guidance to Gemini CLI (https://github.com/google-gemini/gemini-cli) when working with code in this repository. + +## 1. Project Context +Here is the essential context for our project. Please read and understand it thoroughly. 
+ +### Project Overview +@./context/01-overview.md diff --git a/.gitignore b/.gitignore index e492b1add..b991f6db5 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ venv build .vscode wandb +CLAUDE.md +GEMINI.md \ No newline at end of file diff --git a/README.md b/README.md index 497969ab4..149f453b9 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,9 @@ If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed ### Recent Updates +Jul 10, 2025: +- [AI Coding Agents](#for-developers-using-ai-coding-agents) section is added to the README. This section provides instructions for developers using AI coding agents like Claude and Gemini to understand the project context and coding standards. + May 1, 2025: - The error when training FLUX.1 with mixed precision in flux_train.py with DeepSpeed enabled has been resolved. Thanks to sharlynxy for PR [#2060](https://github.com/kohya-ss/sd-scripts/pull/2060). Please refer to the PR for details. - If you enable DeepSpeed, please install DeepSpeed with `pip install deepspeed==0.16.7`. @@ -54,46 +57,30 @@ Jan 25, 2025: - It will be added to other scripts as well. - As a current limitation, validation loss is not supported when `--block_to_swap` is specified, or when schedule-free optimizer is used. -Dec 15, 2024: - -- RAdamScheduleFree optimizer is supported. PR [#1830](https://github.com/kohya-ss/sd-scripts/pull/1830) Thanks to nhamanasu! - - Update to `schedulefree==1.4` is required. Please update individually or with `pip install --use-pep517 --upgrade -r requirements.txt`. - - Available with `--optimizer_type=RAdamScheduleFree`. No need to specify warm up steps as well as learning rate scheduler. - -Dec 7, 2024: - -- The option to specify the model name during ControlNet training was different in each script. It has been unified. Please specify `--controlnet_model_name_or_path`. PR [#1821](https://github.com/kohya-ss/sd-scripts/pull/1821) Thanks to sdbds! - - -- Fixed an issue where the saved model would be corrupted (pos_embed would not be saved) when `--enable_scaled_pos_embed` was specified in `sd3_train.py`. +## For Developers Using AI Coding Agents -Dec 3, 2024: +This repository provides recommended instructions to help AI agents like Claude and Gemini understand our project context and coding standards. --`--blocks_to_swap` now works in FLUX.1 ControlNet training. Sample commands for 24GB VRAM and 16GB VRAM are added [here](#flux1-controlnet-training). +To use them, you need to opt-in by creating your own configuration file in the project root. -Dec 2, 2024: +**Quick Setup:** -- FLUX.1 ControlNet training is supported. PR [#1813](https://github.com/kohya-ss/sd-scripts/pull/1813). Thanks to minux302! See PR and [here](#flux1-controlnet-training) for details. - - Not fully tested. Feedback is welcome. - - 80GB VRAM is required for 1024x1024 resolution, and 48GB VRAM is required for 512x512 resolution. - - Currently, it only works in Linux environment (or Windows WSL2) because DeepSpeed is required. - - Multi-GPU training is not tested. +1. Create a `CLAUDE.md` and/or `GEMINI.md` file in the project root. +2. Add the following line to your `CLAUDE.md` to import the repository's recommended prompt: -Dec 1, 2024: + ```markdown + @./.ai/claude.prompt.md + ``` -- Pseudo Huber loss is now available for FLUX.1 and SD3.5 training. See PR [#1808](https://github.com/kohya-ss/sd-scripts/pull/1808) for details. Thanks to recris! - - Specify `--loss_type huber` or `--loss_type smooth_l1` to use it. 
`--huber_c` and `--huber_scale` are also available. + or for Gemini: -- [Prodigy + ScheduleFree](https://github.com/LoganBooker/prodigy-plus-schedule-free) is supported. See PR [#1811](https://github.com/kohya-ss/sd-scripts/pull/1811) for details. Thanks to rockerBOO! + ```markdown + @./.ai/gemini.prompt.md + ``` -Nov 14, 2024: +3. You can now add your own personal instructions below the import line (e.g., `Always respond in Japanese.`). -- Improved the implementation of block swap and made it available for both FLUX.1 and SD3 LoRA training. See [FLUX.1 LoRA training](#flux1-lora-training) etc. for how to use the new options. Training is possible with about 8-10GB of VRAM. -- During fine-tuning, the memory usage when specifying the same number of blocks has increased slightly, but the training speed when specifying block swap has been significantly improved. -- There may be bugs due to the significant changes. Feedback is welcome. +This approach ensures that you have full control over the instructions given to your agent while benefiting from the shared project context. Your `CLAUDE.md` and `GEMINI.md` are already listed in `.gitignore`, so it won't be committed to the repository. ## FLUX.1 training From 0b90555916acc4a44950d9ce1b47d645215e5b71 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Thu, 10 Jul 2025 19:34:31 +0900 Subject: [PATCH 476/582] feat: add .claude and .gemini to .gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b991f6db5..eb19977ea 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ build .vscode wandb CLAUDE.md -GEMINI.md \ No newline at end of file +GEMINI.md +.claude +.gemini \ No newline at end of file From d0b335d8cf543da68963103cbd7ae8d630d1eb3a Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Thu, 10 Jul 2025 20:15:45 +0900 Subject: [PATCH 477/582] feat: add LoRA training guide for Lumina Image 2.0 (WIP) --- docs/lumina_train_network.md | 311 +++++++++++++++++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 docs/lumina_train_network.md diff --git a/docs/lumina_train_network.md b/docs/lumina_train_network.md new file mode 100644 index 000000000..1c3794abc --- /dev/null +++ b/docs/lumina_train_network.md @@ -0,0 +1,311 @@ +Status: reviewed + +# LoRA Training Guide for Lumina Image 2.0 using `lumina_train_network.py` / `lumina_train_network.py` を用いたLumina Image 2.0モデルのLoRA学習ガイド + +This document explains how to train LoRA (Low-Rank Adaptation) models for Lumina Image 2.0 using `lumina_train_network.py` in the `sd-scripts` repository. + +## 1. Introduction / はじめに + +`lumina_train_network.py` trains additional networks such as LoRA for Lumina Image 2.0 models. Lumina Image 2.0 adopts a Next-DiT (Next-generation Diffusion Transformer) architecture, which differs from previous Stable Diffusion models. It uses a single text encoder (Gemma2) and a dedicated AutoEncoder (AE). + +This guide assumes you already understand the basics of LoRA training. For common usage and options, see the [train_network.py guide](train_network.md). Some parameters are similar to those in [`sd3_train_network.py`](sd3_train_network.md) and [`flux_train_network.py`](flux_train_network.md). + +**Prerequisites:** + +* The `sd-scripts` repository has been cloned and the Python environment is ready. +* A training dataset has been prepared. See the [Dataset Configuration Guide](link/to/dataset/config/doc). +* Lumina Image 2.0 model files for training are available. + +
+日本語 +ステータス:内容を一通り確認した + +`lumina_train_network.py`は、Lumina Image 2.0モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。Lumina Image 2.0は、Next-DiT (Next-generation Diffusion Transformer) と呼ばれる新しいアーキテクチャを採用しており、従来のStable Diffusionモデルとは構造が異なります。テキストエンコーダーとしてGemma2を単体で使用し、専用のAutoEncoder (AE) を使用します。 + +このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。また一部のパラメータは [`sd3_train_network.py`](sd3_train_network.md) や [`flux_train_network.py`](flux_train_network.md) と同様のものがあるため、そちらも参考にしてください。 + +**前提条件:** + +* `sd-scripts`リポジトリのクローンとPython環境のセットアップが完了していること。 +* 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](link/to/dataset/config/doc)を参照してください) +* 学習対象のLumina Image 2.0モデルファイルが準備できていること。 +
+ +## 2. Differences from `train_network.py` / `train_network.py` との違い + +`lumina_train_network.py` is based on `train_network.py` but modified for Lumina Image 2.0. Main differences are: + +* **Target models:** Lumina Image 2.0 models. +* **Model structure:** Uses Next-DiT (Transformer based) instead of U-Net and employs a single text encoder (Gemma2). The AutoEncoder (AE) is not compatible with SDXL/SD3/FLUX. +* **Arguments:** Options exist to specify the Lumina Image 2.0 model, Gemma2 text encoder and AE. With a single `.safetensors` file, these components are typically provided separately. +* **Incompatible arguments:** Stable Diffusion v1/v2 options such as `--v2`, `--v_parameterization` and `--clip_skip` are not used. +* **Lumina specific options:** Additional parameters for timestep sampling, model prediction type, discrete flow shift, and system prompt. + +
+日本語 +`lumina_train_network.py`は`train_network.py`をベースに、Lumina Image 2.0モデルに対応するための変更が加えられています。主な違いは以下の通りです。 + +* **対象モデル:** Lumina Image 2.0モデルを対象とします。 +* **モデル構造:** U-Netの代わりにNext-DiT (Transformerベース) を使用します。Text EncoderとしてGemma2を単体で使用し、専用のAutoEncoder (AE) を使用します。 +* **引数:** Lumina Image 2.0モデル、Gemma2 Text Encoder、AEを指定する引数があります。通常、これらのコンポーネントは個別に提供されます。 +* **一部引数の非互換性:** Stable Diffusion v1/v2向けの引数(例: `--v2`, `--v_parameterization`, `--clip_skip`)はLumina Image 2.0の学習では使用されません。 +* **Lumina特有の引数:** タイムステップのサンプリング、モデル予測タイプ、離散フローシフト、システムプロンプトに関する引数が追加されています。 +
+ +## 3. Preparation / 準備 + +The following files are required before starting training: + +1. **Training script:** `lumina_train_network.py` +2. **Lumina Image 2.0 model file:** `.safetensors` file for the base model. +3. **Gemma2 text encoder file:** `.safetensors` file for the text encoder. +4. **AutoEncoder (AE) file:** `.safetensors` file for the AE. +5. **Dataset definition file (.toml):** Dataset settings in TOML format. (See the [Dataset Configuration Guide](link/to/dataset/config/doc).) In this document we use `my_lumina_dataset_config.toml` as an example. + +
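For illustration, a minimal `my_lumina_dataset_config.toml` could look like the sketch below. The directory path, resolution, batch size and repeat count are placeholders to adjust for your own data; see the dataset configuration guide for the authoritative schema and the full list of options.

```toml
[general]
# options placed here act as defaults for all datasets and subsets
enable_bucket = true
caption_extension = ".txt"

[[datasets]]
resolution = 1024   # Lumina Image 2.0 is usually trained around 1024x1024
batch_size = 2

  [[datasets.subsets]]
  image_dir = "path/to/images"   # placeholder: folder with images and matching .txt captions
  num_repeats = 1
```

Multiple `[[datasets.subsets]]` entries can be listed to mix folders with different repeat counts.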
+日本語 +学習を開始する前に、以下のファイルが必要です。 + +1. **学習スクリプト:** `lumina_train_network.py` +2. **Lumina Image 2.0モデルファイル:** 学習のベースとなるLumina Image 2.0モデルの`.safetensors`ファイル。 +3. **Gemma2テキストエンコーダーファイル:** Gemma2テキストエンコーダーの`.safetensors`ファイル。 +4. **AutoEncoder (AE) ファイル:** AEの`.safetensors`ファイル。 +5. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。 + * 例として`my_lumina_dataset_config.toml`を使用します。 +
+ +## 4. Running the Training / 学習の実行 + +Execute `lumina_train_network.py` from the terminal to start training. The overall command-line format is the same as `train_network.py`, but Lumina Image 2.0 specific options must be supplied. + +Example command: + +```bash +accelerate launch --num_cpu_threads_per_process 1 lumina_train_network.py \ + --pretrained_model_name_or_path="lumina-image-2.safetensors" \ + --gemma2="gemma-2-2b.safetensors" \ + --ae="ae.safetensors" \ + --dataset_config="my_lumina_dataset_config.toml" \ + --output_dir="./output" \ + --output_name="my_lumina_lora" \ + --save_model_as=safetensors \ + --network_module=networks.lora_lumina \ + --network_dim=8 \ + --network_alpha=8 \ + --learning_rate=1e-4 \ + --optimizer_type="AdamW" \ + --lr_scheduler="constant" \ + --timestep_sampling="nextdit_shift" \ + --discrete_flow_shift=6.0 \ + --model_prediction_type="raw" \ + --guidance_scale=4.0 \ + --system_prompt="You are an assistant designed to generate high-quality images based on user prompts." \ + --use_flash_attn \ + --max_train_epochs=10 \ + --save_every_n_epochs=1 \ + --mixed_precision="bf16" \ + --gradient_checkpointing \ + --cache_latents \ + --cache_text_encoder_outputs +``` + +*(Write the command on one line or use `\` or `^` for line breaks.)* + +
+日本語 +学習は、ターミナルから`lumina_train_network.py`を実行することで開始します。基本的なコマンドラインの構造は`train_network.py`と同様ですが、Lumina Image 2.0特有の引数を指定する必要があります。 + +以下に、基本的なコマンドライン実行例を示します。 + +```bash +accelerate launch --num_cpu_threads_per_process 1 lumina_train_network.py \ + --pretrained_model_name_or_path="lumina-image-2.safetensors" \ + --gemma2="gemma-2-2b.safetensors" \ + --ae="ae.safetensors" \ + --dataset_config="my_lumina_dataset_config.toml" \ + --output_dir="./output" \ + --output_name="my_lumina_lora" \ + --save_model_as=safetensors \ + --network_module=networks.lora_lumina \ + --network_dim=8 \ + --network_alpha=8 \ + --learning_rate=1e-4 \ + --optimizer_type="AdamW" \ + --lr_scheduler="constant" \ + --timestep_sampling="nextdit_shift" \ + --discrete_flow_shift=6.0 \ + --model_prediction_type="raw" \ + --guidance_scale=4.0 \ + --system_prompt="You are an assistant designed to generate high-quality images based on user prompts." \ + --use_flash_attn \ + --max_train_epochs=10 \ + --save_every_n_epochs=1 \ + --mixed_precision="bf16" \ + --gradient_checkpointing \ + --cache_latents \ + --cache_text_encoder_outputs +``` + +※実際には1行で書くか、適切な改行文字(`\` または `^`)を使用してください。 +
+ +### 4.1. Explanation of Key Options / 主要なコマンドライン引数の解説 + +Besides the arguments explained in the [train_network.py guide](train_network.md), specify the following Lumina Image 2.0 options. For shared options (`--output_dir`, `--output_name`, etc.), see that guide. + +#### Model Options / モデル関連 + +* `--pretrained_model_name_or_path=""` **required** – Path to the Lumina Image 2.0 model. +* `--gemma2=""` **required** – Path to the Gemma2 text encoder `.safetensors` file. +* `--ae=""` **required** – Path to the AutoEncoder `.safetensors` file. + +#### Lumina Image 2.0 Training Parameters / Lumina Image 2.0 学習パラメータ + +* `--gemma2_max_token_length=` – Max token length for Gemma2. Default varies by model. +* `--timestep_sampling=` – Timestep sampling method. Options: `sigma`, `uniform`, `sigmoid`, `shift`, `nextdit_shift`. Default `sigma`. **Recommended: `nextdit_shift`** +* `--discrete_flow_shift=` – Discrete flow shift for the Euler Discrete Scheduler. Default `6.0`. +* `--model_prediction_type=` – Model prediction processing method. Options: `raw`, `additive`, `sigma_scaled`. Default `sigma_scaled`. **Recommended: `raw`** +* `--guidance_scale=` – Guidance scale for training. **Recommended: `4.0`** +* `--system_prompt=` – System prompt to prepend to all prompts. Recommended: `"You are an assistant designed to generate high-quality images based on user prompts."` or `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` +* `--use_flash_attn` – Use Flash Attention. Requires `pip install flash-attn`. +* `--use_sage_attn` – Use Sage Attention. +* `--sigmoid_scale=` – Scale factor for sigmoid timestep sampling. Default `1.0`. + +#### Memory and Speed / メモリ・速度関連 + +* `--blocks_to_swap=` **[experimental]** – Swap a number of Transformer blocks between CPU and GPU. More blocks reduce VRAM but slow training. Cannot be used with `--cpu_offload_checkpointing`. +* `--cache_text_encoder_outputs` – Cache Gemma2 outputs to reduce memory usage. +* `--cache_latents`, `--cache_latents_to_disk` – Cache AE outputs. +* `--fp8_base` – Use FP8 precision for the base model. + +#### Network Arguments / ネットワーク引数 + +For Lumina Image 2.0, you can specify different dimensions for various components: + +* `--network_args` can include: + * `"attn_dim=4"` – Attention dimension + * `"mlp_dim=4"` – MLP dimension + * `"mod_dim=4"` – Modulation dimension + * `"refiner_dim=4"` – Refiner blocks dimension + * `"embedder_dims=[4,4,4]"` – Embedder dimensions for x, t, and caption embedders + +#### Incompatible or Deprecated Options / 非互換・非推奨の引数 + +* `--v2`, `--v_parameterization`, `--clip_skip` – Options for Stable Diffusion v1/v2 that are not used for Lumina Image 2.0. + +
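As a concrete illustration of the network arguments listed above, the per-component dimensions are passed as `key=value` strings to `--network_args`; the numbers here are arbitrary examples rather than recommendations:

```bash
--network_module=networks.lora_lumina \
--network_dim=8 --network_alpha=8 \
--network_args "attn_dim=8" "mlp_dim=4" "mod_dim=4" "refiner_dim=8" "embedder_dims=[4,4,4]"
```

Judging from the dimension-selection code in `networks/lora_lumina.py`, components without an explicit entry fall back to `--network_dim`, and a value of `0` appears to skip LoRA for that component.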
+日本語 +[`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のLumina Image 2.0特有の引数を指定します。共通の引数については、上記ガイドを参照してください。 + +#### モデル関連 + +* `--pretrained_model_name_or_path=""` **[必須]** + * 学習のベースとなるLumina Image 2.0モデルの`.safetensors`ファイルのパスを指定します。 +* `--gemma2=""` **[必須]** + * Gemma2テキストエンコーダーの`.safetensors`ファイルのパスを指定します。 +* `--ae=""` **[必須]** + * AutoEncoderの`.safetensors`ファイルのパスを指定します。 + +#### Lumina Image 2.0 学習パラメータ + +* `--gemma2_max_token_length=` – Gemma2で使用するトークンの最大長を指定します。デフォルトはモデルによって異なります。 +* `--timestep_sampling=` – タイムステップのサンプリング方法を指定します。`sigma`, `uniform`, `sigmoid`, `shift`, `nextdit_shift`から選択します。デフォルトは`sigma`です。**推奨: `nextdit_shift`** +* `--discrete_flow_shift=` – Euler Discrete Schedulerの離散フローシフトを指定します。デフォルトは`6.0`です。 +* `--model_prediction_type=` – モデル予測の処理方法を指定します。`raw`, `additive`, `sigma_scaled`から選択します。デフォルトは`sigma_scaled`です。**推奨: `raw`** +* `--guidance_scale=` – 学習時のガイダンススケールを指定します。**推奨: `4.0`** +* `--system_prompt=` – 全てのプロンプトに前置するシステムプロンプトを指定します。推奨: `"You are an assistant designed to generate high-quality images based on user prompts."` または `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` +* `--use_flash_attn` – Flash Attentionを使用します。`pip install flash-attn`が必要です。 +* `--use_sage_attn` – Sage Attentionを使用します。 +* `--sigmoid_scale=` – sigmoidタイムステップサンプリングのスケール係数を指定します。デフォルトは`1.0`です。 + +#### メモリ・速度関連 + +* `--blocks_to_swap=` **[実験的機能]** – TransformerブロックをCPUとGPUでスワップしてVRAMを節約します。`--cpu_offload_checkpointing`とは併用できません。 +* `--cache_text_encoder_outputs` – Gemma2の出力をキャッシュしてメモリ使用量を削減します。 +* `--cache_latents`, `--cache_latents_to_disk` – AEの出力をキャッシュします。 +* `--fp8_base` – ベースモデルにFP8精度を使用します。 + +#### ネットワーク引数 + +Lumina Image 2.0では、各コンポーネントに対して異なる次元を指定できます: + +* `--network_args` には以下を含めることができます: + * `"attn_dim=4"` – アテンション次元 + * `"mlp_dim=4"` – MLP次元 + * `"mod_dim=4"` – モジュレーション次元 + * `"refiner_dim=4"` – リファイナーブロック次元 + * `"embedder_dims=[4,4,4]"` – x、t、キャプションエンベッダーのエンベッダー次元 + +#### 非互換・非推奨の引数 + +* `--v2`, `--v_parameterization`, `--clip_skip` – Stable Diffusion v1/v2向けの引数のため、Lumina Image 2.0学習では使用されません。 +
+ +### 4.2. Starting Training / 学習の開始 + +After setting the required arguments, run the command to begin training. The overall flow and how to check logs are the same as in the [train_network.py guide](train_network.md#32-starting-the-training--学習の開始). + +## 5. Using the Trained Model / 学習済みモデルの利用 + +When training finishes, a LoRA model file (e.g. `my_lumina_lora.safetensors`) is saved in the directory specified by `output_dir`. Use this file with inference environments that support Lumina Image 2.0, such as ComfyUI with appropriate nodes. + +## 6. Others / その他 + +`lumina_train_network.py` shares many features with `train_network.py`, such as sample image generation (`--sample_prompts`, etc.) and detailed optimizer settings. For these, see the [train_network.py guide](train_network.md#5-other-features--その他の機能) or run `python lumina_train_network.py --help`. + +### 6.1. Recommended Settings / 推奨設定 + +Based on the contributor's recommendations, here are the suggested settings for optimal training: + +**Model Files:** +* Lumina Image 2.0: `lumina-image-2.safetensors` ([full precision link](https://huggingface.co/rockerBOO/lumina-image-2/blob/main/lumina-image-2.safetensors)) or `lumina_2_model_bf16.safetensors` ([bf16 link](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/diffusion_models/lumina_2_model_bf16.safetensors)) +* Gemma2 2B (fp16): `gemma-2-2b.safetensors` ([link](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/text_encoders/gemma_2_2b_fp16.safetensors)) +* AutoEncoder: `ae.safetensors` ([link](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/vae/ae.safetensors)) (same as FLUX) + +**Key Parameters:** +* `--timestep_sampling="nextdit_shift"` +* `--discrete_flow_shift=6.0` +* `--model_prediction_type="raw"` +* `--guidance_scale=4.0` +* `--mixed_precision="bf16"` + +**System Prompts:** +* General purpose: `"You are an assistant designed to generate high-quality images based on user prompts."` +* High image-text alignment: `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` + +**Sample Prompts:** +Sample prompts can include CFG truncate (`-ct`) and Renorm CFG (`-rc`) parameters: +* `-ct 0.25 -rc 1.0` (default values) + +
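For illustration, one line of the file passed to `--sample_prompts` could combine the Lumina-specific switches above (using the `-ct` / `-rc` spellings exactly as listed) with the usual per-prompt sample options such as `--w`, `--h`, `--s`, `--d` and `--n`; the prompt text and the values below are placeholders:

```text
A cat holding a sign that says hello world --w 1024 --h 1024 --s 36 --d 42 -ct 0.25 -rc 1.0
```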
+日本語 +必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。 + +学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_lumina_lora.safetensors`)が保存されます。このファイルは、Lumina Image 2.0モデルに対応した推論環境(例: ComfyUI + 適切なノード)で使用できます。 + +`lumina_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python lumina_train_network.py --help`) を参照してください。 + +### 6.1. 推奨設定 + +コントリビューターの推奨に基づく、最適な学習のための推奨設定: + +**モデルファイル:** +* Lumina Image 2.0: `lumina-image-2.safetensors` ([full precisionリンク](https://huggingface.co/rockerBOO/lumina-image-2/blob/main/lumina-image-2.safetensors)) または `lumina_2_model_bf16.safetensors` ([bf16リンク](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/diffusion_models/lumina_2_model_bf16.safetensors)) +* Gemma2 2B (fp16): `gemma-2-2b.safetensors` ([リンク](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/text_encoders/gemma_2_2b_fp16.safetensors)) +* AutoEncoder: `ae.safetensors` ([リンク](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/vae/ae.safetensors)) (FLUXと同じ) + +**主要パラメータ:** +* `--timestep_sampling="nextdit_shift"` +* `--discrete_flow_shift=6.0` +* `--model_prediction_type="raw"` +* `--guidance_scale=4.0` +* `--mixed_precision="bf16"` + +**システムプロンプト:** +* 汎用目的: `"You are an assistant designed to generate high-quality images based on user prompts."` +* 高い画像-テキスト整合性: `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` + +**サンプルプロンプト:** +サンプルプロンプトには CFG truncate (`-ct`) と Renorm CFG (`-rc`) パラメータを含めることができます: +* `-ct 0.25 -rc 1.0` (デフォルト値) +
\ No newline at end of file From 8a72f56c9f65d24646b3db8a902a74b077e07106 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 11 Jul 2025 22:14:16 +0900 Subject: [PATCH 478/582] fix: clarify Flash Attention usage in lumina training guide --- docs/lumina_train_network.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/docs/lumina_train_network.md b/docs/lumina_train_network.md index 1c3794abc..2872f513c 100644 --- a/docs/lumina_train_network.md +++ b/docs/lumina_train_network.md @@ -18,7 +18,6 @@ This guide assumes you already understand the basics of LoRA training. For commo
日本語 -ステータス:内容を一通り確認した `lumina_train_network.py`は、Lumina Image 2.0モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。Lumina Image 2.0は、Next-DiT (Next-generation Diffusion Transformer) と呼ばれる新しいアーキテクチャを採用しており、従来のStable Diffusionモデルとは構造が異なります。テキストエンコーダーとしてGemma2を単体で使用し、専用のAutoEncoder (AE) を使用します。 @@ -100,7 +99,6 @@ accelerate launch --num_cpu_threads_per_process 1 lumina_train_network.py \ --model_prediction_type="raw" \ --guidance_scale=4.0 \ --system_prompt="You are an assistant designed to generate high-quality images based on user prompts." \ - --use_flash_attn \ --max_train_epochs=10 \ --save_every_n_epochs=1 \ --mixed_precision="bf16" \ @@ -137,7 +135,6 @@ accelerate launch --num_cpu_threads_per_process 1 lumina_train_network.py \ --model_prediction_type="raw" \ --guidance_scale=4.0 \ --system_prompt="You are an assistant designed to generate high-quality images based on user prompts." \ - --use_flash_attn \ --max_train_epochs=10 \ --save_every_n_epochs=1 \ --mixed_precision="bf16" \ @@ -167,8 +164,7 @@ Besides the arguments explained in the [train_network.py guide](train_network.md * `--model_prediction_type=` – Model prediction processing method. Options: `raw`, `additive`, `sigma_scaled`. Default `sigma_scaled`. **Recommended: `raw`** * `--guidance_scale=` – Guidance scale for training. **Recommended: `4.0`** * `--system_prompt=` – System prompt to prepend to all prompts. Recommended: `"You are an assistant designed to generate high-quality images based on user prompts."` or `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` -* `--use_flash_attn` – Use Flash Attention. Requires `pip install flash-attn`. -* `--use_sage_attn` – Use Sage Attention. +* `--use_flash_attn` – Use Flash Attention. Requires `pip install flash-attn` (may not be supported in all environments). If installed correctly, it speeds up training. * `--sigmoid_scale=` – Scale factor for sigmoid timestep sampling. Default `1.0`. 
#### Memory and Speed / メモリ・速度関連 @@ -214,8 +210,7 @@ For Lumina Image 2.0, you can specify different dimensions for various component * `--model_prediction_type=` – モデル予測の処理方法を指定します。`raw`, `additive`, `sigma_scaled`から選択します。デフォルトは`sigma_scaled`です。**推奨: `raw`** * `--guidance_scale=` – 学習時のガイダンススケールを指定します。**推奨: `4.0`** * `--system_prompt=` – 全てのプロンプトに前置するシステムプロンプトを指定します。推奨: `"You are an assistant designed to generate high-quality images based on user prompts."` または `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` -* `--use_flash_attn` – Flash Attentionを使用します。`pip install flash-attn`が必要です。 -* `--use_sage_attn` – Sage Attentionを使用します。 +* `--use_flash_attn` – Flash Attentionを使用します。`pip install flash-attn`でインストールが必要です(環境によってはサポートされていません)。正しくインストールされている場合は、指定すると学習が高速化されます。 * `--sigmoid_scale=` – sigmoidタイムステップサンプリングのスケール係数を指定します。デフォルトは`1.0`です。 #### メモリ・速度関連 From 1a9bf2ab56ef488e7cf1789cf7689977fdeece5d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 13 Jul 2025 20:45:09 +0900 Subject: [PATCH 479/582] feat: add interactive mode for generating multiple images --- lumina_minimal_inference.py | 125 ++++++++++++++++++++++++++++++------ 1 file changed, 106 insertions(+), 19 deletions(-) diff --git a/lumina_minimal_inference.py b/lumina_minimal_inference.py index 4f9151792..31362c00d 100644 --- a/lumina_minimal_inference.py +++ b/lumina_minimal_inference.py @@ -257,6 +257,11 @@ def setup_parser() -> argparse.ArgumentParser: help="LoRA weights, each argument is a `path;multiplier` (semi-colon separated)", ) parser.add_argument("--merge_lora_weights", action="store_true", help="Merge LoRA weights to model") + parser.add_argument( + "--interactive", + action="store_true", + help="Enable interactive mode for generating multiple images / 対話モードで複数の画像を生成する", + ) return parser @@ -294,9 +299,7 @@ def setup_parser() -> argparse.ArgumentParser: multiplier = 1.0 weights_sd = load_file(weights_file) - lora_model, _ = lora_lumina.create_network_from_weights( - multiplier, None, ae, [gemma2], model, weights_sd, True - ) + lora_model, _ = lora_lumina.create_network_from_weights(multiplier, None, ae, [gemma2], model, weights_sd, True) if args.merge_lora_weights: lora_model.merge_to([gemma2], model, weights_sd) @@ -304,25 +307,109 @@ def setup_parser() -> argparse.ArgumentParser: lora_model.apply_to([gemma2], model) info = lora_model.load_state_dict(weights_sd, strict=True) logger.info(f"Loaded LoRA weights from {weights_file}: {info}") + lora_model.to(device) + lora_model.set_multiplier(multiplier) lora_model.eval() lora_models.append(lora_model) - generate_image( - model, - gemma2, - ae, - args.prompt, - args.system_prompt, - args.seed, - args.image_width, - args.image_height, - args.steps, - args.guidance_scale, - args.negative_prompt, - args, - args.cfg_trunc_ratio, - args.renorm_cfg, - ) + if not args.interactive: + generate_image( + model, + gemma2, + ae, + args.prompt, + args.system_prompt, + args.seed, + args.image_width, + args.image_height, + args.steps, + args.guidance_scale, + args.negative_prompt, + args, + args.cfg_trunc_ratio, + args.renorm_cfg, + ) + else: + # Interactive mode loop + image_width = args.image_width + image_height = args.image_height + steps = args.steps + guidance_scale = args.guidance_scale + cfg_trunc_ratio = args.cfg_trunc_ratio + renorm_cfg = args.renorm_cfg + + print("Entering interactive mode.") + while True: + print( + "\nEnter prompt (or 'exit'). 
Options: --w --h --s --d --g --n --ctr --rcfg --m " + ) + user_input = input() + if user_input.lower() == "exit": + break + if not user_input: + continue + + # Parse options + options = user_input.split("--") + prompt = options[0].strip() + + # Set defaults for each generation + seed = None # New random seed each time unless specified + negative_prompt = args.negative_prompt # Reset to default + + for opt in options[1:]: + try: + opt = opt.strip() + if not opt: + continue + + key, value = (opt.split(None, 1) + [""])[:2] + + if key == "w": + image_width = int(value) + elif key == "h": + image_height = int(value) + elif key == "s": + steps = int(value) + elif key == "d": + seed = int(value) + elif key == "g": + guidance_scale = float(value) + elif key == "n": + negative_prompt = value if value != "-" else "" + elif key == "ctr": + cfg_trunc_ratio = float(value) + elif key == "rcfg": + renorm_cfg = float(value) + elif key == "m": + multipliers = value.split(",") + if len(multipliers) != len(lora_models): + logger.error(f"Invalid number of multipliers, expected {len(lora_models)}") + continue + for i, lora_model in enumerate(lora_models): + lora_model.set_multiplier(float(multipliers[i].strip())) + else: + logger.warning(f"Unknown option: --{key}") + + except (ValueError, IndexError) as e: + logger.error(f"Invalid value for option --{key}: '{value}'. Error: {e}") + + generate_image( + model, + gemma2, + ae, + prompt, + args.system_prompt, + seed, + image_width, + image_height, + steps, + guidance_scale, + negative_prompt, + args, + cfg_trunc_ratio, + renorm_cfg, + ) logger.info("Done.") From 88dc3213a90fffce3586e2f87fa74cb106488f5a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 13 Jul 2025 20:46:24 +0900 Subject: [PATCH 480/582] fix: support LoRA w/o TE for create_network_from_weights --- networks/lora_lumina.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/networks/lora_lumina.py b/networks/lora_lumina.py index e4149b4ab..0929e8390 100644 --- a/networks/lora_lumina.py +++ b/networks/lora_lumina.py @@ -562,23 +562,26 @@ def create_modules( # Set dim/alpha to modules dim/alpha if modules_dim is not None and modules_alpha is not None: - # モジュール指定あり + # network from weights if lora_name in modules_dim: dim = modules_dim[lora_name] alpha = modules_alpha[lora_name] - - # Set dims to type_dims - if is_lumina and type_dims is not None: - identifier = [ - ("attention",), # attention layers - ("mlp",), # MLP layers - ("modulation",), # modulation layers - ("refiner",), # refiner blocks - ] - for i, d in enumerate(type_dims): - if d is not None and all([id in lora_name for id in identifier[i]]): - dim = d # may be 0 for skip - break + else: + dim = 0 # skip if not found + + else: + # Set dims to type_dims + if is_lumina and type_dims is not None: + identifier = [ + ("attention",), # attention layers + ("mlp",), # MLP layers + ("modulation",), # modulation layers + ("refiner",), # refiner blocks + ] + for i, d in enumerate(type_dims): + if d is not None and all([id in lora_name for id in identifier[i]]): + dim = d # may be 0 for skip + break # Drop blocks if we are only training some blocks if ( From 88960e63094bcb96fae318c526867fe409fade18 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 13 Jul 2025 20:49:38 +0900 Subject: [PATCH 481/582] doc: update lumina LoRA training guide --- docs/lumina_train_network.md | 42 ++++++++++++++++-------------------- library/lumina_train_util.py | 4 ++-- 2 files changed, 20 insertions(+), 26 deletions(-) diff 
--git a/docs/lumina_train_network.md b/docs/lumina_train_network.md index 2872f513c..e811f68b2 100644 --- a/docs/lumina_train_network.md +++ b/docs/lumina_train_network.md @@ -8,12 +8,12 @@ This document explains how to train LoRA (Low-Rank Adaptation) models for Lumina `lumina_train_network.py` trains additional networks such as LoRA for Lumina Image 2.0 models. Lumina Image 2.0 adopts a Next-DiT (Next-generation Diffusion Transformer) architecture, which differs from previous Stable Diffusion models. It uses a single text encoder (Gemma2) and a dedicated AutoEncoder (AE). -This guide assumes you already understand the basics of LoRA training. For common usage and options, see the [train_network.py guide](train_network.md). Some parameters are similar to those in [`sd3_train_network.py`](sd3_train_network.md) and [`flux_train_network.py`](flux_train_network.md). +This guide assumes you already understand the basics of LoRA training. For common usage and options, see the train_network.py guide (to be documented). Some parameters are similar to those in [`sd3_train_network.py`](sd3_train_network.md) and [`flux_train_network.py`](flux_train_network.md). **Prerequisites:** * The `sd-scripts` repository has been cloned and the Python environment is ready. -* A training dataset has been prepared. See the [Dataset Configuration Guide](link/to/dataset/config/doc). +* A training dataset has been prepared. See the [Dataset Configuration Guide](./config_README-en.md). * Lumina Image 2.0 model files for training are available.
@@ -21,12 +21,12 @@ This guide assumes you already understand the basics of LoRA training. For commo `lumina_train_network.py`は、Lumina Image 2.0モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。Lumina Image 2.0は、Next-DiT (Next-generation Diffusion Transformer) と呼ばれる新しいアーキテクチャを採用しており、従来のStable Diffusionモデルとは構造が異なります。テキストエンコーダーとしてGemma2を単体で使用し、専用のAutoEncoder (AE) を使用します。 -このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。また一部のパラメータは [`sd3_train_network.py`](sd3_train_network.md) や [`flux_train_network.py`](flux_train_network.md) と同様のものがあるため、そちらも参考にしてください。 +このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象としています。基本的な使い方や共通のオプションについては、`train_network.py`のガイド(作成中)を参照してください。また一部のパラメータは [`sd3_train_network.py`](sd3_train_network.md) や [`flux_train_network.py`](flux_train_network.md) と同様のものがあるため、そちらも参考にしてください。 **前提条件:** * `sd-scripts`リポジトリのクローンとPython環境のセットアップが完了していること。 -* 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](link/to/dataset/config/doc)を参照してください) +* 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](./config_README-en.md)を参照してください) * 学習対象のLumina Image 2.0モデルファイルが準備できていること。
@@ -59,7 +59,14 @@ The following files are required before starting training:
 2. **Lumina Image 2.0 model file:** `.safetensors` file for the base model.
 3. **Gemma2 text encoder file:** `.safetensors` file for the text encoder.
 4. **AutoEncoder (AE) file:** `.safetensors` file for the AE.
-5. **Dataset definition file (.toml):** Dataset settings in TOML format. (See the [Dataset Configuration Guide](link/to/dataset/config/doc).) In this document we use `my_lumina_dataset_config.toml` as an example.
+5. **Dataset definition file (.toml):** Dataset settings in TOML format. (See the [Dataset Configuration Guide](./config_README-en.md).) In this document we use `my_lumina_dataset_config.toml` as an example.
+
+
+**Model Files:**
+* Lumina Image 2.0: `lumina-image-2.safetensors` ([full precision link](https://huggingface.co/rockerBOO/lumina-image-2/blob/main/lumina-image-2.safetensors)) or `lumina_2_model_bf16.safetensors` ([bf16 link](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/diffusion_models/lumina_2_model_bf16.safetensors))
+* Gemma2 2B (fp16): `gemma-2-2b.safetensors` ([link](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/text_encoders/gemma_2_2b_fp16.safetensors))
+* AutoEncoder: `ae.safetensors` ([link](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/vae/ae.safetensors)) (same as FLUX)
+
日本語 @@ -69,8 +76,11 @@ The following files are required before starting training: 2. **Lumina Image 2.0モデルファイル:** 学習のベースとなるLumina Image 2.0モデルの`.safetensors`ファイル。 3. **Gemma2テキストエンコーダーファイル:** Gemma2テキストエンコーダーの`.safetensors`ファイル。 4. **AutoEncoder (AE) ファイル:** AEの`.safetensors`ファイル。 -5. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。 +5. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](./config_README-en.md)を参照してください)。 * 例として`my_lumina_dataset_config.toml`を使用します。 + +**モデルファイル** は英語ドキュメントの通りです。 +
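The hunks above add direct Hugging Face links for the Lumina DiT, Gemma2, and AE weights. As a rough sketch only, not part of this patch, and assuming `huggingface_hub` is installed, the same files could be fetched programmatically; the repository ID and file names are copied from those links:

```python
from huggingface_hub import hf_hub_download

# Repo ID and file names below are taken from the documentation links above.
repo = "Comfy-Org/Lumina_Image_2.0_Repackaged"
dit_path = hf_hub_download(repo, "split_files/diffusion_models/lumina_2_model_bf16.safetensors")
gemma2_path = hf_hub_download(repo, "split_files/text_encoders/gemma_2_2b_fp16.safetensors")
ae_path = hf_hub_download(repo, "split_files/vae/ae.safetensors")
print(dit_path, gemma2_path, ae_path)
```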
## 4. Running the Training / 学習の実行 @@ -97,7 +107,6 @@ accelerate launch --num_cpu_threads_per_process 1 lumina_train_network.py \ --timestep_sampling="nextdit_shift" \ --discrete_flow_shift=6.0 \ --model_prediction_type="raw" \ - --guidance_scale=4.0 \ --system_prompt="You are an assistant designed to generate high-quality images based on user prompts." \ --max_train_epochs=10 \ --save_every_n_epochs=1 \ @@ -133,7 +142,6 @@ accelerate launch --num_cpu_threads_per_process 1 lumina_train_network.py \ --timestep_sampling="nextdit_shift" \ --discrete_flow_shift=6.0 \ --model_prediction_type="raw" \ - --guidance_scale=4.0 \ --system_prompt="You are an assistant designed to generate high-quality images based on user prompts." \ --max_train_epochs=10 \ --save_every_n_epochs=1 \ @@ -158,11 +166,10 @@ Besides the arguments explained in the [train_network.py guide](train_network.md #### Lumina Image 2.0 Training Parameters / Lumina Image 2.0 学習パラメータ -* `--gemma2_max_token_length=` – Max token length for Gemma2. Default varies by model. +* `--gemma2_max_token_length=` – Max token length for Gemma2. Default is 256. * `--timestep_sampling=` – Timestep sampling method. Options: `sigma`, `uniform`, `sigmoid`, `shift`, `nextdit_shift`. Default `sigma`. **Recommended: `nextdit_shift`** * `--discrete_flow_shift=` – Discrete flow shift for the Euler Discrete Scheduler. Default `6.0`. * `--model_prediction_type=` – Model prediction processing method. Options: `raw`, `additive`, `sigma_scaled`. Default `sigma_scaled`. **Recommended: `raw`** -* `--guidance_scale=` – Guidance scale for training. **Recommended: `4.0`** * `--system_prompt=` – System prompt to prepend to all prompts. Recommended: `"You are an assistant designed to generate high-quality images based on user prompts."` or `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` * `--use_flash_attn` – Use Flash Attention. Requires `pip install flash-attn` (may not be supported in all environments). If installed correctly, it speeds up training. * `--sigmoid_scale=` – Scale factor for sigmoid timestep sampling. Default `1.0`. @@ -204,11 +211,10 @@ For Lumina Image 2.0, you can specify different dimensions for various component #### Lumina Image 2.0 学習パラメータ -* `--gemma2_max_token_length=` – Gemma2で使用するトークンの最大長を指定します。デフォルトはモデルによって異なります。 +* `--gemma2_max_token_length=` – Gemma2で使用するトークンの最大長を指定します。デフォルトは256です。 * `--timestep_sampling=` – タイムステップのサンプリング方法を指定します。`sigma`, `uniform`, `sigmoid`, `shift`, `nextdit_shift`から選択します。デフォルトは`sigma`です。**推奨: `nextdit_shift`** * `--discrete_flow_shift=` – Euler Discrete Schedulerの離散フローシフトを指定します。デフォルトは`6.0`です。 * `--model_prediction_type=` – モデル予測の処理方法を指定します。`raw`, `additive`, `sigma_scaled`から選択します。デフォルトは`sigma_scaled`です。**推奨: `raw`** -* `--guidance_scale=` – 学習時のガイダンススケールを指定します。**推奨: `4.0`** * `--system_prompt=` – 全てのプロンプトに前置するシステムプロンプトを指定します。推奨: `"You are an assistant designed to generate high-quality images based on user prompts."` または `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` * `--use_flash_attn` – Flash Attentionを使用します。`pip install flash-attn`でインストールが必要です(環境によってはサポートされていません)。正しくインストールされている場合は、指定すると学習が高速化されます。 * `--sigmoid_scale=` – sigmoidタイムステップサンプリングのスケール係数を指定します。デフォルトは`1.0`です。 @@ -252,16 +258,10 @@ When training finishes, a LoRA model file (e.g. 
`my_lumina_lora.safetensors`) is Based on the contributor's recommendations, here are the suggested settings for optimal training: -**Model Files:** -* Lumina Image 2.0: `lumina-image-2.safetensors` ([full precision link](https://huggingface.co/rockerBOO/lumina-image-2/blob/main/lumina-image-2.safetensors)) or `lumina_2_model_bf16.safetensors` ([bf16 link](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/diffusion_models/lumina_2_model_bf16.safetensors)) -* Gemma2 2B (fp16): `gemma-2-2b.safetensors` ([link](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/text_encoders/gemma_2_2b_fp16.safetensors)) -* AutoEncoder: `ae.safetensors` ([link](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/vae/ae.safetensors)) (same as FLUX) - **Key Parameters:** * `--timestep_sampling="nextdit_shift"` * `--discrete_flow_shift=6.0` * `--model_prediction_type="raw"` -* `--guidance_scale=4.0` * `--mixed_precision="bf16"` **System Prompts:** @@ -284,16 +284,10 @@ Sample prompts can include CFG truncate (`-ct`) and Renorm CFG (`-rc`) parameter コントリビューターの推奨に基づく、最適な学習のための推奨設定: -**モデルファイル:** -* Lumina Image 2.0: `lumina-image-2.safetensors` ([full precisionリンク](https://huggingface.co/rockerBOO/lumina-image-2/blob/main/lumina-image-2.safetensors)) または `lumina_2_model_bf16.safetensors` ([bf16リンク](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/diffusion_models/lumina_2_model_bf16.safetensors)) -* Gemma2 2B (fp16): `gemma-2-2b.safetensors` ([リンク](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/text_encoders/gemma_2_2b_fp16.safetensors)) -* AutoEncoder: `ae.safetensors` ([リンク](https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/blob/main/split_files/vae/ae.safetensors)) (FLUXと同じ) - **主要パラメータ:** * `--timestep_sampling="nextdit_shift"` * `--discrete_flow_shift=6.0` * `--model_prediction_type="raw"` -* `--guidance_scale=4.0` * `--mixed_precision="bf16"` **システムプロンプト:** diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 45f22bc47..1cf9278aa 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -1042,8 +1042,8 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): "--gemma2_max_token_length", type=int, default=None, - help="maximum token length for Gemma2. if omitted, 256 for schnell and 512 for dev" - " / Gemma2の最大トークン長。省略された場合、schnellの場合は256、devの場合は512", + help="maximum token length for Gemma2. if omitted, 256" + " / Gemma2の最大トークン長。省略された場合、256になります", ) parser.add_argument( From 999df5ec15c900a7dde3ac57c46db048ad988417 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 13 Jul 2025 20:52:00 +0900 Subject: [PATCH 482/582] fix: update default values for timestep_sampling and model_prediction_type in training arguments --- docs/lumina_train_network.md | 8 ++++---- library/lumina_train_util.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/lumina_train_network.md b/docs/lumina_train_network.md index e811f68b2..45695e89e 100644 --- a/docs/lumina_train_network.md +++ b/docs/lumina_train_network.md @@ -167,9 +167,9 @@ Besides the arguments explained in the [train_network.py guide](train_network.md #### Lumina Image 2.0 Training Parameters / Lumina Image 2.0 学習パラメータ * `--gemma2_max_token_length=` – Max token length for Gemma2. Default is 256. -* `--timestep_sampling=` – Timestep sampling method. 
Options: `sigma`, `uniform`, `sigmoid`, `shift`, `nextdit_shift`. Default `sigma`. **Recommended: `nextdit_shift`** +* `--timestep_sampling=` – Timestep sampling method. Options: `sigma`, `uniform`, `sigmoid`, `shift`, `nextdit_shift`. Default `shift`. **Recommended: `nextdit_shift`** * `--discrete_flow_shift=` – Discrete flow shift for the Euler Discrete Scheduler. Default `6.0`. -* `--model_prediction_type=` – Model prediction processing method. Options: `raw`, `additive`, `sigma_scaled`. Default `sigma_scaled`. **Recommended: `raw`** +* `--model_prediction_type=` – Model prediction processing method. Options: `raw`, `additive`, `sigma_scaled`. Default `raw`. **Recommended: `raw`** * `--system_prompt=` – System prompt to prepend to all prompts. Recommended: `"You are an assistant designed to generate high-quality images based on user prompts."` or `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` * `--use_flash_attn` – Use Flash Attention. Requires `pip install flash-attn` (may not be supported in all environments). If installed correctly, it speeds up training. * `--sigmoid_scale=` – Scale factor for sigmoid timestep sampling. Default `1.0`. @@ -212,9 +212,9 @@ For Lumina Image 2.0, you can specify different dimensions for various component #### Lumina Image 2.0 学習パラメータ * `--gemma2_max_token_length=` – Gemma2で使用するトークンの最大長を指定します。デフォルトは256です。 -* `--timestep_sampling=` – タイムステップのサンプリング方法を指定します。`sigma`, `uniform`, `sigmoid`, `shift`, `nextdit_shift`から選択します。デフォルトは`sigma`です。**推奨: `nextdit_shift`** +* `--timestep_sampling=` – タイムステップのサンプリング方法を指定します。`sigma`, `uniform`, `sigmoid`, `shift`, `nextdit_shift`から選択します。デフォルトは`shift`です。**推奨: `nextdit_shift`** * `--discrete_flow_shift=` – Euler Discrete Schedulerの離散フローシフトを指定します。デフォルトは`6.0`です。 -* `--model_prediction_type=` – モデル予測の処理方法を指定します。`raw`, `additive`, `sigma_scaled`から選択します。デフォルトは`sigma_scaled`です。**推奨: `raw`** +* `--model_prediction_type=` – モデル予測の処理方法を指定します。`raw`, `additive`, `sigma_scaled`から選択します。デフォルトは`raw`です。**推奨: `raw`** * `--system_prompt=` – 全てのプロンプトに前置するシステムプロンプトを指定します。推奨: `"You are an assistant designed to generate high-quality images based on user prompts."` または `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` * `--use_flash_attn` – Flash Attentionを使用します。`pip install flash-attn`でインストールが必要です(環境によってはサポートされていません)。正しくインストールされている場合は、指定すると学習が高速化されます。 * `--sigmoid_scale=` – sigmoidタイムステップサンプリングのスケール係数を指定します。デフォルトは`1.0`です。 diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 1cf9278aa..0645a8ae0 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -1049,9 +1049,9 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--timestep_sampling", choices=["sigma", "uniform", "sigmoid", "shift", "nextdit_shift"], - default="sigma", - help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal, shift of sigmoid and NextDIT.1 shifting." - " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト、NextDIT.1のシフト。", + default="shift", + help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal, shift of sigmoid and NextDIT.1 shifting. Default is 'shift'." 
+        " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト、NextDIT.1のシフト。デフォルトは'shift'です。",
     )
     parser.add_argument(
         "--sigmoid_scale",
@@ -1062,7 +1062,7 @@ def add_lumina_train_arguments(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--model_prediction_type",
         choices=["raw", "additive", "sigma_scaled"],
-        default="sigma_scaled",
+        default="raw",
         help="How to interpret and process the model prediction: "
         "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)."
         " / モデル予測の解釈と処理方法:"

From 30295c96686c90d4773e12fd5eb248e0a6bd406b Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Sun, 13 Jul 2025 21:00:27 +0900
Subject: [PATCH 483/582] fix: update parameter names for CFG truncate and Renorm CFG in documentation and code

---
 docs/lumina_train_network.md | 10 ++++++----
 library/train_util.py | 4 ++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/lumina_train_network.md b/docs/lumina_train_network.md
index 45695e89e..cb3b600f6 100644
--- a/docs/lumina_train_network.md
+++ b/docs/lumina_train_network.md
@@ -269,11 +269,12 @@ Based on the contributor's recommendations, here are the suggested settings for
 * High image-text alignment: `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."`

 **Sample Prompts:**
-Sample prompts can include CFG truncate (`-ct`) and Renorm CFG (`-rc`) parameters:
-* `-ct 0.25 -rc 1.0` (default values)
+Sample prompts can include CFG truncate (`--ctr`) and Renorm CFG (`--rcfg`) parameters:
+* `--ctr 0.25 --rcfg 1.0` (default values)
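The second half of this patch updates the regex parsing in `library/train_util.py` so that these renamed options are recognized in sample prompt files. A minimal sketch of that behavior, assuming a standalone helper (the function below is hypothetical and simplifies the real `line_to_prompt_dict`): the line is split on `--` and each option is matched with the same `ctr` and `rcfg` patterns.

```python
import re

def parse_sample_prompt(line: str) -> dict:
    # Hypothetical helper: mirrors only the ctr/rcfg handling patched later in this commit.
    parts = [p.strip() for p in line.split("--")]
    prompt, opts = parts[0], parts[1:]
    result = {"prompt": prompt}
    for parg in opts:
        m = re.match(r"ctr (.+)", parg, re.IGNORECASE)
        if m:
            result["cfg_trunc_ratio"] = float(m.group(1))
            continue
        m = re.match(r"rcfg (.+)", parg, re.IGNORECASE)
        if m:
            result["renorm_cfg"] = float(m.group(1))
    return result

print(parse_sample_prompt("1girl, masterpiece --ctr 0.25 --rcfg 1.0"))
# -> {'prompt': '1girl, masterpiece', 'cfg_trunc_ratio': 0.25, 'renorm_cfg': 1.0}
```

The dictionary keys match those set by the patched regexes (`cfg_trunc_ratio`, `renorm_cfg`).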
日本語 + 必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。 学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_lumina_lora.safetensors`)が保存されます。このファイルは、Lumina Image 2.0モデルに対応した推論環境(例: ComfyUI + 適切なノード)で使用できます。 @@ -295,6 +296,7 @@ Sample prompts can include CFG truncate (`-ct`) and Renorm CFG (`-rc`) parameter * 高い画像-テキスト整合性: `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` **サンプルプロンプト:** -サンプルプロンプトには CFG truncate (`-ct`) と Renorm CFG (`-rc`) パラメータを含めることができます: -* `-ct 0.25 -rc 1.0` (デフォルト値) +サンプルプロンプトには CFG truncate (`--ctr`) と Renorm CFG (`--rcfg`) パラメータを含めることができます: +* `--ctr 0.25 --rcfg 1.0` (デフォルト値) +
\ No newline at end of file diff --git a/library/train_util.py b/library/train_util.py index 1d80bcd85..2e8e9c296 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -6208,12 +6208,12 @@ def line_to_prompt_dict(line: str) -> dict: prompt_dict["controlnet_image"] = m.group(1) continue - m = re.match(r"ct (.+)", parg, re.IGNORECASE) + m = re.match(r"ctr (.+)", parg, re.IGNORECASE) if m: prompt_dict["cfg_trunc_ratio"] = float(m.group(1)) continue - m = re.match(r"rc (.+)", parg, re.IGNORECASE) + m = re.match(r"rcfg (.+)", parg, re.IGNORECASE) if m: prompt_dict["renorm_cfg"] = float(m.group(1)) continue From 13ccfc39f860b9653b2f22ec1619a01ab8ffab90 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 13 Jul 2025 21:26:06 +0900 Subject: [PATCH 484/582] fix: update flow matching loss and variable names --- lumina_train.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lumina_train.py b/lumina_train.py index 0a91f4a0a..a333427db 100644 --- a/lumina_train.py +++ b/lumina_train.py @@ -294,7 +294,7 @@ def train(args): # load lumina nextdit = lumina_util.load_lumina_model( args.pretrained_model_name_or_path, - loading_dtype, + weight_dtype, torch.device("cpu"), disable_mmap=args.disable_mmap_load_safetensors, use_flash_attn=args.use_flash_attn, @@ -494,6 +494,8 @@ def train(args): clean_memory_on_device(accelerator.device) + is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 + if args.deepspeed: ds_model = deepspeed_utils.prepare_deepspeed_model(args, nextdit=nextdit) # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007 @@ -739,7 +741,7 @@ def grad_hook(parameter: torch.Tensor): with accelerator.autocast(): # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) model_pred = nextdit( - x=img, # image latents (B, C, H, W) + x=noisy_model_input, # image latents (B, C, H, W) t=timesteps / 1000, # timesteps需要除以1000来匹配模型预期 cap_feats=gemma2_hidden_states, # Gemma2的hidden states作为caption features cap_mask=gemma2_attn_mask.to( @@ -751,8 +753,8 @@ def grad_hook(parameter: torch.Tensor): args, model_pred, noisy_model_input, sigmas ) - # flow matching loss: this is different from SD3 - target = noise - latents + # flow matching loss + target = latents - noise # calculate loss huber_c = train_util.get_huber_threshold_if_needed( From a96d684ffab11d6f40a8f1dde3c8103ab1d2bd27 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 15 Jul 2025 20:44:43 +0900 Subject: [PATCH 485/582] feat: add Chroma model implementation --- library/chroma_models.py | 706 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 706 insertions(+) create mode 100644 library/chroma_models.py diff --git a/library/chroma_models.py b/library/chroma_models.py new file mode 100644 index 000000000..9f21afad6 --- /dev/null +++ b/library/chroma_models.py @@ -0,0 +1,706 @@ +# copy from the official repo: https://github.com/lodestone-rock/flow/blob/master/src/models/chroma/model.py +# and modified +# licensed under Apache License 2.0 + +import math +from dataclasses import dataclass + +import torch +from einops import rearrange +from torch import Tensor, nn +import torch.nn.functional as F +import torch.utils.checkpoint as ckpt + +from .flux_models import ( + attention, + rope, + apply_rope, + EmbedND, + timestep_embedding, + MLPEmbedder, + RMSNorm, + QKNorm, + 
SelfAttention +) +from . import custom_offloading_utils + + +def distribute_modulations(tensor: torch.Tensor, depth_single_blocks, depth_double_blocks): + """ + Distributes slices of the tensor into the block_dict as ModulationOut objects. + + Args: + tensor (torch.Tensor): Input tensor with shape [batch_size, vectors, dim]. + """ + batch_size, vectors, dim = tensor.shape + + block_dict = {} + + # HARD CODED VALUES! lookup table for the generated vectors + # TODO: move this into chroma config! + # Add 38 single mod blocks + for i in range(depth_single_blocks): + key = f"single_blocks.{i}.modulation.lin" + block_dict[key] = None + + # Add 19 image double blocks + for i in range(depth_double_blocks): + key = f"double_blocks.{i}.img_mod.lin" + block_dict[key] = None + + # Add 19 text double blocks + for i in range(depth_double_blocks): + key = f"double_blocks.{i}.txt_mod.lin" + block_dict[key] = None + + # Add the final layer + block_dict["final_layer.adaLN_modulation.1"] = None + # 6.2b version + # block_dict["lite_double_blocks.4.img_mod.lin"] = None + # block_dict["lite_double_blocks.4.txt_mod.lin"] = None + + idx = 0 # Index to keep track of the vector slices + + for key in block_dict.keys(): + if "single_blocks" in key: + # Single block: 1 ModulationOut + block_dict[key] = ModulationOut( + shift=tensor[:, idx : idx + 1, :], + scale=tensor[:, idx + 1 : idx + 2, :], + gate=tensor[:, idx + 2 : idx + 3, :], + ) + idx += 3 # Advance by 3 vectors + + elif "img_mod" in key: + # Double block: List of 2 ModulationOut + double_block = [] + for _ in range(2): # Create 2 ModulationOut objects + double_block.append( + ModulationOut( + shift=tensor[:, idx : idx + 1, :], + scale=tensor[:, idx + 1 : idx + 2, :], + gate=tensor[:, idx + 2 : idx + 3, :], + ) + ) + idx += 3 # Advance by 3 vectors per ModulationOut + block_dict[key] = double_block + + elif "txt_mod" in key: + # Double block: List of 2 ModulationOut + double_block = [] + for _ in range(2): # Create 2 ModulationOut objects + double_block.append( + ModulationOut( + shift=tensor[:, idx : idx + 1, :], + scale=tensor[:, idx + 1 : idx + 2, :], + gate=tensor[:, idx + 2 : idx + 3, :], + ) + ) + idx += 3 # Advance by 3 vectors per ModulationOut + block_dict[key] = double_block + + elif "final_layer" in key: + # Final layer: 1 ModulationOut + block_dict[key] = [ + tensor[:, idx : idx + 1, :], + tensor[:, idx + 1 : idx + 2, :], + ] + idx += 2 # Advance by 3 vectors + + return block_dict + + +class Approximator(nn.Module): + def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers=4): + super().__init__() + self.in_proj = nn.Linear(in_dim, hidden_dim, bias=True) + self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim) for x in range(n_layers)]) + self.norms = nn.ModuleList([RMSNorm(hidden_dim) for x in range(n_layers)]) + self.out_proj = nn.Linear(hidden_dim, out_dim) + + @property + def device(self): + # Get the device of the module (assumes all parameters are on the same device) + return next(self.parameters()).device + + def enable_gradient_checkpointing(self): + for layer in self.layers: + layer.enable_gradient_checkpointing() + + def disable_gradient_checkpointing(self): + for layer in self.layers: + layer.disable_gradient_checkpointing() + + def forward(self, x: Tensor) -> Tensor: + x = self.in_proj(x) + + for layer, norms in zip(self.layers, self.norms): + x = x + layer(norms(x)) + + x = self.out_proj(x) + + return x + + +@dataclass +class ModulationOut: + shift: Tensor + scale: Tensor + gate: Tensor + + +def 
_modulation_shift_scale_fn(x, scale, shift): + return (1 + scale) * x + shift + + +def _modulation_gate_fn(x, gate, gate_params): + return x + gate * gate_params + + +class DoubleStreamBlock(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float, + qkv_bias: bool = False, + ): + super().__init__() + + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.num_heads = num_heads + self.hidden_size = hidden_size + self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_attn = SelfAttention( + dim=hidden_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + ) + + self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + nn.GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_attn = SelfAttention( + dim=hidden_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + ) + + self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + nn.GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + self.gradient_checkpointing = False + + @property + def device(self): + # Get the device of the module (assumes all parameters are on the same device) + return next(self.parameters()).device + + def modulation_shift_scale_fn(self, x, scale, shift): + return _modulation_shift_scale_fn(x, scale, shift) + + def modulation_gate_fn(self, x, gate, gate_params): + return _modulation_gate_fn(x, gate, gate_params) + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + + def _forward( + self, + img: Tensor, + txt: Tensor, + pe: Tensor, + distill_vec: list[ModulationOut], + mask: Tensor, + ) -> tuple[Tensor, Tensor]: + (img_mod1, img_mod2), (txt_mod1, txt_mod2) = distill_vec + + # prepare image for attention + img_modulated = self.img_norm1(img) + # replaced with compiled fn + # img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_modulated = self.modulation_shift_scale_fn(img_modulated, img_mod1.scale, img_mod1.shift) + img_qkv = self.img_attn.qkv(img_modulated) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) + + # prepare txt for attention + txt_modulated = self.txt_norm1(txt) + # replaced with compiled fn + # txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_modulated = self.modulation_shift_scale_fn(txt_modulated, txt_mod1.scale, txt_mod1.shift) + txt_qkv = self.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) + + # run actual attention + q = torch.cat((txt_q, img_q), dim=2) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + attn = attention(q, k, v, pe=pe, mask=mask) + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] + + # calculate the img bloks + # replaced with compiled fn + # img = img + img_mod1.gate * self.img_attn.proj(img_attn) + # img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) + img = 
self.modulation_gate_fn(img, img_mod1.gate, self.img_attn.proj(img_attn)) + img = self.modulation_gate_fn( + img, + img_mod2.gate, + self.img_mlp(self.modulation_shift_scale_fn(self.img_norm2(img), img_mod2.scale, img_mod2.shift)), + ) + + # calculate the txt bloks + # replaced with compiled fn + # txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) + # txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) + txt = self.modulation_gate_fn(txt, txt_mod1.gate, self.txt_attn.proj(txt_attn)) + txt = self.modulation_gate_fn( + txt, + txt_mod2.gate, + self.txt_mlp(self.modulation_shift_scale_fn(self.txt_norm2(txt), txt_mod2.scale, txt_mod2.shift)), + ) + + return img, txt + + def forward( + self, + img: Tensor, + txt: Tensor, + pe: Tensor, + distill_vec: list[ModulationOut], + mask: Tensor, + ) -> tuple[Tensor, Tensor]: + if self.training and self.gradient_checkpointing: + return ckpt.checkpoint(self._forward, img, txt, pe, distill_vec, mask, use_reentrant=False) + else: + return self._forward(img, txt, pe, distill_vec, mask) + + +class SingleStreamBlock(nn.Module): + """ + A DiT block with parallel linear layers as described in + https://arxiv.org/abs/2302.05442 and adapted modulation interface. + """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: float | None = None, + ): + super().__init__() + self.hidden_dim = hidden_size + self.num_heads = num_heads + head_dim = hidden_size // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + # qkv and mlp_in + self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim) + # proj and mlp_out + self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + self.norm = QKNorm(head_dim) + + self.hidden_size = hidden_size + self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + + self.mlp_act = nn.GELU(approximate="tanh") + + self.gradient_checkpointing = False + + @property + def device(self): + # Get the device of the module (assumes all parameters are on the same device) + return next(self.parameters()).device + + def modulation_shift_scale_fn(self, x, scale, shift): + return _modulation_shift_scale_fn(x, scale, shift) + + def modulation_gate_fn(self, x, gate, gate_params): + return _modulation_gate_fn(x, gate, gate_params) + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + + def _forward(self, x: Tensor, pe: Tensor, distill_vec: list[ModulationOut], mask: Tensor) -> Tensor: + mod = distill_vec + # replaced with compiled fn + # x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift + x_mod = self.modulation_shift_scale_fn(self.pre_norm(x), mod.scale, mod.shift) + qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) + + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + + # compute attention + attn = attention(q, k, v, pe=pe, mask=mask) + # compute activation in mlp stream, cat again and run second linear layer + output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + # replaced with compiled fn + # return x + mod.gate * output + return self.modulation_gate_fn(x, mod.gate, output) + + def forward(self, x: Tensor, pe: Tensor, distill_vec: list[ModulationOut], mask: Tensor) -> Tensor: + if self.training and 
self.gradient_checkpointing: + return ckpt.checkpoint(self._forward, x, pe, distill_vec, mask, use_reentrant=False) + else: + return self._forward(x, pe, distill_vec, mask) + + +class LastLayer(nn.Module): + def __init__( + self, + hidden_size: int, + patch_size: int, + out_channels: int, + ): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) + + @property + def device(self): + # Get the device of the module (assumes all parameters are on the same device) + return next(self.parameters()).device + + def modulation_shift_scale_fn(self, x, scale, shift): + return _modulation_shift_scale_fn(x, scale, shift) + + def forward(self, x: Tensor, distill_vec: list[Tensor]) -> Tensor: + shift, scale = distill_vec + shift = shift.squeeze(1) + scale = scale.squeeze(1) + # replaced with compiled fn + # x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.modulation_shift_scale_fn(self.norm_final(x), scale[:, None, :], shift[:, None, :]) + x = self.linear(x) + return x + + +@dataclass +class ChromaParams: + in_channels: int + context_in_dim: int + hidden_size: int + mlp_ratio: float + num_heads: int + depth: int + depth_single_blocks: int + axes_dim: list[int] + theta: int + qkv_bias: bool + guidance_embed: bool + approximator_in_dim: int + approximator_depth: int + approximator_hidden_size: int + _use_compiled: bool + + +chroma_params = ChromaParams( + in_channels=64, + context_in_dim=4096, + hidden_size=3072, + mlp_ratio=4.0, + num_heads=24, + depth=19, + depth_single_blocks=38, + axes_dim=[16, 56, 56], + theta=10_000, + qkv_bias=True, + guidance_embed=True, + approximator_in_dim=64, + approximator_depth=5, + approximator_hidden_size=5120, + _use_compiled=False, +) + + +def modify_mask_to_attend_padding(mask, max_seq_length, num_extra_padding=8): + """ + Modifies attention mask to allow attention to a few extra padding tokens. + + Args: + mask: Original attention mask (1 for tokens to attend to, 0 for masked tokens) + max_seq_length: Maximum sequence length of the model + num_extra_padding: Number of padding tokens to unmask + + Returns: + Modified mask + """ + # Get the actual sequence length from the mask + seq_length = mask.sum(dim=-1) + batch_size = mask.shape[0] + + modified_mask = mask.clone() + + for i in range(batch_size): + current_seq_len = int(seq_length[i].item()) + + # Only add extra padding tokens if there's room + if current_seq_len < max_seq_length: + # Calculate how many padding tokens we can unmask + available_padding = max_seq_length - current_seq_len + tokens_to_unmask = min(num_extra_padding, available_padding) + + # Unmask the specified number of padding tokens right after the sequence + modified_mask[i, current_seq_len : current_seq_len + tokens_to_unmask] = 1 + + return modified_mask + + +class Chroma(nn.Module): + """ + Transformer model for flow matching on sequences. 
+ """ + + def __init__(self, params: ChromaParams): + super().__init__() + self.params = params + self.in_channels = params.in_channels + self.out_channels = self.in_channels + if params.hidden_size % params.num_heads != 0: + raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}") + pe_dim = params.hidden_size // params.num_heads + if sum(params.axes_dim) != pe_dim: + raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) + + # TODO: need proper mapping for this approximator output! + # currently the mapping is hardcoded in distribute_modulations function + self.distilled_guidance_layer = Approximator( + params.approximator_in_dim, + self.hidden_size, + params.approximator_hidden_size, + params.approximator_depth, + ) + self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size) + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + qkv_bias=params.qkv_bias, + ) + for _ in range(params.depth) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + ) + for _ in range(params.depth_single_blocks) + ] + ) + + self.final_layer = LastLayer( + self.hidden_size, + 1, + self.out_channels, + ) + + # TODO: move this hardcoded value to config + # single layer has 3 modulation vectors + # double layer has 6 modulation vectors for each expert + # final layer has 2 modulation vectors + self.mod_index_length = 3 * params.depth_single_blocks + 2 * 6 * params.depth + 2 + self.depth_single_blocks = params.depth_single_blocks + self.depth_double_blocks = params.depth + # self.mod_index = torch.tensor(list(range(self.mod_index_length)), device=0) + self.register_buffer( + "mod_index", + torch.tensor(list(range(self.mod_index_length)), device="cpu"), + persistent=False, + ) + self.approximator_in_dim = params.approximator_in_dim + + self.blocks_to_swap = None + self.offloader_double = None + self.offloader_single = None + self.num_double_blocks = len(self.double_blocks) + self.num_single_blocks = len(self.single_blocks) + + @property + def device(self): + # Get the device of the module (assumes all parameters are on the same device) + return next(self.parameters()).device + + def enable_gradient_checkpointing(self): + self.distilled_guidance_layer.enable_gradient_checkpointing() + for block in self.double_blocks + self.single_blocks: + block.enable_gradient_checkpointing() + + def disable_gradient_checkpointing(self): + self.distilled_guidance_layer.disable_gradient_checkpointing() + for block in self.double_blocks + self.single_blocks: + block.disable_gradient_checkpointing() + + def enable_block_swap(self, num_blocks: int, device: torch.device): + self.blocks_to_swap = num_blocks + double_blocks_to_swap = num_blocks // 2 + single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + + assert double_blocks_to_swap <= self.num_double_blocks - 2 and single_blocks_to_swap <= self.num_single_blocks - 2, ( + f"Cannot swap more than {self.num_double_blocks - 2} double blocks and {self.num_single_blocks - 2} single blocks. " + f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks." 
+ ) + + self.offloader_double = custom_offloading_utils.ModelOffloader( + self.double_blocks, double_blocks_to_swap, device + ) + self.offloader_single = custom_offloading_utils.ModelOffloader( + self.single_blocks, single_blocks_to_swap, device + ) + print( + f"Chroma: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." + ) + + def move_to_device_except_swap_blocks(self, device: torch.device): + # assume model is on cpu. do not move blocks to device to reduce temporary memory usage + if self.blocks_to_swap: + save_double_blocks = self.double_blocks + save_single_blocks = self.single_blocks + self.double_blocks = None + self.single_blocks = None + + self.to(device) + + if self.blocks_to_swap: + self.double_blocks = save_double_blocks + self.single_blocks = save_single_blocks + + def prepare_block_swap_before_forward(self): + if self.blocks_to_swap is None or self.blocks_to_swap == 0: + return + self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) + self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) + + def forward( + self, + img: Tensor, + img_ids: Tensor, + txt: Tensor, + txt_ids: Tensor, + txt_mask: Tensor, + timesteps: Tensor, + guidance: Tensor, + attn_padding: int = 1, + ) -> Tensor: + if img.ndim != 3 or txt.ndim != 3: + raise ValueError("Input img and txt tensors must have 3 dimensions.") + + # running on sequences img + img = self.img_in(img) + txt = self.txt_in(txt) + + # TODO: + # need to fix grad accumulation issue here for now it's in no grad mode + # besides, i don't want to wash out the PFP that's trained on this model weights anyway + # the fan out operation here is deleting the backward graph + # alternatively doing forward pass for every block manually is doable but slow + # custom backward probably be better + with torch.no_grad(): + distill_timestep = timestep_embedding(timesteps, self.approximator_in_dim // 4) + # TODO: need to add toggle to omit this from schnell but that's not a priority + distil_guidance = timestep_embedding(guidance, self.approximator_in_dim // 4) + # get all modulation index + modulation_index = timestep_embedding(self.mod_index, self.approximator_in_dim // 2) + # we need to broadcast the modulation index here so each batch has all of the index + modulation_index = modulation_index.unsqueeze(0).repeat(img.shape[0], 1, 1) + # and we need to broadcast timestep and guidance along too + timestep_guidance = ( + torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, self.mod_index_length, 1) + ) + # then and only then we could concatenate it together + input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1) + mod_vectors = self.distilled_guidance_layer(input_vec.requires_grad_(True)) + mod_vectors_dict = distribute_modulations(mod_vectors, self.depth_single_blocks, self.depth_double_blocks) + + ids = torch.cat((txt_ids, img_ids), dim=1) + pe = self.pe_embedder(ids) + + # compute mask + # assume max seq length from the batched input + + max_len = txt.shape[1] + + # mask + with torch.no_grad(): + txt_mask_w_padding = modify_mask_to_attend_padding(txt_mask, max_len, attn_padding) + txt_img_mask = torch.cat( + [ + txt_mask_w_padding, + torch.ones([img.shape[0], img.shape[1]], device=txt_mask.device), + ], + dim=1, + ) + txt_img_mask = txt_img_mask.float().T @ txt_img_mask.float() + txt_img_mask = txt_img_mask[None, None, ...].repeat(txt.shape[0], self.num_heads, 1, 1).int().bool() + # 
txt_mask_w_padding[txt_mask_w_padding==False] = True + + if not self.blocks_to_swap: + for i, block in enumerate(self.double_blocks): + # the guidance replaced by FFN output + img_mod = mod_vectors_dict[f"double_blocks.{i}.img_mod.lin"] + txt_mod = mod_vectors_dict[f"double_blocks.{i}.txt_mod.lin"] + double_mod = [img_mod, txt_mod] + + img, txt = block(img=img, txt=txt, pe=pe, distill_vec=double_mod, mask=txt_img_mask) + else: + for i, block in enumerate(self.double_blocks): + self.offloader_double.wait_for_block(i) + + # the guidance replaced by FFN output + img_mod = mod_vectors_dict[f"double_blocks.{i}.img_mod.lin"] + txt_mod = mod_vectors_dict[f"double_blocks.{i}.txt_mod.lin"] + double_mod = [img_mod, txt_mod] + + img, txt = block(img=img, txt=txt, pe=pe, distill_vec=double_mod, mask=txt_img_mask) + + self.offloader_double.submit_move_blocks(self.double_blocks, i) + + img = torch.cat((txt, img), 1) + if not self.blocks_to_swap: + for i, block in enumerate(self.single_blocks): + single_mod = mod_vectors_dict[f"single_blocks.{i}.modulation.lin"] + img = block(img, pe=pe, distill_vec=single_mod, mask=txt_img_mask) + else: + for i, block in enumerate(self.single_blocks): + self.offloader_single.wait_for_block(i) + + single_mod = mod_vectors_dict[f"single_blocks.{i}.modulation.lin"] + img = block(img, pe=pe, distill_vec=single_mod, mask=txt_img_mask) + + self.offloader_single.submit_move_blocks(self.single_blocks, i) + img = img[:, txt.shape[1] :, ...] + final_mod = mod_vectors_dict["final_layer.adaLN_modulation.1"] + img = self.final_layer(img, distill_vec=final_mod) # (N, T, patch_size ** 2 * out_channels) + return img From e0fcb5152a8c6f36d27b0f9f0e20e4ce75860c12 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 15 Jul 2025 21:34:35 +0900 Subject: [PATCH 486/582] feat: support Neta Lumina all-in-one weights --- library/lumina_util.py | 40 ++++++++++++++++++++++++++++++------- lumina_minimal_inference.py | 4 ++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/library/lumina_util.py b/library/lumina_util.py index 452b242fd..87853ef62 100644 --- a/library/lumina_util.py +++ b/library/lumina_util.py @@ -44,10 +44,21 @@ def load_lumina_model( """ logger.info("Building Lumina") with torch.device("meta"): - model = lumina_models.NextDiT_2B_GQA_patch2_Adaln_Refiner(use_flash_attn=use_flash_attn, use_sage_attn=use_sage_attn).to(dtype) + model = lumina_models.NextDiT_2B_GQA_patch2_Adaln_Refiner(use_flash_attn=use_flash_attn, use_sage_attn=use_sage_attn).to( + dtype + ) logger.info(f"Loading state dict from {ckpt_path}") state_dict = load_safetensors(ckpt_path, device=device, disable_mmap=disable_mmap, dtype=dtype) + + # Neta-Lumina support + if "model.diffusion_model.cap_embedder.0.weight" in state_dict: + # remove "model.diffusion_model." prefix + filtered_state_dict = { + k.replace("model.diffusion_model.", ""): v for k, v in state_dict.items() if k.startswith("model.diffusion_model.") + } + state_dict = filtered_state_dict + info = model.load_state_dict(state_dict, strict=False, assign=True) logger.info(f"Loaded Lumina: {info}") return model @@ -78,6 +89,13 @@ def load_ae( logger.info(f"Loading state dict from {ckpt_path}") sd = load_safetensors(ckpt_path, device=device, disable_mmap=disable_mmap, dtype=dtype) + + # Neta-Lumina support + if "vae.decoder.conv_in.bias" in sd: + # remove "vae." 
prefix + filtered_sd = {k.replace("vae.", ""): v for k, v in sd.items() if k.startswith("vae.")} + sd = filtered_sd + info = ae.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded AE: {info}") return ae @@ -152,6 +170,16 @@ def load_gemma2( break # the model doesn't have annoying prefix sd[new_key] = sd.pop(key) + # Neta-Lumina support + if "text_encoders.gemma2_2b.logit_scale" in sd: + # remove "text_encoders.gemma2_2b.transformer.model." prefix + filtered_sd = { + k.replace("text_encoders.gemma2_2b.transformer.model.", ""): v + for k, v in sd.items() + if k.startswith("text_encoders.gemma2_2b.transformer.model.") + } + sd = filtered_sd + info = gemma2.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded Gemma2: {info}") return gemma2 @@ -173,7 +201,6 @@ def pack_latents(x: torch.Tensor) -> torch.Tensor: return x - DIFFUSERS_TO_ALPHA_VLLM_MAP: dict[str, str] = { # Embedding layers "time_caption_embed.caption_embedder.0.weight": "cap_embedder.0.weight", @@ -211,11 +238,11 @@ def convert_diffusers_sd_to_alpha_vllm(sd: dict, num_double_blocks: int) -> dict for diff_key, alpha_key in DIFFUSERS_TO_ALPHA_VLLM_MAP.items(): # Handle block-specific patterns - if '().' in diff_key: + if "()." in diff_key: for block_idx in range(num_double_blocks): - block_alpha_key = alpha_key.replace('().', f'{block_idx}.') - block_diff_key = diff_key.replace('().', f'{block_idx}.') - + block_alpha_key = alpha_key.replace("().", f"{block_idx}.") + block_diff_key = diff_key.replace("().", f"{block_idx}.") + # Search for and convert block-specific keys for input_key, value in list(sd.items()): if input_key == block_diff_key: @@ -228,6 +255,5 @@ def convert_diffusers_sd_to_alpha_vllm(sd: dict, num_double_blocks: int) -> dict else: print(f"Not found: {diff_key}") - logger.info(f"Converted {len(new_sd)} keys to Alpha-VLLM format") return new_sd diff --git a/lumina_minimal_inference.py b/lumina_minimal_inference.py index 31362c00d..d829616b8 100644 --- a/lumina_minimal_inference.py +++ b/lumina_minimal_inference.py @@ -231,13 +231,13 @@ def setup_parser() -> argparse.ArgumentParser: "--cfg_trunc_ratio", type=float, default=0.25, - help="TBD", + help="The ratio of the timestep interval to apply normalization-based guidance scale. For example, 0.25 means the last 25% of timesteps will be guided.", ) parser.add_argument( "--renorm_cfg", type=float, default=1.0, - help="TBD", + help="The factor to limit the maximum norm after guidance. Default: 1.0, 0.0 means no renormalization.", ) parser.add_argument( "--use_flash_attn", From 25771a5180a134190c0e9b540ee5a074ff70e6cd Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 15 Jul 2025 21:53:13 +0900 Subject: [PATCH 487/582] fix: update help text for cfg_trunc_ratio argument --- lumina_minimal_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lumina_minimal_inference.py b/lumina_minimal_inference.py index d829616b8..691ee4180 100644 --- a/lumina_minimal_inference.py +++ b/lumina_minimal_inference.py @@ -231,7 +231,7 @@ def setup_parser() -> argparse.ArgumentParser: "--cfg_trunc_ratio", type=float, default=0.25, - help="The ratio of the timestep interval to apply normalization-based guidance scale. For example, 0.25 means the last 25% of timesteps will be guided.", + help="The ratio of the timestep interval to apply normalization-based guidance scale. 
For example, 0.25 means the first 25% of timesteps will be guided.", ) parser.add_argument( "--renorm_cfg", From c0c36a4e2ffb9a8438f490ff3d0deca8a03bbd26 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 15 Jul 2025 21:58:03 +0900 Subject: [PATCH 488/582] fix: remove duplicated latent normalization in decoding --- lumina_minimal_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lumina_minimal_inference.py b/lumina_minimal_inference.py index 691ee4180..87dc9a194 100644 --- a/lumina_minimal_inference.py +++ b/lumina_minimal_inference.py @@ -158,7 +158,7 @@ def generate_image( # 5. Decode latents # logger.info("Decoding image...") - latents = latents / ae.scale_factor + ae.shift_factor + # latents = latents / ae.scale_factor + ae.shift_factor with torch.no_grad(): image = ae.decode(latents.to(ae_dtype)) image = (image / 2 + 0.5).clamp(0, 1) From a7b33f320495afa39e353e0c583accf15ad9cb20 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Tue, 15 Jul 2025 22:36:46 -0400 Subject: [PATCH 489/582] Fix alphas cumprod after add_noise for DDIMScheduler --- library/train_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/library/train_util.py b/library/train_util.py index 36d419fd2..285870faf 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -6008,6 +6008,8 @@ def get_noise_noisy_latents_and_timesteps( else: noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + noise_scheduler.alphas_cumprod = noise_scheduler.alphas_cumprod.cpu() + return noise, noisy_latents, timesteps From 3adbbb6e3347b9a0da852a65a85d58a5da777443 Mon Sep 17 00:00:00 2001 From: Dave Lage Date: Wed, 16 Jul 2025 16:09:20 -0400 Subject: [PATCH 490/582] Add note about why we are moving it --- library/train_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/library/train_util.py b/library/train_util.py index 285870faf..165d873bc 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -6008,6 +6008,7 @@ def get_noise_noisy_latents_and_timesteps( else: noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + # This moves the alphas_cumprod back to the CPU after it is moved in noise_scheduler.add_noise noise_scheduler.alphas_cumprod = noise_scheduler.alphas_cumprod.cpu() return noise, noisy_latents, timesteps From 24d2ea86c70482ec062412e4214ae221a22cd0a0 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Sun, 20 Jul 2025 12:56:42 +0900 Subject: [PATCH 491/582] feat: support Chroma model in loading and inference processes --- flux_minimal_inference.py | 49 ++++++++++------ flux_train.py | 4 +- flux_train_control_net.py | 4 +- flux_train_network.py | 4 +- library/chroma_models.py | 85 +++++---------------------- library/flux_utils.py | 118 ++++++++++++++++++++++++-------------- 6 files changed, 127 insertions(+), 137 deletions(-) diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index 7ab224f1b..a7bff74db 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -108,12 +108,18 @@ def denoise( else: b_img = img + # For Chroma model, y might be None, so create dummy tensor + if b_vec is None: + y_input = torch.zeros_like(b_txt[:, :1, :]) # dummy tensor + else: + y_input = b_vec + pred = model( img=b_img, img_ids=b_img_ids, txt=b_txt, txt_ids=b_txt_ids, - y=b_vec, + y=y_input, timesteps=t_vec, guidance=guidance_vec, txt_attention_mask=b_t5_attn_mask, @@ -134,7 +140,7 @@ def do_sample( model: flux_models.Flux, img: torch.Tensor, img_ids: torch.Tensor, - l_pooled: torch.Tensor, + l_pooled: Optional[torch.Tensor], t5_out: 
torch.Tensor, txt_ids: torch.Tensor, num_steps: int, @@ -192,7 +198,7 @@ def do_sample( def generate_image( model, - clip_l: CLIPTextModel, + clip_l: Optional[CLIPTextModel], t5xxl, ae, prompt: str, @@ -231,7 +237,7 @@ def generate_image( img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width) # prepare fp8 models - if is_fp8(clip_l_dtype) and (not hasattr(clip_l, "fp8_prepared") or not clip_l.fp8_prepared): + if clip_l is not None and is_fp8(clip_l_dtype) and (not hasattr(clip_l, "fp8_prepared") or not clip_l.fp8_prepared): logger.info(f"prepare CLIP-L for fp8: set to {clip_l_dtype}, set embeddings to {torch.bfloat16}") clip_l.to(clip_l_dtype) # fp8 clip_l.text_model.embeddings.to(dtype=torch.bfloat16) @@ -267,18 +273,22 @@ def forward(hidden_states): # prepare embeddings logger.info("Encoding prompts...") - clip_l = clip_l.to(device) + if clip_l is not None: + clip_l = clip_l.to(device) t5xxl = t5xxl.to(device) def encode(prpt: str): tokens_and_masks = tokenize_strategy.tokenize(prpt) with torch.no_grad(): - if is_fp8(clip_l_dtype): - with accelerator.autocast(): - l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) + if clip_l is not None: + if is_fp8(clip_l_dtype): + with accelerator.autocast(): + l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) + else: + with torch.autocast(device_type=device.type, dtype=clip_l_dtype): + l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) else: - with torch.autocast(device_type=device.type, dtype=clip_l_dtype): - l_pooled, _, _, _ = encoding_strategy.encode_tokens(tokenize_strategy, [clip_l, None], tokens_and_masks) + l_pooled = None if is_fp8(t5xxl_dtype): with accelerator.autocast(): @@ -288,7 +298,7 @@ def encode(prpt: str): else: with torch.autocast(device_type=device.type, dtype=t5xxl_dtype): _, t5_out, txt_ids, t5_attn_mask = encoding_strategy.encode_tokens( - tokenize_strategy, [None, t5xxl], tokens_and_masks, args.apply_t5_attn_mask + tokenize_strategy, [clip_l, t5xxl], tokens_and_masks, args.apply_t5_attn_mask ) return l_pooled, t5_out, txt_ids, t5_attn_mask @@ -305,7 +315,8 @@ def encode(prpt: str): raise ValueError("NaN in t5_out") if args.offload: - clip_l = clip_l.cpu() + if clip_l is not None: + clip_l = clip_l.cpu() t5xxl = t5xxl.cpu() # del clip_l, t5xxl device_utils.clean_memory() @@ -385,6 +396,7 @@ def encode(prpt: str): parser = argparse.ArgumentParser() parser.add_argument("--ckpt_path", type=str, required=True) + parser.add_argument("--model_type", type=str, choices=["flux", "chroma"], default="flux", help="Model type to use") parser.add_argument("--clip_l", type=str, required=False) parser.add_argument("--t5xxl", type=str, required=False) parser.add_argument("--ae", type=str, required=False) @@ -438,10 +450,13 @@ def is_fp8(dt): else: accelerator = None - # load clip_l - logger.info(f"Loading clip_l from {args.clip_l}...") - clip_l = flux_utils.load_clip_l(args.clip_l, clip_l_dtype, loading_device) - clip_l.eval() + # load clip_l (skip for chroma model) + if args.model_type == "flux": + logger.info(f"Loading clip_l from {args.clip_l}...") + clip_l = flux_utils.load_clip_l(args.clip_l, clip_l_dtype, loading_device) + clip_l.eval() + else: + clip_l = None logger.info(f"Loading t5xxl from {args.t5xxl}...") t5xxl = flux_utils.load_t5xxl(args.t5xxl, t5xxl_dtype, loading_device) @@ -453,7 +468,7 @@ def is_fp8(dt): # t5xxl = 
accelerator.prepare(t5xxl) # DiT - is_schnell, model = flux_utils.load_flow_model(args.ckpt_path, None, loading_device) + model_type, is_schnell, model = flux_utils.load_flow_model(args.ckpt_path, None, loading_device, model_type=args.model_type) model.eval() logger.info(f"Casting model to {flux_dtype}") model.to(flux_dtype) # make sure model is dtype diff --git a/flux_train.py b/flux_train.py index 6f98adea8..1d2cc68b7 100644 --- a/flux_train.py +++ b/flux_train.py @@ -270,8 +270,8 @@ def train(args): clean_memory_on_device(accelerator.device) # load FLUX - _, flux = flux_utils.load_flow_model( - args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors + model_type, _, flux = flux_utils.load_flow_model( + args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors, model_type="flux" ) if args.gradient_checkpointing: diff --git a/flux_train_control_net.py b/flux_train_control_net.py index cecd00019..3c038c32a 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -258,8 +258,8 @@ def train(args): clean_memory_on_device(accelerator.device) # load FLUX - is_schnell, flux = flux_utils.load_flow_model( - args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors + model_type, is_schnell, flux = flux_utils.load_flow_model( + args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors, model_type="flux" ) flux.requires_grad_(False) diff --git a/flux_train_network.py b/flux_train_network.py index def441559..b2bf8e7cf 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -95,8 +95,8 @@ def load_target_model(self, args, weight_dtype, accelerator): loading_dtype = None if args.fp8_base else weight_dtype # if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future - self.is_schnell, model = flux_utils.load_flow_model( - args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors + self.model_type, self.is_schnell, model = flux_utils.load_flow_model( + args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors, model_type="flux" ) if args.fp8_base: # check dtype of model diff --git a/library/chroma_models.py b/library/chroma_models.py index 9f21afad6..e1da751b0 100644 --- a/library/chroma_models.py +++ b/library/chroma_models.py @@ -11,17 +11,7 @@ import torch.nn.functional as F import torch.utils.checkpoint as ckpt -from .flux_models import ( - attention, - rope, - apply_rope, - EmbedND, - timestep_embedding, - MLPEmbedder, - RMSNorm, - QKNorm, - SelfAttention -) +from .flux_models import attention, rope, apply_rope, EmbedND, timestep_embedding, MLPEmbedder, RMSNorm, QKNorm, SelfAttention, Flux from . import custom_offloading_utils @@ -468,13 +458,13 @@ def modify_mask_to_attend_padding(mask, max_seq_length, num_extra_padding=8): return modified_mask -class Chroma(nn.Module): +class Chroma(Flux): """ Transformer model for flow matching on sequences. 
""" def __init__(self, params: ChromaParams): - super().__init__() + nn.Module.__init__(self) self.params = params self.in_channels = params.in_channels self.out_channels = self.in_channels @@ -548,60 +538,9 @@ def __init__(self, params: ChromaParams): self.num_double_blocks = len(self.double_blocks) self.num_single_blocks = len(self.single_blocks) - @property - def device(self): - # Get the device of the module (assumes all parameters are on the same device) - return next(self.parameters()).device - - def enable_gradient_checkpointing(self): - self.distilled_guidance_layer.enable_gradient_checkpointing() - for block in self.double_blocks + self.single_blocks: - block.enable_gradient_checkpointing() - - def disable_gradient_checkpointing(self): - self.distilled_guidance_layer.disable_gradient_checkpointing() - for block in self.double_blocks + self.single_blocks: - block.disable_gradient_checkpointing() - - def enable_block_swap(self, num_blocks: int, device: torch.device): - self.blocks_to_swap = num_blocks - double_blocks_to_swap = num_blocks // 2 - single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 - - assert double_blocks_to_swap <= self.num_double_blocks - 2 and single_blocks_to_swap <= self.num_single_blocks - 2, ( - f"Cannot swap more than {self.num_double_blocks - 2} double blocks and {self.num_single_blocks - 2} single blocks. " - f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks." - ) - - self.offloader_double = custom_offloading_utils.ModelOffloader( - self.double_blocks, double_blocks_to_swap, device - ) - self.offloader_single = custom_offloading_utils.ModelOffloader( - self.single_blocks, single_blocks_to_swap, device - ) - print( - f"Chroma: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." - ) - - def move_to_device_except_swap_blocks(self, device: torch.device): - # assume model is on cpu. 
do not move blocks to device to reduce temporary memory usage - if self.blocks_to_swap: - save_double_blocks = self.double_blocks - save_single_blocks = self.single_blocks - self.double_blocks = None - self.single_blocks = None - - self.to(device) - - if self.blocks_to_swap: - self.double_blocks = save_double_blocks - self.single_blocks = save_single_blocks - - def prepare_block_swap_before_forward(self): - if self.blocks_to_swap is None or self.blocks_to_swap == 0: - return - self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) - self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) + # Initialize properties required by Flux parent class + self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False def forward( self, @@ -609,10 +548,12 @@ def forward( img_ids: Tensor, txt: Tensor, txt_ids: Tensor, - txt_mask: Tensor, timesteps: Tensor, - guidance: Tensor, - attn_padding: int = 1, + y: Tensor, + block_controlnet_hidden_states=None, + block_controlnet_single_hidden_states=None, + guidance: Tensor | None = None, + txt_attention_mask: Tensor | None = None, ) -> Tensor: if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") @@ -654,11 +595,11 @@ def forward( # mask with torch.no_grad(): - txt_mask_w_padding = modify_mask_to_attend_padding(txt_mask, max_len, attn_padding) + txt_mask_w_padding = modify_mask_to_attend_padding(txt_attention_mask, max_len, 1) txt_img_mask = torch.cat( [ txt_mask_w_padding, - torch.ones([img.shape[0], img.shape[1]], device=txt_mask.device), + torch.ones([img.shape[0], img.shape[1]], device=txt_attention_mask.device), ], dim=1, ) diff --git a/library/flux_utils.py b/library/flux_utils.py index 8be1d63ee..a5cfcdfff 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -92,50 +92,84 @@ def analyze_checkpoint_state(ckpt_path: str) -> Tuple[bool, bool, Tuple[int, int def load_flow_model( - ckpt_path: str, dtype: Optional[torch.dtype], device: Union[str, torch.device], disable_mmap: bool = False -) -> Tuple[bool, flux_models.Flux]: - is_diffusers, is_schnell, (num_double_blocks, num_single_blocks), ckpt_paths = analyze_checkpoint_state(ckpt_path) - name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL - - # build model - logger.info(f"Building Flux model {name} from {'Diffusers' if is_diffusers else 'BFL'} checkpoint") - with torch.device("meta"): - params = flux_models.configs[name].params - - # set the number of blocks - if params.depth != num_double_blocks: - logger.info(f"Setting the number of double blocks from {params.depth} to {num_double_blocks}") - params = replace(params, depth=num_double_blocks) - if params.depth_single_blocks != num_single_blocks: - logger.info(f"Setting the number of single blocks from {params.depth_single_blocks} to {num_single_blocks}") - params = replace(params, depth_single_blocks=num_single_blocks) - - model = flux_models.Flux(params) - if dtype is not None: - model = model.to(dtype) - - # load_sft doesn't support torch.device - logger.info(f"Loading state dict from {ckpt_path}") - sd = {} - for ckpt_path in ckpt_paths: - sd.update(load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)) + ckpt_path: str, + dtype: Optional[torch.dtype], + device: Union[str, torch.device], + disable_mmap: bool = False, + model_type: str = "flux", +) -> Tuple[str, bool, flux_models.Flux]: + if model_type == "flux": + is_diffusers, is_schnell, (num_double_blocks, num_single_blocks), 
ckpt_paths = analyze_checkpoint_state(ckpt_path) + name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL + + # build model + logger.info(f"Building Flux model {name} from {'Diffusers' if is_diffusers else 'BFL'} checkpoint") + with torch.device("meta"): + params = flux_models.configs[name].params + + # set the number of blocks + if params.depth != num_double_blocks: + logger.info(f"Setting the number of double blocks from {params.depth} to {num_double_blocks}") + params = replace(params, depth=num_double_blocks) + if params.depth_single_blocks != num_single_blocks: + logger.info(f"Setting the number of single blocks from {params.depth_single_blocks} to {num_single_blocks}") + params = replace(params, depth_single_blocks=num_single_blocks) + + model = flux_models.Flux(params) + if dtype is not None: + model = model.to(dtype) + + # load_sft doesn't support torch.device + logger.info(f"Loading state dict from {ckpt_path}") + sd = {} + for ckpt_path in ckpt_paths: + sd.update(load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)) + + # convert Diffusers to BFL + if is_diffusers: + logger.info("Converting Diffusers to BFL") + sd = convert_diffusers_sd_to_bfl(sd, num_double_blocks, num_single_blocks) + logger.info("Converted Diffusers to BFL") + + # if the key has annoying prefix, remove it + for key in list(sd.keys()): + new_key = key.replace("model.diffusion_model.", "") + if new_key == key: + break # the model doesn't have annoying prefix + sd[new_key] = sd.pop(key) + + info = model.load_state_dict(sd, strict=False, assign=True) + logger.info(f"Loaded Flux: {info}") + return model_type, is_schnell, model + + elif model_type == "chroma": + from . import chroma_models + + # build model + logger.info("Building Chroma model from BFL checkpoint") + with torch.device("meta"): + model = chroma_models.Chroma(chroma_models.chroma_params) + if dtype is not None: + model = model.to(dtype) + + # load_sft doesn't support torch.device + logger.info(f"Loading state dict from {ckpt_path}") + sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) - # convert Diffusers to BFL - if is_diffusers: - logger.info("Converting Diffusers to BFL") - sd = convert_diffusers_sd_to_bfl(sd, num_double_blocks, num_single_blocks) - logger.info("Converted Diffusers to BFL") + # if the key has annoying prefix, remove it + for key in list(sd.keys()): + new_key = key.replace("model.diffusion_model.", "") + if new_key == key: + break # the model doesn't have annoying prefix + sd[new_key] = sd.pop(key) - # if the key has annoying prefix, remove it - for key in list(sd.keys()): - new_key = key.replace("model.diffusion_model.", "") - if new_key == key: - break # the model doesn't have annoying prefix - sd[new_key] = sd.pop(key) + info = model.load_state_dict(sd, strict=False, assign=True) + logger.info(f"Loaded Chroma: {info}") + is_schnell = False # Chroma is not schnell + return model_type, is_schnell, model - info = model.load_state_dict(sd, strict=False, assign=True) - logger.info(f"Loaded Flux: {info}") - return is_schnell, model + else: + raise ValueError(f"Unsupported model_type: {model_type}. 
Supported types are 'flux' and 'chroma'.") def load_ae( @@ -166,7 +200,7 @@ def load_controlnet( sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype) info = controlnet.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded ControlNet: {info}") - return controlnet + return controlnet def load_clip_l( From 404ddb060d04285d72ffff9342542eec71d9c352 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 20 Jul 2025 14:08:54 +0900 Subject: [PATCH 492/582] fix: inference for Chroma model --- flux_minimal_inference.py | 28 ++++++++++++++-------------- library/chroma_models.py | 9 +++++++-- library/flux_utils.py | 2 +- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index a7bff74db..550904d23 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -78,16 +78,19 @@ def denoise( neg_t5_attn_mask: Optional[torch.Tensor] = None, cfg_scale: Optional[float] = None, ): - # this is ignored for schnell + # prepare classifier free guidance logger.info(f"guidance: {guidance}, cfg_scale: {cfg_scale}") - guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) + do_cfg = neg_txt is not None and (cfg_scale is not None and cfg_scale != 1.0) - # prepare classifier free guidance - if neg_txt is not None and neg_vec is not None: + # this is ignored for schnell + guidance_vec = torch.full((img.shape[0] * (2 if do_cfg else 1),), guidance, device=img.device, dtype=img.dtype) + + if do_cfg: + print("Using classifier free guidance") b_img_ids = torch.cat([img_ids, img_ids], dim=0) b_txt_ids = torch.cat([txt_ids, txt_ids], dim=0) b_txt = torch.cat([neg_txt, txt], dim=0) - b_vec = torch.cat([neg_vec, vec], dim=0) + b_vec = torch.cat([neg_vec, vec], dim=0) if neg_vec is not None else None if t5_attn_mask is not None and neg_t5_attn_mask is not None: b_t5_attn_mask = torch.cat([neg_t5_attn_mask, t5_attn_mask], dim=0) else: @@ -103,17 +106,13 @@ def denoise( t_vec = torch.full((b_img_ids.shape[0],), t_curr, dtype=img.dtype, device=img.device) # classifier free guidance - if neg_txt is not None and neg_vec is not None: + if do_cfg: b_img = torch.cat([img, img], dim=0) else: b_img = img - # For Chroma model, y might be None, so create dummy tensor - if b_vec is None: - y_input = torch.zeros_like(b_txt[:, :1, :]) # dummy tensor - else: - y_input = b_vec - + y_input = b_vec + pred = model( img=b_img, img_ids=b_img_ids, @@ -126,7 +125,7 @@ def denoise( ) # classifier free guidance - if neg_txt is not None and neg_vec is not None: + if do_cfg: pred_uncond, pred = torch.chunk(pred, 2, dim=0) pred = pred_uncond + cfg_scale * (pred - pred_uncond) @@ -309,7 +308,7 @@ def encode(prpt: str): neg_l_pooled, neg_t5_out, neg_t5_attn_mask = None, None, None # NaN check - if torch.isnan(l_pooled).any(): + if l_pooled is not None and torch.isnan(l_pooled).any(): raise ValueError("NaN in l_pooled") if torch.isnan(t5_out).any(): raise ValueError("NaN in t5_out") @@ -329,6 +328,7 @@ def encode(prpt: str): img_ids = img_ids.to(device) t5_attn_mask = t5_attn_mask.to(device) if args.apply_t5_attn_mask else None + neg_t5_attn_mask = neg_t5_attn_mask.to(device) if neg_t5_attn_mask is not None and args.apply_t5_attn_mask else None x = do_sample( accelerator, diff --git a/library/chroma_models.py b/library/chroma_models.py index e1da751b0..f725db872 100644 --- a/library/chroma_models.py +++ b/library/chroma_models.py @@ -240,7 +240,7 @@ def _forward( k = torch.cat((txt_k, img_k), dim=2) v 
= torch.cat((txt_v, img_v), dim=2) - attn = attention(q, k, v, pe=pe, mask=mask) + attn = attention(q, k, v, pe=pe, attn_mask=mask) txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] # calculate the img bloks @@ -343,7 +343,7 @@ def _forward(self, x: Tensor, pe: Tensor, distill_vec: list[ModulationOut], mask q, k = self.norm(q, k, v) # compute attention - attn = attention(q, k, v, pe=pe, mask=mask) + attn = attention(q, k, v, pe=pe, attn_mask=mask) # compute activation in mlp stream, cat again and run second linear layer output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) # replaced with compiled fn @@ -555,6 +555,11 @@ def forward( guidance: Tensor | None = None, txt_attention_mask: Tensor | None = None, ) -> Tensor: + # print( + # f"Chroma forward: img shape {img.shape}, txt shape {txt.shape}, img_ids shape {img_ids.shape}, txt_ids shape {txt_ids.shape}" + # ) + # print(f"timesteps: {timesteps}, guidance: {guidance}") + if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") diff --git a/library/flux_utils.py b/library/flux_utils.py index a5cfcdfff..dda7c789d 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -146,7 +146,7 @@ def load_flow_model( from . import chroma_models # build model - logger.info("Building Chroma model from BFL checkpoint") + logger.info("Building Chroma model") with torch.device("meta"): model = chroma_models.Chroma(chroma_models.chroma_params) if dtype is not None: From 8fd0b12d1f8bcae52cb11f0ccd193d8382b06166 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 20 Jul 2025 16:00:58 +0900 Subject: [PATCH 493/582] feat: update DoubleStreamBlock and SingleStreamBlock to handle text sequence lengths instead of mask --- library/chroma_models.py | 242 +++++++++++++++++++++++++-------------- 1 file changed, 159 insertions(+), 83 deletions(-) diff --git a/library/chroma_models.py b/library/chroma_models.py index f725db872..06822a37b 100644 --- a/library/chroma_models.py +++ b/library/chroma_models.py @@ -211,9 +211,9 @@ def _forward( self, img: Tensor, txt: Tensor, - pe: Tensor, + pe: list[Tensor], distill_vec: list[ModulationOut], - mask: Tensor, + txt_seq_len: Tensor, ) -> tuple[Tensor, Tensor]: (img_mod1, img_mod2), (txt_mod1, txt_mod2) = distill_vec @@ -235,13 +235,58 @@ def _forward( txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) - # run actual attention - q = torch.cat((txt_q, img_q), dim=2) - k = torch.cat((txt_k, img_k), dim=2) - v = torch.cat((txt_v, img_v), dim=2) - - attn = attention(q, k, v, pe=pe, attn_mask=mask) - txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] + # run actual attention: we split the batch into each element + max_txt_len = txt_q.shape[-2] # max 512 + txt_q = list(torch.chunk(txt_q, txt_q.shape[0], dim=0)) # list of [B, H, L, D] tensors + txt_k = list(torch.chunk(txt_k, txt_k.shape[0], dim=0)) + txt_v = list(torch.chunk(txt_v, txt_v.shape[0], dim=0)) + img_q = list(torch.chunk(img_q, img_q.shape[0], dim=0)) + img_k = list(torch.chunk(img_k, img_k.shape[0], dim=0)) + img_v = list(torch.chunk(img_v, img_v.shape[0], dim=0)) + txt_attn = [] + img_attn = [] + for i in range(txt.shape[0]): + print(i) + print(f"len(txt_q) = {len(txt_q)}, len(img_q) = {len(img_q)}, txt_seq_len.shape = {txt_seq_len.shape}") + print(f"txt_seq_len[i] = {txt_seq_len[i]}, txt_q.shape = {txt_q[i].shape}, img_q.shape = {img_q[i].shape}") + txt_q_i = txt_q[i][:, :, 
: txt_seq_len[i]] + txt_q[i] = None + img_q_i = img_q[i] + img_q[i] = None + q = torch.cat((txt_q_i, img_q_i), dim=2) + del txt_q_i, img_q_i + + txt_k_i = txt_k[i][:, :, : txt_seq_len[i]] + txt_k[i] = None + img_k_i = img_k[i] + img_k[i] = None + k = torch.cat((txt_k_i, img_k_i), dim=2) + del txt_k_i, img_k_i + + txt_v_i = txt_v[i][:, :, : txt_seq_len[i]] + txt_v[i] = None + img_v_i = img_v[i] + img_v[i] = None + v = torch.cat((txt_v_i, img_v_i), dim=2) + del txt_v_i, img_v_i + + attn = attention(q, k, v, pe=pe[i], attn_mask=None) # (1, L, D) + print(f"attn.shape = {attn.shape}, txt_seq_len[i] = {txt_seq_len[i]}, max_txt_len = {max_txt_len}") + txt_attn_i = torch.zeros((1, max_txt_len, attn.shape[-1]), dtype=attn.dtype, device=self.device) + txt_attn_i[:, : txt_seq_len[i], :] = attn[:, : txt_seq_len[i], :] + img_attn_i = attn[:, txt_seq_len[i] :, :] + txt_attn.append(txt_attn_i) + img_attn.append(img_attn_i) + + txt_attn = torch.cat(txt_attn, dim=0) + img_attn = torch.cat(img_attn, dim=0) + + # q = torch.cat((txt_q, img_q), dim=2) + # k = torch.cat((txt_k, img_k), dim=2) + # v = torch.cat((txt_v, img_v), dim=2) + + # attn = attention(q, k, v, pe=pe, attn_mask=mask) + # txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] # calculate the img bloks # replaced with compiled fn @@ -273,12 +318,12 @@ def forward( txt: Tensor, pe: Tensor, distill_vec: list[ModulationOut], - mask: Tensor, + txt_seq_len: Tensor, ) -> tuple[Tensor, Tensor]: if self.training and self.gradient_checkpointing: - return ckpt.checkpoint(self._forward, img, txt, pe, distill_vec, mask, use_reentrant=False) + return ckpt.checkpoint(self._forward, img, txt, pe, distill_vec, txt_seq_len, use_reentrant=False) else: - return self._forward(img, txt, pe, distill_vec, mask) + return self._forward(img, txt, pe, distill_vec, txt_seq_len) class SingleStreamBlock(nn.Module): @@ -332,7 +377,9 @@ def enable_gradient_checkpointing(self): def disable_gradient_checkpointing(self): self.gradient_checkpointing = False - def _forward(self, x: Tensor, pe: Tensor, distill_vec: list[ModulationOut], mask: Tensor) -> Tensor: + def _forward( + self, x: Tensor, pe: list[Tensor], distill_vec: list[ModulationOut], txt_seq_len: Tensor, max_txt_len: int + ) -> Tensor: mod = distill_vec # replaced with compiled fn # x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift @@ -342,19 +389,44 @@ def _forward(self, x: Tensor, pe: Tensor, distill_vec: list[ModulationOut], mask q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) q, k = self.norm(q, k, v) - # compute attention - attn = attention(q, k, v, pe=pe, attn_mask=mask) + # # compute attention + # attn = attention(q, k, v, pe=pe, attn_mask=mask) + + # compute attention: we split the batch into each element + q = list(torch.chunk(q, q.shape[0], dim=0)) + k = list(torch.chunk(k, k.shape[0], dim=0)) + v = list(torch.chunk(v, v.shape[0], dim=0)) + attn = [] + for i in range(x.size(0)): + q_i = torch.cat((q[i][:, :, : txt_seq_len[i]], q[i][:, :, max_txt_len:]), dim=2) + q[i] = None + k_i = torch.cat((k[i][:, :, : txt_seq_len[i]], k[i][:, :, max_txt_len:]), dim=2) + k[i] = None + v_i = torch.cat((v[i][:, :, : txt_seq_len[i]], v[i][:, :, max_txt_len:]), dim=2) + v[i] = None + attn_trimmed = attention(q_i, k_i, v_i, pe=pe[i], attn_mask=None) + print( + f"attn_trimmed.shape = {attn_trimmed.shape}, txt_seq_len[i] = {txt_seq_len[i]}, max_txt_len = {max_txt_len}, x.shape = {x.shape}" + ) + + attn_i = torch.zeros((1, x.shape[1], attn_trimmed.shape[-1]), 
dtype=attn_trimmed.dtype, device=self.device) + attn_i[:, : txt_seq_len[i], :] = attn_trimmed[:, : txt_seq_len[i], :] + attn_i[:, max_txt_len:, :] = attn_trimmed[:, txt_seq_len[i] :, :] + attn.append(attn_i) + + attn = torch.cat(attn, dim=0) + # compute activation in mlp stream, cat again and run second linear layer output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) # replaced with compiled fn # return x + mod.gate * output return self.modulation_gate_fn(x, mod.gate, output) - def forward(self, x: Tensor, pe: Tensor, distill_vec: list[ModulationOut], mask: Tensor) -> Tensor: + def forward(self, x: Tensor, pe: Tensor, distill_vec: list[ModulationOut], txt_seq_len: Tensor, max_txt_len: int) -> Tensor: if self.training and self.gradient_checkpointing: - return ckpt.checkpoint(self._forward, x, pe, distill_vec, mask, use_reentrant=False) + return ckpt.checkpoint(self._forward, x, pe, distill_vec, txt_seq_len, max_txt_len, use_reentrant=False) else: - return self._forward(x, pe, distill_vec, mask) + return self._forward(x, pe, distill_vec, txt_seq_len, max_txt_len) class LastLayer(nn.Module): @@ -542,6 +614,29 @@ def __init__(self, params: ChromaParams): self.gradient_checkpointing = False self.cpu_offload_checkpointing = False + def get_mod_vectors( + self, + timesteps: Tensor, + guidance: Tensor | None = None, + batch_size: int | None = None, + requires_grad: bool = False, + ) -> Tensor: + distill_timestep = timestep_embedding(timesteps, self.approximator_in_dim // 4) + # TODO: need to add toggle to omit this from schnell but that's not a priority + distil_guidance = timestep_embedding(guidance, self.approximator_in_dim // 4) + # get all modulation index + modulation_index = timestep_embedding(self.mod_index, self.approximator_in_dim // 2) + # we need to broadcast the modulation index here so each batch has all of the index + modulation_index = modulation_index.unsqueeze(0).repeat(batch_size, 1, 1) + # and we need to broadcast timestep and guidance along too + timestep_guidance = torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, self.mod_index_length, 1) + # then and only then we could concatenate it together + input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1) + if requires_grad: + input_vec = input_vec.requires_grad_(True) + mod_vectors = self.distilled_guidance_layer(input_vec) + return mod_vectors + def forward( self, img: Tensor, @@ -554,6 +649,8 @@ def forward( block_controlnet_single_hidden_states=None, guidance: Tensor | None = None, txt_attention_mask: Tensor | None = None, + attn_padding: int = 1, + mod_vectors: Tensor | None = None, ) -> Tensor: # print( # f"Chroma forward: img shape {img.shape}, txt shape {txt.shape}, img_ids shape {img_ids.shape}, txt_ids shape {txt_ids.shape}" @@ -567,85 +664,64 @@ def forward( img = self.img_in(img) txt = self.txt_in(txt) - # TODO: - # need to fix grad accumulation issue here for now it's in no grad mode - # besides, i don't want to wash out the PFP that's trained on this model weights anyway - # the fan out operation here is deleting the backward graph - # alternatively doing forward pass for every block manually is doable but slow - # custom backward probably be better - with torch.no_grad(): - distill_timestep = timestep_embedding(timesteps, self.approximator_in_dim // 4) - # TODO: need to add toggle to omit this from schnell but that's not a priority - distil_guidance = timestep_embedding(guidance, self.approximator_in_dim // 4) - # get all modulation index - modulation_index = 
timestep_embedding(self.mod_index, self.approximator_in_dim // 2) - # we need to broadcast the modulation index here so each batch has all of the index - modulation_index = modulation_index.unsqueeze(0).repeat(img.shape[0], 1, 1) - # and we need to broadcast timestep and guidance along too - timestep_guidance = ( - torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, self.mod_index_length, 1) - ) - # then and only then we could concatenate it together - input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1) - mod_vectors = self.distilled_guidance_layer(input_vec.requires_grad_(True)) + if mod_vectors is None: + # TODO: + # need to fix grad accumulation issue here for now it's in no grad mode + # besides, i don't want to wash out the PFP that's trained on this model weights anyway + # the fan out operation here is deleting the backward graph + # alternatively doing forward pass for every block manually is doable but slow + # custom backward probably be better + with torch.no_grad(): + # kohya-ss: I'm not sure why requires_grad is set to True here + mod_vectors = self.get_mod_vectors(timesteps, guidance, img.shape[0], requires_grad=True) + mod_vectors_dict = distribute_modulations(mod_vectors, self.depth_single_blocks, self.depth_double_blocks) ids = torch.cat((txt_ids, img_ids), dim=1) - pe = self.pe_embedder(ids) - - # compute mask - # assume max seq length from the batched input - - max_len = txt.shape[1] - - # mask - with torch.no_grad(): - txt_mask_w_padding = modify_mask_to_attend_padding(txt_attention_mask, max_len, 1) - txt_img_mask = torch.cat( - [ - txt_mask_w_padding, - torch.ones([img.shape[0], img.shape[1]], device=txt_attention_mask.device), - ], - dim=1, - ) - txt_img_mask = txt_img_mask.float().T @ txt_img_mask.float() - txt_img_mask = txt_img_mask[None, None, ...].repeat(txt.shape[0], self.num_heads, 1, 1).int().bool() - # txt_mask_w_padding[txt_mask_w_padding==False] = True - - if not self.blocks_to_swap: - for i, block in enumerate(self.double_blocks): - # the guidance replaced by FFN output - img_mod = mod_vectors_dict[f"double_blocks.{i}.img_mod.lin"] - txt_mod = mod_vectors_dict[f"double_blocks.{i}.txt_mod.lin"] - double_mod = [img_mod, txt_mod] - - img, txt = block(img=img, txt=txt, pe=pe, distill_vec=double_mod, mask=txt_img_mask) - else: - for i, block in enumerate(self.double_blocks): + pe = self.pe_embedder(ids) # B, 1, seq_length, 64, 2, 2 + + # calculate text length for each batch instead of masking + txt_emb_len = txt.shape[1] + txt_seq_len = txt_attention_mask[:, :txt_emb_len].sum(dim=-1) # (batch_size, ) + txt_seq_len = torch.clip(txt_seq_len + attn_padding, 0, txt_emb_len) + max_txt_len = torch.max(txt_seq_len).item() # max text length in the batch + + # trim txt embedding to the text length + txt = txt[:, :max_txt_len, :] + + # split positional encoding into each element of the batch, and trim masked tokens + print(f"pe shape = {pe.shape} dtype = {pe.dtype}, txt_seq_len = {txt_seq_len}") + pe = list(torch.chunk(pe, pe.shape[0], dim=0)) + for i in range(len(pe)): + # trim positional encoding to the text length + pe[i] = torch.cat([pe[i][:, :, : txt_seq_len[i]], pe[i][:, :, txt_emb_len:]], dim=2) + + for i, block in enumerate(self.double_blocks): + if self.blocks_to_swap: self.offloader_double.wait_for_block(i) - # the guidance replaced by FFN output - img_mod = mod_vectors_dict[f"double_blocks.{i}.img_mod.lin"] - txt_mod = mod_vectors_dict[f"double_blocks.{i}.txt_mod.lin"] - double_mod = [img_mod, txt_mod] + # the 
guidance replaced by FFN output + img_mod = mod_vectors_dict[f"double_blocks.{i}.img_mod.lin"] + txt_mod = mod_vectors_dict[f"double_blocks.{i}.txt_mod.lin"] + double_mod = [img_mod, txt_mod] - img, txt = block(img=img, txt=txt, pe=pe, distill_vec=double_mod, mask=txt_img_mask) + img, txt = block(img=img, txt=txt, pe=pe, distill_vec=double_mod, txt_seq_len=txt_seq_len) + if self.blocks_to_swap: self.offloader_double.submit_move_blocks(self.double_blocks, i) img = torch.cat((txt, img), 1) - if not self.blocks_to_swap: - for i, block in enumerate(self.single_blocks): - single_mod = mod_vectors_dict[f"single_blocks.{i}.modulation.lin"] - img = block(img, pe=pe, distill_vec=single_mod, mask=txt_img_mask) - else: - for i, block in enumerate(self.single_blocks): + + for i, block in enumerate(self.single_blocks): + if self.blocks_to_swap: self.offloader_single.wait_for_block(i) - single_mod = mod_vectors_dict[f"single_blocks.{i}.modulation.lin"] - img = block(img, pe=pe, distill_vec=single_mod, mask=txt_img_mask) + single_mod = mod_vectors_dict[f"single_blocks.{i}.modulation.lin"] + img = block(img, pe=pe, distill_vec=single_mod, txt_seq_len=txt_seq_len, max_txt_len=max_txt_len) + if self.blocks_to_swap: self.offloader_single.submit_move_blocks(self.single_blocks, i) + img = img[:, txt.shape[1] :, ...] final_mod = mod_vectors_dict["final_layer.adaLN_modulation.1"] img = self.final_layer(img, distill_vec=final_mod) # (N, T, patch_size ** 2 * out_channels) From c4958b5dca0102b3f18fa2d2a383f177d508f872 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 20 Jul 2025 16:30:43 +0900 Subject: [PATCH 494/582] feat: change img/txt order for attention and single blocks --- library/chroma_models.py | 75 +++++++++++++++------------------------- 1 file changed, 28 insertions(+), 47 deletions(-) diff --git a/library/chroma_models.py b/library/chroma_models.py index 06822a37b..1b62f20f6 100644 --- a/library/chroma_models.py +++ b/library/chroma_models.py @@ -236,7 +236,8 @@ def _forward( txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) # run actual attention: we split the batch into each element - max_txt_len = txt_q.shape[-2] # max 512 + max_txt_len = torch.max(txt_seq_len).item() + img_len = img_q.shape[-2] # max 64 txt_q = list(torch.chunk(txt_q, txt_q.shape[0], dim=0)) # list of [B, H, L, D] tensors txt_k = list(torch.chunk(txt_k, txt_k.shape[0], dim=0)) txt_v = list(torch.chunk(txt_v, txt_v.shape[0], dim=0)) @@ -246,35 +247,25 @@ def _forward( txt_attn = [] img_attn = [] for i in range(txt.shape[0]): - print(i) - print(f"len(txt_q) = {len(txt_q)}, len(img_q) = {len(img_q)}, txt_seq_len.shape = {txt_seq_len.shape}") - print(f"txt_seq_len[i] = {txt_seq_len[i]}, txt_q.shape = {txt_q[i].shape}, img_q.shape = {img_q[i].shape}") - txt_q_i = txt_q[i][:, :, : txt_seq_len[i]] + txt_q[i] = txt_q[i][:, :, : txt_seq_len[i]] + q = torch.cat((img_q[i], txt_q[i]), dim=2) txt_q[i] = None - img_q_i = img_q[i] img_q[i] = None - q = torch.cat((txt_q_i, img_q_i), dim=2) - del txt_q_i, img_q_i - txt_k_i = txt_k[i][:, :, : txt_seq_len[i]] + txt_k[i] = txt_k[i][:, :, : txt_seq_len[i]] + k = torch.cat((img_k[i], txt_k[i]), dim=2) txt_k[i] = None - img_k_i = img_k[i] img_k[i] = None - k = torch.cat((txt_k_i, img_k_i), dim=2) - del txt_k_i, img_k_i - txt_v_i = txt_v[i][:, :, : txt_seq_len[i]] + txt_v[i] = txt_v[i][:, :, : txt_seq_len[i]] + v = torch.cat((img_v[i], txt_v[i]), dim=2) txt_v[i] = None - img_v_i = img_v[i] img_v[i] = None - v = torch.cat((txt_v_i, img_v_i), dim=2) - del txt_v_i, img_v_i - attn = attention(q, k, v, 
pe=pe[i], attn_mask=None) # (1, L, D) - print(f"attn.shape = {attn.shape}, txt_seq_len[i] = {txt_seq_len[i]}, max_txt_len = {max_txt_len}") + attn = attention(q, k, v, pe=pe[i : i + 1, :, : q.shape[2]], attn_mask=None) # attn = (1, L, D) + img_attn_i = attn[:, :img_len, :] txt_attn_i = torch.zeros((1, max_txt_len, attn.shape[-1]), dtype=attn.dtype, device=self.device) - txt_attn_i[:, : txt_seq_len[i], :] = attn[:, : txt_seq_len[i], :] - img_attn_i = attn[:, txt_seq_len[i] :, :] + txt_attn_i[:, : txt_seq_len[i], :] = attn[:, img_len:, :] txt_attn.append(txt_attn_i) img_attn.append(img_attn_i) @@ -377,9 +368,7 @@ def enable_gradient_checkpointing(self): def disable_gradient_checkpointing(self): self.gradient_checkpointing = False - def _forward( - self, x: Tensor, pe: list[Tensor], distill_vec: list[ModulationOut], txt_seq_len: Tensor, max_txt_len: int - ) -> Tensor: + def _forward(self, x: Tensor, pe: list[Tensor], distill_vec: list[ModulationOut], txt_seq_len: Tensor) -> Tensor: mod = distill_vec # replaced with compiled fn # x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift @@ -393,25 +382,23 @@ def _forward( # attn = attention(q, k, v, pe=pe, attn_mask=mask) # compute attention: we split the batch into each element + max_txt_len = torch.max(txt_seq_len).item() + img_len = q.shape[-2] - max_txt_len q = list(torch.chunk(q, q.shape[0], dim=0)) k = list(torch.chunk(k, k.shape[0], dim=0)) v = list(torch.chunk(v, v.shape[0], dim=0)) attn = [] for i in range(x.size(0)): - q_i = torch.cat((q[i][:, :, : txt_seq_len[i]], q[i][:, :, max_txt_len:]), dim=2) + q[i] = q[i][:, :, : img_len + txt_seq_len[i]] + k[i] = k[i][:, :, : img_len + txt_seq_len[i]] + v[i] = v[i][:, :, : img_len + txt_seq_len[i]] + attn_trimmed = attention(q[i], k[i], v[i], pe=pe[i : i + 1, :, : img_len + txt_seq_len[i]], attn_mask=None) q[i] = None - k_i = torch.cat((k[i][:, :, : txt_seq_len[i]], k[i][:, :, max_txt_len:]), dim=2) k[i] = None - v_i = torch.cat((v[i][:, :, : txt_seq_len[i]], v[i][:, :, max_txt_len:]), dim=2) v[i] = None - attn_trimmed = attention(q_i, k_i, v_i, pe=pe[i], attn_mask=None) - print( - f"attn_trimmed.shape = {attn_trimmed.shape}, txt_seq_len[i] = {txt_seq_len[i]}, max_txt_len = {max_txt_len}, x.shape = {x.shape}" - ) attn_i = torch.zeros((1, x.shape[1], attn_trimmed.shape[-1]), dtype=attn_trimmed.dtype, device=self.device) - attn_i[:, : txt_seq_len[i], :] = attn_trimmed[:, : txt_seq_len[i], :] - attn_i[:, max_txt_len:, :] = attn_trimmed[:, txt_seq_len[i] :, :] + attn_i[:, : img_len + txt_seq_len[i], :] = attn_trimmed attn.append(attn_i) attn = torch.cat(attn, dim=0) @@ -422,11 +409,11 @@ def _forward( # return x + mod.gate * output return self.modulation_gate_fn(x, mod.gate, output) - def forward(self, x: Tensor, pe: Tensor, distill_vec: list[ModulationOut], txt_seq_len: Tensor, max_txt_len: int) -> Tensor: + def forward(self, x: Tensor, pe: Tensor, distill_vec: list[ModulationOut], txt_seq_len: Tensor) -> Tensor: if self.training and self.gradient_checkpointing: - return ckpt.checkpoint(self._forward, x, pe, distill_vec, txt_seq_len, max_txt_len, use_reentrant=False) + return ckpt.checkpoint(self._forward, x, pe, distill_vec, txt_seq_len, use_reentrant=False) else: - return self._forward(x, pe, distill_vec, txt_seq_len, max_txt_len) + return self._forward(x, pe, distill_vec, txt_seq_len) class LastLayer(nn.Module): @@ -677,9 +664,6 @@ def forward( mod_vectors_dict = distribute_modulations(mod_vectors, self.depth_single_blocks, self.depth_double_blocks) - ids = torch.cat((txt_ids, img_ids), dim=1) - 
pe = self.pe_embedder(ids) # B, 1, seq_length, 64, 2, 2 - # calculate text length for each batch instead of masking txt_emb_len = txt.shape[1] txt_seq_len = txt_attention_mask[:, :txt_emb_len].sum(dim=-1) # (batch_size, ) @@ -689,12 +673,9 @@ def forward( # trim txt embedding to the text length txt = txt[:, :max_txt_len, :] - # split positional encoding into each element of the batch, and trim masked tokens - print(f"pe shape = {pe.shape} dtype = {pe.dtype}, txt_seq_len = {txt_seq_len}") - pe = list(torch.chunk(pe, pe.shape[0], dim=0)) - for i in range(len(pe)): - # trim positional encoding to the text length - pe[i] = torch.cat([pe[i][:, :, : txt_seq_len[i]], pe[i][:, :, txt_emb_len:]], dim=2) + # create positional encoding for the text and image + ids = torch.cat((img_ids, txt_ids[:, :max_txt_len]), dim=1) # reverse order of ids for faster attention + pe = self.pe_embedder(ids) # B, 1, seq_length, 64, 2, 2 for i, block in enumerate(self.double_blocks): if self.blocks_to_swap: @@ -710,19 +691,19 @@ def forward( if self.blocks_to_swap: self.offloader_double.submit_move_blocks(self.double_blocks, i) - img = torch.cat((txt, img), 1) + img = torch.cat((img, txt), 1) for i, block in enumerate(self.single_blocks): if self.blocks_to_swap: self.offloader_single.wait_for_block(i) single_mod = mod_vectors_dict[f"single_blocks.{i}.modulation.lin"] - img = block(img, pe=pe, distill_vec=single_mod, txt_seq_len=txt_seq_len, max_txt_len=max_txt_len) + img = block(img, pe=pe, distill_vec=single_mod, txt_seq_len=txt_seq_len) if self.blocks_to_swap: self.offloader_single.submit_move_blocks(self.single_blocks, i) - img = img[:, txt.shape[1] :, ...] + img = img[:, :-max_txt_len, ...] final_mod = mod_vectors_dict["final_layer.adaLN_modulation.1"] img = self.final_layer(img, distill_vec=final_mod) # (N, T, patch_size ** 2 * out_channels) return img From b4e862626aaba996ffe8b7f942ce5ce21d762919 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 20 Jul 2025 19:00:09 +0900 Subject: [PATCH 495/582] feat: add LoRA training support for Chroma --- flux_minimal_inference.py | 2 +- flux_train.py | 2 +- flux_train_control_net.py | 7 +- flux_train_network.py | 102 +++++++++------------ library/chroma_models.py | 50 ++++++---- library/flux_models.py | 177 +----------------------------------- library/flux_train_utils.py | 19 ++-- library/flux_utils.py | 43 ++++++++- library/sai_model_spec.py | 14 ++- library/train_util.py | 2 +- 10 files changed, 158 insertions(+), 260 deletions(-) diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index 550904d23..86e8e1b1f 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -468,7 +468,7 @@ def is_fp8(dt): # t5xxl = accelerator.prepare(t5xxl) # DiT - model_type, is_schnell, model = flux_utils.load_flow_model(args.ckpt_path, None, loading_device, model_type=args.model_type) + is_schnell, model = flux_utils.load_flow_model(args.ckpt_path, None, loading_device, model_type=args.model_type) model.eval() logger.info(f"Casting model to {flux_dtype}") model.to(flux_dtype) # make sure model is dtype diff --git a/flux_train.py b/flux_train.py index 1d2cc68b7..84db34cfd 100644 --- a/flux_train.py +++ b/flux_train.py @@ -270,7 +270,7 @@ def train(args): clean_memory_on_device(accelerator.device) # load FLUX - model_type, _, flux = flux_utils.load_flow_model( + _, flux = flux_utils.load_flow_model( args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors, model_type="flux" ) diff --git a/flux_train_control_net.py 
b/flux_train_control_net.py index 3c038c32a..93c20dabd 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -68,6 +68,11 @@ def train(args): if not args.skip_cache_check: args.skip_cache_check = args.skip_latents_validity_check + if args.model_type != "flux": + raise ValueError( + f"FLUX.1 ControlNet training requires model_type='flux'. / FLUX.1 ControlNetの学習にはmodel_type='flux'を指定してください。" + ) + # assert ( # not args.weighted_captions # ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません" @@ -258,7 +263,7 @@ def train(args): clean_memory_on_device(accelerator.device) # load FLUX - model_type, is_schnell, flux = flux_utils.load_flow_model( + is_schnell, flux = flux_utils.load_flow_model( args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors, model_type="flux" ) flux.requires_grad_(False) diff --git a/flux_train_network.py b/flux_train_network.py index b2bf8e7cf..1b61ac723 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -35,6 +35,7 @@ def __init__(self): self.sample_prompts_te_outputs = None self.is_schnell: Optional[bool] = None self.is_swapping_blocks: bool = False + self.model_type: Optional[str] = None def assert_extra_args( self, @@ -45,6 +46,12 @@ def assert_extra_args( super().assert_extra_args(args, train_dataset_group, val_dataset_group) # sdxl_train_util.verify_sdxl_training_args(args) + self.model_type = args.model_type # "flux" or "chroma" + if self.model_type != "chroma": + self.use_clip_l = True + else: + self.use_clip_l = False # Chroma does not use CLIP-L + if args.fp8_base_unet: args.fp8_base = True # if fp8_base_unet is enabled, fp8_base is also enabled for FLUX.1 @@ -60,7 +67,7 @@ def assert_extra_args( ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" # prepare CLIP-L/T5XXL training flags - self.train_clip_l = not args.network_train_unet_only + self.train_clip_l = not args.network_train_unet_only and self.use_clip_l self.train_t5xxl = False # default is False even if args.network_train_unet_only is False if args.max_token_length is not None: @@ -95,8 +102,12 @@ def load_target_model(self, args, weight_dtype, accelerator): loading_dtype = None if args.fp8_base else weight_dtype # if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future - self.model_type, self.is_schnell, model = flux_utils.load_flow_model( - args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors, model_type="flux" + _, model = flux_utils.load_flow_model( + args.pretrained_model_name_or_path, + loading_dtype, + "cpu", + disable_mmap=args.disable_mmap_load_safetensors, + model_type=self.model_type, ) if args.fp8_base: # check dtype of model @@ -120,7 +131,10 @@ def load_target_model(self, args, weight_dtype, accelerator): logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") model.enable_block_swap(args.blocks_to_swap, accelerator.device) - clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + if self.use_clip_l: + clip_l = flux_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + else: + clip_l = flux_utils.dummy_clip_l() # dummy CLIP-L for Chroma, which does not use CLIP-L 
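
flux_utils.dummy_clip_l() is referenced above but its definition does not appear in this hunk, so the sketch below is only an assumption about the minimal interface such a placeholder needs: the surrounding code still calls eval() on it and moves it between devices, while the Chroma path never consumes a CLIP-L output.

    import torch
    from torch import nn

    class DummyCLIPL(nn.Module):
        # Hypothetical stand-in, not the actual flux_utils.dummy_clip_l().
        # It only has to behave like a small frozen module so the generic
        # text-encoder handling (eval(), .to(device/dtype)) keeps working;
        # Chroma ignores the CLIP-L pooled output entirely.
        def __init__(self):
            super().__init__()
            self.dummy = nn.Parameter(torch.zeros(1), requires_grad=False)

        def forward(self, *args, **kwargs):
            return None  # no l_pooled is produced or used for Chroma

Keeping a placeholder object in the [clip_l, t5xxl] list means the rest of the text-encoder plumbing needs no model-specific branches.
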
clip_l.eval() # if the file is fp8 and we are using fp8_base (not unet), we can load it as is (fp8) @@ -141,13 +155,20 @@ def load_target_model(self, args, weight_dtype, accelerator): ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors) - return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model + model_version = flux_utils.MODEL_VERSION_FLUX_V1 if self.model_type != "chroma" else flux_utils.MODEL_VERSION_CHROMA + return model_version, [clip_l, t5xxl], ae, model def get_tokenize_strategy(self, args): - _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) + # This method is called before `assert_extra_args`, so we cannot use `self.is_schnell` here. + # Instead, we analyze the checkpoint state to determine if it is schnell. + if args.model_type != "chroma": + _, is_schnell, _, _ = flux_utils.analyze_checkpoint_state(args.pretrained_model_name_or_path) + else: + is_schnell = False + self.is_schnell = is_schnell if args.t5xxl_max_token_length is None: - if is_schnell: + if self.is_schnell: t5xxl_max_token_length = 256 else: t5xxl_max_token_length = 512 @@ -268,23 +289,6 @@ def cache_text_encoder_outputs_if_needed( text_encoders[0].to(accelerator.device, dtype=weight_dtype) text_encoders[1].to(accelerator.device) - # def call_unet(self, args, accelerator, unet, noisy_latents, timesteps, text_conds, batch, weight_dtype): - # noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype - - # # get size embeddings - # orig_size = batch["original_sizes_hw"] - # crop_size = batch["crop_top_lefts"] - # target_size = batch["target_sizes_hw"] - # embs = sdxl_train_util.get_size_embeddings(orig_size, crop_size, target_size, accelerator.device).to(weight_dtype) - - # # concat embeddings - # encoder_hidden_states1, encoder_hidden_states2, pool2 = text_conds - # vector_embedding = torch.cat([pool2, embs], dim=1).to(weight_dtype) - # text_embedding = torch.cat([encoder_hidden_states1, encoder_hidden_states2], dim=2).to(weight_dtype) - - # noise_pred = unet(noisy_latents, timesteps, text_embedding, vector_embedding) - # return noise_pred - def sample_images(self, accelerator, args, epoch, global_step, device, ae, tokenizer, text_encoder, flux): text_encoders = text_encoder # for compatibility text_encoders = self.get_models_for_text_encoding(args, accelerator, text_encoders) @@ -292,36 +296,6 @@ def sample_images(self, accelerator, args, epoch, global_step, device, ae, token flux_train_utils.sample_images( accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs ) - # return - - """ - class FluxUpperLowerWrapper(torch.nn.Module): - def __init__(self, flux_upper: flux_models.FluxUpper, flux_lower: flux_models.FluxLower, device: torch.device): - super().__init__() - self.flux_upper = flux_upper - self.flux_lower = flux_lower - self.target_device = device - - def prepare_block_swap_before_forward(self): - pass - - def forward(self, img, img_ids, txt, txt_ids, timesteps, y, guidance=None, txt_attention_mask=None): - self.flux_lower.to("cpu") - clean_memory_on_device(self.target_device) - self.flux_upper.to(self.target_device) - img, txt, vec, pe = self.flux_upper(img, img_ids, txt, txt_ids, timesteps, y, guidance, txt_attention_mask) - self.flux_upper.to("cpu") - clean_memory_on_device(self.target_device) - self.flux_lower.to(self.target_device) - return self.flux_lower(img, txt, vec, pe, txt_attention_mask) - - wrapper = 
FluxUpperLowerWrapper(self.flux_upper, flux, accelerator.device) - clean_memory_on_device(accelerator.device) - flux_train_utils.sample_images( - accelerator, args, epoch, global_step, wrapper, ae, text_encoders, self.sample_prompts_te_outputs - ) - clean_memory_on_device(accelerator.device) - """ def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) @@ -366,7 +340,11 @@ def get_noise_pred_and_target( # ensure guidance_scale in args is float guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) - # ensure the hidden state will require grad + # get modulation vectors for Chroma + input_vec = None + if self.model_type == "chroma": + input_vec = unet.get_input_vec(timesteps=timesteps, guidance=guidance_vec, batch_size=bsz) + if args.gradient_checkpointing: noisy_model_input.requires_grad_(True) for t in text_encoder_conds: @@ -374,13 +352,15 @@ def get_noise_pred_and_target( t.requires_grad_(True) img_ids.requires_grad_(True) guidance_vec.requires_grad_(True) + if input_vec is not None: + input_vec.requires_grad_(True) # Predict the noise residual l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds if not args.apply_t5_attn_mask: t5_attn_mask = None - def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask): + def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask, input_vec): # grad is enabled even if unet is not in train mode, because Text Encoder is in train mode with torch.set_grad_enabled(is_train), accelerator.autocast(): # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) @@ -393,6 +373,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t timesteps=timesteps / 1000, guidance=guidance_vec, txt_attention_mask=t5_attn_mask, + input_vec=input_vec, ) return model_pred @@ -405,6 +386,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t timesteps=timesteps, guidance_vec=guidance_vec, t5_attn_mask=t5_attn_mask, + input_vec=input_vec, ) # unpack latents @@ -436,6 +418,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t timesteps=timesteps[diff_output_pr_indices], guidance_vec=guidance_vec[diff_output_pr_indices] if guidance_vec is not None else None, t5_attn_mask=t5_attn_mask[diff_output_pr_indices] if t5_attn_mask is not None else None, + input_vec=input_vec[diff_output_pr_indices] if input_vec is not None else None, ) network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step @@ -454,9 +437,14 @@ def post_process_loss(self, loss, args, timesteps, noise_scheduler): return loss def get_sai_model_spec(self, args): - return train_util.get_sai_model_spec(None, args, False, True, False, flux="dev") + if self.model_type != "chroma": + model_description = "schnell" if self.is_schnell else "dev" + else: + model_description = "chroma" + return train_util.get_sai_model_spec(None, args, False, True, False, flux=model_description) def update_metadata(self, metadata, args): + metadata["ss_model_type"] = args.model_type metadata["ss_apply_t5_attn_mask"] = args.apply_t5_attn_mask metadata["ss_weighting_scheme"] = args.weighting_scheme metadata["ss_logit_mean"] = args.logit_mean diff 
--git a/library/chroma_models.py b/library/chroma_models.py index 1b62f20f6..e5d3b547a 100644 --- a/library/chroma_models.py +++ b/library/chroma_models.py @@ -601,13 +601,30 @@ def __init__(self, params: ChromaParams): self.gradient_checkpointing = False self.cpu_offload_checkpointing = False - def get_mod_vectors( - self, - timesteps: Tensor, - guidance: Tensor | None = None, - batch_size: int | None = None, - requires_grad: bool = False, - ) -> Tensor: + def get_model_type(self) -> str: + return "chroma" + + def enable_gradient_checkpointing(self, cpu_offload: bool = False): + self.gradient_checkpointing = True + self.cpu_offload_checkpointing = cpu_offload + + self.distilled_guidance_layer.enable_gradient_checkpointing() + for block in self.double_blocks + self.single_blocks: + block.enable_gradient_checkpointing() + + print(f"Chroma: Gradient checkpointing enabled.") + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False + + self.distilled_guidance_layer.disable_gradient_checkpointing() + for block in self.double_blocks + self.single_blocks: + block.disable_gradient_checkpointing() + + print("Chroma: Gradient checkpointing disabled.") + + def get_input_vec(self, timesteps: Tensor, guidance: Tensor | None = None, batch_size: int | None = None) -> Tensor: distill_timestep = timestep_embedding(timesteps, self.approximator_in_dim // 4) # TODO: need to add toggle to omit this from schnell but that's not a priority distil_guidance = timestep_embedding(guidance, self.approximator_in_dim // 4) @@ -619,10 +636,7 @@ def get_mod_vectors( timestep_guidance = torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, self.mod_index_length, 1) # then and only then we could concatenate it together input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1) - if requires_grad: - input_vec = input_vec.requires_grad_(True) - mod_vectors = self.distilled_guidance_layer(input_vec) - return mod_vectors + return input_vec def forward( self, @@ -637,7 +651,7 @@ def forward( guidance: Tensor | None = None, txt_attention_mask: Tensor | None = None, attn_padding: int = 1, - mod_vectors: Tensor | None = None, + input_vec: Tensor | None = None, ) -> Tensor: # print( # f"Chroma forward: img shape {img.shape}, txt shape {txt.shape}, img_ids shape {img_ids.shape}, txt_ids shape {txt_ids.shape}" @@ -651,7 +665,7 @@ def forward( img = self.img_in(img) txt = self.txt_in(txt) - if mod_vectors is None: + if input_vec is None: # TODO: # need to fix grad accumulation issue here for now it's in no grad mode # besides, i don't want to wash out the PFP that's trained on this model weights anyway @@ -659,14 +673,18 @@ def forward( # alternatively doing forward pass for every block manually is doable but slow # custom backward probably be better with torch.no_grad(): - # kohya-ss: I'm not sure why requires_grad is set to True here - mod_vectors = self.get_mod_vectors(timesteps, guidance, img.shape[0], requires_grad=True) + input_vec = self.get_input_vec(timesteps, guidance, img.shape[0]) + # kohya-ss: I'm not sure why requires_grad is set to True here + input_vec.requires_grad = True + mod_vectors = self.distilled_guidance_layer(input_vec) + else: + mod_vectors = self.distilled_guidance_layer(input_vec) mod_vectors_dict = distribute_modulations(mod_vectors, self.depth_single_blocks, self.depth_double_blocks) # calculate text length for each batch instead of masking txt_emb_len = txt.shape[1] - txt_seq_len = txt_attention_mask[:, 
:txt_emb_len].sum(dim=-1) # (batch_size, ) + txt_seq_len = txt_attention_mask[:, :txt_emb_len].sum(dim=-1).to(torch.int64) # (batch_size, ) txt_seq_len = torch.clip(txt_seq_len + attn_padding, 0, txt_emb_len) max_txt_len = torch.max(txt_seq_len).item() # max text length in the batch diff --git a/library/flux_models.py b/library/flux_models.py index 328ad481d..6f889755a 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -930,6 +930,9 @@ def __init__(self, params: FluxParams): self.num_double_blocks = len(self.double_blocks) self.num_single_blocks = len(self.single_blocks) + def get_model_type(self) -> str: + return "flux" + @property def device(self): return next(self.parameters()).device @@ -1018,6 +1021,7 @@ def forward( block_controlnet_single_hidden_states=None, guidance: Tensor | None = None, txt_attention_mask: Tensor | None = None, + input_vec: Tensor | None = None, ) -> Tensor: if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") @@ -1169,7 +1173,7 @@ def __init__(self, params: FluxParams, controlnet_depth=2, controlnet_single_dep nn.SiLU(), nn.Conv2d(16, 16, 3, padding=1, stride=2), nn.SiLU(), - zero_module(nn.Conv2d(16, 16, 3, padding=1)) + zero_module(nn.Conv2d(16, 16, 3, padding=1)), ) @property @@ -1320,174 +1324,3 @@ def forward( controlnet_single_block_samples = controlnet_single_block_samples + (block_sample,) return controlnet_block_samples, controlnet_single_block_samples - - -""" -class FluxUpper(nn.Module): - "" - Transformer model for flow matching on sequences. - "" - - def __init__(self, params: FluxParams): - super().__init__() - - self.params = params - self.in_channels = params.in_channels - self.out_channels = self.in_channels - if params.hidden_size % params.num_heads != 0: - raise ValueError(f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}") - pe_dim = params.hidden_size // params.num_heads - if sum(params.axes_dim) != pe_dim: - raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") - self.hidden_size = params.hidden_size - self.num_heads = params.num_heads - self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) - self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) - self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) - self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size) - self.guidance_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity() - self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size) - - self.double_blocks = nn.ModuleList( - [ - DoubleStreamBlock( - self.hidden_size, - self.num_heads, - mlp_ratio=params.mlp_ratio, - qkv_bias=params.qkv_bias, - ) - for _ in range(params.depth) - ] - ) - - self.gradient_checkpointing = False - - @property - def device(self): - return next(self.parameters()).device - - @property - def dtype(self): - return next(self.parameters()).dtype - - def enable_gradient_checkpointing(self): - self.gradient_checkpointing = True - - self.time_in.enable_gradient_checkpointing() - self.vector_in.enable_gradient_checkpointing() - if self.guidance_in.__class__ != nn.Identity: - self.guidance_in.enable_gradient_checkpointing() - - for block in self.double_blocks: - block.enable_gradient_checkpointing() - - print("FLUX: Gradient checkpointing enabled.") - - def disable_gradient_checkpointing(self): - self.gradient_checkpointing = False - - 
self.time_in.disable_gradient_checkpointing() - self.vector_in.disable_gradient_checkpointing() - if self.guidance_in.__class__ != nn.Identity: - self.guidance_in.disable_gradient_checkpointing() - - for block in self.double_blocks: - block.disable_gradient_checkpointing() - - print("FLUX: Gradient checkpointing disabled.") - - def forward( - self, - img: Tensor, - img_ids: Tensor, - txt: Tensor, - txt_ids: Tensor, - timesteps: Tensor, - y: Tensor, - guidance: Tensor | None = None, - txt_attention_mask: Tensor | None = None, - ) -> Tensor: - if img.ndim != 3 or txt.ndim != 3: - raise ValueError("Input img and txt tensors must have 3 dimensions.") - - # running on sequences img - img = self.img_in(img) - vec = self.time_in(timestep_embedding(timesteps, 256)) - if self.params.guidance_embed: - if guidance is None: - raise ValueError("Didn't get guidance strength for guidance distilled model.") - vec = vec + self.guidance_in(timestep_embedding(guidance, 256)) - vec = vec + self.vector_in(y) - txt = self.txt_in(txt) - - ids = torch.cat((txt_ids, img_ids), dim=1) - pe = self.pe_embedder(ids) - - for block in self.double_blocks: - img, txt = block(img=img, txt=txt, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - - return img, txt, vec, pe - - -class FluxLower(nn.Module): - "" - Transformer model for flow matching on sequences. - "" - - def __init__(self, params: FluxParams): - super().__init__() - self.hidden_size = params.hidden_size - self.num_heads = params.num_heads - self.out_channels = params.in_channels - - self.single_blocks = nn.ModuleList( - [ - SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio) - for _ in range(params.depth_single_blocks) - ] - ) - - self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) - - self.gradient_checkpointing = False - - @property - def device(self): - return next(self.parameters()).device - - @property - def dtype(self): - return next(self.parameters()).dtype - - def enable_gradient_checkpointing(self): - self.gradient_checkpointing = True - - for block in self.single_blocks: - block.enable_gradient_checkpointing() - - print("FLUX: Gradient checkpointing enabled.") - - def disable_gradient_checkpointing(self): - self.gradient_checkpointing = False - - for block in self.single_blocks: - block.disable_gradient_checkpointing() - - print("FLUX: Gradient checkpointing disabled.") - - def forward( - self, - img: Tensor, - txt: Tensor, - vec: Tensor | None = None, - pe: Tensor | None = None, - txt_attention_mask: Tensor | None = None, - ) -> Tensor: - img = torch.cat((txt, img), 1) - for block in self.single_blocks: - img = block(img, vec=vec, pe=pe, txt_attention_mask=txt_attention_mask) - img = img[:, txt.shape[1] :, ...] 
- - img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) - return img -""" diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index 8392e5592..f3eb81992 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -154,9 +154,8 @@ def sample_image_inference( sample_steps = prompt_dict.get("sample_steps", 20) width = prompt_dict.get("width", 512) height = prompt_dict.get("height", 512) - # TODO refactor variable names - cfg_scale = prompt_dict.get("guidance_scale", 1.0) - emb_guidance_scale = prompt_dict.get("scale", 3.5) + emb_guidance_scale = prompt_dict.get("guidance_scale", 3.5) + cfg_scale = prompt_dict.get("scale", 1.0) seed = prompt_dict.get("seed") controlnet_image = prompt_dict.get("controlnet_image") prompt: str = prompt_dict.get("prompt", "") @@ -242,7 +241,7 @@ def encode_prompt(prpt): dtype=weight_dtype, generator=torch.Generator(device=accelerator.device).manual_seed(seed) if seed is not None else None, ) - timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) # FLUX.1 dev -> shift=True + timesteps = get_schedule(sample_steps, noise.shape[1], shift=True) # Chroma can use shift=True img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype) t5_attn_mask = t5_attn_mask.to(accelerator.device) if args.apply_t5_attn_mask else None @@ -403,8 +402,8 @@ def denoise( y=torch.cat([neg_l_pooled, vec], dim=0), block_controlnet_hidden_states=block_samples, block_controlnet_single_hidden_states=block_single_samples, - timesteps=t_vec, - guidance=guidance_vec, + timesteps=t_vec.repeat(2), + guidance=guidance_vec.repeat(2), txt_attention_mask=nc_c_t5_attn_mask, ) neg_pred, pred = torch.chunk(nc_c_pred, 2, dim=0) @@ -680,3 +679,11 @@ def add_flux_train_arguments(parser: argparse.ArgumentParser): default=3.0, help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。", ) + + parser.add_argument( + "--model_type", + type=str, + choices=["flux", "chroma"], + default="flux", + help="Model type to use for training / トレーニングに使用するモデルタイプ:flux or chroma (default: flux)", + ) diff --git a/library/flux_utils.py b/library/flux_utils.py index dda7c789d..3f0a0d63e 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -23,6 +23,7 @@ MODEL_VERSION_FLUX_V1 = "flux1" MODEL_NAME_DEV = "dev" MODEL_NAME_SCHNELL = "schnell" +MODEL_VERSION_CHROMA = "chroma" def analyze_checkpoint_state(ckpt_path: str) -> Tuple[bool, bool, Tuple[int, int], List[str]]: @@ -97,7 +98,7 @@ def load_flow_model( device: Union[str, torch.device], disable_mmap: bool = False, model_type: str = "flux", -) -> Tuple[str, bool, flux_models.Flux]: +) -> Tuple[bool, flux_models.Flux]: if model_type == "flux": is_diffusers, is_schnell, (num_double_blocks, num_single_blocks), ckpt_paths = analyze_checkpoint_state(ckpt_path) name = MODEL_NAME_DEV if not is_schnell else MODEL_NAME_SCHNELL @@ -140,7 +141,7 @@ def load_flow_model( info = model.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded Flux: {info}") - return model_type, is_schnell, model + return is_schnell, model elif model_type == "chroma": from . import chroma_models @@ -166,7 +167,7 @@ def load_flow_model( info = model.load_state_dict(sd, strict=False, assign=True) logger.info(f"Loaded Chroma: {info}") is_schnell = False # Chroma is not schnell - return model_type, is_schnell, model + return is_schnell, model else: raise ValueError(f"Unsupported model_type: {model_type}. 
Supported types are 'flux' and 'chroma'.")
@@ -203,6 +204,42 @@ def load_controlnet(
     return controlnet
 
 
+def dummy_clip_l() -> torch.nn.Module:
+    """
+    Returns a dummy CLIP-L model with the output shape of (N, 77, 768).
+    """
+    return DummyCLIPL()
+
+
+class DummyTextModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.embeddings = torch.nn.Parameter(torch.zeros(1))
+
+
+class DummyCLIPL(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.output_shape = (77, 768)  # matches the (N, 77, 768) dummy output described above
+        self.dummy_param = torch.nn.Parameter(torch.zeros(1))  # get dtype and device from this parameter
+        self.text_model = DummyTextModel()
+
+    @property
+    def device(self):
+        return self.dummy_param.device
+
+    @property
+    def dtype(self):
+        return self.dummy_param.dtype
+
+    def forward(self, *args, **kwargs):
+        """
+        Returns a dummy output with the shape of (N, 77, 768).
+        """
+        batch_size = args[0].shape[0] if args else 1
+        return {"pooler_output": torch.zeros(batch_size, *self.output_shape, device=self.device, dtype=self.dtype)}
+
+
 def load_clip_l(
     ckpt_path: Optional[str],
     dtype: torch.dtype,
diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py
index 8896c047e..662a6b2ee 100644
--- a/library/sai_model_spec.py
+++ b/library/sai_model_spec.py
@@ -60,6 +60,8 @@
 ARCH_SD3_M = "stable-diffusion-3"  # may be followed by "-m" or "-5-large" etc.
 # ARCH_SD3_UNKNOWN = "stable-diffusion-3"
 ARCH_FLUX_1_DEV = "flux-1-dev"
+ARCH_FLUX_1_SCHNELL = "flux-1-schnell"
+ARCH_FLUX_1_CHROMA = "chroma"  # for Flux Chroma
 ARCH_FLUX_1_UNKNOWN = "flux-1"
 
 ADAPTER_LORA = "lora"
@@ -69,6 +71,7 @@
 IMPL_COMFY_UI = "https://github.com/comfyanonymous/ComfyUI"
 IMPL_DIFFUSERS = "diffusers"
 IMPL_FLUX = "https://github.com/black-forest-labs/flux"
+IMPL_CHROMA = "https://huggingface.co/lodestones/Chroma"
 
 PRED_TYPE_EPSILON = "epsilon"
 PRED_TYPE_V = "v"
@@ -125,7 +128,7 @@ def build_metadata(
     flux: Optional[str] = None,
 ):
     """
-    sd3: only supports "m", flux: only supports "dev"
+    sd3: only supports "m", flux: supports "dev", "schnell" or "chroma"
     """
 
     # if state_dict is None, hash is not calculated
@@ -144,6 +147,10 @@ def build_metadata(
     elif flux is not None:
         if flux == "dev":
             arch = ARCH_FLUX_1_DEV
+        elif flux == "schnell":
+            arch = ARCH_FLUX_1_SCHNELL
+        elif flux == "chroma":
+            arch = ARCH_FLUX_1_CHROMA
         else:
             arch = ARCH_FLUX_1_UNKNOWN
     elif v2:
@@ -166,7 +173,10 @@ def build_metadata(
 
     if flux is not None:
         # Flux
-        impl = IMPL_FLUX
+        if flux == "chroma":
+            impl = IMPL_CHROMA
+        else:
+            impl = IMPL_FLUX
     elif (lora and sdxl) or textual_inversion or is_stable_diffusion_ckpt:
         # Stable Diffusion ckpt, TI, SDXL LoRA
         impl = IMPL_STABILITY_AI
diff --git a/library/train_util.py b/library/train_util.py
index 36d419fd2..b09963fb1 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -3482,7 +3482,7 @@ def get_sai_model_spec(
     textual_inversion: bool,
     is_stable_diffusion_ckpt: Optional[bool] = None,  # None for TI and LoRA
     sd3: str = None,
-    flux: str = None,
+    flux: str = None,  # "dev", "schnell" or "chroma"
 ):
     timestamp = time.time()
 
From 0b763ef1f17fc9117b630c3478c6ae02437ac07e Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Sun, 20 Jul 2025 20:53:06 +0900
Subject: [PATCH 496/582] feat: fix timestep for input_vec for Chroma

---
 flux_train_network.py    |  4 +---
 library/chroma_models.py | 36 ++++++++++++++++++++++++++++++------
 library/flux_models.py   |  3 +++
 3 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/flux_train_network.py
b/flux_train_network.py index 1b61ac723..13e9ae2a2 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -341,9 +341,7 @@ def get_noise_pred_and_target( guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) # get modulation vectors for Chroma - input_vec = None - if self.model_type == "chroma": - input_vec = unet.get_input_vec(timesteps=timesteps, guidance=guidance_vec, batch_size=bsz) + input_vec = unet.get_input_vec(timesteps=timesteps / 1000, guidance=guidance_vec, batch_size=bsz) if args.gradient_checkpointing: noisy_model_input.requires_grad_(True) diff --git a/library/chroma_models.py b/library/chroma_models.py index e5d3b547a..b9c54db41 100644 --- a/library/chroma_models.py +++ b/library/chroma_models.py @@ -223,7 +223,10 @@ def _forward( # img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift img_modulated = self.modulation_shift_scale_fn(img_modulated, img_mod1.scale, img_mod1.shift) img_qkv = self.img_attn.qkv(img_modulated) + del img_modulated + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + del img_qkv img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) # prepare txt for attention @@ -232,7 +235,10 @@ def _forward( # txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift txt_modulated = self.modulation_shift_scale_fn(txt_modulated, txt_mod1.scale, txt_mod1.shift) txt_qkv = self.txt_attn.qkv(txt_modulated) + del txt_modulated + txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + del txt_qkv txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) # run actual attention: we split the batch into each element @@ -263,9 +269,11 @@ def _forward( img_v[i] = None attn = attention(q, k, v, pe=pe[i : i + 1, :, : q.shape[2]], attn_mask=None) # attn = (1, L, D) + del q, k, v img_attn_i = attn[:, :img_len, :] txt_attn_i = torch.zeros((1, max_txt_len, attn.shape[-1]), dtype=attn.dtype, device=self.device) txt_attn_i[:, : txt_seq_len[i], :] = attn[:, img_len:, :] + del attn txt_attn.append(txt_attn_i) img_attn.append(img_attn_i) @@ -279,27 +287,31 @@ def _forward( # attn = attention(q, k, v, pe=pe, attn_mask=mask) # txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] - # calculate the img bloks + # calculate the img blocks # replaced with compiled fn # img = img + img_mod1.gate * self.img_attn.proj(img_attn) # img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) img = self.modulation_gate_fn(img, img_mod1.gate, self.img_attn.proj(img_attn)) + del img_attn, img_mod1 img = self.modulation_gate_fn( img, img_mod2.gate, self.img_mlp(self.modulation_shift_scale_fn(self.img_norm2(img), img_mod2.scale, img_mod2.shift)), ) + del img_mod2 - # calculate the txt bloks + # calculate the txt blocks # replaced with compiled fn # txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) # txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) txt = self.modulation_gate_fn(txt, txt_mod1.gate, self.txt_attn.proj(txt_attn)) + del txt_attn, txt_mod1 txt = self.modulation_gate_fn( txt, txt_mod2.gate, self.txt_mlp(self.modulation_shift_scale_fn(self.txt_norm2(txt), txt_mod2.scale, txt_mod2.shift)), ) + del txt_mod2 return img, txt @@ -374,8 +386,10 @@ def _forward(self, x: Tensor, pe: list[Tensor], distill_vec: list[ModulationOut] # x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift x_mod = 
self.modulation_shift_scale_fn(self.pre_norm(x), mod.scale, mod.shift) qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) + del x_mod q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + del qkv q, k = self.norm(q, k, v) # # compute attention @@ -399,12 +413,15 @@ def _forward(self, x: Tensor, pe: list[Tensor], distill_vec: list[ModulationOut] attn_i = torch.zeros((1, x.shape[1], attn_trimmed.shape[-1]), dtype=attn_trimmed.dtype, device=self.device) attn_i[:, : img_len + txt_seq_len[i], :] = attn_trimmed + del attn_trimmed attn.append(attn_i) attn = torch.cat(attn, dim=0) # compute activation in mlp stream, cat again and run second linear layer - output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + mlp = self.mlp_act(mlp) + output = self.linear2(torch.cat((attn, mlp), 2)) + del attn, mlp # replaced with compiled fn # return x + mod.gate * output return self.modulation_gate_fn(x, mod.gate, output) @@ -625,6 +642,7 @@ def disable_gradient_checkpointing(self): print("Chroma: Gradient checkpointing disabled.") def get_input_vec(self, timesteps: Tensor, guidance: Tensor | None = None, batch_size: int | None = None) -> Tensor: + # print(f"Chroma get_input_vec: timesteps {timesteps}, guidance: {guidance}, batch_size: {batch_size}") distill_timestep = timestep_embedding(timesteps, self.approximator_in_dim // 4) # TODO: need to add toggle to omit this from schnell but that's not a priority distil_guidance = timestep_embedding(guidance, self.approximator_in_dim // 4) @@ -656,6 +674,7 @@ def forward( # print( # f"Chroma forward: img shape {img.shape}, txt shape {txt.shape}, img_ids shape {img_ids.shape}, txt_ids shape {txt_ids.shape}" # ) + # print(f"input_vec shape: {input_vec.shape if input_vec is not None else 'None'}") # print(f"timesteps: {timesteps}, guidance: {guidance}") if img.ndim != 3 or txt.ndim != 3: @@ -687,6 +706,7 @@ def forward( txt_seq_len = txt_attention_mask[:, :txt_emb_len].sum(dim=-1).to(torch.int64) # (batch_size, ) txt_seq_len = torch.clip(txt_seq_len + attn_padding, 0, txt_emb_len) max_txt_len = torch.max(txt_seq_len).item() # max text length in the batch + # print(f"max_txt_len: {max_txt_len}, txt_seq_len: {txt_seq_len}") # trim txt embedding to the text length txt = txt[:, :max_txt_len, :] @@ -700,23 +720,27 @@ def forward( self.offloader_double.wait_for_block(i) # the guidance replaced by FFN output - img_mod = mod_vectors_dict[f"double_blocks.{i}.img_mod.lin"] - txt_mod = mod_vectors_dict[f"double_blocks.{i}.txt_mod.lin"] + img_mod = mod_vectors_dict.pop(f"double_blocks.{i}.img_mod.lin") + txt_mod = mod_vectors_dict.pop(f"double_blocks.{i}.txt_mod.lin") double_mod = [img_mod, txt_mod] + del img_mod, txt_mod img, txt = block(img=img, txt=txt, pe=pe, distill_vec=double_mod, txt_seq_len=txt_seq_len) + del double_mod if self.blocks_to_swap: self.offloader_double.submit_move_blocks(self.double_blocks, i) img = torch.cat((img, txt), 1) + del txt for i, block in enumerate(self.single_blocks): if self.blocks_to_swap: self.offloader_single.wait_for_block(i) - single_mod = mod_vectors_dict[f"single_blocks.{i}.modulation.lin"] + single_mod = mod_vectors_dict.pop(f"single_blocks.{i}.modulation.lin") img = block(img, pe=pe, distill_vec=single_mod, txt_seq_len=txt_seq_len) + del single_mod if self.blocks_to_swap: self.offloader_single.submit_move_blocks(self.single_blocks, i) diff --git a/library/flux_models.py b/library/flux_models.py index 6f889755a..2a2fe5f86 100644 --- a/library/flux_models.py +++ 
b/library/flux_models.py @@ -1009,6 +1009,9 @@ def prepare_block_swap_before_forward(self): self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) + def get_input_vec(self, timesteps: Tensor, guidance: Tensor | None = None, batch_size: int | None = None) -> Tensor: + return None # FLUX.1 does not use input_vec, but Chroma does. + def forward( self, img: Tensor, From 77a160d8867422ffdf7be34d8879fe29e05a8040 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 20 Jul 2025 21:25:43 +0900 Subject: [PATCH 497/582] fix: skip LoRA creation for None text encoders (CLIP-L for Chroma) --- networks/lora_flux.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 0b30f1b8a..ddc916089 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -892,6 +892,9 @@ def create_modules( skipped_te = [] for i, text_encoder in enumerate(text_encoders): index = i + if text_encoder is None: + logger.info(f"Text Encoder {index+1} is None, skipping LoRA creation for this encoder.") + continue if not train_t5xxl and index > 0: # 0: CLIP, 1: T5XXL, so we skip T5XXL if train_t5xxl is False break From aec7e160949d900f709fe3c10a8602362dc097f2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 21 Jul 2025 13:14:59 +0900 Subject: [PATCH 498/582] feat: add an option to add system prompt for negative in lumina inference --- lumina_minimal_inference.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lumina_minimal_inference.py b/lumina_minimal_inference.py index 87dc9a194..47d6d30b9 100644 --- a/lumina_minimal_inference.py +++ b/lumina_minimal_inference.py @@ -48,7 +48,7 @@ def generate_image( steps: int, guidance_scale: float, negative_prompt: Optional[str], - args, + args: argparse.Namespace, cfg_trunc_ratio: float = 0.25, renorm_cfg: float = 1.0, ): @@ -88,7 +88,9 @@ def generate_image( with torch.no_grad(): gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2], tokens_and_masks) - tokens_and_masks = tokenize_strategy.tokenize(negative_prompt, is_negative=True) + tokens_and_masks = tokenize_strategy.tokenize( + negative_prompt, is_negative=True and not args.add_system_prompt_to_negative_prompt + ) with torch.no_grad(): neg_gemma2_conds = encoding_strategy.encode_tokens(tokenize_strategy, [gemma2], tokens_and_masks) @@ -215,6 +217,7 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument("--device", type=str, default=None, help="Device to use (e.g., 'cuda:0')") parser.add_argument("--offload", action="store_true", help="Offload models to CPU to save VRAM") parser.add_argument("--system_prompt", type=str, default="", help="System prompt for Gemma2 model") + parser.add_argument("--add_system_prompt_to_negative_prompt", action="store_true", help="Add system prompt to negative prompt") parser.add_argument( "--gemma2_max_token_length", type=int, @@ -231,7 +234,7 @@ def setup_parser() -> argparse.ArgumentParser: "--cfg_trunc_ratio", type=float, default=0.25, - help="The ratio of the timestep interval to apply normalization-based guidance scale. For example, 0.25 means the first 25% of timesteps will be guided.", + help="The ratio of the timestep interval to apply normalization-based guidance scale. 
For example, 0.25 means the first 25%% of timesteps will be guided.", ) parser.add_argument( "--renorm_cfg", From d300f19045e8c87bd5dd2dcd9f3cf84571f80206 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 21 Jul 2025 13:15:09 +0900 Subject: [PATCH 499/582] docs: update Lumina training guide to include inference script and options --- docs/lumina_train_network.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/lumina_train_network.md b/docs/lumina_train_network.md index cb3b600f6..5f2fda172 100644 --- a/docs/lumina_train_network.md +++ b/docs/lumina_train_network.md @@ -1,5 +1,3 @@ -Status: reviewed - # LoRA Training Guide for Lumina Image 2.0 using `lumina_train_network.py` / `lumina_train_network.py` を用いたLumina Image 2.0モデルのLoRA学習ガイド This document explains how to train LoRA (Low-Rank Adaptation) models for Lumina Image 2.0 using `lumina_train_network.py` in the `sd-scripts` repository. @@ -198,6 +196,7 @@ For Lumina Image 2.0, you can specify different dimensions for various component
日本語 + [`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のLumina Image 2.0特有の引数を指定します。共通の引数については、上記ガイドを参照してください。 #### モデル関連 @@ -250,6 +249,18 @@ After setting the required arguments, run the command to begin training. The ove When training finishes, a LoRA model file (e.g. `my_lumina_lora.safetensors`) is saved in the directory specified by `output_dir`. Use this file with inference environments that support Lumina Image 2.0, such as ComfyUI with appropriate nodes. +### Inference with scripts in this repository / このリポジトリのスクリプトを使用した推論 + +The inference script is also available. The script is `lumina_minimal_inference.py`. See `--help` for options. + +``` +python lumina_minimal_inference.py --pretrained_model_name_or_path path/to/lumina.safetensors --gemma2_path path/to/gemma.safetensors" --ae_path path/to/flux_ae.safetensors --output_dir path/to/output_dir --offload --seed 1234 --prompt "Positive prompt" --system_prompt "You are an assistant designed to generate high-quality images based on user prompts." --negative_prompt "negative prompt" +``` + +`--add_system_prompt_to_negative_prompt` option can be used to add the system prompt to the negative prompt. + +`--lora_weights` option can be used to specify the LoRA weights file, and optional multiplier (like `path;1.0`). + ## 6. Others / その他 `lumina_train_network.py` shares many features with `train_network.py`, such as sample image generation (`--sample_prompts`, etc.) and detailed optimizer settings. For these, see the [train_network.py guide](train_network.md#5-other-features--その他の機能) or run `python lumina_train_network.py --help`. @@ -279,6 +290,8 @@ Sample prompts can include CFG truncate (`--ctr`) and Renorm CFG (`-rcfg`) param 学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_lumina_lora.safetensors`)が保存されます。このファイルは、Lumina Image 2.0モデルに対応した推論環境(例: ComfyUI + 適切なノード)で使用できます。 +当リポジトリ内の推論スクリプトを用いて推論することも可能です。スクリプトは`lumina_minimal_inference.py`です。オプションは`--help`で確認できます。記述例は英語版のドキュメントをご確認ください。 + `lumina_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python lumina_train_network.py --help`) を参照してください。 ### 6.1. 推奨設定 From 518545bffbd8b2629944b9d3c65e6e77f167e7ce Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 21 Jul 2025 13:16:42 +0900 Subject: [PATCH 500/582] docs: add support information for Lumina-Image 2.0 in recent updates --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 149f453b9..b6365644d 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,10 @@ If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed ### Recent Updates +Jul 21, 2025: +- Support for [Lumina-Image 2.0](https://github.com/Alpha-VLLM/Lumina-Image-2.0) has been added in PR [#1927](https://github.com/kohya-ss/sd-scripts/pull/1927) and [#2138](https://github.com/kohya-ss/sd-scripts/pull/2138). Special thanks to sdbds and RockerBOO for their contributions. + - Please refer to the [Lumina-Image 2.0 documentation](./docs/lumina_train_network.md) for more details. + Jul 10, 2025: - [AI Coding Agents](#for-developers-using-ai-coding-agents) section is added to the README. This section provides instructions for developers using AI coding agents like Claude and Gemini to understand the project context and coding standards. 
From c84a163b3231e97cea77292551fa8b3967d2594a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 21 Jul 2025 13:40:03 +0900 Subject: [PATCH 501/582] docs: update README for documentation --- README.md | 11 ++++++++++- docs/lumina_train_network.md | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b6365644d..3ef165931 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,16 @@ If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed Jul 21, 2025: - Support for [Lumina-Image 2.0](https://github.com/Alpha-VLLM/Lumina-Image-2.0) has been added in PR [#1927](https://github.com/kohya-ss/sd-scripts/pull/1927) and [#2138](https://github.com/kohya-ss/sd-scripts/pull/2138). Special thanks to sdbds and RockerBOO for their contributions. - Please refer to the [Lumina-Image 2.0 documentation](./docs/lumina_train_network.md) for more details. - +- We have started adding comprehensive training-related documentation to [docs](./docs). These documents are being created with the help of generative AI and will be updated over time. While there are still many gaps at this stage, we plan to improve them gradually. + + Currently, the following documents are available: + - train_network.md + - sdxl_train_network.md + - sdxl_train_network_advanced.md + - flux_train_network.md + - sd3_train_network.md + - lumina_train_network.md + Jul 10, 2025: - [AI Coding Agents](#for-developers-using-ai-coding-agents) section is added to the README. This section provides instructions for developers using AI coding agents like Claude and Gemini to understand the project context and coding standards. diff --git a/docs/lumina_train_network.md b/docs/lumina_train_network.md index 5f2fda172..3f0548d9c 100644 --- a/docs/lumina_train_network.md +++ b/docs/lumina_train_network.md @@ -6,7 +6,7 @@ This document explains how to train LoRA (Low-Rank Adaptation) models for Lumina `lumina_train_network.py` trains additional networks such as LoRA for Lumina Image 2.0 models. Lumina Image 2.0 adopts a Next-DiT (Next-generation Diffusion Transformer) architecture, which differs from previous Stable Diffusion models. It uses a single text encoder (Gemma2) and a dedicated AutoEncoder (AE). -This guide assumes you already understand the basics of LoRA training. For common usage and options, see the train_network.py guide (to be documented). Some parameters are similar to those in [`sd3_train_network.py`](sd3_train_network.md) and [`flux_train_network.py`](flux_train_network.md). +This guide assumes you already understand the basics of LoRA training. For common usage and options, see [the train_network.py guide](./train_network.md). Some parameters are similar to those in [`sd3_train_network.py`](sd3_train_network.md) and [`flux_train_network.py`](flux_train_network.md). 
**Prerequisites:** From 32f06012a750737699bc4872173c9e960f000980 Mon Sep 17 00:00:00 2001 From: kohya-ss Date: Mon, 21 Jul 2025 21:48:06 +0900 Subject: [PATCH 502/582] doc: update flux train document and add about breaking changes in sample generation prompts --- README-ja.md | 13 +- README.md | 12 +- docs/flux_train_network.md | 654 +++++++++++++++++++++---------------- 3 files changed, 380 insertions(+), 299 deletions(-) diff --git a/README-ja.md b/README-ja.md index 60249f61e..c310dd8ad 100644 --- a/README-ja.md +++ b/README-ja.md @@ -155,11 +155,12 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b `#` で始まる行はコメントになります。`--n` のように「ハイフン二個+英小文字」の形でオプションを指定できます。以下が使用可能できます。 - * `--n` Negative prompt up to the next option. - * `--w` Specifies the width of the generated image. - * `--h` Specifies the height of the generated image. - * `--d` Specifies the seed of the generated image. - * `--l` Specifies the CFG scale of the generated image. - * `--s` Specifies the number of steps in the generation. + * `--n` ネガティブプロンプト(次のオプションまで) + * `--w` 生成画像の幅を指定 + * `--h` 生成画像の高さを指定 + * `--d` 生成画像のシード値を指定 + * `--l` 生成画像のCFGスケールを指定。FLUX.1モデルでは、デフォルトは `1.0` でCFGなしを意味します。Chromaモデルでは、CFGを有効にするために `4.0` 程度に設定してください + * `--g` 埋め込みガイダンス付きモデル(FLUX.1)の埋め込みガイダンススケールを指定、デフォルトは `3.5`。Chromaモデルでは `0.0` に設定してください + * `--s` 生成時のステップ数を指定 `( )` や `[ ]` などの重みづけも動作します。 diff --git a/README.md b/README.md index 3ef165931..9ba1cbfc1 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,13 @@ If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed ### Recent Updates +Jul XX, 2025: +- **Breaking Change**: For FLUX.1 and Chroma training, the CFG scale option for sample image generation during training has been changed from `--g` to `--l`. The `--g` option is now used for the embedded guidance scale. Please update your prompts accordingly. See [Sample Image Generation During Training](#sample-image-generation-during-training) for details. + +- Support for [Chroma](https://huggingface.co/lodestones/Chroma) has been added in PR [#2157](https://github.com/kohya-ss/sd-scripts/pull/2157). Thank you to lodestones for the high-quality model. + - Chroma is a new model based on FLUX.1 schnell. In this repository, `flux_train_network.py` is used for training LoRAs for Chroma with `--model_type chroma`. + - Please refer to the [FLUX.1 LoRA training documentation](./docs/flux_train_network.md) for more details. + Jul 21, 2025: - Support for [Lumina-Image 2.0](https://github.com/Alpha-VLLM/Lumina-Image-2.0) has been added in PR [#1927](https://github.com/kohya-ss/sd-scripts/pull/1927) and [#2138](https://github.com/kohya-ss/sd-scripts/pull/2138). Special thanks to sdbds and RockerBOO for their contributions. - Please refer to the [Lumina-Image 2.0 documentation](./docs/lumina_train_network.md) for more details. @@ -1367,9 +1374,8 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b * `--w` Specifies the width of the generated image. * `--h` Specifies the height of the generated image. * `--d` Specifies the seed of the generated image. - * `--l` Specifies the CFG scale of the generated image. - * In guidance distillation models like FLUX.1, this value is used as the embedded guidance scale for backward compatibility. - * `--g` Specifies the CFG scale for the models with embedded guidance scale. The default is `1.0`, `1.0` means no CFG. In general, should not be changed unless you train the un-distilled FLUX.1 models. 
+ * `--l` Specifies the CFG scale of the generated image. For FLUX.1 models, the default is `1.0`, which means no CFG. For Chroma models, set to around `4.0` to enable CFG. + * `--g` Specifies the embedded guidance scale for the models with embedded guidance (FLUX.1), the default is `3.5`. Set to `0.0` for Chroma models. * `--s` Specifies the number of steps in the generation. The prompt weighting such as `( )` and `[ ]` are working. diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 2b7ff7499..f324b9594 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -4,6 +4,13 @@ Status: reviewed This document explains how to train LoRA models for the FLUX.1 model using `flux_train_network.py` included in the `sd-scripts` repository. +
+日本語 + +このドキュメントでは、`sd-scripts`リポジトリに含まれる`flux_train_network.py`を使用して、FLUX.1モデルに対するLoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 + +
+ ## 1. Introduction / はじめに `flux_train_network.py` trains additional networks such as LoRA on the FLUX.1 model, which uses a transformer-based architecture different from Stable Diffusion. Two text encoders, CLIP-L and T5-XXL, and a dedicated AutoEncoder are used. @@ -15,21 +22,73 @@ This guide assumes you know the basics of LoRA training. For common options see * The repository is cloned and the Python environment is ready. * A training dataset is prepared. See the dataset configuration guide. +
+日本語 + +`flux_train_network.py`は、FLUX.1モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。FLUX.1はStable Diffusionとは異なるアーキテクチャを持つ画像生成モデルであり、このスクリプトを使用することで、特定のキャラクターや画風を再現するLoRAモデルを作成できます。 + +このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。また一部のパラメータは [`sdxl_train_network.py`](sdxl_train_network.md) と同様のものがあるため、そちらも参考にしてください。 + +**前提条件:** + +* `sd-scripts`リポジトリのクローンとPython環境のセットアップが完了していること。 +* 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](link/to/dataset/config/doc)を参照してください) + +
+ ## 2. Differences from `train_network.py` / `train_network.py` との違い -`flux_train_network.py` is based on `train_network.py` but adapted for FLUX.1. Main differences include required arguments for the FLUX.1 model, CLIP-L, T5-XXL and AE, different model structure, and some incompatible options from Stable Diffusion. +`flux_train_network.py` is based on `train_network.py` but adapted for FLUX.1. Main differences include: + +* **Target model:** FLUX.1 model (dev or schnell version). +* **Model structure:** Unlike Stable Diffusion, FLUX.1 uses a Transformer-based architecture with two text encoders (CLIP-L and T5-XXL) and a dedicated AutoEncoder (AE) instead of VAE. +* **Required arguments:** Additional arguments for FLUX.1 model, CLIP-L, T5-XXL, and AE model files. +* **Incompatible options:** Some Stable Diffusion-specific arguments (e.g., `--v2`, `--clip_skip`, `--max_token_length`) are not used in FLUX.1 training. +* **FLUX.1-specific arguments:** Additional arguments for FLUX.1-specific training parameters like timestep sampling and guidance scale. + +
+日本語 + +`flux_train_network.py`は`train_network.py`をベースに、FLUX.1モデルに対応するための変更が加えられています。主な違いは以下の通りです。 + +* **対象モデル:** FLUX.1モデル(dev版またはschnell版)を対象とします。 +* **モデル構造:** Stable Diffusionとは異なり、FLUX.1はTransformerベースのアーキテクチャを持ちます。Text EncoderとしてCLIP-LとT5-XXLの二つを使用し、VAEの代わりに専用のAutoEncoder (AE) を使用します。 +* **必須の引数:** FLUX.1モデル、CLIP-L、T5-XXL、AEの各モデルファイルを指定する引数が追加されています。 +* **一部引数の非互換性:** Stable Diffusion向けの引数の一部(例: `--v2`, `--clip_skip`, `--max_token_length`)はFLUX.1の学習では使用されません。 +* **FLUX.1特有の引数:** タイムステップのサンプリング方法やガイダンススケールなど、FLUX.1特有の学習パラメータを指定する引数が追加されています。 + +
## 3. Preparation / 準備 Before starting training you need: 1. **Training script:** `flux_train_network.py` -2. **FLUX.1 model file** and text encoder files (`clip_l`, `t5xxl`) and AE file. -3. **Dataset definition file (.toml)** such as `my_flux_dataset_config.toml`. +2. **FLUX.1 model file:** Base FLUX.1 model `.safetensors` file (e.g., `flux1-dev.safetensors`). +3. **Text Encoder model files:** + - CLIP-L model `.safetensors` file (e.g., `clip_l.safetensors`) + - T5-XXL model `.safetensors` file (e.g., `t5xxl.safetensors`) +4. **AutoEncoder model file:** FLUX.1-compatible AE model `.safetensors` file (e.g., `ae.safetensors`). +5. **Dataset definition file (.toml):** TOML format file describing training dataset configuration (e.g., `my_flux_dataset_config.toml`). + +
+日本語 + +学習を開始する前に、以下のファイルが必要です。 + +1. **学習スクリプト:** `flux_train_network.py` +2. **FLUX.1モデルファイル:** 学習のベースとなるFLUX.1モデルの`.safetensors`ファイル(例: `flux1-dev.safetensors`)。 +3. **Text Encoderモデルファイル:** + - CLIP-Lモデルの`.safetensors`ファイル。例として`clip_l.safetensors`を使用します。 + - T5-XXLモデルの`.safetensors`ファイル。例として`t5xxl.safetensors`を使用します。 +4. **AutoEncoderモデルファイル:** FLUX.1に対応するAEモデルの`.safetensors`ファイル。例として`ae.safetensors`を使用します。 +5. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。例として`my_flux_dataset_config.toml`を使用します。 + +
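Before launching a long `accelerate` run, it can save time to verify that every file listed above actually exists. The following is a small optional helper, not part of the repository; all paths are placeholders for your own files.

```python
from pathlib import Path

# Placeholder paths -- replace with the actual locations of your files.
required_files = {
    "FLUX.1 model": "models/flux1-dev.safetensors",
    "CLIP-L": "models/clip_l.safetensors",
    "T5-XXL": "models/t5xxl.safetensors",
    "AutoEncoder": "models/ae.safetensors",
    "dataset config": "my_flux_dataset_config.toml",
}

for name, path in required_files.items():
    status = "ok" if Path(path).is_file() else "MISSING"
    print(f"{name:15s} {path} ... {status}")
```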
## 4. Running the Training / 学習の実行 -Run `flux_train_network.py` from the terminal with FLUX.1 specific arguments. Example: +Run `flux_train_network.py` from the terminal with FLUX.1 specific arguments. Here's a basic command example: ```bash accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py \ @@ -54,369 +113,318 @@ accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py \ --gradient_checkpointing \ --guidance_scale=1.0 \ --timestep_sampling="flux_shift" \ + --model_prediction_type="raw" \ --blocks_to_swap=18 \ --cache_text_encoder_outputs \ --cache_latents ``` -### 4.1. Explanation of Key Options / 主要なコマンドライン引数の解説 - -The script adds FLUX.1 specific arguments such as guidance scale, timestep sampling, block swapping, and options for training CLIP-L and T5-XXL LoRA modules. Some Stable Diffusion options like `--v2` and `--clip_skip` are not used. +### Training Chroma Models -### 4.2. Starting Training / 学習の開始 - -Training begins once you run the command with the required options. Log checking is the same as in `train_network.py`. - -## 5. Using the Trained Model / 学習済みモデルの利用 - -After training, a LoRA model file is saved in `output_dir` and can be used in inference environments supporting FLUX.1 (e.g. ComfyUI + Flux nodes). +If you want to train a Chroma model, specify `--model_type=chroma`. Chroma does not use CLIP-L, so the `--clip_l` argument is not needed. T5XXL and AE are same as FLUX.1. The command would look like this: -## 6. Others / その他 +```bash +accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py \ + --pretrained_model_name_or_path="" \ + --model_type=chroma \ + --t5xxl="" \ + --ae="" \ + --dataset_config="my_flux_dataset_config.toml" \ + --output_dir="" \ + --output_name="my_chroma_lora" \ + --guidance_scale=0.0 \ + --timestep_sampling="sigmoid" \ + --apply_t5_attn_mask \ + ... +``` -Additional notes on VRAM optimization, training options, multi-resolution datasets, block selection and text encoder LoRA are provided in the Japanese section. +Note that for Chroma models, `--guidance_scale=0.0` is required to disable guidance scale, and `--apply_t5_attn_mask` is needed to apply attention masks for T5XXL Text Encoder.
日本語 +学習は、ターミナルから`flux_train_network.py`を実行することで開始します。基本的なコマンドラインの構造は`train_network.py`と同様ですが、FLUX.1特有の引数を指定する必要があります。 +コマンドラインの例は英語のドキュメントを参照してください。 -# `flux_train_network.py` を用いたFLUX.1モデルのLoRA学習ガイド - -このドキュメントでは、`sd-scripts`リポジトリに含まれる`flux_train_network.py`を使用して、FLUX.1モデルに対するLoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 - -## 1. はじめに - -`flux_train_network.py`は、FLUX.1モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。FLUX.1はStable Diffusionとは異なるアーキテクチャを持つ画像生成モデルであり、このスクリプトを使用することで、特定のキャラクターや画風を再現するLoRAモデルを作成できます。 - -このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。また一部のパラメータは [`sdxl_train_network.py`](sdxl_train_network.md) と同様のものがあるため、そちらも参考にしてください。 - -**前提条件:** - -* `sd-scripts`リポジトリのクローンとPython環境のセットアップが完了していること。 -* 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](link/to/dataset/config/doc)を参照してください) - -## 2. `train_network.py` との違い - -`flux_train_network.py`は`train_network.py`をベースに、FLUX.1モデルに対応するための変更が加えられています。主な違いは以下の通りです。 - -* **対象モデル:** FLUX.1モデル(dev版またはschnell版)を対象とします。 -* **モデル構造:** Stable Diffusionとは異なり、FLUX.1はTransformerベースのアーキテクチャを持ちます。Text EncoderとしてCLIP-LとT5-XXLの二つを使用し、VAEの代わりに専用のAutoEncoder (AE) を使用します。 -* **必須の引数:** FLUX.1モデル、CLIP-L、T5-XXL、AEの各モデルファイルを指定する引数が追加されています。 -* **一部引数の非互換性:** Stable Diffusion向けの引数の一部(例: `--v2`, `--clip_skip`, `--max_token_length`)はFLUX.1の学習では使用されません。 -* **FLUX.1特有の引数:** タイムステップのサンプリング方法やガイダンススケールなど、FLUX.1特有の学習パラメータを指定する引数が追加されています。 - -## 3. 準備 - -学習を開始する前に、以下のファイルが必要です。 +#### Chromaモデルの学習 -1. **学習スクリプト:** `flux_train_network.py` -2. **FLUX.1モデルファイル:** 学習のベースとなるFLUX.1モデルの`.safetensors`ファイル(例: `flux1-dev.safetensors`)。 -3. **Text Encoderモデルファイル:** - * CLIP-Lモデルの`.safetensors`ファイル。例として`clip_l.safetensors`を使用します。 - * T5-XXLモデルの`.safetensors`ファイル。例として`t5xxl.safetensors`を使用します。 -4. **AutoEncoderモデルファイル:** FLUX.1に対応するAEモデルの`.safetensors`ファイル。例として`ae.safetensors`を使用します。 -5. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。 +Chromaモデルを学習したい場合は、`--model_type=chroma`を指定します。ChromaはCLIP-Lを使用しないため、`--clip_l`引数は不要です。T5XXLとAEはFLUX.1と同様です。 - * 例として`my_flux_dataset_config.toml`を使用します。 +コマンドラインの例は英語のドキュメントを参照してください。 -## 4. 学習の実行 +
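The reason `--apply_t5_attn_mask` matters is that T5-XXL pads every prompt to a fixed token length, and the attention mask marks which positions hold real tokens. The sketch below illustrates only the general idea (masking padded positions and measuring per-sample prompt length); it is not the script's actual implementation.

```python
import torch

# Toy stand-in for T5 output: batch of 2 prompts, max length 6, hidden size 16.
txt = torch.randn(2, 6, 16)
attn_mask = torch.tensor([[1, 1, 1, 0, 0, 0],
                          [1, 1, 1, 1, 1, 0]])

txt = txt * attn_mask.unsqueeze(-1)   # zero out embeddings at padded positions
seq_len = attn_mask.sum(dim=-1)       # number of real tokens per prompt
print(seq_len.tolist())               # [3, 5]
```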
-学習は、ターミナルから`flux_train_network.py`を実行することで開始します。基本的なコマンドラインの構造は`train_network.py`と同様ですが、FLUX.1特有の引数を指定する必要があります。 +### 4.1. Explanation of Key Options / 主要なコマンドライン引数の解説 -以下に、基本的なコマンドライン実行例を示します。 +The script adds FLUX.1 specific arguments. For common arguments (like `--output_dir`, `--output_name`, `--network_module`, etc.), see the [`train_network.py` guide](train_network.md). + +#### Model-related [Required] + +* `--pretrained_model_name_or_path=""` **[Required]** + - Specifies the path to the base FLUX.1 or Chroma model `.safetensors` file. Diffusers format directories are not currently supported. +* `--model_type=` + - Specifies the type of base model for training. Choose from `flux` or `chroma`. Default is `flux`. +* `--clip_l=""` **[Required when flux is selected]** + - Specifies the path to the CLIP-L Text Encoder model `.safetensors` file. Not needed when `--model_type=chroma`. +* `--t5xxl=""` **[Required]** + - Specifies the path to the T5-XXL Text Encoder model `.safetensors` file. +* `--ae=""` **[Required]** + - Specifies the path to the FLUX.1-compatible AutoEncoder model `.safetensors` file. + +#### FLUX.1 Training Parameters + +* `--guidance_scale=` + - FLUX.1 dev version is distilled with specific guidance scale values, but for training, specify `1.0` to disable guidance scale. Default is `3.5`, so be sure to specify this. Usually ignored for schnell version. + - Chroma requires `--guidance_scale=0.0` to disable guidance scale. +* `--timestep_sampling=` + - Specifies the sampling method for timesteps (noise levels) during training. Choose from `sigma`, `uniform`, `sigmoid`, `shift`, `flux_shift`. Default is `sigma`. Recommended is `flux_shift`. For Chroma models, `sigmoid` is recommended. +* `--sigmoid_scale=` + - Scale factor when `timestep_sampling` is set to `sigmoid`, `shift`, or `flux_shift`. Default and recommended value is `1.0`. +* `--model_prediction_type=` + - Specifies what the model predicts. Choose from `raw` (use prediction as-is), `additive` (add to noise input), `sigma_scaled` (apply sigma scaling). Default is `sigma_scaled`. Recommended is `raw`. +* `--discrete_flow_shift=` + - Specifies the shift value for the scheduler used in Flow Matching. Default is `3.0`. This value is ignored when `timestep_sampling` is set to other than `shift`. + +#### Memory/Speed Related + +* `--fp8_base` + - Enables training in FP8 format for FLUX.1, CLIP-L, and T5-XXL. This can significantly reduce VRAM usage, but the training results may vary. +* `--blocks_to_swap=` **[Experimental Feature]** + - Setting to reduce VRAM usage by swapping parts of the model (Transformer blocks) between CPU and GPU. Specify the number of blocks to swap as an integer (e.g., `18`). Larger values reduce VRAM usage but decrease training speed. Adjust according to your GPU's VRAM capacity. Can be used with `gradient_checkpointing`. + - Cannot be used with `--cpu_offload_checkpointing`. +* `--cache_text_encoder_outputs` + - Caches the outputs of CLIP-L and T5-XXL. This reduces memory usage. +* `--cache_latents`, `--cache_latents_to_disk` + - Caches the outputs of AE. Similar functionality to [sdxl_train_network.py](sdxl_train_network.md). 
-```bash -accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py - --pretrained_model_name_or_path="" - --clip_l="" - --t5xxl="" - --ae="" - --dataset_config="my_flux_dataset_config.toml" - --output_dir="" - --output_name="my_flux_lora" - --save_model_as=safetensors - --network_module=networks.lora_flux - --network_dim=16 - --network_alpha=1 - --learning_rate=1e-4 - --optimizer_type="AdamW8bit" - --lr_scheduler="constant" - --sdpa - --max_train_epochs=10 - --save_every_n_epochs=1 - --mixed_precision="fp16" - --gradient_checkpointing - --guidance_scale=1.0 - --timestep_sampling="flux_shift" - --blocks_to_swap=18 - --cache_text_encoder_outputs - --cache_latents -``` +#### Incompatible/Deprecated Arguments -※実際には1行で書くか、適切な改行文字(`\` または `^`)を使用してください。 +* `--v2`, `--v_parameterization`, `--clip_skip`: These are Stable Diffusion-specific arguments and are not used in FLUX.1 training. +* `--max_token_length`: This is an argument for Stable Diffusion v1/v2. For FLUX.1, use `--t5xxl_max_token_length`. +* `--split_mode`: Deprecated argument. Use `--blocks_to_swap` instead. -### 4.1. 主要なコマンドライン引数の解説(`train_network.py`からの追加・変更点) +
+日本語 [`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のFLUX.1特有の引数を指定します。共通の引数(`--output_dir`, `--output_name`, `--network_module`, `--network_dim`, `--network_alpha`, `--learning_rate`など)については、上記ガイドを参照してください。 -#### モデル関連 [必須] - -* `--pretrained_model_name_or_path=""` **[必須]** - * 学習のベースとなるFLUX.1モデル(dev版またはschnell版)の`.safetensors`ファイルのパスを指定します。Diffusers形式のディレクトリは現在サポートされていません。 -* `--clip_l=""` **[必須]** - * CLIP-L Text Encoderモデルの`.safetensors`ファイルのパスを指定します。 -* `--t5xxl=""` **[必須]** - * T5-XXL Text Encoderモデルの`.safetensors`ファイルのパスを指定します。 -* `--ae=""` **[必須]** - * FLUX.1に対応するAutoEncoderモデルの`.safetensors`ファイルのパスを指定します。 - -#### FLUX.1 学習パラメータ - -* `--guidance_scale=` - * FLUX.1 dev版は特定のガイダンススケール値で蒸留されていますが、学習時には `1.0` を指定してガイダンススケールを無効化します。デフォルトは`3.5`ですので、必ず指定してください。schnell版では通常無視されます。 -* `--timestep_sampling=` - * 学習時に使用するタイムステップ(ノイズレベル)のサンプリング方法を指定します。`sigma`, `uniform`, `sigmoid`, `shift`, `flux_shift` から選択します。デフォルトは `sigma` です。推奨は `flux_shift` です。 -* `--sigmoid_scale=` - * `timestep_sampling` に `sigmoid` または `shift`, `flux_shift` を指定した場合のスケール係数です。デフォルトおよび推奨値は`1.0`です。 -* `--model_prediction_type=` - * モデルが何を予測するかを指定します。`raw` (予測値をそのまま使用), `additive` (ノイズ入力に加算), `sigma_scaled` (シグマスケーリングを適用) から選択します。デフォルトは `sigma_scaled` です。推奨は `raw` です。 -* `--discrete_flow_shift=` - * Flow Matchingで使用されるスケジューラのシフト値を指定します。デフォルトは`3.0`です。`timestep_sampling`に`flux_shift`を指定した場合は、この値は無視されます。 - -#### メモリ・速度関連 - -* `--blocks_to_swap=` **[実験的機能]** - * VRAM使用量を削減するために、モデルの一部(Transformerブロック)をCPUとGPU間でスワップする設定です。スワップするブロック数を整数で指定します(例: `18`)。値を大きくするとVRAM使用量は減りますが、学習速度は低下します。GPUのVRAM容量に応じて調整してください。`gradient_checkpointing`と併用可能です。 - * `--cpu_offload_checkpointing`とは併用できません。 -* `--cache_text_encoder_outputs` - * CLIP-LおよびT5-XXLの出力をキャッシュします。これにより、メモリ使用量が削減されます。 -* `--cache_latents`, `--cache_latents_to_disk` - * AEの出力をキャッシュします。[sdxl_train_network.py](sdxl_train_network.md)と同様の機能です。 +コマンドラインの例と詳細な引数の説明は英語のドキュメントを参照してください。 -#### 非互換・非推奨の引数 +
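To make the `--guidance_scale` argument less abstract: for the guidance-distilled FLUX.1 dev model, this scalar is not used for classifier-free guidance at training time but is embedded and fed to the network as conditioning. The sketch below shows that idea only; the `sinusoidal_embedding` helper and the 256-dimensional size are illustrative stand-ins, not the repository's actual code.

```python
import math
import torch

def sinusoidal_embedding(x: torch.Tensor, dim: int, max_period: float = 10000.0) -> torch.Tensor:
    # Standard sinusoidal embedding of one scalar per batch element -> (batch, dim).
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, dtype=torch.float32) / half)
    angles = x.float()[:, None] * freqs[None, :]
    return torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)

batch_size = 4
guidance_scale = 1.0  # value recommended for FLUX.1 dev training (0.0 for Chroma)
guidance_vec = torch.full((batch_size,), guidance_scale)
guidance_emb = sinusoidal_embedding(guidance_vec, 256)
print(guidance_emb.shape)  # torch.Size([4, 256])
```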
-* `--v2`, `--v_parameterization`, `--clip_skip`: Stable Diffusion特有の引数のため、FLUX.1学習では使用されません。 -* `--max_token_length`: Stable Diffusion v1/v2向けの引数です。FLUX.1では`--t5xxl_max_token_length`を使用してください。 -* `--split_mode`: 非推奨の引数です。代わりに`--blocks_to_swap`を使用してください。 +### 4.2. Starting Training / 学習の開始 -### 4.2. 学習の開始 +Training begins once you run the command with the required options. Log checking is the same as in [`train_network.py`](train_network.md#32-starting-the-training--学習の開始). + +
+日本語 必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。 -## 5. 学習済みモデルの利用 +
-学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_flux_lora.safetensors`)が保存されます。このファイルは、FLUX.1モデルに対応した推論環境(例: ComfyUI + ComfyUI-FluxNodes)で使用できます。 +## 5. Using the Trained Model / 学習済みモデルの利用 -## 6. その他 +After training, a LoRA model file is saved in `output_dir` and can be used in inference environments supporting FLUX.1 (e.g. ComfyUI + Flux nodes). -`flux_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python flux_train_network.py --help`) を参照してください。 +
+日本語 -# FLUX.1 LoRA学習の補足説明 +学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_flux_lora.safetensors`)が保存されます。このファイルは、FLUX.1モデルに対応した推論環境(例: ComfyUI + ComfyUI-FluxNodes)で使用できます。 -以下は、以上の基本的なFLUX.1 LoRAの学習手順を補足するものです。より詳細な設定オプションなどについて説明します。 +
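Besides loading the file in an inference UI, you can quickly check what a finished LoRA file contains by listing its tensors and metadata. Below is a small sketch using the `safetensors` package; the file name is a placeholder.

```python
from safetensors import safe_open

path = "output/my_flux_lora.safetensors"  # placeholder

with safe_open(path, framework="pt") as f:
    keys = list(f.keys())
    print(f"{len(keys)} tensors")
    print("metadata:", f.metadata())
    for key in keys[:10]:  # show a few entries with their shapes
        print(key, tuple(f.get_tensor(key).shape))
```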
-## 1. VRAM使用量の最適化 +## 6. Advanced Settings / 高度な設定 -FLUX.1モデルは比較的大きなモデルであるため、十分なVRAMを持たないGPUでは工夫が必要です。以下に、VRAM使用量を削減するための設定を紹介します。 +### 6.1. VRAM Usage Optimization / VRAM使用量の最適化 -### 1.1 メモリ使用量別の推奨設定 +FLUX.1 is a relatively large model, so GPUs without sufficient VRAM require optimization. Here are settings to reduce VRAM usage (with `--fp8_base`): -| GPUメモリ | 推奨設定 | -|----------|----------| -| 24GB VRAM | 基本設定で問題なく動作します(バッチサイズ2) | -| 16GB VRAM | バッチサイズ1に設定し、`--blocks_to_swap`を使用 | -| 12GB VRAM | `--blocks_to_swap 16`と8bit AdamWを使用 | -| 10GB VRAM | `--blocks_to_swap 22`を使用、T5XXLはfp8形式を推奨 | -| 8GB VRAM | `--blocks_to_swap 28`を使用、T5XXLはfp8形式を推奨 | +#### Recommended Settings by GPU Memory -### 1.2 主要なVRAM削減オプション +| GPU Memory | Recommended Settings | +|------------|---------------------| +| 24GB VRAM | Basic settings work fine (batch size 2) | +| 16GB VRAM | Set batch size to 1 and use `--blocks_to_swap` | +| 12GB VRAM | Use `--blocks_to_swap 16` and 8bit AdamW | +| 10GB VRAM | Use `--blocks_to_swap 22`, recommend fp8 format for T5XXL | +| 8GB VRAM | Use `--blocks_to_swap 28`, recommend fp8 format for T5XXL | -- **`--blocks_to_swap <数値>`**: - CPUとGPU間でブロックをスワップしてVRAM使用量を削減します。数値が大きいほど多くのブロックをスワップし、より多くのVRAMを節約できますが、学習速度は低下します。FLUX.1では最大35ブロックまでスワップ可能です。 +#### Key VRAM Reduction Options -- **`--cpu_offload_checkpointing`**: - 勾配チェックポイントをCPUにオフロードします。これにより最大1GBのVRAM使用量を削減できますが、学習速度は約15%低下します。`--blocks_to_swap`とは併用できません。 +- **`--fp8_base`**: Enables training in FP8 format. -- **`--cache_text_encoder_outputs` / `--cache_text_encoder_outputs_to_disk`**: - CLIP-LとT5-XXLの出力をキャッシュします。これによりメモリ使用量を削減できます。 +- **`--blocks_to_swap `**: Swaps blocks between CPU and GPU to reduce VRAM usage. Higher numbers save more VRAM but reduce training speed. FLUX.1 supports up to 35 blocks for swapping. -- **`--cache_latents` / `--cache_latents_to_disk`**: - AEの出力をキャッシュします。メモリ使用量を削減できます。 +- **`--cpu_offload_checkpointing`**: Offloads gradient checkpoints to CPU. Can reduce VRAM usage by up to 1GB but decreases training speed by about 15%. Cannot be used with `--blocks_to_swap`. Chroma models do not support this option. -- **Adafactor オプティマイザの使用**: - 8bit AdamWよりもVRAM使用量を削減できます。以下の設定を使用してください: +- **Using Adafactor optimizer**: Can reduce VRAM usage more than 8bit AdamW: ``` --optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 ``` -- **T5XXLのfp8形式の使用**: - 10GB未満のVRAMを持つGPUでは、T5XXLのfp8形式チェックポイントの使用を推奨します。[comfyanonymous/flux_text_encoders](https://huggingface.co/comfyanonymous/flux_text_encoders)から`t5xxl_fp8_e4m3fn.safetensors`をダウンロードできます(`scaled`なしで使用してください)。 - -- **FP8/FP16 混合学習 [実験的機能]**: - `--fp8_base_unet` オプションを指定すると、FLUX.1モデル本体をFP8形式で学習し、Text Encoder (CLIP-L/T5XXL) をBF16/FP16形式で学習できます。これにより、さらにVRAM使用量を削減できる可能性があります。このオプションを指定すると、`--fp8_base` オプションも自動的に有効になります。 - -- **`pytorch-optimizer` の利用**: - `pytorch-optimizer` ライブラリに含まれる様々なオプティマイザを使用できます。`requirements.txt` に追加されているため、別途インストールは不要です。 - 例えば、CAME オプティマイザを使用する場合は以下のように指定します。 - ```bash - --optimizer_type "pytorch_optimizer.CAME" --optimizer_args "weight_decay=0.01" - -## 2. FLUX.1 LoRA学習の重要な設定オプション - -FLUX.1の学習には多くの未知の点があり、いくつかの設定は引数で指定できます。以下に重要な引数とその説明を示します。 +- **Using T5XXL fp8 format**: For GPUs with less than 10GB VRAM, using fp8 format T5XXL checkpoints is recommended. Download `t5xxl_fp8_e4m3fn.safetensors` from [comfyanonymous/flux_text_encoders](https://huggingface.co/comfyanonymous/flux_text_encoders) (use without `scaled`). 
-### 2.1 タイムステップのサンプリング方法 +- **FP8/FP16 Mixed Training [Experimental]**: Specify `--fp8_base_unet` to train the FLUX.1 model in FP8 format while training Text Encoders (CLIP-L/T5XXL) in BF16/FP16 format. This can further reduce VRAM usage. -`--timestep_sampling`オプションで、タイムステップ(0-1)のサンプリング方法を指定できます: +
+日本語 -- `sigma`:SD3と同様のシグマベース -- `uniform`:一様ランダム -- `sigmoid`:正規分布乱数のシグモイド(x-flux、AI-toolkitなどと同様) -- `shift`:正規分布乱数のシグモイド値をシフト -- `flux_shift`:解像度に応じて正規分布乱数のシグモイド値をシフト(FLUX.1 dev推論と同様)。この設定では`--discrete_flow_shift`は無視されます。 +FLUX.1モデルは比較的大きなモデルであるため、十分なVRAMを持たないGPUでは工夫が必要です。VRAM使用量を削減するための設定の詳細は英語のドキュメントを参照してください。 +主要なVRAM削減オプション: +- `--fp8_base`: FP8形式での学習を有効化 +- `--blocks_to_swap`: CPUとGPU間でブロックをスワップ +- `--cpu_offload_checkpointing`: 勾配チェックポイントをCPUにオフロード +- Adafactorオプティマイザの使用 +- T5XXLのfp8形式の使用 +- FP8/FP16混合学習(実験的機能) -#### タイムステップ分布の可視化 +
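For reference, the `--optimizer_args` shown above correspond to keyword arguments of the Adafactor class shipped with the `transformers` library. Below is a standalone sketch of constructing it with those settings; the `Linear` module is only a stand-in for the LoRA parameters.

```python
import torch
from transformers.optimization import Adafactor

params = torch.nn.Linear(8, 8).parameters()  # stand-in for the trainable LoRA parameters
optimizer = Adafactor(
    params,
    lr=1e-4,                # an explicit learning rate is required once relative_step=False
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)
print(optimizer.defaults["lr"])
```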
-`--timestep_sampling`, `--sigmoid_scale`, `--discrete_flow_shift` の組み合わせによって、学習中にサンプリングされるタイムステップの分布が変化します。以下にいくつかの例を示します。 +### 6.2. Important FLUX.1 LoRA Training Settings / FLUX.1 LoRA学習の重要な設定 -* `--timestep_sampling shift` と `--discrete_flow_shift` の効果 (`--sigmoid_scale` はデフォルトの1.0): - ![Figure_2](https://github.com/user-attachments/assets/d9de42f9-f17d-40da-b88d-d964402569c6) +FLUX.1 training has many unknowns, and several settings can be specified with arguments: -* `--timestep_sampling sigmoid` と `--timestep_sampling uniform` の比較 (`--discrete_flow_shift` は無視される): - ![Figure_3](https://github.com/user-attachments/assets/27029009-1f5d-4dc0-bb24-13d02ac4fdad) +#### Timestep Sampling Methods -* `--timestep_sampling sigmoid` と `--sigmoid_scale` の効果 (`--discrete_flow_shift` は無視される): - ![Figure_4](https://github.com/user-attachments/assets/08a2267c-e47e-48b7-826e-f9a080787cdc) +The `--timestep_sampling` option specifies how timesteps (0-1) are sampled: -#### AI Toolkit 設定との比較 +- `sigma`: Sigma-based like SD3 +- `uniform`: Uniform random +- `sigmoid`: Sigmoid of normal distribution random (similar to x-flux, AI-toolkit) +- `shift`: Sigmoid value of normal distribution random with shift. The `--discrete_flow_shift` setting is used to shift the sigmoid value. +- `flux_shift`: Shift sigmoid value of normal distribution random according to resolution (similar to FLUX.1 dev inference). -[Ostris氏のAI Toolkit](https://github.com/ostris/ai-toolkit) で使用されている設定は、概ね以下のオプションに相当すると考えられます。 -``` ---timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0 -``` +`--discrete_flow_shift` only applies when `--timestep_sampling` is set to `shift`. -### 2.2 モデル予測の処理方法 +#### Model Prediction Processing -`--model_prediction_type`オプションで、モデルの予測をどのように解釈し処理するかを指定できます: +The `--model_prediction_type` option specifies how to interpret and process model predictions: -- `raw`:そのまま使用(x-fluxと同様)【推奨】 -- `additive`:ノイズ入力に加算 -- `sigma_scaled`:シグマスケーリングを適用(SD3と同様) +- `raw`: Use as-is (similar to x-flux) **[Recommended]** +- `additive`: Add to noise input +- `sigma_scaled`: Apply sigma scaling (similar to SD3) -### 2.3 推奨設定 +#### Recommended Settings -実験の結果、以下の設定が良好に動作することが確認されています: +Based on experiments, the following settings work well: ``` --timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 ``` -ガイダンススケールについて:FLUX.1 dev版は特定のガイダンススケール値で蒸留されていますが、学習時には`--guidance_scale 1.0`を指定してガイダンススケールを無効化することを推奨します。 - - -### 2.4 T5 Attention Mask の適用 - -`--apply_t5_attn_mask` オプションを指定すると、T5XXL Text Encoder の学習および推論時に Attention Mask が適用されます。 - -Attention Maskに対応した推論環境が限られるため、このオプションは推奨されません。 - -### 2.5 IP ノイズガンマ - -`--ip_noise_gamma` および `--ip_noise_gamma_random_strength` オプションを使用することで、学習時に Input Perturbation ノイズのガンマ値を調整できます。詳細は Stable Diffusion 3 の学習オプションを参照してください。 - -### 2.6 LoRA-GGPO サポート +**About Guidance Scale**: FLUX.1 dev version is distilled with specific guidance scale values, but for training, specify `--guidance_scale 1.0` to disable guidance scale. -LoRA-GGPO (Gradient Group Proportion Optimizer) を使用できます。これは LoRA の学習を安定化させるための手法です。以下の `network_args` を指定して有効化します。ハイパーパラメータ (`ggpo_sigma`, `ggpo_beta`) は調整が必要です。 +`--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 0.0` is recommended for Chroma models. -```bash ---network_args "ggpo_sigma=0.03" "ggpo_beta=0.01" -``` -TOMLファイルで指定する場合: -```toml -network_args = ["ggpo_sigma=0.03", "ggpo_beta=0.01"] -``` +
+日本語 -### 2.7 Q/K/V 射影層の分割 [実験的機能] +FLUX.1の学習には多くの未知の点があり、いくつかの設定は引数で指定できます。詳細な説明とコマンドラインの例は英語のドキュメントを参照してください。 -`--network_args "split_qkv=True"` を指定することで、Attention層内の Q/K/V (および SingleStreamBlock の Text) 射影層を個別に分割し、それぞれに LoRA を適用できます。 +主要な設定オプション: +- タイムステップのサンプリング方法(`--timestep_sampling`) +- モデル予測の処理方法(`--model_prediction_type`) +- 推奨設定の組み合わせ -**技術的詳細:** -FLUX.1 の元々の実装では、Q/K/V (および Text) の射影層は一つに結合されています。ここに LoRA を適用すると、一つの大きな LoRA モジュールが適用されます。一方、Diffusers の実装ではこれらの射影層は分離されており、それぞれに小さな LoRA モジュールが適用されます。このオプションは後者の挙動を模倣します。 -保存される LoRA モデルの互換性は維持されますが、内部的には分割された LoRA の重みを結合して保存するため、ゼロ要素が多くなりモデルサイズが大きくなる可能性があります。`convert_flux_lora.py` スクリプトを使用して Diffusers (AI-Toolkit) 形式に変換すると、サイズが削減されます。 +
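To get an intuition for these options, the sketch below draws timesteps with the `uniform`, `sigmoid`, and `shift` strategies and prints summary statistics. It uses the commonly used shift mapping t' = s·t / (1 + (s − 1)·t); the exact formulas in the training script may differ in detail, so treat this as an approximation.

```python
import torch

def sample_timesteps(n: int, method: str, shift: float = 3.1582, sigmoid_scale: float = 1.0) -> torch.Tensor:
    if method == "uniform":
        return torch.rand(n)
    # "sigmoid": squash normal noise into (0, 1); samples concentrate around 0.5
    t = torch.sigmoid(torch.randn(n) * sigmoid_scale)
    if method == "shift":
        # for shift > 1, pushes mass toward t = 1 (the noisier end in this convention)
        t = shift * t / (1 + (shift - 1) * t)
    return t

for method in ("uniform", "sigmoid", "shift"):
    t = sample_timesteps(100_000, method)
    print(f"{method:8s} mean={t.mean().item():.3f} "
          f"p10={t.quantile(0.10).item():.3f} p90={t.quantile(0.90).item():.3f}")
```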
-## 3. 各層に対するランク指定 +### 6.3. Layer-specific Rank Configuration / 各層に対するランク指定 -FLUX.1の各層に対して異なるランク(network_dim)を指定できます。これにより、特定の層に対してLoRAの効果を強調したり、無効化したりできます。 +You can specify different ranks (network_dim) for each layer of FLUX.1. This allows you to emphasize or disable LoRA effects for specific layers. -以下のnetwork_argsを指定することで、各層のランクを指定できます。0を指定するとその層にはLoRAが適用されません。 +Specify the following network_args to set ranks for each layer. Setting 0 disables LoRA for that layer: -| network_args | 対象レイヤー | +| network_args | Target Layer | |--------------|--------------| -| img_attn_dim | DoubleStreamBlockのimg_attn | -| txt_attn_dim | DoubleStreamBlockのtxt_attn | -| img_mlp_dim | DoubleStreamBlockのimg_mlp | -| txt_mlp_dim | DoubleStreamBlockのtxt_mlp | -| img_mod_dim | DoubleStreamBlockのimg_mod | -| txt_mod_dim | DoubleStreamBlockのtxt_mod | -| single_dim | SingleStreamBlockのlinear1とlinear2 | -| single_mod_dim | SingleStreamBlockのmodulation | - -使用例: +| img_attn_dim | DoubleStreamBlock img_attn | +| txt_attn_dim | DoubleStreamBlock txt_attn | +| img_mlp_dim | DoubleStreamBlock img_mlp | +| txt_mlp_dim | DoubleStreamBlock txt_mlp | +| img_mod_dim | DoubleStreamBlock img_mod | +| txt_mod_dim | DoubleStreamBlock txt_mod | +| single_dim | SingleStreamBlock linear1 and linear2 | +| single_mod_dim | SingleStreamBlock modulation | + +Example usage: ``` --network_args "img_attn_dim=4" "img_mlp_dim=8" "txt_attn_dim=2" "txt_mlp_dim=2" "img_mod_dim=2" "txt_mod_dim=2" "single_dim=4" "single_mod_dim=2" ``` -さらに、FLUXの条件付けレイヤーにLoRAを適用するには、network_argsに`in_dims`を指定します。5つの数値をカンマ区切りのリストとして指定する必要があります。 +To apply LoRA to FLUX conditioning layers, specify `in_dims` in network_args as a comma-separated list of 5 numbers: -例: ``` --network_args "in_dims=[4,2,2,2,4]" ``` -各数値は、`img_in`、`time_in`、`vector_in`、`guidance_in`、`txt_in`に対応します。上記の例では、すべての条件付けレイヤーにLoRAを適用し、`img_in`と`txt_in`のランクを4、その他のランクを2に設定しています。 +Each number corresponds to `img_in`, `time_in`, `vector_in`, `guidance_in`, `txt_in`. The example above applies LoRA to all conditioning layers with ranks of 4 for `img_in` and `txt_in`, and ranks of 2 for others. -0を指定するとそのレイヤーにはLoRAが適用されません。例えば、`[4,0,0,0,4]`は`img_in`と`txt_in`にのみLoRAを適用します。 +
+日本語 + +FLUX.1の各層に対して異なるランク(network_dim)を指定できます。これにより、特定の層に対してLoRAの効果を強調したり、無効化したりできます。 -## 4. 学習するブロックの指定 +詳細な設定方法とコマンドラインの例は英語のドキュメントを参照してください。 -FLUX.1 LoRA学習では、network_argsの`train_double_block_indices`と`train_single_block_indices`を指定することで、学習するブロックを指定できます。インデックスは0ベースです。省略した場合のデフォルトはすべてのブロックを学習することです。 +
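Conceptually these `network_args` form a lookup from a module-name pattern to a rank, with `0` meaning that no LoRA module is created there. The snippet below is a deliberately simplified, hypothetical illustration of that lookup; the real logic lives in `networks/lora_flux.py` and is more involved.

```python
# Hypothetical illustration only -- not the actual lora_flux implementation.
default_dim = 16
overrides = {"img_attn": 4, "img_mlp": 8, "txt_attn": 2, "txt_mlp": 2, "img_mod": 2, "txt_mod": 2}

def rank_for(module_name: str) -> int:
    for pattern, dim in overrides.items():
        if pattern in module_name:
            return dim  # a value of 0 would mean: skip LoRA for this module
    return default_dim

for name in ("double_blocks.3.img_attn.qkv", "double_blocks.3.txt_mod.lin", "double_blocks.3.img_mlp.0"):
    print(f"{name} -> rank {rank_for(name)}")
```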
-インデックスは、`0,1,5,8`のような整数のリストや、`0,1,4-5,7`のような整数の範囲として指定します。 -- double blocksの数は19なので、有効な範囲は0-18です -- single blocksの数は38なので、有効な範囲は0-37です -- `all`を指定するとすべてのブロックを学習します -- `none`を指定するとブロックを学習しません +### 6.4. Block Selection for Training / 学習するブロックの指定 -使用例: +You can specify which blocks to train using `train_double_block_indices` and `train_single_block_indices` in network_args. Indices are 0-based. Default is to train all blocks if omitted. + +Specify indices as integer lists like `0,1,5,8` or integer ranges like `0,1,4-5,7`: +- Double blocks: 19 blocks, valid range 0-18 +- Single blocks: 38 blocks, valid range 0-37 +- Specify `all` to train all blocks +- Specify `none` to skip training blocks + +Example usage: ``` --network_args "train_double_block_indices=0,1,8-12,18" "train_single_block_indices=3,10,20-25,37" ``` -または: +Or: ``` --network_args "train_double_block_indices=none" "train_single_block_indices=10-15" ``` -`train_double_block_indices`または`train_single_block_indices`のどちらか一方だけを指定した場合、もう一方は通常通り学習されます。 +
+日本語 + +FLUX.1 LoRA学習では、network_argsの`train_double_block_indices`と`train_single_block_indices`を指定することで、学習するブロックを指定できます。 + +詳細な設定方法とコマンドラインの例は英語のドキュメントを参照してください。 + +
+ +### 6.5. Text Encoder LoRA Support / Text Encoder LoRAのサポート + +FLUX.1 LoRA training supports training CLIP-L and T5XXL LoRA: + +- To train only FLUX.1: specify `--network_train_unet_only` +- To train FLUX.1 and CLIP-L: omit `--network_train_unet_only` +- To train FLUX.1, CLIP-L, and T5XXL: omit `--network_train_unet_only` and add `--network_args "train_t5xxl=True"` + +You can specify individual learning rates for CLIP-L and T5XXL with `--text_encoder_lr`. For example, `--text_encoder_lr 1e-4 1e-5` sets the first value for CLIP-L and the second for T5XXL. Specifying one value uses the same learning rate for both. If `--text_encoder_lr` is not specified, the default `--learning_rate` is used for both. -## 5. Text Encoder LoRAのサポート +
+日本語 FLUX.1 LoRA学習は、CLIP-LとT5XXL LoRAのトレーニングもサポートしています。 -- FLUX.1のみをトレーニングする場合は、`--network_train_unet_only`を指定します -- FLUX.1とCLIP-Lをトレーニングする場合は、`--network_train_unet_only`を省略します -- FLUX.1、CLIP-L、T5XXLすべてをトレーニングする場合は、`--network_train_unet_only`を省略し、`--network_args "train_t5xxl=True"`を追加します +詳細な設定方法とコマンドラインの例は英語のドキュメントを参照してください。 -CLIP-LとT5XXLの学習率は、`--text_encoder_lr`で個別に指定できます。例えば、`--text_encoder_lr 1e-4 1e-5`とすると、最初の値はCLIP-Lの学習率、2番目の値はT5XXLの学習率になります。1つだけ指定すると、CLIP-LとT5XXLの学習率は同じになります。`--text_encoder_lr`を指定しない場合、デフォルトの学習率`--learning_rate`が両方に使用されます。 +
-## 6. マルチ解像度トレーニング +### 6.6. Multi-Resolution Training / マルチ解像度トレーニング -データセット設定ファイルで複数の解像度を定義できます。各解像度に対して異なるバッチサイズを指定することができます。 +You can define multiple resolutions in the dataset configuration file, with different batch sizes for each resolution. -設定ファイルの例: +Configuration file example: ```toml [general] -# 共通設定をここで定義 +# Common settings flip_aug = true color_aug = false keep_tokens_separator= "|||" @@ -425,85 +433,151 @@ caption_tag_dropout_rate = 0 caption_extension = ".txt" [[datasets]] -# 最初の解像度の設定 +# First resolution settings batch_size = 2 enable_bucket = true resolution = [1024, 1024] [[datasets.subsets]] - image_dir = "画像ディレクトリへのパス" + image_dir = "path/to/image/directory" num_repeats = 1 [[datasets]] -# 2番目の解像度の設定 +# Second resolution settings batch_size = 3 enable_bucket = true resolution = [768, 768] [[datasets.subsets]] - image_dir = "画像ディレクトリへのパス" + image_dir = "path/to/image/directory" num_repeats = 1 +``` -[[datasets]] -# 3番目の解像度の設定 -batch_size = 4 -enable_bucket = true -resolution = [512, 512] +
+日本語 - [[datasets.subsets]] - image_dir = "画像ディレクトリへのパス" - num_repeats = 1 -``` +データセット設定ファイルで複数の解像度を定義できます。各解像度に対して異なるバッチサイズを指定することができます。 -各解像度セクションの`[[datasets.subsets]]`部分は、データセットディレクトリを定義します。各解像度に対して同じディレクトリを指定してください。
+設定ファイルの例は英語のドキュメントを参照してください。 -## 7. 検証 (Validation) +
-学習中に検証データセットを使用して損失 (Validation Loss) を計算し、モデルの汎化性能を評価できます。 +### 6.7. Validation / 検証 + +You can calculate validation loss during training using a validation dataset to evaluate model generalization performance. -検証を設定するには、データセット設定 TOML ファイルに `[validation]` セクションを追加します。設定方法は学習データセットと同様ですが、`num_repeats` は通常 1 に設定します。 +To set up validation, add a `[validation]` section to your dataset configuration TOML file. Configuration is similar to training datasets, but `num_repeats` is usually set to 1. ```toml -# ... (学習データセットの設定) ... +# ... (training dataset configuration) ... [validation] batch_size = 1 enable_bucket = true -resolution = [1024, 1024] # 検証に使用する解像度 +resolution = [1024, 1024] # Resolution for validation [[validation.subsets]] - image_dir = "検証用画像ディレクトリへのパス" + image_dir = "path/to/validation/images" num_repeats = 1 caption_extension = ".txt" - # ... 他の検証データセット固有の設定 ... + # ... other validation dataset settings ... ``` -**注意点:** +**Notes:** + +* Validation loss calculation uses fixed timestep sampling and random seeds to reduce loss variation due to randomness for more stable evaluation. +* Currently, validation loss is not supported when using `--blocks_to_swap` or Schedule-Free optimizers (`AdamWScheduleFree`, `RAdamScheduleFree`, `ProdigyScheduleFree`). + +
+日本語 + +学習中に検証データセットを使用して損失 (Validation Loss) を計算し、モデルの汎化性能を評価できます。 + +詳細な設定方法とコマンドラインの例は英語のドキュメントを参照してください。 + +
+ +## 7. Additional Options / 追加オプション + +### 7.1. Other FLUX.1-specific Options / その他のFLUX.1特有のオプション + +- **T5 Attention Mask Application**: Specify `--apply_t5_attn_mask` to apply attention masks during T5XXL Text Encoder training and inference. Not recommended due to limited inference environment support. **For Chroma models, this option is required.** + +- **IP Noise Gamma**: Use `--ip_noise_gamma` and `--ip_noise_gamma_random_strength` to adjust Input Perturbation noise gamma values during training. See Stable Diffusion 3 training options for details. + +- **LoRA-GGPO Support**: Use LoRA-GGPO (Gradient Group Proportion Optimizer) to stabilize LoRA training: + ```bash + --network_args "ggpo_sigma=0.03" "ggpo_beta=0.01" + ``` + +- **Q/K/V Projection Layer Splitting [Experimental]**: Specify `--network_args "split_qkv=True"` to individually split and apply LoRA to Q/K/V (and SingleStreamBlock Text) projection layers within Attention layers. + +
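+The `network_args` entries above can be combined with each other and with the regular command-line options in a single invocation. The following is only an illustrative sketch (the GGPO values are the ones from the example above, not tuned recommendations):
+
+```
+--apply_t5_attn_mask \
+--network_args "split_qkv=True" "ggpo_sigma=0.03" "ggpo_beta=0.01"
+```
+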
+日本語 + +その他のFLUX.1特有のオプション: +- T5 Attention Maskの適用(Chromaモデルでは必須) +- IPノイズガンマ +- LoRA-GGPOサポート +- Q/K/V射影層の分割(実験的機能) + +詳細な設定方法とコマンドラインの例は英語のドキュメントを参照してください。 -* 検証損失の計算は、固定されたタイムステップサンプリングと乱数シードで行われます。これにより、ランダム性による損失の変動を抑え、より安定した評価が可能になります。 -* 現在のところ、`--blocks_to_swap` オプションを使用している場合、または Schedule-Free オプティマイザ (`AdamWScheduleFree`, `RAdamScheduleFree`, `ProdigyScheduleFree`) を使用している場合は、検証損失はサポートされていません。 +
-## 8. データセット関連の追加オプション +### 7.2. Dataset-related Additional Options / データセット関連の追加オプション -### 8.1 リサイズ時の補間方法指定 +#### Interpolation Method for Resizing -データセットの画像を学習解像度にリサイズする際の補間方法を指定できます。データセット設定 TOML ファイルの `[[datasets]]` セクションまたは `[general]` セクションで `interpolation_type` を指定します。 +You can specify the interpolation method when resizing dataset images to training resolution. Specify `interpolation_type` in the `[[datasets]]` or `[general]` section of the dataset configuration TOML file. -利用可能な値: `bicubic` (デフォルト), `bilinear`, `lanczos`, `nearest`, `area` +Available values: `bicubic` (default), `bilinear`, `lanczos`, `nearest`, `area` ```toml [[datasets]] resolution = [1024, 1024] enable_bucket = true -interpolation_type = "lanczos" # 例: Lanczos補間を使用 +interpolation_type = "lanczos" # Example: Use Lanczos interpolation # ... ``` -## 9. 関連ツール +
+日本語 + +データセットの画像を学習解像度にリサイズする際の補間方法を指定できます。 + +設定方法とオプションの詳細は英語のドキュメントを参照してください。 + +
+ +## 8. Related Tools / 関連ツール + +Several related scripts are provided for models trained with `flux_train_network.py` and to assist with the training process: + +* **`networks/flux_extract_lora.py`**: Extracts LoRA models from the difference between trained and base models. +* **`convert_flux_lora.py`**: Converts trained LoRA models to other formats like Diffusers (AI-Toolkit) format. When trained with Q/K/V split option, converting with this script can reduce model size. +* **`networks/flux_merge_lora.py`**: Merges trained LoRA models into FLUX.1 base models. +* **`flux_minimal_inference.py`**: Simple inference script for generating images with trained LoRA models. You can specify `flux` or `chroma` with the `--model_type` argument. + +
+日本語 + +`flux_train_network.py` で学習したモデルや、学習プロセスに役立つ関連スクリプトが提供されています: + +* **`networks/flux_extract_lora.py`**: 学習済みモデルとベースモデルの差分から LoRA モデルを抽出 +* **`convert_flux_lora.py`**: 学習した LoRA モデルを Diffusers (AI-Toolkit) 形式など他の形式に変換 +* **`networks/flux_merge_lora.py`**: 学習した LoRA モデルを FLUX.1 ベースモデルにマージ +* **`flux_minimal_inference.py`**: 学習した LoRA モデルを適用して画像を生成するシンプルな推論スクリプト + +
+ +## 9. Others / その他 -`flux_train_network.py` で学習したモデルや、学習プロセスに役立つ関連スクリプトが提供されています。 +`flux_train_network.py` includes many features common with `train_network.py`, such as sample image generation (`--sample_prompts`, etc.) and detailed optimizer settings. For these features, refer to the [`train_network.py` guide](train_network.md#5-other-features--その他の機能) or the script help (`python flux_train_network.py --help`). + +
+日本語 + +`flux_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python flux_train_network.py --help`) を参照してください。 -* **`networks/flux_extract_lora.py`**: 学習済みモデルとベースモデルの差分から LoRA モデルを抽出します。 -* **`convert_flux_lora.py`**: 学習した LoRA モデルを Diffusers (AI-Toolkit) 形式など、他の形式に変換します。Q/K/V分割オプションで学習した場合、このスクリプトで変換するとモデルサイズを削減できます。 -* **`networks/flux_merge_lora.py`**: 学習した LoRA モデルを FLUX.1 ベースモデルにマージします。 -* **`flux_minimal_inference.py`**: 学習した LoRA モデルを適用して画像を生成するためのシンプルな推論スクリプトです。 +
From c28e7a47c3bd3c4efc81404bf4dadba2b41d4fe4 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 26 Jul 2025 19:35:42 +0900 Subject: [PATCH 503/582] feat: add regex-based rank and learning rate configuration for FLUX.1 LoRA --- docs/flux_train_network.md | 49 +++++++++- networks/lora_flux.py | 195 +++++++++++++++++++++++++++---------- train_network.py | 2 +- 3 files changed, 193 insertions(+), 53 deletions(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index f324b9594..1b584180f 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -398,7 +398,50 @@ FLUX.1 LoRA学習では、network_argsの`train_double_block_indices`と`train_s
-### 6.5. Text Encoder LoRA Support / Text Encoder LoRAのサポート + + + +### 6.4. Regular Expression-based Rank/LR Configuration / 正規表現によるランク・学習率の指定 + +You can specify ranks (dims) and learning rates for LoRA modules using regular expressions. This allows for more flexible and fine-grained control than specifying by layer. + +These settings are specified via the `network_args` argument. + +* `network_reg_dims`: Specify ranks for modules matching a regular expression. The format is a comma-separated string of `pattern=rank`. + * Example: `--network_args "network_reg_dims=single.*_modulation.*=4,img_attn=8"` + * This sets the rank to 4 for modules whose names contain `single` and contain `_modulation`, and to 8 for modules containing `img_attn`. +* `network_reg_lrs`: Specify learning rates for modules matching a regular expression. The format is a comma-separated string of `pattern=lr`. + * Example: `--network_args "network_reg_lrs=single_blocks_(\d|10)_=1e-3,double_blocks=2e-3"` + * This sets the learning rate to `1e-3` for modules whose names contain `single_blocks` followed by a digit (`0` to `9`) or `10`, and to `2e-3` for modules whose names contain `double_blocks`. + +**Notes:** + +* Settings via `network_reg_dims` and `network_reg_lrs` take precedence over the global `--network_dim` and `--learning_rate` settings. +* If a module name matches multiple patterns, the setting from the last matching pattern in the string will be applied. +* These settings are applied after the block-specific training settings (`train_double_block_indices`, `train_single_block_indices`). + +
+日本語 + +正規表現を用いて、LoRAのモジュールごとにランク(dim)や学習率を指定することができます。これにより、層ごとの指定よりも柔軟できめ細やかな制御が可能になります。 + +これらの設定は `network_args` 引数で指定します。 + +* `network_reg_dims`: 正規表現にマッチするモジュールに対してランクを指定します。`pattern=rank` という形式の文字列をカンマで区切って指定します。 + * 例: `--network_args "network_reg_dims=single.*_modulation.*=4,img_attn=8"` + * この例では、名前に `single` で始まり `_modulation` を含むモジュールのランクを4に、`img_attn` を含むモジュールのランクを8に設定します。 +* `network_reg_lrs`: 正規表現にマッチするモジュールに対して学習率を指定します。`pattern=lr` という形式の文字列をカンマで区切って指定します。 + * 例: `--network_args "network_reg_lrs=single_blocks_(\d|10)_=1e-3,double_blocks=2e-3"` + * この例では、名前が `single_blocks` で始まり、後に数字(`0`から`9`)または`10`が続くモジュールの学習率を `1e-3` に、`double_blocks` を含むモジュールの学習率を `2e-3` に設定します。 +**注意点:** + +* `network_reg_dims` および `network_reg_lrs` での設定は、全体設定である `--network_dim` や `--learning_rate` よりも優先されます。 +* あるモジュール名が複数のパターンにマッチした場合、文字列の中で後方にあるパターンの設定が適用されます。 +* これらの設定は、ブロック指定(`train_double_block_indices`, `train_single_block_indices`)が適用された後に行われます。 + +
+ +### 6.6. Text Encoder LoRA Support / Text Encoder LoRAのサポート FLUX.1 LoRA training supports training CLIP-L and T5XXL LoRA: @@ -417,7 +460,7 @@ FLUX.1 LoRA学習は、CLIP-LとT5XXL LoRAのトレーニングもサポート -### 6.6. Multi-Resolution Training / マルチ解像度トレーニング +### 6.7. Multi-Resolution Training / マルチ解像度トレーニング You can define multiple resolutions in the dataset configuration file, with different batch sizes for each resolution. @@ -462,7 +505,7 @@ resolution = [768, 768] -### 6.7. Validation / 検証 +### 6.8. Validation / 検証 You can calculate validation loss during training using a validation dataset to evaluate model generalization performance. diff --git a/networks/lora_flux.py b/networks/lora_flux.py index ddc916089..320bc4632 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -156,11 +156,19 @@ def forward(self, x): lx = self.lora_up(lx) # LoRA Gradient-Guided Perturbation Optimization - if self.training and self.ggpo_sigma is not None and self.ggpo_beta is not None and self.combined_weight_norms is not None and self.grad_norms is not None: + if ( + self.training + and self.ggpo_sigma is not None + and self.ggpo_beta is not None + and self.combined_weight_norms is not None + and self.grad_norms is not None + ): with torch.no_grad(): - perturbation_scale = (self.ggpo_sigma * torch.sqrt(self.combined_weight_norms ** 2)) + (self.ggpo_beta * (self.grad_norms ** 2)) + perturbation_scale = (self.ggpo_sigma * torch.sqrt(self.combined_weight_norms**2)) + ( + self.ggpo_beta * (self.grad_norms**2) + ) perturbation_scale_factor = (perturbation_scale * self.perturbation_norm_factor).to(self.device) - perturbation = torch.randn(self.org_module_shape, dtype=self.dtype, device=self.device) + perturbation = torch.randn(self.org_module_shape, dtype=self.dtype, device=self.device) perturbation.mul_(perturbation_scale_factor) perturbation_output = x @ perturbation.T # Result: (batch × n) return org_forwarded + (self.multiplier * scale * lx) + perturbation_output @@ -197,24 +205,24 @@ def initialize_norm_cache(self, org_module_weight: Tensor): # Choose a reasonable sample size n_rows = org_module_weight.shape[0] sample_size = min(1000, n_rows) # Cap at 1000 samples or use all if smaller - + # Sample random indices across all rows indices = torch.randperm(n_rows)[:sample_size] - + # Convert to a supported data type first, then index # Use float32 for indexing operations weights_float32 = org_module_weight.to(dtype=torch.float32) sampled_weights = weights_float32[indices].to(device=self.device) - + # Calculate sampled norms sampled_norms = torch.norm(sampled_weights, dim=1, keepdim=True) - + # Store the mean norm as our estimate self.org_weight_norm_estimate = sampled_norms.mean() - + # Optional: store standard deviation for confidence intervals self.org_weight_norm_std = sampled_norms.std() - + # Free memory del sampled_weights, weights_float32 @@ -223,45 +231,44 @@ def validate_norm_approximation(self, org_module_weight: Tensor, verbose=True): # Calculate the true norm (this will be slow but it's just for validation) true_norms = [] chunk_size = 1024 # Process in chunks to avoid OOM - + for i in range(0, org_module_weight.shape[0], chunk_size): end_idx = min(i + chunk_size, org_module_weight.shape[0]) chunk = org_module_weight[i:end_idx].to(device=self.device, dtype=self.dtype) chunk_norms = torch.norm(chunk, dim=1, keepdim=True) true_norms.append(chunk_norms.cpu()) del chunk - + true_norms = torch.cat(true_norms, dim=0) true_mean_norm = true_norms.mean().item() - + # Compare with our estimate estimated_norm = 
self.org_weight_norm_estimate.item() - + # Calculate error metrics absolute_error = abs(true_mean_norm - estimated_norm) relative_error = absolute_error / true_mean_norm * 100 # as percentage - + if verbose: logger.info(f"True mean norm: {true_mean_norm:.6f}") logger.info(f"Estimated norm: {estimated_norm:.6f}") logger.info(f"Absolute error: {absolute_error:.6f}") logger.info(f"Relative error: {relative_error:.2f}%") - + return { - 'true_mean_norm': true_mean_norm, - 'estimated_norm': estimated_norm, - 'absolute_error': absolute_error, - 'relative_error': relative_error + "true_mean_norm": true_mean_norm, + "estimated_norm": estimated_norm, + "absolute_error": absolute_error, + "relative_error": relative_error, } - @torch.no_grad() def update_norms(self): # Not running GGPO so not currently running update norms if self.ggpo_beta is None or self.ggpo_sigma is None: return - # only update norms when we are training + # only update norms when we are training if self.training is False: return @@ -269,8 +276,9 @@ def update_norms(self): module_weights.mul(self.scale) self.weight_norms = torch.norm(module_weights, dim=1, keepdim=True) - self.combined_weight_norms = torch.sqrt((self.org_weight_norm_estimate**2) + - torch.sum(module_weights**2, dim=1, keepdim=True)) + self.combined_weight_norms = torch.sqrt( + (self.org_weight_norm_estimate**2) + torch.sum(module_weights**2, dim=1, keepdim=True) + ) @torch.no_grad() def update_grad_norms(self): @@ -293,7 +301,6 @@ def update_grad_norms(self): approx_grad = self.scale * ((self.lora_up.weight @ lora_down_grad) + (lora_up_grad @ self.lora_down.weight)) self.grad_norms = torch.norm(approx_grad, dim=1, keepdim=True) - @property def device(self): return next(self.parameters()).device @@ -564,7 +571,6 @@ def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: if ggpo_sigma is not None: ggpo_sigma = float(ggpo_sigma) - # train T5XXL train_t5xxl = kwargs.get("train_t5xxl", False) if train_t5xxl is not None: @@ -575,6 +581,42 @@ def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: if verbose is not None: verbose = True if verbose == "True" else False + # regex-specific learning rates + def parse_kv_pairs(kv_pair_str: str, is_int: bool) -> Dict[str, float]: + """ + Parse a string of key-value pairs separated by commas. 
+ """ + pairs = {} + for pair in kv_pair_str.split(","): + pair = pair.strip() + if not pair: + continue + if "=" not in pair: + logger.warning(f"Invalid format: {pair}, expected 'key=value'") + continue + key, value = pair.split("=", 1) + key = key.strip() + value = value.strip() + try: + pairs[key] = int(value) if is_int else float(value) + except ValueError: + logger.warning(f"Invalid value for {key}: {value}") + return pairs + + # parse regular expression based learning rates + network_reg_lrs = kwargs.get("network_reg_lrs", None) + if network_reg_lrs is not None: + reg_lrs = parse_kv_pairs(network_reg_lrs, is_int=False) + else: + reg_lrs = None + + # regex-specific dimensions (ranks) + network_reg_dims = kwargs.get("network_reg_dims", None) + if network_reg_dims is not None: + reg_dims = parse_kv_pairs(network_reg_dims, is_int=True) + else: + reg_dims = None + # すごく引数が多いな ( ^ω^)・・・ network = LoRANetwork( text_encoders, @@ -594,8 +636,10 @@ def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: in_dims=in_dims, train_double_block_indices=train_double_block_indices, train_single_block_indices=train_single_block_indices, + reg_dims=reg_dims, ggpo_beta=ggpo_beta, ggpo_sigma=ggpo_sigma, + reg_lrs=reg_lrs, verbose=verbose, ) @@ -613,7 +657,6 @@ def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: # Create network from weights for inference, weights are not loaded here (because can be merged) def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weights_sd=None, for_inference=False, **kwargs): - # if unet is an instance of SdxlUNet2DConditionModel or subclass, set is_sdxl to True if weights_sd is None: if os.path.splitext(file)[1] == ".safetensors": from safetensors.torch import load_file, safe_open @@ -644,22 +687,6 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh if train_t5xxl is None: train_t5xxl = False - # # split qkv - # double_qkv_rank = None - # single_qkv_rank = None - # rank = None - # for lora_name, dim in modules_dim.items(): - # if "double" in lora_name and "qkv" in lora_name: - # double_qkv_rank = dim - # elif "single" in lora_name and "linear1" in lora_name: - # single_qkv_rank = dim - # elif rank is None: - # rank = dim - # if double_qkv_rank is not None and single_qkv_rank is not None and rank is not None: - # break - # split_qkv = (double_qkv_rank is not None and double_qkv_rank != rank) or ( - # single_qkv_rank is not None and single_qkv_rank != rank - # ) split_qkv = False # split_qkv is not needed to care, because state_dict is qkv combined module_class = LoRAInfModule if for_inference else LoRAModule @@ -708,8 +735,10 @@ def __init__( in_dims: Optional[List[int]] = None, train_double_block_indices: Optional[List[bool]] = None, train_single_block_indices: Optional[List[bool]] = None, + reg_dims: Optional[Dict[str, int]] = None, ggpo_beta: Optional[float] = None, ggpo_sigma: Optional[float] = None, + reg_lrs: Optional[Dict[str, float]] = None, verbose: Optional[bool] = False, ) -> None: super().__init__() @@ -730,6 +759,8 @@ def __init__( self.in_dims = in_dims self.train_double_block_indices = train_double_block_indices self.train_single_block_indices = train_single_block_indices + self.reg_dims = reg_dims + self.reg_lrs = reg_lrs self.loraplus_lr_ratio = None self.loraplus_unet_lr_ratio = None @@ -757,7 +788,6 @@ def __init__( if self.train_blocks is not None: logger.info(f"train {self.train_blocks} blocks only") - if train_t5xxl: logger.info(f"train T5XXL as well") @@ 
-803,8 +833,16 @@ def create_modules( if lora_name in modules_dim: dim = modules_dim[lora_name] alpha = modules_alpha[lora_name] - else: - # 通常、すべて対象とする + elif self.reg_dims is not None: + for reg, d in self.reg_dims.items(): + if re.search(reg, lora_name): + dim = d + alpha = self.alpha + logger.info(f"LoRA {lora_name} matched with regex {reg}, using dim: {dim}") + break + + # 通常、すべて対象とする + if dim is None: if is_linear or is_conv2d_1x1: dim = default_dim if default_dim is not None else self.lora_dim alpha = self.alpha @@ -979,7 +1017,6 @@ def combined_weight_norms(self) -> Tensor | None: combined_weight_norms.append(lora.combined_weight_norms.mean(dim=0)) return torch.stack(combined_weight_norms) if len(combined_weight_norms) > 0 else None - def load_weights(self, file): if os.path.splitext(file)[1] == ".safetensors": from safetensors.torch import load_file @@ -1166,17 +1203,77 @@ def prepare_optimizer_params_with_multiple_te_lrs(self, text_encoder_lr, unet_lr all_params = [] lr_descriptions = [] + reg_lrs_list = list(self.reg_lrs.items()) if self.reg_lrs is not None else [] + def assemble_params(loras, lr, loraplus_ratio): param_groups = {"lora": {}, "plus": {}} + # regular expression param groups: {"reg_lr_0": {"lora": {}, "plus": {}}, ...} + reg_groups = {} + for lora in loras: + # check if this lora matches any regex learning rate + matched_reg_lr = None + for i, (regex_str, reg_lr) in enumerate(reg_lrs_list): + try: + if re.search(regex_str, lora.lora_name): + matched_reg_lr = (i, reg_lr) + logger.info(f"Module {lora.lora_name} matched regex '{regex_str}' -> LR {reg_lr}") + break + except re.error: + # regex error should have been caught during parsing, but just in case + continue + for name, param in lora.named_parameters(): - if loraplus_ratio is not None and "lora_up" in name: - param_groups["plus"][f"{lora.lora_name}.{name}"] = param + param_key = f"{lora.lora_name}.{name}" + is_plus = loraplus_ratio is not None and "lora_up" in name + + if matched_reg_lr is not None: + # use regex-specific learning rate + reg_idx, reg_lr = matched_reg_lr + group_key = f"reg_lr_{reg_idx}" + if group_key not in reg_groups: + reg_groups[group_key] = {"lora": {}, "plus": {}, "lr": reg_lr} + + if is_plus: + reg_groups[group_key]["plus"][param_key] = param + else: + reg_groups[group_key]["lora"][param_key] = param else: - param_groups["lora"][f"{lora.lora_name}.{name}"] = param + # use default learning rate + if is_plus: + param_groups["plus"][param_key] = param + else: + param_groups["lora"][param_key] = param params = [] descriptions = [] + + # process regex-specific groups first (higher priority) + for group_key in sorted(reg_groups.keys()): + group = reg_groups[group_key] + reg_lr = group["lr"] + + for param_type in ["lora", "plus"]: + if len(group[param_type]) == 0: + continue + + param_data = {"params": group[param_type].values()} + + if param_type == "plus" and loraplus_ratio is not None: + param_data["lr"] = reg_lr * loraplus_ratio + else: + param_data["lr"] = reg_lr + + if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: + continue + + params.append(param_data) + desc = f"reg_lr_{group_key.split('_')[-1]}" + if param_type == "plus": + desc += " plus" + descriptions.append(desc) + + # process default groups for key in param_groups.keys(): param_data = {"params": param_groups[key].values()} diff --git a/train_network.py b/train_network.py index 6073c4c36..7861e7404 100644 --- a/train_network.py +++ b/train_network.py @@ -645,7 +645,7 @@ def train(self, args): net_kwargs = 
{} if args.network_args is not None: for net_arg in args.network_args: - key, value = net_arg.split("=") + key, value = net_arg.split("=", 1) net_kwargs[key] = value # if a new network is added in future, add if ~ then blocks for each network (;'∀') From af14eab6d7f81493d23a7b961e01084f52eb5adf Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 26 Jul 2025 19:37:15 +0900 Subject: [PATCH 504/582] doc: update section number for regex-based rank and learning rate configuration in FLUX.1 LoRA guide --- docs/flux_train_network.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 1b584180f..647e87c97 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -401,7 +401,7 @@ FLUX.1 LoRA学習では、network_argsの`train_double_block_indices`と`train_s -### 6.4. Regular Expression-based Rank/LR Configuration / 正規表現によるランク・学習率の指定 +### 6.5. Regular Expression-based Rank/LR Configuration / 正規表現によるランク・学習率の指定 You can specify ranks (dims) and learning rates for LoRA modules using regular expressions. This allows for more flexible and fine-grained control than specifying by layer. From 6c8973c2da72fe9112729bdac9fc1ca21e06945c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 28 Jul 2025 22:08:02 +0900 Subject: [PATCH 505/582] doc: add reference link for input vector gradient requirement in Chroma class --- library/chroma_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/library/chroma_models.py b/library/chroma_models.py index b9c54db41..0c93f5269 100644 --- a/library/chroma_models.py +++ b/library/chroma_models.py @@ -695,6 +695,7 @@ def forward( input_vec = self.get_input_vec(timesteps, guidance, img.shape[0]) # kohya-ss: I'm not sure why requires_grad is set to True here + # original code: https://github.com/lodestone-rock/flow/blob/c76f63058980d0488826936025889e256a2e0458/src/models/chroma/model.py#L217 input_vec.requires_grad = True mod_vectors = self.distilled_guidance_layer(input_vec) else: From 10de781806623c8acc4cab4e0427aed64491c50c Mon Sep 17 00:00:00 2001 From: kozistr Date: Mon, 28 Jul 2025 23:40:38 +0900 Subject: [PATCH 506/582] build(deps): pytorch-optimizer to 3.7.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 767d9e8eb..448af323c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ pytorch-lightning==1.9.0 bitsandbytes==0.44.0 lion-pytorch==0.0.6 schedulefree==1.4 -pytorch-optimizer==3.5.0 +pytorch-optimizer==3.7.0 prodigy-plus-schedule-free==1.9.0 prodigyopt==1.1.2 tensorboard From 450630c6bda18026c6017df088a8d73f89f67a60 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 29 Jul 2025 20:32:24 +0900 Subject: [PATCH 507/582] fix: create network from weights not working --- networks/lora_flux.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index 320bc4632..e9ad5f68d 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -841,8 +841,8 @@ def create_modules( logger.info(f"LoRA {lora_name} matched with regex {reg}, using dim: {dim}") break - # 通常、すべて対象とする - if dim is None: + # if modules_dim is None, we use default lora_dim. 
if modules_dim is not None, we use the specified dim (no default) + if dim is None and modules_dim is None: if is_linear or is_conv2d_1x1: dim = default_dim if default_dim is not None else self.lora_dim alpha = self.alpha From 96feb61c0a3d42f3526c09131090a33d2e5d8f23 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 30 Jul 2025 21:34:49 +0900 Subject: [PATCH 508/582] feat: implement modulation vector extraction for Chroma and update related methods --- flux_minimal_inference.py | 3 +++ flux_train_network.py | 15 ++++++++------- library/chroma_models.py | 28 ++++++++++------------------ library/flux_models.py | 6 +++--- 4 files changed, 24 insertions(+), 28 deletions(-) diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index 86e8e1b1f..d5f2d8d98 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -113,6 +113,8 @@ def denoise( y_input = b_vec + mod_vectors = model.get_mod_vectors(timesteps=t_vec, guidance=guidance_vec, batch_size=b_img.shape[0]) + pred = model( img=b_img, img_ids=b_img_ids, @@ -122,6 +124,7 @@ def denoise( timesteps=t_vec, guidance=guidance_vec, txt_attention_mask=b_t5_attn_mask, + mod_vectors=mod_vectors, ) # classifier free guidance diff --git a/flux_train_network.py b/flux_train_network.py index 13e9ae2a2..2d9ab2487 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -341,7 +341,8 @@ def get_noise_pred_and_target( guidance_vec = torch.full((bsz,), float(args.guidance_scale), device=accelerator.device) # get modulation vectors for Chroma - input_vec = unet.get_input_vec(timesteps=timesteps / 1000, guidance=guidance_vec, batch_size=bsz) + with accelerator.autocast(), torch.no_grad(): + mod_vectors = unet.get_mod_vectors(timesteps=timesteps / 1000, guidance=guidance_vec, batch_size=bsz) if args.gradient_checkpointing: noisy_model_input.requires_grad_(True) @@ -350,15 +351,15 @@ def get_noise_pred_and_target( t.requires_grad_(True) img_ids.requires_grad_(True) guidance_vec.requires_grad_(True) - if input_vec is not None: - input_vec.requires_grad_(True) + if mod_vectors is not None: + mod_vectors.requires_grad_(True) # Predict the noise residual l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds if not args.apply_t5_attn_mask: t5_attn_mask = None - def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask, input_vec): + def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t5_attn_mask, mod_vectors): # grad is enabled even if unet is not in train mode, because Text Encoder is in train mode with torch.set_grad_enabled(is_train), accelerator.autocast(): # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing) @@ -371,7 +372,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t timesteps=timesteps / 1000, guidance=guidance_vec, txt_attention_mask=t5_attn_mask, - input_vec=input_vec, + mod_vectors=mod_vectors, ) return model_pred @@ -384,7 +385,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t timesteps=timesteps, guidance_vec=guidance_vec, t5_attn_mask=t5_attn_mask, - input_vec=input_vec, + mod_vectors=mod_vectors, ) # unpack latents @@ -416,7 +417,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t timesteps=timesteps[diff_output_pr_indices], guidance_vec=guidance_vec[diff_output_pr_indices] if guidance_vec is not None else None, 
t5_attn_mask=t5_attn_mask[diff_output_pr_indices] if t5_attn_mask is not None else None, - input_vec=input_vec[diff_output_pr_indices] if input_vec is not None else None, + mod_vectors=mod_vectors[diff_output_pr_indices] if mod_vectors is not None else None, ) network.set_multiplier(1.0) # may be overwritten by "network_multipliers" in the next step diff --git a/library/chroma_models.py b/library/chroma_models.py index 0c93f5269..d5ac1f39e 100644 --- a/library/chroma_models.py +++ b/library/chroma_models.py @@ -641,7 +641,10 @@ def disable_gradient_checkpointing(self): print("Chroma: Gradient checkpointing disabled.") - def get_input_vec(self, timesteps: Tensor, guidance: Tensor | None = None, batch_size: int | None = None) -> Tensor: + def get_mod_vectors(self, timesteps: Tensor, guidance: Tensor | None = None, batch_size: int | None = None) -> Tensor: + # We extract this logic from forward to clarify the propagation of the gradients + # original comment: https://github.com/lodestone-rock/flow/blob/c76f63058980d0488826936025889e256a2e0458/src/models/chroma/model.py#L195 + # print(f"Chroma get_input_vec: timesteps {timesteps}, guidance: {guidance}, batch_size: {batch_size}") distill_timestep = timestep_embedding(timesteps, self.approximator_in_dim // 4) # TODO: need to add toggle to omit this from schnell but that's not a priority @@ -654,7 +657,9 @@ def get_input_vec(self, timesteps: Tensor, guidance: Tensor | None = None, batch timestep_guidance = torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, self.mod_index_length, 1) # then and only then we could concatenate it together input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1) - return input_vec + + mod_vectors = self.distilled_guidance_layer(input_vec) + return mod_vectors def forward( self, @@ -669,7 +674,7 @@ def forward( guidance: Tensor | None = None, txt_attention_mask: Tensor | None = None, attn_padding: int = 1, - input_vec: Tensor | None = None, + mod_vectors: Tensor | None = None, ) -> Tensor: # print( # f"Chroma forward: img shape {img.shape}, txt shape {txt.shape}, img_ids shape {img_ids.shape}, txt_ids shape {txt_ids.shape}" @@ -684,22 +689,9 @@ def forward( img = self.img_in(img) txt = self.txt_in(txt) - if input_vec is None: - # TODO: - # need to fix grad accumulation issue here for now it's in no grad mode - # besides, i don't want to wash out the PFP that's trained on this model weights anyway - # the fan out operation here is deleting the backward graph - # alternatively doing forward pass for every block manually is doable but slow - # custom backward probably be better + if mod_vectors is None: # fallback to the original logic with torch.no_grad(): - input_vec = self.get_input_vec(timesteps, guidance, img.shape[0]) - - # kohya-ss: I'm not sure why requires_grad is set to True here - # original code: https://github.com/lodestone-rock/flow/blob/c76f63058980d0488826936025889e256a2e0458/src/models/chroma/model.py#L217 - input_vec.requires_grad = True - mod_vectors = self.distilled_guidance_layer(input_vec) - else: - mod_vectors = self.distilled_guidance_layer(input_vec) + mod_vectors = self.get_mod_vectors(timesteps, guidance, img.shape[0]) mod_vectors_dict = distribute_modulations(mod_vectors, self.depth_single_blocks, self.depth_double_blocks) # calculate text length for each batch instead of masking diff --git a/library/flux_models.py b/library/flux_models.py index 63d699d49..d2d7e06c7 100644 --- a/library/flux_models.py +++ b/library/flux_models.py @@ -1009,8 +1009,8 @@ def 
prepare_block_swap_before_forward(self): self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) - def get_input_vec(self, timesteps: Tensor, guidance: Tensor | None = None, batch_size: int | None = None) -> Tensor: - return None # FLUX.1 does not use input_vec, but Chroma does. + def get_mod_vectors(self, timesteps: Tensor, guidance: Tensor | None = None, batch_size: int | None = None) -> Tensor: + return None # FLUX.1 does not use mod_vectors, but Chroma does. def forward( self, @@ -1024,7 +1024,7 @@ def forward( block_controlnet_single_hidden_states=None, guidance: Tensor | None = None, txt_attention_mask: Tensor | None = None, - input_vec: Tensor | None = None, + mod_vectors: Tensor | None = None, ) -> Tensor: if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") From 250f0eb9b051784f6f18bb223ea88860119a0172 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 30 Jul 2025 22:08:51 +0900 Subject: [PATCH 509/582] doc: update README and training guide with breaking changes for CFG scale and model download instructions --- README.md | 4 +-- docs/flux_train_network.md | 57 ++++++++++++++++++++++++++++++++------ 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 9ba1cbfc1..724bd3d84 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,8 @@ If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed ### Recent Updates -Jul XX, 2025: -- **Breaking Change**: For FLUX.1 and Chroma training, the CFG scale option for sample image generation during training has been changed from `--g` to `--l`. The `--g` option is now used for the embedded guidance scale. Please update your prompts accordingly. See [Sample Image Generation During Training](#sample-image-generation-during-training) for details. +Jul 30, 2025: +- **Breaking Change**: For FLUX.1 and Chroma training, the CFG (Classifier-Free Guidance, using negative prompts) scale option for sample image generation during training has been changed from `--g` to `--l`. The `--g` option is now used for the embedded guidance scale. Please update your prompts accordingly. See [Sample Image Generation During Training](#sample-image-generation-during-training) for details. - Support for [Chroma](https://huggingface.co/lodestones/Chroma) has been added in PR [#2157](https://github.com/kohya-ss/sd-scripts/pull/2157). Thank you to lodestones for the high-quality model. - Chroma is a new model based on FLUX.1 schnell. In this repository, `flux_train_network.py` is used for training LoRAs for Chroma with `--model_type chroma`. diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 647e87c97..2bf3bfb24 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -71,6 +71,21 @@ Before starting training you need: 4. **AutoEncoder model file:** FLUX.1-compatible AE model `.safetensors` file (e.g., `ae.safetensors`). 5. **Dataset definition file (.toml):** TOML format file describing training dataset configuration (e.g., `my_flux_dataset_config.toml`). +### Downloading Required Models + +To train FLUX.1 models, you need to download the following model files: + +- **DiT, AE**: Download from the [black-forest-labs/FLUX.1 dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) repository. Use `flux1-dev.safetensors` and `ae.safetensors`. The weights in the subfolder are in Diffusers format and cannot be used. 
+- **Text Encoder 1 (T5-XXL), Text Encoder 2 (CLIP-L)**: Download from the [ComfyUI FLUX Text Encoders](https://huggingface.co/comfyanonymous/flux_text_encoders) repository. Please use `t5xxl_fp16.safetensors` for T5-XXL. Thanks to ComfyUI for providing these models.
+
+To train Chroma models, you need to download the Chroma model file from the following repository:
+
+- **Chroma Base**: Download from the [lodestones/Chroma1-Base](https://huggingface.co/lodestones/Chroma1-Base) repository. Use `Chroma.safetensors`.
+
+We have tested Chroma training with the weights from the [lodestones/Chroma](https://huggingface.co/lodestones/Chroma) repository.
+
+The AE and T5-XXL models are the same as for FLUX.1, so you can use the same files. The CLIP-L model is not used for Chroma training, so you can omit the `--clip_l` argument.
+
日本語 @@ -84,6 +99,21 @@ Before starting training you need: 4. **AutoEncoderモデルファイル:** FLUX.1に対応するAEモデルの`.safetensors`ファイル。例として`ae.safetensors`を使用します。 5. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](link/to/dataset/config/doc)を参照してください)。例として`my_flux_dataset_config.toml`を使用します。 +**必要なモデルのダウンロード** + +FLUX.1モデルを学習するためには、以下のモデルファイルをダウンロードする必要があります。 + +- **DiT, AE**: [black-forest-labs/FLUX.1 dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) リポジトリからダウンロードします。`flux1-dev.safetensors`と`ae.safetensors`を使用してください。サブフォルダ内の重みはDiffusers形式であり、使用できません。 +- **Text Encoder 1 (T5-XXL), Text Encoder 2 (CLIP-L)**: [ComfyUI FLUX Text Encoders](https://huggingface.co/comfyanonymous/flux_text_encoders) リポジトリからダウンロードします。T5-XXLには`t5xxl_fp16.safetensors`を使用してください。これらのモデルを提供いただいたComfyUIに感謝します。 + +Chromaモデルを学習する場合は、以下のリポジトリからChromaモデルファイルをダウンロードする必要があります。 + +- **Chroma Base**: [lodestones/Chroma1-Base](https://huggingface.co/lodestones/Chroma1-Base) リポジトリからダウンロードします。`Chroma.safetensors`を使用してください。 + +Chromaの学習のテストは [lodestones/Chroma](https://huggingface.co/lodestones/Chroma) リポジトリの重みを使用して行いました。 + +AEとT5-XXLモデルはFLUX.1と同じものを使用できるため、同じファイルを使用します。CLIP-LモデルはChroma学習では使用されないため、`--clip_l`引数は省略できます。 +
## 4. Running the Training / 学習の実行 @@ -140,6 +170,12 @@ accelerate launch --num_cpu_threads_per_process 1 flux_train_network.py \ Note that for Chroma models, `--guidance_scale=0.0` is required to disable guidance scale, and `--apply_t5_attn_mask` is needed to apply attention masks for T5XXL Text Encoder. +The sample image generation during training requires specifying a negative prompt. Also, set `--g 0` to disable embedded guidance scale and `--l 4.0` to set the CFG scale. For example: + +``` +Japanese shrine in the summer forest. --n low quality, ugly, unfinished, out of focus, deformed, disfigure, blurry, smudged, restricted palette, flat colors --w 512 --h 512 --d 1 --l 4.0 --g 0.0 --s 20 +``` +
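+For FLUX.1 dev without a negative prompt, specifying only the embedded guidance scale is usually sufficient (the prompt and values below are illustrative, not a recommendation):
+
+```
+A cat holding a sign that says hello world --w 1024 --h 1024 --d 1 --g 3.5 --s 20
+```
+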
日本語 @@ -153,6 +189,8 @@ Chromaモデルを学習したい場合は、`--model_type=chroma`を指定し コマンドラインの例は英語のドキュメントを参照してください。 +学習中のサンプル画像生成には、ネガティブプロンプトを指定してください。また `--g 0` を指定して埋め込みガイダンススケールを無効化し、`--l 4.0` を指定してCFGスケールを設定します。 +
### 4.1. Explanation of Key Options / 主要なコマンドライン引数の解説 @@ -314,9 +352,12 @@ Based on experiments, the following settings work well: --timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 ``` -**About Guidance Scale**: FLUX.1 dev version is distilled with specific guidance scale values, but for training, specify `--guidance_scale 1.0` to disable guidance scale. +For Chroma models, the following settings are recommended: +``` +--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 0.0 +``` -`--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 0.0` is recommended for Chroma models. +**About Guidance Scale**: FLUX.1 dev version is distilled with specific guidance scale values, but for training, specify `--guidance_scale 1.0` to disable guidance scale. Chroma requires `--guidance_scale 0.0` to disable guidance scale because it is not distilled.
日本語 @@ -396,9 +437,6 @@ FLUX.1 LoRA学習では、network_argsの`train_double_block_indices`と`train_s 詳細な設定方法とコマンドラインの例は英語のドキュメントを参照してください。 -
- - ### 6.5. Regular Expression-based Rank/LR Configuration / 正規表現によるランク・学習率の指定 @@ -607,10 +645,11 @@ Several related scripts are provided for models trained with `flux_train_network `flux_train_network.py` で学習したモデルや、学習プロセスに役立つ関連スクリプトが提供されています: -* **`networks/flux_extract_lora.py`**: 学習済みモデルとベースモデルの差分から LoRA モデルを抽出 -* **`convert_flux_lora.py`**: 学習した LoRA モデルを Diffusers (AI-Toolkit) 形式など他の形式に変換 -* **`networks/flux_merge_lora.py`**: 学習した LoRA モデルを FLUX.1 ベースモデルにマージ -* **`flux_minimal_inference.py`**: 学習した LoRA モデルを適用して画像を生成するシンプルな推論スクリプト +* **`networks/flux_extract_lora.py`**: 学習済みモデルとベースモデルの差分から LoRA モデルを抽出。 +* **`convert_flux_lora.py`**: 学習した LoRA モデルを Diffusers (AI-Toolkit) 形式など他の形式に変換。 +* **`networks/flux_merge_lora.py`**: 学習した LoRA モデルを FLUX.1 ベースモデルにマージ。 +* **`flux_minimal_inference.py`**: 学習した LoRA モデルを適用して画像を生成するシンプルな推論スクリプト。 + `--model_type` 引数で `flux` または `chroma` を指定できます。 From bd6418a940cad7ae88df3d849617f33ad2f5bd9d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 1 Aug 2025 21:47:38 +0900 Subject: [PATCH 510/582] fix: add assertion for apply_t5_attn_mask requirement in Chroma --- flux_train_network.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flux_train_network.py b/flux_train_network.py index 2d9ab2487..cfc617088 100644 --- a/flux_train_network.py +++ b/flux_train_network.py @@ -51,6 +51,7 @@ def assert_extra_args( self.use_clip_l = True else: self.use_clip_l = False # Chroma does not use CLIP-L + assert args.apply_t5_attn_mask, "apply_t5_attn_mask must be True for Chroma / Chromaではapply_t5_attn_maskを指定する必要があります" if args.fp8_base_unet: args.fp8_base = True # if fp8_base_unet is enabled, fp8_base is also enabled for FLUX.1 From 5249732a0fbe8b72a3b9fe5758ac5ad5906b283a Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 1 Aug 2025 23:38:02 +0900 Subject: [PATCH 511/582] chore: update README to include `--apply_t5_attn_mask` requirement for Chroma training #2163 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 724bd3d84..be0ae4064 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Jul 30, 2025: - **Breaking Change**: For FLUX.1 and Chroma training, the CFG (Classifier-Free Guidance, using negative prompts) scale option for sample image generation during training has been changed from `--g` to `--l`. The `--g` option is now used for the embedded guidance scale. Please update your prompts accordingly. See [Sample Image Generation During Training](#sample-image-generation-during-training) for details. - Support for [Chroma](https://huggingface.co/lodestones/Chroma) has been added in PR [#2157](https://github.com/kohya-ss/sd-scripts/pull/2157). Thank you to lodestones for the high-quality model. - - Chroma is a new model based on FLUX.1 schnell. In this repository, `flux_train_network.py` is used for training LoRAs for Chroma with `--model_type chroma`. + - Chroma is a new model based on FLUX.1 schnell. In this repository, `flux_train_network.py` is used for training LoRAs for Chroma with `--model_type chroma`. `--apply_t5_attn_mask` is also needed for Chroma training. - Please refer to the [FLUX.1 LoRA training documentation](./docs/flux_train_network.md) for more details. 
Jul 21, 2025: From b9c091eafcca028d4edfce4a407321442682d07e Mon Sep 17 00:00:00 2001 From: Dave Lage Date: Sat, 2 Aug 2025 17:19:26 -0400 Subject: [PATCH 512/582] Fix validation documentation --- docs/flux_train_network.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 2bf3bfb24..690c9c899 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -547,21 +547,21 @@ resolution = [768, 768] You can calculate validation loss during training using a validation dataset to evaluate model generalization performance. -To set up validation, add a `[validation]` section to your dataset configuration TOML file. Configuration is similar to training datasets, but `num_repeats` is usually set to 1. +To set up validation, add a `validation_split` and optionally `validation_seed` to your dataset configuration TOML file. ```toml -# ... (training dataset configuration) ... - -[validation] -batch_size = 1 +[[datasets]] enable_bucket = true -resolution = [1024, 1024] # Resolution for validation +resolution = [1024, 1024] +validation_seed = 42 # [Optional] Validation seed, otherwise uses training seed for validation split . - [[validation.subsets]] - image_dir = "path/to/validation/images" - num_repeats = 1 - caption_extension = ".txt" - # ... other validation dataset settings ... + [[datasets.subsets]] + image_dir = "path/to/image/directory" + validation_split = 0.1 # Split between 0.0 and 1.0 where 1.0 will use the full subset as a valiation dataset + + [[datasets.subsets]] + image_dir = "path/to/image/full_validation" + validation_split = 1.0 # Split between 0.0 and 1.0 where 1.0 will use the full subset as a valiation dataset ``` **Notes:** From 24c605ee3bec841a375fbb47822e671f74684796 Mon Sep 17 00:00:00 2001 From: Dave Lage Date: Sat, 2 Aug 2025 17:21:25 -0400 Subject: [PATCH 513/582] Update flux_train_network.md --- docs/flux_train_network.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 690c9c899..2b4afd401 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -557,11 +557,11 @@ validation_seed = 42 # [Optional] Validation seed, otherwise uses training seed [[datasets.subsets]] image_dir = "path/to/image/directory" - validation_split = 0.1 # Split between 0.0 and 1.0 where 1.0 will use the full subset as a valiation dataset + validation_split = 0.1 # Split between 0.0 and 1.0 where 1.0 will use the full subset as a validation dataset [[datasets.subsets]] image_dir = "path/to/image/full_validation" - validation_split = 1.0 # Split between 0.0 and 1.0 where 1.0 will use the full subset as a valiation dataset + validation_split = 1.0 # Split between 0.0 and 1.0 where 1.0 will use the full subset as a validation dataset ``` **Notes:** From 0ad2cb854de3ea4124cb6ab56aee432796919f8b Mon Sep 17 00:00:00 2001 From: Dave Lage Date: Sat, 2 Aug 2025 17:27:55 -0400 Subject: [PATCH 514/582] Update flux_train_network.md --- docs/flux_train_network.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 2b4afd401..23828eb71 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -561,7 +561,7 @@ validation_seed = 42 # [Optional] Validation seed, otherwise uses training seed [[datasets.subsets]] image_dir = "path/to/image/full_validation" - validation_split = 1.0 # Split between 0.0 and 1.0 where 
1.0 will use the full subset as a validation dataset + validation_split = 1.0 # Will use this full subset as a validation subset. ``` **Notes:** From d24d733892a6de393267111b32f4a56e896e1f64 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sat, 2 Aug 2025 21:14:27 -0400 Subject: [PATCH 515/582] Update model spec to 1.0.1. Refactor model spec --- library/sai_model_spec.py | 663 ++++++++++++++++++++++++++++++-------- library/train_util.py | 130 +++++--- 2 files changed, 624 insertions(+), 169 deletions(-) diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index bb4bea401..8b1224842 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -1,14 +1,19 @@ # based on https://github.com/Stability-AI/ModelSpec import datetime import hashlib +import argparse +import base64 +import logging +import mimetypes +import subprocess +from dataclasses import dataclass, field from io import BytesIO import os -from typing import List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import safetensors from library.utils import setup_logging setup_logging() -import logging logger = logging.getLogger(__name__) @@ -31,23 +36,44 @@ """ BASE_METADATA = { - # === Must === - "modelspec.sai_model_spec": "1.0.0", # Required version ID for the spec + # === Universal MUST fields === + "modelspec.sai_model_spec": "1.0.1", # Updated to latest spec version "modelspec.architecture": None, "modelspec.implementation": None, "modelspec.title": None, - "modelspec.resolution": None, - # === Should === + + # === Universal SHOULD fields === "modelspec.description": None, "modelspec.author": None, "modelspec.date": None, - # === Can === + "modelspec.hash_sha256": None, + + # === Universal CAN fields === + "modelspec.implementation_version": None, "modelspec.license": None, + "modelspec.usage_hint": None, + "modelspec.thumbnail": None, "modelspec.tags": None, "modelspec.merged_from": None, + + # === Image generation MUST fields === + "modelspec.resolution": None, + + # === Image generation CAN fields === + "modelspec.trigger_phrase": None, "modelspec.prediction_type": None, "modelspec.timestep_range": None, "modelspec.encoder_layer": None, + "modelspec.preprocessor": None, + "modelspec.is_negative_embedding": None, + "modelspec.unet_dtype": None, + "modelspec.vae_dtype": None, + + # === Text prediction fields === + "modelspec.data_format": None, + "modelspec.format_type": None, + "modelspec.language": None, + "modelspec.format_template": None, } # 別に使うやつだけ定義 @@ -80,6 +106,256 @@ PRED_TYPE_V = "v" +@dataclass +class ModelSpecMetadata: + """ + ModelSpec 1.0.1 compliant metadata for safetensors models. + All fields correspond to modelspec.* keys in the final metadata. 
+ """ + + # === Universal MUST fields === + architecture: str + implementation: str + title: str + + # === Universal SHOULD fields === + description: Optional[str] = None + author: Optional[str] = None + date: Optional[str] = None + hash_sha256: Optional[str] = None + + # === Universal CAN fields === + sai_model_spec: str = "1.0.1" + implementation_version: Optional[str] = None + license: Optional[str] = None + usage_hint: Optional[str] = None + thumbnail: Optional[str] = None + tags: Optional[str] = None + merged_from: Optional[str] = None + + # === Image generation MUST fields === + resolution: Optional[str] = None + + # === Image generation CAN fields === + trigger_phrase: Optional[str] = None + prediction_type: Optional[str] = None + timestep_range: Optional[str] = None + encoder_layer: Optional[str] = None + preprocessor: Optional[str] = None + is_negative_embedding: Optional[str] = None + unet_dtype: Optional[str] = None + vae_dtype: Optional[str] = None + + # === Text prediction fields === + data_format: Optional[str] = None + format_type: Optional[str] = None + language: Optional[str] = None + format_template: Optional[str] = None + + # === Additional metadata === + additional_fields: Dict[str, str] = field(default_factory=dict) + + def to_metadata_dict(self) -> Dict[str, str]: + """Convert dataclass to metadata dictionary with modelspec. prefixes.""" + metadata = {} + + # Add all non-None fields with modelspec prefix + for field_name, value in self.__dict__.items(): + if field_name == "additional_fields": + # Handle additional fields separately + for key, val in value.items(): + if key.startswith("modelspec."): + metadata[key] = val + else: + metadata[f"modelspec.{key}"] = val + elif value is not None: + metadata[f"modelspec.{field_name}"] = value + + return metadata + + @classmethod + def from_args(cls, args, **kwargs) -> "ModelSpecMetadata": + """Create ModelSpecMetadata from argparse Namespace, extracting metadata_* fields.""" + metadata_fields = {} + + # Extract all metadata_* attributes from args + for attr_name in dir(args): + if attr_name.startswith("metadata_") and not attr_name.startswith("metadata___"): + value = getattr(args, attr_name, None) + if value is not None: + # Remove metadata_ prefix + field_name = attr_name[9:] # len("metadata_") = 9 + metadata_fields[field_name] = value + + # Handle known standard fields + standard_fields = { + "author": metadata_fields.pop("author", None), + "description": metadata_fields.pop("description", None), + "license": metadata_fields.pop("license", None), + "tags": metadata_fields.pop("tags", None), + } + + # Remove None values + standard_fields = {k: v for k, v in standard_fields.items() if v is not None} + + # Merge with kwargs and remaining metadata fields + all_fields = {**standard_fields, **kwargs} + if metadata_fields: + all_fields["additional_fields"] = metadata_fields + + return cls(**all_fields) + + +def determine_architecture( + v2: bool, + v_parameterization: bool, + sdxl: bool, + lora: bool, + textual_inversion: bool, + model_config: Optional[dict] = None +) -> str: + """Determine model architecture string from parameters.""" + + model_config = model_config or {} + + if sdxl: + arch = ARCH_SD_XL_V1_BASE + elif "sd3" in model_config: + arch = ARCH_SD3_M + "-" + model_config["sd3"] + elif "flux" in model_config: + flux_type = model_config["flux"] + if flux_type == "dev": + arch = ARCH_FLUX_1_DEV + elif flux_type == "schnell": + arch = ARCH_FLUX_1_SCHNELL + elif flux_type == "chroma": + arch = ARCH_FLUX_1_CHROMA + else: + 
arch = ARCH_FLUX_1_UNKNOWN + elif "lumina" in model_config: + lumina_type = model_config["lumina"] + if lumina_type == "lumina2": + arch = ARCH_LUMINA_2 + else: + arch = ARCH_LUMINA_UNKNOWN + elif v2: + arch = ARCH_SD_V2_768_V if v_parameterization else ARCH_SD_V2_512 + else: + arch = ARCH_SD_V1 + + # Add adapter suffix + if lora: + arch += f"/{ADAPTER_LORA}" + elif textual_inversion: + arch += f"/{ADAPTER_TEXTUAL_INVERSION}" + + return arch + + +def determine_implementation( + lora: bool, + textual_inversion: bool, + sdxl: bool, + model_config: Optional[dict] = None, + is_stable_diffusion_ckpt: Optional[bool] = None +) -> str: + """Determine implementation string from parameters.""" + + model_config = model_config or {} + + if "flux" in model_config: + if model_config["flux"] == "chroma": + return IMPL_CHROMA + else: + return IMPL_FLUX + elif "lumina" in model_config: + return IMPL_LUMINA + elif (lora and sdxl) or textual_inversion or is_stable_diffusion_ckpt: + return IMPL_STABILITY_AI + else: + return IMPL_DIFFUSERS + + +def get_implementation_version() -> str: + """Get the current implementation version as sd-scripts/{commit_hash}.""" + try: + # Get the git commit hash + result = subprocess.run( + ["git", "rev-parse", "HEAD"], + capture_output=True, + text=True, + cwd=os.path.dirname(os.path.dirname(__file__)), # Go up to sd-scripts root + timeout=5 + ) + + if result.returncode == 0: + commit_hash = result.stdout.strip() + return f"sd-scripts/{commit_hash}" + else: + logger.warning("Failed to get git commit hash, using fallback") + return "sd-scripts/unknown" + + except (subprocess.TimeoutExpired, subprocess.SubprocessError, FileNotFoundError) as e: + logger.warning(f"Could not determine git commit: {e}") + return "sd-scripts/unknown" + + +def file_to_data_url(file_path: str) -> str: + """Convert a file path to a data URL for embedding in metadata.""" + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + # Get MIME type + mime_type, _ = mimetypes.guess_type(file_path) + if mime_type is None: + # Default to binary if we can't detect + mime_type = "application/octet-stream" + + # Read file and encode as base64 + with open(file_path, "rb") as f: + file_data = f.read() + + encoded_data = base64.b64encode(file_data).decode("ascii") + + return f"data:{mime_type};base64,{encoded_data}" + + +def determine_resolution( + reso: Optional[Union[int, Tuple[int, int]]] = None, + sdxl: bool = False, + model_config: Optional[dict] = None, + v2: bool = False, + v_parameterization: bool = False +) -> str: + """Determine resolution string from parameters.""" + + model_config = model_config or {} + + if reso is not None: + # Handle comma separated string + if isinstance(reso, str): + reso = tuple(map(int, reso.split(","))) + # Handle single int + if isinstance(reso, int): + reso = (reso, reso) + # Handle single-element tuple + if len(reso) == 1: + reso = (reso[0], reso[0]) + else: + # Determine default resolution based on model type + if (sdxl or + "sd3" in model_config or + "flux" in model_config or + "lumina" in model_config): + reso = (1024, 1024) + elif v2 and v_parameterization: + reso = (768, 768) + else: + reso = (512, 512) + + return f"{reso[0]}x{reso[1]}" + + def load_bytes_in_safetensors(tensors): bytes = safetensors.torch.save(tensors) b = BytesIO(bytes) @@ -109,7 +385,7 @@ def update_hash_sha256(metadata: dict, state_dict: dict): raise NotImplementedError -def build_metadata( +def build_metadata_dataclass( state_dict: Optional[dict], v2: bool, 
v_parameterization: bool, @@ -127,75 +403,28 @@ def build_metadata( merged_from: Optional[str] = None, timesteps: Optional[Tuple[int, int]] = None, clip_skip: Optional[int] = None, - sd3: Optional[str] = None, - flux: Optional[str] = None, - lumina: Optional[str] = None, -): + model_config: Optional[dict] = None, + optional_metadata: Optional[dict] = None, +) -> ModelSpecMetadata: """ - sd3: only supports "m", flux: supports "dev", "schnell" or "chroma" + Build ModelSpec 1.0.1 compliant metadata dataclass. + + Args: + model_config: Dict containing model type info, e.g. {"flux": "dev"}, {"sd3": "large"} + optional_metadata: Dict of additional metadata fields to include """ - # if state_dict is None, hash is not calculated - - metadata = {} - metadata.update(BASE_METADATA) - - # TODO メモリを消費せずかつ正しいハッシュ計算の方法がわかったら実装する - # if state_dict is not None: - # hash = precalculate_safetensors_hashes(state_dict) - # metadata["modelspec.hash_sha256"] = hash - - if sdxl: - arch = ARCH_SD_XL_V1_BASE - elif sd3 is not None: - arch = ARCH_SD3_M + "-" + sd3 - elif flux is not None: - if flux == "dev": - arch = ARCH_FLUX_1_DEV - elif flux == "schnell": - arch = ARCH_FLUX_1_SCHNELL - elif flux == "chroma": - arch = ARCH_FLUX_1_CHROMA - else: - arch = ARCH_FLUX_1_UNKNOWN - elif lumina is not None: - if lumina == "lumina2": - arch = ARCH_LUMINA_2 - else: - arch = ARCH_LUMINA_UNKNOWN - elif v2: - if v_parameterization: - arch = ARCH_SD_V2_768_V - else: - arch = ARCH_SD_V2_512 - else: - arch = ARCH_SD_V1 - - if lora: - arch += f"/{ADAPTER_LORA}" - elif textual_inversion: - arch += f"/{ADAPTER_TEXTUAL_INVERSION}" - - metadata["modelspec.architecture"] = arch + + # Use helper functions for complex logic + architecture = determine_architecture( + v2, v_parameterization, sdxl, lora, textual_inversion, model_config + ) if not lora and not textual_inversion and is_stable_diffusion_ckpt is None: is_stable_diffusion_ckpt = True # default is stable diffusion ckpt if not lora and not textual_inversion - if flux is not None: - # Flux - if flux == "chroma": - impl = IMPL_CHROMA - else: - impl = IMPL_FLUX - elif lumina is not None: - # Lumina - impl = IMPL_LUMINA - elif (lora and sdxl) or textual_inversion or is_stable_diffusion_ckpt: - # Stable Diffusion ckpt, TI, SDXL LoRA - impl = IMPL_STABILITY_AI - else: - # v1/v2 LoRA or Diffusers - impl = IMPL_DIFFUSERS - metadata["modelspec.implementation"] = impl + implementation = determine_implementation( + lora, textual_inversion, sdxl, model_config, is_stable_diffusion_ckpt + ) if title is None: if lora: @@ -205,88 +434,141 @@ def build_metadata( else: title = "Checkpoint" title += f"@{timestamp}" - metadata[MODELSPEC_TITLE] = title - - if author is not None: - metadata["modelspec.author"] = author - else: - del metadata["modelspec.author"] - - if description is not None: - metadata["modelspec.description"] = description - else: - del metadata["modelspec.description"] - - if merged_from is not None: - metadata["modelspec.merged_from"] = merged_from - else: - del metadata["modelspec.merged_from"] - - if license is not None: - metadata["modelspec.license"] = license - else: - del metadata["modelspec.license"] - - if tags is not None: - metadata["modelspec.tags"] = tags - else: - del metadata["modelspec.tags"] # remove microsecond from time int_ts = int(timestamp) - # time to iso-8601 compliant date date = datetime.datetime.fromtimestamp(int_ts).isoformat() - metadata["modelspec.date"] = date - if reso is not None: - # comma separated to tuple - if isinstance(reso, str): - reso = 
tuple(map(int, reso.split(","))) - if len(reso) == 1: - reso = (reso[0], reso[0]) - else: - # resolution is defined in dataset, so use default - if sdxl or sd3 is not None or flux is not None or lumina is not None: - reso = 1024 - elif v2 and v_parameterization: - reso = 768 - else: - reso = 512 - if isinstance(reso, int): - reso = (reso, reso) - - metadata["modelspec.resolution"] = f"{reso[0]}x{reso[1]}" + # Use helper function for resolution + resolution = determine_resolution( + reso, sdxl, model_config, v2, v_parameterization + ) - if flux is not None: - del metadata["modelspec.prediction_type"] - elif v_parameterization: - metadata["modelspec.prediction_type"] = PRED_TYPE_V - else: - metadata["modelspec.prediction_type"] = PRED_TYPE_EPSILON + # Handle prediction type - Flux models don't use prediction_type + model_config = model_config or {} + prediction_type = None + if "flux" not in model_config: + if v_parameterization: + prediction_type = PRED_TYPE_V + else: + prediction_type = PRED_TYPE_EPSILON + # Handle timesteps + timestep_range = None if timesteps is not None: if isinstance(timesteps, str) or isinstance(timesteps, int): timesteps = (timesteps, timesteps) if len(timesteps) == 1: timesteps = (timesteps[0], timesteps[0]) - metadata["modelspec.timestep_range"] = f"{timesteps[0]},{timesteps[1]}" - else: - del metadata["modelspec.timestep_range"] + timestep_range = f"{timesteps[0]},{timesteps[1]}" + # Handle encoder layer (clip skip) + encoder_layer = None if clip_skip is not None: - metadata["modelspec.encoder_layer"] = f"{clip_skip}" - else: - del metadata["modelspec.encoder_layer"] + encoder_layer = f"{clip_skip}" - # # assert all values are filled - # assert all([v is not None for v in metadata.values()]), metadata - if not all([v is not None for v in metadata.values()]): - logger.error(f"Internal error: some metadata values are None: {metadata}") + # TODO: Implement hash calculation when memory-efficient method is available + # hash_sha256 = None + # if state_dict is not None: + # hash_sha256 = precalculate_safetensors_hashes(state_dict) + + # Process thumbnail - convert file path to data URL if needed + processed_optional_metadata = optional_metadata.copy() if optional_metadata else {} + if "thumbnail" in processed_optional_metadata: + thumbnail_value = processed_optional_metadata["thumbnail"] + # Check if it's already a data URL or if it's a file path + if thumbnail_value and not thumbnail_value.startswith("data:"): + try: + processed_optional_metadata["thumbnail"] = file_to_data_url(thumbnail_value) + logger.info(f"Converted thumbnail file {thumbnail_value} to data URL") + except FileNotFoundError as e: + logger.warning(f"Thumbnail file not found, skipping: {e}") + del processed_optional_metadata["thumbnail"] + except Exception as e: + logger.warning(f"Failed to convert thumbnail to data URL: {e}") + del processed_optional_metadata["thumbnail"] + + # Automatically set implementation version if not provided + if "implementation_version" not in processed_optional_metadata: + processed_optional_metadata["implementation_version"] = get_implementation_version() + + # Create the dataclass + metadata = ModelSpecMetadata( + architecture=architecture, + implementation=implementation, + title=title, + description=description, + author=author, + date=date, + license=license, + tags=tags, + merged_from=merged_from, + resolution=resolution, + prediction_type=prediction_type, + timestep_range=timestep_range, + encoder_layer=encoder_layer, + additional_fields=processed_optional_metadata + 
) return metadata +def build_metadata( + state_dict: Optional[dict], + v2: bool, + v_parameterization: bool, + sdxl: bool, + lora: bool, + textual_inversion: bool, + timestamp: float, + title: Optional[str] = None, + reso: Optional[Union[int, Tuple[int, int]]] = None, + is_stable_diffusion_ckpt: Optional[bool] = None, + author: Optional[str] = None, + description: Optional[str] = None, + license: Optional[str] = None, + tags: Optional[str] = None, + merged_from: Optional[str] = None, + timesteps: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, + model_config: Optional[dict] = None, + optional_metadata: Optional[dict] = None, +) -> Dict[str, str]: + """ + Build ModelSpec 1.0.1 compliant metadata for safetensors models. + Legacy function that returns dict - prefer build_metadata_dataclass for new code. + + Args: + model_config: Dict containing model type info, e.g. {"flux": "dev"}, {"sd3": "large"} + optional_metadata: Dict of additional metadata fields to include + """ + # Use the dataclass function and convert to dict + metadata_obj = build_metadata_dataclass( + state_dict=state_dict, + v2=v2, + v_parameterization=v_parameterization, + sdxl=sdxl, + lora=lora, + textual_inversion=textual_inversion, + timestamp=timestamp, + title=title, + reso=reso, + is_stable_diffusion_ckpt=is_stable_diffusion_ckpt, + author=author, + description=description, + license=license, + tags=tags, + merged_from=merged_from, + timesteps=timesteps, + clip_skip=clip_skip, + model_config=model_config, + optional_metadata=optional_metadata, + ) + + return metadata_obj.to_metadata_dict() + + # region utils @@ -317,6 +599,121 @@ def get_title(model: str): return ", ".join(titles) +def add_model_spec_arguments(parser: argparse.ArgumentParser): + """Add all ModelSpec metadata arguments to the parser.""" + + # === Existing standard metadata fields === + parser.add_argument( + "--metadata_title", + type=str, + default=None, + help="title for model metadata (default is output_name) / メタデータに書き込まれるモデルタイトル、省略時はoutput_name", + ) + parser.add_argument( + "--metadata_author", + type=str, + default=None, + help="author name for model metadata / メタデータに書き込まれるモデル作者名", + ) + parser.add_argument( + "--metadata_description", + type=str, + default=None, + help="description for model metadata / メタデータに書き込まれるモデル説明", + ) + parser.add_argument( + "--metadata_license", + type=str, + default=None, + help="license for model metadata / メタデータに書き込まれるモデルライセンス", + ) + parser.add_argument( + "--metadata_tags", + type=str, + default=None, + help="tags for model metadata, separated by comma / メタデータに書き込まれるモデルタグ、カンマ区切り", + ) + + # === Universal CAN fields === + # Note: implementation_version is automatically set to sd-scripts/{commit_hash} + parser.add_argument( + "--metadata_usage_hint", + type=str, + default=None, + help="usage hint for model metadata / メタデータに書き込まれる使用方法のヒント", + ) + parser.add_argument( + "--metadata_thumbnail", + type=str, + default=None, + help="thumbnail image as data URL or file path (will be converted to data URL) for model metadata / メタデータに書き込まれるサムネイル画像(データURLまたはファイルパス、ファイルパスの場合はデータURLに変換されます)", + ) + parser.add_argument( + "--metadata_merged_from", + type=str, + default=None, + help="source models for merged model metadata / メタデータに書き込まれるマージ元モデル名", + ) + + # === Image generation CAN fields === + parser.add_argument( + "--metadata_trigger_phrase", + type=str, + default=None, + help="trigger phrase for model metadata / メタデータに書き込まれるトリガーフレーズ", + ) + parser.add_argument( + "--metadata_preprocessor", + type=str, + 
default=None, + help="preprocessor used for model metadata / メタデータに書き込まれる前処理手法", + ) + parser.add_argument( + "--metadata_is_negative_embedding", + type=str, + default=None, + help="whether this is a negative embedding for model metadata / メタデータに書き込まれるネガティブ埋め込みかどうか", + ) + parser.add_argument( + "--metadata_unet_dtype", + type=str, + default=None, + help="UNet data type for model metadata / メタデータに書き込まれるUNetのデータ型", + ) + parser.add_argument( + "--metadata_vae_dtype", + type=str, + default=None, + help="VAE data type for model metadata / メタデータに書き込まれるVAEのデータ型", + ) + + # === Text prediction fields === + parser.add_argument( + "--metadata_data_format", + type=str, + default=None, + help="data format for text prediction model metadata / メタデータに書き込まれるテキスト予測モデルのデータ形式", + ) + parser.add_argument( + "--metadata_format_type", + type=str, + default=None, + help="format type for text prediction model metadata / メタデータに書き込まれるテキスト予測モデルの形式タイプ", + ) + parser.add_argument( + "--metadata_language", + type=str, + default=None, + help="language for text prediction model metadata / メタデータに書き込まれるテキスト予測モデルの言語", + ) + parser.add_argument( + "--metadata_format_template", + type=str, + default=None, + help="format template for text prediction model metadata / メタデータに書き込まれるテキスト予測モデルの形式テンプレート", + ) + + # endregion diff --git a/library/train_util.py b/library/train_util.py index c866dec2a..395183957 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3484,6 +3484,7 @@ def get_sai_model_spec( sd3: str = None, flux: str = None, # "dev", "schnell" or "chroma" lumina: str = None, + optional_metadata: dict[str, str] | None = None ): timestamp = time.time() @@ -3500,6 +3501,34 @@ def get_sai_model_spec( else: timesteps = None + # Convert individual model parameters to model_config dict + # TODO: Update calls to this function to pass in the model config + model_config = {} + if sd3 is not None: + model_config["sd3"] = sd3 + if flux is not None: + model_config["flux"] = flux + if lumina is not None: + model_config["lumina"] = lumina + + # Extract metadata_* fields from args and merge with optional_metadata + extracted_metadata = {} + + # Extract all metadata_* attributes from args + for attr_name in dir(args): + if attr_name.startswith("metadata_") and not attr_name.startswith("metadata___"): + value = getattr(args, attr_name, None) + if value is not None: + # Remove metadata_ prefix and exclude already handled fields + field_name = attr_name[9:] # len("metadata_") = 9 + if field_name not in ["title", "author", "description", "license", "tags"]: + extracted_metadata[field_name] = value + + # Merge extracted metadata with provided optional_metadata + all_optional_metadata = {**extracted_metadata} + if optional_metadata: + all_optional_metadata.update(optional_metadata) + metadata = sai_model_spec.build_metadata( state_dict, v2, @@ -3517,13 +3546,75 @@ def get_sai_model_spec( tags=args.metadata_tags, timesteps=timesteps, clip_skip=args.clip_skip, # None or int - sd3=sd3, - flux=flux, - lumina=lumina, + model_config=model_config, + optional_metadata=all_optional_metadata if all_optional_metadata else None, ) return metadata +def get_sai_model_spec_dataclass( + state_dict: dict, + args: argparse.Namespace, + sdxl: bool, + lora: bool, + textual_inversion: bool, + is_stable_diffusion_ckpt: Optional[bool] = None, + sd3: str = None, + flux: str = None, + lumina: str = None, + optional_metadata: dict[str, str] | None = None +) -> sai_model_spec.ModelSpecMetadata: + """ + Get ModelSpec metadata as a dataclass - preferred for 
new code. + Automatically extracts metadata_* fields from args. + """ + timestamp = time.time() + + v2 = args.v2 + v_parameterization = args.v_parameterization + reso = args.resolution + + title = args.metadata_title if args.metadata_title is not None else args.output_name + + if args.min_timestep is not None or args.max_timestep is not None: + min_time_step = args.min_timestep if args.min_timestep is not None else 0 + max_time_step = args.max_timestep if args.max_timestep is not None else 1000 + timesteps = (min_time_step, max_time_step) + else: + timesteps = None + + # Convert individual model parameters to model_config dict + model_config = {} + if sd3 is not None: + model_config["sd3"] = sd3 + if flux is not None: + model_config["flux"] = flux + if lumina is not None: + model_config["lumina"] = lumina + + # Use the dataclass function directly + return sai_model_spec.build_metadata_dataclass( + state_dict, + v2, + v_parameterization, + sdxl, + lora, + textual_inversion, + timestamp, + title=title, + reso=reso, + is_stable_diffusion_ckpt=is_stable_diffusion_ckpt, + author=args.metadata_author, + description=args.metadata_description, + license=args.metadata_license, + tags=args.metadata_tags, + timesteps=timesteps, + clip_skip=args.clip_skip, + model_config=model_config, + optional_metadata=optional_metadata, + ) + + def add_sd_models_arguments(parser: argparse.ArgumentParser): # for pretrained models parser.add_argument( @@ -4103,39 +4194,6 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: parser.add_argument( "--output_config", action="store_true", help="output command line args to given .toml file / 引数を.tomlファイルに出力する" ) - - # SAI Model spec - parser.add_argument( - "--metadata_title", - type=str, - default=None, - help="title for model metadata (default is output_name) / メタデータに書き込まれるモデルタイトル、省略時はoutput_name", - ) - parser.add_argument( - "--metadata_author", - type=str, - default=None, - help="author name for model metadata / メタデータに書き込まれるモデル作者名", - ) - parser.add_argument( - "--metadata_description", - type=str, - default=None, - help="description for model metadata / メタデータに書き込まれるモデル説明", - ) - parser.add_argument( - "--metadata_license", - type=str, - default=None, - help="license for model metadata / メタデータに書き込まれるモデルライセンス", - ) - parser.add_argument( - "--metadata_tags", - type=str, - default=None, - help="tags for model metadata, separated by comma / メタデータに書き込まれるモデルタグ、カンマ区切り", - ) - if support_dreambooth: # DreamBooth training parser.add_argument( From 056472c2fcb0b46f35459caaa9f2a4ed3b234499 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sat, 2 Aug 2025 21:16:56 -0400 Subject: [PATCH 516/582] Add tests --- tests/library/test_sai_model_spec.py | 349 +++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 tests/library/test_sai_model_spec.py diff --git a/tests/library/test_sai_model_spec.py b/tests/library/test_sai_model_spec.py new file mode 100644 index 000000000..92dcf4c64 --- /dev/null +++ b/tests/library/test_sai_model_spec.py @@ -0,0 +1,349 @@ +"""Tests for sai_model_spec module.""" +import pytest +import time + +from library import sai_model_spec + + +class MockArgs: + """Mock argparse.Namespace for testing.""" + + def __init__(self, **kwargs): + # Default values + self.v2 = False + self.v_parameterization = False + self.resolution = 512 + self.metadata_title = None + self.metadata_author = None + self.metadata_description = None + self.metadata_license = None + self.metadata_tags = None + self.min_timestep = None + 
self.max_timestep = None + self.clip_skip = None + self.output_name = "test_output" + + # Override with provided values + for key, value in kwargs.items(): + setattr(self, key, value) + + +class TestModelSpecMetadata: + """Test the ModelSpecMetadata dataclass.""" + + def test_creation_and_conversion(self): + """Test creating dataclass and converting to metadata dict.""" + metadata = sai_model_spec.ModelSpecMetadata( + architecture="stable-diffusion-v1", + implementation="diffusers", + title="Test Model", + author="Test Author", + description=None # Test None exclusion + ) + + assert metadata.architecture == "stable-diffusion-v1" + assert metadata.sai_model_spec == "1.0.1" + + metadata_dict = metadata.to_metadata_dict() + assert "modelspec.architecture" in metadata_dict + assert "modelspec.author" in metadata_dict + assert "modelspec.description" not in metadata_dict # None values excluded + assert metadata_dict["modelspec.sai_model_spec"] == "1.0.1" + + def test_additional_fields_handling(self): + """Test handling of additional metadata fields.""" + additional = {"custom_field": "custom_value", "modelspec.prefixed": "prefixed_value"} + + metadata = sai_model_spec.ModelSpecMetadata( + architecture="stable-diffusion-v1", + implementation="diffusers", + title="Test Model", + additional_fields=additional + ) + + metadata_dict = metadata.to_metadata_dict() + assert "modelspec.custom_field" in metadata_dict + assert "modelspec.prefixed" in metadata_dict + assert metadata_dict["modelspec.custom_field"] == "custom_value" + + def test_from_args_extraction(self): + """Test creating ModelSpecMetadata from args with metadata_* fields.""" + args = MockArgs( + metadata_author="Test Author", + metadata_trigger_phrase="anime style", + metadata_usage_hint="Use CFG 7.5" + ) + + metadata = sai_model_spec.ModelSpecMetadata.from_args( + args, + architecture="stable-diffusion-v1", + implementation="diffusers", + title="Test Model" + ) + + assert metadata.author == "Test Author" + assert metadata.additional_fields["trigger_phrase"] == "anime style" + assert metadata.additional_fields["usage_hint"] == "Use CFG 7.5" + + +class TestArchitectureDetection: + """Test architecture detection for different model types.""" + + @pytest.mark.parametrize("config,expected", [ + ({"v2": False, "v_parameterization": False, "sdxl": True}, "stable-diffusion-xl-v1-base"), + ({"v2": False, "v_parameterization": False, "sdxl": False, "model_config": {"flux": "dev"}}, "flux-1-dev"), + ({"v2": False, "v_parameterization": False, "sdxl": False, "model_config": {"flux": "chroma"}}, "chroma"), + ({"v2": False, "v_parameterization": False, "sdxl": False, "model_config": {"sd3": "large"}}, "stable-diffusion-3-large"), + ({"v2": True, "v_parameterization": True, "sdxl": False}, "stable-diffusion-v2-768-v"), + ({"v2": False, "v_parameterization": False, "sdxl": False}, "stable-diffusion-v1"), + ]) + def test_architecture_detection(self, config, expected): + """Test architecture detection for various model configurations.""" + model_config = config.pop("model_config", None) + arch = sai_model_spec.determine_architecture( + lora=False, textual_inversion=False, model_config=model_config, **config + ) + assert arch == expected + + def test_adapter_suffixes(self): + """Test LoRA and textual inversion suffixes.""" + lora_arch = sai_model_spec.determine_architecture( + v2=False, v_parameterization=False, sdxl=True, + lora=True, textual_inversion=False + ) + assert lora_arch == "stable-diffusion-xl-v1-base/lora" + + ti_arch = 
sai_model_spec.determine_architecture( + v2=False, v_parameterization=False, sdxl=False, + lora=False, textual_inversion=True + ) + assert ti_arch == "stable-diffusion-v1/textual-inversion" + + +class TestImplementationDetection: + """Test implementation detection for different model types.""" + + @pytest.mark.parametrize("config,expected", [ + ({"model_config": {"flux": "dev"}}, "https://github.com/black-forest-labs/flux"), + ({"model_config": {"flux": "chroma"}}, "https://huggingface.co/lodestones/Chroma"), + ({"model_config": {"lumina": "lumina2"}}, "https://github.com/Alpha-VLLM/Lumina-Image-2.0"), + ({"lora": True, "sdxl": True}, "https://github.com/Stability-AI/generative-models"), + ({"lora": True, "sdxl": False}, "diffusers"), + ]) + def test_implementation_detection(self, config, expected): + """Test implementation detection for various configurations.""" + model_config = config.pop("model_config", None) + impl = sai_model_spec.determine_implementation( + lora=config.get("lora", False), + textual_inversion=False, + sdxl=config.get("sdxl", False), + model_config=model_config + ) + assert impl == expected + + +class TestResolutionHandling: + """Test resolution parsing and defaults.""" + + @pytest.mark.parametrize("input_reso,expected", [ + ((768, 1024), "768x1024"), + (768, "768x768"), + ("768,1024", "768x1024"), + ]) + def test_explicit_resolution_formats(self, input_reso, expected): + """Test different resolution input formats.""" + res = sai_model_spec.determine_resolution(reso=input_reso) + assert res == expected + + @pytest.mark.parametrize("config,expected", [ + ({"sdxl": True}, "1024x1024"), + ({"model_config": {"flux": "dev"}}, "1024x1024"), + ({"v2": True, "v_parameterization": True}, "768x768"), + ({}, "512x512"), # Default SD v1 + ]) + def test_default_resolutions(self, config, expected): + """Test default resolution detection.""" + model_config = config.pop("model_config", None) + res = sai_model_spec.determine_resolution(model_config=model_config, **config) + assert res == expected + + +class TestThumbnailProcessing: + """Test thumbnail data URL processing.""" + + def test_file_to_data_url(self): + """Test converting file to data URL.""" + import tempfile + import os + + # Create a tiny test PNG (1x1 pixel) + test_png_data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xff\xff\xff\x00\x00\x00\x04\x00\x01\x9d\xb3\xa7c\x00\x00\x00\x00IEND\xaeB`\x82' + + with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f: + f.write(test_png_data) + temp_path = f.name + + try: + data_url = sai_model_spec.file_to_data_url(temp_path) + + # Check format + assert data_url.startswith("data:image/png;base64,") + + # Check it's a reasonable length (base64 encoded) + assert len(data_url) > 50 + + # Verify we can decode it back + import base64 + encoded_part = data_url.split(",", 1)[1] + decoded_data = base64.b64decode(encoded_part) + assert decoded_data == test_png_data + + finally: + os.unlink(temp_path) + + def test_file_to_data_url_nonexistent_file(self): + """Test error handling for nonexistent files.""" + import pytest + + with pytest.raises(FileNotFoundError): + sai_model_spec.file_to_data_url("/nonexistent/file.png") + + def test_thumbnail_processing_in_metadata(self): + """Test thumbnail processing in build_metadata_dataclass.""" + import tempfile + import os + + # Create a test image file + test_png_data = 
b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xff\xff\xff\x00\x00\x00\x04\x00\x01\x9d\xb3\xa7c\x00\x00\x00\x00IEND\xaeB`\x82' + + with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f: + f.write(test_png_data) + temp_path = f.name + + try: + timestamp = time.time() + + # Test with file path - should be converted to data URL + metadata = sai_model_spec.build_metadata_dataclass( + state_dict=None, + v2=False, + v_parameterization=False, + sdxl=False, + lora=False, + textual_inversion=False, + timestamp=timestamp, + title="Test Model", + optional_metadata={"thumbnail": temp_path} + ) + + # Should be converted to data URL + assert "thumbnail" in metadata.additional_fields + assert metadata.additional_fields["thumbnail"].startswith("data:image/png;base64,") + + finally: + os.unlink(temp_path) + + def test_thumbnail_data_url_passthrough(self): + """Test that existing data URLs are passed through unchanged.""" + timestamp = time.time() + + existing_data_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==" + + metadata = sai_model_spec.build_metadata_dataclass( + state_dict=None, + v2=False, + v_parameterization=False, + sdxl=False, + lora=False, + textual_inversion=False, + timestamp=timestamp, + title="Test Model", + optional_metadata={"thumbnail": existing_data_url} + ) + + # Should be unchanged + assert metadata.additional_fields["thumbnail"] == existing_data_url + + def test_thumbnail_invalid_file_handling(self): + """Test graceful handling of invalid thumbnail files.""" + timestamp = time.time() + + metadata = sai_model_spec.build_metadata_dataclass( + state_dict=None, + v2=False, + v_parameterization=False, + sdxl=False, + lora=False, + textual_inversion=False, + timestamp=timestamp, + title="Test Model", + optional_metadata={"thumbnail": "/nonexistent/file.png"} + ) + + # Should be removed from additional_fields due to error + assert "thumbnail" not in metadata.additional_fields + + +class TestBuildMetadataIntegration: + """Test the complete metadata building workflow.""" + + def test_sdxl_model_workflow(self): + """Test complete workflow for SDXL model.""" + timestamp = time.time() + + metadata = sai_model_spec.build_metadata_dataclass( + state_dict=None, + v2=False, + v_parameterization=False, + sdxl=True, + lora=False, + textual_inversion=False, + timestamp=timestamp, + title="Test SDXL Model" + ) + + assert metadata.architecture == "stable-diffusion-xl-v1-base" + assert metadata.implementation == "https://github.com/Stability-AI/generative-models" + assert metadata.resolution == "1024x1024" + assert metadata.prediction_type == "epsilon" + + def test_flux_model_workflow(self): + """Test complete workflow for Flux model.""" + timestamp = time.time() + + metadata = sai_model_spec.build_metadata_dataclass( + state_dict=None, + v2=False, + v_parameterization=False, + sdxl=False, + lora=False, + textual_inversion=False, + timestamp=timestamp, + title="Test Flux Model", + model_config={"flux": "dev"}, + optional_metadata={"trigger_phrase": "anime style"} + ) + + assert metadata.architecture == "flux-1-dev" + assert metadata.implementation == "https://github.com/black-forest-labs/flux" + assert metadata.prediction_type is None # Flux doesn't use prediction_type + assert metadata.additional_fields["trigger_phrase"] == "anime style" + + def test_legacy_function_compatibility(self): + """Test that legacy build_metadata function 
works correctly.""" + timestamp = time.time() + + metadata_dict = sai_model_spec.build_metadata( + state_dict=None, + v2=False, + v_parameterization=False, + sdxl=True, + lora=False, + textual_inversion=False, + timestamp=timestamp, + title="Test Model" + ) + + assert isinstance(metadata_dict, dict) + assert metadata_dict["modelspec.sai_model_spec"] == "1.0.1" + assert metadata_dict["modelspec.architecture"] == "stable-diffusion-xl-v1-base" \ No newline at end of file From bf0f86e79726e7283359a15f7a03793595300102 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sat, 2 Aug 2025 21:35:45 -0400 Subject: [PATCH 517/582] Add sai_model_spec to train_network.py --- train_network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 7861e7404..aa42a3bf1 100644 --- a/train_network.py +++ b/train_network.py @@ -24,7 +24,7 @@ from accelerate import Accelerator from diffusers import DDPMScheduler from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL -from library import deepspeed_utils, model_util, strategy_base, strategy_sd +from library import deepspeed_utils, model_util, sai_model_spec, strategy_base, strategy_sd import library.train_util as train_util from library.train_util import DreamBoothDataset @@ -1718,6 +1718,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) parser.add_argument( "--cpu_offload_checkpointing", From 10bfcb9ac5b3467abde3a0aa5972478d1a0a6595 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 3 Aug 2025 00:40:10 -0400 Subject: [PATCH 518/582] Remove text model spec --- library/sai_model_spec.py | 192 +++++++++++++------------------------- 1 file changed, 64 insertions(+), 128 deletions(-) diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index 8b1224842..2ee3ff224 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -9,7 +9,7 @@ from dataclasses import dataclass, field from io import BytesIO import os -from typing import Dict, List, Optional, Tuple, Union +from typing import Union import safetensors from library.utils import setup_logging @@ -36,30 +36,26 @@ """ BASE_METADATA = { - # === Universal MUST fields === - "modelspec.sai_model_spec": "1.0.1", # Updated to latest spec version + # === MUST === + "modelspec.sai_model_spec": "1.0.1", "modelspec.architecture": None, "modelspec.implementation": None, "modelspec.title": None, + "modelspec.resolution": None, - # === Universal SHOULD fields === + # === SHOULD === "modelspec.description": None, "modelspec.author": None, "modelspec.date": None, "modelspec.hash_sha256": None, - # === Universal CAN fields === + # === CAN=== "modelspec.implementation_version": None, "modelspec.license": None, "modelspec.usage_hint": None, "modelspec.thumbnail": None, "modelspec.tags": None, "modelspec.merged_from": None, - - # === Image generation MUST fields === - "modelspec.resolution": None, - - # === Image generation CAN fields === "modelspec.trigger_phrase": None, "modelspec.prediction_type": None, "modelspec.timestep_range": None, @@ -68,12 +64,6 @@ "modelspec.is_negative_embedding": None, "modelspec.unet_dtype": None, "modelspec.vae_dtype": None, - - # === Text prediction fields === - "modelspec.data_format": None, - "modelspec.format_type": None, - "modelspec.language": None, - "modelspec.format_template": None, } # 別に使うやつだけ定義 @@ -113,49 
+103,39 @@ class ModelSpecMetadata: All fields correspond to modelspec.* keys in the final metadata. """ - # === Universal MUST fields === + # === MUST === architecture: str implementation: str title: str + resolution: str | None = None - # === Universal SHOULD fields === - description: Optional[str] = None - author: Optional[str] = None - date: Optional[str] = None - hash_sha256: Optional[str] = None + # === SHOULD === + description: str | None = None + author: str | None = None + date: str | None = None + hash_sha256: str | None = None - # === Universal CAN fields === + # === CAN === sai_model_spec: str = "1.0.1" - implementation_version: Optional[str] = None - license: Optional[str] = None - usage_hint: Optional[str] = None - thumbnail: Optional[str] = None - tags: Optional[str] = None - merged_from: Optional[str] = None - - # === Image generation MUST fields === - resolution: Optional[str] = None - - # === Image generation CAN fields === - trigger_phrase: Optional[str] = None - prediction_type: Optional[str] = None - timestep_range: Optional[str] = None - encoder_layer: Optional[str] = None - preprocessor: Optional[str] = None - is_negative_embedding: Optional[str] = None - unet_dtype: Optional[str] = None - vae_dtype: Optional[str] = None - - # === Text prediction fields === - data_format: Optional[str] = None - format_type: Optional[str] = None - language: Optional[str] = None - format_template: Optional[str] = None + implementation_version: str | None = None + license: str | None = None + usage_hint: str | None = None + thumbnail: str | None = None + tags: str | None = None + merged_from: str | None = None + trigger_phrase: str | None = None + prediction_type: str | None = None + timestep_range: str | None = None + encoder_layer: str | None = None + preprocessor: str | None = None + is_negative_embedding: str | None = None + unet_dtype: str | None = None + vae_dtype: str | None = None # === Additional metadata === - additional_fields: Dict[str, str] = field(default_factory=dict) + additional_fields: dict[str, str] = field(default_factory=dict) - def to_metadata_dict(self) -> Dict[str, str]: + def to_metadata_dict(self) -> dict[str, str]: """Convert dataclass to metadata dictionary with modelspec. 
prefixes.""" metadata = {} @@ -212,7 +192,7 @@ def determine_architecture( sdxl: bool, lora: bool, textual_inversion: bool, - model_config: Optional[dict] = None + model_config: dict[str, str] | None = None ) -> str: """Determine model architecture string from parameters.""" @@ -256,8 +236,8 @@ def determine_implementation( lora: bool, textual_inversion: bool, sdxl: bool, - model_config: Optional[dict] = None, - is_stable_diffusion_ckpt: Optional[bool] = None + model_config: dict[str, str] | None = None, + is_stable_diffusion_ckpt: bool | None = None ) -> str: """Determine implementation string from parameters.""" @@ -321,9 +301,9 @@ def file_to_data_url(file_path: str) -> str: def determine_resolution( - reso: Optional[Union[int, Tuple[int, int]]] = None, + reso: Union[int, tuple[int, int]] | None = None, sdxl: bool = False, - model_config: Optional[dict] = None, + model_config: dict[str, str] | None = None, v2: bool = False, v_parameterization: bool = False ) -> str: @@ -386,25 +366,25 @@ def update_hash_sha256(metadata: dict, state_dict: dict): def build_metadata_dataclass( - state_dict: Optional[dict], + state_dict: dict | None, v2: bool, v_parameterization: bool, sdxl: bool, lora: bool, textual_inversion: bool, timestamp: float, - title: Optional[str] = None, - reso: Optional[Union[int, Tuple[int, int]]] = None, - is_stable_diffusion_ckpt: Optional[bool] = None, - author: Optional[str] = None, - description: Optional[str] = None, - license: Optional[str] = None, - tags: Optional[str] = None, - merged_from: Optional[str] = None, - timesteps: Optional[Tuple[int, int]] = None, - clip_skip: Optional[int] = None, - model_config: Optional[dict] = None, - optional_metadata: Optional[dict] = None, + title: str | None = None, + reso: int | tuple[int, int] | None = None, + is_stable_diffusion_ckpt: bool | None = None, + author: str | None = None, + description: str | None = None, + license: str | None = None, + tags: str | None = None, + merged_from: str | None = None, + timesteps: tuple[int, int] | None = None, + clip_skip: int | None = None, + model_config: dict | None = None, + optional_metadata: dict | None = None, ) -> ModelSpecMetadata: """ Build ModelSpec 1.0.1 compliant metadata dataclass. @@ -515,26 +495,26 @@ def build_metadata_dataclass( def build_metadata( - state_dict: Optional[dict], + state_dict: dict | None, v2: bool, v_parameterization: bool, sdxl: bool, lora: bool, textual_inversion: bool, timestamp: float, - title: Optional[str] = None, - reso: Optional[Union[int, Tuple[int, int]]] = None, - is_stable_diffusion_ckpt: Optional[bool] = None, - author: Optional[str] = None, - description: Optional[str] = None, - license: Optional[str] = None, - tags: Optional[str] = None, - merged_from: Optional[str] = None, - timesteps: Optional[Tuple[int, int]] = None, - clip_skip: Optional[int] = None, - model_config: Optional[dict] = None, - optional_metadata: Optional[dict] = None, -) -> Dict[str, str]: + title: str | None = None, + reso: int | tuple[int, int] | None = None, + is_stable_diffusion_ckpt: bool | None = None, + author: str | None = None, + description: str | None = None, + license: str | None = None, + tags: str | None = None, + merged_from: str | None = None, + timesteps: tuple[int, int] | None = None, + clip_skip: int | None = None, + model_config: dict | None = None, + optional_metadata: dict | None = None, +) -> dict[str, str]: """ Build ModelSpec 1.0.1 compliant metadata for safetensors models. 
Legacy function that returns dict - prefer build_metadata_dataclass for new code. @@ -572,7 +552,7 @@ def build_metadata( # region utils -def get_title(metadata: dict) -> Optional[str]: +def get_title(metadata: dict) -> str | None: return metadata.get(MODELSPEC_TITLE, None) @@ -587,7 +567,7 @@ def load_metadata_from_safetensors(model: str) -> dict: return metadata -def build_merged_from(models: List[str]) -> str: +def build_merged_from(models: list[str]) -> str: def get_title(model: str): metadata = load_metadata_from_safetensors(model) title = metadata.get(MODELSPEC_TITLE, None) @@ -602,7 +582,6 @@ def get_title(model: str): def add_model_spec_arguments(parser: argparse.ArgumentParser): """Add all ModelSpec metadata arguments to the parser.""" - # === Existing standard metadata fields === parser.add_argument( "--metadata_title", type=str, @@ -633,9 +612,6 @@ def add_model_spec_arguments(parser: argparse.ArgumentParser): default=None, help="tags for model metadata, separated by comma / メタデータに書き込まれるモデルタグ、カンマ区切り", ) - - # === Universal CAN fields === - # Note: implementation_version is automatically set to sd-scripts/{commit_hash} parser.add_argument( "--metadata_usage_hint", type=str, @@ -654,8 +630,6 @@ def add_model_spec_arguments(parser: argparse.ArgumentParser): default=None, help="source models for merged model metadata / メタデータに書き込まれるマージ元モデル名", ) - - # === Image generation CAN fields === parser.add_argument( "--metadata_trigger_phrase", type=str, @@ -674,44 +648,6 @@ def add_model_spec_arguments(parser: argparse.ArgumentParser): default=None, help="whether this is a negative embedding for model metadata / メタデータに書き込まれるネガティブ埋め込みかどうか", ) - parser.add_argument( - "--metadata_unet_dtype", - type=str, - default=None, - help="UNet data type for model metadata / メタデータに書き込まれるUNetのデータ型", - ) - parser.add_argument( - "--metadata_vae_dtype", - type=str, - default=None, - help="VAE data type for model metadata / メタデータに書き込まれるVAEのデータ型", - ) - - # === Text prediction fields === - parser.add_argument( - "--metadata_data_format", - type=str, - default=None, - help="data format for text prediction model metadata / メタデータに書き込まれるテキスト予測モデルのデータ形式", - ) - parser.add_argument( - "--metadata_format_type", - type=str, - default=None, - help="format type for text prediction model metadata / メタデータに書き込まれるテキスト予測モデルの形式タイプ", - ) - parser.add_argument( - "--metadata_language", - type=str, - default=None, - help="language for text prediction model metadata / メタデータに書き込まれるテキスト予測モデルの言語", - ) - parser.add_argument( - "--metadata_format_template", - type=str, - default=None, - help="format template for text prediction model metadata / メタデータに書き込まれるテキスト予測モデルの形式テンプレート", - ) # endregion From 9bb50c26c4e2ba1f4bdaa4ff3ed8b77aa19905d7 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 3 Aug 2025 00:43:09 -0400 Subject: [PATCH 519/582] Set sai_model_spec to must --- library/sai_model_spec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index 2ee3ff224..24b958dd0 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -107,7 +107,8 @@ class ModelSpecMetadata: architecture: str implementation: str title: str - resolution: str | None = None + resolution: str + sai_model_spec: str = "1.0.1" # === SHOULD === description: str | None = None @@ -116,7 +117,6 @@ class ModelSpecMetadata: hash_sha256: str | None = None # === CAN === - sai_model_spec: str = "1.0.1" implementation_version: str | None = None license: str | None = None usage_hint: str 
| None = None From c149cf283ba8ba45e006947a4474b93e420ade9d Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Sun, 3 Aug 2025 00:58:25 -0400 Subject: [PATCH 520/582] Add parser args for other trainers. --- fine_tune.py | 2 + flux_train.py | 3 +- flux_train_control_net.py | 2 + lumina_train.py | 2 + sd3_train.py | 3 + sdxl_train.py | 3 +- sdxl_train_control_net.py | 2 + sdxl_train_control_net_lllite.py | 2 + sdxl_train_control_net_lllite_old.py | 2 + tests/library/test_sai_model_spec.py | 225 ++++++++++++++------------- tools/cache_latents.py | 2 + tools/cache_text_encoder_outputs.py | 2 + train_control_net.py | 1 + train_db.py | 2 + train_network.py | 4 +- train_textual_inversion.py | 3 +- train_textual_inversion_XTI.py | 2 + 17 files changed, 150 insertions(+), 112 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index e1ed47496..ffbbbb09f 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -27,6 +27,7 @@ import library.train_util as train_util import library.config_util as config_util +import library.sai_model_spec as sai_model_spec from library.config_util import ( ConfigSanitizer, BlueprintGenerator, @@ -519,6 +520,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) deepspeed_utils.add_deepspeed_arguments(parser) diff --git a/flux_train.py b/flux_train.py index 84db34cfd..4aa67220f 100644 --- a/flux_train.py +++ b/flux_train.py @@ -30,7 +30,7 @@ init_ipex() from accelerate.utils import set_seed -from library import deepspeed_utils, flux_train_utils, flux_utils, strategy_base, strategy_flux +from library import deepspeed_utils, flux_train_utils, flux_utils, strategy_base, strategy_flux, sai_model_spec from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler import library.train_util as train_util @@ -787,6 +787,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) # TODO split this + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, False) train_util.add_masked_loss_arguments(parser) diff --git a/flux_train_control_net.py b/flux_train_control_net.py index 93c20dabd..019914058 100644 --- a/flux_train_control_net.py +++ b/flux_train_control_net.py @@ -32,6 +32,7 @@ from accelerate.utils import set_seed import library.train_util as train_util +import library.sai_model_spec as sai_model_spec from library import ( deepspeed_utils, flux_train_utils, @@ -820,6 +821,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) # TODO split this + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) train_util.add_masked_loss_arguments(parser) diff --git a/lumina_train.py b/lumina_train.py index a333427db..ca60c6582 100644 --- a/lumina_train.py +++ b/lumina_train.py @@ -31,6 +31,7 @@ lumina_util, strategy_base, strategy_lumina, + sai_model_spec ) from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler @@ -904,6 +905,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) # TODO split this + sai_model_spec.add_model_spec_arguments(parser) 
train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, False) train_util.add_masked_loss_arguments(parser) diff --git a/sd3_train.py b/sd3_train.py index 3bff6a50f..355e13dd2 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -20,6 +20,8 @@ from accelerate.utils import set_seed from diffusers import DDPMScheduler from library import deepspeed_utils, sd3_models, sd3_train_utils, sd3_utils, strategy_base, strategy_sd3 + +import library.sai_model_spec as sai_model_spec from library.sdxl_train_util import match_mixed_precision # , sdxl_model_util @@ -986,6 +988,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, False) train_util.add_masked_loss_arguments(parser) diff --git a/sdxl_train.py b/sdxl_train.py index a60f6df63..f454263a4 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -17,7 +17,7 @@ from accelerate.utils import set_seed from diffusers import DDPMScheduler -from library import deepspeed_utils, sdxl_model_util, strategy_base, strategy_sd, strategy_sdxl +from library import deepspeed_utils, sdxl_model_util, strategy_base, strategy_sd, strategy_sdxl, sai_model_spec import library.train_util as train_util @@ -893,6 +893,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, False) train_util.add_masked_loss_arguments(parser) diff --git a/sdxl_train_control_net.py b/sdxl_train_control_net.py index c6e8136f7..3d107e57c 100644 --- a/sdxl_train_control_net.py +++ b/sdxl_train_control_net.py @@ -25,6 +25,7 @@ strategy_base, strategy_sd, strategy_sdxl, + sai_model_spec ) import library.train_util as train_util @@ -664,6 +665,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) # train_util.add_masked_loss_arguments(parser) diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 00e51a673..4dd4b8d94 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -32,6 +32,7 @@ strategy_base, strategy_sd, strategy_sdxl, + sai_model_spec, ) import library.model_util as model_util @@ -589,6 +590,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) deepspeed_utils.add_deepspeed_arguments(parser) diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py index 63457cc61..0a9f4a92f 100644 --- a/sdxl_train_control_net_lllite_old.py +++ b/sdxl_train_control_net_lllite_old.py @@ -24,6 +24,7 @@ import library.model_util as model_util import library.train_util as train_util import library.config_util as config_util +import library.sai_model_spec as sai_model_spec from library.config_util import ( ConfigSanitizer, BlueprintGenerator, @@ -536,6 +537,7 @@ def setup_parser() -> 
argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) deepspeed_utils.add_deepspeed_arguments(parser) diff --git a/tests/library/test_sai_model_spec.py b/tests/library/test_sai_model_spec.py index 92dcf4c64..0bbfa1167 100644 --- a/tests/library/test_sai_model_spec.py +++ b/tests/library/test_sai_model_spec.py @@ -1,4 +1,5 @@ """Tests for sai_model_spec module.""" + import pytest import time @@ -7,7 +8,7 @@ class MockArgs: """Mock argparse.Namespace for testing.""" - + def __init__(self, **kwargs): # Default values self.v2 = False @@ -22,7 +23,7 @@ def __init__(self, **kwargs): self.max_timestep = None self.clip_skip = None self.output_name = "test_output" - + # Override with provided values for key, value in kwargs.items(): setattr(self, key, value) @@ -30,57 +31,56 @@ def __init__(self, **kwargs): class TestModelSpecMetadata: """Test the ModelSpecMetadata dataclass.""" - + def test_creation_and_conversion(self): """Test creating dataclass and converting to metadata dict.""" metadata = sai_model_spec.ModelSpecMetadata( architecture="stable-diffusion-v1", implementation="diffusers", title="Test Model", + resolution="512x512", author="Test Author", - description=None # Test None exclusion + description=None, # Test None exclusion ) - + assert metadata.architecture == "stable-diffusion-v1" assert metadata.sai_model_spec == "1.0.1" - + metadata_dict = metadata.to_metadata_dict() assert "modelspec.architecture" in metadata_dict assert "modelspec.author" in metadata_dict assert "modelspec.description" not in metadata_dict # None values excluded assert metadata_dict["modelspec.sai_model_spec"] == "1.0.1" - + def test_additional_fields_handling(self): """Test handling of additional metadata fields.""" additional = {"custom_field": "custom_value", "modelspec.prefixed": "prefixed_value"} - + metadata = sai_model_spec.ModelSpecMetadata( architecture="stable-diffusion-v1", implementation="diffusers", title="Test Model", - additional_fields=additional + resolution="512x512", + additional_fields=additional, ) - + metadata_dict = metadata.to_metadata_dict() assert "modelspec.custom_field" in metadata_dict assert "modelspec.prefixed" in metadata_dict assert metadata_dict["modelspec.custom_field"] == "custom_value" - + def test_from_args_extraction(self): """Test creating ModelSpecMetadata from args with metadata_* fields.""" - args = MockArgs( - metadata_author="Test Author", - metadata_trigger_phrase="anime style", - metadata_usage_hint="Use CFG 7.5" - ) - + args = MockArgs(metadata_author="Test Author", metadata_trigger_phrase="anime style", metadata_usage_hint="Use CFG 7.5") + metadata = sai_model_spec.ModelSpecMetadata.from_args( args, architecture="stable-diffusion-v1", implementation="diffusers", - title="Test Model" + title="Test Model", + resolution="512x512", ) - + assert metadata.author == "Test Author" assert metadata.additional_fields["trigger_phrase"] == "anime style" assert metadata.additional_fields["usage_hint"] == "Use CFG 7.5" @@ -88,79 +88,87 @@ def test_from_args_extraction(self): class TestArchitectureDetection: """Test architecture detection for different model types.""" - - @pytest.mark.parametrize("config,expected", [ - ({"v2": False, "v_parameterization": False, "sdxl": True}, "stable-diffusion-xl-v1-base"), - ({"v2": False, "v_parameterization": False, "sdxl": False, 
"model_config": {"flux": "dev"}}, "flux-1-dev"), - ({"v2": False, "v_parameterization": False, "sdxl": False, "model_config": {"flux": "chroma"}}, "chroma"), - ({"v2": False, "v_parameterization": False, "sdxl": False, "model_config": {"sd3": "large"}}, "stable-diffusion-3-large"), - ({"v2": True, "v_parameterization": True, "sdxl": False}, "stable-diffusion-v2-768-v"), - ({"v2": False, "v_parameterization": False, "sdxl": False}, "stable-diffusion-v1"), - ]) + + @pytest.mark.parametrize( + "config,expected", + [ + ({"v2": False, "v_parameterization": False, "sdxl": True}, "stable-diffusion-xl-v1-base"), + ({"v2": False, "v_parameterization": False, "sdxl": False, "model_config": {"flux": "dev"}}, "flux-1-dev"), + ({"v2": False, "v_parameterization": False, "sdxl": False, "model_config": {"flux": "chroma"}}, "chroma"), + ( + {"v2": False, "v_parameterization": False, "sdxl": False, "model_config": {"sd3": "large"}}, + "stable-diffusion-3-large", + ), + ({"v2": True, "v_parameterization": True, "sdxl": False}, "stable-diffusion-v2-768-v"), + ({"v2": False, "v_parameterization": False, "sdxl": False}, "stable-diffusion-v1"), + ], + ) def test_architecture_detection(self, config, expected): """Test architecture detection for various model configurations.""" model_config = config.pop("model_config", None) - arch = sai_model_spec.determine_architecture( - lora=False, textual_inversion=False, model_config=model_config, **config - ) + arch = sai_model_spec.determine_architecture(lora=False, textual_inversion=False, model_config=model_config, **config) assert arch == expected - + def test_adapter_suffixes(self): """Test LoRA and textual inversion suffixes.""" lora_arch = sai_model_spec.determine_architecture( - v2=False, v_parameterization=False, sdxl=True, - lora=True, textual_inversion=False + v2=False, v_parameterization=False, sdxl=True, lora=True, textual_inversion=False ) assert lora_arch == "stable-diffusion-xl-v1-base/lora" - + ti_arch = sai_model_spec.determine_architecture( - v2=False, v_parameterization=False, sdxl=False, - lora=False, textual_inversion=True + v2=False, v_parameterization=False, sdxl=False, lora=False, textual_inversion=True ) assert ti_arch == "stable-diffusion-v1/textual-inversion" class TestImplementationDetection: """Test implementation detection for different model types.""" - - @pytest.mark.parametrize("config,expected", [ - ({"model_config": {"flux": "dev"}}, "https://github.com/black-forest-labs/flux"), - ({"model_config": {"flux": "chroma"}}, "https://huggingface.co/lodestones/Chroma"), - ({"model_config": {"lumina": "lumina2"}}, "https://github.com/Alpha-VLLM/Lumina-Image-2.0"), - ({"lora": True, "sdxl": True}, "https://github.com/Stability-AI/generative-models"), - ({"lora": True, "sdxl": False}, "diffusers"), - ]) + + @pytest.mark.parametrize( + "config,expected", + [ + ({"model_config": {"flux": "dev"}}, "https://github.com/black-forest-labs/flux"), + ({"model_config": {"flux": "chroma"}}, "https://huggingface.co/lodestones/Chroma"), + ({"model_config": {"lumina": "lumina2"}}, "https://github.com/Alpha-VLLM/Lumina-Image-2.0"), + ({"lora": True, "sdxl": True}, "https://github.com/Stability-AI/generative-models"), + ({"lora": True, "sdxl": False}, "diffusers"), + ], + ) def test_implementation_detection(self, config, expected): """Test implementation detection for various configurations.""" model_config = config.pop("model_config", None) impl = sai_model_spec.determine_implementation( - lora=config.get("lora", False), - textual_inversion=False, - 
sdxl=config.get("sdxl", False), - model_config=model_config + lora=config.get("lora", False), textual_inversion=False, sdxl=config.get("sdxl", False), model_config=model_config ) assert impl == expected class TestResolutionHandling: """Test resolution parsing and defaults.""" - - @pytest.mark.parametrize("input_reso,expected", [ - ((768, 1024), "768x1024"), - (768, "768x768"), - ("768,1024", "768x1024"), - ]) + + @pytest.mark.parametrize( + "input_reso,expected", + [ + ((768, 1024), "768x1024"), + (768, "768x768"), + ("768,1024", "768x1024"), + ], + ) def test_explicit_resolution_formats(self, input_reso, expected): """Test different resolution input formats.""" res = sai_model_spec.determine_resolution(reso=input_reso) assert res == expected - - @pytest.mark.parametrize("config,expected", [ - ({"sdxl": True}, "1024x1024"), - ({"model_config": {"flux": "dev"}}, "1024x1024"), - ({"v2": True, "v_parameterization": True}, "768x768"), - ({}, "512x512"), # Default SD v1 - ]) + + @pytest.mark.parametrize( + "config,expected", + [ + ({"sdxl": True}, "1024x1024"), + ({"model_config": {"flux": "dev"}}, "1024x1024"), + ({"v2": True, "v_parameterization": True}, "768x768"), + ({}, "512x512"), # Default SD v1 + ], + ) def test_default_resolutions(self, config, expected): """Test default resolution detection.""" model_config = config.pop("model_config", None) @@ -170,59 +178,60 @@ def test_default_resolutions(self, config, expected): class TestThumbnailProcessing: """Test thumbnail data URL processing.""" - + def test_file_to_data_url(self): """Test converting file to data URL.""" import tempfile import os - + # Create a tiny test PNG (1x1 pixel) - test_png_data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xff\xff\xff\x00\x00\x00\x04\x00\x01\x9d\xb3\xa7c\x00\x00\x00\x00IEND\xaeB`\x82' - - with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f: + test_png_data = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xff\xff\xff\x00\x00\x00\x04\x00\x01\x9d\xb3\xa7c\x00\x00\x00\x00IEND\xaeB`\x82" + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: f.write(test_png_data) temp_path = f.name - + try: data_url = sai_model_spec.file_to_data_url(temp_path) - + # Check format assert data_url.startswith("data:image/png;base64,") - + # Check it's a reasonable length (base64 encoded) assert len(data_url) > 50 - + # Verify we can decode it back import base64 + encoded_part = data_url.split(",", 1)[1] decoded_data = base64.b64decode(encoded_part) assert decoded_data == test_png_data - + finally: os.unlink(temp_path) - + def test_file_to_data_url_nonexistent_file(self): """Test error handling for nonexistent files.""" import pytest - + with pytest.raises(FileNotFoundError): sai_model_spec.file_to_data_url("/nonexistent/file.png") - + def test_thumbnail_processing_in_metadata(self): """Test thumbnail processing in build_metadata_dataclass.""" import tempfile import os - + # Create a test image file - test_png_data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xff\xff\xff\x00\x00\x00\x04\x00\x01\x9d\xb3\xa7c\x00\x00\x00\x00IEND\xaeB`\x82' - - with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f: + test_png_data = 
b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xff\xff\xff\x00\x00\x00\x04\x00\x01\x9d\xb3\xa7c\x00\x00\x00\x00IEND\xaeB`\x82" + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: f.write(test_png_data) temp_path = f.name - + try: timestamp = time.time() - + # Test with file path - should be converted to data URL metadata = sai_model_spec.build_metadata_dataclass( state_dict=None, @@ -233,22 +242,24 @@ def test_thumbnail_processing_in_metadata(self): textual_inversion=False, timestamp=timestamp, title="Test Model", - optional_metadata={"thumbnail": temp_path} + optional_metadata={"thumbnail": temp_path}, ) - + # Should be converted to data URL assert "thumbnail" in metadata.additional_fields assert metadata.additional_fields["thumbnail"].startswith("data:image/png;base64,") - + finally: os.unlink(temp_path) - + def test_thumbnail_data_url_passthrough(self): """Test that existing data URLs are passed through unchanged.""" timestamp = time.time() - - existing_data_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==" - + + existing_data_url = ( + "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==" + ) + metadata = sai_model_spec.build_metadata_dataclass( state_dict=None, v2=False, @@ -258,16 +269,16 @@ def test_thumbnail_data_url_passthrough(self): textual_inversion=False, timestamp=timestamp, title="Test Model", - optional_metadata={"thumbnail": existing_data_url} + optional_metadata={"thumbnail": existing_data_url}, ) - + # Should be unchanged assert metadata.additional_fields["thumbnail"] == existing_data_url - + def test_thumbnail_invalid_file_handling(self): """Test graceful handling of invalid thumbnail files.""" timestamp = time.time() - + metadata = sai_model_spec.build_metadata_dataclass( state_dict=None, v2=False, @@ -277,20 +288,20 @@ def test_thumbnail_invalid_file_handling(self): textual_inversion=False, timestamp=timestamp, title="Test Model", - optional_metadata={"thumbnail": "/nonexistent/file.png"} + optional_metadata={"thumbnail": "/nonexistent/file.png"}, ) - + # Should be removed from additional_fields due to error assert "thumbnail" not in metadata.additional_fields class TestBuildMetadataIntegration: """Test the complete metadata building workflow.""" - + def test_sdxl_model_workflow(self): """Test complete workflow for SDXL model.""" timestamp = time.time() - + metadata = sai_model_spec.build_metadata_dataclass( state_dict=None, v2=False, @@ -299,18 +310,18 @@ def test_sdxl_model_workflow(self): lora=False, textual_inversion=False, timestamp=timestamp, - title="Test SDXL Model" + title="Test SDXL Model", ) - + assert metadata.architecture == "stable-diffusion-xl-v1-base" assert metadata.implementation == "https://github.com/Stability-AI/generative-models" assert metadata.resolution == "1024x1024" assert metadata.prediction_type == "epsilon" - + def test_flux_model_workflow(self): """Test complete workflow for Flux model.""" timestamp = time.time() - + metadata = sai_model_spec.build_metadata_dataclass( state_dict=None, v2=False, @@ -321,18 +332,18 @@ def test_flux_model_workflow(self): timestamp=timestamp, title="Test Flux Model", model_config={"flux": "dev"}, - optional_metadata={"trigger_phrase": "anime style"} + optional_metadata={"trigger_phrase": "anime style"}, ) - + assert metadata.architecture == "flux-1-dev" assert 
metadata.implementation == "https://github.com/black-forest-labs/flux" assert metadata.prediction_type is None # Flux doesn't use prediction_type assert metadata.additional_fields["trigger_phrase"] == "anime style" - + def test_legacy_function_compatibility(self): """Test that legacy build_metadata function works correctly.""" timestamp = time.time() - + metadata_dict = sai_model_spec.build_metadata( state_dict=None, v2=False, @@ -341,9 +352,9 @@ def test_legacy_function_compatibility(self): lora=False, textual_inversion=False, timestamp=timestamp, - title="Test Model" + title="Test Model", ) - + assert isinstance(metadata_dict, dict) assert metadata_dict["modelspec.sai_model_spec"] == "1.0.1" - assert metadata_dict["modelspec.architecture"] == "stable-diffusion-xl-v1-base" \ No newline at end of file + assert metadata_dict["modelspec.architecture"] == "stable-diffusion-xl-v1-base" diff --git a/tools/cache_latents.py b/tools/cache_latents.py index 515ece98d..5baddb5bf 100644 --- a/tools/cache_latents.py +++ b/tools/cache_latents.py @@ -12,6 +12,7 @@ from library import config_util, flux_train_utils, flux_utils, strategy_base, strategy_flux, strategy_sd, strategy_sdxl from library import train_util from library import sdxl_train_util +import library.sai_model_spec as sai_model_spec from library.config_util import ( ConfigSanitizer, BlueprintGenerator, @@ -161,6 +162,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_training_arguments(parser, True) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_masked_loss_arguments(parser) diff --git a/tools/cache_text_encoder_outputs.py b/tools/cache_text_encoder_outputs.py index 00459658e..8e6042923 100644 --- a/tools/cache_text_encoder_outputs.py +++ b/tools/cache_text_encoder_outputs.py @@ -22,6 +22,7 @@ from library import train_util from library import sdxl_train_util from library import utils +import library.sai_model_spec as sai_model_spec from library.config_util import ( ConfigSanitizer, BlueprintGenerator, @@ -188,6 +189,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_training_arguments(parser, True) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_masked_loss_arguments(parser) diff --git a/train_control_net.py b/train_control_net.py index ba016ac5d..97cd1ebb0 100644 --- a/train_control_net.py +++ b/train_control_net.py @@ -25,6 +25,7 @@ import library.model_util as model_util import library.train_util as train_util import library.config_util as config_util +import library.sai_model_spec as sai_model_spec from library.config_util import ( ConfigSanitizer, BlueprintGenerator, diff --git a/train_db.py b/train_db.py index edd674034..4bf3b31ce 100644 --- a/train_db.py +++ b/train_db.py @@ -22,6 +22,7 @@ import library.train_util as train_util import library.config_util as config_util +import library.sai_model_spec as sai_model_spec from library.config_util import ( ConfigSanitizer, BlueprintGenerator, @@ -512,6 +513,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, True, False, True) train_util.add_training_arguments(parser, True) 
train_util.add_masked_loss_arguments(parser) diff --git a/train_network.py b/train_network.py index aa42a3bf1..e055f5d8e 100644 --- a/train_network.py +++ b/train_network.py @@ -24,7 +24,7 @@ from accelerate import Accelerator from diffusers import DDPMScheduler from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL -from library import deepspeed_utils, model_util, sai_model_spec, strategy_base, strategy_sd +from library import deepspeed_utils, model_util, sai_model_spec, strategy_base, strategy_sd, sai_model_spec import library.train_util as train_util from library.train_util import DreamBoothDataset @@ -1711,6 +1711,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, True) train_util.add_masked_loss_arguments(parser) @@ -1718,7 +1719,6 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) - sai_model_spec.add_model_spec_arguments(parser) parser.add_argument( "--cpu_offload_checkpointing", diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 0c6568b08..8575698d6 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -16,7 +16,7 @@ from accelerate.utils import set_seed from diffusers import DDPMScheduler from transformers import CLIPTokenizer -from library import deepspeed_utils, model_util, strategy_base, strategy_sd +from library import deepspeed_utils, model_util, strategy_base, strategy_sd, sai_model_spec import library.train_util as train_util import library.huggingface_util as huggingface_util @@ -771,6 +771,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, True, True, False) train_util.add_training_arguments(parser, True) train_util.add_masked_loss_arguments(parser) diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 6ff97d03f..778210950 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -21,6 +21,7 @@ import library.train_util as train_util import library.huggingface_util as huggingface_util import library.config_util as config_util +import library.sai_model_spec as sai_model_spec from library.config_util import ( ConfigSanitizer, BlueprintGenerator, @@ -668,6 +669,7 @@ def setup_parser() -> argparse.ArgumentParser: add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) + sai_model_spec.add_model_spec_arguments(parser) train_util.add_dataset_arguments(parser, True, True, False) train_util.add_training_arguments(parser, True) train_util.add_masked_loss_arguments(parser) From 351bed965cfe27385557c52458ac4b35d4af5de7 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 13 Aug 2025 21:38:51 +0900 Subject: [PATCH 521/582] fix model type handling in analyze_state_dict_state function for SD3 --- library/sd3_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/library/sd3_utils.py b/library/sd3_utils.py index 1861dfbc2..d2ea6fffe 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -50,14 +50,14 @@ def analyze_state_dict_state(state_dict: Dict, prefix: str = ""): context_embedder_in_features = 
context_shape[1] context_embedder_out_features = context_shape[0] - # only supports 3-5-large, medium or 3-medium + # only supports 3-5-large, medium or 3-medium. This is added after `stable-diffusion-3-`. if qk_norm is not None: if len(x_block_self_attn_layers) == 0: - model_type = "3-5-large" + model_type = "5-large" else: - model_type = "3-5-medium" + model_type = "5-medium" else: - model_type = "3-medium" + model_type = "medium" params = sd3_models.SD3Params( patch_size=patch_size, From 6edbe00547bac6c2efb2d6952eb910851662cdf2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 16 Aug 2025 20:07:03 +0900 Subject: [PATCH 522/582] feat: update libraries, remove warnings --- library/model_util.py | 5 +- library/original_unet.py | 42 ++++++----- library/train_util.py | 72 ++++++++++++------- pytorch_lightning/__init__.py | 0 pytorch_lightning/callbacks/__init__.py | 0 .../callbacks/model_checkpoint.py | 4 ++ requirements.txt | 35 ++++----- sdxl_train_network.py | 7 +- train_network.py | 5 +- 9 files changed, 107 insertions(+), 63 deletions(-) create mode 100644 pytorch_lightning/__init__.py create mode 100644 pytorch_lightning/callbacks/__init__.py create mode 100644 pytorch_lightning/callbacks/model_checkpoint.py diff --git a/library/model_util.py b/library/model_util.py index 9918c7b2a..bcaa1145b 100644 --- a/library/model_util.py +++ b/library/model_util.py @@ -6,6 +6,7 @@ import torch from library.device_utils import init_ipex + init_ipex() import diffusers @@ -14,8 +15,10 @@ from safetensors.torch import load_file, save_file from library.original_unet import UNet2DConditionModel from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) # DiffUsers版StableDiffusionのモデルパラメータ @@ -974,7 +977,7 @@ def load_checkpoint_with_text_encoder_conversion(ckpt_path, device="cpu"): checkpoint = None state_dict = load_file(ckpt_path) # , device) # may causes error else: - checkpoint = torch.load(ckpt_path, map_location=device) + checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False) if "state_dict" in checkpoint: state_dict = checkpoint["state_dict"] else: diff --git a/library/original_unet.py b/library/original_unet.py index e944ff22b..aa9dc233b 100644 --- a/library/original_unet.py +++ b/library/original_unet.py @@ -114,8 +114,10 @@ from torch.nn import functional as F from einops import rearrange from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) BLOCK_OUT_CHANNELS: Tuple[int] = (320, 640, 1280, 1280) @@ -530,7 +532,9 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) else: hidden_states = resnet(hidden_states, temb) @@ -626,15 +630,9 @@ def forward(self, hidden_states, context=None, mask=None, **kwargs): hidden_states, encoder_hidden_states, attention_mask, - ) = translate_attention_names_from_diffusers( - hidden_states=hidden_states, context=context, mask=mask, **kwargs - ) + ) = translate_attention_names_from_diffusers(hidden_states=hidden_states, context=context, mask=mask, **kwargs) return self.processor( - attn=self, - hidden_states=hidden_states, - encoder_hidden_states=context, - attention_mask=mask, - **kwargs + attn=self, hidden_states=hidden_states, encoder_hidden_states=context, attention_mask=mask, **kwargs ) if 
self.use_memory_efficient_attention_xformers: return self.forward_memory_efficient_xformers(hidden_states, context, mask) @@ -748,13 +746,14 @@ def forward_sdpa(self, x, context=None, mask=None): out = self.to_out[0](out) return out + def translate_attention_names_from_diffusers( hidden_states: torch.FloatTensor, context: Optional[torch.FloatTensor] = None, mask: Optional[torch.FloatTensor] = None, # HF naming encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None + attention_mask: Optional[torch.FloatTensor] = None, ): # translate from hugging face diffusers context = context if context is not None else encoder_hidden_states @@ -764,6 +763,7 @@ def translate_attention_names_from_diffusers( return hidden_states, context, mask + # feedforward class GEGLU(nn.Module): r""" @@ -1015,9 +1015,11 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, use_reentrant=False )[0] else: hidden_states = resnet(hidden_states, temb) @@ -1098,10 +1100,12 @@ def custom_forward(*inputs): if attn is not None: hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, use_reentrant=False )[0] - hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) else: if attn is not None: hidden_states = attn(hidden_states, encoder_hidden_states).sample @@ -1201,7 +1205,9 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) else: hidden_states = resnet(hidden_states, temb) @@ -1296,9 +1302,11 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, use_reentrant=False )[0] else: hidden_states = resnet(hidden_states, temb) diff --git a/library/train_util.py b/library/train_util.py index 395183957..b432d0b62 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -683,7 +683,7 @@ def __init__( resolution: Optional[Tuple[int, int]], network_multiplier: float, debug_dataset: bool, - resize_interpolation: Optional[str] = None + resize_interpolation: Optional[str] = None, ) -> None: super().__init__() @@ -719,7 +719,9 @@ def __init__( self.image_transforms = IMAGE_TRANSFORMS if 
resize_interpolation is not None: - assert validate_interpolation_fn(resize_interpolation), f"Resize interpolation \"{resize_interpolation}\" is not a valid interpolation" + assert validate_interpolation_fn( + resize_interpolation + ), f'Resize interpolation "{resize_interpolation}" is not a valid interpolation' self.resize_interpolation = resize_interpolation self.image_data: Dict[str, ImageInfo] = {} @@ -1613,7 +1615,11 @@ def __getitem__(self, index): if self.enable_bucket: img, original_size, crop_ltrb = trim_and_resize_if_required( - subset.random_crop, img, image_info.bucket_reso, image_info.resized_size, resize_interpolation=image_info.resize_interpolation + subset.random_crop, + img, + image_info.bucket_reso, + image_info.resized_size, + resize_interpolation=image_info.resize_interpolation, ) else: if face_cx > 0: # 顔位置情報あり @@ -2101,7 +2107,9 @@ def load_dreambooth_dir(subset: DreamBoothSubset): for img_path, caption, size in zip(img_paths, captions, sizes): info = ImageInfo(img_path, num_repeats, caption, subset.is_reg, img_path) - info.resize_interpolation = subset.resize_interpolation if subset.resize_interpolation is not None else self.resize_interpolation + info.resize_interpolation = ( + subset.resize_interpolation if subset.resize_interpolation is not None else self.resize_interpolation + ) if size is not None: info.image_size = size if subset.is_reg: @@ -2385,7 +2393,7 @@ def __init__( bucket_no_upscale: bool, debug_dataset: bool, validation_split: float, - validation_seed: Optional[int], + validation_seed: Optional[int], resize_interpolation: Optional[str] = None, ) -> None: super().__init__(resolution, network_multiplier, debug_dataset, resize_interpolation) @@ -2448,7 +2456,7 @@ def __init__( self.num_train_images = self.dreambooth_dataset_delegate.num_train_images self.num_reg_images = self.dreambooth_dataset_delegate.num_reg_images self.validation_split = validation_split - self.validation_seed = validation_seed + self.validation_seed = validation_seed self.resize_interpolation = resize_interpolation # assert all conditioning data exists @@ -2538,7 +2546,14 @@ def __getitem__(self, index): cond_img.shape[0] == original_size_hw[0] and cond_img.shape[1] == original_size_hw[1] ), f"size of conditioning image is not match / 画像サイズが合いません: {image_info.absolute_path}" - cond_img = resize_image(cond_img, original_size_hw[1], original_size_hw[0], target_size_hw[1], target_size_hw[0], self.resize_interpolation) + cond_img = resize_image( + cond_img, + original_size_hw[1], + original_size_hw[0], + target_size_hw[1], + target_size_hw[0], + self.resize_interpolation, + ) # TODO support random crop # 現在サポートしているcropはrandomではなく中央のみ @@ -2552,7 +2567,14 @@ def __getitem__(self, index): # ), f"image size is small / 画像サイズが小さいようです: {image_info.absolute_path}" # resize to target if cond_img.shape[0] != target_size_hw[0] or cond_img.shape[1] != target_size_hw[1]: - cond_img = resize_image(cond_img, cond_img.shape[0], cond_img.shape[1], target_size_hw[1], target_size_hw[0], self.resize_interpolation) + cond_img = resize_image( + cond_img, + cond_img.shape[0], + cond_img.shape[1], + target_size_hw[1], + target_size_hw[0], + self.resize_interpolation, + ) if flipped: cond_img = cond_img[:, ::-1, :].copy() # copy to avoid negative stride @@ -3000,7 +3022,9 @@ def load_images_and_masks_for_caching( for info in image_infos: image = load_image(info.absolute_path, use_alpha_mask) if info.image is None else np.array(info.image, np.uint8) # TODO 
画像のメタデータが壊れていて、メタデータから割り当てたbucketと実際の画像サイズが一致しない場合があるのでチェック追加要 - image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size, resize_interpolation=info.resize_interpolation) + image, original_size, crop_ltrb = trim_and_resize_if_required( + random_crop, image, info.bucket_reso, info.resized_size, resize_interpolation=info.resize_interpolation + ) original_sizes.append(original_size) crop_ltrbs.append(crop_ltrb) @@ -3041,7 +3065,9 @@ def cache_batch_latents( for info in image_infos: image = load_image(info.absolute_path, use_alpha_mask) if info.image is None else np.array(info.image, np.uint8) # TODO 画像のメタデータが壊れていて、メタデータから割り当てたbucketと実際の画像サイズが一致しない場合があるのでチェック追加要 - image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size, resize_interpolation=info.resize_interpolation) + image, original_size, crop_ltrb = trim_and_resize_if_required( + random_crop, image, info.bucket_reso, info.resized_size, resize_interpolation=info.resize_interpolation + ) info.latents_original_size = original_size info.latents_crop_ltrb = crop_ltrb @@ -3482,9 +3508,9 @@ def get_sai_model_spec( textual_inversion: bool, is_stable_diffusion_ckpt: Optional[bool] = None, # None for TI and LoRA sd3: str = None, - flux: str = None, # "dev", "schnell" or "chroma" + flux: str = None, # "dev", "schnell" or "chroma" lumina: str = None, - optional_metadata: dict[str, str] | None = None + optional_metadata: dict[str, str] | None = None, ): timestamp = time.time() @@ -3513,7 +3539,7 @@ def get_sai_model_spec( # Extract metadata_* fields from args and merge with optional_metadata extracted_metadata = {} - + # Extract all metadata_* attributes from args for attr_name in dir(args): if attr_name.startswith("metadata_") and not attr_name.startswith("metadata___"): @@ -3523,7 +3549,7 @@ def get_sai_model_spec( field_name = attr_name[9:] # len("metadata_") = 9 if field_name not in ["title", "author", "description", "license", "tags"]: extracted_metadata[field_name] = value - + # Merge extracted metadata with provided optional_metadata all_optional_metadata = {**extracted_metadata} if optional_metadata: @@ -3546,7 +3572,7 @@ def get_sai_model_spec( tags=args.metadata_tags, timesteps=timesteps, clip_skip=args.clip_skip, # None or int - model_config=model_config, + model_config=model_config, optional_metadata=all_optional_metadata if all_optional_metadata else None, ) return metadata @@ -3562,7 +3588,7 @@ def get_sai_model_spec_dataclass( sd3: str = None, flux: str = None, lumina: str = None, - optional_metadata: dict[str, str] | None = None + optional_metadata: dict[str, str] | None = None, ) -> sai_model_spec.ModelSpecMetadata: """ Get ModelSpec metadata as a dataclass - preferred for new code. 
@@ -5558,11 +5584,12 @@ def load_target_model(args, weight_dtype, accelerator, unet_use_linear_projectio def patch_accelerator_for_fp16_training(accelerator): - + from accelerate import DistributedType + if accelerator.distributed_type == DistributedType.DEEPSPEED: return - + org_unscale_grads = accelerator.scaler._unscale_grads_ def _unscale_grads_replacer(optimizer, inv_scale, found_inf, allow_fp16): @@ -6054,7 +6081,6 @@ def get_noise_noisy_latents_and_timesteps( b_size = latents.shape[0] min_timestep = 0 if args.min_timestep is None else args.min_timestep max_timestep = noise_scheduler.config.num_train_timesteps if args.max_timestep is None else args.max_timestep - timesteps = get_timesteps(min_timestep, max_timestep, b_size, latents.device) # Add noise to the latents according to the noise magnitude at each timestep @@ -6279,7 +6305,6 @@ def line_to_prompt_dict(line: str) -> dict: prompt_dict["renorm_cfg"] = float(m.group(1)) continue - except ValueError as ex: logger.error(f"Exception in parsing / 解析エラー: {parg}") logger.error(ex) @@ -6328,7 +6353,7 @@ def sample_images_common( vae, tokenizer, text_encoder, - unet, + unet_wrapped, prompt_replacement=None, controlnet=None, ): @@ -6363,7 +6388,7 @@ def sample_images_common( vae.to(distributed_state.device) # distributed_state.device is same as accelerator.device # unwrap unet and text_encoder(s) - unet = accelerator.unwrap_model(unet) + unet = accelerator.unwrap_model(unet_wrapped) if isinstance(text_encoder, (list, tuple)): text_encoder = [accelerator.unwrap_model(te) for te in text_encoder] else: @@ -6509,7 +6534,7 @@ def sample_image_inference( logger.info(f"sample_sampler: {sampler_name}") if seed is not None: logger.info(f"seed: {seed}") - with accelerator.autocast(): + with accelerator.autocast(), torch.no_grad(): latents = pipeline( prompt=prompt, height=height, @@ -6647,4 +6672,3 @@ def moving_average(self) -> float: if losses == 0: return 0 return self.loss_total / losses - diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py new file mode 100644 index 000000000..1ba145634 --- /dev/null +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -0,0 +1,4 @@ +# dummy module for pytorch_lightning + +class ModelCheckpoint: + pass diff --git a/requirements.txt b/requirements.txt index 448af323c..7c7060c7b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,29 @@ -accelerate==0.33.0 -transformers==4.44.0 -diffusers[torch]==0.25.0 -ftfy==6.1.1 +accelerate==1.6.0 +transformers==4.54.1 +diffusers[torch]==0.32.1 +ftfy==6.3.1 # albumentations==1.3.0 -opencv-python==4.8.1.78 +opencv-python==4.10.0.84 einops==0.7.0 -pytorch-lightning==1.9.0 -bitsandbytes==0.44.0 -lion-pytorch==0.0.6 +# pytorch-lightning==1.9.0 +bitsandbytes==0.45.4 +lion-pytorch==0.2.3 schedulefree==1.4 pytorch-optimizer==3.7.0 -prodigy-plus-schedule-free==1.9.0 +prodigy-plus-schedule-free==1.9.2 prodigyopt==1.1.2 tensorboard -safetensors==0.4.4 +safetensors==0.4.5 # gradio==3.16.2 -altair==4.2.2 -easygui==0.98.3 +# altair==4.2.2 +# easygui==0.98.3 toml==0.10.2 -voluptuous==0.13.1 -huggingface-hub==0.24.5 +voluptuous==0.15.2 +huggingface-hub==0.34.3 # for Image utils imagesize==1.4.1 -numpy<=2.0 +numpy +# <=2.0 # for BLIP captioning # 
requests==2.28.2 # timm==0.6.12 @@ -41,8 +42,8 @@ numpy<=2.0 # open clip for SDXL # open-clip-torch==2.20.0 # For logging -rich==13.7.0 +rich==14.1.0 # for T5XXL tokenizer (SD3/FLUX) -sentencepiece==0.2.0 +sentencepiece==0.2.1 # for kohya_ss library -e . diff --git a/sdxl_train_network.py b/sdxl_train_network.py index d56c76b03..5c5bcd63a 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -23,7 +23,12 @@ def __init__(self): self.vae_scale_factor = sdxl_model_util.VAE_SCALE_FACTOR self.is_sdxl = True - def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): + def assert_extra_args( + self, + args, + train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], + val_dataset_group: Optional[train_util.DatasetGroup], + ): sdxl_train_util.verify_sdxl_training_args(args) if args.cache_text_encoder_outputs: diff --git a/train_network.py b/train_network.py index e055f5d8e..3dedb574c 100644 --- a/train_network.py +++ b/train_network.py @@ -414,13 +414,12 @@ def process_batch( if text_encoder_outputs_list is not None: text_encoder_conds = text_encoder_outputs_list # List of text encoder outputs - if len(text_encoder_conds) == 0 or text_encoder_conds[0] is None or train_text_encoder: # TODO this does not work if 'some text_encoders are trained' and 'some are not and not cached' with torch.set_grad_enabled(is_train and train_text_encoder), accelerator.autocast(): # Get the text embedding for conditioning if args.weighted_captions: - input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch['captions']) + input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch["captions"]) encoded_text_encoder_conds = text_encoding_strategy.encode_tokens_with_weights( tokenize_strategy, self.get_models_for_text_encoding(args, accelerator, text_encoders), @@ -1340,7 +1339,7 @@ def remove_model(old_ckpt_name): ) NUM_VALIDATION_TIMESTEPS = 4 # 200, 400, 600, 800 TODO make this configurable min_timestep = 0 if args.min_timestep is None else args.min_timestep - max_timestep = noise_scheduler.num_train_timesteps if args.max_timestep is None else args.max_timestep + max_timestep = noise_scheduler.config.num_train_timesteps if args.max_timestep is None else args.max_timestep validation_timesteps = np.linspace(min_timestep, max_timestep, (NUM_VALIDATION_TIMESTEPS + 2), dtype=int)[1:-1] validation_total_steps = validation_steps * len(validation_timesteps) original_args_min_timestep = args.min_timestep From 6f24bce7ccacdf0c13614fe84413c2446adbf35c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 16 Aug 2025 22:03:31 +0900 Subject: [PATCH 523/582] fix: remove unnecessary super call in assert_extra_args method --- sdxl_train_textual_inversion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index 982007601..be538cdd6 100644 --- a/sdxl_train_textual_inversion.py +++ b/sdxl_train_textual_inversion.py @@ -20,7 +20,6 @@ def __init__(self): self.is_sdxl = True def assert_extra_args(self, args, train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset], val_dataset_group: Optional[train_util.DatasetGroup]): - super().assert_extra_args(args, train_dataset_group, val_dataset_group) sdxl_train_util.verify_sdxl_training_args(args, supportTextEncoderCaching=False) train_dataset_group.verify_bucket_reso_steps(32) From f61c442f0b7d4bc30f3d3eb3e169c13f424107ad Mon Sep 17 
00:00:00 2001 From: Kohya S Date: Sat, 16 Aug 2025 22:03:52 +0900 Subject: [PATCH 524/582] fix: use strategy for tokenizer and latent caching --- train_control_net.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/train_control_net.py b/train_control_net.py index 97cd1ebb0..c12693baf 100644 --- a/train_control_net.py +++ b/train_control_net.py @@ -12,7 +12,7 @@ from tqdm import tqdm import torch -from library import deepspeed_utils +from library import deepspeed_utils, strategy_base, strategy_sd from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -73,7 +73,14 @@ def train(args): args.seed = random.randint(0, 2**32) set_seed(args.seed) - tokenizer = train_util.load_tokenizer(args) + tokenize_strategy = strategy_sd.SdTokenizeStrategy(args.v2, args.max_token_length, args.tokenizer_cache_dir) + strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy) + tokenizer = tokenize_strategy.tokenizer + # prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization. + latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy( + True, args.cache_latents_to_disk, args.vae_batch_size, False + ) + strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy) # データセットを準備する blueprint_generator = BlueprintGenerator(ConfigSanitizer(False, False, True, True)) @@ -100,7 +107,7 @@ def train(args): ] } - blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer) + blueprint = blueprint_generator.generate(user_config, args) train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group) current_epoch = Value("i", 0) @@ -243,12 +250,7 @@ def __contains__(self, name): vae.requires_grad_(False) vae.eval() with torch.no_grad(): - train_dataset_group.cache_latents( - vae, - args.vae_batch_size, - args.cache_latents_to_disk, - accelerator.is_main_process, - ) + train_dataset_group.new_cache_latents(vae, accelerator) vae.to("cpu") clean_memory_on_device(accelerator.device) @@ -267,6 +269,7 @@ def __contains__(self, name): # dataloaderを準備する # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 + train_dataset_group.set_current_strategies() n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers train_dataloader = torch.utils.data.DataLoader( @@ -451,7 +454,7 @@ def remove_model(old_ckpt_name): latents = latents * 0.18215 b_size = latents.shape[0] - input_ids = batch["input_ids"].to(accelerator.device) + input_ids = batch["input_ids_list"][0].to(accelerator.device) encoder_hidden_states = train_util.get_hidden_states(args, input_ids, tokenizer, text_encoder, weight_dtype) # Sample noise that we'll add to the latents From acba279b0b6a62ed9266f1011bbbae13d098d7fe Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 24 Aug 2025 17:03:18 +0900 Subject: [PATCH 525/582] fix: update PyTorch version in workflow matrix --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9e037e539..88b0b1770 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,7 +23,8 @@ jobs: os: [ubuntu-latest] python-version: ["3.10"] # Python versions to test pytorch-version: ["2.4.0"] # PyTorch versions to test - + pytorch-version: ["2.6.0"] # PyTorch versions to test + steps: - uses: 
actions/checkout@v4 with: From f7acd2f7a3319487feb5277934f5cc998e46b231 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 24 Aug 2025 17:16:17 +0900 Subject: [PATCH 526/582] fix: consolidate PyTorch versions in workflow matrix --- .github/workflows/tests.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 88b0b1770..d35fe3925 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,9 +22,8 @@ jobs: matrix: os: [ubuntu-latest] python-version: ["3.10"] # Python versions to test - pytorch-version: ["2.4.0"] # PyTorch versions to test - pytorch-version: ["2.6.0"] # PyTorch versions to test - + pytorch-version: ["2.4.0", "2.6.0"] # PyTorch versions to test + steps: - uses: actions/checkout@v4 with: From ac72cf88a76d6165e70c96c6aa76282977bd378c Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 28 Aug 2025 08:35:40 +0900 Subject: [PATCH 527/582] feat: remove bitsandbytes version specification in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7c7060c7b..624978b49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ ftfy==6.3.1 opencv-python==4.10.0.84 einops==0.7.0 # pytorch-lightning==1.9.0 -bitsandbytes==0.45.4 +bitsandbytes lion-pytorch==0.2.3 schedulefree==1.4 pytorch-optimizer==3.7.0 From c52c45cd7ab6b8e92b8967e1d46965a67128ee6a Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 28 Aug 2025 08:36:09 +0900 Subject: [PATCH 528/582] doc: update for PyTorch and libraries versions --- README.md | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index be0ae4064..7dc9a4b6b 100644 --- a/README.md +++ b/README.md @@ -4,18 +4,29 @@ This repository contains training, generation and utility scripts for Stable Dif This feature is experimental. The options and the training script may change in the future. Please let us know if you have any idea to improve the training. -__Please update PyTorch to 2.4.0. We have tested with `torch==2.4.0` and `torchvision==0.19.0` with CUDA 12.4. We also updated `accelerate` to 0.33.0 just to be safe. `requirements.txt` is also updated, so please update the requirements.__ +__Please update PyTorch to 2.6.0 or later. We have tested with `torch==2.6.0` and `torchvision==0.21.0` with CUDA 12.4. `requirements.txt` is also updated, so please update the requirements.__ The command to install PyTorch is as follows: -`pip3 install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu124` +`pip3 install torch==2.6.0 torchvision==0.21.0 --index-url https://download.pytorch.org/whl/cu124` -If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed==0.16.7`. +For RTX 50 series GPUs, PyTorch 2.8.0 with CUDA 12.8/9 should be used. `requirements.txt` will work with this version. + +If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed` (appropriate version is not confirmed yet). - [FLUX.1 training](#flux1-training) - [SD3 training](#sd3-training) ### Recent Updates +Aug 28, 2025: +- In order to support the latest GPUs and features, we have updated the **PyTorch and library versions**. There are many changes, so please let us know if you encounter any issues. 
+- The PyTorch version used for testing has been updated to 2.6.0. We have confirmed that it works with PyTorch 2.6.0 and later. +- The `requirements.txt` has been updated, so please update your dependencies. + - You can update the dependencies with `pip install -r requirements.txt`. + - The version specification for `bitsandbytes` has been removed. If you encounter errors on RTX 50 series GPUs, please update it with `pip install -U bitsandbytes`. +- We have modified each script to minimize warnings as much as possible. + - The modified scripts will work in the old environment (library versions), but please update them when convenient. + Jul 30, 2025: - **Breaking Change**: For FLUX.1 and Chroma training, the CFG (Classifier-Free Guidance, using negative prompts) scale option for sample image generation during training has been changed from `--g` to `--l`. The `--g` option is now used for the embedded guidance scale. Please update your prompts accordingly. See [Sample Image Generation During Training](#sample-image-generation-during-training) for details. From 5a5138d0ab71d1d640851e6ffa8eb2ad2f2e2b60 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 28 Aug 2025 08:38:14 +0900 Subject: [PATCH 529/582] doc: add PR reference for PyTorch and library versions update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7dc9a4b6b..5953cef50 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed ### Recent Updates Aug 28, 2025: -- In order to support the latest GPUs and features, we have updated the **PyTorch and library versions**. There are many changes, so please let us know if you encounter any issues. +- In order to support the latest GPUs and features, we have updated the **PyTorch and library versions**. PR [#2178](https://github.com/kohya-ss/sd-scripts/pull/2178) There are many changes, so please let us know if you encounter any issues. - The PyTorch version used for testing has been updated to 2.6.0. We have confirmed that it works with PyTorch 2.6.0 and later. - The `requirements.txt` has been updated, so please update your dependencies. - You can update the dependencies with `pip install -r requirements.txt`. From e836b7f66d93f411515f593d17fa17eaca3bb5b1 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 30 Aug 2025 09:30:24 +0900 Subject: [PATCH 530/582] fix: chroma LoRA training without Text Encode caching --- library/flux_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/library/flux_utils.py b/library/flux_utils.py index 3f0a0d63e..220548547 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -220,8 +220,12 @@ def __init__(self): class DummyCLIPL(torch.nn.Module): def __init__(self): super().__init__() - self.output_shape = (77, 1) # Note: The original code had (77, 768), but we use (77, 1) for the dummy output - self.dummy_param = torch.nn.Parameter(torch.zeros(1)) # get dtype and device from this parameter + self.output_shape = (77, 1) # Note: The original code had (77, 768), but we use (77, 1) for the dummy output + + # dtype and device from these parameters. 
train_network.py accesses them + self.dummy_param = torch.nn.Parameter(torch.zeros(1)) + self.dummy_param_2 = torch.nn.Parameter(torch.zeros(1)) + self.dummy_param_3 = torch.nn.Parameter(torch.zeros(1)) self.text_model = DummyTextModel() @property From 989448afddb47e10c9177e31cf12065f88af291e Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 31 Aug 2025 19:19:10 +0900 Subject: [PATCH 531/582] doc: enhance SD3/SDXL LoRA training guide --- docs/sd3_train_network.md | 137 ++++++++++++++++++++++++++-- docs/sdxl_train_network_advanced.md | 97 ++++++++++++++++++-- 2 files changed, 218 insertions(+), 16 deletions(-) diff --git a/docs/sd3_train_network.md b/docs/sd3_train_network.md index e10829aae..f235e8ef0 100644 --- a/docs/sd3_train_network.md +++ b/docs/sd3_train_network.md @@ -1,5 +1,3 @@ -Status: reviewed - # LoRA Training Guide for Stable Diffusion 3/3.5 using `sd3_train_network.py` / `sd3_train_network.py` を用いたStable Diffusion 3/3.5モデルのLoRA学習ガイド This document explains how to train LoRA (Low-Rank Adaptation) models for Stable Diffusion 3 (SD3) and Stable Diffusion 3.5 (SD3.5) using `sd3_train_network.py` in the `sd-scripts` repository. @@ -18,7 +16,6 @@ This guide assumes you already understand the basics of LoRA training. For commo
日本語 -ステータス:内容を一通り確認した `sd3_train_network.py`は、Stable Diffusion 3/3.5モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。SD3は、MMDiT (Multi-Modal Diffusion Transformer) と呼ばれる新しいアーキテクチャを採用しており、従来のStable Diffusionモデルとは構造が異なります。このスクリプトを使用することで、SD3/3.5モデルに特化したLoRAモデルを作成できます。 @@ -106,6 +103,7 @@ accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py \
日本語 + 学習は、ターミナルから`sd3_train_network.py`を実行することで開始します。基本的なコマンドラインの構造は`train_network.py`と同様ですが、SD3/3.5特有の引数を指定する必要があります。 以下に、基本的なコマンドライン実行例を示します。 @@ -136,6 +134,7 @@ accelerate launch --num_cpu_threads_per_process 1 sd3_train_network.py ``` ※実際には1行で書くか、適切な改行文字(`\` または `^`)を使用してください。 +
### 4.1. Explanation of Key Options / 主要なコマンドライン引数の解説 @@ -162,6 +161,8 @@ Besides the arguments explained in the [train_network.py guide](train_network.md #### Memory and Speed / メモリ・速度関連 * `--blocks_to_swap=` **[experimental]** – Swap a number of Transformer blocks between CPU and GPU. More blocks reduce VRAM but slow training. Cannot be used with `--cpu_offload_checkpointing`. +* `--cache_text_encoder_outputs` – Caches the outputs of the text encoders to reduce VRAM usage and speed up training. This is particularly effective for SD3, which uses three text encoders. Recommended when not training the text encoder LoRA. For more details, see the [`sdxl_train_network.py` guide](sdxl_train_network.md). +* `--cache_text_encoder_outputs_to_disk` – Caches the text encoder outputs to disk when the above option is enabled. #### Incompatible or Deprecated Options / 非互換・非推奨の引数 @@ -169,6 +170,7 @@ Besides the arguments explained in the [train_network.py guide](train_network.md
日本語 + [`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のSD3/3.5特有の引数を指定します。共通の引数については、上記ガイドを参照してください。 #### モデル関連 @@ -194,29 +196,148 @@ Besides the arguments explained in the [train_network.py guide](train_network.md #### メモリ・速度関連 * `--blocks_to_swap=` **[実験的機能]** – TransformerブロックをCPUとGPUでスワップしてVRAMを節約します。`--cpu_offload_checkpointing`とは併用できません。 +* `--cache_text_encoder_outputs` – Text Encoderの出力をキャッシュし、VRAM使用量削減と学習高速化を図ります。SD3は3つのText Encoderを持つため特に効果的です。Text EncoderのLoRAを学習しない場合に推奨されます。詳細は[`sdxl_train_network.py`のガイド](sdxl_train_network.md)を参照してください。 +* `--cache_text_encoder_outputs_to_disk` – 上記オプションと併用し、Text Encoderの出力をディスクにキャッシュします。 #### 非互換・非推奨の引数 * `--v2`, `--v_parameterization`, `--clip_skip` – Stable Diffusion v1/v2向けの引数のため、SD3/3.5学習では使用されません。 +
### 4.2. Starting Training / 学習の開始 After setting the required arguments, run the command to begin training. The overall flow and how to check logs are the same as in the [train_network.py guide](train_network.md#32-starting-the-training--学習の開始). -## 5. Using the Trained Model / 学習済みモデルの利用 +
+日本語 -When training finishes, a LoRA model file (e.g. `my_sd3_lora.safetensors`) is saved in the directory specified by `output_dir`. Use this file with inference environments that support SD3/3.5, such as ComfyUI. +必要な引数を設定したら、コマンドを実行して学習を開始します。全体の流れやログの確認方法は、[train_network.pyのガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。 + +
-## 6. Others / その他 +## 5. LoRA Target Modules / LoRAの学習対象モジュール -`sd3_train_network.py` shares many features with `train_network.py`, such as sample image generation (`--sample_prompts`, etc.) and detailed optimizer settings. For these, see the [train_network.py guide](train_network.md#5-other-features--その他の機能) or run `python sd3_train_network.py --help`. +When training LoRA with `sd3_train_network.py`, the following modules are targeted by default: + +* **MMDiT (replaces U-Net)**: + * `qkv` (Query, Key, Value) matrices and `proj_out` (output projection) in the attention blocks. +* **final_layer**: + * The output layer at the end of MMDiT. + +By using `--network_args`, you can apply more detailed controls, such as setting different ranks (dimensions) for each module. + +### Specify rank for each layer in SD3 LoRA / 各層のランクを指定する + +You can specify the rank for each layer in SD3 by specifying the following network_args. If you specify `0`, LoRA will not be applied to that layer. + +When network_args is not specified, the default value (`network_dim`) is applied, same as before. + +|network_args|target layer| +|---|---| +|context_attn_dim|attn in context_block| +|context_mlp_dim|mlp in context_block| +|context_mod_dim|adaLN_modulation in context_block| +|x_attn_dim|attn in x_block| +|x_mlp_dim|mlp in x_block| +|x_mod_dim|adaLN_modulation in x_block| + +`"verbose=True"` is also available for debugging. It shows the rank of each layer. + +example: +``` +--network_args "context_attn_dim=2" "context_mlp_dim=3" "context_mod_dim=4" "x_attn_dim=5" "x_mlp_dim=6" "x_mod_dim=7" "verbose=True" +``` + +You can apply LoRA to the conditioning layers of SD3 by specifying `emb_dims` in network_args. When specifying, be sure to specify 6 numbers in `[]` as a comma-separated list. + +example: +``` +--network_args "emb_dims=[2,3,4,5,6,7]" +``` + +Each number corresponds to `context_embedder`, `t_embedder`, `x_embedder`, `y_embedder`, `final_layer_adaLN_modulation`, `final_layer_linear`. The above example applies LoRA to all conditioning layers, with rank 2 for `context_embedder`, 3 for `t_embedder`, 4 for `context_embedder`, 5 for `y_embedder`, 6 for `final_layer_adaLN_modulation`, and 7 for `final_layer_linear`. + +If you specify `0`, LoRA will not be applied to that layer. For example, `[4,0,0,4,0,0]` applies LoRA only to `context_embedder` and `y_embedder`. + +### Specify blocks to train in SD3 LoRA training + +You can specify the blocks to train in SD3 LoRA training by specifying `train_block_indices` in network_args. The indices are 0-based. The default (when omitted) is to train all blocks. The indices are specified as a list of integers or a range of integers, like `0,1,5,8` or `0,1,4-5,7`. + +The number of blocks depends on the model. The valid range is 0-(the number of blocks - 1). `all` is also available to train all blocks, `none` is also available to train no blocks. + +example: +``` +--network_args "train_block_indices=1,2,6-8" +``` + +
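As a rough illustration of the `emb_dims` rule described above — a sketch only, with a hypothetical helper name, not the implementation used by the training scripts — the six values map to the conditioning layers like this in plain Python:

```python
# Illustrative sketch: how the six emb_dims values map to the SD3 conditioning
# layers, and how a value of 0 disables LoRA for that layer.
# Helper name and return format are hypothetical, not sd-scripts API.

EMB_TARGETS = [
    "context_embedder",
    "t_embedder",
    "x_embedder",
    "y_embedder",
    "final_layer_adaLN_modulation",
    "final_layer_linear",
]

def resolve_emb_dims(emb_dims):
    if len(emb_dims) != len(EMB_TARGETS):
        raise ValueError("emb_dims must contain exactly 6 values")
    # rank 0 means "do not apply LoRA to this layer"
    return {name: rank for name, rank in zip(EMB_TARGETS, emb_dims) if rank > 0}

print(resolve_emb_dims([2, 3, 4, 5, 6, 7]))
# {'context_embedder': 2, 't_embedder': 3, 'x_embedder': 4, 'y_embedder': 5,
#  'final_layer_adaLN_modulation': 6, 'final_layer_linear': 7}
print(resolve_emb_dims([4, 0, 0, 4, 0, 0]))
# {'context_embedder': 4, 'y_embedder': 4}
```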
+日本語 + +`sd3_train_network.py`でLoRAを学習させる場合、デフォルトでは以下のモジュールが対象となります。 + +* **MMDiT (U-Netの代替)**: + * Attentionブロック内の`qkv`(Query, Key, Value)行列と、`proj_out`(出力Projection)。 +* **final_layer**: + * MMDiTの最後にある出力層。 + +`--network_args` を使用することで、モジュールごとに異なるランク(次元数)を設定するなど、より詳細な制御が可能です。 + +### SD3 LoRAで各層のランクを指定する + +各層のランクを指定するには、`--network_args`オプションを使用します。`0`を指定すると、その層にはLoRAが適用されません。 + +network_argsが指定されない場合、デフォルト値(`network_dim`)が適用されます。 + +|network_args|target layer| +|---|---| +|context_attn_dim|attn in context_block| +|context_mlp_dim|mlp in context_block| +|context_mod_dim|adaLN_modulation in context_block| +|x_attn_dim|attn in x_block| +|x_mlp_dim|mlp in x_block| +|x_mod_dim|adaLN_modulation in x_block| + +`"verbose=True"`を指定すると、各層のランクが表示されます。 + +例: + +```bash +--network_args "context_attn_dim=2" "context_mlp_dim=3" "context_mod_dim=4" "x_attn_dim=5" "x_mlp_dim=6" "x_mod_dim=7" "verbose=True" +``` + +また、`emb_dims`を指定することで、SD3の条件付け層にLoRAを適用することもできます。指定する際は、必ず`[]`内にカンマ区切りで6つの数字を指定してください。 + +```bash +--network_args "emb_dims=[2,3,4,5,6,7]" +``` + +各数字は、`context_embedder`、`t_embedder`、`x_embedder`、`y_embedder`、`final_layer_adaLN_modulation`、`final_layer_linear`に対応しています。上記の例では、すべての条件付け層にLoRAを適用し、`context_embedder`に2、`t_embedder`に3、`x_embedder`に4、`y_embedder`に5、`final_layer_adaLN_modulation`に6、`final_layer_linear`に7のランクを設定しています。 + +`0`を指定すると、その層にはLoRAが適用されません。例えば、`[4,0,0,4,0,0]`と指定すると、`context_embedder`と`y_embedder`のみにLoRAが適用されます。 + +
+ + +## 6. Using the Trained Model / 学習済みモデルの利用 + +When training finishes, a LoRA model file (e.g. `my_sd3_lora.safetensors`) is saved in the directory specified by `output_dir`. Use this file with inference environments that support SD3/3.5, such as ComfyUI.
日本語 -必要な引数を設定し、コマンドを実行すると学習が開始されます。基本的な流れやログの確認方法は[`train_network.py`のガイド](train_network.md#32-starting-the-training--学習の開始)と同様です。 学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_sd3_lora.safetensors`)が保存されます。このファイルは、SD3/3.5モデルに対応した推論環境(例: ComfyUIなど)で使用できます。 +
+ + +## 7. Others / その他 + +`sd3_train_network.py` shares many features with `train_network.py`, such as sample image generation (`--sample_prompts`, etc.) and detailed optimizer settings. For these, see the [train_network.py guide](train_network.md#5-other-features--その他の機能) or run `python sd3_train_network.py --help`. + +
+日本語 + `sd3_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python sd3_train_network.py --help`) を参照してください。 +
diff --git a/docs/sdxl_train_network_advanced.md b/docs/sdxl_train_network_advanced.md index 39844c98b..fd7086047 100644 --- a/docs/sdxl_train_network_advanced.md +++ b/docs/sdxl_train_network_advanced.md @@ -1,5 +1,3 @@ -Status: under review - # Advanced Settings: Detailed Guide for SDXL LoRA Training Script `sdxl_train_network.py` / 高度な設定: SDXL LoRA学習スクリプト `sdxl_train_network.py` 詳細ガイド This document describes the advanced options available when training LoRA models for SDXL (Stable Diffusion XL) with `sdxl_train_network.py` in the `sd-scripts` repository. For the basics, please read [How to Use the LoRA Training Script `train_network.py`](train_network.md) and [How to Use the SDXL LoRA Training Script `sdxl_train_network.py`](sdxl_train_network.md). @@ -137,11 +135,55 @@ Basic options are common with `train_network.py`. * `--clip_skip=N`: Uses the output from N layers skipped from the final layer of Text Encoders. **Not typically used for SDXL**. * `--lowram` / `--highvram`: Options for memory usage optimization. `--lowram` is for environments like Colab where RAM < VRAM, `--highvram` is for environments with ample VRAM. * `--persistent_data_loader_workers` / `--max_data_loader_n_workers=N`: Settings for DataLoader worker processes. Affects wait time between epochs and memory usage. -* `--config_file=\"\"` / `--output_config`: Options to use/output a `.toml` file instead of command line arguments. +* `--config_file=""` / `--output_config`: Options to use/output a `.toml` file instead of command line arguments. * **Accelerate/DeepSpeed related:** (`--ddp_timeout`, `--ddp_gradient_as_bucket_view`, `--ddp_static_graph`): Detailed settings for distributed training. Accelerate settings (`accelerate config`) are usually sufficient. DeepSpeed requires separate configuration. +## 1.11. Console and Logging / コンソールとログ + +* `--console_log_level`: Sets the logging level for the console output. Choose from `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`. +* `--console_log_file`: Redirects console logs to a specified file. +* `--console_log_simple`: Enables a simpler log format. + +### 1.12. Hugging Face Hub Integration / Hugging Face Hub 連携 + +* `--huggingface_repo_id`: The repository name on Hugging Face Hub to upload the model to (e.g., `your-username/your-model`). +* `--huggingface_repo_type`: The type of repository on Hugging Face Hub. Usually `model`. +* `--huggingface_path_in_repo`: The path within the repository to upload files to. +* `--huggingface_token`: Your Hugging Face Hub authentication token. +* `--huggingface_repo_visibility`: Sets the visibility of the repository (`public` or `private`). +* `--resume_from_huggingface`: Resumes training from a state saved on Hugging Face Hub. +* `--async_upload`: Enables asynchronous uploading of models to the Hub, preventing it from blocking the training process. +* `--save_n_epoch_ratio`: Saves the model at a certain ratio of total epochs. For example, `5` will save at least 5 checkpoints throughout the training. + +### 1.13. Advanced Attention Settings / 高度なAttention設定 + +* `--mem_eff_attn`: Use memory-efficient attention mechanism. This is an older implementation and `sdpa` or `xformers` are generally recommended. +* `--xformers`: Use xformers library for memory-efficient attention. Requires `pip install xformers`. + +### 1.14. Advanced LR Scheduler Settings / 高度な学習率スケジューラ設定 + +* `--lr_scheduler_type`: Specifies a custom scheduler module. +* `--lr_scheduler_args`: Provides additional arguments to the custom scheduler (e.g., `"T_max=100"`). 
+* `--lr_decay_steps`: Sets the number of steps for the learning rate to decay. +* `--lr_scheduler_timescale`: The timescale for the inverse square root scheduler. +* `--lr_scheduler_min_lr_ratio`: Sets the minimum learning rate as a ratio of the initial learning rate for certain schedulers. + +### 1.15. Differential Learning with LoRA / LoRAの差分学習 + +This technique involves merging a pre-trained LoRA into the base model before starting a new training session. This is useful for fine-tuning an existing LoRA or for learning the 'difference' from it. + +* `--base_weights`: Path to one or more LoRA weight files to be merged into the base model before training begins. +* `--base_weights_multiplier`: A multiplier for the weights of the LoRA specified by `--base_weights`. You can specify multiple values if you provide multiple weights. + +### 1.16. Other Miscellaneous Options / その他のオプション + +* `--tokenizer_cache_dir`: Specifies a directory to cache the tokenizer, which is useful for offline training. +* `--scale_weight_norms`: Scales the weight norms of the LoRA modules. This can help prevent overfitting by controlling the magnitude of the weights. A value of `1.0` is a good starting point. +* `--disable_mmap_load_safetensors`: Disables memory-mapped loading for `.safetensors` files. This can speed up model loading in some environments like WSL. + ## 2. Other Tips / その他のTips + * **VRAM Usage:** SDXL LoRA training requires a lot of VRAM. Even with 24GB VRAM, you might run out of memory depending on settings. Reduce VRAM usage with these settings: * `--mixed_precision=\"bf16\"` or `\"fp16\"` (essential) * `--gradient_checkpointing` (strongly recommended) @@ -165,8 +207,6 @@ Basic options are common with `train_network.py`.
日本語 ---- - # 高度な設定: SDXL LoRA学習スクリプト `sdxl_train_network.py` 詳細ガイド このドキュメントでは、`sd-scripts` リポジトリに含まれる `sdxl_train_network.py` を使用した、SDXL (Stable Diffusion XL) モデルに対する LoRA (Low-Rank Adaptation) モデル学習の高度な設定オプションについて解説します。 @@ -398,8 +438,52 @@ SDXLは計算コストが高いため、キャッシュ機能が効果的です * **Accelerate/DeepSpeed関連:** (`--ddp_timeout`, `--ddp_gradient_as_bucket_view`, `--ddp_static_graph`) * 分散学習時の詳細設定。通常はAccelerateの設定 (`accelerate config`) で十分です。DeepSpeedを使用する場合は、別途設定が必要です。 +## 1.11. コンソールとログ + +* `--console_log_level`: コンソール出力のログレベルを設定します。`DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`から選択します。 +* `--console_log_file`: コンソールのログを指定されたファイルに出力します。 +* `--console_log_simple`: よりシンプルなログフォーマットを有効にします。 + +### 1.12. Hugging Face Hub 連携 + +* `--huggingface_repo_id`: モデルをアップロードするHugging Face Hubのリポジトリ名 (例: `your-username/your-model`)。 +* `--huggingface_repo_type`: Hugging Face Hubのリポジトリの種類。通常は`model`です。 +* `--huggingface_path_in_repo`: リポジトリ内でファイルをアップロードするパス。 +* `--huggingface_token`: Hugging Face Hubの認証トークン。 +* `--huggingface_repo_visibility`: リポジトリの公開設定 (`public`または`private`)。 +* `--resume_from_huggingface`: Hugging Face Hubに保存された状態から学習を再開します。 +* `--async_upload`: Hubへのモデルの非同期アップロードを有効にし、学習プロセスをブロックしないようにします。 +* `--save_n_epoch_ratio`: 総エポック数に対する特定の比率でモデルを保存します。例えば`5`を指定すると、学習全体で少なくとも5つのチェックポイントが保存されます。 + +### 1.13. 高度なAttention設定 + +* `--mem_eff_attn`: メモリ効率の良いAttentionメカニズムを使用します。これは古い実装であり、一般的には`sdpa`や`xformers`の使用が推奨されます。 +* `--xformers`: メモリ効率の良いAttentionのためにxformersライブラリを使用します。`pip install xformers`が必要です。 + +### 1.14. 高度な学習率スケジューラ設定 + +* `--lr_scheduler_type`: カスタムスケジューラモジュールを指定します。 +* `--lr_scheduler_args`: カスタムスケジューラに追加の引数を渡します (例: `"T_max=100"`)。 +* `--lr_decay_steps`: 学習率が減衰するステップ数を設定します。 +* `--lr_scheduler_timescale`: 逆平方根スケジューラのタイムスケール。 +* `--lr_scheduler_min_lr_ratio`: 特定のスケジューラについて、初期学習率に対する最小学習率の比率を設定します。 + +### 1.15. LoRAの差分学習 + +既存の学習済みLoRAをベースモデルにマージしてから、新たな学習を開始する手法です。既存LoRAのファインチューニングや、差分を学習させたい場合に有効です。 + +* `--base_weights`: 学習開始前にベースモデルにマージするLoRAの重みファイルを1つ以上指定します。 +* `--base_weights_multiplier`: `--base_weights`で指定したLoRAの重みの倍率。複数指定も可能です。 + +### 1.16. その他のオプション + +* `--tokenizer_cache_dir`: オフラインでの学習に便利なように、tokenizerをキャッシュするディレクトリを指定します。 +* `--scale_weight_norms`: LoRAモジュールの重みのノルムをスケーリングします。重みの大きさを制御することで過学習を防ぐ助けになります。`1.0`が良い出発点です。 +* `--disable_mmap_load_safetensors`: `.safetensors`ファイルのメモリマップドローディングを無効にします。WSLなどの一部環境でモデルの読み込みを高速化できます。 + ## 2. その他のTips + * **VRAM使用量:** SDXL LoRA学習は多くのVRAMを必要とします。24GB VRAMでも設定によってはメモリ不足になることがあります。以下の設定でVRAM使用量を削減できます。 * `--mixed_precision="bf16"` または `"fp16"` (必須級) * `--gradient_checkpointing` (強く推奨) @@ -422,7 +506,4 @@ SDXLは計算コストが高いため、キャッシュ機能が効果的です 不明な点や詳細については、各スクリプトの `--help` オプションや、リポジトリ内の他のドキュメント、実装コード自体を参照してください。 ---- - -
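A minimal sketch of what the `--base_weights` merge described in section 1.15 amounts to for a single Linear layer, assuming the usual LoRA parameterization `ΔW = (alpha / rank) * up @ down`. The function and tensor names are hypothetical and this is not the repository's merge routine:

```python
# Conceptual sketch of "differential learning": merge an existing LoRA into
# the base weight before training a new network on top of it.
# Not the repository's merge code; shapes and naming are illustrative.
import torch

def merge_lora_into_linear(weight: torch.Tensor, down: torch.Tensor, up: torch.Tensor,
                           alpha: float, multiplier: float = 1.0) -> torch.Tensor:
    rank = down.shape[0]
    scale = alpha / rank
    # LoRA delta for a Linear layer: (out, in) += multiplier * scale * up @ down
    return weight + multiplier * scale * (up @ down)

base = torch.randn(320, 768)        # hypothetical Linear weight, shape (out, in)
down = torch.randn(16, 768) * 0.01  # lora_down, rank 16
up = torch.randn(320, 16) * 0.01    # lora_up
merged = merge_lora_into_linear(base, down, up, alpha=16, multiplier=0.8)
```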
From fe81d40202808d59c78ed906ed15e824c18d091f Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 31 Aug 2025 21:14:45 +0900 Subject: [PATCH 532/582] doc: refactor structure for improved readability and maintainability --- README.md | 2 +- docs/flux_train_network.md | 34 +++++++++++++++++++ docs/sd3_train_network.md | 16 +++++++-- ..._advanced.md => train_network_advanced.md} | 14 +++++--- 4 files changed, 59 insertions(+), 7 deletions(-) rename docs/{sdxl_train_network_advanced.md => train_network_advanced.md} (97%) diff --git a/README.md b/README.md index 5e569eab6..27356ed44 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Jul 21, 2025: Currently, the following documents are available: - train_network.md - sdxl_train_network.md - - sdxl_train_network_advanced.md + - train_network_advanced.md - flux_train_network.md - sd3_train_network.md - lumina_train_network.md diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 23828eb71..f30717a68 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -631,6 +631,40 @@ interpolation_type = "lanczos" # Example: Use Lanczos interpolation
+### 7.3. Other Training Options / その他の学習オプション + +- **`--controlnet_model_name_or_path`**: Specifies the path to a ControlNet model compatible with FLUX.1. This allows for training a LoRA that works in conjunction with ControlNet. This is an advanced feature and requires a compatible ControlNet model. + +- **`--loss_type`**: Specifies the loss function for training. The default is `l2`. + - `l1`: L1 loss. + - `l2`: L2 loss (mean squared error). + - `huber`: Huber loss. + - `smooth_l1`: Smooth L1 loss. + +- **`--huber_schedule`**, **`--huber_c`**, **`--huber_scale`**: These are parameters for Huber loss. They are used when `--loss_type` is set to `huber` or `smooth_l1`. + +- **`--t5xxl_max_token_length`**: Specifies the maximum token length for the T5-XXL text encoder. For details, refer to the [`sd3_train_network.md` guide](sd3_train_network.md). + +- **`--weighting_scheme`**, **`--logit_mean`**, **`--logit_std`**, **`--mode_scale`**: These options allow you to adjust the loss weighting for each timestep. For details, refer to the [`sd3_train_network.md` guide](sd3_train_network.md). + +- **`--fused_backward_pass`**: Fuses the backward pass and optimizer step to reduce VRAM usage. For details, refer to the [`sdxl_train_network.md` guide](sdxl_train_network.md). + +
+日本語 + +- **`--controlnet_model_name_or_path`**: FLUX.1互換のControlNetモデルへのパスを指定します。これにより、ControlNetと連携して動作するLoRAを学習できます。これは高度な機能であり、互換性のあるControlNetモデルが必要です。 +- **`--loss_type`**: 学習に用いる損失関数を指定します。デフォルトは `l2` です。 + - `l1`: L1損失。 + - `l2`: L2損失(平均二乗誤差)。 + - `huber`: Huber損失。 + - `smooth_l1`: Smooth L1損失。 +- **`--huber_schedule`**, **`--huber_c`**, **`--huber_scale`**: これらはHuber損失のパラメータです。`--loss_type` が `huber` または `smooth_l1` の場合に使用されます。 +- **`--t5xxl_max_token_length`**: T5-XXLテキストエンコーダの最大トークン長を指定します。詳細は [`sd3_train_network.md` ガイド](sd3_train_network.md) を参照してください。 +- **`--weighting_scheme`**, **`--logit_mean`**, **`--logit_std`**, **`--mode_scale`**: これらのオプションは、各タイムステップの損失の重み付けを調整するために使用されます。詳細は [`sd3_train_network.md` ガイド](sd3_train_network.md) を参照してください。 +- **`--fused_backward_pass`**: バックワードパスとオプティマイザステップを融合してVRAM使用量を削減します。詳細は [`sdxl_train_network.md` ガイド](sdxl_train_network.md) を参照してください。 + +
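As a rough illustration of the loss options above, a hedged command sketch for training a FLUX.1 LoRA with Huber loss follows; the model and file paths are placeholders, and the shown `--huber_schedule`/`--huber_c` values are illustrative rather than recommended settings:

```bash
accelerate launch flux_train_network.py \
  --pretrained_model_name_or_path="flux1-dev.safetensors" \
  --clip_l="clip_l.safetensors" --t5xxl="t5xxl_fp16.safetensors" --ae="ae.safetensors" \
  --dataset_config="dataset_config.toml" \
  --network_module=networks.lora_flux --network_dim=16 \
  --loss_type="huber" --huber_schedule="snr" --huber_c=0.1 \
  --mixed_precision="bf16" \
  --output_dir="output" --output_name="flux_lora_huber"
```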
+ ## 8. Related Tools / 関連ツール Several related scripts are provided for models trained with `flux_train_network.py` and to assist with the training process: diff --git a/docs/sd3_train_network.md b/docs/sd3_train_network.md index f235e8ef0..30876ce05 100644 --- a/docs/sd3_train_network.md +++ b/docs/sd3_train_network.md @@ -156,13 +156,19 @@ Besides the arguments explained in the [train_network.py guide](train_network.md * `--enable_scaled_pos_embed` **[SD3.5][experimental]** – Scale positional embeddings when training with multiple resolutions. * `--training_shift=` – Shift applied to the timestep distribution. Default `1.0`. * `--weighting_scheme=` – Weighting method for loss by timestep. Default `uniform`. -* `--logit_mean`, `--logit_std`, `--mode_scale` – Parameters for `logit_normal` or `mode` weighting. +* `--logit_mean=` – Mean value for `logit_normal` weighting scheme. Default `0.0`. +* `--logit_std=` – Standard deviation for `logit_normal` weighting scheme. Default `1.0`. +* `--mode_scale=` – Scale factor for `mode` weighting scheme. Default `1.29`. #### Memory and Speed / メモリ・速度関連 * `--blocks_to_swap=` **[experimental]** – Swap a number of Transformer blocks between CPU and GPU. More blocks reduce VRAM but slow training. Cannot be used with `--cpu_offload_checkpointing`. * `--cache_text_encoder_outputs` – Caches the outputs of the text encoders to reduce VRAM usage and speed up training. This is particularly effective for SD3, which uses three text encoders. Recommended when not training the text encoder LoRA. For more details, see the [`sdxl_train_network.py` guide](sdxl_train_network.md). * `--cache_text_encoder_outputs_to_disk` – Caches the text encoder outputs to disk when the above option is enabled. +* `--t5xxl_device=` **[not supported yet]** – Specifies the device for T5-XXL model. If not specified, uses accelerator's device. +* `--t5xxl_dtype=` **[not supported yet]** – Specifies the dtype for T5-XXL model. If not specified, uses default dtype from mixed precision. +* `--save_clip` **[not supported yet]** – Saves CLIP models to checkpoint (unified checkpoint format not yet supported). +* `--save_t5xxl` **[not supported yet]** – Saves T5-XXL model to checkpoint (unified checkpoint format not yet supported). 
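To make the `logit_normal` weighting options above more concrete, here is a minimal Python sketch of the sampling idea (an approximation for illustration, not the actual implementation): a normal sample with mean `--logit_mean` and standard deviation `--logit_std` is passed through a sigmoid, concentrating training timesteps around the middle of the schedule.

```python
import torch

def sample_logit_normal_timesteps(batch_size, logit_mean=0.0, logit_std=1.0, num_train_timesteps=1000):
    # u ~ Normal(logit_mean, logit_std), squashed into (0, 1) by a sigmoid
    u = torch.randn(batch_size) * logit_std + logit_mean
    t = torch.sigmoid(u)
    # map the continuous value onto discrete training timesteps
    return (t * num_train_timesteps).long()

print(sample_logit_normal_timesteps(4))
```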
#### Incompatible or Deprecated Options / 非互換・非推奨の引数 @@ -191,13 +197,19 @@ Besides the arguments explained in the [train_network.py guide](train_network.md * `--enable_scaled_pos_embed` **[SD3.5向け][実験的機能]** – マルチ解像度学習時に解像度に応じてPositional Embeddingをスケーリングします。 * `--training_shift=` – タイムステップ分布を調整するためのシフト値です。デフォルトは`1.0`です。 * `--weighting_scheme=` – タイムステップに応じた損失の重み付け方法を指定します。デフォルトは`uniform`です。 -* `--logit_mean`, `--logit_std`, `--mode_scale` – `logit_normal`または`mode`使用時のパラメータです。 +* `--logit_mean=` – `logit_normal`重み付けスキームの平均値です。デフォルトは`0.0`です。 +* `--logit_std=` – `logit_normal`重み付けスキームの標準偏差です。デフォルトは`1.0`です。 +* `--mode_scale=` – `mode`重み付けスキームのスケール係数です。デフォルトは`1.29`です。 #### メモリ・速度関連 * `--blocks_to_swap=` **[実験的機能]** – TransformerブロックをCPUとGPUでスワップしてVRAMを節約します。`--cpu_offload_checkpointing`とは併用できません。 * `--cache_text_encoder_outputs` – Text Encoderの出力をキャッシュし、VRAM使用量削減と学習高速化を図ります。SD3は3つのText Encoderを持つため特に効果的です。Text EncoderのLoRAを学習しない場合に推奨されます。詳細は[`sdxl_train_network.py`のガイド](sdxl_train_network.md)を参照してください。 * `--cache_text_encoder_outputs_to_disk` – 上記オプションと併用し、Text Encoderの出力をディスクにキャッシュします。 +* `--t5xxl_device=` **[未サポート]** – T5-XXLモデルのデバイスを指定します。指定しない場合はacceleratorのデバイスを使用します。 +* `--t5xxl_dtype=` **[未サポート]** – T5-XXLモデルのdtypeを指定します。指定しない場合はデフォルトのdtype(mixed precisionから)を使用します。 +* `--save_clip` **[未サポート]** – CLIPモデルをチェックポイントに保存します(統合チェックポイント形式は未サポート)。 +* `--save_t5xxl` **[未サポート]** – T5-XXLモデルをチェックポイントに保存します(統合チェックポイント形式は未サポート)。 #### 非互換・非推奨の引数 diff --git a/docs/sdxl_train_network_advanced.md b/docs/train_network_advanced.md similarity index 97% rename from docs/sdxl_train_network_advanced.md rename to docs/train_network_advanced.md index fd7086047..c1fd86a22 100644 --- a/docs/sdxl_train_network_advanced.md +++ b/docs/train_network_advanced.md @@ -128,7 +128,7 @@ Basic options are common with `train_network.py`. * `--huber_c=C` / `--huber_scale=S`: Parameters for `huber` or `smooth_l1` loss. * `--masked_loss`: Limits loss calculation area based on a mask image. Requires specifying mask images (black and white) in `conditioning_data_dir` in dataset settings. See [About Masked Loss](masked_loss_README.md) for details. -### 1.10. Distributed Training and Others +### 1.10. Distributed Training and Other Training Related Options * `--seed=N`: Specifies the random seed. Set this to ensure training reproducibility. * `--max_token_length=N` (`75`, `150`, `225`): Maximum token length processed by Text Encoders. For SDXL, typically `75` (default), `150`, or `225`. Longer lengths can handle more complex prompts but increase VRAM usage. @@ -137,8 +137,11 @@ Basic options are common with `train_network.py`. * `--persistent_data_loader_workers` / `--max_data_loader_n_workers=N`: Settings for DataLoader worker processes. Affects wait time between epochs and memory usage. * `--config_file=""` / `--output_config`: Options to use/output a `.toml` file instead of command line arguments. * **Accelerate/DeepSpeed related:** (`--ddp_timeout`, `--ddp_gradient_as_bucket_view`, `--ddp_static_graph`): Detailed settings for distributed training. Accelerate settings (`accelerate config`) are usually sufficient. DeepSpeed requires separate configuration. +* `--initial_epoch=` – Sets the initial epoch number. `1` means first epoch (same as not specifying). Note: `initial_epoch`/`initial_step` doesn't affect the lr scheduler, which means lr scheduler will start from 0 without `--resume`. +* `--initial_step=` – Sets the initial step number including all epochs. `0` means first step (same as not specifying). 
Overwrites `initial_epoch`. +* `--skip_until_initial_step` – Skips training until `initial_step` is reached. -## 1.11. Console and Logging / コンソールとログ +### 1.11. Console and Logging / コンソールとログ * `--console_log_level`: Sets the logging level for the console output. Choose from `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`. * `--console_log_file`: Redirects console logs to a specified file. @@ -421,7 +424,7 @@ SDXLは計算コストが高いため、キャッシュ機能が効果的です * `--masked_loss` * マスク画像に基づいてLoss計算領域を限定します。データセット設定で`conditioning_data_dir`にマスク画像(白黒)を指定する必要があります。詳細は[マスクロスについて](masked_loss_README.md)を参照してください。 -### 1.10. 分散学習・その他 +### 1.10. 分散学習、その他学習関連 * `--seed=N` * 乱数シードを指定します。学習の再現性を確保したい場合に設定します。 @@ -437,8 +440,11 @@ SDXLは計算コストが高いため、キャッシュ機能が効果的です * コマンドライン引数の代わりに`.toml`ファイルを使用/出力するオプション。 * **Accelerate/DeepSpeed関連:** (`--ddp_timeout`, `--ddp_gradient_as_bucket_view`, `--ddp_static_graph`) * 分散学習時の詳細設定。通常はAccelerateの設定 (`accelerate config`) で十分です。DeepSpeedを使用する場合は、別途設定が必要です。 +* `--initial_epoch=` – 開始エポック番号を設定します。`1`で最初のエポック(未指定時と同じ)。注意:`initial_epoch`/`initial_step`はlr schedulerに影響しないため、`--resume`しない場合はlr schedulerは0から始まります。 +* `--initial_step=` – 全エポックを含む開始ステップ番号を設定します。`0`で最初のステップ(未指定時と同じ)。`initial_epoch`を上書きします。 +* `--skip_until_initial_step` – `initial_step`に到達するまで学習をスキップします。 -## 1.11. コンソールとログ +### 1.11. コンソールとログ * `--console_log_level`: コンソール出力のログレベルを設定します。`DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL`から選択します。 * `--console_log_file`: コンソールのログを指定されたファイルに出力します。 From 80710134d5af8169e65906f1a3f0892a93689421 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 31 Aug 2025 21:19:28 +0900 Subject: [PATCH 533/582] doc: add Sage Attention and sample batch size options to Lumina training guide --- docs/lumina_train_network.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/lumina_train_network.md b/docs/lumina_train_network.md index 3f0548d9c..80ae84187 100644 --- a/docs/lumina_train_network.md +++ b/docs/lumina_train_network.md @@ -170,6 +170,8 @@ Besides the arguments explained in the [train_network.py guide](train_network.md * `--model_prediction_type=` – Model prediction processing method. Options: `raw`, `additive`, `sigma_scaled`. Default `raw`. **Recommended: `raw`** * `--system_prompt=` – System prompt to prepend to all prompts. Recommended: `"You are an assistant designed to generate high-quality images based on user prompts."` or `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` * `--use_flash_attn` – Use Flash Attention. Requires `pip install flash-attn` (may not be supported in all environments). If installed correctly, it speeds up training. +* `--use_sage_attn` – Use Sage Attention for the model. +* `--sample_batch_size=` – Batch size to use for sampling, defaults to `--training_batch_size` value. Sample batches are bucketed by width, height, guidance scale, and seed. * `--sigmoid_scale=` – Scale factor for sigmoid timestep sampling. Default `1.0`. 
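A brief command sketch using the model-related options above (dataset, network, and remaining model arguments are omitted for brevity; the file names are placeholders):

```bash
accelerate launch lumina_train_network.py \
  --pretrained_model_name_or_path="lumina_image_2.safetensors" \
  --dataset_config="dataset_config.toml" \
  --system_prompt="You are an assistant designed to generate high-quality images based on user prompts." \
  --use_sage_attn \
  --sample_batch_size=2 \
  --output_dir="output" --output_name="lumina_lora"
```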
#### Memory and Speed / メモリ・速度関連 @@ -216,6 +218,8 @@ For Lumina Image 2.0, you can specify different dimensions for various component * `--model_prediction_type=` – モデル予測の処理方法を指定します。`raw`, `additive`, `sigma_scaled`から選択します。デフォルトは`raw`です。**推奨: `raw`** * `--system_prompt=` – 全てのプロンプトに前置するシステムプロンプトを指定します。推奨: `"You are an assistant designed to generate high-quality images based on user prompts."` または `"You are an assistant designed to generate high-quality images with the highest degree of image-text alignment based on textual prompts."` * `--use_flash_attn` – Flash Attentionを使用します。`pip install flash-attn`でインストールが必要です(環境によってはサポートされていません)。正しくインストールされている場合は、指定すると学習が高速化されます。 +* `--use_sage_attn` – Sage Attentionを使用します。 +* `--sample_batch_size=` – サンプリングに使用するバッチサイズ。デフォルトは `--training_batch_size` の値です。サンプルバッチは、幅、高さ、ガイダンススケール、シードによってバケット化されます。 * `--sigmoid_scale=` – sigmoidタイムステップサンプリングのスケール係数を指定します。デフォルトは`1.0`です。 #### メモリ・速度関連 From c38b07d0da275eba395e1b25eabbdc5c0553b410 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 31 Aug 2025 21:39:47 +0900 Subject: [PATCH 534/582] doc: add validation loss documentation for model training --- README.md | 3 +- docs/validation.md | 261 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 docs/validation.md diff --git a/README.md b/README.md index 27356ed44..2ed53d5af 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,8 @@ Jul 21, 2025: - flux_train_network.md - sd3_train_network.md - lumina_train_network.md - + - validation.md + Jul 10, 2025: - [AI Coding Agents](#for-developers-using-ai-coding-agents) section is added to the README. This section provides instructions for developers using AI coding agents like Claude and Gemini to understand the project context and coding standards. diff --git a/docs/validation.md b/docs/validation.md new file mode 100644 index 000000000..7f6a008c2 --- /dev/null +++ b/docs/validation.md @@ -0,0 +1,261 @@ +# Validation Loss + +Validation loss is a crucial metric for monitoring the training process of a model. It helps you assess how well your model is generalizing to data it hasn't seen during training, which is essential for preventing overfitting. By periodically evaluating the model on a separate validation dataset, you can gain insights into its performance and make more informed decisions about when to stop training or adjust hyperparameters. + +This feature provides a stable and reliable validation loss metric by ensuring the validation process is deterministic. + +
+日本語 + +Validation loss(検証損失)は、モデルの学習過程を監視するための重要な指標です。モデルが学習中に見ていないデータに対してどの程度汎化できているかを評価するのに役立ち、過学習を防ぐために不可欠です。個別の検証データセットで定期的にモデルを評価することで、そのパフォーマンスに関する洞察を得て、学習をいつ停止するか、またはハイパーパラメータを調整するかについて、より多くの情報に基づいた決定を下すことができます。 + +この機能は、検証プロセスが決定論的であることを保証することにより、安定して信頼性の高い検証損失指標を提供します。 + +
+ +## How It Works + +When validation is enabled, a portion of your dataset is set aside specifically for this purpose. The script then runs a validation step at regular intervals, calculating the loss on this validation data. + +To ensure that the validation loss is a reliable indicator of model performance, the process is deterministic. This means that for every validation run, the same random seed is used for noise generation and timestep selection. This consistency ensures that any fluctuations in the validation loss are due to changes in the model's weights, not random variations in the validation process itself. + +The average loss across all validation steps is then logged, providing a single, clear metric to track. + +For more technical details, please refer to the original pull request: [PR #1903](https://github.com/kohya-ss/sd-scripts/pull/1903). + +
+日本語 + +検証が有効になると、データセットの一部がこの目的のために特別に確保されます。スクリプトは定期的な間隔で検証ステップを実行し、この検証データに対する損失を計算します。 + +検証損失がモデルのパフォーマンスの信頼できる指標であることを保証するために、プロセスは決定論的です。つまり、すべての検証実行で、ノイズ生成とタイムステップ選択に同じランダムシードが使用されます。この一貫性により、検証損失の変動が、検証プロセス自体のランダムな変動ではなく、モデルの重みの変化によるものであることが保証されます。 + +すべての検証ステップにわたる平均損失がログに記録され、追跡するための単一の明確な指標が提供されます。 + +より技術的な詳細については、元のプルリクエストを参照してください: [PR #1903](https://github.com/kohya-ss/sd-scripts/pull/1903). + +
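As an illustration of the determinism described above, the following sketch (a simplification, not the script's actual code) shows how seeding a dedicated generator fixes the noise and timesteps used for every validation pass:

```python
import torch

def validation_noise_and_timesteps(latents, num_train_timesteps=1000, seed=42):
    # A fixed seed means every validation run sees the same noise and timesteps,
    # so changes in the logged loss come from the model weights alone.
    g = torch.Generator(device=latents.device).manual_seed(seed)
    noise = torch.randn(latents.shape, generator=g, device=latents.device, dtype=latents.dtype)
    timesteps = torch.randint(0, num_train_timesteps, (latents.shape[0],), generator=g, device=latents.device)
    return noise, timesteps
```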
+ +## How to Use + +### Enabling Validation + +There are two primary ways to enable validation: + +1. **Using a Dataset Config File (Recommended)**: You can specify a validation set directly within your dataset `.toml` file. This method offers the most control, allowing you to designate entire directories as validation sets or split a percentage of a specific subset for validation. + + To use a whole directory for validation, add a subset and set `validation_split = 1.0`. + + **Example: Separate Validation Set** + ```toml + [[datasets]] + # ... training subset ... + [[datasets.subsets]] + image_dir = "path/to/train_images" + # ... other settings ... + + # Validation subset + [[datasets.subsets]] + image_dir = "path/to/validation_images" + validation_split = 1.0 # Use this entire subset for validation + ``` + + To use a fraction of a subset for validation, set `validation_split` to a value between 0.0 and 1.0. + + **Example: Splitting a Subset** + ```toml + [[datasets]] + # ... dataset settings ... + [[datasets.subsets]] + image_dir = "path/to/images" + validation_split = 0.1 # Use 10% of this subset for validation + ``` + +2. **Using a Command-Line Argument**: For a simpler setup, you can use the `--validation_split` argument. This will take a random percentage of your *entire* training dataset for validation. This method is ignored if `validation_split` is defined in your dataset config file. + + **Example Command:** + ```bash + accelerate launch train_network.py ... --validation_split 0.1 + ``` + This command will use 10% of the total training data for validation. + +
+日本語 + +### 検証を有効にする + +検証を有効にする主な方法は2つあります。 + +1. **データセット設定ファイルを使用する(推奨)**: データセットの`.toml`ファイル内で直接検証セットを指定できます。この方法は最も制御性が高く、ディレクトリ全体を検証セットとして指定したり、特定のサブセットのパーセンテージを検証用に分割したりすることができます。 + + ディレクトリ全体を検証に使用するには、サブセットを追加して`validation_split = 1.0`と設定します。 + + **例:個別の検証セット** + ```toml + [[datasets]] + # ... training subset ... + [[datasets.subsets]] + image_dir = "path/to/train_images" + # ... other settings ... + + # Validation subset + [[datasets.subsets]] + image_dir = "path/to/validation_images" + validation_split = 1.0 # このサブセット全体を検証に使用します + ``` + + サブセットの一部を検証に使用するには、`validation_split`を0.0から1.0の間の値に設定します。 + + **例:サブセットの分割** + ```toml + [[datasets]] + # ... dataset settings ... + [[datasets.subsets]] + image_dir = "path/to/images" + validation_split = 0.1 # このサブセットの10%を検証に使用します + ``` + +2. **コマンドライン引数を使用する**: より簡単な設定のために、`--validation_split`引数を使用できます。これにより、*全*学習データセットのランダムなパーセンテージが検証に使用されます。この方法は、データセット設定ファイルで`validation_split`が定義されている場合は無視されます。 + + **コマンド例:** + ```bash + accelerate launch train_network.py ... --validation_split 0.1 + ``` + このコマンドは、全学習データの10%を検証に使用します。 + +
+ +### Configuration Options + +| Argument | TOML Option | Description | +| --------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | +| `--validation_split` | `validation_split` | The fraction of the dataset to use for validation. The command-line argument applies globally, while the TOML option applies per-subset. The TOML setting takes precedence. | +| `--validate_every_n_steps` | | Run validation every N steps. | +| `--validate_every_n_epochs` | | Run validation every N epochs. If not specified, validation runs once per epoch by default. | +| `--max_validation_steps` | | The maximum number of batches to use for a single validation run. If not set, the entire validation dataset is used. | +| `--validation_seed` | `validation_seed` | A specific seed for the validation dataloader shuffling. If not set in the TOML file, the main training `--seed` is used. | + +
+日本語 + +### 設定オプション + +| 引数 | TOMLオプション | 説明 | +| --------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | +| `--validation_split` | `validation_split` | 検証に使用するデータセットの割合。コマンドライン引数は全体に適用され、TOMLオプションはサブセットごとに適用されます。TOML設定が優先されます。 | +| `--validate_every_n_steps` | | Nステップごとに検証を実行します。 | +| `--validate_every_n_epochs` | | Nエポックごとに検証を実行します。指定しない場合、デフォルトでエポックごとに1回検証が実行されます。 | +| `--max_validation_steps` | | 1回の検証実行に使用するバッチの最大数。設定しない場合、検証データセット全体が使用されます。 | +| `--validation_seed` | `validation_seed` | 検証データローダーのシャッフル用の特定のシード。TOMLファイルで設定されていない場合、メインの学習`--seed`が使用されます。 | + +
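For example, the options in the table can be combined as in the sketch below (only the validation-related arguments are shown; the rest of the command is the same as any other training run):

```bash
# plus the usual --pretrained_model_name_or_path, --network_module, --output_dir, etc.
accelerate launch train_network.py \
  --dataset_config="dataset_config.toml" \
  --validation_split=0.1 \
  --validation_seed=42 \
  --validate_every_n_steps=200 \
  --max_validation_steps=16 \
  --logging_dir="logs"
```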
+ +### Viewing the Results + +The validation loss is logged to your tracking tool of choice (TensorBoard or Weights & Biases). Look for the metric `loss/validation` to monitor the performance. + +
+日本語 + +### 結果の表示 + +検証損失は、選択した追跡ツール(TensorBoardまたはWeights & Biases)に記録されます。パフォーマンスを監視するには、`loss/validation`という指標を探してください。 + +
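If `--logging_dir=logs` was used, one common way to inspect the metric is TensorBoard (assuming it is installed in your environment):

```bash
tensorboard --logdir=logs
```

Open the displayed URL in a browser and look for `loss/validation` in the Scalars tab.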
+ +### Practical Example + +Here is a complete example of how to run a LoRA training with validation enabled: + +**1. Prepare your `dataset_config.toml`:** + +```toml +[general] +shuffle_caption = true +keep_tokens = 1 + +[[datasets]] +resolution = "1024,1024" +batch_size = 2 + + [[datasets.subsets]] + image_dir = 'path/to/your_images' + caption_extension = '.txt' + num_repeats = 10 + + [[datasets.subsets]] + image_dir = 'path/to/your_validation_images' + caption_extension = '.txt' + validation_split = 1.0 # Use this entire subset for validation +``` + +**2. Run the training command:** + +```bash +accelerate launch sdxl_train_network.py \ + --pretrained_model_name_or_path="sd_xl_base_1.0.safetensors" \ + --dataset_config="dataset_config.toml" \ + --output_dir="output" \ + --output_name="my_lora" \ + --network_module=networks.lora \ + --network_dim=32 \ + --network_alpha=16 \ + --save_every_n_epochs=1 \ + --learning_rate=1e-4 \ + --optimizer_type="AdamW8bit" \ + --mixed_precision="bf16" \ + --logging_dir=logs +``` + +The validation loss will be calculated once per epoch and saved to the `logs` directory, which you can view with TensorBoard. + +
+日本語 + +### 実践的な例 + +検証を有効にしてLoRAの学習を実行する完全な例を次に示します。 + +**1. `dataset_config.toml`を準備します:** + +```toml +[general] +shuffle_caption = true +keep_tokens = 1 + +[[datasets]] +resolution = "1024,1024" +batch_size = 2 + + [[datasets.subsets]] + image_dir = 'path/to/your_images' + caption_extension = '.txt' + num_repeats = 10 + + [[datasets.subsets]] + image_dir = 'path/to/your_validation_images' + caption_extension = '.txt' + validation_split = 1.0 # このサブセット全体を検証に使用します +``` + +**2. 学習コマンドを実行します:** + +```bash +accelerate launch sdxl_train_network.py \ + --pretrained_model_name_or_path="sd_xl_base_1.0.safetensors" \ + --dataset_config="dataset_config.toml" \ + --output_dir="output" \ + --output_name="my_lora" \ + --network_module=networks.lora \ + --network_dim=32 \ + --network_alpha=16 \ + --save_every_n_epochs=1 \ + --learning_rate=1e-4 \ + --optimizer_type="AdamW8bit" \ + --mixed_precision="bf16" \ + --logging_dir=logs +``` + +検証損失はエポックごとに1回計算され、`logs`ディレクトリに保存されます。これはTensorBoardで表示できます。 + +
From 142d0be180524c3c7d5a021c687d8d75dbff2dc0 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Mon, 1 Sep 2025 12:36:51 +0900 Subject: [PATCH 535/582] doc: add comprehensive fine-tuning guide for various model architectures --- docs/fine_tune.md | 347 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 347 insertions(+) create mode 100644 docs/fine_tune.md diff --git a/docs/fine_tune.md b/docs/fine_tune.md new file mode 100644 index 000000000..1560fb28a --- /dev/null +++ b/docs/fine_tune.md @@ -0,0 +1,347 @@ +# Fine-tuning Guide + +This document explains how to perform fine-tuning on various model architectures using the `*_train.py` scripts. + +
+日本語 + +# Fine-tuning ガイド + +このドキュメントでは、`*_train.py` スクリプトを用いた、各種モデルアーキテクチャのFine-tuningの方法について解説します。 + +
+ +### Difference between Fine-tuning and LoRA tuning + +This repository supports two methods for additional model training: **Fine-tuning** and **LoRA (Low-Rank Adaptation)**. Each method has distinct features and advantages. + +**Fine-tuning** is a method that retrains all (or most) of the weights of a pre-trained model. +- **Pros**: It can improve the overall expressive power of the model and is suitable for learning styles or concepts that differ significantly from the original model. +- **Cons**: + - It requires a large amount of VRAM and computational cost. + - The saved file size is large (same as the original model). + - It is prone to "overfitting," where the model loses the diversity of the original model if over-trained. +- **Corresponding scripts**: Scripts named `*_train.py`, such as `sdxl_train.py`, `sd3_train.py`, `flux_train.py`, and `lumina_train.py`. + +**LoRA tuning** is a method that freezes the model's weights and only trains a small additional network called an "adapter." +- **Pros**: + - It allows for fast training with low VRAM and computational cost. + - It is considered resistant to overfitting because it trains fewer weights. + - The saved file (LoRA network) is very small, ranging from tens to hundreds of MB, making it easy to manage. + - Multiple LoRAs can be used in combination. +- **Cons**: Since it does not train the entire model, it may not achieve changes as significant as fine-tuning. +- **Corresponding scripts**: Scripts named `*_train_network.py`, such as `sdxl_train_network.py`, `sd3_train_network.py`, and `flux_train_network.py`. + +| Feature | Fine-tuning | LoRA tuning | +|:---|:---|:---| +| **Training Target** | All model weights | Additional network (adapter) only | +| **VRAM/Compute Cost**| High | Low | +| **Training Time** | Long | Short | +| **File Size** | Large (several GB) | Small (few MB to hundreds of MB) | +| **Overfitting Risk** | High | Low | +| **Suitable Use Case** | Major style changes, concept learning | Adding specific characters or styles | + +Generally, it is recommended to start with **LoRA tuning** if you want to add a specific character or style. **Fine-tuning** is a valid option for more fundamental style changes or aiming for a high-quality model. + +
+日本語 + +### Fine-tuningとLoRA学習の違い + +このリポジトリでは、モデルの追加学習手法として**Fine-tuning**と**LoRA (Low-Rank Adaptation)**学習の2種類をサポートしています。それぞれの手法には異なる特徴と利点があります。 + +**Fine-tuning**は、事前学習済みモデルの重み全体(または大部分)を再学習する手法です。 +- **利点**: モデル全体の表現力を向上させることができ、元のモデルから大きく変化した画風やコンセプトの学習に適しています。 +- **欠点**: + - 学習には多くのVRAMと計算コストが必要です。 + - 保存されるファイルサイズが大きくなります(元のモデルと同じサイズ)。 + - 学習させすぎると、元のモデルが持っていた多様性が失われる「過学習(overfitting)」に陥りやすい傾向があります。 +- **対応スクリプト**: `sdxl_train.py`, `sd3_train.py`, `flux_train.py`, `lumina_train.py` など、`*_train.py` という命名規則のスクリプトが対応します。 + +**LoRA学習**は、モデルの重みは凍結(固定)したまま、「アダプター」と呼ばれる小さな追加ネットワークのみを学習する手法です。 +- **利点**: + - 少ないVRAMと計算コストで高速に学習できます。 + - 学習する重みが少ないため、過学習に強いとされています。 + - 保存されるファイル(LoRAネットワーク)は数十〜数百MBと非常に小さく、管理が容易です。 + - 複数のLoRAを組み合わせて使用することも可能です。 +- **欠点**: モデル全体を学習するわけではないため、Fine-tuningほどの大きな変化は期待できない場合があります。 +- **対応スクリプト**: `sdxl_train_network.py`, `sd3_train_network.py`, `flux_train_network.py` など、`*_train_network.py` という命名規則のスクリプトが対応します。 + +| 特徴 | Fine-tuning | LoRA学習 | +|:---|:---|:---| +| **学習対象** | モデルの全重み | 追加ネットワーク(アダプター)のみ | +| **VRAM/計算コスト**| 大 | 小 | +| **学習時間** | 長 | 短 | +| **ファイルサイズ** | 大(数GB) | 小(数MB〜数百MB) | +| **過学習リスク** | 高 | 低 | +| **適した用途** | 大規模な画風変更、コンセプト学習 | 特定のキャラ、画風の追加学習 | + +一般的に、特定のキャラクターや画風を追加したい場合は**LoRA学習**から試すことが推奨されます。より根本的な画風の変更や、高品質なモデルを目指す場合は**Fine-tuning**が有効な選択肢となります。 + +
+ +--- + +### Fine-tuning for each architecture + +Fine-tuning updates the entire weights of the model, so it has different options and considerations than LoRA tuning. This section describes the fine-tuning scripts for major architectures. + +The basic command structure is common to all architectures. + +```bash +accelerate launch --mixed_precision bf16 {script_name}.py \ + --pretrained_model_name_or_path \ + --dataset_config \ + --output_dir \ + --output_name \ + --save_model_as safetensors \ + --max_train_steps 10000 \ + --learning_rate 1e-5 \ + --optimizer_type AdamW8bit +``` + +
+日本語 + +### 各アーキテクチャのFine-tuning + +Fine-tuningはモデルの重み全体を更新するため、LoRA学習とは異なるオプションや考慮事項があります。ここでは主要なアーキテクチャごとのFine-tuningスクリプトについて説明します。 + +基本的なコマンドの構造は、どのアーキテクチャでも共通です。 + +```bash +accelerate launch --mixed_precision bf16 {script_name}.py \ + --pretrained_model_name_or_path \ + --dataset_config \ + --output_dir \ + --output_name \ + --save_model_as safetensors \ + --max_train_steps 10000 \ + --learning_rate 1e-5 \ + --optimizer_type AdamW8bit +``` + +
+ +#### SDXL (`sdxl_train.py`) + +Performs fine-tuning for SDXL models. It is possible to train both the U-Net and the Text Encoders. + +**Key Options:** + +- `--train_text_encoder`: Includes the weights of the Text Encoders (CLIP ViT-L and OpenCLIP ViT-bigG) in the training. Effective for significant style changes or strongly learning specific concepts. +- `--learning_rate_te1`, `--learning_rate_te2`: Set individual learning rates for each Text Encoder. +- `--block_lr`: Divides the U-Net into 23 blocks and sets a different learning rate for each block. This allows for advanced adjustments, such as strengthening or weakening the learning of specific layers. (Not available in LoRA tuning). + +**Command Example:** + +```bash +accelerate launch --mixed_precision bf16 sdxl_train.py \ + --pretrained_model_name_or_path "sd_xl_base_1.0.safetensors" \ + --dataset_config "dataset_config.toml" \ + --output_dir "output" \ + --output_name "sdxl_finetuned" \ + --train_text_encoder \ + --learning_rate 1e-5 \ + --learning_rate_te1 5e-6 \ + --learning_rate_te2 2e-6 +``` + +
+日本語 + +#### SDXL (`sdxl_train.py`) + +SDXLモデルのFine-tuningを行います。U-NetとText Encoderの両方を学習させることが可能です。 + +**主要なオプション:** + +- `--train_text_encoder`: Text Encoder(CLIP ViT-LとOpenCLIP ViT-bigG)の重みを学習対象に含めます。画風を大きく変えたい場合や、特定の概念を強く学習させたい場合に有効です。 +- `--learning_rate_te1`, `--learning_rate_te2`: それぞれのText Encoderに個別の学習率を設定します。 +- `--block_lr`: U-Netを23個のブロックに分割し、ブロックごとに異なる学習率を設定できます。特定の層の学習を強めたり弱めたりする高度な調整が可能です。(LoRA学習では利用できません) + +**コマンド例:** + +```bash +accelerate launch --mixed_precision bf16 sdxl_train.py \ + --pretrained_model_name_or_path "sd_xl_base_1.0.safetensors" \ + --dataset_config "dataset_config.toml" \ + --output_dir "output" \ + --output_name "sdxl_finetuned" \ + --train_text_encoder \ + --learning_rate 1e-5 \ + --learning_rate_te1 5e-6 \ + --learning_rate_te2 2e-6 +``` + +
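A hedged sketch of `--block_lr` usage follows: since the guide above describes 23 U-Net blocks, the option is shown with 23 comma-separated learning rates (the specific values, and the assumption that they map one-to-one onto the blocks in order, are illustrative only):

```bash
accelerate launch --mixed_precision bf16 sdxl_train.py \
  --pretrained_model_name_or_path "sd_xl_base_1.0.safetensors" \
  --dataset_config "dataset_config.toml" \
  --output_dir "output" \
  --output_name "sdxl_block_lr" \
  --learning_rate 1e-5 \
  --block_lr "5e-6,5e-6,5e-6,5e-6,5e-6,5e-6,5e-6,1e-5,1e-5,1e-5,1e-5,1e-5,2e-5,2e-5,2e-5,1e-5,1e-5,1e-5,1e-5,1e-5,5e-6,5e-6,5e-6"
```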
+ +#### SD3 (`sd3_train.py`) + +Performs fine-tuning for Stable Diffusion 3 Medium models. SD3 consists of three Text Encoders (CLIP-L, CLIP-G, T5-XXL) and a MMDiT (equivalent to U-Net), which can be targeted for training. + +**Key Options:** + +- `--train_text_encoder`: Enables training for CLIP-L and CLIP-G. +- `--train_t5xxl`: Enables training for T5-XXL. T5-XXL is a very large model and requires a lot of VRAM for training. +- `--blocks_to_swap`: A memory optimization feature to reduce VRAM usage. It swaps some blocks of the MMDiT to CPU memory during training. Useful for using larger batch sizes in low VRAM environments. (Also available in LoRA tuning). +- `--num_last_block_to_freeze`: Freezes the weights of the last N blocks of the MMDiT, excluding them from training. Useful for maintaining model stability while focusing on learning in the lower layers. + +**Command Example:** + +```bash +accelerate launch --mixed_precision bf16 sd3_train.py \ + --pretrained_model_name_or_path "sd3_medium.safetensors" \ + --dataset_config "dataset_config.toml" \ + --output_dir "output" \ + --output_name "sd3_finetuned" \ + --train_text_encoder \ + --learning_rate 4e-6 \ + --blocks_to_swap 10 +``` + +
+日本語 + +#### SD3 (`sd3_train.py`) + +Stable Diffusion 3 MediumモデルのFine-tuningを行います。SD3は3つのText Encoder(CLIP-L, CLIP-G, T5-XXL)とMMDiT(U-Netに相当)で構成されており、これらを学習対象にできます。 + +**主要なオプション:** + +- `--train_text_encoder`: CLIP-LとCLIP-Gの学習を有効にします。 +- `--train_t5xxl`: T5-XXLの学習を有効にします。T5-XXLは非常に大きなモデルのため、学習には多くのVRAMが必要です。 +- `--blocks_to_swap`: VRAM使用量を削減するためのメモリ最適化機能です。MMDiTの一部のブロックを学習中にCPUメモリに退避(スワップ)させます。VRAMが少ない環境で大きなバッチサイズを使いたい場合に有効です。(LoRA学習でも利用可能) +- `--num_last_block_to_freeze`: MMDiTの最後のNブロックの重みを凍結し、学習対象から除外します。モデルの安定性を保ちつつ、下位層を中心に学習させたい場合に有効です。 + +**コマンド例:** + +```bash +accelerate launch --mixed_precision bf16 sd3_train.py \ + --pretrained_model_name_or_path "sd3_medium.safetensors" \ + --dataset_config "dataset_config.toml" \ + --output_dir "output" \ + --output_name "sd3_finetuned" \ + --train_text_encoder \ + --learning_rate 4e-6 \ + --blocks_to_swap 10 +``` + +
+ +#### FLUX.1 (`flux_train.py`) + +Performs fine-tuning for FLUX.1 models. FLUX.1 is internally composed of two Transformer blocks (Double Blocks, Single Blocks). + +**Key Options:** + +- `--blocks_to_swap`: Similar to SD3, this feature swaps Transformer blocks to the CPU for memory optimization. +- `--blockwise_fused_optimizers`: An experimental feature that aims to streamline training by applying individual optimizers to each block. + +**Command Example:** + +```bash +accelerate launch --mixed_precision bf16 flux_train.py \ + --pretrained_model_name_or_path "FLUX.1-dev.safetensors" \ + --dataset_config "dataset_config.toml" \ + --output_dir "output" \ + --output_name "flux1_finetuned" \ + --learning_rate 1e-5 \ + --blocks_to_swap 18 +``` + +
+日本語 + +#### FLUX.1 (`flux_train.py`) + +FLUX.1モデルのFine-tuningを行います。FLUX.1は内部的に2つのTransformerブロック(Double Blocks, Single Blocks)で構成されています。 + +**主要なオプション:** + +- `--blocks_to_swap`: SD3と同様に、メモリ最適化のためにTransformerブロックをCPUにスワップする機能です。 +- `--blockwise_fused_optimizers`: 実験的な機能で、各ブロックに個別のオプティマイザを適用し、学習を効率化することを目指します。 + +**コマンド例:** + +```bash +accelerate launch --mixed_precision bf16 flux_train.py \ + --pretrained_model_name_or_path "FLUX.1-dev.safetensors" \ + --dataset_config "dataset_config.toml" \ + --output_dir "output" \ + --output_name "flux1_finetuned" \ + --learning_rate 1e-5 \ + --blocks_to_swap 18 +``` + +
+ +#### Lumina (`lumina_train.py`) + +Performs fine-tuning for Lumina-Next DiT models. + +**Key Options:** + +- `--use_flash_attn`: Enables Flash Attention to speed up computation. +- `lumina_train.py` is relatively new, and many of its options are shared with other scripts. Training can be performed following the basic command pattern. + +**Command Example:** + +```bash +accelerate launch --mixed_precision bf16 lumina_train.py \ + --pretrained_model_name_or_path "Lumina-Next-DiT-B.safetensors" \ + --dataset_config "dataset_config.toml" \ + --output_dir "output" \ + --output_name "lumina_finetuned" \ + --learning_rate 1e-5 +``` + +
+日本語 + +#### Lumina (`lumina_train.py`) + +Lumina-Next DiTモデルのFine-tuningを行います。 + +**主要なオプション:** + +- `--use_flash_attn`: Flash Attentionを有効にし、計算を高速化します。 +- `lumina_train.py`は比較的新しく、オプションは他のスクリプトと共通化されている部分が多いです。基本的なコマンドパターンに従って学習を行えます。 + +**コマンド例:** + +```bash +accelerate launch --mixed_precision bf16 lumina_train.py \ + --pretrained_model_name_or_path "Lumina-Next-DiT-B.safetensors" \ + --dataset_config "dataset_config.toml" \ + --output_dir "output" \ + --output_name "lumina_finetuned" \ + --learning_rate 1e-5 +``` + +
+ +--- + +### Differences between Fine-tuning and LoRA tuning per architecture + +| Architecture | Key Features/Options Specific to Fine-tuning | Main Differences from LoRA tuning | +|:---|:---|:---| +| **SDXL** | `--block_lr` | Only fine-tuning allows for granular control over the learning rate for each U-Net block. | +| **SD3** | `--train_text_encoder`, `--train_t5xxl`, `--num_last_block_to_freeze` | Only fine-tuning can train the entire Text Encoders. LoRA only trains the adapter parts. | +| **FLUX.1** | `--blockwise_fused_optimizers` | Since fine-tuning updates the entire model's weights, more experimental optimizer options are available. | +| **Lumina** | (Few specific options) | Basic training options are common, but fine-tuning differs in that it updates the entire model's foundation. | + +
+日本語 + +### アーキテクチャごとのFine-tuningとLoRA学習の違い + +| アーキテクチャ | Fine-tuning特有の主要機能・オプション | LoRA学習との主な違い | +|:---|:---|:---| +| **SDXL** | `--block_lr` | U-Netのブロックごとに学習率を細かく制御できるのはFine-tuningのみです。 | +| **SD3** | `--train_text_encoder`, `--train_t5xxl`, `--num_last_block_to_freeze` | Text Encoder全体を学習対象にできるのはFine-tuningです。LoRAではアダプター部分のみ学習します。 | +| **FLUX.1** | `--blockwise_fused_optimizers` | Fine-tuningではモデル全体の重みを更新するため、より実験的なオプティマイザの選択肢が用意されています。 | +| **Lumina** | (特有のオプションは少ない) | 基本的な学習オプションは共通ですが、Fine-tuningはモデルの基盤全体を更新する点で異なります。 | + +
From 9984868154ef6d90fdf1dd6ba29bf7a037b0acb4 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Mon, 1 Sep 2025 21:32:24 +0900 Subject: [PATCH 536/582] doc: update README to include support for SDXL models and additional command-line options for gen_img.py --- docs/gen_img_README-ja.md | 102 ++++++++++++++++++++++++++++++++++++-- docs/gen_img_README.md | 54 +++++++++++++++++--- 2 files changed, 144 insertions(+), 12 deletions(-) diff --git a/docs/gen_img_README-ja.md b/docs/gen_img_README-ja.md index 8f4442d00..ca2eeab2a 100644 --- a/docs/gen_img_README-ja.md +++ b/docs/gen_img_README-ja.md @@ -3,7 +3,7 @@ SD 1.xおよび2.xのモデル、当リポジトリで学習したLoRA、Control # 概要 * Diffusers (v0.10.2) ベースの推論(画像生成)スクリプト。 -* SD 1.xおよび2.x (base/v-parameterization)モデルに対応。 +* SD 1.x、2.x (base/v-parameterization)、およびSDXLモデルに対応。 * txt2img、img2img、inpaintingに対応。 * 対話モード、およびファイルからのプロンプト読み込み、連続生成に対応。 * プロンプト1行あたりの生成枚数を指定可能。 @@ -96,14 +96,20 @@ python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先> - `--ckpt <モデル名>`:モデル名を指定します。`--ckpt`オプションは必須です。Stable Diffusionのcheckpointファイル、またはDiffusersのモデルフォルダ、Hugging FaceのモデルIDを指定できます。 +- `--v1`:Stable Diffusion 1.x系のモデルを使う場合に指定します。これがデフォルトの動作です。 + - `--v2`:Stable Diffusion 2.x系のモデルを使う場合に指定します。1.x系の場合には指定不要です。 +- `--sdxl`:Stable Diffusion XLモデルを使う場合に指定します。 + - `--v_parameterization`:v-parameterizationを使うモデルを使う場合に指定します(`768-v-ema.ckpt`およびそこからの追加学習モデル、Waifu Diffusion v1.5など)。 - `--v2`の指定有無が間違っているとモデル読み込み時にエラーになります。`--v_parameterization`の指定有無が間違っていると茶色い画像が表示されます。 + `--v2`や`--sdxl`の指定有無が間違っているとモデル読み込み時にエラーになります。`--v_parameterization`の指定有無が間違っていると茶色い画像が表示されます。 - `--vae`:使用するVAEを指定します。未指定時はモデル内のVAEを使用します。 +- `--tokenizer_cache_dir`:トークナイザーのキャッシュディレクトリを指定します(オフライン利用のため)。 + ## 画像生成と出力 - `--interactive`:インタラクティブモードで動作します。プロンプトを入力すると画像が生成されます。 @@ -112,6 +118,10 @@ python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先> - `--from_file <プロンプトファイル名>`:プロンプトが記述されたファイルを指定します。1行1プロンプトで記述してください。なお画像サイズやguidance scaleはプロンプトオプション(後述)で指定できます。 +- `--from_module <モジュールファイル>`:Pythonモジュールからプロンプトを読み込みます。モジュールは`get_prompter(args, pipe, networks)`関数を実装している必要があります。 + +- `--prompter_module_args`:prompterモジュールに渡す追加の引数を指定します。 + - `--W <画像幅>`:画像の幅を指定します。デフォルトは`512`です。 - `--H <画像高さ>`:画像の高さを指定します。デフォルトは`512`です。 @@ -132,6 +142,24 @@ python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先> - `--negative_scale` : uncoditioningのguidance scaleを個別に指定します。[gcem156氏のこちらの記事](https://note.com/gcem156/n/ne9a53e4a6f43)を参考に実装したものです。 +- `--emb_normalize_mode`:embedding正規化モードを指定します。"original"(デフォルト)、"abs"、"none"から選択できます。プロンプトの重みの正規化方法に影響します。 + +## SDXL固有のオプション + +SDXL モデル(`--sdxl`フラグ付き)を使用する場合、追加のコンディショニングオプションが利用できます: + +- `--original_height`:SDXL コンディショニング用の元の高さを指定します。これはモデルの対象解像度の理解に影響します。 + +- `--original_width`:SDXL コンディショニング用の元の幅を指定します。これはモデルの対象解像度の理解に影響します。 + +- `--original_height_negative`:SDXL ネガティブコンディショニング用の元の高さを指定します。 + +- `--original_width_negative`:SDXL ネガティブコンディショニング用の元の幅を指定します。 + +- `--crop_top`:SDXL コンディショニング用のクロップ上オフセットを指定します。 + +- `--crop_left`:SDXL コンディショニング用のクロップ左オフセットを指定します。 + ## メモリ使用量や生成速度の調整 - `--batch_size <バッチサイズ>`:バッチサイズを指定します。デフォルトは`1`です。バッチサイズが大きいとメモリを多く消費しますが、生成速度が速くなります。 @@ -139,8 +167,16 @@ python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先> - `--vae_batch_size `:VAEのバッチサイズを指定します。デフォルトはバッチサイズと同じです。 VAEのほうがメモリを多く消費するため、デノイジング後(stepが100%になった後)でメモリ不足になる場合があります。このような場合にはVAEのバッチサイズを小さくしてください。 +- `--vae_slices <スライス数>`:VAE処理時に画像をスライスに分割してVRAM使用量を削減します。None(デフォルト)で分割なし。16や32のような値が推奨されます。有効にすると処理が遅くなりますが、VRAM使用量が少なくなります。 + +- 
`--no_half_vae`:VAE処理でfp16/bf16精度の使用を防ぎます。代わりにfp32を使用します。VAE関連の問題やアーティファクトが発生した場合に使用してください。 + - `--xformers`:xformersを使う場合に指定します。 +- `--sdpa`:最適化のためにPyTorch 2のscaled dot-product attentionを使用します。 + +- `--diffusers_xformers`:Diffusers経由でxformersを使用します(注:Hypernetworksと互換性がありません)。 + - `--fp16`:fp16(単精度)での推論を行います。`fp16`と`bf16`をどちらも指定しない場合はfp32(単精度)での推論を行います。 - `--bf16`:bf16(bfloat16)での推論を行います。RTX 30系のGPUでのみ指定可能です。`--bf16`オプションはRTX 30系以外のGPUではエラーになります。`fp16`よりも`bf16`のほうが推論結果がNaNになる(真っ黒の画像になる)可能性が低いようです。 @@ -157,6 +193,12 @@ python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先> - `--network_pre_calc`:使用する追加ネットワークの重みを生成ごとにあらかじめ計算します。プロンプトオプションの`--am`が使用できます。LoRA未使用時と同じ程度まで生成は高速化されますが、生成前に重みを計算する時間が必要で、またメモリ使用量も若干増加します。Regional LoRA使用時は無効になります 。 +- `--network_regional_mask_max_color_codes`:リージョナルマスクに使用する色コードの最大数を指定します。指定されていない場合、マスクはチャンネルごとに適用されます。Regional LoRAと組み合わせて、マスク内の色で定義できるリージョン数を制御するために使用されます。 + +- `--network_args`:key=value形式でネットワークモジュールに渡す追加引数を指定します。例: `--network_args "alpha=1.0,dropout=0.1"`。 + +- `--network_merge_n_models`:ネットワークマージを使用する場合、マージするモデル数を指定します(全ての読み込み済みネットワークをマージする代わりに)。 + # 主なオプションの指定例 次は同一プロンプトで64枚をバッチサイズ4で一括生成する例です。 @@ -235,7 +277,9 @@ python gen_img_diffusers.py --ckpt model.safetensors - `--sequential_file_name`:ファイル名を連番にするかどうかを指定します。指定すると生成されるファイル名が`im_000001.png`からの連番になります。 -- `--use_original_file_name`:指定すると生成ファイル名がオリジナルのファイル名と同じになります。 +- `--use_original_file_name`:指定すると生成ファイル名がオリジナルのファイル名の前に追加されます(img2imgモード用)。 + +- `--clip_vision_strength`:指定した強度でimg2img用のCLIP Vision Conditioningを有効にします。CLIP Visionモデルを使用して入力画像からのコンディショニングを強化します。 ## コマンドラインからの実行例 @@ -306,7 +350,9 @@ img2imgと併用できません。 - `--highres_fix_upscaler`:2nd stageに任意のupscalerを利用します。現在は`--highres_fix_upscaler tools.latent_upscaler` のみ対応しています。 - `--highres_fix_upscaler_args`:`--highres_fix_upscaler`で指定したupscalerに渡す引数を指定します。 - `tools.latent_upscaler`の場合は、`--highres_fix_upscaler_args "weights=D:\Work\SD\Models\others\etc\upscaler-v1-e100-220.safetensors"`のように重みファイルを指定します。 + `tools.latent_upscaler`の場合は、`--highres_fix_upscaler_args "weights=D:\Work\SD\Models\others\etc\upscaler-v1-e100-220.safetensors"`のように重みファイルを指定します。 + +- `--highres_fix_disable_control_net`:Highres fixの2nd stageでControlNetを無効にします。デフォルトでは、ControlNetは両ステージで使用されます。 コマンドラインの例です。 @@ -319,6 +365,34 @@ python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt --highres_fix_scale 0.5 --highres_fix_steps 28 --strength 0.5 ``` +## Deep Shrink + +Deep Shrinkは、異なるタイムステップで異なる深度のUNetを使用して生成プロセスを最適化する技術です。生成品質と効率を向上させることができます。 + +以下のオプションがあります: + +- `--ds_depth_1`:第1フェーズでこの深度のDeep Shrinkを有効にします。有効な値は0から8です。 + +- `--ds_timesteps_1`:このタイムステップまでDeep Shrink深度1を適用します。デフォルトは650です。 + +- `--ds_depth_2`:Deep Shrinkの第2フェーズの深度を指定します。 + +- `--ds_timesteps_2`:このタイムステップまでDeep Shrink深度2を適用します。デフォルトは650です。 + +- `--ds_ratio`:Deep Shrinkでのダウンサンプリングの比率を指定します。デフォルトは0.5です。 + +これらのパラメータはプロンプトオプションでも指定できます: + +- `--dsd1`:プロンプトからDeep Shrink深度1を指定します。 + +- `--dst1`:プロンプトからDeep Shrinkタイムステップ1を指定します。 + +- `--dsd2`:プロンプトからDeep Shrink深度2を指定します。 + +- `--dst2`:プロンプトからDeep Shrinkタイムステップ2を指定します。 + +- `--dsr`:プロンプトからDeep Shrink比率を指定します。 + ## ControlNet 現在はControlNet 1.0のみ動作確認しています。プリプロセスはCannyのみサポートしています。 @@ -346,6 +420,20 @@ python gen_img_diffusers.py --ckpt model_ckpt --scale 8 --steps 48 --outdir txt2 --guide_image_path guide.png --control_net_ratios 1.0 --interactive ``` +## ControlNet-LLLite + +ControlNet-LLLiteは、類似の誘導目的に使用できるControlNetの軽量な代替手段です。 + +以下のオプションがあります: + +- `--control_net_lllite_models`:ControlNet-LLLiteモデルファイルを指定します。 + +- 
`--control_net_multipliers`:ControlNet-LLLiteの倍率を指定します(重みに類似)。 + +- `--control_net_ratios`:ControlNet-LLLiteを適用するステップの比率を指定します。 + +注意:ControlNetとControlNet-LLLiteは同時に使用できません。 + ## Attention Couple + Reginal LoRA プロンプトをいくつかの部分に分割し、それぞれのプロンプトを画像内のどの領域に適用するかを指定できる機能です。個別のオプションはありませんが、`mask_path`とプロンプトで指定します。 @@ -450,7 +538,9 @@ python gen_img_diffusers.py --ckpt wd-v1-3-full-pruned-half.ckpt - `--opt_channels_last` : 推論時にテンソルのチャンネルを最後に配置します。場合によっては高速化されることがあります。 -- `--network_show_meta` : 追加ネットワークのメタデータを表示します。 +- `--shuffle_prompts`:繰り返し時にプロンプトの順序をシャッフルします。`--from_file`で複数のプロンプトを使用する場合に便利です。 + +- `--network_show_meta`:追加ネットワークのメタデータを表示します。 --- @@ -478,6 +568,8 @@ latentのサイズを徐々に大きくしていくHires fixです。`gen_img.py - `--gradual_latent_ratio` : latentの初期サイズを指定します。デフォルトは 0.5 で、デフォルトの latent サイズの半分のサイズから始めます。 - `--gradual_latent_ratio_step`: latentのサイズを大きくする割合を指定します。デフォルトは 0.125 で、latentのサイズを 0.625, 0.75, 0.875, 1.0 と徐々に大きくします。 - `--gradual_latent_ratio_every_n_steps`: latentのサイズを大きくする間隔を指定します。デフォルトは 3 で、3ステップごとに latent のサイズを大きくします。 +- `--gradual_latent_s_noise`:Gradual LatentのS_noiseパラメータを指定します。デフォルトは1.0です。 +- `--gradual_latent_unsharp_params`:Gradual Latentのアンシャープマスクパラメータをksize,sigma,strength,target-x形式で指定します(target-x: 1=True, 0=False)。推奨値:`3,0.5,0.5,1`または`3,1.0,1.0,0`。 それぞれのオプションは、プロンプトオプション、`--glt`、`--glr`、`--gls`、`--gle` でも指定できます。 diff --git a/docs/gen_img_README.md b/docs/gen_img_README.md index fd4a82905..4723518cc 100644 --- a/docs/gen_img_README.md +++ b/docs/gen_img_README.md @@ -4,7 +4,7 @@ This is an inference (image generation) script that supports SD 1.x and 2.x mode # Overview * Inference (image generation) script. -* Supports SD 1.x and 2.x (base/v-parameterization) models. +* Supports SD 1.x, 2.x (base/v-parameterization), and SDXL models. * Supports txt2img, img2img, and inpainting. * Supports interactive mode, prompt reading from files, and continuous generation. * The number of images generated per prompt line can be specified. @@ -13,7 +13,7 @@ This is an inference (image generation) script that supports SD 1.x and 2.x mode * Supports xformers for high-speed generation. * Although xformers are used for memory-saving generation, it is not as optimized as Automatic 1111's Web UI, so it uses about 6GB of VRAM for 512*512 image generation. * Extension of prompts to 225 tokens. Supports negative prompts and weighting. -* Supports various samplers from Diffusers (fewer samplers than Web UI). +* Supports various samplers from Diffusers including ddim, pndm, lms, euler, euler_a, heun, dpm_2, dpm_2_a, dpmsolver, dpmsolver++, dpmsingle. * Supports clip skip (uses the output of the nth layer from the end) of Text Encoder. * Separate loading of VAE. * Supports CLIP Guided Stable Diffusion, VGG16 Guided Stable Diffusion, Highres. fix, and upscale. @@ -100,14 +100,20 @@ Specify from the command line. - `--ckpt `: Specifies the model name. The `--ckpt` option is mandatory. You can specify a Stable Diffusion checkpoint file, a Diffusers model folder, or a Hugging Face model ID. +- `--v1`: Specify when using Stable Diffusion 1.x series models. This is the default behavior. + - `--v2`: Specify when using Stable Diffusion 2.x series models. Not required for 1.x series. +- `--sdxl`: Specify when using Stable Diffusion XL models. + - `--v_parameterization`: Specify when using models that use v-parameterization (`768-v-ema.ckpt` and models with additional training from it, Waifu Diffusion v1.5, etc.). - If the `--v2` specification is incorrect, an error will occur when loading the model. 
If the `--v_parameterization` specification is incorrect, a brown image will be displayed. + If the `--v2` or `--sdxl` specification is incorrect, an error will occur when loading the model. If the `--v_parameterization` specification is incorrect, a brown image will be displayed. - `--vae`: Specifies the VAE to use. If not specified, the VAE in the model will be used. +- `--tokenizer_cache_dir`: Specifies the cache directory for the tokenizer (for offline usage). + ## Image Generation and Output - `--interactive`: Operates in interactive mode. Images are generated when prompts are entered. @@ -118,6 +124,8 @@ Specify from the command line. - `--from_module `: Loads prompts from a Python module. The module should implement a `get_prompter(args, pipe, networks)` function. +- `--prompter_module_args`: Specifies additional arguments to pass to the prompter module. + - `--W `: Specifies the width of the image. The default is `512`. - `--H `: Specifies the height of the image. The default is `512`. @@ -126,7 +134,7 @@ Specify from the command line. - `--scale `: Specifies the unconditional guidance scale. The default is `7.5`. -- `--sampler `: Specifies the sampler. The default is `ddim`. ddim, pndm, dpmsolver, dpmsolver+++, lms, euler, euler_a provided by Diffusers can be specified (the last three can also be specified as k_lms, k_euler, k_euler_a). +- `--sampler `: Specifies the sampler. The default is `ddim`. The following samplers are supported: ddim, pndm, lms, euler, euler_a, heun, dpm_2, dpm_2_a, dpmsolver, dpmsolver++, dpmsingle. Some can also be specified with k_ prefix (k_lms, k_euler, k_euler_a, k_dpm_2, k_dpm_2_a). - `--outdir `: Specifies the output destination for images. @@ -140,6 +148,22 @@ Specify from the command line. - `--emb_normalize_mode`: Specifies the embedding normalization mode. Options are "original" (default), "abs", and "none". This affects how prompt weights are normalized. +## SDXL-Specific Options + +When using SDXL models (with `--sdxl` flag), additional conditioning options are available: + +- `--original_height`: Specifies the original height for SDXL conditioning. This affects the model's understanding of the target resolution. + +- `--original_width`: Specifies the original width for SDXL conditioning. This affects the model's understanding of the target resolution. + +- `--original_height_negative`: Specifies the original height for SDXL negative conditioning. + +- `--original_width_negative`: Specifies the original width for SDXL negative conditioning. + +- `--crop_top`: Specifies the crop top offset for SDXL conditioning. + +- `--crop_left`: Specifies the crop left offset for SDXL conditioning. + ## Adjusting Memory Usage and Generation Speed - `--batch_size `: Specifies the batch size. The default is `1`. A larger batch size consumes more memory but speeds up generation. @@ -149,12 +173,14 @@ Specify from the command line. - `--vae_slices `: Splits the image into slices for VAE processing to reduce VRAM usage. None (default) for no splitting. Values like 16 or 32 are recommended. Enabling this is slower but uses less VRAM. -- `--no_half_vae`: Prevents using fp16/bf16 precision for VAE processing. Uses fp32 instead. +- `--no_half_vae`: Prevents using fp16/bf16 precision for VAE processing. Uses fp32 instead. Use this if you encounter VAE-related issues or artifacts. - `--xformers`: Specify when using xformers. - `--sdpa`: Use scaled dot-product attention in PyTorch 2 for optimization. 
+- `--diffusers_xformers`: Use xformers via Diffusers (note: incompatible with Hypernetworks). + - `--fp16`: Performs inference in fp16 (single precision). If neither `fp16` nor `bf16` is specified, inference is performed in fp32 (single precision). - `--bf16`: Performs inference in bf16 (bfloat16). Can only be specified for RTX 30 series GPUs. The `--bf16` option will cause an error on GPUs other than the RTX 30 series. It seems that `bf16` is less likely to result in NaN (black image) inference results than `fp16`. @@ -173,6 +199,10 @@ Specify from the command line. - `--network_regional_mask_max_color_codes`: Specifies the maximum number of color codes to use for regional masks. If not specified, masks are applied by channel. Used with Regional LoRA to control the number of regions that can be defined by colors in the mask. +- `--network_args`: Specifies additional arguments to pass to the network module in key=value format. For example: `--network_args "alpha=1.0,dropout=0.1"`. + +- `--network_merge_n_models`: When using network merging, specifies the number of models to merge (instead of merging all loaded networks). + # Examples of Main Option Specifications The following is an example of batch generating 64 images with the same prompt and a batch size of 4. @@ -259,7 +289,7 @@ Example: - `--sequential_file_name`: Specifies whether to make file names sequential. If specified, the generated file names will be sequential starting from `im_000001.png`. -- `--use_original_file_name`: If specified, the generated file name will be the same as the original file name. +- `--use_original_file_name`: If specified, the generated file name will be prepended with the original file name (for img2img mode). - `--clip_vision_strength`: Enables CLIP Vision Conditioning for img2img with the specified strength. Uses the CLIP Vision model to enhance conditioning from the input image. @@ -375,6 +405,16 @@ These parameters can also be specified through prompt options: - `--dsr`: Specifies Deep Shrink ratio from the prompt. +*Additional prompt options for Gradual Latent (requires `euler_a` sampler):* + +- `--glt`: Specifies the timestep to start increasing the size of the latent for Gradual Latent. Overrides the command line specification. + +- `--glr`: Specifies the initial size of the latent for Gradual Latent as a ratio. Overrides the command line specification. + +- `--gls`: Specifies the ratio to increase the size of the latent for Gradual Latent. Overrides the command line specification. + +- `--gle`: Specifies the interval to increase the size of the latent for Gradual Latent. Overrides the command line specification. + ## ControlNet Currently, only ControlNet 1.0 has been confirmed to work. Only Canny is supported for preprocessing. @@ -536,7 +576,7 @@ Gradual Latent is a Hires fix that gradually increases the size of the latent. - `--gradual_latent_ratio_step`: Specifies the ratio to increase the size of the latent. The default is 0.125, which means the latent size is gradually increased to 0.625, 0.75, 0.875, 1.0. - `--gradual_latent_ratio_every_n_steps`: Specifies the interval to increase the size of the latent. The default is 3, which means the latent size is increased every 3 steps. - `--gradual_latent_s_noise`: Specifies the s_noise parameter for Gradual Latent. Default is 1.0. -- `--gradual_latent_unsharp_params`: Specifies unsharp mask parameters for Gradual Latent: ksize, sigma, strength, target-x (1 means True). Values like `3,0.5,0.5,1` or `3,1.0,1.0,0` are recommended. 
+- `--gradual_latent_unsharp_params`: Specifies unsharp mask parameters for Gradual Latent in the format: ksize,sigma,strength,target-x (where target-x: 1=True, 0=False). Recommended values: `3,0.5,0.5,1` or `3,1.0,1.0,0`. Each option can also be specified with prompt options, `--glt`, `--glr`, `--gls`, `--gle`. From 6c82327dc819a1e95a6e939ef4b505f6deeb69e1 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Mon, 1 Sep 2025 21:32:50 +0900 Subject: [PATCH 537/582] doc: remove Japanese section on Gradual Latent options from gen_img README --- docs/gen_img_README.md | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/docs/gen_img_README.md b/docs/gen_img_README.md index 4723518cc..bcfbef7f7 100644 --- a/docs/gen_img_README.md +++ b/docs/gen_img_README.md @@ -583,18 +583,3 @@ Each option can also be specified with prompt options, `--glt`, `--glr`, `--gls` __Please specify `euler_a` for the sampler.__ Because the source code of the sampler is modified. It will not work with other samplers. It is more effective with SD 1.5. It is quite subtle with SDXL. - -# Gradual Latent について (Japanese section - kept for reference) - -latentのサイズを徐々に大きくしていくHires fixです。`gen_img.py` 、``sdxl_gen_img.py` 、`gen_img.py` に以下のオプションが追加されています。 - -- `--gradual_latent_timesteps` : latentのサイズを大きくし始めるタイムステップを指定します。デフォルトは None で、Gradual Latentを使用しません。750 くらいから始めてみてください。 -- `--gradual_latent_ratio` : latentの初期サイズを指定します。デフォルトは 0.5 で、デフォルトの latent サイズの半分のサイズから始めます。 -- `--gradual_latent_ratio_step`: latentのサイズを大きくする割合を指定します。デフォルトは 0.125 で、latentのサイズを 0.625, 0.75, 0.875, 1.0 と徐々に大きくします。 -- `--gradual_latent_ratio_every_n_steps`: latentのサイズを大きくする間隔を指定します。デフォルトは 3 で、3ステップごとに latent のサイズを大きくします。 - -それぞれのオプションは、プロンプトオプション、`--glt`、`--glr`、`--gls`、`--gle` でも指定できます。 - -サンプラーに手を加えているため、__サンプラーに `euler_a` を指定してください。__ 他のサンプラーでは動作しません。 - -SD 1.5 のほうが効果があります。SDXL ではかなり微妙です。 From ddfb38e5016878d7b946b63b2cfcd8b43c9aabc4 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 4 Sep 2025 18:39:52 +0900 Subject: [PATCH 538/582] doc: add documentation for Textual Inversion training scripts --- docs/train_textual_inversion.md | 294 ++++++++++++++++++++++++++++++++ 1 file changed, 294 insertions(+) create mode 100644 docs/train_textual_inversion.md diff --git a/docs/train_textual_inversion.md b/docs/train_textual_inversion.md new file mode 100644 index 000000000..c18c23071 --- /dev/null +++ b/docs/train_textual_inversion.md @@ -0,0 +1,294 @@ +# How to use Textual Inversion training scripts / Textual Inversion学習スクリプトの使い方 + +This document explains how to train Textual Inversion embeddings using the `train_textual_inversion.py` and `sdxl_train_textual_inversion.py` scripts included in the `sd-scripts` repository. + +
+<details>
+<summary>日本語</summary>
+このドキュメントでは、`sd-scripts` リポジトリに含まれる `train_textual_inversion.py` および `sdxl_train_textual_inversion.py` を使用してTextual Inversionの埋め込みを学習する方法について解説します。
+</details>
+ +## 1. Introduction / はじめに + +[Textual Inversion](https://textual-inversion.github.io/) is a technique that teaches Stable Diffusion new concepts by learning new token embeddings. Instead of fine-tuning the entire model, it only optimizes the text encoder's token embeddings, making it a lightweight approach to teaching the model specific characters, objects, or artistic styles. + +**Available Scripts:** +- `train_textual_inversion.py`: For Stable Diffusion v1.x and v2.x models +- `sdxl_train_textual_inversion.py`: For Stable Diffusion XL models + +**Prerequisites:** +* The `sd-scripts` repository has been cloned and the Python environment has been set up. +* The training dataset has been prepared. For dataset preparation, please refer to the [Dataset Configuration Guide](config_README-en.md). + +
+<details>
+<summary>日本語</summary>
+
+[Textual Inversion](https://textual-inversion.github.io/) は、新しいトークンの埋め込みを学習することで、Stable Diffusionに新しい概念を教える技術です。モデル全体をファインチューニングする代わりに、テキストエンコーダのトークン埋め込みのみを最適化するため、特定のキャラクター、オブジェクト、芸術的スタイルをモデルに教えるための軽量なアプローチです。
+
+**利用可能なスクリプト:**
+- `train_textual_inversion.py`: Stable Diffusion v1.xおよびv2.xモデル用
+- `sdxl_train_textual_inversion.py`: Stable Diffusion XLモデル用
+
+**前提条件:**
+* `sd-scripts` リポジトリのクローンとPython環境のセットアップが完了していること。
+* 学習用データセットの準備が完了していること。データセットの準備については[データセット設定ガイド](config_README-en.md)を参照してください。
+</details>
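+
+The following is a minimal conceptual sketch of this idea. It is not code from the training scripts; the vocabulary size and embedding dimension are the SD 1.x CLIP values, and the restore step stands in for the scripts' actual bookkeeping. The point is that only the newly added embedding row effectively changes, while the U-Net and the rest of the text encoder stay frozen.
+
+```python
+import torch
+
+vocab_size, emb_dim = 49408, 768                         # CLIP ViT-L values used by SD 1.x
+embedding = torch.nn.Embedding(vocab_size + 1, emb_dim)  # one new token appended
+new_token_id = vocab_size
+
+original_rows = embedding.weight.detach().clone()
+optimizer = torch.optim.AdamW(embedding.parameters(), lr=1e-6)
+
+# ... compute the usual diffusion loss, then loss.backward() and optimizer.step() ...
+
+# restore every row except the new token's after each step, so that only the
+# new embedding is actually learned
+with torch.no_grad():
+    keep = torch.arange(vocab_size + 1) != new_token_id
+    embedding.weight[keep] = original_rows[keep]
+```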
+ +## 2. Basic Usage / 基本的な使用方法 + +### 2.1. For Stable Diffusion v1.x/v2.x Models / Stable Diffusion v1.x/v2.xモデル用 + +```bash +accelerate launch --num_cpu_threads_per_process 1 train_textual_inversion.py \ + --pretrained_model_name_or_path="path/to/model.safetensors" \ + --dataset_config="dataset_config.toml" \ + --output_dir="output" \ + --output_name="my_textual_inversion" \ + --save_model_as="safetensors" \ + --token_string="mychar" \ + --init_word="girl" \ + --num_vectors_per_token=4 \ + --max_train_steps=1600 \ + --learning_rate=1e-6 \ + --optimizer_type="AdamW8bit" \ + --mixed_precision="fp16" \ + --cache_latents \ + --sdpa +``` + +### 2.2. For SDXL Models / SDXLモデル用 + +```bash +accelerate launch --num_cpu_threads_per_process 1 sdxl_train_textual_inversion.py \ + --pretrained_model_name_or_path="path/to/sdxl_model.safetensors" \ + --dataset_config="dataset_config.toml" \ + --output_dir="output" \ + --output_name="my_sdxl_textual_inversion" \ + --save_model_as="safetensors" \ + --token_string="mychar" \ + --init_word="girl" \ + --num_vectors_per_token=4 \ + --max_train_steps=1600 \ + --learning_rate=1e-6 \ + --optimizer_type="AdamW8bit" \ + --mixed_precision="fp16" \ + --cache_latents \ + --sdpa +``` + +
+<details>
+<summary>日本語</summary>
+上記のコマンドは実際には1行で書く必要がありますが、見やすさのために改行しています(LinuxやMacでは行末に `\` を追加することで改行できます)。Windowsの場合は、改行せずに1行で書くか、`^` を行末に追加してください。
+</details>
+ +## 3. Key Command-Line Arguments / 主要なコマンドライン引数 + +### 3.1. Textual Inversion Specific Arguments / Textual Inversion固有の引数 + +#### Core Parameters / コアパラメータ + +* `--token_string="mychar"` **[Required]** + * Specifies the token string used in training. This must not exist in the tokenizer's vocabulary. In your training prompts, include this token string (e.g., if token_string is "mychar", use prompts like "mychar 1girl"). + * 学習時に使用されるトークン文字列を指定します。tokenizerの語彙に存在しない文字である必要があります。学習時のプロンプトには、このトークン文字列を含める必要があります(例:token_stringが"mychar"なら、"mychar 1girl"のようなプロンプトを使用)。 + +* `--init_word="girl"` + * Specifies the word to use for initializing the embedding vector. Choose a word that is conceptually close to what you want to teach. Must be a single token. + * 埋め込みベクトルの初期化に使用する単語を指定します。教えたい概念に近い単語を選ぶとよいでしょう。単一のトークンである必要があります。 + +* `--num_vectors_per_token=4` + * Specifies how many embedding vectors to use for this token. More vectors provide greater expressiveness but consume more tokens from the 77-token limit. + * このトークンに使用する埋め込みベクトルの数を指定します。多いほど表現力が増しますが、77トークン制限からより多くのトークンを消費します。 + +* `--weights="path/to/existing_embedding.safetensors"` + * Loads pre-trained embeddings to continue training from. Optional parameter for transfer learning. + * 既存の埋め込みを読み込んで、そこから追加で学習します。転移学習のオプションパラメータです。 + +#### Template Options / テンプレートオプション + +* `--use_object_template` + * Ignores captions and uses predefined object templates (e.g., "a photo of a {}"). Same as the original implementation. + * キャプションを無視して、事前定義された物体用テンプレート(例:"a photo of a {}")を使用します。公式実装と同じです。 + +* `--use_style_template` + * Ignores captions and uses predefined style templates (e.g., "a painting in the style of {}"). Same as the original implementation. + * キャプションを無視して、事前定義されたスタイル用テンプレート(例:"a painting in the style of {}")を使用します。公式実装と同じです。 + +### 3.2. Model and Dataset Arguments / モデル・データセット引数 + +For common model and dataset arguments, please refer to [LoRA Training Guide](train_network.md#31-main-command-line-arguments--主要なコマンドライン引数). The following arguments work the same way: + +* `--pretrained_model_name_or_path` +* `--dataset_config` +* `--v2`, `--v_parameterization` +* `--resolution` +* `--cache_latents`, `--vae_batch_size` +* `--enable_bucket`, `--min_bucket_reso`, `--max_bucket_reso` + +
+<details>
+<summary>日本語</summary>
+一般的なモデル・データセット引数については、[LoRA学習ガイド](train_network.md#31-main-command-line-arguments--主要なコマンドライン引数)を参照してください。以下の引数は同様に動作します:
+
+* `--pretrained_model_name_or_path`
+* `--dataset_config`
+* `--v2`, `--v_parameterization`
+* `--resolution`
+* `--cache_latents`, `--vae_batch_size`
+* `--enable_bucket`, `--min_bucket_reso`, `--max_bucket_reso`
+</details>
+ +### 3.3. Training Parameters / 学習パラメータ + +For training parameters, please refer to [LoRA Training Guide](train_network.md#31-main-command-line-arguments--主要なコマンドライン引数). Textual Inversion typically uses these settings: + +* `--learning_rate=1e-6`: Lower learning rates are often used compared to LoRA training +* `--max_train_steps=1600`: Fewer steps are usually sufficient +* `--optimizer_type="AdamW8bit"`: Memory-efficient optimizer +* `--mixed_precision="fp16"`: Reduces memory usage + +**Note:** Textual Inversion has lower memory requirements compared to full model fine-tuning, so you can often use larger batch sizes. + +
+<details>
+<summary>日本語</summary>
+学習パラメータについては、[LoRA学習ガイド](train_network.md#31-main-command-line-arguments--主要なコマンドライン引数)を参照してください。Textual Inversionでは通常以下の設定を使用します:
+
+* `--learning_rate=1e-6`: LoRA学習と比べて低い学習率がよく使用されます
+* `--max_train_steps=1600`: より少ないステップで十分な場合が多いです
+* `--optimizer_type="AdamW8bit"`: メモリ効率的なオプティマイザ
+* `--mixed_precision="fp16"`: メモリ使用量を削減
+
+**注意:** Textual Inversionはモデル全体のファインチューニングと比べてメモリ要件が低いため、多くの場合、より大きなバッチサイズを使用できます。
+</details>
+ +## 4. Dataset Preparation / データセット準備 + +### 4.1. Dataset Configuration / データセット設定 + +Create a TOML configuration file as described in the [Dataset Configuration Guide](config_README-en.md). Here's an example for Textual Inversion: + +```toml +[general] +shuffle_caption = false +caption_extension = ".txt" +keep_tokens = 1 + +[[datasets]] +resolution = 512 # 1024 for SDXL +batch_size = 4 # Can use larger values than LoRA training +enable_bucket = true + + [[datasets.subsets]] + image_dir = "path/to/images" + caption_extension = ".txt" + num_repeats = 10 +``` + +### 4.2. Caption Guidelines / キャプションガイドライン + +**Important:** Your captions must include the token string you specified. For example: + +* If `--token_string="mychar"`, captions should be like: "mychar, 1girl, blonde hair, blue eyes" +* The token string can appear anywhere in the caption, but including it is essential + +You can verify that your token string is being recognized by using `--debug_dataset`, which will show token IDs. Look for tokens with IDs ≥ 49408 (these are the new custom tokens). + +
+<details>
+<summary>日本語</summary>
+
+**重要:** キャプションには指定したトークン文字列を含める必要があります。例:
+
+* `--token_string="mychar"` の場合、キャプションは "mychar, 1girl, blonde hair, blue eyes" のようにします
+* トークン文字列はキャプション内のどこに配置しても構いませんが、含めることが必須です
+
+`--debug_dataset` を使用してトークン文字列が認識されているかを確認できます。これによりトークンIDが表示されます。ID ≥ 49408 のトークン(これらは新しいカスタムトークン)を探してください。
+</details>
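+
+As a quick sanity check outside the training scripts, you can also confirm that a candidate `--token_string` is not already a single token in the tokenizer. The sketch below assumes the SD 1.x CLIP tokenizer from `transformers`; the model id and the candidate string are only examples.
+
+```python
+from transformers import CLIPTokenizer
+
+tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+candidate = "mychar"  # the value you plan to pass to --token_string
+
+# add_tokens returns 0 if the string is already a known token
+if tokenizer.add_tokens([candidate]) == 0:
+    print(f"'{candidate}' already exists in the vocabulary - choose another string")
+else:
+    print(f"'{candidate}' is new; its id would be {tokenizer.convert_tokens_to_ids(candidate)} (>= 49408)")
+```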
+ +## 5. Advanced Configuration / 高度な設定 + +### 5.1. Multiple Token Vectors / 複数トークンベクトル + +When using `--num_vectors_per_token` > 1, the system creates additional token variations: +- `--token_string="mychar"` with `--num_vectors_per_token=4` creates: "mychar", "mychar1", "mychar2", "mychar3" + +For generation, you can use either the base token or all tokens together. + +### 5.2. Memory Optimization / メモリ最適化 + +* Use `--cache_latents` to cache VAE outputs and reduce VRAM usage +* Use `--gradient_checkpointing` for additional memory savings +* For SDXL, use `--cache_text_encoder_outputs` to cache text encoder outputs +* Consider using `--mixed_precision="bf16"` on newer GPUs (RTX 30 series and later) + +### 5.3. Training Tips / 学習のコツ + +* **Learning Rate:** Start with 1e-6 and adjust based on results. Lower rates often work better than LoRA training. +* **Steps:** 1000-2000 steps are usually sufficient, but this varies by dataset size and complexity. +* **Batch Size:** Textual Inversion can handle larger batch sizes than full fine-tuning due to lower memory requirements. +* **Templates:** Use `--use_object_template` for characters/objects, `--use_style_template` for artistic styles. + +
+<details>
+<summary>日本語</summary>
+
+* **学習率:** 1e-6から始めて、結果に基づいて調整してください。LoRA学習よりも低い率がよく機能します。
+* **ステップ数:** 通常1000-2000ステップで十分ですが、データセットのサイズと複雑さによって異なります。
+* **バッチサイズ:** メモリ要件が低いため、Textual Inversionは完全なファインチューニングよりも大きなバッチサイズを処理できます。
+* **テンプレート:** キャラクター/オブジェクトには `--use_object_template`、芸術的スタイルには `--use_style_template` を使用してください。
+</details>
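+
+To confirm that `--num_vectors_per_token` took effect, you can inspect the saved embedding file. This is only a sketch: the file path is an example, and the key names depend on the script and the base model.
+
+```python
+from safetensors.torch import load_file
+
+state = load_file("output/my_textual_inversion.safetensors")
+for key, tensor in state.items():
+    # with --num_vectors_per_token=4 on SD 1.x, expect a (4, 768) tensor here
+    print(key, tuple(tensor.shape))
+```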
+ +## 6. Usage After Training / 学習後の使用方法 + +The trained Textual Inversion embeddings can be used in: + +* **Automatic1111 WebUI:** Place the `.safetensors` file in the `embeddings` folder +* **ComfyUI:** Use the embedding file with appropriate nodes +* **Other Diffusers-based applications:** Load using the embedding path + +In your prompts, simply use the token string you trained (e.g., "mychar") and the model will use the learned embedding. + +
+<details>
+<summary>日本語</summary>
+
+学習したTextual Inversionの埋め込みは以下で使用できます:
+
+* **Automatic1111 WebUI:** `.safetensors` ファイルを `embeddings` フォルダに配置
+* **ComfyUI:** 適切なノードで埋め込みファイルを使用
+* **その他のDiffusersベースアプリケーション:** 埋め込みパスを使用して読み込み
+
+プロンプトでは、学習したトークン文字列(例:"mychar")を単純に使用するだけで、モデルが学習した埋め込みを使用します。
+</details>
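+
+For Diffusers-based applications, a hedged example is shown below. The model id, file path, and prompt are placeholders; SDXL pipelines load the embedding into both text encoders, so the call differs slightly there.
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+).to("cuda")
+pipe.load_textual_inversion("output/my_textual_inversion.safetensors", token="mychar")
+
+image = pipe("mychar, 1girl, best quality", num_inference_steps=25).images[0]
+image.save("mychar_sample.png")
+```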
+ +## 7. Troubleshooting / トラブルシューティング + +### Common Issues / よくある問題 + +1. **Token string already exists in tokenizer** + * Use a unique string that doesn't exist in the model's vocabulary + * Try adding numbers or special characters (e.g., "mychar123") + +2. **No improvement after training** + * Ensure your captions include the token string + * Try adjusting the learning rate (lower values like 5e-7) + * Increase the number of training steps + +3. **Out of memory errors** + * Reduce batch size in the dataset configuration + * Use `--gradient_checkpointing` + * Use `--cache_latents` (for SDXL) + +
+<details>
+<summary>日本語</summary>
+
+1. **トークン文字列がtokenizerに既に存在する**
+   * モデルの語彙に存在しない固有の文字列を使用してください
+   * 数字や特殊文字を追加してみてください(例:"mychar123")
+
+2. **学習後に改善が見られない**
+   * キャプションにトークン文字列が含まれていることを確認してください
+   * 学習率を調整してみてください(5e-7のような低い値)
+   * 学習ステップ数を増やしてください
+
+3. **メモリ不足エラー**
+   * データセット設定でバッチサイズを減らしてください
+   * `--gradient_checkpointing` を使用してください
+   * `--cache_latents` を使用してください
+</details>
+ +For additional training options and advanced configurations, please refer to the [LoRA Training Guide](train_network.md) as many parameters are shared between training methods. \ No newline at end of file From 884fc8c7f5c1b1c4ed6c23cd9cd392e872015b8f Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 4 Sep 2025 18:40:21 +0900 Subject: [PATCH 539/582] doc: remove SD3/FLUX.1 training guide --- README.md | 751 ++---------------------------------------------------- 1 file changed, 15 insertions(+), 736 deletions(-) diff --git a/README.md b/README.md index 2ed53d5af..843cf71b9 100644 --- a/README.md +++ b/README.md @@ -18,76 +18,27 @@ If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed ### Recent Updates +Sep 4, 2025: +- The information about FLUX.1 and SD3/SD3.5 training that was described in the README has been organized and divided into the following documents: + - [LoRA Training Overview](./docs/train_network.md) + - [SDXL Training](./docs/sdxl_train_network.md) + - [Advanced Training](./docs/train_network_advanced.md) + - [FLUX.1 Training](./docs/flux_train_network.md) + - [SD3 Training](./docs/sd3_train_network.md) + - [LUMINA Training](./docs/lumina_train_network.md) + - [Validation](./docs/validation.md) + - [Fine-tuning](./docs/fine_tune.md) + - [Textual Inversion Training](./docs/train_textual_inversion.md) + Aug 28, 2025: - In order to support the latest GPUs and features, we have updated the **PyTorch and library versions**. PR [#2178](https://github.com/kohya-ss/sd-scripts/pull/2178) There are many changes, so please let us know if you encounter any issues. - The PyTorch version used for testing has been updated to 2.6.0. We have confirmed that it works with PyTorch 2.6.0 and later. - The `requirements.txt` has been updated, so please update your dependencies. - - You can update the dependencies with `pip install -r requirements.txt`. - - The version specification for `bitsandbytes` has been removed. If you encounter errors on RTX 50 series GPUs, please update it with `pip install -U bitsandbytes`. + - You can update the dependencies with `pip install -r requirements.txt`. + - The version specification for `bitsandbytes` has been removed. If you encounter errors on RTX 50 series GPUs, please update it with `pip install -U bitsandbytes`. - We have modified each script to minimize warnings as much as possible. - - The modified scripts will work in the old environment (library versions), but please update them when convenient. - -Jul 30, 2025: -- **Breaking Change**: For FLUX.1 and Chroma training, the CFG (Classifier-Free Guidance, using negative prompts) scale option for sample image generation during training has been changed from `--g` to `--l`. The `--g` option is now used for the embedded guidance scale. Please update your prompts accordingly. See [Sample Image Generation During Training](#sample-image-generation-during-training) for details. - -- Support for [Chroma](https://huggingface.co/lodestones/Chroma) has been added in PR [#2157](https://github.com/kohya-ss/sd-scripts/pull/2157). Thank you to lodestones for the high-quality model. - - Chroma is a new model based on FLUX.1 schnell. In this repository, `flux_train_network.py` is used for training LoRAs for Chroma with `--model_type chroma`. `--apply_t5_attn_mask` is also needed for Chroma training. - - Please refer to the [FLUX.1 LoRA training documentation](./docs/flux_train_network.md) for more details. 
- -Jul 21, 2025: -- Support for [Lumina-Image 2.0](https://github.com/Alpha-VLLM/Lumina-Image-2.0) has been added in PR [#1927](https://github.com/kohya-ss/sd-scripts/pull/1927) and [#2138](https://github.com/kohya-ss/sd-scripts/pull/2138). Special thanks to sdbds and RockerBOO for their contributions. - - Please refer to the [Lumina-Image 2.0 documentation](./docs/lumina_train_network.md) for more details. -- We have started adding comprehensive training-related documentation to [docs](./docs). These documents are being created with the help of generative AI and will be updated over time. While there are still many gaps at this stage, we plan to improve them gradually. - - Currently, the following documents are available: - - train_network.md - - sdxl_train_network.md - - train_network_advanced.md - - flux_train_network.md - - sd3_train_network.md - - lumina_train_network.md - - validation.md - -Jul 10, 2025: -- [AI Coding Agents](#for-developers-using-ai-coding-agents) section is added to the README. This section provides instructions for developers using AI coding agents like Claude and Gemini to understand the project context and coding standards. - -May 1, 2025: -- The error when training FLUX.1 with mixed precision in flux_train.py with DeepSpeed enabled has been resolved. Thanks to sharlynxy for PR [#2060](https://github.com/kohya-ss/sd-scripts/pull/2060). Please refer to the PR for details. - - If you enable DeepSpeed, please install DeepSpeed with `pip install deepspeed==0.16.7`. - -Apr 27, 2025: -- FLUX.1 training now supports CFG scale in the sample generation during training. Please use `--g` option, to specify the CFG scale (note that `--l` is used as the embedded guidance scale.) PR [#2064](https://github.com/kohya-ss/sd-scripts/pull/2064). - - See [here](#sample-image-generation-during-training) for details. - - If you have any issues with this, please let us know. - -Apr 6, 2025: -- IP noise gamma has been enabled in FLUX.1. Thanks to rockerBOO for PR [#1992](https://github.com/kohya-ss/sd-scripts/pull/1992). See the PR for details. - - `--ip_noise_gamma` and `--ip_noise_gamma_random_strength` are available. - -Mar 30, 2025: -- LoRA-GGPO is added for FLUX.1 LoRA training. Thank you to rockerBOO for PR [#1974](https://github.com/kohya-ss/sd-scripts/pull/1974). - - Specify `--network_args ggpo_sigma=0.03 ggpo_beta=0.01` in the command line or `network_args = ["ggpo_sigma=0.03", "ggpo_beta=0.01"]` in .toml file. See PR for details. -- The interpolation method for resizing the original image to the training size can now be specified. Thank you to rockerBOO for PR [#1936](https://github.com/kohya-ss/sd-scripts/pull/1936). - -Mar 20, 2025: -- `pytorch-optimizer` is added to requirements.txt. Thank you to gesen2egee for PR [#1985](https://github.com/kohya-ss/sd-scripts/pull/1985). - - For example, you can use CAME optimizer with `--optimizer_type "pytorch_optimizer.CAME" --optimizer_args "weight_decay=0.01"`. - -Mar 6, 2025: - -- Added a utility script to merge the weights of SD3's DiT, VAE (optional), CLIP-L, CLIP-G, and T5XXL into a single .safetensors file. Run `tools/merge_sd3_safetensors.py`. See `--help` for usage. PR [#1960](https://github.com/kohya-ss/sd-scripts/pull/1960) - -Feb 26, 2025: - -- Improve the validation loss calculation in `train_network.py`, `sdxl_train_network.py`, `flux_train_network.py`, and `sd3_train_network.py`. 
PR [#1903](https://github.com/kohya-ss/sd-scripts/pull/1903) - - The validation loss uses the fixed timestep sampling and the fixed random seed. This is to ensure that the validation loss is not fluctuated by the random values. - -Jan 25, 2025: + - The modified scripts will work in the old environment (library versions), but please update them when convenient. -- `train_network.py`, `sdxl_train_network.py`, `flux_train_network.py`, and `sd3_train_network.py` now support validation loss. PR [#1864](https://github.com/kohya-ss/sd-scripts/pull/1864) Thank you to rockerBOO! - - For details on how to set it up, please refer to the PR. The documentation will be updated as needed. - - It will be added to other scripts as well. - - As a current limitation, validation loss is not supported when `--block_to_swap` is specified, or when schedule-free optimizer is used. ## For Developers Using AI Coding Agents @@ -114,678 +65,6 @@ To use them, you need to opt-in by creating your own configuration file in the p This approach ensures that you have full control over the instructions given to your agent while benefiting from the shared project context. Your `CLAUDE.md` and `GEMINI.md` are already listed in `.gitignore`, so it won't be committed to the repository. -## FLUX.1 training - -- [FLUX.1 LoRA training](#flux1-lora-training) - - [Key Options for FLUX.1 LoRA training](#key-options-for-flux1-lora-training) - - [Distribution of timesteps](#distribution-of-timesteps) - - [Key Features for FLUX.1 LoRA training](#key-features-for-flux1-lora-training) - - [Specify rank for each layer in FLUX.1](#specify-rank-for-each-layer-in-flux1) - - [Specify blocks to train in FLUX.1 LoRA training](#specify-blocks-to-train-in-flux1-lora-training) -- [FLUX.1 ControlNet training](#flux1-controlnet-training) -- [FLUX.1 OFT training](#flux1-oft-training) -- [Inference for FLUX.1 with LoRA model](#inference-for-flux1-with-lora-model) -- [FLUX.1 fine-tuning](#flux1-fine-tuning) - - [Key Features for FLUX.1 fine-tuning](#key-features-for-flux1-fine-tuning) -- [Extract LoRA from FLUX.1 Models](#extract-lora-from-flux1-models) -- [Convert FLUX LoRA](#convert-flux-lora) -- [Merge LoRA to FLUX.1 checkpoint](#merge-lora-to-flux1-checkpoint) -- [FLUX.1 Multi-resolution training](#flux1-multi-resolution-training) -- [Convert Diffusers to FLUX.1](#convert-diffusers-to-flux1) - -### FLUX.1 LoRA training - -We have added a new training script for LoRA training. The script is `flux_train_network.py`. See `--help` for options. - -FLUX.1 model, CLIP-L, and T5XXL models are recommended to be in bf16/fp16 format. If you specify `--fp8_base`, you can use fp8 models for FLUX.1. The fp8 model is only compatible with `float8_e4m3fn` format. - -Sample command is below. It will work with 24GB VRAM GPUs. 
- -``` -accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_network.py ---pretrained_model_name_or_path flux1-dev.safetensors --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors ---ae ae.safetensors --cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers ---max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 ---network_module networks.lora_flux --network_dim 4 --network_train_unet_only ---optimizer_type adamw8bit --learning_rate 1e-4 ---cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base ---highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml ---output_dir path/to/output/dir --output_name flux-lora-name ---timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 -``` -(The command is multi-line for readability. Please combine it into one line.) - -We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. - -The trained LoRA model can be used with ComfyUI. - -When training LoRA for Text Encoder (without `--network_train_unet_only`), more VRAM is required. Please refer to the settings below to reduce VRAM usage. - -__Options for GPUs with less VRAM:__ - -By specifying `--blocks_to_swap`, you can save VRAM by swapping some blocks between CPU and GPU. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. - -Specify a number like `--blocks_to_swap 10`. A larger number will swap more blocks, saving more VRAM, but training will be slower. In FLUX.1, you can swap up to 35 blocks. - -`--cpu_offload_checkpointing` offloads gradient checkpointing to CPU. This reduces up to 1GB of VRAM usage but slows down the training by about 15%. Cannot be used with `--blocks_to_swap`. - -Adafactor optimizer may reduce the VRAM usage than 8bit AdamW. Please use settings like below: - -``` ---optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 -``` - -The training can be done with 16GB VRAM GPUs with the batch size of 1. Please change your dataset configuration. - -The training can be done with 12GB VRAM GPUs with `--blocks_to_swap 16` with 8bit AdamW. Please use settings like below: - -``` ---blocks_to_swap 16 -``` - -For GPUs with less than 10GB of VRAM, it is recommended to use an fp8 checkpoint for T5XXL. You can download `t5xxl_fp8_e4m3fn.safetensors` from [comfyanonymous/flux_text_encoders](https://huggingface.co/comfyanonymous/flux_text_encoders) (please use without `scaled`). - -10GB VRAM GPUs will work with 22 blocks swapped, and 8GB VRAM GPUs will work with 28 blocks swapped. - -__`--split_mode` is deprecated. This option is still available, but they will be removed in the future. Please use `--blocks_to_swap` instead. If this option is specified and `--blocks_to_swap` is not specified, `--blocks_to_swap 18` is automatically enabled.__ - -#### Key Options for FLUX.1 LoRA training - -There are many unknown points in FLUX.1 training, so some settings can be specified by arguments. Here are the arguments. The arguments and sample settings are still experimental and may change in the future. Feedback on the settings is welcome. - -- `--pretrained_model_name_or_path` is the path to the pretrained model (FLUX.1). bf16 (original BFL model) is recommended (`flux1-dev.safetensors` or `flux1-dev.sft`). 
If you specify `--fp8_base`, you can use fp8 models for FLUX.1. The fp8 model is only compatible with `float8_e4m3fn` format. -- `--clip_l` is the path to the CLIP-L model. -- `--t5xxl` is the path to the T5XXL model. If you specify `--fp8_base`, you can use fp8 (float8_e4m3fn) models for T5XXL. However, it is recommended to use fp16 models for caching. -- `--ae` is the path to the autoencoder model (`ae.safetensors` or `ae.sft`). - -- `--timestep_sampling` is the method to sample timesteps (0-1): - - `sigma`: sigma-based, same as SD3 - - `uniform`: uniform random - - `sigmoid`: sigmoid of random normal, same as x-flux, AI-toolkit etc. - - `shift`: shifts the value of sigmoid of normal distribution random number - - `flux_shift`: shifts the value of sigmoid of normal distribution random number, depending on the resolution (same as FLUX.1 dev inference). `--discrete_flow_shift` is ignored when `flux_shift` is specified. -- `--sigmoid_scale` is the scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). The default is 1.0. Larger values will make the sampling more uniform. - - This option is effective even when`--timestep_sampling shift` is specified. - - Normally, leave it at 1.0. Larger values make the value before shift closer to a uniform distribution. -- `--model_prediction_type` is how to interpret and process the model prediction: - - `raw`: use as is, same as x-flux - - `additive`: add to noisy input - - `sigma_scaled`: apply sigma scaling, same as SD3 -- `--discrete_flow_shift` is the discrete flow shift for the Euler Discrete Scheduler, default is 3.0 (same as SD3). -- `--blocks_to_swap`. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details. - -The existing `--loss_type` option may be useful for FLUX.1 training. The default is `l2`. - -~~In our experiments, `--timestep_sampling sigma --model_prediction_type raw --discrete_flow_shift 1.0` with `--loss_type l2` seems to work better than the default (SD3) settings. The multiplier of LoRA should be adjusted.~~ - -In our experiments, `--timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0` (with the default `l2` loss_type) seems to work better. - -The settings in [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit) seems to be equivalent to `--timestep_sampling sigmoid --model_prediction_type raw --guidance_scale 1.0` (with the default `l2` loss_type). - -Other settings may work better, so please try different settings. - -Other options are described below. - -#### Distribution of timesteps - -`--timestep_sampling` and `--sigmoid_scale`, `--discrete_flow_shift` adjust the distribution of timesteps. The distribution is shown in the figures below. 
- -The effect of `--discrete_flow_shift` with `--timestep_sampling shift` (when `--sigmoid_scale` is not specified, the default is 1.0): -![Figure_2](https://github.com/user-attachments/assets/d9de42f9-f17d-40da-b88d-d964402569c6) - -The difference between `--timestep_sampling sigmoid` and `--timestep_sampling uniform` (when `--timestep_sampling sigmoid` or `uniform` is specified, `--discrete_flow_shift` is ignored): -![Figure_3](https://github.com/user-attachments/assets/27029009-1f5d-4dc0-bb24-13d02ac4fdad) - -The effect of `--timestep_sampling sigmoid` and `--sigmoid_scale` (when `--timestep_sampling sigmoid` is specified, `--discrete_flow_shift` is ignored): -![Figure_4](https://github.com/user-attachments/assets/08a2267c-e47e-48b7-826e-f9a080787cdc) - -#### Key Features for FLUX.1 LoRA training - -1. CLIP-L and T5XXL LoRA Support: - - FLUX.1 LoRA training now supports CLIP-L and T5XXL LoRA training. - - Remove `--network_train_unet_only` from your command. - - Add `train_t5xxl=True` to `--network_args` to train T5XXL LoRA. CLIP-L is also trained at the same time. - - T5XXL output can be cached for CLIP-L LoRA training. So, `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. - - The learning rates for CLIP-L and T5XXL can be specified separately. Multiple numbers can be specified in `--text_encoder_lr`. For example, `--text_encoder_lr 1e-4 1e-5`. The first value is the learning rate for CLIP-L, and the second value is for T5XXL. If you specify only one, the learning rates for CLIP-L and T5XXL will be the same. If `--text_encoder_lr` is not specified, the default learning rate `--learning_rate` is used for both CLIP-L and T5XXL. - - The trained LoRA can be used with ComfyUI. - - Note: `flux_extract_lora.py`, `convert_flux_lora.py`and `merge_flux_lora.py` do not support CLIP-L and T5XXL LoRA yet. - - | trained LoRA|option|network_args|cache_text_encoder_outputs (*1)| - |---|---|---|---| - |FLUX.1|`--network_train_unet_only`|-|o| - |FLUX.1 + CLIP-L|-|-|o (*2)| - |FLUX.1 + CLIP-L + T5XXL|-|`train_t5xxl=True`|-| - |CLIP-L (*3)|`--network_train_text_encoder_only`|-|o (*2)| - |CLIP-L + T5XXL (*3)|`--network_train_text_encoder_only`|`train_t5xxl=True`|-| - - - *1: `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. - - *2: T5XXL output can be cached for CLIP-L LoRA training. - - *3: Not tested yet. - -2. Experimental FP8/FP16 mixed training: - - `--fp8_base_unet` enables training with fp8 for FLUX and bf16/fp16 for CLIP-L/T5XXL. - - FLUX can be trained with fp8, and CLIP-L/T5XXL can be trained with bf16/fp16. - - When specifying this option, the `--fp8_base` option is automatically enabled. - -3. Split Q/K/V Projection Layers (Experimental): - - Added an option to split the projection layers of q/k/v/txt in the attention and apply LoRA to each of them. - - Specify `"split_qkv=True"` in network_args like `--network_args "split_qkv=True"` (`train_blocks` is also available). - - May increase expressiveness but also training time. - - The trained model is compatible with normal LoRA models in sd-scripts and can be used in environments like ComfyUI. - - Converting to AI-toolkit (Diffusers) format with `convert_flux_lora.py` will reduce the size. - -4. T5 Attention Mask Application: - - T5 attention mask is applied when `--apply_t5_attn_mask` is specified. 
- - Now applies mask when encoding T5 and in the attention of Double and Single Blocks - - Affects fine-tuning, LoRA training, and inference in `flux_minimal_inference.py`. - -5. Multi-resolution Training Support: - - FLUX.1 now supports multi-resolution training, even with caching latents to disk. - - -Technical details of Q/K/V split: - -In the implementation of Black Forest Labs' model, the projection layers of q/k/v (and txt in single blocks) are concatenated into one. If LoRA is added there as it is, the LoRA module is only one, and the dimension is large. In contrast, in the implementation of Diffusers, the projection layers of q/k/v/txt are separated. Therefore, the LoRA module is applied to q/k/v/txt separately, and the dimension is smaller. This option is for training LoRA similar to the latter. - -The compatibility of the saved model (state dict) is ensured by concatenating the weights of multiple LoRAs. However, since there are zero weights in some parts, the model size will be large. - -#### Specify rank for each layer in FLUX.1 - -You can specify the rank for each layer in FLUX.1 by specifying the following network_args. If you specify `0`, LoRA will not be applied to that layer. - -When network_args is not specified, the default value (`network_dim`) is applied, same as before. - -|network_args|target layer| -|---|---| -|img_attn_dim|img_attn in DoubleStreamBlock| -|txt_attn_dim|txt_attn in DoubleStreamBlock| -|img_mlp_dim|img_mlp in DoubleStreamBlock| -|txt_mlp_dim|txt_mlp in DoubleStreamBlock| -|img_mod_dim|img_mod in DoubleStreamBlock| -|txt_mod_dim|txt_mod in DoubleStreamBlock| -|single_dim|linear1 and linear2 in SingleStreamBlock| -|single_mod_dim|modulation in SingleStreamBlock| - -`"verbose=True"` is also available for debugging. It shows the rank of each layer. - -example: -``` ---network_args "img_attn_dim=4" "img_mlp_dim=8" "txt_attn_dim=2" "txt_mlp_dim=2" -"img_mod_dim=2" "txt_mod_dim=2" "single_dim=4" "single_mod_dim=2" "verbose=True" -``` - -You can apply LoRA to the conditioning layers of Flux by specifying `in_dims` in network_args. When specifying, be sure to specify 5 numbers in `[]` as a comma-separated list. - -example: -``` ---network_args "in_dims=[4,2,2,2,4]" -``` - -Each number corresponds to `img_in`, `time_in`, `vector_in`, `guidance_in`, `txt_in`. The above example applies LoRA to all conditioning layers, with rank 4 for `img_in`, 2 for `time_in`, `vector_in`, `guidance_in`, and 4 for `txt_in`. - -If you specify `0`, LoRA will not be applied to that layer. For example, `[4,0,0,0,4]` applies LoRA only to `img_in` and `txt_in`. - -#### Specify blocks to train in FLUX.1 LoRA training - -You can specify the blocks to train in FLUX.1 LoRA training by specifying `train_double_block_indices` and `train_single_block_indices` in network_args. The indices are 0-based. The default (when omitted) is to train all blocks. The indices are specified as a list of integers or a range of integers, like `0,1,5,8` or `0,1,4-5,7`. The number of double blocks is 19, and the number of single blocks is 38, so the valid range is 0-18 and 0-37, respectively. `all` is also available to train all blocks, `none` is also available to train no blocks. 
- -example: -``` ---network_args "train_double_block_indices=0,1,8-12,18" "train_single_block_indices=3,10,20-25,37" -``` - -``` ---network_args "train_double_block_indices=none" "train_single_block_indices=10-15" -``` - -If you specify one of `train_double_block_indices` or `train_single_block_indices`, the other will be trained as usual. - -### FLUX.1 ControlNet training -We have added a new training script for ControlNet training. The script is flux_train_control_net.py. See --help for options. - -Sample command is below. It will work with 80GB VRAM GPUs. -``` -accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train_control_net.py ---pretrained_model_name_or_path flux1-dev.safetensors --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors ---ae ae.safetensors --save_model_as safetensors --sdpa --persistent_data_loader_workers ---max_data_loader_n_workers 1 --seed 42 --gradient_checkpointing --mixed_precision bf16 ---optimizer_type adamw8bit --learning_rate 2e-5 ---highvram --max_train_epochs 1 --save_every_n_steps 1000 --dataset_config dataset.toml ---output_dir /path/to/output/dir --output_name flux-cn ---timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 --deepspeed -``` - -For 24GB VRAM GPUs, you can train with 16 blocks swapped and caching latents and text encoder outputs with the batch size of 1. Remove `--deepspeed` . Sample command is below. Not fully tested. -``` - --blocks_to_swap 16 --cache_latents_to_disk --cache_text_encoder_outputs_to_disk -``` - -The training can be done with 16GB VRAM GPUs with around 30 blocks swapped. - -`--gradient_accumulation_steps` is also available. The default value is 1 (no accumulation), but according to the original PR, 8 is used. - -### FLUX.1 OFT training - -You can train OFT with almost the same options as LoRA, such as `--timestamp_sampling`. The following points are different. - -- Change `--network_module` from `networks.lora_flux` to `networks.oft_flux`. -- `--network_dim` is the number of OFT blocks. Unlike LoRA rank, the smaller the dim, the larger the model. We recommend about 64 or 128. Please make the output dimension of the target layer of OFT divisible by the value of `--network_dim` (an error will occur if it is not divisible). Valid values are 64, 128, 256, 512, 1024, etc. -- `--network_alpha` is treated as a constraint for OFT. We recommend about 1e-2 to 1e-4. The default value when omitted is 1, which is too large, so be sure to specify it. -- CLIP/T5XXL is not supported. Specify `--network_train_unet_only`. -- `--network_args` specifies the hyperparameters of OFT. The following are valid: - - Specify `enable_all_linear=True` to target all linear connections in the MLP layer. The default is False, which targets only attention. - -Currently, there is no environment to infer FLUX.1 OFT. Inference is only possible with `flux_minimal_inference.py` (specify OFT model with `--lora`). - -Sample command is below. It will work with 24GB VRAM GPUs with the batch size of 1. - -``` ---network_module networks.oft_flux --network_dim 128 --network_alpha 1e-3 ---network_args "enable_all_linear=True" --learning_rate 1e-5 -``` - -The training can be done with 16GB VRAM GPUs without `--enable_all_linear` option and with Adafactor optimizer. - -### Inference for FLUX.1 with LoRA model - -The inference script is also available. The script is `flux_minimal_inference.py`. See `--help` for options. 
- -``` -python flux_minimal_inference.py --ckpt flux1-dev.safetensors --clip_l sd3/clip_l.safetensors --t5xxl sd3/t5xxl_fp16.safetensors --ae ae.safetensors --dtype bf16 --prompt "a cat holding a sign that says hello world" --out path/to/output/dir --seed 1 --flux_dtype fp8 --offload --lora lora-flux-name.safetensors;1.0 -``` - -### FLUX.1 fine-tuning - -The memory-efficient training with block swap is based on 2kpr's implementation. Thanks to 2kpr! - -__`--double_blocks_to_swap` and `--single_blocks_to_swap` are deprecated. These options is still available, but they will be removed in the future. Please use `--blocks_to_swap` instead. These options are equivalent to specifying `double_blocks_to_swap + single_blocks_to_swap // 2` in `--blocks_to_swap`.__ - -Sample command for FLUX.1 fine-tuning is below. This will work with 24GB VRAM GPUs, and 64GB main memory is recommended. - -``` -accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 flux_train.py ---pretrained_model_name_or_path flux1-dev.safetensors --clip_l clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --ae ae_dev.safetensors ---save_model_as safetensors --sdpa --persistent_data_loader_workers --max_data_loader_n_workers 2 ---seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 ---dataset_config dataset_1024_bs1.toml --output_dir path/to/output/dir --output_name output-name ---learning_rate 5e-5 --max_train_epochs 4 --sdpa --highvram --cache_text_encoder_outputs_to_disk --cache_latents_to_disk --save_every_n_epochs 1 ---optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" ---lr_scheduler constant_with_warmup --max_grad_norm 0.0 ---timestep_sampling shift --discrete_flow_shift 3.1582 --model_prediction_type raw --guidance_scale 1.0 ---fused_backward_pass --blocks_to_swap 8 --full_bf16 -``` -(The command is multi-line for readability. Please combine it into one line.) - -Options are almost the same as LoRA training. The difference is `--full_bf16`, `--fused_backward_pass` and `--blocks_to_swap`. `--cpu_offload_checkpointing` is also available. - -`--full_bf16` enables the training with bf16 (weights and gradients). - -`--fused_backward_pass` enables the fusing of the optimizer step into the backward pass for each parameter. This reduces the memory usage during training. Only Adafactor optimizer is supported for now. Stochastic rounding is also enabled when `--fused_backward_pass` and `--full_bf16` are specified. - -`--blockwise_fused_optimizers` enables the fusing of the optimizer step into the backward pass for each block. This is similar to `--fused_backward_pass`. Any optimizer can be used, but Adafactor is recommended for memory efficiency and stochastic rounding. `--blockwise_fused_optimizers` cannot be used with `--fused_backward_pass`. Stochastic rounding is not supported for now. - -`--blocks_to_swap` is the number of blocks to swap. The default is None (no swap). The maximum value is 35. - -`--cpu_offload_checkpointing` is to offload the gradient checkpointing to CPU. This reduces about 2GB of VRAM usage. This option cannot be used with `--blocks_to_swap`. - -All these options are experimental and may change in the future. - -The increasing the number of blocks to swap may reduce the memory usage, but the training speed will be slower. `--cpu_offload_checkpointing` also slows down the training. - -Swap 8 blocks without cpu offload checkpointing may be a good starting point for 24GB VRAM GPUs. 
Please try different settings according to VRAM usage and training speed. - -The learning rate and the number of epochs are not optimized yet. Please adjust them according to the training results. - -#### How to use block swap - -There are two possible ways to use block swap. It is unknown which is better. - -1. Swap the minimum number of blocks that fit in VRAM with batch size 1 and shorten the training speed of one step. - - The above command example is for this usage. - -2. Swap many blocks to increase the batch size and shorten the training speed per data. - - For example, swapping 35 blocks seems to increase the batch size to about 5. In this case, the training speed per data will be relatively faster than 1. - -#### Training with <24GB VRAM GPUs - -Swap 28 blocks without cpu offload checkpointing may be working with 12GB VRAM GPUs. Please try different settings according to VRAM size of your GPU. - -T5XXL requires about 10GB of VRAM, so 10GB of VRAM will be minimum requirement for FLUX.1 fine-tuning. - -#### Key Features for FLUX.1 fine-tuning - -1. Technical details of block swap: - - Reduce memory usage by transferring double and single blocks of FLUX.1 from GPU to CPU when they are not needed. - - During forward pass, the weights of the blocks that have finished calculation are transferred to CPU, and the weights of the blocks to be calculated are transferred to GPU. - - The same is true for the backward pass, but the order is reversed. The gradients remain on the GPU. - - Since the transfer between CPU and GPU takes time, the training will be slower. - - `--blocks_to_swap` specify the number of blocks to swap. - - About 640MB of memory can be saved per block. - - (Update 1: Nov 12, 2024) - - The maximum number of blocks that can be swapped is 35. - - We are exchanging only the data of the weights (weight.data) in reference to the implementation of OneTrainer (thanks to OneTrainer). However, the mechanism of the exchange is a custom implementation. - - Since it takes time to free CUDA memory (torch.cuda.empty_cache()), we reuse the CUDA memory allocated to weight.data as it is and exchange the weights between modules. - - This shortens the time it takes to exchange weights between modules. - - Since the weights must be almost identical to be exchanged, FLUX.1 exchanges the weights between double blocks and single blocks. - - In SD3, all blocks are similar, but some weights are different, so there are weights that always remain on the GPU. - -2. Sample Image Generation: - - Sample image generation during training is now supported. - - The prompts are cached and used for generation if `--cache_latents` is specified. So changing the prompts during training will not affect the generated images. - - Specify options such as `--sample_prompts` and `--sample_every_n_epochs`. - - Note: It will be very slow when `--blocks_to_swap` is specified. - -3. Experimental Memory-Efficient Saving: - - `--mem_eff_save` option can further reduce memory consumption during model saving (about 22GB). - - This is a custom implementation and may cause unexpected issues. Use with caution. - -4. T5XXL Token Length Control: - - Added `--t5xxl_max_token_length` option to specify the maximum token length of T5XXL. - - Default is 512 in dev and 256 in schnell models. - -5. Multi-GPU Training Support: - - Note: `--double_blocks_to_swap` and `--single_blocks_to_swap` cannot be used in multi-GPU training. - -6. 
Disable mmap Load for Safetensors: - - `--disable_mmap_load_safetensors` option now works in `flux_train.py`. - - Speeds up model loading during training in WSL2. - - Effective in reducing memory usage when loading models during multi-GPU training. - - -### Extract LoRA from FLUX.1 Models - -Script: `networks/flux_extract_lora.py` - -Extracts LoRA from the difference between two FLUX.1 models. - -Offers memory-efficient option with `--mem_eff_safe_open`. - -CLIP-L LoRA is not supported. - -### Convert FLUX LoRA - -Script: `convert_flux_lora.py` - -Converts LoRA between sd-scripts format (BFL-based) and AI-toolkit format (Diffusers-based). - -If you use LoRA in the inference environment, converting it to AI-toolkit format may reduce temporary memory usage. - -Note that re-conversion will increase the size of LoRA. - -CLIP-L/T5XXL LoRA is not supported. - -### Merge LoRA to FLUX.1 checkpoint - -`networks/flux_merge_lora.py` merges LoRA to FLUX.1 checkpoint, CLIP-L or T5XXL models. __The script is experimental.__ - -``` -python networks/flux_merge_lora.py --flux_model flux1-dev.safetensors --save_to output.safetensors --models lora1.safetensors --ratios 2.0 --save_precision fp16 --loading_device cuda --working_device cpu -``` - -You can also merge multiple LoRA models into a FLUX.1 model. Specify multiple LoRA models in `--models`. Specify the same number of ratios in `--ratios`. - -CLIP-L and T5XXL LoRA are supported. `--clip_l` and `--clip_l_save_to` are for CLIP-L, `--t5xxl` and `--t5xxl_save_to` are for T5XXL. Sample command is below. - -``` ---clip_l clip_l.safetensors --clip_l_save_to merged_clip_l.safetensors --t5xxl t5xxl_fp16.safetensors --t5xxl_save_to merged_t5xxl.safetensors -``` - -FLUX.1, CLIP-L, and T5XXL can be merged together or separately for memory efficiency. - -An experimental option `--mem_eff_load_save` is available. This option is for memory-efficient loading and saving. It may also speed up loading and saving. - -`--loading_device` is the device to load the LoRA models. `--working_device` is the device to merge (calculate) the models. Default is `cpu` for both. Loading / working device examples are below (in the case of `--save_precision fp16` or `--save_precision bf16`, `float32` will consume more memory): - -- 'cpu' / 'cpu': Uses >50GB of RAM, but works on any machine. -- 'cuda' / 'cpu': Uses 24GB of VRAM, but requires 30GB of RAM. -- 'cpu' / 'cuda': Uses 4GB of VRAM, but requires 50GB of RAM, faster than 'cpu' / 'cpu' or 'cuda' / 'cpu'. -- 'cuda' / 'cuda': Uses 30GB of VRAM, but requires 30GB of RAM, faster than 'cpu' / 'cpu' or 'cuda' / 'cpu'. - -`--save_precision` is the precision to save the merged model. In the case of LoRA models are trained with `bf16`, we are not sure which is better, `fp16` or `bf16` for `--save_precision`. - -The script can merge multiple LoRA models. If you want to merge multiple LoRA models, specify `--concat` option to work the merged LoRA model properly. - -### FLUX.1 Multi-resolution training - -You can define multiple resolutions in the dataset configuration file. - -The dataset configuration file is like below. You can define multiple resolutions with different batch sizes. The resolutions are defined in the `[[datasets]]` section. The `[[datasets.subsets]]` section is for the dataset directory. Please specify the same directory for each resolution. 
- -``` -[general] -# define common settings here -flip_aug = true -color_aug = false -keep_tokens_separator= "|||" -shuffle_caption = false -caption_tag_dropout_rate = 0 -caption_extension = ".txt" - -[[datasets]] -# define the first resolution here -batch_size = 2 -enable_bucket = true -resolution = [1024, 1024] - - [[datasets.subsets]] - image_dir = "path/to/image/dir" - num_repeats = 1 - -[[datasets]] -# define the second resolution here -batch_size = 3 -enable_bucket = true -resolution = [768, 768] - - [[datasets.subsets]] - image_dir = "path/to/image/dir" - num_repeats = 1 - -[[datasets]] -# define the third resolution here -batch_size = 4 -enable_bucket = true -resolution = [512, 512] - - [[datasets.subsets]] - image_dir = "path/to/image/dir" - num_repeats = 1 -``` - -### Convert Diffusers to FLUX.1 - -Script: `convert_diffusers_to_flux1.py` - -Converts Diffusers models to FLUX.1 models. The script is experimental. See `--help` for options. schnell and dev models are supported. AE/CLIP/T5XXL are not supported. The diffusers folder is a parent folder of `rmer` folder. - -``` -python tools/convert_diffusers_to_flux.py --diffusers_path path/to/diffusers_folder_or_00001_safetensors --save_to path/to/flux1.safetensors --mem_eff_load_save --save_precision bf16 -``` - -## SD3 training - -SD3.5L/M training is now available. - -### SD3 LoRA training - -The script is `sd3_train_network.py`. See `--help` for options. - -SD3 model, CLIP-L, CLIP-G, and T5XXL models are recommended to be in float/fp16 format. If you specify `--fp8_base`, you can use fp8 models for SD3. The fp8 model is only compatible with `float8_e4m3fn` format. - -Sample command is below. It will work with 16GB VRAM GPUs (SD3.5L). - -``` -accelerate launch --mixed_precision bf16 --num_cpu_threads_per_process 1 sd3_train_network.py ---pretrained_model_name_or_path path/to/sd3.5_large.safetensors --clip_l sd3/clip_l.safetensors --clip_g sd3/clip_g.safetensors --t5xxl sd3/t5xxl_fp16.safetensors ---cache_latents_to_disk --save_model_as safetensors --sdpa --persistent_data_loader_workers ---max_data_loader_n_workers 2 --seed 42 --gradient_checkpointing --mixed_precision bf16 --save_precision bf16 ---network_module networks.lora_sd3 --network_dim 4 --network_train_unet_only ---optimizer_type adamw8bit --learning_rate 1e-4 ---cache_text_encoder_outputs --cache_text_encoder_outputs_to_disk --fp8_base ---highvram --max_train_epochs 4 --save_every_n_epochs 1 --dataset_config dataset_1024_bs2.toml ---output_dir path/to/output/dir --output_name sd3-lora-name -``` -(The command is multi-line for readability. Please combine it into one line.) - -Like FLUX.1 training, the `--blocks_to_swap` option for memory reduction is available. The maximum number of blocks that can be swapped is 36 for SD3.5L and 22 for SD3.5M. - -Adafactor optimizer is also available. - -`--cpu_offload_checkpointing` option is not available. - -We also not sure how many epochs are needed for convergence, and how the learning rate should be adjusted. - -The trained LoRA model can be used with ComfyUI. - -#### Key Options for SD3 LoRA training - -Here are the arguments. The arguments and sample settings are still experimental and may change in the future. Feedback on the settings is welcome. - -- `--network_module` is the module for LoRA training. Specify `networks.lora_sd3` for SD3 LoRA training. -- `--pretrained_model_name_or_path` is the path to the pretrained model (SD3/3.5). If you specify `--fp8_base`, you can use fp8 models for SD3/3.5. 
The fp8 model is only compatible with `float8_e4m3fn` format. -- `--clip_l` is the path to the CLIP-L model. -- `--clip_g` is the path to the CLIP-G model. -- `--t5xxl` is the path to the T5XXL model. If you specify `--fp8_base`, you can use fp8 (float8_e4m3fn) models for T5XXL. However, it is recommended to use fp16 models for caching. -- `--vae` is the path to the autoencoder model. __This option is not necessary for SD3.__ VAE is included in the standard SD3 model. -- `--disable_mmap_load_safetensors` is to disable memory mapping when loading safetensors. __This option significantly reduces the memory usage when loading models for Windows users.__ -- `--clip_l_dropout_rate`, `--clip_g_dropout_rate` and `--t5_dropout_rate` are the dropout rates for the embeddings of CLIP-L, CLIP-G, and T5XXL, described in [SAI research papre](http://arxiv.org/pdf/2403.03206). The default is 0.0. For LoRA training, it is seems to be better to set 0.0. -- `--pos_emb_random_crop_rate` is the rate of random cropping of positional embeddings, described in [SD3.5M model card](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). The default is 0. It is seems to be better to set 0.0 for LoRA training. -- `--enable_scaled_pos_embed` is to enable the scaled positional embeddings. The default is False. This option is an experimental feature for SD3.5M. Details are described below. -- `--training_shift` is the shift value for the training distribution of timesteps. The default is 1.0 (uniform distribution, no shift). If less than 1.0, the side closer to the image is more sampled, and if more than 1.0, the side closer to noise is more sampled. - -Other options are described below. - -#### Key Features for SD3 LoRA training - -1. CLIP-L, G and T5XXL LoRA Support: - - SD3 LoRA training now supports CLIP-L, CLIP-G and T5XXL LoRA training. - - Remove `--network_train_unet_only` from your command. - - Add `train_t5xxl=True` to `--network_args` to train T5XXL LoRA. CLIP-L and G is also trained at the same time. - - T5XXL output can be cached for CLIP-L and G LoRA training. So, `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. - - The learning rates for CLIP-L, CLIP-G and T5XXL can be specified separately. Multiple numbers can be specified in `--text_encoder_lr`. For example, `--text_encoder_lr 1e-4 1e-5 5e-6`. The first value is the learning rate for CLIP-L, the second value is for CLIP-G, and the third value is for T5XXL. If you specify only one, the learning rates for CLIP-L, CLIP-G and T5XXL will be the same. If the third value is not specified, the second value is used for T5XXL. If `--text_encoder_lr` is not specified, the default learning rate `--learning_rate` is used for both CLIP-L and T5XXL. - - The trained LoRA can be used with ComfyUI. - - | trained LoRA|option|network_args|cache_text_encoder_outputs (*1)| - |---|---|---|---| - |MMDiT|`--network_train_unet_only`|-|o| - |MMDiT + CLIP-L + CLIP-G|-|-|o (*2)| - |MMDiT + CLIP-L + CLIP-G + T5XXL|-|`train_t5xxl=True`|-| - |CLIP-L + CLIP-G (*3)|`--network_train_text_encoder_only`|-|o (*2)| - |CLIP-L + CLIP-G + T5XXL (*3)|`--network_train_text_encoder_only`|`train_t5xxl=True`|-| - - - *1: `--cache_text_encoder_outputs` or `--cache_text_encoder_outputs_to_disk` is also available. - - *2: T5XXL output can be cached for CLIP-L and G LoRA training. - - *3: Not tested yet. - -2. Experimental FP8/FP16 mixed training: - - `--fp8_base_unet` enables training with fp8 for MMDiT and bf16/fp16 for CLIP-L/G/T5XXL. 
- - When specifying this option, the `--fp8_base` option is automatically enabled. - -3. Split Q/K/V Projection Layers (Experimental): - - Same as FLUX.1. - -4. CLIP-L/G and T5 Attention Mask Application: - - This function is planned to be implemented in the future. - -5. Multi-resolution Training Support: - - Only for SD3.5M. - - Same as FLUX.1 for data preparation. - - If you train with multiple resolutions, you can enable the scaled positional embeddings with `--enable_scaled_pos_embed`. The default is False. __This option is an experimental feature.__ - -6. Weighting scheme and training shift: - - The weighting scheme is described in the section 3.1 of the [SD3 paper](https://arxiv.org/abs/2403.03206v1). - - The uniform distribution is the default. If you want to change the distribution, see `--help` for options. - - `--training_shift` is the shift value for the training distribution of timesteps. - - The effect of a shift in uniform distribution is shown in the figure below. - - ![Figure_1](https://github.com/user-attachments/assets/99a72c67-adfb-4440-81d4-a718985ff350) - -Technical details of multi-resolution training for SD3.5M: - -SD3.5M does not use scaled positional embeddings for multi-resolution training, and is trained with a single positional embedding. Therefore, this feature is very experimental. - -Generally, in multi-resolution training, the values of the positional embeddings must be the same for each resolution. That is, the same value must be in the same position for 512x512, 768x768, and 1024x1024. To achieve this, the positional embeddings for each resolution are calculated in advance and switched according to the resolution of the training data. This feature is enabled by `--enable_scaled_pos_embed`. - -This idea and the code for calculating scaled positional embeddings are contributed by KohakuBlueleaf. Thanks to KohakuBlueleaf! - - -#### Specify rank for each layer in SD3 LoRA - -You can specify the rank for each layer in SD3 by specifying the following network_args. If you specify `0`, LoRA will not be applied to that layer. - -When network_args is not specified, the default value (`network_dim`) is applied, same as before. - -|network_args|target layer| -|---|---| -|context_attn_dim|attn in context_block| -|context_mlp_dim|mlp in context_block| -|context_mod_dim|adaLN_modulation in context_block| -|x_attn_dim|attn in x_block| -|x_mlp_dim|mlp in x_block| -|x_mod_dim|adaLN_modulation in x_block| - -`"verbose=True"` is also available for debugging. It shows the rank of each layer. - -example: -``` ---network_args "context_attn_dim=2" "context_mlp_dim=3" "context_mod_dim=4" "x_attn_dim=5" "x_mlp_dim=6" "x_mod_dim=7" "verbose=True" -``` - -You can apply LoRA to the conditioning layers of SD3 by specifying `emb_dims` in network_args. When specifying, be sure to specify 6 numbers in `[]` as a comma-separated list. - -example: -``` ---network_args "emb_dims=[2,3,4,5,6,7]" -``` - -Each number corresponds to `context_embedder`, `t_embedder`, `x_embedder`, `y_embedder`, `final_layer_adaLN_modulation`, `final_layer_linear`. The above example applies LoRA to all conditioning layers, with rank 2 for `context_embedder`, 3 for `t_embedder`, 4 for `context_embedder`, 5 for `y_embedder`, 6 for `final_layer_adaLN_modulation`, and 7 for `final_layer_linear`. - -If you specify `0`, LoRA will not be applied to that layer. For example, `[4,0,0,4,0,0]` applies LoRA only to `context_embedder` and `y_embedder`. 
- -#### Specify blocks to train in SD3 LoRA training - -You can specify the blocks to train in SD3 LoRA training by specifying `train_block_indices` in network_args. The indices are 0-based. The default (when omitted) is to train all blocks. The indices are specified as a list of integers or a range of integers, like `0,1,5,8` or `0,1,4-5,7`. - -The number of blocks depends on the model. The valid range is 0-(the number of blocks - 1). `all` is also available to train all blocks, `none` is also available to train no blocks. - -example: -``` ---network_args "train_block_indices=1,2,6-8" -``` - -### Inference for SD3 with LoRA model - -The inference script is also available. The script is `sd3_minimal_inference.py`. See `--help` for options. - -### SD3 fine-tuning - -Documentation is not available yet. Please refer to the FLUX.1 fine-tuning guide for now. The major difference are following: - -- `--clip_g` is also available for SD3 fine-tuning. -- `--timestep_sampling` `--discrete_flow_shift``--model_prediction_type` --guidance_scale` are not necessary for SD3 fine-tuning. -- Use `--vae` instead of `--ae` if necessary. __This option is not necessary for SD3.__ VAE is included in the standard SD3 model. -- `--disable_mmap_load_safetensors` is available. __This option significantly reduces the memory usage when loading models for Windows users.__ -- `--cpu_offload_checkpointing` is not available for SD3 fine-tuning. -- `--clip_l_dropout_rate`, `--clip_g_dropout_rate` and `--t5_dropout_rate` are available same as LoRA training. -- `--pos_emb_random_crop_rate` and `--enable_scaled_pos_embed` are available for SD3.5M fine-tuning. -- Training text encoders is available with `--train_text_encoder` option, similar to SDXL training. - - CLIP-L and G can be trained with `--train_text_encoder` option. Training T5XXL needs `--train_t5xxl` option. - - If you use the cached text encoder outputs for T5XXL with training CLIP-L and G, specify `--use_t5xxl_cache_only`. This option enables to use the cached text encoder outputs for T5XXL only. - - The learning rates for CLIP-L, CLIP-G and T5XXL can be specified separately. `--text_encoder_lr1`, `--text_encoder_lr2` and `--text_encoder_lr3` are available. - -### Extract LoRA from SD3 Models - -Not available yet. - -### Convert SD3 LoRA - -Not available yet. - -### Merge LoRA to SD3 checkpoint - -Not available yet. - --- [__Change History__](#change-history) is moved to the bottom of the page. From 952f9ce7be6794a88b12ba8fa37418c37b24f30a Mon Sep 17 00:00:00 2001 From: "Kohya S." <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 4 Sep 2025 19:46:04 +0900 Subject: [PATCH 540/582] Update docs/train_textual_inversion.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/train_textual_inversion.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/train_textual_inversion.md b/docs/train_textual_inversion.md index c18c23071..b7c69eb7b 100644 --- a/docs/train_textual_inversion.md +++ b/docs/train_textual_inversion.md @@ -268,10 +268,7 @@ In your prompts, simply use the token string you trained (e.g., "mychar") and th * Try adjusting the learning rate (lower values like 5e-7) * Increase the number of training steps -3. **Out of memory errors** - * Reduce batch size in the dataset configuration - * Use `--gradient_checkpointing` - * Use `--cache_latents` (for SDXL) + * Use `--cache_latents`
日本語 From 0bb0d91615d690caa7167339701f5d86316fcd40 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 6 Sep 2025 19:52:54 +0900 Subject: [PATCH 541/582] doc: update introduction and clarify command line option priorities in config README --- docs/config_README-en.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/docs/config_README-en.md b/docs/config_README-en.md index 8c55903d0..78687ee6c 100644 --- a/docs/config_README-en.md +++ b/docs/config_README-en.md @@ -1,9 +1,6 @@ -Original Source by kohya-ss +First version: A.I Translation by Model: NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO, editing by Darkstorm2150 -First version: -A.I Translation by Model: NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO, editing by Darkstorm2150 - -Some parts are manually added. +Document is updated and maintained manually. # Config Readme @@ -267,10 +264,10 @@ The following command line argument options are ignored if a configuration file * `--reg_data_dir` * `--in_json` -The following command line argument options are given priority over the configuration file options if both are specified simultaneously. In most cases, they have the same names as the corresponding options in the configuration file. +For the command line options listed below, if an option is specified in both the command line arguments and the configuration file, the value from the configuration file will be given priority. Unless otherwise noted, the option names are the same. -| Command Line Argument Option | Prioritized Configuration File Option | -| ------------------------------- | ------------------------------------- | +| Command Line Argument Option | Corresponding Configuration File Option | +| ------------------------------- | --------------------------------------- | | `--bucket_no_upscale` | | | `--bucket_reso_steps` | | | `--caption_dropout_every_n_epochs` | | From ef4397963bdfc7882addc12e0a4510868a4b1f33 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 8 Sep 2025 14:16:33 -0400 Subject: [PATCH 542/582] Fix validation dataset documentation to not use subsets --- docs/flux_train_network.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index 23828eb71..f5b67a7e0 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -550,18 +550,32 @@ You can calculate validation loss during training using a validation dataset to To set up validation, add a `validation_split` and optionally `validation_seed` to your dataset configuration TOML file. ```toml +validation_seed = 42 # [Optional] Validation seed, otherwise uses training seed for validation split . + [[datasets]] enable_bucket = true resolution = [1024, 1024] -validation_seed = 42 # [Optional] Validation seed, otherwise uses training seed for validation split . [[datasets.subsets]] image_dir = "path/to/image/directory" - validation_split = 0.1 # Split between 0.0 and 1.0 where 1.0 will use the full subset as a validation dataset + +[[datasets]] +enable_bucket = true +resolution = [1024, 1024] +validation_split = 0.1 # Split between 0.0 and 1.0 where 1.0 will use the full subset as a validation dataset + + [[datasets.subsets]] + # This directory will split 10% to validation and 90% to training + image_dir = "path/to/image/second-directory" + +[[datasets]] +enable_bucket = true +resolution = [1024, 1024] +validation_split = 1.0 # Will use this full subset as a validation subset. 
[[datasets.subsets]] + # This directory will use the 100% to validation and 0% to training image_dir = "path/to/image/full_validation" - validation_split = 1.0 # Will use this full subset as a validation subset. ``` **Notes:** From 78685b9c5f2141c99a1478ff3f4d59c276828dd1 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 8 Sep 2025 14:18:50 -0400 Subject: [PATCH 543/582] Move general settings to top to make more clear the validation bits --- docs/flux_train_network.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index f5b67a7e0..ccf6dff7e 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -551,17 +551,15 @@ To set up validation, add a `validation_split` and optionally `validation_seed` ```toml validation_seed = 42 # [Optional] Validation seed, otherwise uses training seed for validation split . - -[[datasets]] enable_bucket = true resolution = [1024, 1024] +[[datasets]] [[datasets.subsets]] + # This directory will use 100% of the images for training image_dir = "path/to/image/directory" [[datasets]] -enable_bucket = true -resolution = [1024, 1024] validation_split = 0.1 # Split between 0.0 and 1.0 where 1.0 will use the full subset as a validation dataset [[datasets.subsets]] @@ -569,8 +567,6 @@ validation_split = 0.1 # Split between 0.0 and 1.0 where 1.0 will use the full s image_dir = "path/to/image/second-directory" [[datasets]] -enable_bucket = true -resolution = [1024, 1024] validation_split = 1.0 # Will use this full subset as a validation subset. [[datasets.subsets]] From fe4c18934c2d34ff2eb3eb65ea1eaa8ecec207cd Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 8 Sep 2025 14:28:55 -0400 Subject: [PATCH 544/582] blocks_to_swap is supported for validation loss now --- docs/flux_train_network.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/flux_train_network.md b/docs/flux_train_network.md index ccf6dff7e..3d61f8304 100644 --- a/docs/flux_train_network.md +++ b/docs/flux_train_network.md @@ -577,7 +577,7 @@ validation_split = 1.0 # Will use this full subset as a validation subset. **Notes:** * Validation loss calculation uses fixed timestep sampling and random seeds to reduce loss variation due to randomness for more stable evaluation. -* Currently, validation loss is not supported when using `--blocks_to_swap` or Schedule-Free optimizers (`AdamWScheduleFree`, `RAdamScheduleFree`, `ProdigyScheduleFree`). +* Currently, validation loss is not supported when using Schedule-Free optimizers (`AdamWScheduleFree`, `RAdamScheduleFree`, `ProdigyScheduleFree`).
日本語 From 5149be5a8708a60bbdd119e7a73b403c51a03458 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:54:12 +0900 Subject: [PATCH 545/582] feat: initial commit for HunyuanImage-2.1 inference --- hunyuan_image_minimal_inference.py | 1197 ++++++++++++++++++++ library/attention.py | 50 + library/fp8_optimization_utils.py | 391 +++++++ library/hunyuan_image_models.py | 374 +++++++ library/hunyuan_image_modules.py | 804 ++++++++++++++ library/hunyuan_image_text_encoder.py | 649 +++++++++++ library/hunyuan_image_utils.py | 461 ++++++++ library/hunyuan_image_vae.py | 622 +++++++++++ library/lora_utils.py | 249 +++++ networks/lora_hunyuan_image.py | 1444 +++++++++++++++++++++++++ 10 files changed, 6241 insertions(+) create mode 100644 hunyuan_image_minimal_inference.py create mode 100644 library/attention.py create mode 100644 library/fp8_optimization_utils.py create mode 100644 library/hunyuan_image_models.py create mode 100644 library/hunyuan_image_modules.py create mode 100644 library/hunyuan_image_text_encoder.py create mode 100644 library/hunyuan_image_utils.py create mode 100644 library/hunyuan_image_vae.py create mode 100644 library/lora_utils.py create mode 100644 networks/lora_hunyuan_image.py diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py new file mode 100644 index 000000000..8a956f491 --- /dev/null +++ b/hunyuan_image_minimal_inference.py @@ -0,0 +1,1197 @@ +import argparse +import datetime +import gc +from importlib.util import find_spec +import random +import os +import re +import time +import copy +from types import ModuleType +from typing import Tuple, Optional, List, Any, Dict + +import numpy as np +import torch +from safetensors.torch import load_file, save_file +from safetensors import safe_open +from tqdm import tqdm +from diffusers.utils.torch_utils import randn_tensor +from PIL import Image + +from library import hunyuan_image_models, hunyuan_image_text_encoder, hunyuan_image_utils +from library import hunyuan_image_vae +from library.hunyuan_image_vae import HunyuanVAE2D +from library.device_utils import clean_memory_on_device +from networks import lora_hunyuan_image + + +lycoris_available = find_spec("lycoris") is not None +if lycoris_available: + from lycoris.kohya import create_network_from_weights + +from library.custom_offloading_utils import synchronize_device +from library.utils import mem_eff_save_file, setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +class GenerationSettings: + def __init__(self, device: torch.device, dit_weight_dtype: Optional[torch.dtype] = None): + self.device = device + self.dit_weight_dtype = dit_weight_dtype # not used currently because model may be optimized + + +def parse_args() -> argparse.Namespace: + """parse command line arguments""" + parser = argparse.ArgumentParser(description="HunyuanImage inference script") + + parser.add_argument("--dit", type=str, default=None, help="DiT directory or path") + parser.add_argument("--vae", type=str, default=None, help="VAE directory or path") + parser.add_argument("--text_encoder", type=str, required=True, help="Text Encoder 1 (Qwen2.5-VL) directory or path") + parser.add_argument("--byt5", type=str, default=None, help="ByT5 Text Encoder 2 directory or path") + + # LoRA + parser.add_argument("--lora_weight", type=str, nargs="*", required=False, default=None, help="LoRA weight path") + parser.add_argument("--lora_multiplier", type=float, nargs="*", default=1.0, 
help="LoRA multiplier") + parser.add_argument("--include_patterns", type=str, nargs="*", default=None, help="LoRA module include patterns") + parser.add_argument("--exclude_patterns", type=str, nargs="*", default=None, help="LoRA module exclude patterns") + parser.add_argument( + "--save_merged_model", + type=str, + default=None, + help="Save merged model to path. If specified, no inference will be performed.", + ) + + # inference + parser.add_argument( + "--guidance_scale", type=float, default=4.0, help="Guidance scale for classifier free guidance. Default is 4.0." + ) + parser.add_argument("--prompt", type=str, default=None, help="prompt for generation") + parser.add_argument("--negative_prompt", type=str, default="", help="negative prompt for generation, default is empty string") + parser.add_argument("--image_size", type=int, nargs=2, default=[256, 256], help="image size, height and width") + parser.add_argument("--infer_steps", type=int, default=25, help="number of inference steps, default is 25") + parser.add_argument("--save_path", type=str, required=True, help="path to save generated video") + parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.") + + # Flow Matching + parser.add_argument( + "--flow_shift", + type=float, + default=None, + help="Shift factor for flow matching schedulers. Default is None (default).", + ) + + parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model") + parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8") + + parser.add_argument("--text_encoder_cpu", action="store_true", help="Inference on CPU for Text Encoders") + parser.add_argument("--vae_enable_tiling", action="store_true", help="Enable tiling for VAE decoding") + parser.add_argument( + "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU" + ) + parser.add_argument( + "--attn_mode", + type=str, + default="torch", + choices=["flash", "torch", "sageattn", "xformers", "sdpa"], # "flash2", "flash3", + help="attention mode", + ) + parser.add_argument("--blocks_to_swap", type=int, default=0, help="number of blocks to swap in the model") + parser.add_argument( + "--output_type", + type=str, + default="images", + choices=["images", "latent", "latent_images"], + help="output type", + ) + parser.add_argument("--no_metadata", action="store_true", help="do not save metadata") + parser.add_argument("--latent_path", type=str, nargs="*", default=None, help="path to latent for decode. 
no inference") + parser.add_argument( + "--lycoris", action="store_true", help=f"use lycoris for inference{'' if lycoris_available else ' (not available)'}" + ) + + # arguments for batch and interactive modes + parser.add_argument("--from_file", type=str, default=None, help="Read prompts from a file") + parser.add_argument("--interactive", action="store_true", help="Interactive mode: read prompts from console") + + args = parser.parse_args() + + # Validate arguments + if args.from_file and args.interactive: + raise ValueError("Cannot use both --from_file and --interactive at the same time") + + if args.latent_path is None or len(args.latent_path) == 0: + if args.prompt is None and not args.from_file and not args.interactive: + raise ValueError("Either --prompt, --from_file or --interactive must be specified") + + if args.lycoris and not lycoris_available: + raise ValueError("install lycoris: https://github.com/KohakuBlueleaf/LyCORIS") + + return args + + +def parse_prompt_line(line: str) -> Dict[str, Any]: + """Parse a prompt line into a dictionary of argument overrides + + Args: + line: Prompt line with options + + Returns: + Dict[str, Any]: Dictionary of argument overrides + """ + # TODO common function with hv_train_network.line_to_prompt_dict + parts = line.split(" --") + prompt = parts[0].strip() + + # Create dictionary of overrides + overrides = {"prompt": prompt} + + for part in parts[1:]: + if not part.strip(): + continue + option_parts = part.split(" ", 1) + option = option_parts[0].strip() + value = option_parts[1].strip() if len(option_parts) > 1 else "" + + # Map options to argument names + if option == "w": + overrides["image_size_width"] = int(value) + elif option == "h": + overrides["image_size_height"] = int(value) + elif option == "d": + overrides["seed"] = int(value) + elif option == "s": + overrides["infer_steps"] = int(value) + elif option == "g" or option == "l": + overrides["guidance_scale"] = float(value) + elif option == "fs": + overrides["flow_shift"] = float(value) + # elif option == "i": + # overrides["image_path"] = value + # elif option == "im": + # overrides["image_mask_path"] = value + # elif option == "cn": + # overrides["control_path"] = value + elif option == "n": + overrides["negative_prompt"] = value + # elif option == "ci": # control_image_path + # overrides["control_image_path"] = value + + return overrides + + +def apply_overrides(args: argparse.Namespace, overrides: Dict[str, Any]) -> argparse.Namespace: + """Apply overrides to args + + Args: + args: Original arguments + overrides: Dictionary of overrides + + Returns: + argparse.Namespace: New arguments with overrides applied + """ + args_copy = copy.deepcopy(args) + + for key, value in overrides.items(): + if key == "image_size_width": + args_copy.image_size[1] = value + elif key == "image_size_height": + args_copy.image_size[0] = value + else: + setattr(args_copy, key, value) + + return args_copy + + +def check_inputs(args: argparse.Namespace) -> Tuple[int, int]: + """Validate video size and length + + Args: + args: command line arguments + + Returns: + Tuple[int, int]: (height, width) + """ + height = args.image_size[0] + width = args.image_size[1] + + if height % 32 != 0 or width % 32 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.") + + return height, width + + +# region Model + + +def load_dit_model( + args: argparse.Namespace, device: torch.device, dit_weight_dtype: Optional[torch.dtype] = None +) -> 
hunyuan_image_models.HYImageDiffusionTransformer: + """load DiT model + + Args: + args: command line arguments + device: device to use + dit_weight_dtype: data type for the model weights. None for as-is + + Returns: + qwen_image_model.HYImageDiffusionTransformer: DiT model instance + """ + # If LyCORIS is enabled, we will load the model to CPU and then merge LoRA weights (static method) + + loading_device = "cpu" + if args.blocks_to_swap == 0 and not args.lycoris: + loading_device = device + + # load LoRA weights + if not args.lycoris and args.lora_weight is not None and len(args.lora_weight) > 0: + lora_weights_list = [] + for lora_weight in args.lora_weight: + logger.info(f"Loading LoRA weight from: {lora_weight}") + lora_sd = load_file(lora_weight) # load on CPU, dtype is as is + # lora_sd = filter_lora_state_dict(lora_sd, args.include_patterns, args.exclude_patterns) + lora_weights_list.append(lora_sd) + else: + lora_weights_list = None + + loading_weight_dtype = dit_weight_dtype + if args.fp8_scaled and not args.lycoris: + loading_weight_dtype = None # we will load weights as-is and then optimize to fp8 + + model = hunyuan_image_models.load_hunyuan_image_model( + device, + args.dit, + args.attn_mode, + False, + loading_device, + loading_weight_dtype, + args.fp8_scaled and not args.lycoris, + lora_weights_list=lora_weights_list, + lora_multipliers=args.lora_multiplier, + ) + + # merge LoRA weights + if args.lycoris: + if args.lora_weight is not None and len(args.lora_weight) > 0: + merge_lora_weights(lora_hunyuan_image, model, args, device) + + if args.fp8_scaled: + # load state dict as-is and optimize to fp8 + state_dict = model.state_dict() + + # if no blocks to swap, we can move the weights to GPU after optimization on GPU (omit redundant CPU->GPU copy) + move_to_device = args.blocks_to_swap == 0 # if blocks_to_swap > 0, we will keep the model on CPU + state_dict = model.fp8_optimization(state_dict, device, move_to_device, use_scaled_mm=args.fp8_fast) + + info = model.load_state_dict(state_dict, strict=True, assign=True) + logger.info(f"Loaded FP8 optimized weights: {info}") + + # if we only want to save the model, we can skip the rest + if args.save_merged_model: + return None + + if not args.fp8_scaled: + # simple cast to dit_weight_dtype + target_dtype = None # load as-is (dit_weight_dtype == dtype of the weights in state_dict) + target_device = None + + if dit_weight_dtype is not None: # in case of args.fp8 and not args.fp8_scaled + logger.info(f"Convert model to {dit_weight_dtype}") + target_dtype = dit_weight_dtype + + if args.blocks_to_swap == 0: + logger.info(f"Move model to device: {device}") + target_device = device + + model.to(target_device, target_dtype) # move and cast at the same time. 
this reduces redundant copy operations + + # if args.compile: + # compile_backend, compile_mode, compile_dynamic, compile_fullgraph = args.compile_args + # logger.info( + # f"Torch Compiling[Backend: {compile_backend}; Mode: {compile_mode}; Dynamic: {compile_dynamic}; Fullgraph: {compile_fullgraph}]" + # ) + # torch._dynamo.config.cache_size_limit = 32 + # for i in range(len(model.blocks)): + # model.blocks[i] = torch.compile( + # model.blocks[i], + # backend=compile_backend, + # mode=compile_mode, + # dynamic=compile_dynamic.lower() in "true", + # fullgraph=compile_fullgraph.lower() in "true", + # ) + + if args.blocks_to_swap > 0: + logger.info(f"Enable swap {args.blocks_to_swap} blocks to CPU from device: {device}") + model.enable_block_swap(args.blocks_to_swap, device, supports_backward=False) + model.move_to_device_except_swap_blocks(device) + model.prepare_block_swap_before_forward() + else: + # make sure the model is on the right device + model.to(device) + + model.eval().requires_grad_(False) + clean_memory_on_device(device) + + return model + + +def merge_lora_weights( + lora_module: ModuleType, + model: torch.nn.Module, + lora_weights: List[str], + lora_multipliers: List[float], + include_patterns: Optional[List[str]], + exclude_patterns: Optional[List[str]], + device: torch.device, + lycoris: bool = False, + save_merged_model: Optional[str] = None, + converter: Optional[callable] = None, +) -> None: + """merge LoRA weights to the model + + Args: + lora_module: LoRA module, e.g. lora_wan + model: DiT model + lora_weights: paths to LoRA weights + lora_multipliers: multipliers for LoRA weights + include_patterns: regex patterns to include LoRA modules + exclude_patterns: regex patterns to exclude LoRA modules + device: torch.device + lycoris: use LyCORIS + save_merged_model: path to save merged model, if specified, no inference will be performed + converter: Optional[callable] = None + """ + if lora_weights is None or len(lora_weights) == 0: + return + + for i, lora_weight in enumerate(lora_weights): + if lora_multipliers is not None and len(lora_multipliers) > i: + lora_multiplier = lora_multipliers[i] + else: + lora_multiplier = 1.0 + + logger.info(f"Loading LoRA weights from {lora_weight} with multiplier {lora_multiplier}") + weights_sd = load_file(lora_weight) + if converter is not None: + weights_sd = converter(weights_sd) + + # apply include/exclude patterns + original_key_count = len(weights_sd.keys()) + if include_patterns is not None and len(include_patterns) > i: + include_pattern = include_patterns[i] + regex_include = re.compile(include_pattern) + weights_sd = {k: v for k, v in weights_sd.items() if regex_include.search(k)} + logger.info(f"Filtered keys with include pattern {include_pattern}: {original_key_count} -> {len(weights_sd.keys())}") + if exclude_patterns is not None and len(exclude_patterns) > i: + original_key_count_ex = len(weights_sd.keys()) + exclude_pattern = exclude_patterns[i] + regex_exclude = re.compile(exclude_pattern) + weights_sd = {k: v for k, v in weights_sd.items() if not regex_exclude.search(k)} + logger.info( + f"Filtered keys with exclude pattern {exclude_pattern}: {original_key_count_ex} -> {len(weights_sd.keys())}" + ) + if len(weights_sd) != original_key_count: + remaining_keys = list(set([k.split(".", 1)[0] for k in weights_sd.keys()])) + remaining_keys.sort() + logger.info(f"Remaining LoRA modules after filtering: {remaining_keys}") + if len(weights_sd) == 0: + logger.warning("No keys left after filtering.") + + if lycoris: + lycoris_net, 
_ = create_network_from_weights( + multiplier=lora_multiplier, + file=None, + weights_sd=weights_sd, + unet=model, + text_encoder=None, + vae=None, + for_inference=True, + ) + lycoris_net.merge_to(None, model, weights_sd, dtype=None, device=device) + else: + network = lora_module.create_arch_network_from_weights(lora_multiplier, weights_sd, unet=model, for_inference=True) + network.merge_to(None, model, weights_sd, device=device, non_blocking=True) + + synchronize_device(device) + logger.info("LoRA weights loaded") + + # save model here before casting to dit_weight_dtype + if save_merged_model: + logger.info(f"Saving merged model to {save_merged_model}") + mem_eff_save_file(model.state_dict(), save_merged_model) # save_file needs a lot of memory + logger.info("Merged model saved") + + +# endregion + + +def decode_latent(vae: HunyuanVAE2D, latent: torch.Tensor, device: torch.device, enable_tiling: bool = False) -> torch.Tensor: + logger.info(f"Decoding image. Latent shape {latent.shape}, device {device}") + + vae.to(device) + if enable_tiling: + vae.enable_tiling() + else: + vae.disable_tiling() + with torch.no_grad(): + latent = latent / vae.scaling_factor # scale latent back to original range + pixels = vae.decode(latent.to(device, dtype=vae.dtype)) + pixels = pixels.to("cpu", dtype=torch.float32) # move to CPU and convert to float32 (bfloat16 is not supported by numpy) + vae.to("cpu") + + logger.info(f"Decoded. Pixel shape {pixels.shape}") + return pixels[0] # remove batch dimension + + +def prepare_text_inputs( + args: argparse.Namespace, device: torch.device, shared_models: Optional[Dict] = None +) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """Prepare text-related inputs for T2I: LLM encoding.""" + + # load text encoder: conds_cache holds cached encodings for prompts without padding + conds_cache = {} + vl_device = torch.device("cpu") if args.text_encoder_cpu else device + if shared_models is not None: + tokenizer_vlm = shared_models.get("tokenizer_vlm") + text_encoder_vlm = shared_models.get("text_encoder_vlm") + tokenizer_byt5 = shared_models.get("tokenizer_byt5") + text_encoder_byt5 = shared_models.get("text_encoder_byt5") + + if "conds_cache" in shared_models: # Use shared cache if available + conds_cache = shared_models["conds_cache"] + + # text_encoder is on device (batched inference) or CPU (interactive inference) + else: # Load if not in shared_models + vl_dtype = torch.bfloat16 # Default dtype for Text Encoder + tokenizer_vlm, text_encoder_vlm = hunyuan_image_text_encoder.load_qwen2_5_vl( + args.text_encoder, dtype=vl_dtype, device=vl_device, disable_mmap=True + ) + tokenizer_byt5, text_encoder_byt5 = hunyuan_image_text_encoder.load_byt5( + args.byt5, dtype=torch.float16, device=vl_device, disable_mmap=True + ) + + # Store original devices to move back later if they were shared. 
This does nothing if shared_models is None + text_encoder_original_device = text_encoder_vlm.device if text_encoder_vlm else None + + # Ensure text_encoder is not None before proceeding + if not text_encoder_vlm or not tokenizer_vlm or not tokenizer_byt5 or not text_encoder_byt5: + raise ValueError("Text encoder or tokenizer is not loaded properly.") + + # Define a function to move models to device if needed + # This is to avoid moving models if not needed, especially in interactive mode + model_is_moved = False + + def move_models_to_device_if_needed(): + nonlocal model_is_moved + nonlocal shared_models + + if model_is_moved: + return + model_is_moved = True + + logger.info(f"Moving DiT and Text Encoder to appropriate device: {device} or CPU") + if shared_models and "model" in shared_models: # DiT model is shared + if args.blocks_to_swap > 0: + logger.info("Waiting for 5 seconds to finish block swap") + time.sleep(5) + model = shared_models["model"] + model.to("cpu") + clean_memory_on_device(device) # clean memory on device before moving models + + text_encoder_vlm.to(vl_device) # If text_encoder_cpu is True, this will be CPU + text_encoder_byt5.to(vl_device) + + logger.info("Encoding prompt with Text Encoder") + + prompt = args.prompt + cache_key = prompt + if cache_key in conds_cache: + embed, mask = conds_cache[cache_key] + else: + move_models_to_device_if_needed() + + embed, mask = hunyuan_image_text_encoder.get_qwen_prompt_embeds(tokenizer_vlm, text_encoder_vlm, prompt) + ocr_mask, embed_byt5, mask_byt5 = hunyuan_image_text_encoder.get_glyph_prompt_embeds( + tokenizer_byt5, text_encoder_byt5, prompt + ) + embed = embed.cpu() + mask = mask.cpu() + embed_byt5 = embed_byt5.cpu() + mask_byt5 = mask_byt5.cpu() + + conds_cache[cache_key] = (embed, mask, embed_byt5, mask_byt5, ocr_mask) + + negative_prompt = args.negative_prompt + cache_key = negative_prompt + if cache_key in conds_cache: + negative_embed, negative_mask = conds_cache[cache_key] + else: + move_models_to_device_if_needed() + + negative_embed, negative_mask = hunyuan_image_text_encoder.get_qwen_prompt_embeds( + tokenizer_vlm, text_encoder_vlm, negative_prompt + ) + negative_ocr_mask, negative_embed_byt5, negative_mask_byt5 = hunyuan_image_text_encoder.get_glyph_prompt_embeds( + tokenizer_byt5, text_encoder_byt5, negative_prompt + ) + negative_embed = negative_embed.cpu() + negative_mask = negative_mask.cpu() + negative_embed_byt5 = negative_embed_byt5.cpu() + negative_mask_byt5 = negative_mask_byt5.cpu() + + conds_cache[cache_key] = (negative_embed, negative_mask, negative_embed_byt5, negative_mask_byt5, negative_ocr_mask) + + if not (shared_models and "text_encoder_vlm" in shared_models): # if loaded locally + # There is a bug text_encoder is not freed from GPU memory when text encoder is fp8 + del tokenizer_vlm, text_encoder_vlm, tokenizer_byt5, text_encoder_byt5 + gc.collect() # This may force Text Encoder to be freed from GPU memory + else: # if shared, move back to original device (likely CPU) + if text_encoder_vlm: + text_encoder_vlm.to(text_encoder_original_device) + if text_encoder_byt5: + text_encoder_byt5.to(text_encoder_original_device) + + clean_memory_on_device(device) + + arg_c = {"embed": embed, "mask": mask, "embed_byt5": embed_byt5, "mask_byt5": mask_byt5, "ocr_mask": ocr_mask, "prompt": prompt} + arg_null = { + "embed": negative_embed, + "mask": negative_mask, + "embed_byt5": negative_embed_byt5, + "mask_byt5": negative_mask_byt5, + "ocr_mask": negative_ocr_mask, + "prompt": negative_prompt, + } + + return 
arg_c, arg_null + + +def generate( + args: argparse.Namespace, + gen_settings: GenerationSettings, + shared_models: Optional[Dict] = None, + precomputed_text_data: Optional[Dict] = None, +) -> torch.Tensor: + """main function for generation + + Args: + args: command line arguments + shared_models: dictionary containing pre-loaded models (mainly for DiT) + precomputed_image_data: Optional dictionary with precomputed image data + precomputed_text_data: Optional dictionary with precomputed text data + + Returns: + tuple: (HunyuanVAE2D model (vae) or None, torch.Tensor generated latent) + """ + device, dit_weight_dtype = (gen_settings.device, gen_settings.dit_weight_dtype) + + # prepare seed + seed = args.seed if args.seed is not None else random.randint(0, 2**32 - 1) + args.seed = seed # set seed to args for saving + + if precomputed_text_data is not None: + logger.info("Using precomputed text data.") + context = precomputed_text_data["context"] + context_null = precomputed_text_data["context_null"] + + else: + logger.info("No precomputed data. Preparing image and text inputs.") + context, context_null = prepare_text_inputs(args, device, shared_models) + + if shared_models is None or "model" not in shared_models: + # load DiT model + model = load_dit_model(args, device, dit_weight_dtype) + + # if we only want to save the model, we can skip the rest + if args.save_merged_model: + return None + + if shared_models is not None: + shared_models["model"] = model + else: + # use shared model + model: hunyuan_image_models.HYImageDiffusionTransformer = shared_models["model"] + # model.move_to_device_except_swap_blocks(device) # Handles block swap correctly + # model.prepare_block_swap_before_forward() + + # set random generator + seed_g = torch.Generator(device="cpu") + seed_g.manual_seed(seed) + + height, width = check_inputs(args) + logger.info(f"Image size: {height}x{width} (HxW), infer_steps: {args.infer_steps}") + + # image generation ###### + + logger.info(f"Prompt: {context['prompt']}") + + embed = context["embed"].to(device, dtype=torch.bfloat16) + mask = context["mask"].to(device, dtype=torch.bfloat16) + embed_byt5 = context["embed_byt5"].to(device, dtype=torch.bfloat16) + mask_byt5 = context["mask_byt5"].to(device, dtype=torch.bfloat16) + ocr_mask = context["ocr_mask"] # list of bool + negative_embed = context_null["embed"].to(device, dtype=torch.bfloat16) + negative_mask = context_null["mask"].to(device, dtype=torch.bfloat16) + negative_embed_byt5 = context_null["embed_byt5"].to(device, dtype=torch.bfloat16) + negative_mask_byt5 = context_null["mask_byt5"].to(device, dtype=torch.bfloat16) + # negative_ocr_mask = context_null["ocr_mask"] # list of bool + + # Prepare latent variables + num_channels_latents = model.in_channels + shape = (1, num_channels_latents, height // hunyuan_image_vae.VAE_SCALE_FACTOR, width // hunyuan_image_vae.VAE_SCALE_FACTOR) + latents = randn_tensor(shape, generator=seed_g, device=device, dtype=torch.bfloat16) + + logger.info( + f"Embed: {embed.shape}, embed byt5: {embed_byt5.shape}, negative_embed: {negative_embed.shape}, negative embed byt5: {negative_embed_byt5.shape}, latents: {latents.shape}" + ) + + # Prepare timesteps + timesteps, sigmas = hunyuan_image_utils.get_timesteps_sigmas(args.infer_steps, args.flow_shift, device) + + # Prepare Guider + cfg_guider_ocr = hunyuan_image_utils.AdaptiveProjectedGuidance( + guidance_scale=10.0, eta=0.0, adaptive_projected_guidance_rescale=10.0, adaptive_projected_guidance_momentum=-0.5 + ) + cfg_guider_general = 
hunyuan_image_utils.AdaptiveProjectedGuidance( + guidance_scale=10.0, eta=0.0, adaptive_projected_guidance_rescale=10.0, adaptive_projected_guidance_momentum=-0.5 + ) + + # Denoising loop + do_cfg = args.guidance_scale != 1.0 + with tqdm(total=len(timesteps), desc="Denoising steps") as pbar: + for i, t in enumerate(timesteps): + t_expand = t.expand(latents.shape[0]).to(latents.dtype) + + with torch.no_grad(): + noise_pred = model(latents, t_expand, embed, mask, embed_byt5, mask_byt5) + + if do_cfg: + with torch.no_grad(): + uncond_noise_pred = model( + latents, t_expand, negative_embed, negative_mask, negative_embed_byt5, negative_mask_byt5 + ) + noise_pred = hunyuan_image_utils.apply_classifier_free_guidance( + noise_pred, + uncond_noise_pred, + ocr_mask[0], + args.guidance_scale, + i, + cfg_guider_ocr=cfg_guider_ocr, + cfg_guider_general=cfg_guider_general, + ) + + # ensure latents dtype is consistent + latents = hunyuan_image_utils.step(latents, noise_pred, sigmas, i).to(latents.dtype) + + pbar.update() + + return latents + + +def get_time_flag(): + return datetime.datetime.fromtimestamp(time.time()).strftime("%Y%m%d-%H%M%S-%f")[:-3] + + +def save_latent(latent: torch.Tensor, args: argparse.Namespace, height: int, width: int) -> str: + """Save latent to file + + Args: + latent: Latent tensor + args: command line arguments + height: height of frame + width: width of frame + + Returns: + str: Path to saved latent file + """ + save_path = args.save_path + os.makedirs(save_path, exist_ok=True) + time_flag = get_time_flag() + + seed = args.seed + + latent_path = f"{save_path}/{time_flag}_{seed}_latent.safetensors" + + if args.no_metadata: + metadata = None + else: + metadata = { + "seeds": f"{seed}", + "prompt": f"{args.prompt}", + "height": f"{height}", + "width": f"{width}", + "infer_steps": f"{args.infer_steps}", + # "embedded_cfg_scale": f"{args.embedded_cfg_scale}", + "guidance_scale": f"{args.guidance_scale}", + } + if args.negative_prompt is not None: + metadata["negative_prompt"] = f"{args.negative_prompt}" + + sd = {"latent": latent.contiguous()} + save_file(sd, latent_path, metadata=metadata) + logger.info(f"Latent saved to: {latent_path}") + + return latent_path + + +def save_images(sample: torch.Tensor, args: argparse.Namespace, original_base_name: Optional[str] = None) -> str: + """Save images to directory + + Args: + sample: Video tensor + args: command line arguments + original_base_name: Original base name (if latents are loaded from files) + + Returns: + str: Path to saved images directory + """ + save_path = args.save_path + os.makedirs(save_path, exist_ok=True) + time_flag = get_time_flag() + + seed = args.seed + original_name = "" if original_base_name is None else f"_{original_base_name}" + image_name = f"{time_flag}_{seed}{original_name}" + + x = torch.clamp(sample, -1.0, 1.0) + x = ((x + 1.0) * 127.5).to(torch.uint8).cpu().numpy() + x = x.transpose(1, 2, 0) # C, H, W -> H, W, C + + image = Image.fromarray(x) + image.save(os.path.join(save_path, f"{image_name}.png")) + + logger.info(f"Sample images saved to: {save_path}/{image_name}") + + return f"{save_path}/{image_name}" + + +def save_output( + args: argparse.Namespace, + vae: HunyuanVAE2D, + latent: torch.Tensor, + device: torch.device, + original_base_names: Optional[List[str]] = None, +) -> None: + """save output + + Args: + args: command line arguments + vae: VAE model + latent: latent tensor + device: device to use + original_base_names: original base names (if latents are loaded from files) + """ + height, width 
= latent.shape[-2], latent.shape[-1] # BCTHW + height *= hunyuan_image_vae.VAE_SCALE_FACTOR + width *= hunyuan_image_vae.VAE_SCALE_FACTOR + # print(f"Saving output. Latent shape {latent.shape}; pixel shape {height}x{width}") + if args.output_type == "latent" or args.output_type == "latent_images": + # save latent + save_latent(latent, args, height, width) + if args.output_type == "latent": + return + + if vae is None: + logger.error("VAE is None, cannot decode latents for saving video/images.") + return + + if latent.ndim == 2: # S,C. For packed latents from other inference scripts + latent = latent.unsqueeze(0) + height, width = check_inputs(args) # Get height/width from args + latent = latent.view( + 1, vae.latent_channels, height // hunyuan_image_vae.VAE_SCALE_FACTOR, width // hunyuan_image_vae.VAE_SCALE_FACTOR + ) + + image = decode_latent(vae, latent, device, args.vae_enable_tiling) + + if args.output_type == "images" or args.output_type == "latent_images": + # save images + if original_base_names is None or len(original_base_names) == 0: + original_name = "" + else: + original_name = f"_{original_base_names[0]}" + save_images(image, args, original_name) + + +def preprocess_prompts_for_batch(prompt_lines: List[str], base_args: argparse.Namespace) -> List[Dict]: + """Process multiple prompts for batch mode + + Args: + prompt_lines: List of prompt lines + base_args: Base command line arguments + + Returns: + List[Dict]: List of prompt data dictionaries + """ + prompts_data = [] + + for line in prompt_lines: + line = line.strip() + if not line or line.startswith("#"): # Skip empty lines and comments + continue + + # Parse prompt line and create override dictionary + prompt_data = parse_prompt_line(line) + logger.info(f"Parsed prompt data: {prompt_data}") + prompts_data.append(prompt_data) + + return prompts_data + + +def load_shared_models(args: argparse.Namespace) -> Dict: + """Load shared models for batch processing or interactive mode. + Models are loaded to CPU to save memory. VAE is NOT loaded here. + DiT model is also NOT loaded here, handled by process_batch_prompts or generate. + + Args: + args: Base command line arguments + + Returns: + Dict: Dictionary of shared models (text/image encoders) + """ + shared_models = {} + # Load text encoders to CPU + vl_dtype = torch.bfloat16 # Default dtype for Text Encoder + tokenizer_vlm, text_encoder_vlm = hunyuan_image_text_encoder.load_qwen2_5_vl( + args.text_encoder, dtype=vl_dtype, device="cpu", disable_mmap=True + ) + tokenizer_byt5, text_encoder_byt5 = hunyuan_image_text_encoder.load_byt5( + args.byt5, dtype=torch.float16, device="cpu", disable_mmap=True + ) + shared_models["tokenizer_vlm"] = tokenizer_vlm + shared_models["text_encoder_vlm"] = text_encoder_vlm + shared_models["tokenizer_byt5"] = tokenizer_byt5 + shared_models["text_encoder_byt5"] = text_encoder_byt5 + return shared_models + + +def process_batch_prompts(prompts_data: List[Dict], args: argparse.Namespace) -> None: + """Process multiple prompts with model reuse and batched precomputation + + Args: + prompts_data: List of prompt data dictionaries + args: Base command line arguments + """ + if not prompts_data: + logger.warning("No valid prompts found") + return + + gen_settings = get_generation_settings(args) + dit_weight_dtype = gen_settings.dit_weight_dtype + device = gen_settings.device + + # 1. 
Prepare VAE + logger.info("Loading VAE for batch generation...") + vae_for_batch = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True) + vae_for_batch.eval() + + all_prompt_args_list = [apply_overrides(args, pd) for pd in prompts_data] # Create all arg instances first + for prompt_args in all_prompt_args_list: + check_inputs(prompt_args) # Validate each prompt's height/width + + # 2. Precompute Text Data (Text Encoder) + logger.info("Loading Text Encoder for batch text preprocessing...") + + # Text Encoder loaded to CPU by load_text_encoder + vl_dtype = torch.bfloat16 # Default dtype for Text Encoder + tokenizer_vlm, text_encoder_vlm_batch = hunyuan_image_text_encoder.load_qwen2_5_vl( + args.text_encoder, dtype=vl_dtype, device="cpu", disable_mmap=True + ) + tokenizer_byt5, text_encoder_byt5_batch = hunyuan_image_text_encoder.load_byt5( + args.byt5, dtype=torch.float16, device="cpu", disable_mmap=True + ) + + # Text Encoder to device for this phase + vl_device = torch.device("cpu") if args.text_encoder_cpu else device + text_encoder_vlm_batch.to(vl_device) # Moved into prepare_text_inputs logic + text_encoder_byt5_batch.to(vl_device) + + all_precomputed_text_data = [] + conds_cache_batch = {} + + logger.info("Preprocessing text and LLM/TextEncoder encoding for all prompts...") + temp_shared_models_txt = { + "tokenizer_vlm": tokenizer_vlm, + "text_encoder_vlm": text_encoder_vlm_batch, # on GPU if not text_encoder_cpu + "tokenizer_byt5": tokenizer_byt5, + "text_encoder_byt5": text_encoder_byt5_batch, # on GPU if not text_encoder_cpu + "conds_cache": conds_cache_batch, + } + + for i, prompt_args_item in enumerate(all_prompt_args_list): + logger.info(f"Text preprocessing for prompt {i+1}/{len(all_prompt_args_list)}: {prompt_args_item.prompt}") + + # prepare_text_inputs will move text_encoders to device temporarily + context, context_null = prepare_text_inputs(prompt_args_item, device, temp_shared_models_txt) + text_data = {"context": context, "context_null": context_null} + all_precomputed_text_data.append(text_data) + + # Models should be removed from device after prepare_text_inputs + del tokenizer_batch, text_encoder_batch, temp_shared_models_txt, conds_cache_batch + gc.collect() # Force cleanup of Text Encoder from GPU memory + clean_memory_on_device(device) + + # 3. Load DiT Model once + logger.info("Loading DiT model for batch generation...") + # Use args from the first prompt for DiT loading (LoRA etc. should be consistent for a batch) + first_prompt_args = all_prompt_args_list[0] + dit_model = load_dit_model(first_prompt_args, device, dit_weight_dtype) # Load directly to target device if possible + + if first_prompt_args.save_merged_model: + logger.info("Merged DiT model saved. Skipping generation.") + + shared_models_for_generate = {"model": dit_model} # Pass DiT via shared_models + + all_latents = [] + + logger.info("Generating latents for all prompts...") + with torch.no_grad(): + for i, prompt_args_item in enumerate(all_prompt_args_list): + current_text_data = all_precomputed_text_data[i] + height, width = check_inputs(prompt_args_item) # Get height/width for each prompt + + logger.info(f"Generating latent for prompt {i+1}/{len(all_prompt_args_list)}: {prompt_args_item.prompt}") + try: + # generate is called with precomputed data, so it won't load Text Encoders. + # It will use the DiT model from shared_models_for_generate. 
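+                    # Per-prompt overrides parsed from the prompt line (seed, size, steps, guidance, flow shift)
+                    # have already been applied to prompt_args_item via apply_overrides().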
+ latent = generate(prompt_args_item, gen_settings, shared_models_for_generate, current_text_data) + + if latent is None: # and prompt_args_item.save_merged_model: # Should be caught earlier + continue + + # Save latent if needed (using data from precomputed_image_data for H/W) + if prompt_args_item.output_type in ["latent", "latent_images"]: + save_latent(latent, prompt_args_item, height, width) + + all_latents.append(latent) + except Exception as e: + logger.error(f"Error generating latent for prompt: {prompt_args_item.prompt}. Error: {e}", exc_info=True) + all_latents.append(None) # Add placeholder for failed generations + continue + + # Free DiT model + logger.info("Releasing DiT model from memory...") + if args.blocks_to_swap > 0: + logger.info("Waiting for 5 seconds to finish block swap") + time.sleep(5) + + del shared_models_for_generate["model"] + del dit_model + clean_memory_on_device(device) + synchronize_device(device) # Ensure memory is freed before loading VAE for decoding + + # 4. Decode latents and save outputs (using vae_for_batch) + if args.output_type != "latent": + logger.info("Decoding latents to videos/images using batched VAE...") + vae_for_batch.to(device) # Move VAE to device for decoding + + for i, latent in enumerate(all_latents): + if latent is None: # Skip failed generations + logger.warning(f"Skipping decoding for prompt {i+1} due to previous error.") + continue + + current_args = all_prompt_args_list[i] + logger.info(f"Decoding output {i+1}/{len(all_latents)} for prompt: {current_args.prompt}") + + # if args.output_type is "latent_images", we already saved latent above. + # so we skip saving latent here. + if current_args.output_type == "latent_images": + current_args.output_type = "images" + + # save_output expects latent to be [BCTHW] or [CTHW]. generate returns [BCTHW] (batch size 1). + # latent[0] is correct if generate returns it with batch dim. + # The latent from generate is (1, C, T, H, W) + save_output(current_args, vae_for_batch, latent[0], device) # Pass vae_for_batch + + vae_for_batch.to("cpu") # Move VAE back to CPU + + del vae_for_batch + clean_memory_on_device(device) + + +def process_interactive(args: argparse.Namespace) -> None: + """Process prompts in interactive mode + + Args: + args: Base command line arguments + """ + gen_settings = get_generation_settings(args) + device = gen_settings.device + shared_models = load_shared_models(args) + shared_models["conds_cache"] = {} # Initialize empty cache for interactive mode + + print("Interactive mode. Enter prompts (Ctrl+D or Ctrl+Z (Windows) to exit):") + + try: + import prompt_toolkit + except ImportError: + logger.warning("prompt_toolkit not found. Using basic input instead.") + prompt_toolkit = None + + if prompt_toolkit: + session = prompt_toolkit.PromptSession() + + def input_line(prompt: str) -> str: + return session.prompt(prompt) + + else: + + def input_line(prompt: str) -> str: + return input(prompt) + + vae = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True) + vae.eval() + + try: + while True: + try: + line = input_line("> ") + if not line.strip(): + continue + if len(line.strip()) == 1 and line.strip() in ["\x04", "\x1a"]: # Ctrl+D or Ctrl+Z with prompt_toolkit + raise EOFError # Exit on Ctrl+D or Ctrl+Z + + # Parse prompt + prompt_data = parse_prompt_line(line) + prompt_args = apply_overrides(args, prompt_data) + + # Generate latent + # For interactive, precomputed data is None. shared_models contains text encoders. 
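+                # prepare_text_inputs caches text-encoder outputs in shared_models["conds_cache"],
+                # so re-entering the same prompt (or negative prompt) skips re-encoding.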
+ latent = generate(prompt_args, gen_settings, shared_models) + + # # If not one_frame_inference, move DiT model to CPU after generation + # if prompt_args.blocks_to_swap > 0: + # logger.info("Waiting for 5 seconds to finish block swap") + # time.sleep(5) + # model = shared_models.get("model") + # model.to("cpu") # Move DiT model to CPU after generation + + # Save latent and video + # returned_vae from generate will be used for decoding here. + save_output(prompt_args, vae, latent[0], device) + + except KeyboardInterrupt: + print("\nInterrupted. Continue (Ctrl+D or Ctrl+Z (Windows) to exit)") + continue + + except EOFError: + print("\nExiting interactive mode") + + +def get_generation_settings(args: argparse.Namespace) -> GenerationSettings: + device = torch.device(args.device) + + dit_weight_dtype = torch.bfloat16 # default + if args.fp8_scaled: + dit_weight_dtype = None # various precision weights, so don't cast to specific dtype + elif args.fp8: + dit_weight_dtype = torch.float8_e4m3fn + + logger.info(f"Using device: {device}, DiT weight weight precision: {dit_weight_dtype}") + + gen_settings = GenerationSettings(device=device, dit_weight_dtype=dit_weight_dtype) + return gen_settings + + +def main(): + # Parse arguments + args = parse_args() + + # Check if latents are provided + latents_mode = args.latent_path is not None and len(args.latent_path) > 0 + + # Set device + device = args.device if args.device is not None else "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device(device) + logger.info(f"Using device: {device}") + args.device = device + + if latents_mode: + # Original latent decode mode + original_base_names = [] + latents_list = [] + seeds = [] + + # assert len(args.latent_path) == 1, "Only one latent path is supported for now" + + for latent_path in args.latent_path: + original_base_names.append(os.path.splitext(os.path.basename(latent_path))[0]) + seed = 0 + + if os.path.splitext(latent_path)[1] != ".safetensors": + latents = torch.load(latent_path, map_location="cpu") + else: + latents = load_file(latent_path)["latent"] + with safe_open(latent_path, framework="pt") as f: + metadata = f.metadata() + if metadata is None: + metadata = {} + logger.info(f"Loaded metadata: {metadata}") + + if "seeds" in metadata: + seed = int(metadata["seeds"]) + if "height" in metadata and "width" in metadata: + height = int(metadata["height"]) + width = int(metadata["width"]) + args.image_size = [height, width] + + seeds.append(seed) + logger.info(f"Loaded latent from {latent_path}. Shape: {latents.shape}") + + if latents.ndim == 5: # [BCTHW] + latents = latents.squeeze(0) # [CTHW] + + latents_list.append(latents) + + # latent = torch.stack(latents_list, dim=0) # [N, ...], must be same shape + + for i, latent in enumerate(latents_list): + args.seed = seeds[i] + + vae = hunyuan_image_vae.load_vae(args.vae, device=device, disable_mmap=True) + vae.eval() + save_output(args, vae, latent, device, original_base_names) + + elif args.from_file: + # Batch mode from file + + # Read prompts from file + with open(args.from_file, "r", encoding="utf-8") as f: + prompt_lines = f.readlines() + + # Process prompts + prompts_data = preprocess_prompts_for_batch(prompt_lines, args) + process_batch_prompts(prompts_data, args) + + elif args.interactive: + # Interactive mode + process_interactive(args) + + else: + # Single prompt mode (original behavior) + + # Generate latent + gen_settings = get_generation_settings(args) + + # For single mode, precomputed data is None, shared_models is None. 
+ # generate will load all necessary models (Text Encoders, DiT). + latent = generate(args, gen_settings) + # print(f"Generated latent shape: {latent.shape}") + # if args.save_merged_model: + # return + + clean_memory_on_device(device) + + # Save latent and video + vae = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True) + vae.eval() + save_output(args, vae, latent, device) + + logger.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/library/attention.py b/library/attention.py new file mode 100644 index 000000000..10a096143 --- /dev/null +++ b/library/attention.py @@ -0,0 +1,50 @@ +import torch +from typing import Optional + + +def attention( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seq_lens: list[int], attn_mode: str = "torch", drop_rate: float = 0.0 +) -> torch.Tensor: + """ + Compute scaled dot-product attention with variable sequence lengths. + + Handles batches with different sequence lengths by splitting and + processing each sequence individually. + + Args: + q: Query tensor [B, L, H, D]. + k: Key tensor [B, L, H, D]. + v: Value tensor [B, L, H, D]. + seq_lens: Valid sequence length for each batch element. + attn_mode: Attention implementation ("torch" or "sageattn"). + drop_rate: Attention dropout rate. + + Returns: + Attention output tensor [B, L, H*D]. + """ + # Determine tensor layout based on attention implementation + if attn_mode == "torch" or attn_mode == "sageattn": + transpose_fn = lambda x: x.transpose(1, 2) # [B, H, L, D] for SDPA + else: + transpose_fn = lambda x: x # [B, L, H, D] for other implementations + + # Process each batch element with its valid sequence length + q = [transpose_fn(q[i : i + 1, : seq_lens[i]]) for i in range(len(q))] + k = [transpose_fn(k[i : i + 1, : seq_lens[i]]) for i in range(len(k))] + v = [transpose_fn(v[i : i + 1, : seq_lens[i]]) for i in range(len(v))] + + if attn_mode == "torch": + x = [] + for i in range(len(q)): + x_i = torch.nn.functional.scaled_dot_product_attention(q[i], k[i], v[i], dropout_p=drop_rate) + q[i] = None + k[i] = None + v[i] = None + x.append(x_i) + x = torch.cat(x, dim=0) + del q, k, v + # Currently only PyTorch SDPA is implemented + + x = transpose_fn(x) # [B, L, H, D] + x = x.reshape(x.shape[0], x.shape[1], -1) # [B, L, H*D] + return x diff --git a/library/fp8_optimization_utils.py b/library/fp8_optimization_utils.py new file mode 100644 index 000000000..a91eb4e4c --- /dev/null +++ b/library/fp8_optimization_utils.py @@ -0,0 +1,391 @@ +import os +from typing import List, Union +import torch +import torch.nn as nn +import torch.nn.functional as F + +import logging + +from tqdm import tqdm + +from library.device_utils import clean_memory_on_device +from library.utils import MemoryEfficientSafeOpen, setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1): + """ + Calculate the maximum representable value in FP8 format. + Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign). 
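+    With the default E4M3 settings this evaluates to: bias = 2**(4-1) - 1 = 7,
+    mantissa_max = 1 + 0.5 + 0.25 = 1.75, largest exponent = 2**4 - 1 - bias = 8,
+    so max_value = 1.75 * 2**8 = 448.0, matching torch.finfo(torch.float8_e4m3fn).max.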
+ + Args: + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + sign_bits (int): Number of sign bits (0 or 1) + + Returns: + float: Maximum value representable in FP8 format + """ + assert exp_bits + mantissa_bits + sign_bits == 8, "Total bits must be 8" + + # Calculate exponent bias + bias = 2 ** (exp_bits - 1) - 1 + + # Calculate maximum mantissa value + mantissa_max = 1.0 + for i in range(mantissa_bits - 1): + mantissa_max += 2 ** -(i + 1) + + # Calculate maximum value + max_value = mantissa_max * (2 ** (2**exp_bits - 1 - bias)) + + return max_value + + +def quantize_tensor_to_fp8(tensor, scale, exp_bits=4, mantissa_bits=3, sign_bits=1, max_value=None, min_value=None): + """ + Quantize a tensor to FP8 format. + + Args: + tensor (torch.Tensor): Tensor to quantize + scale (float or torch.Tensor): Scale factor + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + sign_bits (int): Number of sign bits + + Returns: + tuple: (quantized_tensor, scale_factor) + """ + # Create scaled tensor + scaled_tensor = tensor / scale + + # Calculate FP8 parameters + bias = 2 ** (exp_bits - 1) - 1 + + if max_value is None: + # Calculate max and min values + max_value = calculate_fp8_maxval(exp_bits, mantissa_bits, sign_bits) + min_value = -max_value if sign_bits > 0 else 0.0 + + # Clamp tensor to range + clamped_tensor = torch.clamp(scaled_tensor, min_value, max_value) + + # Quantization process + abs_values = torch.abs(clamped_tensor) + nonzero_mask = abs_values > 0 + + # Calculate log scales (only for non-zero elements) + log_scales = torch.zeros_like(clamped_tensor) + if nonzero_mask.any(): + log_scales[nonzero_mask] = torch.floor(torch.log2(abs_values[nonzero_mask]) + bias).detach() + + # Limit log scales and calculate quantization factor + log_scales = torch.clamp(log_scales, min=1.0) + quant_factor = 2.0 ** (log_scales - mantissa_bits - bias) + + # Quantize and dequantize + quantized = torch.round(clamped_tensor / quant_factor) * quant_factor + + return quantized, scale + + +def optimize_state_dict_with_fp8( + state_dict, calc_device, target_layer_keys=None, exclude_layer_keys=None, exp_bits=4, mantissa_bits=3, move_to_device=False +): + """ + Optimize Linear layer weights in a model's state dict to FP8 format. 
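+    Each targeted ".weight" tensor is quantized with a per-tensor scale of
+    max(abs(weight)) / fp8_max, stored back under its original key as float8, and the
+    scale itself is stored under a new ".scale_weight" key for later dequantization.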
+ + Args: + state_dict (dict): State dict to optimize, replaced in-place + calc_device (str): Device to quantize tensors on + target_layer_keys (list, optional): Layer key patterns to target (None for all Linear layers) + exclude_layer_keys (list, optional): Layer key patterns to exclude + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + move_to_device (bool): Move optimized tensors to the calculating device + + Returns: + dict: FP8 optimized state dict + """ + if exp_bits == 4 and mantissa_bits == 3: + fp8_dtype = torch.float8_e4m3fn + elif exp_bits == 5 and mantissa_bits == 2: + fp8_dtype = torch.float8_e5m2 + else: + raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits}") + + # Calculate FP8 max value + max_value = calculate_fp8_maxval(exp_bits, mantissa_bits) + min_value = -max_value # this function supports only signed FP8 + + # Create optimized state dict + optimized_count = 0 + + # Enumerate tarket keys + target_state_dict_keys = [] + for key in state_dict.keys(): + # Check if it's a weight key and matches target patterns + is_target = (target_layer_keys is None or any(pattern in key for pattern in target_layer_keys)) and key.endswith(".weight") + is_excluded = exclude_layer_keys is not None and any(pattern in key for pattern in exclude_layer_keys) + is_target = is_target and not is_excluded + + if is_target and isinstance(state_dict[key], torch.Tensor): + target_state_dict_keys.append(key) + + # Process each key + for key in tqdm(target_state_dict_keys): + value = state_dict[key] + + # Save original device and dtype + original_device = value.device + original_dtype = value.dtype + + # Move to calculation device + if calc_device is not None: + value = value.to(calc_device) + + # Calculate scale factor + scale = torch.max(torch.abs(value.flatten())) / max_value + # print(f"Optimizing {key} with scale: {scale}") + + # Quantize weight to FP8 + quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value) + + # Add to state dict using original key for weight and new key for scale + fp8_key = key # Maintain original key + scale_key = key.replace(".weight", ".scale_weight") + + quantized_weight = quantized_weight.to(fp8_dtype) + + if not move_to_device: + quantized_weight = quantized_weight.to(original_device) + + scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device) + + state_dict[fp8_key] = quantized_weight + state_dict[scale_key] = scale_tensor + + optimized_count += 1 + + if calc_device is not None: # optimized_count % 10 == 0 and + # free memory on calculation device + clean_memory_on_device(calc_device) + + logger.info(f"Number of optimized Linear layers: {optimized_count}") + return state_dict + + +def load_safetensors_with_fp8_optimization( + model_files: List[str], + calc_device: Union[str, torch.device], + target_layer_keys=None, + exclude_layer_keys=None, + exp_bits=4, + mantissa_bits=3, + move_to_device=False, + weight_hook=None, +): + """ + Load weight tensors from safetensors files and merge LoRA weights into the state dict with explicit FP8 optimization. 
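+ This is the streaming counterpart of optimize_state_dict_with_fp8: tensors are read one key at a
+ time via MemoryEfficientSafeOpen and quantized on the fly, so the full-precision state dict never
+ has to be resident in memory; the resulting ".weight"/".scale_weight" layout is identical.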
+ + Args: + model_files (list[str]): List of model files to load + calc_device (str or torch.device): Device to quantize tensors on + target_layer_keys (list, optional): Layer key patterns to target for optimization (None for all Linear layers) + exclude_layer_keys (list, optional): Layer key patterns to exclude from optimization + exp_bits (int): Number of exponent bits + mantissa_bits (int): Number of mantissa bits + move_to_device (bool): Move optimized tensors to the calculating device + weight_hook (callable, optional): Function to apply to each weight tensor before optimization + + Returns: + dict: FP8 optimized state dict + """ + if exp_bits == 4 and mantissa_bits == 3: + fp8_dtype = torch.float8_e4m3fn + elif exp_bits == 5 and mantissa_bits == 2: + fp8_dtype = torch.float8_e5m2 + else: + raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits}") + + # Calculate FP8 max value + max_value = calculate_fp8_maxval(exp_bits, mantissa_bits) + min_value = -max_value # this function supports only signed FP8 + + # Define function to determine if a key is a target key. target means fp8 optimization, not for weight hook. + def is_target_key(key): + # Check if weight key matches target patterns and does not match exclude patterns + is_target = (target_layer_keys is None or any(pattern in key for pattern in target_layer_keys)) and key.endswith(".weight") + is_excluded = exclude_layer_keys is not None and any(pattern in key for pattern in exclude_layer_keys) + return is_target and not is_excluded + + # Create optimized state dict + optimized_count = 0 + + # Process each file + state_dict = {} + for model_file in model_files: + with MemoryEfficientSafeOpen(model_file) as f: + keys = f.keys() + for key in tqdm(keys, desc=f"Loading {os.path.basename(model_file)}", unit="key"): + value = f.get_tensor(key) + if weight_hook is not None: + # Apply weight hook if provided + value = weight_hook(key, value) + + if not is_target_key(key): + state_dict[key] = value + continue + + # Save original device and dtype + original_device = value.device + original_dtype = value.dtype + + # Move to calculation device + if calc_device is not None: + value = value.to(calc_device) + + # Calculate scale factor + scale = torch.max(torch.abs(value.flatten())) / max_value + # print(f"Optimizing {key} with scale: {scale}") + + # Quantize weight to FP8 + quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value) + + # Add to state dict using original key for weight and new key for scale + fp8_key = key # Maintain original key + scale_key = key.replace(".weight", ".scale_weight") + assert fp8_key != scale_key, "FP8 key and scale key must be different" + + quantized_weight = quantized_weight.to(fp8_dtype) + + if not move_to_device: + quantized_weight = quantized_weight.to(original_device) + + scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device) + + state_dict[fp8_key] = quantized_weight + state_dict[scale_key] = scale_tensor + + optimized_count += 1 + + if calc_device is not None and optimized_count % 10 == 0: + # free memory on calculation device + clean_memory_on_device(calc_device) + + logger.info(f"Number of optimized Linear layers: {optimized_count}") + return state_dict + + +def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value=None): + """ + Patched forward method for Linear layers with FP8 weights. 
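+ Two code paths are provided: with use_scaled_mm the stored weight must already be E4M3, the input
+ is cast (and scaled when max_value is given) to E5M2, and torch._scaled_mm multiplies the FP8
+ operands directly; otherwise the FP8 weight is dequantized as weight * scale_weight and a regular
+ F.linear is applied.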
+ + Args: + self: Linear layer instance + x (torch.Tensor): Input tensor + use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series) + max_value (float): Maximum value for FP8 quantization. If None, no quantization is applied for input tensor. + + Returns: + torch.Tensor: Result of linear transformation + """ + if use_scaled_mm: + input_dtype = x.dtype + original_weight_dtype = self.scale_weight.dtype + weight_dtype = self.weight.dtype + target_dtype = torch.float8_e5m2 + assert weight_dtype == torch.float8_e4m3fn, "Only FP8 E4M3FN format is supported" + assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)" + + if max_value is None: + # no input quantization + scale_x = torch.tensor(1.0, dtype=torch.float32, device=x.device) + else: + # calculate scale factor for input tensor + scale_x = (torch.max(torch.abs(x.flatten())) / max_value).to(torch.float32) + + # quantize input tensor to FP8: this seems to consume a lot of memory + x, _ = quantize_tensor_to_fp8(x, scale_x, 5, 2, 1, max_value, -max_value) + + original_shape = x.shape + x = x.reshape(-1, x.shape[2]).to(target_dtype) + + weight = self.weight.t() + scale_weight = self.scale_weight.to(torch.float32) + + if self.bias is not None: + # float32 is not supported with bias in scaled_mm + o = torch._scaled_mm(x, weight, out_dtype=original_weight_dtype, bias=self.bias, scale_a=scale_x, scale_b=scale_weight) + else: + o = torch._scaled_mm(x, weight, out_dtype=input_dtype, scale_a=scale_x, scale_b=scale_weight) + + return o.reshape(original_shape[0], original_shape[1], -1).to(input_dtype) + + else: + # Dequantize the weight + original_dtype = self.scale_weight.dtype + dequantized_weight = self.weight.to(original_dtype) * self.scale_weight + + # Perform linear transformation + if self.bias is not None: + output = F.linear(x, dequantized_weight, self.bias) + else: + output = F.linear(x, dequantized_weight) + + return output + + +def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False): + """ + Apply monkey patching to a model using FP8 optimized state dict. + + Args: + model (nn.Module): Model instance to patch + optimized_state_dict (dict): FP8 optimized state dict + use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series) + + Returns: + nn.Module: The patched model (same instance, modified in-place) + """ + # # Calculate FP8 float8_e5m2 max value + # max_value = calculate_fp8_maxval(5, 2) + max_value = None # do not quantize input tensor + + # Find all scale keys to identify FP8-optimized layers + scale_keys = [k for k in optimized_state_dict.keys() if k.endswith(".scale_weight")] + + # Enumerate patched layers + patched_module_paths = set() + for scale_key in scale_keys: + # Extract module path from scale key (remove .scale_weight) + module_path = scale_key.rsplit(".scale_weight", 1)[0] + patched_module_paths.add(module_path) + + patched_count = 0 + + # Apply monkey patch to each layer with FP8 weights + for name, module in model.named_modules(): + # Check if this module has a corresponding scale_weight + has_scale = name in patched_module_paths + + # Apply patch if it's a Linear layer with FP8 scale + if isinstance(module, nn.Linear) and has_scale: + # register the scale_weight as a buffer to load the state_dict + module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype)) + + # Create a new forward method with the patched version. 
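+ # use_scaled_mm and max_value are captured from the enclosing scope; they are the same for every
+ # patched module, so defining the closure inside the loop is safe here.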
+ def new_forward(self, x): + return fp8_linear_forward_patch(self, x, use_scaled_mm, max_value) + + # Bind method to module + module.forward = new_forward.__get__(module, type(module)) + + patched_count += 1 + + logger.info(f"Number of monkey-patched Linear layers: {patched_count}") + return model diff --git a/library/hunyuan_image_models.py b/library/hunyuan_image_models.py new file mode 100644 index 000000000..5bd08c5ca --- /dev/null +++ b/library/hunyuan_image_models.py @@ -0,0 +1,374 @@ +# Original work: https://github.com/Tencent-Hunyuan/HunyuanImage-2.1 +# Re-implemented for license compliance for sd-scripts. + +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +from accelerate import init_empty_weights + +from library.fp8_optimization_utils import apply_fp8_monkey_patch +from library.lora_utils import load_safetensors_with_lora_and_fp8 +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +from library.hunyuan_image_modules import ( + SingleTokenRefiner, + ByT5Mapper, + PatchEmbed2D, + TimestepEmbedder, + MMDoubleStreamBlock, + MMSingleStreamBlock, + FinalLayer, +) +from library.hunyuan_image_utils import get_nd_rotary_pos_embed + +FP8_OPTIMIZATION_TARGET_KEYS = ["double_blocks", "single_blocks"] +FP8_OPTIMIZATION_EXCLUDE_KEYS = [ + "norm", + "_mod", + "modulation", +] + + +# region DiT Model +class HYImageDiffusionTransformer(nn.Module): + """ + HunyuanImage-2.1 Diffusion Transformer. + + A multimodal transformer for image generation with text conditioning, + featuring separate double-stream and single-stream processing blocks. + + Args: + attn_mode: Attention implementation mode ("torch" or "sageattn"). + """ + + def __init__(self, attn_mode: str = "torch"): + super().__init__() + + # Fixed architecture parameters for HunyuanImage-2.1 + self.patch_size = [1, 1] # 1x1 patch size (no spatial downsampling) + self.in_channels = 64 # Input latent channels + self.out_channels = 64 # Output latent channels + self.unpatchify_channels = self.out_channels + self.guidance_embed = False # Guidance embedding disabled + self.rope_dim_list = [64, 64] # RoPE dimensions for 2D positional encoding + self.rope_theta = 256 # RoPE frequency scaling + self.use_attention_mask = True + self.text_projection = "single_refiner" + self.hidden_size = 3584 # Model dimension + self.heads_num = 28 # Number of attention heads + + # Architecture configuration + mm_double_blocks_depth = 20 # Double-stream transformer blocks + mm_single_blocks_depth = 40 # Single-stream transformer blocks + mlp_width_ratio = 4 # MLP expansion ratio + text_states_dim = 3584 # Text encoder output dimension + guidance_embed = False # No guidance embedding + + # Layer configuration + mlp_act_type: str = "gelu_tanh" # MLP activation function + qkv_bias: bool = True # Use bias in QKV projections + qk_norm: bool = True # Apply QK normalization + qk_norm_type: str = "rms" # RMS normalization type + + self.attn_mode = attn_mode + + # ByT5 character-level text encoder mapping + self.byt5_in = ByT5Mapper(in_dim=1472, out_dim=2048, hidden_dim=2048, out_dim1=self.hidden_size, use_residual=False) + + # Image latent patch embedding + self.img_in = PatchEmbed2D(self.patch_size, self.in_channels, self.hidden_size) + + # Text token refinement with cross-attention + self.txt_in = SingleTokenRefiner(text_states_dim, self.hidden_size, self.heads_num, depth=2, attn_mode=self.attn_mode) + + # Timestep embedding for diffusion process + self.time_in = 
TimestepEmbedder(self.hidden_size, nn.SiLU) + + # MeanFlow not supported in this implementation + self.time_r_in = None + + # Guidance embedding (disabled for non-distilled model) + self.guidance_in = TimestepEmbedder(self.hidden_size, nn.SiLU) if guidance_embed else None + + # Double-stream blocks: separate image and text processing + self.double_blocks = nn.ModuleList( + [ + MMDoubleStreamBlock( + self.hidden_size, + self.heads_num, + mlp_width_ratio=mlp_width_ratio, + mlp_act_type=mlp_act_type, + qk_norm=qk_norm, + qk_norm_type=qk_norm_type, + qkv_bias=qkv_bias, + attn_mode=self.attn_mode, + ) + for _ in range(mm_double_blocks_depth) + ] + ) + + # Single-stream blocks: joint processing of concatenated features + self.single_blocks = nn.ModuleList( + [ + MMSingleStreamBlock( + self.hidden_size, + self.heads_num, + mlp_width_ratio=mlp_width_ratio, + mlp_act_type=mlp_act_type, + qk_norm=qk_norm, + qk_norm_type=qk_norm_type, + attn_mode=self.attn_mode, + ) + for _ in range(mm_single_blocks_depth) + ] + ) + + self.final_layer = FinalLayer(self.hidden_size, self.patch_size, self.out_channels, nn.SiLU) + + def get_rotary_pos_embed(self, rope_sizes): + """ + Generate 2D rotary position embeddings for image tokens. + + Args: + rope_sizes: Tuple of (height, width) for spatial dimensions. + + Returns: + Tuple of (freqs_cos, freqs_sin) tensors for rotary position encoding. + """ + freqs_cos, freqs_sin = get_nd_rotary_pos_embed(self.rope_dim_list, rope_sizes, theta=self.rope_theta) + return freqs_cos, freqs_sin + + def reorder_txt_token( + self, byt5_txt: torch.Tensor, txt: torch.Tensor, byt5_text_mask: torch.Tensor, text_mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, list[int]]: + """ + Combine and reorder ByT5 character-level and word-level text embeddings. + + Concatenates valid tokens from both encoders and creates appropriate masks. + + Args: + byt5_txt: ByT5 character-level embeddings [B, L1, D]. + txt: Word-level text embeddings [B, L2, D]. + byt5_text_mask: Valid token mask for ByT5 [B, L1]. + text_mask: Valid token mask for word tokens [B, L2]. + + Returns: + Tuple of (reordered_embeddings, combined_mask, sequence_lengths). 
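+ Example: if a sample has 3 valid ByT5 tokens (padded to 5) and 2 valid word tokens (padded to 4),
+ the reordered sequence is [byt5 valid | word valid | byt5 pad | word pad], the combined mask marks
+ the first 3 + 2 = 5 positions, and its entry in sequence_lengths is 5.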
+ """ + # Process each batch element separately to handle variable sequence lengths + + reorder_txt = [] + reorder_mask = [] + + txt_lens = [] + for i in range(text_mask.shape[0]): + byt5_text_mask_i = byt5_text_mask[i].bool() + text_mask_i = text_mask[i].bool() + byt5_text_length = byt5_text_mask_i.sum() + text_length = text_mask_i.sum() + assert byt5_text_length == byt5_text_mask_i[:byt5_text_length].sum() + assert text_length == text_mask_i[:text_length].sum() + + byt5_txt_i = byt5_txt[i] + txt_i = txt[i] + reorder_txt_i = torch.cat( + [byt5_txt_i[:byt5_text_length], txt_i[:text_length], byt5_txt_i[byt5_text_length:], txt_i[text_length:]], dim=0 + ) + + reorder_mask_i = torch.zeros( + byt5_text_mask_i.shape[0] + text_mask_i.shape[0], dtype=torch.bool, device=byt5_text_mask_i.device + ) + reorder_mask_i[: byt5_text_length + text_length] = True + + reorder_txt.append(reorder_txt_i) + reorder_mask.append(reorder_mask_i) + txt_lens.append(byt5_text_length + text_length) + + reorder_txt = torch.stack(reorder_txt) + reorder_mask = torch.stack(reorder_mask).to(dtype=torch.int64) + + return reorder_txt, reorder_mask, txt_lens + + def forward( + self, + hidden_states: torch.Tensor, + timestep: torch.LongTensor, + text_states: torch.Tensor, + encoder_attention_mask: torch.Tensor, + byt5_text_states: Optional[torch.Tensor] = None, + byt5_text_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Forward pass through the HunyuanImage diffusion transformer. + + Args: + hidden_states: Input image latents [B, C, H, W]. + timestep: Diffusion timestep [B]. + text_states: Word-level text embeddings [B, L, D]. + encoder_attention_mask: Text attention mask [B, L]. + byt5_text_states: ByT5 character-level embeddings [B, L_byt5, D_byt5]. + byt5_text_mask: ByT5 attention mask [B, L_byt5]. + + Returns: + Tuple of (denoised_image, spatial_shape). 
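+ Note: this implementation returns only the denoised image tensor [B, C, H, W]; since
+ patch_size is [1, 1], the spatial shape is identical to that of the input latents.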
+ """ + img = x = hidden_states + text_mask = encoder_attention_mask + t = timestep + txt = text_states + + # Calculate spatial dimensions for rotary position embeddings + _, _, oh, ow = x.shape + th, tw = oh, ow # Height and width (patch_size=[1,1] means no spatial downsampling) + freqs_cis = self.get_rotary_pos_embed((th, tw)) + + # Reshape image latents to sequence format: [B, C, H, W] -> [B, H*W, C] + img = self.img_in(img) + + # Generate timestep conditioning vector + vec = self.time_in(t) + + # MeanFlow and guidance embedding not used in this configuration + + # Process text tokens through refinement layers + txt_lens = text_mask.to(torch.bool).sum(dim=1).tolist() + txt = self.txt_in(txt, t, txt_lens) + + # Integrate character-level ByT5 features with word-level tokens + # Use variable length sequences with sequence lengths + byt5_txt = self.byt5_in(byt5_text_states) + txt, _, txt_lens = self.reorder_txt_token(byt5_txt, txt, byt5_text_mask, text_mask) + + # Trim sequences to maximum length in the batch + img_seq_len = img.shape[1] + # print(f"img_seq_len: {img_seq_len}, txt_lens: {txt_lens}") + seq_lens = [img_seq_len + l for l in txt_lens] + max_txt_len = max(txt_lens) + # print(f"max_txt_len: {max_txt_len}, seq_lens: {seq_lens}, txt.shape: {txt.shape}") + txt = txt[:, :max_txt_len, :] + txt_seq_len = txt.shape[1] + + # Process through double-stream blocks (separate image/text attention) + for index, block in enumerate(self.double_blocks): + img, txt = block(img, txt, vec, freqs_cis, seq_lens) + + # Concatenate image and text tokens for joint processing + x = torch.cat((img, txt), 1) + + # Process through single-stream blocks (joint attention) + for index, block in enumerate(self.single_blocks): + x = block(x, vec, txt_seq_len, freqs_cis, seq_lens) + + img = x[:, :img_seq_len, ...] + + # Apply final projection to output space + img = self.final_layer(img, vec) + + # Reshape from sequence to spatial format: [B, L, C] -> [B, C, H, W] + img = self.unpatchify_2d(img, th, tw) + return img + + def unpatchify_2d(self, x, h, w): + """ + Convert sequence format back to spatial image format. + + Args: + x: Input tensor [B, H*W, C]. + h: Height dimension. + w: Width dimension. + + Returns: + Spatial tensor [B, C, H, W]. + """ + c = self.unpatchify_channels + + x = x.reshape(shape=(x.shape[0], h, w, c)) + imgs = x.permute(0, 3, 1, 2) + return imgs + + +# endregion + +# region Model Utils + + +def create_model(attn_mode: str, split_attn: bool, dtype: Optional[torch.dtype]) -> HYImageDiffusionTransformer: + with init_empty_weights(): + model = HYImageDiffusionTransformer(attn_mode=attn_mode) + if dtype is not None: + model.to(dtype) + return model + + +def load_hunyuan_image_model( + device: Union[str, torch.device], + dit_path: str, + attn_mode: str, + split_attn: bool, + loading_device: Union[str, torch.device], + dit_weight_dtype: Optional[torch.dtype], + fp8_scaled: bool = False, + lora_weights_list: Optional[Dict[str, torch.Tensor]] = None, + lora_multipliers: Optional[list[float]] = None, +) -> HYImageDiffusionTransformer: + """ + Load a HunyuanImage model from the specified checkpoint. + + Args: + device (Union[str, torch.device]): Device for optimization or merging + dit_path (str): Path to the DiT model checkpoint. + attn_mode (str): Attention mode to use, e.g., "torch", "flash", etc. + split_attn (bool): Whether to use split attention. + loading_device (Union[str, torch.device]): Device to load the model weights on. 
+ dit_weight_dtype (Optional[torch.dtype]): Data type of the DiT weights. + If None, it will be loaded as is (same as the state_dict) or scaled for fp8. if not None, model weights will be casted to this dtype. + fp8_scaled (bool): Whether to use fp8 scaling for the model weights. + lora_weights_list (Optional[Dict[str, torch.Tensor]]): LoRA weights to apply, if any. + lora_multipliers (Optional[List[float]]): LoRA multipliers for the weights, if any. + """ + # dit_weight_dtype is None for fp8_scaled + assert (not fp8_scaled and dit_weight_dtype is not None) or (fp8_scaled and dit_weight_dtype is None) + + device = torch.device(device) + loading_device = torch.device(loading_device) + + model = create_model(attn_mode, split_attn, dit_weight_dtype) + + # load model weights with dynamic fp8 optimization and LoRA merging if needed + logger.info(f"Loading DiT model from {dit_path}, device={loading_device}") + + sd = load_safetensors_with_lora_and_fp8( + model_files=dit_path, + lora_weights_list=lora_weights_list, + lora_multipliers=lora_multipliers, + fp8_optimization=fp8_scaled, + calc_device=device, + move_to_device=(loading_device == device), + dit_weight_dtype=dit_weight_dtype, + target_keys=FP8_OPTIMIZATION_TARGET_KEYS, + exclude_keys=FP8_OPTIMIZATION_EXCLUDE_KEYS, + ) + + if fp8_scaled: + apply_fp8_monkey_patch(model, sd, use_scaled_mm=False) + + if loading_device.type != "cpu": + # make sure all the model weights are on the loading_device + logger.info(f"Moving weights to {loading_device}") + for key in sd.keys(): + sd[key] = sd[key].to(loading_device) + + info = model.load_state_dict(sd, strict=True, assign=True) + logger.info(f"Loaded DiT model from {dit_path}, info={info}") + + return model + + +# endregion diff --git a/library/hunyuan_image_modules.py b/library/hunyuan_image_modules.py new file mode 100644 index 000000000..b4ded4c53 --- /dev/null +++ b/library/hunyuan_image_modules.py @@ -0,0 +1,804 @@ +# Original work: https://github.com/Tencent-Hunyuan/HunyuanImage-2.1 +# Re-implemented for license compliance for sd-scripts. + +from typing import Tuple, Callable +import torch +import torch.nn as nn +from einops import rearrange + +from library.attention import attention +from library.hunyuan_image_utils import timestep_embedding, apply_rotary_emb, _to_tuple, apply_gate, modulate +from library.attention import attention + +# region Modules + + +class ByT5Mapper(nn.Module): + """ + Maps ByT5 character-level encoder outputs to transformer hidden space. + + Applies layer normalization, two MLP layers with GELU activation, + and optional residual connection. + + Args: + in_dim: Input dimension from ByT5 encoder (1472 for ByT5-large). + out_dim: Intermediate dimension after first projection. + hidden_dim: Hidden dimension for MLP layer. + out_dim1: Final output dimension matching transformer hidden size. + use_residual: Whether to add residual connection (requires in_dim == out_dim). + """ + + def __init__(self, in_dim, out_dim, hidden_dim, out_dim1, use_residual=True): + super().__init__() + if use_residual: + assert in_dim == out_dim + self.layernorm = nn.LayerNorm(in_dim) + self.fc1 = nn.Linear(in_dim, hidden_dim) + self.fc2 = nn.Linear(hidden_dim, out_dim) + self.fc3 = nn.Linear(out_dim, out_dim1) + self.use_residual = use_residual + self.act_fn = nn.GELU() + + def forward(self, x): + """ + Transform ByT5 embeddings to transformer space. + + Args: + x: Input ByT5 embeddings [..., in_dim]. + + Returns: + Transformed embeddings [..., out_dim1]. 
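+ For HunyuanImage-2.1 this maps 1472-dim ByT5 features to the 3584-dim transformer space
+ via 1472 -> 2048 -> 2048 -> 3584, with use_residual=False.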
+ """ + residual = x + x = self.layernorm(x) + x = self.fc1(x) + x = self.act_fn(x) + x = self.fc2(x) + x = self.act_fn(x) + x = self.fc3(x) + if self.use_residual: + x = x + residual + return x + + +class PatchEmbed2D(nn.Module): + """ + 2D patch embedding layer for converting image latents to transformer tokens. + + Uses 2D convolution to project image patches to embedding space. + For HunyuanImage-2.1, patch_size=[1,1] means no spatial downsampling. + + Args: + patch_size: Spatial size of patches (int or tuple). + in_chans: Number of input channels. + embed_dim: Output embedding dimension. + """ + + def __init__(self, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + self.patch_size = tuple(patch_size) + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=True) + self.norm = nn.Identity() # No normalization layer used + + def forward(self, x): + x = self.proj(x) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + return x + + +class TimestepEmbedder(nn.Module): + """ + Embeds scalar diffusion timesteps into vector representations. + + Uses sinusoidal encoding followed by a two-layer MLP. + + Args: + hidden_size: Output embedding dimension. + act_layer: Activation function class (e.g., nn.SiLU). + frequency_embedding_size: Dimension of sinusoidal encoding. + max_period: Maximum period for sinusoidal frequencies. + out_size: Output dimension (defaults to hidden_size). + """ + + def __init__(self, hidden_size, act_layer, frequency_embedding_size=256, max_period=10000, out_size=None): + super().__init__() + self.frequency_embedding_size = frequency_embedding_size + self.max_period = max_period + if out_size is None: + out_size = hidden_size + + self.mlp = nn.Sequential( + nn.Linear(frequency_embedding_size, hidden_size, bias=True), act_layer(), nn.Linear(hidden_size, out_size, bias=True) + ) + + def forward(self, t): + t_freq = timestep_embedding(t, self.frequency_embedding_size, self.max_period).type(self.mlp[0].weight.dtype) + return self.mlp(t_freq) + + +class TextProjection(nn.Module): + """ + Projects text embeddings through a two-layer MLP. + + Used for context-aware representation computation in token refinement. + + Args: + in_channels: Input feature dimension. + hidden_size: Hidden and output dimension. + act_layer: Activation function class. + """ + + def __init__(self, in_channels, hidden_size, act_layer): + super().__init__() + self.linear_1 = nn.Linear(in_features=in_channels, out_features=hidden_size, bias=True) + self.act_1 = act_layer() + self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True) + + def forward(self, caption): + hidden_states = self.linear_1(caption) + hidden_states = self.act_1(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class MLP(nn.Module): + """ + Multi-layer perceptron with configurable activation and normalization. + + Standard two-layer MLP with optional dropout and intermediate normalization. + + Args: + in_channels: Input feature dimension. + hidden_channels: Hidden layer dimension (defaults to in_channels). + out_features: Output dimension (defaults to in_channels). + act_layer: Activation function class. + norm_layer: Optional normalization layer class. + bias: Whether to use bias (can be bool or tuple for each layer). + drop: Dropout rate (can be float or tuple for each layer). + use_conv: Whether to use convolution instead of linear (not supported). 
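+ The forward pass is fc1 -> activation -> dropout -> (optional norm) -> fc2 -> dropout.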
+ """ + + def __init__( + self, + in_channels, + hidden_channels=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=None, + bias=True, + drop=0.0, + use_conv=False, + ): + super().__init__() + assert not use_conv, "Convolutional MLP not supported in this implementation." + + out_features = out_features or in_channels + hidden_channels = hidden_channels or in_channels + bias = _to_tuple(bias, 2) + drop_probs = _to_tuple(drop, 2) + + self.fc1 = nn.Linear(in_channels, hidden_channels, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.norm = norm_layer(hidden_channels) if norm_layer is not None else nn.Identity() + self.fc2 = nn.Linear(hidden_channels, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.norm(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + +class IndividualTokenRefinerBlock(nn.Module): + """ + Single transformer block for individual token refinement. + + Applies self-attention and MLP with adaptive layer normalization (AdaLN) + conditioned on timestep and context information. + + Args: + hidden_size: Model dimension. + heads_num: Number of attention heads. + mlp_width_ratio: MLP expansion ratio. + mlp_drop_rate: MLP dropout rate. + act_type: Activation function (only "silu" supported). + qk_norm: QK normalization flag (must be False). + qk_norm_type: QK normalization type (only "layer" supported). + qkv_bias: Use bias in QKV projections. + attn_mode: Attention implementation mode. + """ + + def __init__( + self, + hidden_size: int, + heads_num: int, + mlp_width_ratio: float = 4.0, + mlp_drop_rate: float = 0.0, + act_type: str = "silu", + qk_norm: bool = False, + qk_norm_type: str = "layer", + qkv_bias: bool = True, + attn_mode: str = "torch", + ): + super().__init__() + assert qk_norm_type == "layer", "Only layer normalization supported for QK norm." + assert act_type == "silu", "Only SiLU activation supported." + assert not qk_norm, "QK normalization must be disabled." + + self.attn_mode = attn_mode + + self.heads_num = heads_num + mlp_hidden_dim = int(hidden_size * mlp_width_ratio) + + self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6) + self.self_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias) + + self.self_attn_q_norm = nn.Identity() + self.self_attn_k_norm = nn.Identity() + self.self_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias) + + self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6) + self.mlp = MLP(in_channels=hidden_size, hidden_channels=mlp_hidden_dim, act_layer=nn.SiLU, drop=mlp_drop_rate) + + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + nn.Linear(hidden_size, 2 * hidden_size, bias=True), + ) + + def forward( + self, + x: torch.Tensor, + c: torch.Tensor, # Combined timestep and context conditioning + txt_lens: list[int], + ) -> torch.Tensor: + """ + Apply self-attention and MLP with adaptive conditioning. + + Args: + x: Input token embeddings [B, L, C]. + c: Combined conditioning vector [B, C]. + txt_lens: Valid sequence lengths for each batch element. + + Returns: + Refined token embeddings [B, L, C]. 
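+ The conditioning vector c yields two gates (gate_msa, gate_mlp); the self-attention and MLP
+ outputs are scaled by these gates before being added residually to x.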
+ """ + gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1) + norm_x = self.norm1(x) + qkv = self.self_attn_qkv(norm_x) + q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num) + q = self.self_attn_q_norm(q).to(v) + k = self.self_attn_k_norm(k).to(v) + attn = attention(q, k, v, seq_lens=txt_lens, attn_mode=self.attn_mode) + + x = x + apply_gate(self.self_attn_proj(attn), gate_msa) + x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp) + return x + + +class IndividualTokenRefiner(nn.Module): + """ + Stack of token refinement blocks with self-attention. + + Processes tokens individually with adaptive layer normalization. + + Args: + hidden_size: Model dimension. + heads_num: Number of attention heads. + depth: Number of refinement blocks. + mlp_width_ratio: MLP expansion ratio. + mlp_drop_rate: MLP dropout rate. + act_type: Activation function type. + qk_norm: QK normalization flag. + qk_norm_type: QK normalization type. + qkv_bias: Use bias in QKV projections. + attn_mode: Attention implementation mode. + """ + + def __init__( + self, + hidden_size: int, + heads_num: int, + depth: int, + mlp_width_ratio: float = 4.0, + mlp_drop_rate: float = 0.0, + act_type: str = "silu", + qk_norm: bool = False, + qk_norm_type: str = "layer", + qkv_bias: bool = True, + attn_mode: str = "torch", + ): + super().__init__() + self.blocks = nn.ModuleList( + [ + IndividualTokenRefinerBlock( + hidden_size=hidden_size, + heads_num=heads_num, + mlp_width_ratio=mlp_width_ratio, + mlp_drop_rate=mlp_drop_rate, + act_type=act_type, + qk_norm=qk_norm, + qk_norm_type=qk_norm_type, + qkv_bias=qkv_bias, + attn_mode=attn_mode, + ) + for _ in range(depth) + ] + ) + + def forward(self, x: torch.Tensor, c: torch.LongTensor, txt_lens: list[int]) -> torch.Tensor: + """ + Apply sequential token refinement. + + Args: + x: Input token embeddings [B, L, C]. + c: Combined conditioning vector [B, C]. + txt_lens: Valid sequence lengths for each batch element. + + Returns: + Refined token embeddings [B, L, C]. + """ + for block in self.blocks: + x = block(x, c, txt_lens) + return x + + +class SingleTokenRefiner(nn.Module): + """ + Text embedding refinement with timestep and context conditioning. + + Projects input text embeddings and applies self-attention refinement + conditioned on diffusion timestep and aggregate text context. + + Args: + in_channels: Input text embedding dimension. + hidden_size: Transformer hidden dimension. + heads_num: Number of attention heads. + depth: Number of refinement blocks. + attn_mode: Attention implementation mode. 
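+ The conditioning vector used by the refinement blocks is the sum of the timestep embedding
+ and a projection of the mean of each sample's valid text tokens (see forward).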
+ """ + + def __init__(self, in_channels: int, hidden_size: int, heads_num: int, depth: int, attn_mode: str = "torch"): + # Fixed architecture parameters for HunyuanImage-2.1 + mlp_drop_rate: float = 0.0 # No MLP dropout + act_type: str = "silu" # SiLU activation + mlp_width_ratio: float = 4.0 # 4x MLP expansion + qk_norm: bool = False # No QK normalization + qk_norm_type: str = "layer" # Layer norm type (unused) + qkv_bias: bool = True # Use QKV bias + + super().__init__() + self.input_embedder = nn.Linear(in_channels, hidden_size, bias=True) + act_layer = nn.SiLU + self.t_embedder = TimestepEmbedder(hidden_size, act_layer) + self.c_embedder = TextProjection(in_channels, hidden_size, act_layer) + self.individual_token_refiner = IndividualTokenRefiner( + hidden_size=hidden_size, + heads_num=heads_num, + depth=depth, + mlp_width_ratio=mlp_width_ratio, + mlp_drop_rate=mlp_drop_rate, + act_type=act_type, + qk_norm=qk_norm, + qk_norm_type=qk_norm_type, + qkv_bias=qkv_bias, + attn_mode=attn_mode, + ) + + def forward(self, x: torch.Tensor, t: torch.LongTensor, txt_lens: list[int]) -> torch.Tensor: + """ + Refine text embeddings with timestep conditioning. + + Args: + x: Input text embeddings [B, L, in_channels]. + t: Diffusion timestep [B]. + txt_lens: Valid sequence lengths for each batch element. + + Returns: + Refined embeddings [B, L, hidden_size]. + """ + timestep_aware_representations = self.t_embedder(t) + + # Compute context-aware representations by averaging valid tokens + context_aware_representations = torch.stack([x[i, : txt_lens[i]].mean(dim=0) for i in range(x.shape[0])], dim=0) # [B, C] + + context_aware_representations = self.c_embedder(context_aware_representations) + c = timestep_aware_representations + context_aware_representations + x = self.input_embedder(x) + x = self.individual_token_refiner(x, c, txt_lens) + return x + + +class FinalLayer(nn.Module): + """ + Final output projection layer with adaptive layer normalization. + + Projects transformer hidden states to output patch space with + timestep-conditioned modulation. + + Args: + hidden_size: Input hidden dimension. + patch_size: Spatial patch size for output reshaping. + out_channels: Number of output channels. + act_layer: Activation function class. + """ + + def __init__(self, hidden_size, patch_size, out_channels, act_layer): + super().__init__() + + # Layer normalization without learnable parameters + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + out_size = (patch_size[0] * patch_size[1]) * out_channels + self.linear = nn.Linear(hidden_size, out_size, bias=True) + + # Adaptive layer normalization modulation + self.adaLN_modulation = nn.Sequential( + act_layer(), + nn.Linear(hidden_size, 2 * hidden_size, bias=True), + ) + + def forward(self, x, c): + shift, scale = self.adaLN_modulation(c).chunk(2, dim=1) + x = modulate(self.norm_final(x), shift=shift, scale=scale) + x = self.linear(x) + return x + + +class RMSNorm(nn.Module): + """ + Root Mean Square Layer Normalization. + + Normalizes input using RMS and applies learnable scaling. + More efficient than LayerNorm as it doesn't compute mean. + + Args: + dim: Input feature dimension. + eps: Small value for numerical stability. + """ + + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + """ + Apply RMS normalization. + + Args: + x: Input tensor. + + Returns: + RMS normalized tensor. 
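+ Computes x / sqrt(mean(x**2, dim=-1, keepdim=True) + eps).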
+ """ + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def reset_parameters(self): + self.weight.fill_(1) + + def forward(self, x): + """ + Apply RMSNorm with learnable scaling. + + Args: + x: Input tensor. + + Returns: + Normalized and scaled tensor. + """ + output = self._norm(x.float()).type_as(x) + output = output * self.weight + return output + + +# kept for reference, not used in current implementation +# class LinearWarpforSingle(nn.Module): +# """ +# Linear layer wrapper for concatenating and projecting two inputs. + +# Used in single-stream blocks to combine attention output with MLP features. + +# Args: +# in_dim: Input dimension (sum of both input feature dimensions). +# out_dim: Output dimension. +# bias: Whether to use bias in linear projection. +# """ + +# def __init__(self, in_dim: int, out_dim: int, bias=False): +# super().__init__() +# self.fc = nn.Linear(in_dim, out_dim, bias=bias) + +# def forward(self, x, y): +# """Concatenate inputs along feature dimension and project.""" +# x = torch.cat([x.contiguous(), y.contiguous()], dim=2).contiguous() +# return self.fc(x) + + +class ModulateDiT(nn.Module): + """ + Timestep conditioning modulation layer. + + Projects timestep embeddings to multiple modulation parameters + for adaptive layer normalization. + + Args: + hidden_size: Input conditioning dimension. + factor: Number of modulation parameters to generate. + act_layer: Activation function class. + """ + + def __init__(self, hidden_size: int, factor: int, act_layer: Callable): + super().__init__() + self.act = act_layer() + self.linear = nn.Linear(hidden_size, factor * hidden_size, bias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear(self.act(x)) + + +class MMDoubleStreamBlock(nn.Module): + """ + Multimodal double-stream transformer block. + + Processes image and text tokens separately with cross-modal attention. + Each stream has its own normalization and MLP layers but shares + attention computation for cross-modal interaction. + + Args: + hidden_size: Model dimension. + heads_num: Number of attention heads. + mlp_width_ratio: MLP expansion ratio. + mlp_act_type: MLP activation function (only "gelu_tanh" supported). + qk_norm: QK normalization flag (must be True). + qk_norm_type: QK normalization type (only "rms" supported). + qkv_bias: Use bias in QKV projections. + attn_mode: Attention implementation mode. + """ + + def __init__( + self, + hidden_size: int, + heads_num: int, + mlp_width_ratio: float, + mlp_act_type: str = "gelu_tanh", + qk_norm: bool = True, + qk_norm_type: str = "rms", + qkv_bias: bool = False, + attn_mode: str = "torch", + ): + super().__init__() + + assert mlp_act_type == "gelu_tanh", "Only GELU-tanh activation supported." + assert qk_norm_type == "rms", "Only RMS normalization supported." + assert qk_norm, "QK normalization must be enabled." 
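+ # For HunyuanImage-2.1: hidden_size=3584 and heads_num=28 give head_dim=128, and
+ # mlp_width_ratio=4 gives mlp_hidden_dim=14336 for each stream's MLP.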
+ + self.attn_mode = attn_mode + self.heads_num = heads_num + head_dim = hidden_size // heads_num + mlp_hidden_dim = int(hidden_size * mlp_width_ratio) + + # Image stream processing components + self.img_mod = ModulateDiT(hidden_size, factor=6, act_layer=nn.SiLU) + self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + + self.img_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias) + + self.img_attn_q_norm = RMSNorm(head_dim, eps=1e-6) + self.img_attn_k_norm = RMSNorm(head_dim, eps=1e-6) + self.img_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias) + + self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_mlp = MLP(hidden_size, mlp_hidden_dim, act_layer=lambda: nn.GELU(approximate="tanh"), bias=True) + + # Text stream processing components + self.txt_mod = ModulateDiT(hidden_size, factor=6, act_layer=nn.SiLU) + self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + + self.txt_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias) + self.txt_attn_q_norm = RMSNorm(head_dim, eps=1e-6) + self.txt_attn_k_norm = RMSNorm(head_dim, eps=1e-6) + self.txt_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias) + + self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_mlp = MLP(hidden_size, mlp_hidden_dim, act_layer=lambda: nn.GELU(approximate="tanh"), bias=True) + + def forward( + self, img: torch.Tensor, txt: torch.Tensor, vec: torch.Tensor, freqs_cis: tuple = None, seq_lens: list[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Extract modulation parameters for image and text streams + (img_mod1_shift, img_mod1_scale, img_mod1_gate, img_mod2_shift, img_mod2_scale, img_mod2_gate) = self.img_mod(vec).chunk( + 6, dim=-1 + ) + (txt_mod1_shift, txt_mod1_scale, txt_mod1_gate, txt_mod2_shift, txt_mod2_scale, txt_mod2_gate) = self.txt_mod(vec).chunk( + 6, dim=-1 + ) + + # Process image stream for attention + img_modulated = self.img_norm1(img) + img_modulated = modulate(img_modulated, shift=img_mod1_shift, scale=img_mod1_scale) + + img_qkv = self.img_attn_qkv(img_modulated) + img_q, img_k, img_v = img_qkv.chunk(3, dim=-1) + del img_qkv + + img_q = rearrange(img_q, "B L (H D) -> B L H D", H=self.heads_num) + img_k = rearrange(img_k, "B L (H D) -> B L H D", H=self.heads_num) + img_v = rearrange(img_v, "B L (H D) -> B L H D", H=self.heads_num) + + # Apply QK-Norm if enabled + img_q = self.img_attn_q_norm(img_q).to(img_v) + img_k = self.img_attn_k_norm(img_k).to(img_v) + + # Apply rotary position embeddings to image tokens + if freqs_cis is not None: + img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False) + assert ( + img_qq.shape == img_q.shape and img_kk.shape == img_k.shape + ), f"RoPE output shape mismatch: got {img_qq.shape}, {img_kk.shape}, expected {img_q.shape}, {img_k.shape}" + img_q, img_k = img_qq, img_kk + + # Process text stream for attention + txt_modulated = self.txt_norm1(txt) + txt_modulated = modulate(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale) + + txt_qkv = self.txt_attn_qkv(txt_modulated) + txt_q, txt_k, txt_v = txt_qkv.chunk(3, dim=-1) + del txt_qkv + + txt_q = rearrange(txt_q, "B L (H D) -> B L H D", H=self.heads_num) + txt_k = rearrange(txt_k, "B L (H D) -> B L H D", H=self.heads_num) + txt_v = rearrange(txt_v, "B L (H D) -> B L H D", H=self.heads_num) + + # Apply QK-Norm if enabled + txt_q = self.txt_attn_q_norm(txt_q).to(txt_v) + txt_k = self.txt_attn_k_norm(txt_k).to(txt_v) 
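+ # Note: rotary position embeddings were applied to the image q/k above; the text q/k enter
+ # the joint attention without RoPE.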
+ + # Concatenate image and text tokens for joint attention + q = torch.cat([img_q, txt_q], dim=1) + k = torch.cat([img_k, txt_k], dim=1) + v = torch.cat([img_v, txt_v], dim=1) + attn = attention(q, k, v, seq_lens=seq_lens, attn_mode=self.attn_mode) + + # Split attention outputs back to separate streams + img_attn, txt_attn = (attn[:, : img_q.shape[1]].contiguous(), attn[:, img_q.shape[1] :].contiguous()) + + # Apply attention projection and residual connection for image stream + img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate) + + # Apply MLP and residual connection for image stream + img = img + apply_gate( + self.img_mlp(modulate(self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale)), + gate=img_mod2_gate, + ) + + # Apply attention projection and residual connection for text stream + txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate) + + # Apply MLP and residual connection for text stream + txt = txt + apply_gate( + self.txt_mlp(modulate(self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale)), + gate=txt_mod2_gate, + ) + + return img, txt + + +class MMSingleStreamBlock(nn.Module): + """ + Multimodal single-stream transformer block. + + Processes concatenated image and text tokens jointly with shared attention. + Uses parallel linear layers for efficiency and applies RoPE only to image tokens. + + Args: + hidden_size: Model dimension. + heads_num: Number of attention heads. + mlp_width_ratio: MLP expansion ratio. + mlp_act_type: MLP activation function (only "gelu_tanh" supported). + qk_norm: QK normalization flag (must be True). + qk_norm_type: QK normalization type (only "rms" supported). + qk_scale: Attention scaling factor (computed automatically if None). + attn_mode: Attention implementation mode. + """ + + def __init__( + self, + hidden_size: int, + heads_num: int, + mlp_width_ratio: float = 4.0, + mlp_act_type: str = "gelu_tanh", + qk_norm: bool = True, + qk_norm_type: str = "rms", + qk_scale: float = None, + attn_mode: str = "torch", + ): + super().__init__() + + assert mlp_act_type == "gelu_tanh", "Only GELU-tanh activation supported." + assert qk_norm_type == "rms", "Only RMS normalization supported." + assert qk_norm, "QK normalization must be enabled." 
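+ # linear1 fuses the QKV projection with the MLP up-projection (3 * hidden_size + mlp_hidden_dim
+ # output features); linear2 fuses the attention output projection with the MLP down-projection.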
+ + self.attn_mode = attn_mode + self.hidden_size = hidden_size + self.heads_num = heads_num + head_dim = hidden_size // heads_num + mlp_hidden_dim = int(hidden_size * mlp_width_ratio) + self.mlp_hidden_dim = mlp_hidden_dim + self.scale = qk_scale or head_dim**-0.5 + + # Parallel linear projections for efficiency + self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim) + + # Combined output projection + # self.linear2 = LinearWarpforSingle(hidden_size + mlp_hidden_dim, hidden_size, bias=True) # for reference + self.linear2 = nn.Linear(hidden_size + mlp_hidden_dim, hidden_size, bias=True) + + # QK normalization layers + self.q_norm = RMSNorm(head_dim, eps=1e-6) + self.k_norm = RMSNorm(head_dim, eps=1e-6) + + self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + + self.mlp_act = nn.GELU(approximate="tanh") + self.modulation = ModulateDiT(hidden_size, factor=3, act_layer=nn.SiLU) + + def forward( + self, + x: torch.Tensor, + vec: torch.Tensor, + txt_len: int, + freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None, + seq_lens: list[int] = None, + ) -> torch.Tensor: + # Extract modulation parameters + mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1) + x_mod = modulate(self.pre_norm(x), shift=mod_shift, scale=mod_scale) + + # Compute Q, K, V, and MLP input + qkv_mlp = self.linear1(x_mod) + q, k, v, mlp = qkv_mlp.split([self.hidden_size, self.hidden_size, self.hidden_size, self.mlp_hidden_dim], dim=-1) + del qkv_mlp + + q = rearrange(q, "B L (H D) -> B L H D", H=self.heads_num) + k = rearrange(k, "B L (H D) -> B L H D", H=self.heads_num) + v = rearrange(v, "B L (H D) -> B L H D", H=self.heads_num) + + # Apply QK-Norm if enabled + q = self.q_norm(q).to(v) + k = self.k_norm(k).to(v) + + # Separate image and text tokens + img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :] + img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :] + img_v, txt_v = v[:, :-txt_len, :, :], v[:, -txt_len:, :, :] + + # Apply rotary position embeddings only to image tokens + img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False) + assert ( + img_qq.shape == img_q.shape and img_kk.shape == img_k.shape + ), f"RoPE output shape mismatch: got {img_qq.shape}, {img_kk.shape}, expected {img_q.shape}, {img_k.shape}" + img_q, img_k = img_qq, img_kk + + # Recombine and compute joint attention + q = torch.cat([img_q, txt_q], dim=1) + k = torch.cat([img_k, txt_k], dim=1) + v = torch.cat([img_v, txt_v], dim=1) + attn = attention(q, k, v, seq_lens=seq_lens, attn_mode=self.attn_mode) + + # Combine attention and MLP outputs, apply gating + # output = self.linear2(attn, self.mlp_act(mlp)) + + mlp = self.mlp_act(mlp) + output = torch.cat([attn, mlp], dim=2).contiguous() + output = self.linear2(output) + + return x + apply_gate(output, gate=mod_gate) + + +# endregion diff --git a/library/hunyuan_image_text_encoder.py b/library/hunyuan_image_text_encoder.py new file mode 100644 index 000000000..85bdaa43e --- /dev/null +++ b/library/hunyuan_image_text_encoder.py @@ -0,0 +1,649 @@ +import json +import re +from typing import Tuple, Optional, Union +import torch +from transformers import ( + AutoTokenizer, + Qwen2_5_VLConfig, + Qwen2_5_VLForConditionalGeneration, + Qwen2Tokenizer, + T5ForConditionalGeneration, + T5Config, + T5Tokenizer, +) +from transformers.models.t5.modeling_t5 import T5Stack +from accelerate import init_empty_weights + +from library import model_util +from library.utils import load_safetensors, setup_logging + +setup_logging() 
+import logging + +logger = logging.getLogger(__name__) + + +BYT5_TOKENIZER_PATH = "google/byt5-small" +QWEN_2_5_VL_IMAGE_ID ="Qwen/Qwen2.5-VL-7B-Instruct" + + +# Copy from Glyph-SDXL-V2 + +COLOR_IDX_JSON = """{"white": 0, "black": 1, "darkslategray": 2, "dimgray": 3, "darkolivegreen": 4, "midnightblue": 5, "saddlebrown": 6, "sienna": 7, "whitesmoke": 8, "darkslateblue": 9, +"indianred": 10, "linen": 11, "maroon": 12, "khaki": 13, "sandybrown": 14, "gray": 15, "gainsboro": 16, "teal": 17, "peru": 18, "gold": 19, +"snow": 20, "firebrick": 21, "crimson": 22, "chocolate": 23, "tomato": 24, "brown": 25, "goldenrod": 26, "antiquewhite": 27, "rosybrown": 28, "steelblue": 29, +"floralwhite": 30, "seashell": 31, "darkgreen": 32, "oldlace": 33, "darkkhaki": 34, "burlywood": 35, "red": 36, "darkgray": 37, "orange": 38, "royalblue": 39, +"seagreen": 40, "lightgray": 41, "tan": 42, "coral": 43, "beige": 44, "palevioletred": 45, "wheat": 46, "lavender": 47, "darkcyan": 48, "slateblue": 49, +"slategray": 50, "orangered": 51, "silver": 52, "olivedrab": 53, "forestgreen": 54, "darkgoldenrod": 55, "ivory": 56, "darkorange": 57, "yellow": 58, "hotpink": 59, +"ghostwhite": 60, "lightcoral": 61, "indigo": 62, "bisque": 63, "darkred": 64, "darksalmon": 65, "lightslategray": 66, "dodgerblue": 67, "lightpink": 68, "mistyrose": 69, +"mediumvioletred": 70, "cadetblue": 71, "deeppink": 72, "salmon": 73, "palegoldenrod": 74, "blanchedalmond": 75, "lightseagreen": 76, "cornflowerblue": 77, "yellowgreen": 78, "greenyellow": 79, +"navajowhite": 80, "papayawhip": 81, "mediumslateblue": 82, "purple": 83, "blueviolet": 84, "pink": 85, "cornsilk": 86, "lightsalmon": 87, "mediumpurple": 88, "moccasin": 89, +"turquoise": 90, "mediumseagreen": 91, "lavenderblush": 92, "mediumblue": 93, "darkseagreen": 94, "mediumturquoise": 95, "paleturquoise": 96, "skyblue": 97, "lemonchiffon": 98, "olive": 99, +"peachpuff": 100, "lightyellow": 101, "lightsteelblue": 102, "mediumorchid": 103, "plum": 104, "darkturquoise": 105, "aliceblue": 106, "mediumaquamarine": 107, "orchid": 108, "powderblue": 109, +"blue": 110, "darkorchid": 111, "violet": 112, "lightskyblue": 113, "lightcyan": 114, "lightgoldenrodyellow": 115, "navy": 116, "thistle": 117, "honeydew": 118, "mintcream": 119, +"lightblue": 120, "darkblue": 121, "darkmagenta": 122, "deepskyblue": 123, "magenta": 124, "limegreen": 125, "darkviolet": 126, "cyan": 127, "palegreen": 128, "aquamarine": 129, +"lawngreen": 130, "lightgreen": 131, "azure": 132, "chartreuse": 133, "green": 134, "mediumspringgreen": 135, "lime": 136, "springgreen": 137}""" + +MULTILINGUAL_10_LANG_IDX_JSON = """{"en-Montserrat-Regular": 0, "en-Poppins-Italic": 1, "en-GlacialIndifference-Regular": 2, "en-OpenSans-ExtraBoldItalic": 3, "en-Montserrat-Bold": 4, "en-Now-Regular": 5, "en-Garet-Regular": 6, "en-LeagueSpartan-Bold": 7, "en-DMSans-Regular": 8, "en-OpenSauceOne-Regular": 9, +"en-OpenSans-ExtraBold": 10, "en-KGPrimaryPenmanship": 11, "en-Anton-Regular": 12, "en-Aileron-BlackItalic": 13, "en-Quicksand-Light": 14, "en-Roboto-BoldItalic": 15, "en-TheSeasons-It": 16, "en-Kollektif": 17, "en-Inter-BoldItalic": 18, "en-Poppins-Medium": 19, +"en-Poppins-Light": 20, "en-RoxboroughCF-RegularItalic": 21, "en-PlayfairDisplay-SemiBold": 22, "en-Agrandir-Italic": 23, "en-Lato-Regular": 24, "en-MoreSugarRegular": 25, "en-CanvaSans-RegularItalic": 26, "en-PublicSans-Italic": 27, "en-CodePro-NormalLC": 28, "en-Belleza-Regular": 29, +"en-JosefinSans-Bold": 30, "en-HKGrotesk-Bold": 31, "en-Telegraf-Medium": 32, 
"en-BrittanySignatureRegular": 33, "en-Raleway-ExtraBoldItalic": 34, "en-Mont-RegularItalic": 35, "en-Arimo-BoldItalic": 36, "en-Lora-Italic": 37, "en-ArchivoBlack-Regular": 38, "en-Poppins": 39, +"en-Barlow-Black": 40, "en-CormorantGaramond-Bold": 41, "en-LibreBaskerville-Regular": 42, "en-CanvaSchoolFontRegular": 43, "en-BebasNeueBold": 44, "en-LazydogRegular": 45, "en-FredokaOne-Regular": 46, "en-Horizon-Bold": 47, "en-Nourd-Regular": 48, "en-Hatton-Regular": 49, +"en-Nunito-ExtraBoldItalic": 50, "en-CerebriSans-Regular": 51, "en-Montserrat-Light": 52, "en-TenorSans": 53, "en-Norwester-Regular": 54, "en-ClearSans-Bold": 55, "en-Cardo-Regular": 56, "en-Alice-Regular": 57, "en-Oswald-Regular": 58, "en-Gaegu-Bold": 59, +"en-Muli-Black": 60, "en-TAN-PEARL-Regular": 61, "en-CooperHewitt-Book": 62, "en-Agrandir-Grand": 63, "en-BlackMango-Thin": 64, "en-DMSerifDisplay-Regular": 65, "en-Antonio-Bold": 66, "en-Sniglet-Regular": 67, "en-BeVietnam-Regular": 68, "en-NunitoSans10pt-BlackItalic": 69, +"en-AbhayaLibre-ExtraBold": 70, "en-Rubik-Regular": 71, "en-PPNeueMachina-Regular": 72, "en-TAN - MON CHERI-Regular": 73, "en-Jua-Regular": 74, "en-Playlist-Script": 75, "en-SourceSansPro-BoldItalic": 76, "en-MoonTime-Regular": 77, "en-Eczar-ExtraBold": 78, "en-Gatwick-Regular": 79, +"en-MonumentExtended-Regular": 80, "en-BarlowSemiCondensed-Regular": 81, "en-BarlowCondensed-Regular": 82, "en-Alegreya-Regular": 83, "en-DreamAvenue": 84, "en-RobotoCondensed-Italic": 85, "en-BobbyJones-Regular": 86, "en-Garet-ExtraBold": 87, "en-YesevaOne-Regular": 88, "en-Dosis-ExtraBold": 89, +"en-LeagueGothic-Regular": 90, "en-OpenSans-Italic": 91, "en-TANAEGEAN-Regular": 92, "en-Maharlika-Regular": 93, "en-MarykateRegular": 94, "en-Cinzel-Regular": 95, "en-Agrandir-Wide": 96, "en-Chewy-Regular": 97, "en-BodoniFLF-BoldItalic": 98, "en-Nunito-BlackItalic": 99, +"en-LilitaOne": 100, "en-HandyCasualCondensed-Regular": 101, "en-Ovo": 102, "en-Livvic-Regular": 103, "en-Agrandir-Narrow": 104, "en-CrimsonPro-Italic": 105, "en-AnonymousPro-Bold": 106, "en-NF-OneLittleFont-Bold": 107, "en-RedHatDisplay-BoldItalic": 108, "en-CodecPro-Regular": 109, +"en-HalimunRegular": 110, "en-LibreFranklin-Black": 111, "en-TeXGyreTermes-BoldItalic": 112, "en-Shrikhand-Regular": 113, "en-TTNormsPro-Italic": 114, "en-Gagalin-Regular": 115, "en-OpenSans-Bold": 116, "en-GreatVibes-Regular": 117, "en-Breathing": 118, "en-HeroLight-Regular": 119, +"en-KGPrimaryDots": 120, "en-Quicksand-Bold": 121, "en-Brice-ExtraLightSemiExpanded": 122, "en-Lato-BoldItalic": 123, "en-Fraunces9pt-Italic": 124, "en-AbrilFatface-Regular": 125, "en-BerkshireSwash-Regular": 126, "en-Atma-Bold": 127, "en-HolidayRegular": 128, "en-BebasNeueCyrillic": 129, +"en-IntroRust-Base": 130, "en-Gistesy": 131, "en-BDScript-Regular": 132, "en-ApricotsRegular": 133, "en-Prompt-Black": 134, "en-TAN MERINGUE": 135, "en-Sukar Regular": 136, "en-GentySans-Regular": 137, "en-NeueEinstellung-Normal": 138, "en-Garet-Bold": 139, +"en-FiraSans-Black": 140, "en-BantayogLight": 141, "en-NotoSerifDisplay-Black": 142, "en-TTChocolates-Regular": 143, "en-Ubuntu-Regular": 144, "en-Assistant-Bold": 145, "en-ABeeZee-Regular": 146, "en-LexendDeca-Regular": 147, "en-KingredSerif": 148, "en-Radley-Regular": 149, +"en-BrownSugar": 150, "en-MigraItalic-ExtraboldItalic": 151, "en-ChildosArabic-Regular": 152, "en-PeaceSans": 153, "en-LondrinaSolid-Black": 154, "en-SpaceMono-BoldItalic": 155, "en-RobotoMono-Light": 156, "en-CourierPrime-Regular": 157, "en-Alata-Regular": 158, "en-Amsterdam-One": 159, 
+"en-IreneFlorentina-Regular": 160, "en-CatchyMager": 161, "en-Alta_regular": 162, "en-ArticulatCF-Regular": 163, "en-Raleway-Regular": 164, "en-BrasikaDisplay": 165, "en-TANAngleton-Italic": 166, "en-NotoSerifDisplay-ExtraCondensedItalic": 167, "en-Bryndan Write": 168, "en-TTCommonsPro-It": 169, +"en-AlexBrush-Regular": 170, "en-Antic-Regular": 171, "en-TTHoves-Bold": 172, "en-DroidSerif": 173, "en-AblationRegular": 174, "en-Marcellus-Regular": 175, "en-Sanchez-Italic": 176, "en-JosefinSans": 177, "en-Afrah-Regular": 178, "en-PinyonScript": 179, +"en-TTInterphases-BoldItalic": 180, "en-Yellowtail-Regular": 181, "en-Gliker-Regular": 182, "en-BobbyJonesSoft-Regular": 183, "en-IBMPlexSans": 184, "en-Amsterdam-Three": 185, "en-Amsterdam-FourSlant": 186, "en-TTFors-Regular": 187, "en-Quattrocento": 188, "en-Sifonn-Basic": 189, +"en-AlegreyaSans-Black": 190, "en-Daydream": 191, "en-AristotelicaProTx-Rg": 192, "en-NotoSerif": 193, "en-EBGaramond-Italic": 194, "en-HammersmithOne-Regular": 195, "en-RobotoSlab-Regular": 196, "en-DO-Sans-Regular": 197, "en-KGPrimaryDotsLined": 198, "en-Blinker-Regular": 199, +"en-TAN NIMBUS": 200, "en-Blueberry-Regular": 201, "en-Rosario-Regular": 202, "en-Forum": 203, "en-MistrullyRegular": 204, "en-SourceSerifPro-Regular": 205, "en-Bugaki-Regular": 206, "en-CMUSerif-Roman": 207, "en-GulfsDisplay-NormalItalic": 208, "en-PTSans-Bold": 209, +"en-Sensei-Medium": 210, "en-SquadaOne-Regular": 211, "en-Arapey-Italic": 212, "en-Parisienne-Regular": 213, "en-Aleo-Italic": 214, "en-QuicheDisplay-Italic": 215, "en-RocaOne-It": 216, "en-Funtastic-Regular": 217, "en-PTSerif-BoldItalic": 218, "en-Muller-RegularItalic": 219, +"en-ArgentCF-Regular": 220, "en-Brightwall-Italic": 221, "en-Knewave-Regular": 222, "en-TYSerif-D": 223, "en-Agrandir-Tight": 224, "en-AlfaSlabOne-Regular": 225, "en-TANTangkiwood-Display": 226, "en-Kief-Montaser-Regular": 227, "en-Gotham-Book": 228, "en-JuliusSansOne-Regular": 229, +"en-CocoGothic-Italic": 230, "en-SairaCondensed-Regular": 231, "en-DellaRespira-Regular": 232, "en-Questrial-Regular": 233, "en-BukhariScript-Regular": 234, "en-HelveticaWorld-Bold": 235, "en-TANKINDRED-Display": 236, "en-CinzelDecorative-Regular": 237, "en-Vidaloka-Regular": 238, "en-AlegreyaSansSC-Black": 239, +"en-FeelingPassionate-Regular": 240, "en-QuincyCF-Regular": 241, "en-FiraCode-Regular": 242, "en-Genty-Regular": 243, "en-Nickainley-Normal": 244, "en-RubikOne-Regular": 245, "en-Gidole-Regular": 246, "en-Borsok": 247, "en-Gordita-RegularItalic": 248, "en-Scripter-Regular": 249, +"en-Buffalo-Regular": 250, "en-KleinText-Regular": 251, "en-Creepster-Regular": 252, "en-Arvo-Bold": 253, "en-GabrielSans-NormalItalic": 254, "en-Heebo-Black": 255, "en-LexendExa-Regular": 256, "en-BrixtonSansTC-Regular": 257, "en-GildaDisplay-Regular": 258, "en-ChunkFive-Roman": 259, +"en-Amaranth-BoldItalic": 260, "en-BubbleboddyNeue-Regular": 261, "en-MavenPro-Bold": 262, "en-TTDrugs-Italic": 263, "en-CyGrotesk-KeyRegular": 264, "en-VarelaRound-Regular": 265, "en-Ruda-Black": 266, "en-SafiraMarch": 267, "en-BloggerSans": 268, "en-TANHEADLINE-Regular": 269, +"en-SloopScriptPro-Regular": 270, "en-NeueMontreal-Regular": 271, "en-Schoolbell-Regular": 272, "en-SigherRegular": 273, "en-InriaSerif-Regular": 274, "en-JetBrainsMono-Regular": 275, "en-MADEEvolveSans": 276, "en-Dekko": 277, "en-Handyman-Regular": 278, "en-Aileron-BoldItalic": 279, +"en-Bright-Italic": 280, "en-Solway-Regular": 281, "en-Higuen-Regular": 282, "en-WedgesItalic": 283, "en-TANASHFORD-BOLD": 284, "en-IBMPlexMono": 285, 
"en-RacingSansOne-Regular": 286, "en-RegularBrush": 287, "en-OpenSans-LightItalic": 288, "en-SpecialElite-Regular": 289, +"en-FuturaLTPro-Medium": 290, "en-MaragsaDisplay": 291, "en-BigShouldersDisplay-Regular": 292, "en-BDSans-Regular": 293, "en-RasputinRegular": 294, "en-Yvesyvesdrawing-BoldItalic": 295, "en-Bitter-Regular": 296, "en-LuckiestGuy-Regular": 297, "en-CanvaSchoolFontDotted": 298, "en-TTFirsNeue-Italic": 299, +"en-Sunday-Regular": 300, "en-HKGothic-MediumItalic": 301, "en-CaveatBrush-Regular": 302, "en-HeliosExt": 303, "en-ArchitectsDaughter-Regular": 304, "en-Angelina": 305, "en-Calistoga-Regular": 306, "en-ArchivoNarrow-Regular": 307, "en-ObjectSans-MediumSlanted": 308, "en-AyrLucidityCondensed-Regular": 309, +"en-Nexa-RegularItalic": 310, "en-Lustria-Regular": 311, "en-Amsterdam-TwoSlant": 312, "en-Virtual-Regular": 313, "en-Brusher-Regular": 314, "en-NF-Lepetitcochon-Regular": 315, "en-TANTWINKLE": 316, "en-LeJour-Serif": 317, "en-Prata-Regular": 318, "en-PPWoodland-Regular": 319, +"en-PlayfairDisplay-BoldItalic": 320, "en-AmaticSC-Regular": 321, "en-Cabin-Regular": 322, "en-Manjari-Bold": 323, "en-MrDafoe-Regular": 324, "en-TTRamillas-Italic": 325, "en-Luckybones-Bold": 326, "en-DarkerGrotesque-Light": 327, "en-BellabooRegular": 328, "en-CormorantSC-Bold": 329, +"en-GochiHand-Regular": 330, "en-Atteron": 331, "en-RocaTwo-Lt": 332, "en-ZCOOLXiaoWei-Regular": 333, "en-TANSONGBIRD": 334, "en-HeadingNow-74Regular": 335, "en-Luthier-BoldItalic": 336, "en-Oregano-Regular": 337, "en-AyrTropikaIsland-Int": 338, "en-Mali-Regular": 339, +"en-DidactGothic-Regular": 340, "en-Lovelace-Regular": 341, "en-BakerieSmooth-Regular": 342, "en-CarterOne": 343, "en-HussarBd": 344, "en-OldStandard-Italic": 345, "en-TAN-ASTORIA-Display": 346, "en-rugratssans-Regular": 347, "en-BMHANNA": 348, "en-BetterSaturday": 349, +"en-AdigianaToybox": 350, "en-Sailors": 351, "en-PlayfairDisplaySC-Italic": 352, "en-Etna-Regular": 353, "en-Revive80Signature": 354, "en-CAGenerated": 355, "en-Poppins-Regular": 356, "en-Jonathan-Regular": 357, "en-Pacifico-Regular": 358, "en-Saira-Black": 359, +"en-Loubag-Regular": 360, "en-Decalotype-Black": 361, "en-Mansalva-Regular": 362, "en-Allura-Regular": 363, "en-ProximaNova-Bold": 364, "en-TANMIGNON-DISPLAY": 365, "en-ArsenicaAntiqua-Regular": 366, "en-BreulGroteskA-RegularItalic": 367, "en-HKModular-Bold": 368, "en-TANNightingale-Regular": 369, +"en-AristotelicaProCndTxt-Rg": 370, "en-Aprila-Regular": 371, "en-Tomorrow-Regular": 372, "en-AngellaWhite": 373, "en-KaushanScript-Regular": 374, "en-NotoSans": 375, "en-LeJour-Script": 376, "en-BrixtonTC-Regular": 377, "en-OleoScript-Regular": 378, "en-Cakerolli-Regular": 379, +"en-Lobster-Regular": 380, "en-FrunchySerif-Regular": 381, "en-PorcelainRegular": 382, "en-AlojaExtended": 383, "en-SergioTrendy-Italic": 384, "en-LovelaceText-Bold": 385, "en-Anaktoria": 386, "en-JimmyScript-Light": 387, "en-IBMPlexSerif": 388, "en-Marta": 389, +"en-Mango-Regular": 390, "en-Overpass-Italic": 391, "en-Hagrid-Regular": 392, "en-ElikaGorica": 393, "en-Amiko-Regular": 394, "en-EFCOBrookshire-Regular": 395, "en-Caladea-Regular": 396, "en-MoonlightBold": 397, "en-Staatliches-Regular": 398, "en-Helios-Bold": 399, +"en-Satisfy-Regular": 400, "en-NexaScript-Regular": 401, "en-Trocchi-Regular": 402, "en-March": 403, "en-IbarraRealNova-Regular": 404, "en-Nectarine-Regular": 405, "en-Overpass-Light": 406, "en-TruetypewriterPolyglOTT": 407, "en-Bangers-Regular": 408, "en-Lazord-BoldExpandedItalic": 409, +"en-Chloe-Regular": 410, 
"en-BaskervilleDisplayPT-Regular": 411, "en-Bright-Regular": 412, "en-Vollkorn-Regular": 413, "en-Harmattan": 414, "en-SortsMillGoudy-Regular": 415, "en-Biryani-Bold": 416, "en-SugoProDisplay-Italic": 417, "en-Lazord-BoldItalic": 418, "en-Alike-Regular": 419, +"en-PermanentMarker-Regular": 420, "en-Sacramento-Regular": 421, "en-HKGroteskPro-Italic": 422, "en-Aleo-BoldItalic": 423, "en-Noot": 424, "en-TANGARLAND-Regular": 425, "en-Twister": 426, "en-Arsenal-Italic": 427, "en-Bogart-Italic": 428, "en-BethEllen-Regular": 429, +"en-Caveat-Regular": 430, "en-BalsamiqSans-Bold": 431, "en-BreeSerif-Regular": 432, "en-CodecPro-ExtraBold": 433, "en-Pierson-Light": 434, "en-CyGrotesk-WideRegular": 435, "en-Lumios-Marker": 436, "en-Comfortaa-Bold": 437, "en-TraceFontRegular": 438, "en-RTL-AdamScript-Regular": 439, +"en-EastmanGrotesque-Italic": 440, "en-Kalam-Bold": 441, "en-ChauPhilomeneOne-Regular": 442, "en-Coiny-Regular": 443, "en-Lovera": 444, "en-Gellatio": 445, "en-TitilliumWeb-Bold": 446, "en-OilvareBase-Italic": 447, "en-Catamaran-Black": 448, "en-Anteb-Italic": 449, +"en-SueEllenFrancisco": 450, "en-SweetApricot": 451, "en-BrightSunshine": 452, "en-IM_FELL_Double_Pica_Italic": 453, "en-Granaina-limpia": 454, "en-TANPARFAIT": 455, "en-AcherusGrotesque-Regular": 456, "en-AwesomeLathusca-Italic": 457, "en-Signika-Bold": 458, "en-Andasia": 459, +"en-DO-AllCaps-Slanted": 460, "en-Zenaida-Regular": 461, "en-Fahkwang-Regular": 462, "en-Play-Regular": 463, "en-BERNIERRegular-Regular": 464, "en-PlumaThin-Regular": 465, "en-SportsWorld": 466, "en-Garet-Black": 467, "en-CarolloPlayscript-BlackItalic": 468, "en-Cheque-Regular": 469, +"en-SEGO": 470, "en-BobbyJones-Condensed": 471, "en-NexaSlab-RegularItalic": 472, "en-DancingScript-Regular": 473, "en-PaalalabasDisplayWideBETA": 474, "en-Magnolia-Script": 475, "en-OpunMai-400It": 476, "en-MadelynFill-Regular": 477, "en-ZingRust-Base": 478, "en-FingerPaint-Regular": 479, +"en-BostonAngel-Light": 480, "en-Gliker-RegularExpanded": 481, "en-Ahsing": 482, "en-Engagement-Regular": 483, "en-EyesomeScript": 484, "en-LibraSerifModern-Regular": 485, "en-London-Regular": 486, "en-AtkinsonHyperlegible-Regular": 487, "en-StadioNow-TextItalic": 488, "en-Aniyah": 489, +"en-ITCAvantGardePro-Bold": 490, "en-Comica-Regular": 491, "en-Coustard-Regular": 492, "en-Brice-BoldCondensed": 493, "en-TANNEWYORK-Bold": 494, "en-TANBUSTER-Bold": 495, "en-Alatsi-Regular": 496, "en-TYSerif-Book": 497, "en-Jingleberry": 498, "en-Rajdhani-Bold": 499, +"en-LobsterTwo-BoldItalic": 500, "en-BestLight-Medium": 501, "en-Hitchcut-Regular": 502, "en-GermaniaOne-Regular": 503, "en-Emitha-Script": 504, "en-LemonTuesday": 505, "en-Cubao_Free_Regular": 506, "en-MonterchiSerif-Regular": 507, "en-AllertaStencil-Regular": 508, "en-RTL-Sondos-Regular": 509, +"en-HomemadeApple-Regular": 510, "en-CosmicOcto-Medium": 511, "cn-HelloFont-FangHuaTi": 0, "cn-HelloFont-ID-DianFangSong-Bold": 1, "cn-HelloFont-ID-DianFangSong": 2, "cn-HelloFont-ID-DianHei-CEJ": 3, "cn-HelloFont-ID-DianHei-DEJ": 4, "cn-HelloFont-ID-DianHei-EEJ": 5, "cn-HelloFont-ID-DianHei-FEJ": 6, "cn-HelloFont-ID-DianHei-GEJ": 7, "cn-HelloFont-ID-DianKai-Bold": 8, "cn-HelloFont-ID-DianKai": 9, +"cn-HelloFont-WenYiHei": 10, "cn-Hellofont-ID-ChenYanXingKai": 11, "cn-Hellofont-ID-DaZiBao": 12, "cn-Hellofont-ID-DaoCaoRen": 13, "cn-Hellofont-ID-JianSong": 14, "cn-Hellofont-ID-JiangHuZhaoPaiHei": 15, "cn-Hellofont-ID-KeSong": 16, "cn-Hellofont-ID-LeYuanTi": 17, "cn-Hellofont-ID-Pinocchio": 18, "cn-Hellofont-ID-QiMiaoTi": 19, 
+"cn-Hellofont-ID-QingHuaKai": 20, "cn-Hellofont-ID-QingHuaXingKai": 21, "cn-Hellofont-ID-ShanShuiXingKai": 22, "cn-Hellofont-ID-ShouXieQiShu": 23, "cn-Hellofont-ID-ShouXieTongZhenTi": 24, "cn-Hellofont-ID-TengLingTi": 25, "cn-Hellofont-ID-XiaoLiShu": 26, "cn-Hellofont-ID-XuanZhenSong": 27, "cn-Hellofont-ID-ZhongLingXingKai": 28, "cn-HellofontIDJiaoTangTi": 29, +"cn-HellofontIDJiuZhuTi": 30, "cn-HuXiaoBao-SaoBao": 31, "cn-HuXiaoBo-NanShen": 32, "cn-HuXiaoBo-ZhenShuai": 33, "cn-SourceHanSansSC-Bold": 34, "cn-SourceHanSansSC-ExtraLight": 35, "cn-SourceHanSansSC-Heavy": 36, "cn-SourceHanSansSC-Light": 37, "cn-SourceHanSansSC-Medium": 38, "cn-SourceHanSansSC-Normal": 39, +"cn-SourceHanSansSC-Regular": 40, "cn-SourceHanSerifSC-Bold": 41, "cn-SourceHanSerifSC-ExtraLight": 42, "cn-SourceHanSerifSC-Heavy": 43, "cn-SourceHanSerifSC-Light": 44, "cn-SourceHanSerifSC-Medium": 45, "cn-SourceHanSerifSC-Regular": 46, "cn-SourceHanSerifSC-SemiBold": 47, "cn-xiaowei": 48, "cn-AaJianHaoTi": 49, +"cn-AlibabaPuHuiTi-Bold": 50, "cn-AlibabaPuHuiTi-Heavy": 51, "cn-AlibabaPuHuiTi-Light": 52, "cn-AlibabaPuHuiTi-Medium": 53, "cn-AlibabaPuHuiTi-Regular": 54, "cn-CanvaAcidBoldSC": 55, "cn-CanvaBreezeCN": 56, "cn-CanvaBumperCropSC": 57, "cn-CanvaCakeShopCN": 58, "cn-CanvaEndeavorBlackSC": 59, +"cn-CanvaJoyHeiCN": 60, "cn-CanvaLiCN": 61, "cn-CanvaOrientalBrushCN": 62, "cn-CanvaPoster": 63, "cn-CanvaQinfuCalligraphyCN": 64, "cn-CanvaSweetHeartCN": 65, "cn-CanvaSwordLikeDreamCN": 66, "cn-CanvaTangyuanHandwritingCN": 67, "cn-CanvaWanderWorldCN": 68, "cn-CanvaWenCN": 69, +"cn-DianZiChunYi": 70, "cn-GenSekiGothicTW-H": 71, "cn-GenWanMinTW-L": 72, "cn-GenYoMinTW-B": 73, "cn-GenYoMinTW-EL": 74, "cn-GenYoMinTW-H": 75, "cn-GenYoMinTW-M": 76, "cn-GenYoMinTW-R": 77, "cn-GenYoMinTW-SB": 78, "cn-HYQiHei-AZEJ": 79, +"cn-HYQiHei-EES": 80, "cn-HanaMinA": 81, "cn-HappyZcool-2016": 82, "cn-HelloFont ZJ KeKouKeAiTi": 83, "cn-HelloFont-ID-BoBoTi": 84, "cn-HelloFont-ID-FuGuHei-25": 85, "cn-HelloFont-ID-FuGuHei-35": 86, "cn-HelloFont-ID-FuGuHei-45": 87, "cn-HelloFont-ID-FuGuHei-55": 88, "cn-HelloFont-ID-FuGuHei-65": 89, +"cn-HelloFont-ID-FuGuHei-75": 90, "cn-HelloFont-ID-FuGuHei-85": 91, "cn-HelloFont-ID-HeiKa": 92, "cn-HelloFont-ID-HeiTang": 93, "cn-HelloFont-ID-JianSong-95": 94, "cn-HelloFont-ID-JueJiangHei-50": 95, "cn-HelloFont-ID-JueJiangHei-55": 96, "cn-HelloFont-ID-JueJiangHei-60": 97, "cn-HelloFont-ID-JueJiangHei-65": 98, "cn-HelloFont-ID-JueJiangHei-70": 99, +"cn-HelloFont-ID-JueJiangHei-75": 100, "cn-HelloFont-ID-JueJiangHei-80": 101, "cn-HelloFont-ID-KuHeiTi": 102, "cn-HelloFont-ID-LingDongTi": 103, "cn-HelloFont-ID-LingLiTi": 104, "cn-HelloFont-ID-MuFengTi": 105, "cn-HelloFont-ID-NaiNaiJiangTi": 106, "cn-HelloFont-ID-PangDu": 107, "cn-HelloFont-ID-ReLieTi": 108, "cn-HelloFont-ID-RouRun": 109, +"cn-HelloFont-ID-SaShuangShouXieTi": 110, "cn-HelloFont-ID-WangZheFengFan": 111, "cn-HelloFont-ID-YouQiTi": 112, "cn-Hellofont-ID-XiaLeTi": 113, "cn-Hellofont-ID-XianXiaTi": 114, "cn-HuXiaoBoKuHei": 115, "cn-IDDanMoXingKai": 116, "cn-IDJueJiangHei": 117, "cn-IDMeiLingTi": 118, "cn-IDQQSugar": 119, +"cn-LiuJianMaoCao-Regular": 120, "cn-LongCang-Regular": 121, "cn-MaShanZheng-Regular": 122, "cn-PangMenZhengDao-3": 123, "cn-PangMenZhengDao-Cu": 124, "cn-PangMenZhengDao": 125, "cn-SentyCaramel": 126, "cn-SourceHanSerifSC": 127, "cn-WenCang-Regular": 128, "cn-WenQuanYiMicroHei": 129, +"cn-XianErTi": 130, "cn-YRDZSTJF": 131, "cn-YS-HelloFont-BangBangTi": 132, "cn-ZCOOLKuaiLe-Regular": 133, "cn-ZCOOLQingKeHuangYou-Regular": 134, 
"cn-ZCOOLXiaoWei-Regular": 135, "cn-ZCOOL_KuHei": 136, "cn-ZhiMangXing-Regular": 137, "cn-baotuxiaobaiti": 138, "cn-jiangxizhuokai-Regular": 139, +"cn-zcool-gdh": 140, "cn-zcoolqingkehuangyouti-Regular": 141, "cn-zcoolwenyiti": 142, "jp-04KanjyukuGothic": 0, "jp-07LightNovelPOP": 1, "jp-07NikumaruFont": 2, "jp-07YasashisaAntique": 3, "jp-07YasashisaGothic": 4, "jp-BokutachinoGothic2Bold": 5, "jp-BokutachinoGothic2Regular": 6, "jp-CHI_SpeedyRight_full_211128-Regular": 7, "jp-CHI_SpeedyRight_italic_full_211127-Regular": 8, "jp-CP-Font": 9, +"jp-Canva_CezanneProN-B": 10, "jp-Canva_CezanneProN-M": 11, "jp-Canva_ChiaroStd-B": 12, "jp-Canva_CometStd-B": 13, "jp-Canva_DotMincho16Std-M": 14, "jp-Canva_GrecoStd-B": 15, "jp-Canva_GrecoStd-M": 16, "jp-Canva_LyraStd-DB": 17, "jp-Canva_MatisseHatsuhiPro-B": 18, "jp-Canva_MatisseHatsuhiPro-M": 19, +"jp-Canva_ModeMinAStd-B": 20, "jp-Canva_NewCezanneProN-B": 21, "jp-Canva_NewCezanneProN-M": 22, "jp-Canva_PearlStd-L": 23, "jp-Canva_RaglanStd-UB": 24, "jp-Canva_RailwayStd-B": 25, "jp-Canva_ReggaeStd-B": 26, "jp-Canva_RocknRollStd-DB": 27, "jp-Canva_RodinCattleyaPro-B": 28, "jp-Canva_RodinCattleyaPro-M": 29, +"jp-Canva_RodinCattleyaPro-UB": 30, "jp-Canva_RodinHimawariPro-B": 31, "jp-Canva_RodinHimawariPro-M": 32, "jp-Canva_RodinMariaPro-B": 33, "jp-Canva_RodinMariaPro-DB": 34, "jp-Canva_RodinProN-M": 35, "jp-Canva_ShadowTLStd-B": 36, "jp-Canva_StickStd-B": 37, "jp-Canva_TsukuAOldMinPr6N-B": 38, "jp-Canva_TsukuAOldMinPr6N-R": 39, +"jp-Canva_UtrilloPro-DB": 40, "jp-Canva_UtrilloPro-M": 41, "jp-Canva_YurukaStd-UB": 42, "jp-FGUIGEN": 43, "jp-GlowSansJ-Condensed-Heavy": 44, "jp-GlowSansJ-Condensed-Light": 45, "jp-GlowSansJ-Normal-Bold": 46, "jp-GlowSansJ-Normal-Light": 47, "jp-HannariMincho": 48, "jp-HarenosoraMincho": 49, +"jp-Jiyucho": 50, "jp-Kaiso-Makina-B": 51, "jp-Kaisotai-Next-UP-B": 52, "jp-KokoroMinchoutai": 53, "jp-Mamelon-3-Hi-Regular": 54, "jp-MotoyaAnemoneStd-W1": 55, "jp-MotoyaAnemoneStd-W5": 56, "jp-MotoyaAnticPro-W3": 57, "jp-MotoyaCedarStd-W3": 58, "jp-MotoyaCedarStd-W5": 59, +"jp-MotoyaGochikaStd-W4": 60, "jp-MotoyaGochikaStd-W8": 61, "jp-MotoyaGothicMiyabiStd-W6": 62, "jp-MotoyaGothicStd-W3": 63, "jp-MotoyaGothicStd-W5": 64, "jp-MotoyaKoinStd-W3": 65, "jp-MotoyaKyotaiStd-W2": 66, "jp-MotoyaKyotaiStd-W4": 67, "jp-MotoyaMaruStd-W3": 68, "jp-MotoyaMaruStd-W5": 69, +"jp-MotoyaMinchoMiyabiStd-W4": 70, "jp-MotoyaMinchoMiyabiStd-W6": 71, "jp-MotoyaMinchoModernStd-W4": 72, "jp-MotoyaMinchoModernStd-W6": 73, "jp-MotoyaMinchoStd-W3": 74, "jp-MotoyaMinchoStd-W5": 75, "jp-MotoyaReisyoStd-W2": 76, "jp-MotoyaReisyoStd-W6": 77, "jp-MotoyaTohitsuStd-W4": 78, "jp-MotoyaTohitsuStd-W6": 79, +"jp-MtySousyokuEmBcJis-W6": 80, "jp-MtySousyokuLiBcJis-W6": 81, "jp-Mushin": 82, "jp-NotoSansJP-Bold": 83, "jp-NotoSansJP-Regular": 84, "jp-NudMotoyaAporoStd-W3": 85, "jp-NudMotoyaAporoStd-W5": 86, "jp-NudMotoyaCedarStd-W3": 87, "jp-NudMotoyaCedarStd-W5": 88, "jp-NudMotoyaMaruStd-W3": 89, +"jp-NudMotoyaMaruStd-W5": 90, "jp-NudMotoyaMinchoStd-W5": 91, "jp-Ounen-mouhitsu": 92, "jp-Ronde-B-Square": 93, "jp-SMotoyaGyosyoStd-W5": 94, "jp-SMotoyaSinkaiStd-W3": 95, "jp-SMotoyaSinkaiStd-W5": 96, "jp-SourceHanSansJP-Bold": 97, "jp-SourceHanSansJP-Regular": 98, "jp-SourceHanSerifJP-Bold": 99, +"jp-SourceHanSerifJP-Regular": 100, "jp-TazuganeGothicStdN-Bold": 101, "jp-TazuganeGothicStdN-Regular": 102, "jp-TelopMinProN-B": 103, "jp-Togalite-Bold": 104, "jp-Togalite-Regular": 105, "jp-TsukuMinPr6N-E": 106, "jp-TsukuMinPr6N-M": 107, "jp-mikachan_o": 108, "jp-nagayama_kai": 109, 
+"jp-07LogoTypeGothic7": 110, "jp-07TetsubinGothic": 111, "jp-851CHIKARA-DZUYOKU-KANA-A": 112, "jp-ARMinchoJIS-Light": 113, "jp-ARMinchoJIS-Ultra": 114, "jp-ARPCrystalMinchoJIS-Medium": 115, "jp-ARPCrystalRGothicJIS-Medium": 116, "jp-ARShounanShinpitsuGyosyoJIS-Medium": 117, "jp-AozoraMincho-bold": 118, "jp-AozoraMinchoRegular": 119, +"jp-ArialUnicodeMS-Bold": 120, "jp-ArialUnicodeMS": 121, "jp-CanvaBreezeJP": 122, "jp-CanvaLiCN": 123, "jp-CanvaLiJP": 124, "jp-CanvaOrientalBrushCN": 125, "jp-CanvaQinfuCalligraphyJP": 126, "jp-CanvaSweetHeartJP": 127, "jp-CanvaWenJP": 128, "jp-Corporate-Logo-Bold": 129, +"jp-DelaGothicOne-Regular": 130, "jp-GN-Kin-iro_SansSerif": 131, "jp-GN-Koharuiro_Sunray": 132, "jp-GenEiGothicM-B": 133, "jp-GenEiGothicM-R": 134, "jp-GenJyuuGothic-Bold": 135, "jp-GenRyuMinTW-B": 136, "jp-GenRyuMinTW-R": 137, "jp-GenSekiGothicTW-B": 138, "jp-GenSekiGothicTW-R": 139, +"jp-GenSenRoundedTW-B": 140, "jp-GenSenRoundedTW-R": 141, "jp-GenShinGothic-Bold": 142, "jp-GenShinGothic-Normal": 143, "jp-GenWanMinTW-L": 144, "jp-GenYoGothicTW-B": 145, "jp-GenYoGothicTW-R": 146, "jp-GenYoMinTW-B": 147, "jp-GenYoMinTW-R": 148, "jp-HGBouquet": 149, +"jp-HanaMinA": 150, "jp-HanazomeFont": 151, "jp-HinaMincho-Regular": 152, "jp-Honoka-Antique-Maru": 153, "jp-Honoka-Mincho": 154, "jp-HuiFontP": 155, "jp-IPAexMincho": 156, "jp-JK-Gothic-L": 157, "jp-JK-Gothic-M": 158, "jp-JackeyFont": 159, +"jp-KaiseiTokumin-Bold": 160, "jp-KaiseiTokumin-Regular": 161, "jp-Keifont": 162, "jp-KiwiMaru-Regular": 163, "jp-Koku-Mincho-Regular": 164, "jp-MotoyaLMaru-W3-90ms-RKSJ-H": 165, "jp-NewTegomin-Regular": 166, "jp-NicoKaku": 167, "jp-NicoMoji+": 168, "jp-Otsutome_font-Bold": 169, +"jp-PottaOne-Regular": 170, "jp-RampartOne-Regular": 171, "jp-Senobi-Gothic-Bold": 172, "jp-Senobi-Gothic-Regular": 173, "jp-SmartFontUI-Proportional": 174, "jp-SoukouMincho": 175, "jp-TEST_Klee-DB": 176, "jp-TEST_Klee-M": 177, "jp-TEST_UDMincho-B": 178, "jp-TEST_UDMincho-L": 179, +"jp-TT_Akakane-EB": 180, "jp-Tanuki-Permanent-Marker": 181, "jp-TrainOne-Regular": 182, "jp-TsunagiGothic-Black": 183, "jp-Ume-Hy-Gothic": 184, "jp-Ume-P-Mincho": 185, "jp-WenQuanYiMicroHei": 186, "jp-XANO-mincho-U32": 187, "jp-YOzFontM90-Regular": 188, "jp-Yomogi-Regular": 189, +"jp-YujiBoku-Regular": 190, "jp-YujiSyuku-Regular": 191, "jp-ZenKakuGothicNew-Bold": 192, "jp-ZenKakuGothicNew-Regular": 193, "jp-ZenKurenaido-Regular": 194, "jp-ZenMaruGothic-Bold": 195, "jp-ZenMaruGothic-Regular": 196, "jp-darts-font": 197, "jp-irohakakuC-Bold": 198, "jp-irohakakuC-Medium": 199, +"jp-irohakakuC-Regular": 200, "jp-katyou": 201, "jp-mplus-1m-bold": 202, "jp-mplus-1m-regular": 203, "jp-mplus-1p-bold": 204, "jp-mplus-1p-regular": 205, "jp-rounded-mplus-1p-bold": 206, "jp-rounded-mplus-1p-regular": 207, "jp-timemachine-wa": 208, "jp-ttf-GenEiLateMin-Medium": 209, +"jp-uzura_font": 210, "kr-Arita-buri-Bold_OTF": 0, "kr-Arita-buri-HairLine_OTF": 1, "kr-Arita-buri-Light_OTF": 2, "kr-Arita-buri-Medium_OTF": 3, "kr-Arita-buri-SemiBold_OTF": 4, "kr-Canva_YDSunshineL": 5, "kr-Canva_YDSunshineM": 6, "kr-Canva_YoonGulimPro710": 7, "kr-Canva_YoonGulimPro730": 8, "kr-Canva_YoonGulimPro740": 9, +"kr-Canva_YoonGulimPro760": 10, "kr-Canva_YoonGulimPro770": 11, "kr-Canva_YoonGulimPro790": 12, "kr-CreHappB": 13, "kr-CreHappL": 14, "kr-CreHappM": 15, "kr-CreHappS": 16, "kr-OTAuroraB": 17, "kr-OTAuroraL": 18, "kr-OTAuroraR": 19, +"kr-OTDoldamgilB": 20, "kr-OTDoldamgilL": 21, "kr-OTDoldamgilR": 22, "kr-OTHamsterB": 23, "kr-OTHamsterL": 24, "kr-OTHamsterR": 25, "kr-OTHapchangdanB": 26, 
"kr-OTHapchangdanL": 27, "kr-OTHapchangdanR": 28, "kr-OTSupersizeBkBOX": 29, +"kr-SourceHanSansKR-Bold": 30, "kr-SourceHanSansKR-ExtraLight": 31, "kr-SourceHanSansKR-Heavy": 32, "kr-SourceHanSansKR-Light": 33, "kr-SourceHanSansKR-Medium": 34, "kr-SourceHanSansKR-Normal": 35, "kr-SourceHanSansKR-Regular": 36, "kr-SourceHanSansSC-Bold": 37, "kr-SourceHanSansSC-ExtraLight": 38, "kr-SourceHanSansSC-Heavy": 39, +"kr-SourceHanSansSC-Light": 40, "kr-SourceHanSansSC-Medium": 41, "kr-SourceHanSansSC-Normal": 42, "kr-SourceHanSansSC-Regular": 43, "kr-SourceHanSerifSC-Bold": 44, "kr-SourceHanSerifSC-SemiBold": 45, "kr-TDTDBubbleBubbleOTF": 46, "kr-TDTDConfusionOTF": 47, "kr-TDTDCuteAndCuteOTF": 48, "kr-TDTDEggTakOTF": 49, +"kr-TDTDEmotionalLetterOTF": 50, "kr-TDTDGalapagosOTF": 51, "kr-TDTDHappyHourOTF": 52, "kr-TDTDLatteOTF": 53, "kr-TDTDMoonLightOTF": 54, "kr-TDTDParkForestOTF": 55, "kr-TDTDPencilOTF": 56, "kr-TDTDSmileOTF": 57, "kr-TDTDSproutOTF": 58, "kr-TDTDSunshineOTF": 59, +"kr-TDTDWaferOTF": 60, "kr-777Chyaochyureu": 61, "kr-ArialUnicodeMS-Bold": 62, "kr-ArialUnicodeMS": 63, "kr-BMHANNA": 64, "kr-Baekmuk-Dotum": 65, "kr-BagelFatOne-Regular": 66, "kr-CoreBandi": 67, "kr-CoreBandiFace": 68, "kr-CoreBori": 69, +"kr-DoHyeon-Regular": 70, "kr-Dokdo-Regular": 71, "kr-Gaegu-Bold": 72, "kr-Gaegu-Light": 73, "kr-Gaegu-Regular": 74, "kr-GamjaFlower-Regular": 75, "kr-GasoekOne-Regular": 76, "kr-GothicA1-Black": 77, "kr-GothicA1-Bold": 78, "kr-GothicA1-ExtraBold": 79, +"kr-GothicA1-ExtraLight": 80, "kr-GothicA1-Light": 81, "kr-GothicA1-Medium": 82, "kr-GothicA1-Regular": 83, "kr-GothicA1-SemiBold": 84, "kr-GothicA1-Thin": 85, "kr-Gugi-Regular": 86, "kr-HiMelody-Regular": 87, "kr-Jua-Regular": 88, "kr-KirangHaerang-Regular": 89, +"kr-NanumBrush": 90, "kr-NanumPen": 91, "kr-NanumSquareRoundB": 92, "kr-NanumSquareRoundEB": 93, "kr-NanumSquareRoundL": 94, "kr-NanumSquareRoundR": 95, "kr-SeH-CB": 96, "kr-SeH-CBL": 97, "kr-SeH-CEB": 98, "kr-SeH-CL": 99, +"kr-SeH-CM": 100, "kr-SeN-CB": 101, "kr-SeN-CBL": 102, "kr-SeN-CEB": 103, "kr-SeN-CL": 104, "kr-SeN-CM": 105, "kr-Sunflower-Bold": 106, "kr-Sunflower-Light": 107, "kr-Sunflower-Medium": 108, "kr-TTClaytoyR": 109, +"kr-TTDalpangiR": 110, "kr-TTMamablockR": 111, "kr-TTNauidongmuR": 112, "kr-TTOktapbangR": 113, "kr-UhBeeMiMi": 114, "kr-UhBeeMiMiBold": 115, "kr-UhBeeSe_hyun": 116, "kr-UhBeeSe_hyunBold": 117, "kr-UhBeenamsoyoung": 118, "kr-UhBeenamsoyoungBold": 119, +"kr-WenQuanYiMicroHei": 120, "kr-YeonSung-Regular": 121}""" + + +def add_special_token(tokenizer: T5Tokenizer, text_encoder: T5Stack): + """ + Add special tokens for color and font to tokenizer and text encoder. + + Args: + tokenizer: Huggingface tokenizer. + text_encoder: Huggingface T5 encoder. 
+    """
+    idx_font_dict = json.loads(MULTILINGUAL_10_LANG_IDX_JSON)
+    idx_color_dict = json.loads(COLOR_IDX_JSON)
+
+    font_token = [f"<{font_code[:2]}-font-{idx_font_dict[font_code]}>" for font_code in idx_font_dict]
+    color_token = [f"<color-{i}>" for i in range(len(idx_color_dict))]
+    additional_special_tokens = []
+    additional_special_tokens += color_token
+    additional_special_tokens += font_token
+
+    tokenizer.add_tokens(additional_special_tokens, special_tokens=True)
+    # Set mean_resizing=False to avoid PyTorch LAPACK dependency
+    text_encoder.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+
+
+def load_byt5(
+    ckpt_path: str,
+    dtype: Optional[torch.dtype],
+    device: Union[str, torch.device],
+    disable_mmap: bool = False,
+    state_dict: Optional[dict] = None,
+) -> Tuple[T5Tokenizer, T5Stack]:
+    BYT5_CONFIG_JSON = """
+{
+  "_name_or_path": "/home/patrick/t5/byt5-small",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "d_ff": 3584,
+  "d_kv": 64,
+  "d_model": 1472,
+  "decoder_start_token_id": 0,
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "gradient_checkpointing": false,
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 4,
+  "num_heads": 6,
+  "num_layers": 12,
+  "pad_token_id": 0,
+  "relative_attention_num_buckets": 32,
+  "tie_word_embeddings": false,
+  "tokenizer_class": "ByT5Tokenizer",
+  "transformers_version": "4.7.0.dev0",
+  "use_cache": true,
+  "vocab_size": 384
+  }
+"""
+
+    logger.info(f"Loading BYT5 tokenizer from {BYT5_TOKENIZER_PATH}")
+    byt5_tokenizer = AutoTokenizer.from_pretrained(BYT5_TOKENIZER_PATH)
+
+    logger.info("Initializing BYT5 text encoder")
+    config = json.loads(BYT5_CONFIG_JSON)
+    config = T5Config(**config)
+    with init_empty_weights():
+        byt5_text_encoder = T5ForConditionalGeneration._from_config(config).get_encoder()
+
+    add_special_token(byt5_tokenizer, byt5_text_encoder)
+
+    if state_dict is not None:
+        sd = state_dict
+    else:
+        logger.info(f"Loading state dict from {ckpt_path}")
+        sd = load_safetensors(ckpt_path, device, disable_mmap=disable_mmap, dtype=dtype)
+
+    # remove "encoder."
prefix + sd = {k[len("encoder.") :] if k.startswith("encoder.") else k: v for k, v in sd.items()} + sd["embed_tokens.weight"] = sd.pop("shared.weight") + + info = byt5_text_encoder.load_state_dict(sd, strict=True, assign=True) + byt5_text_encoder.to(device) + logger.info(f"BYT5 text encoder loaded with info: {info}") + + return byt5_tokenizer, byt5_text_encoder + + +def load_qwen2_5_vl( + ckpt_path: str, + dtype: Optional[torch.dtype], + device: Union[str, torch.device], + disable_mmap: bool = False, + state_dict: Optional[dict] = None, +) -> tuple[Qwen2Tokenizer, Qwen2_5_VLForConditionalGeneration]: + QWEN2_5_VL_CONFIG_JSON = """ +{ + "architectures": [ + "Qwen2_5_VLForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "image_token_id": 151655, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 128000, + "max_window_layers": 28, + "model_type": "qwen2_5_vl", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "text_config": { + "architectures": [ + "Qwen2_5_VLForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "image_token_id": null, + "initializer_range": 0.02, + "intermediate_size": 18944, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 128000, + "max_window_layers": 28, + "model_type": "qwen2_5_vl_text", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sliding_window": null, + "torch_dtype": "float32", + "use_cache": true, + "use_sliding_window": false, + "video_token_id": null, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + "vocab_size": 152064 + }, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.53.1", + "use_cache": true, + "use_sliding_window": false, + "video_token_id": 151656, + "vision_config": { + "depth": 32, + "fullatt_block_indexes": [ + 7, + 15, + 23, + 31 + ], + "hidden_act": "silu", + "hidden_size": 1280, + "in_channels": 3, + "in_chans": 3, + "initializer_range": 0.02, + "intermediate_size": 3420, + "model_type": "qwen2_5_vl", + "num_heads": 16, + "out_hidden_size": 3584, + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "temporal_patch_size": 2, + "tokens_per_second": 2, + "torch_dtype": "float32", + "window_size": 112 + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + 
"vocab_size": 152064 +} +""" + config = json.loads(QWEN2_5_VL_CONFIG_JSON) + config = Qwen2_5_VLConfig(**config) + with init_empty_weights(): + qwen2_5_vl = Qwen2_5_VLForConditionalGeneration._from_config(config) + + if state_dict is not None: + sd = state_dict + else: + logger.info(f"Loading state dict from {ckpt_path}") + sd = load_safetensors(ckpt_path, device, disable_mmap=disable_mmap, dtype=dtype) + + # convert prefixes + for key in list(sd.keys()): + if key.startswith("model."): + new_key = key.replace("model.", "model.language_model.", 1) + elif key.startswith("visual."): + new_key = key.replace("visual.", "model.visual.", 1) + else: + continue + if key not in sd: + logger.warning(f"Key {key} not found in state dict, skipping.") + continue + sd[new_key] = sd.pop(key) + + info = qwen2_5_vl.load_state_dict(sd, strict=True, assign=True) + logger.info(f"Loaded Qwen2.5-VL: {info}") + qwen2_5_vl.to(device) + + if dtype is not None: + if dtype.itemsize == 1: # fp8 + org_dtype = torch.bfloat16 # model weight is fp8 in loading, but original dtype is bfloat16 + logger.info(f"prepare Qwen2.5-VL for fp8: set to {dtype} from {org_dtype}") + qwen2_5_vl.to(dtype) + + # prepare LLM for fp8 + def prepare_fp8(vl_model: Qwen2_5_VLForConditionalGeneration, target_dtype): + def forward_hook(module): + def forward(hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + module.variance_epsilon) + # return module.weight.to(input_dtype) * hidden_states.to(input_dtype) + return (module.weight.to(torch.float32) * hidden_states.to(torch.float32)).to(input_dtype) + + return forward + + def decoder_forward_hook(module): + def forward( + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + + hidden_states = module.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = module.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + input_dtype = hidden_states.dtype + hidden_states = residual.to(torch.float32) + hidden_states.to(torch.float32) + hidden_states = hidden_states.to(input_dtype) + + # Fully Connected + residual = hidden_states + hidden_states = module.post_attention_layernorm(hidden_states) + hidden_states = module.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + return forward + + for module in vl_model.modules(): + if module.__class__.__name__ in ["Embedding"]: + # print("set", module.__class__.__name__, "to", target_dtype) + module.to(target_dtype) + if module.__class__.__name__ in ["Qwen2RMSNorm"]: + # print("set", module.__class__.__name__, "hooks") + 
module.forward = forward_hook(module) + if module.__class__.__name__ in ["Qwen2_5_VLDecoderLayer"]: + # print("set", module.__class__.__name__, "hooks") + module.forward = decoder_forward_hook(module) + if module.__class__.__name__ in ["Qwen2_5_VisionRotaryEmbedding"]: + # print("set", module.__class__.__name__, "hooks") + module.to(target_dtype) + + prepare_fp8(qwen2_5_vl, org_dtype) + + else: + logger.info(f"Setting Qwen2.5-VL to dtype: {dtype}") + qwen2_5_vl.to(dtype) + + # Load tokenizer + logger.info(f"Loading tokenizer from {QWEN_2_5_VL_IMAGE_ID}") + tokenizer = Qwen2Tokenizer.from_pretrained(QWEN_2_5_VL_IMAGE_ID) + return tokenizer, qwen2_5_vl + + +def get_qwen_prompt_embeds( + tokenizer: Qwen2Tokenizer, vlm: Qwen2_5_VLForConditionalGeneration, prompt: Union[str, list[str]] = None +): + tokenizer_max_length = 1024 + + # HunyuanImage-2.1 does not use "<|im_start|>assistant\n" in the prompt template + prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>" + # \n<|im_start|>assistant\n" + prompt_template_encode_start_idx = 34 + # default_sample_size = 128 + + device = vlm.device + dtype = vlm.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + + template = prompt_template_encode + drop_idx = prompt_template_encode_start_idx + txt = [template.format(e) for e in prompt] + txt_tokens = tokenizer(txt, max_length=tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt").to( + device + ) + + if dtype.itemsize == 1: # fp8 + with torch.no_grad(), torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=True): + encoder_hidden_states = vlm( + input_ids=txt_tokens.input_ids, attention_mask=txt_tokens.attention_mask, output_hidden_states=True + ) + else: + with torch.no_grad(), torch.autocast(device_type=device.type, dtype=dtype, enabled=True): + encoder_hidden_states = vlm( + input_ids=txt_tokens.input_ids, attention_mask=txt_tokens.attention_mask, output_hidden_states=True + ) + hidden_states = encoder_hidden_states.hidden_states[-3] # use the 3rd last layer's hidden states for HunyuanImage-2.1 + if hidden_states.shape[1] > tokenizer_max_length + drop_idx: + logger.warning(f"Hidden states shape {hidden_states.shape} exceeds max length {tokenizer_max_length + drop_idx}") + + # --- Unnecessary complicated processing, keep for reference --- + # split_hidden_states = extract_masked_hidden(hidden_states, txt_tokens.attention_mask) + # split_hidden_states = [e[drop_idx:] for e in split_hidden_states] + # attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states] + # max_seq_len = max([e.size(0) for e in split_hidden_states]) + # prompt_embeds = torch.stack([torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]) + # encoder_attention_mask = torch.stack([torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]) + # ---------------------------------------------------------- + + prompt_embeds = hidden_states[:, drop_idx:, :] + encoder_attention_mask = txt_tokens.attention_mask[:, drop_idx:] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + return prompt_embeds, encoder_attention_mask + + +def format_prompt(texts, styles): + """ + Text "{text}" in {color}, {type}. 
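+
+    The color/style entries are currently ignored, so the output only lists the
+    quoted texts. Illustrative example:
+        format_prompt(["Hello", "World"], [{}, {}])  ->  'Text "Hello". Text "World". '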
+ """ + + prompt = "" + for text, style in zip(texts, styles): + # color and style are always None in official implementation, so we only use text + text_prompt = f'Text "{text}"' + text_prompt += ". " + prompt = prompt + text_prompt + return prompt + + +def get_glyph_prompt_embeds( + tokenizer: T5Tokenizer, text_encoder: T5Stack, prompt: Union[str, list[str]] = None +) -> Tuple[list[bool], torch.Tensor, torch.Tensor]: + byt5_max_length = 128 + if not prompt: + return ( + [False], + torch.zeros((1, byt5_max_length, 1472), device=text_encoder.device), + torch.zeros((1, byt5_max_length), device=text_encoder.device, dtype=torch.int64), + ) + + try: + text_prompt_texts = [] + # pattern_quote_single = r"\'(.*?)\'" + pattern_quote_double = r"\"(.*?)\"" + pattern_quote_chinese_single = r"‘(.*?)’" + pattern_quote_chinese_double = r"“(.*?)”" + + # matches_quote_single = re.findall(pattern_quote_single, prompt) + matches_quote_double = re.findall(pattern_quote_double, prompt) + matches_quote_chinese_single = re.findall(pattern_quote_chinese_single, prompt) + matches_quote_chinese_double = re.findall(pattern_quote_chinese_double, prompt) + + # text_prompt_texts.extend(matches_quote_single) + text_prompt_texts.extend(matches_quote_double) + text_prompt_texts.extend(matches_quote_chinese_single) + text_prompt_texts.extend(matches_quote_chinese_double) + + if not text_prompt_texts: + return ( + [False], + torch.zeros((1, byt5_max_length, 1472), device=text_encoder.device), + torch.zeros((1, byt5_max_length), device=text_encoder.device, dtype=torch.int64), + ) + + text_prompt_style_list = [{"color": None, "font-family": None} for _ in range(len(text_prompt_texts))] + glyph_text_formatted = format_prompt(text_prompt_texts, text_prompt_style_list) + + byt5_text_ids, byt5_text_mask = get_byt5_text_tokens(tokenizer, byt5_max_length, glyph_text_formatted) + + byt5_text_ids = byt5_text_ids.to(device=text_encoder.device) + byt5_text_mask = byt5_text_mask.to(device=text_encoder.device) + + byt5_prompt_embeds = text_encoder(byt5_text_ids, attention_mask=byt5_text_mask.float()) + byt5_emb = byt5_prompt_embeds[0] + + return [True], byt5_emb, byt5_text_mask + + except Exception as e: + logger.warning(f"Warning: Error in glyph encoding, using fallback: {e}") + return ( + [False], + torch.zeros((1, byt5_max_length, 1472), device=text_encoder.device), + torch.zeros((1, byt5_max_length), device=text_encoder.device, dtype=torch.int64), + ) + + +def get_byt5_text_tokens(tokenizer, max_length, text_list): + """ + Get byT5 text tokens. + + Args: + tokenizer: The tokenizer object + max_length: Maximum token length + text_list: List or string of text + + Returns: + Tuple of (byt5_text_ids, byt5_text_mask) + """ + if isinstance(text_list, list): + text_prompt = " ".join(text_list) + else: + text_prompt = text_list + + byt5_text_inputs = tokenizer( + text_prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, return_tensors="pt" + ) + + byt5_text_ids = byt5_text_inputs.input_ids + byt5_text_mask = byt5_text_inputs.attention_mask + + return byt5_text_ids, byt5_text_mask diff --git a/library/hunyuan_image_utils.py b/library/hunyuan_image_utils.py new file mode 100644 index 000000000..17847104a --- /dev/null +++ b/library/hunyuan_image_utils.py @@ -0,0 +1,461 @@ +# Original work: https://github.com/Tencent-Hunyuan/HunyuanImage-2.1 +# Re-implemented for license compliance for sd-scripts. 
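+#
+# Illustrative usage of the RoPE helpers below (shapes and the per-axis dimension
+# split are example assumptions, not fixed by this module):
+#   cos, sin = get_nd_rotary_pos_embed([32, 96], (24, 42))  # per-axis dims sum to head_dim
+#   xq, xk = apply_rotary_emb(xq, xk, (cos, sin))            # xq, xk: [B, 24*42, heads, 128]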
+ +import math +from typing import Tuple, Union, Optional +import torch + + +def _to_tuple(x, dim=2): + """ + Convert int or sequence to tuple of specified dimension. + + Args: + x: Int or sequence to convert. + dim: Target dimension for tuple. + + Returns: + Tuple of length dim. + """ + if isinstance(x, int) or isinstance(x, float): + return (x,) * dim + elif len(x) == dim: + return x + else: + raise ValueError(f"Expected length {dim} or int, but got {x}") + + +def get_meshgrid_nd(start, dim=2): + """ + Generate n-dimensional coordinate meshgrid from 0 to grid_size. + + Creates coordinate grids for each spatial dimension, useful for + generating position embeddings. + + Args: + start: Grid size for each dimension (int or tuple). + dim: Number of spatial dimensions. + + Returns: + Coordinate grid tensor [dim, *grid_size]. + """ + # Convert start to grid sizes + num = _to_tuple(start, dim=dim) + start = (0,) * dim + stop = num + + # Generate coordinate arrays for each dimension + axis_grid = [] + for i in range(dim): + a, b, n = start[i], stop[i], num[i] + g = torch.linspace(a, b, n + 1, dtype=torch.float32)[:n] + axis_grid.append(g) + grid = torch.meshgrid(*axis_grid, indexing="ij") # dim x [W, H, D] + grid = torch.stack(grid, dim=0) # [dim, W, H, D] + + return grid + + +def get_nd_rotary_pos_embed(rope_dim_list, start, theta=10000.0): + """ + Generate n-dimensional rotary position embeddings for spatial tokens. + + Creates RoPE embeddings for multi-dimensional positional encoding, + distributing head dimensions across spatial dimensions. + + Args: + rope_dim_list: Dimensions allocated to each spatial axis (should sum to head_dim). + start: Spatial grid size for each dimension. + theta: Base frequency for RoPE computation. + + Returns: + Tuple of (cos_freqs, sin_freqs) for rotary embedding [H*W, D/2]. + """ + + grid = get_meshgrid_nd(start, dim=len(rope_dim_list)) # [3, W, H, D] / [2, W, H] + + # Generate RoPE embeddings for each spatial dimension + embs = [] + for i in range(len(rope_dim_list)): + emb = get_1d_rotary_pos_embed(rope_dim_list[i], grid[i].reshape(-1), theta) # 2 x [WHD, rope_dim_list[i]] + embs.append(emb) + + cos = torch.cat([emb[0] for emb in embs], dim=1) # (WHD, D/2) + sin = torch.cat([emb[1] for emb in embs], dim=1) # (WHD, D/2) + return cos, sin + + +def get_1d_rotary_pos_embed( + dim: int, pos: Union[torch.FloatTensor, int], theta: float = 10000.0 +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Generate 1D rotary position embeddings. + + Args: + dim: Embedding dimension (must be even). + pos: Position indices [S] or scalar for sequence length. + theta: Base frequency for sinusoidal encoding. + + Returns: + Tuple of (cos_freqs, sin_freqs) tensors [S, D]. + """ + if isinstance(pos, int): + pos = torch.arange(pos).float() + + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) # [D/2] + freqs = torch.outer(pos, freqs) # [S, D/2] + freqs_cos = freqs.cos().repeat_interleave(2, dim=1) # [S, D] + freqs_sin = freqs.sin().repeat_interleave(2, dim=1) # [S, D] + return freqs_cos, freqs_sin + + +def timestep_embedding(t, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings for diffusion models. + + Converts scalar timesteps to high-dimensional embeddings using + sinusoidal encoding at different frequencies. + + Args: + t: Timestep tensor [N]. + dim: Output embedding dimension. + max_period: Maximum period for frequency computation. + + Returns: + Timestep embeddings [N, dim]. 
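+
+    The first half of each embedding holds cos(t * f_i) and the second half
+    sin(t * f_i), with frequencies f_i = max_period ** (-i / (dim // 2)).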
+ """ + half = dim // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(device=t.device) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + +def modulate(x, shift=None, scale=None): + """ + Apply adaptive layer normalization modulation. + + Applies scale and shift transformations for conditioning + in adaptive layer normalization. + + Args: + x: Input tensor to modulate. + shift: Additive shift parameter (optional). + scale: Multiplicative scale parameter (optional). + + Returns: + Modulated tensor x * (1 + scale) + shift. + """ + if scale is None and shift is None: + return x + elif shift is None: + return x * (1 + scale.unsqueeze(1)) + elif scale is None: + return x + shift.unsqueeze(1) + else: + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + + +def apply_gate(x, gate=None, tanh=False): + """ + Apply gating mechanism to tensor. + + Multiplies input by gate values, optionally applying tanh activation. + Used in residual connections for adaptive control. + + Args: + x: Input tensor to gate. + gate: Gating values (optional). + tanh: Whether to apply tanh to gate values. + + Returns: + Gated tensor x * gate (with optional tanh). + """ + if gate is None: + return x + if tanh: + return x * gate.unsqueeze(1).tanh() + else: + return x * gate.unsqueeze(1) + + +def reshape_for_broadcast( + freqs_cis: Tuple[torch.Tensor, torch.Tensor], + x: torch.Tensor, + head_first=False, +): + """ + Reshape RoPE frequency tensors for broadcasting with attention tensors. + + Args: + freqs_cis: Tuple of (cos_freqs, sin_freqs) tensors. + x: Target tensor for broadcasting compatibility. + head_first: Must be False (only supported layout). + + Returns: + Reshaped (cos_freqs, sin_freqs) tensors ready for broadcasting. + """ + assert not head_first, "Only head_first=False layout supported." + assert isinstance(freqs_cis, tuple), "Expected tuple of (cos, sin) frequency tensors." + assert x.ndim > 1, f"x should have at least 2 dimensions, but got {x.ndim}" + + # Validate frequency tensor dimensions match target tensor + assert freqs_cis[0].shape == ( + x.shape[1], + x.shape[-1], + ), f"Frequency tensor shape {freqs_cis[0].shape} incompatible with target shape {x.shape}" + + shape = [d if i == 1 or i == x.ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape) + + +def rotate_half(x): + """ + Rotate half the dimensions for RoPE computation. + + Splits the last dimension in half and applies a 90-degree rotation + by swapping and negating components. + + Args: + x: Input tensor [..., D] where D is even. + + Returns: + Rotated tensor with same shape as input. + """ + x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2] + return torch.stack([-x_imag, x_real], dim=-1).flatten(3) + + +def apply_rotary_emb( + xq: torch.Tensor, xk: torch.Tensor, freqs_cis: Tuple[torch.Tensor, torch.Tensor], head_first: bool = False +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary position embeddings to query and key tensors. + + Args: + xq: Query tensor [B, S, H, D]. + xk: Key tensor [B, S, H, D]. + freqs_cis: Tuple of (cos_freqs, sin_freqs) for rotation. + head_first: Whether head dimension precedes sequence dimension. + + Returns: + Tuple of rotated (query, key) tensors. 
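+
+    Each adjacent channel pair (x_{2i}, x_{2i+1}) is rotated by its angle theta_i
+    (i.e. multiplied by e^{i * theta_i}) via x * cos + rotate_half(x) * sin.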
+ """ + device = xq.device + dtype = xq.dtype + + cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first) + cos, sin = cos.to(device), sin.to(device) + + # Apply rotation: x' = x * cos + rotate_half(x) * sin + xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).to(dtype) + xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).to(dtype) + + return xq_out, xk_out + + +def get_timesteps_sigmas(sampling_steps: int, shift: float, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Generate timesteps and sigmas for diffusion sampling. + + Args: + sampling_steps: Number of sampling steps. + shift: Sigma shift parameter for schedule modification. + device: Target device for tensors. + + Returns: + Tuple of (timesteps, sigmas) tensors. + """ + sigmas = torch.linspace(1, 0, sampling_steps + 1) + sigmas = (shift * sigmas) / (1 + (shift - 1) * sigmas) + sigmas = sigmas.to(torch.float32) + timesteps = (sigmas[:-1] * 1000).to(dtype=torch.float32, device=device) + return timesteps, sigmas + + +def step(latents, noise_pred, sigmas, step_i): + """ + Perform a single diffusion sampling step. + + Args: + latents: Current latent state. + noise_pred: Predicted noise. + sigmas: Noise schedule sigmas. + step_i: Current step index. + + Returns: + Updated latents after the step. + """ + return latents.float() - (sigmas[step_i] - sigmas[step_i + 1]) * noise_pred.float() + + +# region AdaptiveProjectedGuidance + + +class MomentumBuffer: + """ + Exponential moving average buffer for APG momentum. + """ + def __init__(self, momentum: float): + self.momentum = momentum + self.running_average = 0 + + def update(self, update_value: torch.Tensor): + new_average = self.momentum * self.running_average + self.running_average = update_value + new_average + + +def normalized_guidance_apg( + pred_cond: torch.Tensor, + pred_uncond: torch.Tensor, + guidance_scale: float, + momentum_buffer: Optional[MomentumBuffer] = None, + eta: float = 1.0, + norm_threshold: float = 0.0, + use_original_formulation: bool = False, +): + """ + Apply normalized adaptive projected guidance. + + Projects the guidance vector to reduce over-saturation while maintaining + directional control by decomposing into parallel and orthogonal components. + + Args: + pred_cond: Conditional prediction. + pred_uncond: Unconditional prediction. + guidance_scale: Guidance scale factor. + momentum_buffer: Optional momentum buffer for temporal smoothing. + eta: Scaling factor for parallel component. + norm_threshold: Maximum norm for guidance vector clipping. + use_original_formulation: Whether to use original APG formulation. + + Returns: + Guided prediction tensor. 
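+
+    The (optionally momentum-smoothed and norm-clipped) difference
+    pred_cond - pred_uncond is decomposed into components parallel and orthogonal
+    to pred_cond; the applied update is orthogonal + eta * parallel, scaled by
+    guidance_scale.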
+ """ + diff = pred_cond - pred_uncond + dim = [-i for i in range(1, len(diff.shape))] # All dimensions except batch + + # Apply momentum smoothing if available + if momentum_buffer is not None: + momentum_buffer.update(diff) + diff = momentum_buffer.running_average + + # Apply norm clipping if threshold is set + if norm_threshold > 0: + diff_norm = diff.norm(p=2, dim=dim, keepdim=True) + scale_factor = torch.minimum(torch.ones_like(diff_norm), norm_threshold / diff_norm) + diff = diff * scale_factor + + # Project guidance vector into parallel and orthogonal components + v0, v1 = diff.double(), pred_cond.double() + v1 = torch.nn.functional.normalize(v1, dim=dim) + v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1 + v0_orthogonal = v0 - v0_parallel + diff_parallel, diff_orthogonal = v0_parallel.type_as(diff), v0_orthogonal.type_as(diff) + + # Combine components with different scaling + normalized_update = diff_orthogonal + eta * diff_parallel + pred = pred_cond if use_original_formulation else pred_uncond + pred = pred + guidance_scale * normalized_update + + return pred + + +class AdaptiveProjectedGuidance: + """ + Adaptive Projected Guidance for classifier-free guidance. + + Implements APG which projects the guidance vector to reduce over-saturation + while maintaining directional control. + """ + def __init__( + self, + guidance_scale: float = 7.5, + adaptive_projected_guidance_momentum: Optional[float] = None, + adaptive_projected_guidance_rescale: float = 15.0, + eta: float = 0.0, + guidance_rescale: float = 0.0, + use_original_formulation: bool = False, + ): + assert guidance_rescale == 0.0, "guidance_rescale > 0.0 not supported." + + self.guidance_scale = guidance_scale + self.adaptive_projected_guidance_momentum = adaptive_projected_guidance_momentum + self.adaptive_projected_guidance_rescale = adaptive_projected_guidance_rescale + self.eta = eta + self.guidance_rescale = guidance_rescale + self.use_original_formulation = use_original_formulation + self.momentum_buffer = None + + def __call__(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None, step=None) -> torch.Tensor: + if step == 0 and self.adaptive_projected_guidance_momentum is not None: + self.momentum_buffer = MomentumBuffer(self.adaptive_projected_guidance_momentum) + + pred = normalized_guidance_apg( + pred_cond, + pred_uncond, + self.guidance_scale, + self.momentum_buffer, + self.eta, + self.adaptive_projected_guidance_rescale, + self.use_original_formulation, + ) + + return pred + + +# endregion + + +def apply_classifier_free_guidance( + noise_pred_text: torch.Tensor, + noise_pred_uncond: torch.Tensor, + is_ocr: bool, + guidance_scale: float, + step: int, + apg_start_step_ocr: int = 75, + apg_start_step_general: int = 10, + cfg_guider_ocr: AdaptiveProjectedGuidance = None, + cfg_guider_general: AdaptiveProjectedGuidance = None, +): + """ + Apply classifier-free guidance with OCR-aware APG for batch_size=1. + + Args: + noise_pred_text: Conditional noise prediction tensor [1, ...]. + noise_pred_uncond: Unconditional noise prediction tensor [1, ...]. + is_ocr: Whether this sample requires OCR-specific guidance. + guidance_scale: Guidance scale for CFG. + step: Current diffusion step index. + apg_start_step_ocr: Step to start APG for OCR regions. + apg_start_step_general: Step to start APG for general regions. + cfg_guider_ocr: APG guider for OCR regions. + cfg_guider_general: APG guider for general regions. + + Returns: + Guided noise prediction tensor [1, ...]. 
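+
+    Up to the relevant APG start step, standard CFG is applied:
+    noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond);
+    later steps delegate to the selected AdaptiveProjectedGuidance instance.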
+ """ + if guidance_scale == 1.0: + return noise_pred_text + + # Select appropriate guider and start step based on OCR requirement + if is_ocr: + cfg_guider = cfg_guider_ocr + apg_start_step = apg_start_step_ocr + else: + cfg_guider = cfg_guider_general + apg_start_step = apg_start_step_general + + # Apply standard CFG or APG based on current step + if step <= apg_start_step: + # Standard classifier-free guidance + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + # Initialize APG guider state + _ = cfg_guider(noise_pred_text, noise_pred_uncond, step=step) + else: + # Use APG for guidance + noise_pred = cfg_guider(noise_pred_text, noise_pred_uncond, step=step) + + return noise_pred diff --git a/library/hunyuan_image_vae.py b/library/hunyuan_image_vae.py new file mode 100644 index 000000000..6eb035c38 --- /dev/null +++ b/library/hunyuan_image_vae.py @@ -0,0 +1,622 @@ +from typing import Optional, Tuple + +from einops import rearrange +import numpy as np +import torch +from torch import Tensor, nn +from torch.nn import Conv2d +from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution + +from library.utils import load_safetensors, setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +VAE_SCALE_FACTOR = 32 # 32x spatial compression + + +def swish(x: Tensor) -> Tensor: + """Swish activation function: x * sigmoid(x).""" + return x * torch.sigmoid(x) + + +class AttnBlock(nn.Module): + """Self-attention block using scaled dot-product attention.""" + + def __init__(self, in_channels: int): + super().__init__() + self.in_channels = in_channels + self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + self.q = Conv2d(in_channels, in_channels, kernel_size=1) + self.k = Conv2d(in_channels, in_channels, kernel_size=1) + self.v = Conv2d(in_channels, in_channels, kernel_size=1) + self.proj_out = Conv2d(in_channels, in_channels, kernel_size=1) + + def attention(self, x: Tensor) -> Tensor: + x = self.norm(x) + q = self.q(x) + k = self.k(x) + v = self.v(x) + + b, c, h, w = q.shape + q = rearrange(q, "b c h w -> b (h w) c").contiguous() + k = rearrange(k, "b c h w -> b (h w) c").contiguous() + v = rearrange(v, "b c h w -> b (h w) c").contiguous() + + x = nn.functional.scaled_dot_product_attention(q, k, v) + return rearrange(x, "b (h w) c -> b c h w", h=h, w=w, c=c, b=b) + + def forward(self, x: Tensor) -> Tensor: + return x + self.proj_out(self.attention(x)) + + +class ResnetBlock(nn.Module): + """ + Residual block with two convolutions, group normalization, and swish activation. + Includes skip connection with optional channel dimension matching. + + Parameters + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. 
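+
+    The forward pass computes x + conv2(swish(norm2(conv1(swish(norm1(x)))))),
+    applying a 1x1 shortcut projection to x when in_channels != out_channels.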
+ """ + + def __init__(self, in_channels: int, out_channels: int): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + + self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + self.conv1 = Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True) + self.conv2 = Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + # Skip connection projection for channel dimension mismatch + if self.in_channels != self.out_channels: + self.nin_shortcut = Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x: Tensor) -> Tensor: + h = x + # First convolution block + h = self.norm1(h) + h = swish(h) + h = self.conv1(h) + # Second convolution block + h = self.norm2(h) + h = swish(h) + h = self.conv2(h) + + # Apply skip connection with optional projection + if self.in_channels != self.out_channels: + x = self.nin_shortcut(x) + return x + h + + +class Downsample(nn.Module): + """ + Spatial downsampling block that reduces resolution by 2x using convolution followed by + pixel rearrangement. Includes skip connection with grouped averaging. + + Parameters + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels (must be divisible by 4). + """ + + def __init__(self, in_channels: int, out_channels: int): + super().__init__() + factor = 4 # 2x2 spatial reduction factor + assert out_channels % factor == 0 + + self.conv = Conv2d(in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1) + self.group_size = factor * in_channels // out_channels + + def forward(self, x: Tensor) -> Tensor: + # Apply convolution and rearrange pixels for 2x downsampling + h = self.conv(x) + h = rearrange(h, "b c (h r1) (w r2) -> b (r1 r2 c) h w", r1=2, r2=2) + + # Create skip connection with pixel rearrangement + shortcut = rearrange(x, "b c (h r1) (w r2) -> b (r1 r2 c) h w", r1=2, r2=2) + B, C, H, W = shortcut.shape + shortcut = shortcut.view(B, h.shape[1], self.group_size, H, W).mean(dim=2) + + return h + shortcut + + +class Upsample(nn.Module): + """ + Spatial upsampling block that increases resolution by 2x using convolution followed by + pixel rearrangement. Includes skip connection with channel repetition. + + Parameters + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + """ + + def __init__(self, in_channels: int, out_channels: int): + super().__init__() + factor = 4 # 2x2 spatial expansion factor + self.conv = Conv2d(in_channels, out_channels * factor, kernel_size=3, stride=1, padding=1) + self.repeats = factor * out_channels // in_channels + + def forward(self, x: Tensor) -> Tensor: + # Apply convolution and rearrange pixels for 2x upsampling + h = self.conv(x) + h = rearrange(h, "b (r1 r2 c) h w -> b c (h r1) (w r2)", r1=2, r2=2) + + # Create skip connection with channel repetition + shortcut = x.repeat_interleave(repeats=self.repeats, dim=1) + shortcut = rearrange(shortcut, "b (r1 r2 c) h w -> b c (h r1) (w r2)", r1=2, r2=2) + + return h + shortcut + + +class Encoder(nn.Module): + """ + VAE encoder that progressively downsamples input images to a latent representation. + Uses residual blocks, attention, and spatial downsampling. + + Parameters + ---------- + in_channels : int + Number of input image channels (e.g., 3 for RGB). 
+ z_channels : int + Number of latent channels in the output. + block_out_channels : Tuple[int, ...] + Output channels for each downsampling block. + num_res_blocks : int + Number of residual blocks per downsampling stage. + ffactor_spatial : int + Total spatial downsampling factor (e.g., 32 for 32x compression). + """ + + def __init__( + self, + in_channels: int, + z_channels: int, + block_out_channels: Tuple[int, ...], + num_res_blocks: int, + ffactor_spatial: int, + ): + super().__init__() + assert block_out_channels[-1] % (2 * z_channels) == 0 + + self.z_channels = z_channels + self.block_out_channels = block_out_channels + self.num_res_blocks = num_res_blocks + + self.conv_in = Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1) + + self.down = nn.ModuleList() + block_in = block_out_channels[0] + + # Build downsampling blocks + for i_level, ch in enumerate(block_out_channels): + block = nn.ModuleList() + block_out = ch + + # Add residual blocks for this level + for _ in range(self.num_res_blocks): + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out)) + block_in = block_out + + down = nn.Module() + down.block = block + + # Add spatial downsampling if needed + add_spatial_downsample = bool(i_level < np.log2(ffactor_spatial)) + if add_spatial_downsample: + assert i_level < len(block_out_channels) - 1 + block_out = block_out_channels[i_level + 1] + down.downsample = Downsample(block_in, block_out) + block_in = block_out + + self.down.append(down) + + # Middle blocks with attention + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in) + self.mid.attn_1 = AttnBlock(block_in) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in) + + # Output layers + self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) + self.conv_out = Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x: Tensor) -> Tensor: + # Initial convolution + h = self.conv_in(x) + + # Progressive downsampling through blocks + for i_level in range(len(self.block_out_channels)): + # Apply residual blocks at this level + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](h) + # Apply spatial downsampling if available + if hasattr(self.down[i_level], "downsample"): + h = self.down[i_level].downsample(h) + + # Middle processing with attention + h = self.mid.block_1(h) + h = self.mid.attn_1(h) + h = self.mid.block_2(h) + + # Final output layers with skip connection + group_size = self.block_out_channels[-1] // (2 * self.z_channels) + shortcut = rearrange(h, "b (c r) h w -> b c r h w", r=group_size).mean(dim=2) + h = self.norm_out(h) + h = swish(h) + h = self.conv_out(h) + h += shortcut + return h + + +class Decoder(nn.Module): + """ + VAE decoder that progressively upsamples latent representations back to images. + Uses residual blocks, attention, and spatial upsampling. + + Parameters + ---------- + z_channels : int + Number of latent channels in the input. + out_channels : int + Number of output image channels (e.g., 3 for RGB). + block_out_channels : Tuple[int, ...] + Output channels for each upsampling block. + num_res_blocks : int + Number of residual blocks per upsampling stage. + ffactor_spatial : int + Total spatial upsampling factor (e.g., 32 for 32x expansion). 
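+
+    Examples
+    --------
+    Illustrative sketch using the Hunyuan Image-2.1 configuration from
+    ``HunyuanVAE2D`` below; an 8x8 latent decodes to a 256x256 RGB image.
+
+    >>> import torch
+    >>> dec = Decoder(z_channels=64, out_channels=3,
+    ...               block_out_channels=(1024, 1024, 512, 512, 256, 128),
+    ...               num_res_blocks=2, ffactor_spatial=32)
+    >>> dec(torch.randn(1, 64, 8, 8)).shape
+    torch.Size([1, 3, 256, 256])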
+ """ + + def __init__( + self, + z_channels: int, + out_channels: int, + block_out_channels: Tuple[int, ...], + num_res_blocks: int, + ffactor_spatial: int, + ): + super().__init__() + assert block_out_channels[0] % z_channels == 0 + + self.z_channels = z_channels + self.block_out_channels = block_out_channels + self.num_res_blocks = num_res_blocks + + block_in = block_out_channels[0] + self.conv_in = Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # Middle blocks with attention + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in) + self.mid.attn_1 = AttnBlock(block_in) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in) + + # Build upsampling blocks + self.up = nn.ModuleList() + for i_level, ch in enumerate(block_out_channels): + block = nn.ModuleList() + block_out = ch + + # Add residual blocks for this level (extra block for decoder) + for _ in range(self.num_res_blocks + 1): + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out)) + block_in = block_out + + up = nn.Module() + up.block = block + + # Add spatial upsampling if needed + add_spatial_upsample = bool(i_level < np.log2(ffactor_spatial)) + if add_spatial_upsample: + assert i_level < len(block_out_channels) - 1 + block_out = block_out_channels[i_level + 1] + up.upsample = Upsample(block_in, block_out) + block_in = block_out + + self.up.append(up) + + # Output layers + self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) + self.conv_out = Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, z: Tensor) -> Tensor: + # Initial processing with skip connection + repeats = self.block_out_channels[0] // self.z_channels + h = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1) + + # Middle processing with attention + h = self.mid.block_1(h) + h = self.mid.attn_1(h) + h = self.mid.block_2(h) + + # Progressive upsampling through blocks + for i_level in range(len(self.block_out_channels)): + # Apply residual blocks at this level + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h) + # Apply spatial upsampling if available + if hasattr(self.up[i_level], "upsample"): + h = self.up[i_level].upsample(h) + + # Final output layers + h = self.norm_out(h) + h = swish(h) + h = self.conv_out(h) + return h + + +class HunyuanVAE2D(nn.Module): + """ + VAE model for Hunyuan Image-2.1 with spatial tiling support. + + This VAE uses a fixed architecture optimized for the Hunyuan Image-2.1 model, + with 32x spatial compression and optional memory-efficient tiling for large images. 
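+
+    Examples
+    --------
+    Minimal usage sketch; the checkpoint filename is hypothetical and spatial
+    tiling is left at its default (disabled). Scaling by ``scaling_factor``
+    follows the usual diffusers-style convention.
+
+    >>> import torch
+    >>> vae = load_vae("hunyuan_image_vae.safetensors", device=torch.device("cuda"))
+    >>> image = torch.randn(1, 3, 512, 512, device=vae.device, dtype=vae.dtype)
+    >>> latents = vae.encode(image).sample() * vae.scaling_factor
+    >>> latents.shape  # 32x spatial compression, 64 latent channels
+    torch.Size([1, 64, 16, 16])
+    >>> decoded = vae.decode(latents / vae.scaling_factor)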
+ """ + + def __init__(self): + super().__init__() + + # Fixed configuration for Hunyuan Image-2.1 + block_out_channels = (128, 256, 512, 512, 1024, 1024) + in_channels = 3 # RGB input + out_channels = 3 # RGB output + latent_channels = 64 + layers_per_block = 2 + ffactor_spatial = 32 # 32x spatial compression + sample_size = 384 # Minimum sample size for tiling + scaling_factor = 0.75289 # Latent scaling factor + + self.ffactor_spatial = ffactor_spatial + self.scaling_factor = scaling_factor + + self.encoder = Encoder( + in_channels=in_channels, + z_channels=latent_channels, + block_out_channels=block_out_channels, + num_res_blocks=layers_per_block, + ffactor_spatial=ffactor_spatial, + ) + + self.decoder = Decoder( + z_channels=latent_channels, + out_channels=out_channels, + block_out_channels=list(reversed(block_out_channels)), + num_res_blocks=layers_per_block, + ffactor_spatial=ffactor_spatial, + ) + + # Spatial tiling configuration for memory efficiency + self.use_spatial_tiling = False + self.tile_sample_min_size = sample_size + self.tile_latent_min_size = sample_size // ffactor_spatial + self.tile_overlap_factor = 0.25 # 25% overlap between tiles + + @property + def dtype(self): + """Get the data type of the model parameters.""" + return next(self.encoder.parameters()).dtype + + @property + def device(self): + """Get the device of the model parameters.""" + return next(self.encoder.parameters()).device + + def enable_spatial_tiling(self, use_tiling: bool = True): + """Enable or disable spatial tiling.""" + self.use_spatial_tiling = use_tiling + + def disable_spatial_tiling(self): + """Disable spatial tiling.""" + self.use_spatial_tiling = False + + def enable_tiling(self, use_tiling: bool = True): + """Enable or disable spatial tiling (alias for enable_spatial_tiling).""" + self.enable_spatial_tiling(use_tiling) + + def disable_tiling(self): + """Disable spatial tiling (alias for disable_spatial_tiling).""" + self.disable_spatial_tiling() + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + """ + Blend two tensors horizontally with smooth transition. + + Parameters + ---------- + a : torch.Tensor + Left tensor. + b : torch.Tensor + Right tensor. + blend_extent : int + Number of columns to blend. + """ + blend_extent = min(a.shape[-1], b.shape[-1], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent) + return b + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + """ + Blend two tensors vertically with smooth transition. + + Parameters + ---------- + a : torch.Tensor + Top tensor. + b : torch.Tensor + Bottom tensor. + blend_extent : int + Number of rows to blend. + """ + blend_extent = min(a.shape[-2], b.shape[-2], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent) + return b + + def spatial_tiled_encode(self, x: torch.Tensor) -> torch.Tensor: + """ + Encode large images using spatial tiling to reduce memory usage. + Tiles are processed independently and blended at boundaries. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (B, C, T, H, W). 
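+
+        Returns
+        -------
+        torch.Tensor
+            Encoder moments (concatenated mean and logvar, 2 * z_channels
+            channels) for the full input, blended across tile overlaps.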
+ """ + B, C, T, H, W = x.shape + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) + row_limit = self.tile_latent_min_size - blend_extent + + rows = [] + for i in range(0, H, overlap_size): + row = [] + for j in range(0, W, overlap_size): + tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] + tile = self.encoder(tile) + row.append(tile) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=-1)) + + moments = torch.cat(result_rows, dim=-2) + return moments + + def spatial_tiled_decode(self, z: torch.Tensor) -> torch.Tensor: + """ + Decode large latents using spatial tiling to reduce memory usage. + Tiles are processed independently and blended at boundaries. + + Parameters + ---------- + z : torch.Tensor + Latent tensor of shape (B, C, H, W). + """ + B, C, H, W = z.shape + overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) + row_limit = self.tile_sample_min_size - blend_extent + + rows = [] + for i in range(0, H, overlap_size): + row = [] + for j in range(0, W, overlap_size): + tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size] + decoded = self.decoder(tile) + row.append(decoded) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=-1)) + + dec = torch.cat(result_rows, dim=-2) + return dec + + def encode(self, x: Tensor) -> DiagonalGaussianDistribution: + """ + Encode input images to latent representation. + Uses spatial tiling for large images if enabled. + + Parameters + ---------- + x : Tensor + Input image tensor of shape (B, C, H, W) or (B, C, T, H, W). + + Returns + ------- + DiagonalGaussianDistribution + Latent distribution with mean and logvar. + """ + # Handle 5D input (B, C, T, H, W) by removing time dimension + original_ndim = x.ndim + if original_ndim == 5: + x = x.squeeze(2) + + # Use tiling for large images to reduce memory usage + if self.use_spatial_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size): + h = self.spatial_tiled_encode(x) + else: + h = self.encoder(x) + + # Restore time dimension if input was 5D + if original_ndim == 5: + h = h.unsqueeze(2) + + posterior = DiagonalGaussianDistribution(h) + return posterior + + def decode(self, z: Tensor): + """ + Decode latent representation back to images. + Uses spatial tiling for large latents if enabled. + + Parameters + ---------- + z : Tensor + Latent tensor of shape (B, C, H, W) or (B, C, T, H, W). + + Returns + ------- + Tensor + Decoded image tensor. 
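+
+        Notes
+        -----
+        ``decode`` does not apply ``self.scaling_factor`` itself; in a typical
+        diffusers-style pipeline the caller divides scaled latents by the factor
+        (0.75289) before decoding, mirroring the multiplication applied to the
+        sampled latents after ``encode``.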
+ """ + # Handle 5D input (B, C, T, H, W) by removing time dimension + original_ndim = z.ndim + if original_ndim == 5: + z = z.squeeze(2) + + # Use tiling for large latents to reduce memory usage + if self.use_spatial_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): + decoded = self.spatial_tiled_decode(z) + else: + decoded = self.decoder(z) + + # Restore time dimension if input was 5D + if original_ndim == 5: + decoded = decoded.unsqueeze(2) + + return decoded + + +def load_vae(vae_path: str, device: torch.device, disable_mmap: bool = False) -> HunyuanVAE2D: + logger.info("Initializing VAE") + vae = HunyuanVAE2D() + + logger.info(f"Loading VAE from {vae_path}") + state_dict = load_safetensors(vae_path, device=device, disable_mmap=disable_mmap) + info = vae.load_state_dict(state_dict, strict=True, assign=True) + logger.info(f"Loaded VAE: {info}") + + vae.to(device) + return vae diff --git a/library/lora_utils.py b/library/lora_utils.py new file mode 100644 index 000000000..db0046229 --- /dev/null +++ b/library/lora_utils.py @@ -0,0 +1,249 @@ +# copy from Musubi Tuner + +import os +import re +from typing import Dict, List, Optional, Union +import torch + +from tqdm import tqdm + +from library.custom_offloading_utils import synchronize_device +from library.fp8_optimization_utils import load_safetensors_with_fp8_optimization +from library.utils import MemoryEfficientSafeOpen, setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def filter_lora_state_dict( + weights_sd: Dict[str, torch.Tensor], + include_pattern: Optional[str] = None, + exclude_pattern: Optional[str] = None, +) -> Dict[str, torch.Tensor]: + # apply include/exclude patterns + original_key_count = len(weights_sd.keys()) + if include_pattern is not None: + regex_include = re.compile(include_pattern) + weights_sd = {k: v for k, v in weights_sd.items() if regex_include.search(k)} + logger.info(f"Filtered keys with include pattern {include_pattern}: {original_key_count} -> {len(weights_sd.keys())}") + + if exclude_pattern is not None: + original_key_count_ex = len(weights_sd.keys()) + regex_exclude = re.compile(exclude_pattern) + weights_sd = {k: v for k, v in weights_sd.items() if not regex_exclude.search(k)} + logger.info(f"Filtered keys with exclude pattern {exclude_pattern}: {original_key_count_ex} -> {len(weights_sd.keys())}") + + if len(weights_sd) != original_key_count: + remaining_keys = list(set([k.split(".", 1)[0] for k in weights_sd.keys()])) + remaining_keys.sort() + logger.info(f"Remaining LoRA modules after filtering: {remaining_keys}") + if len(weights_sd) == 0: + logger.warning("No keys left after filtering.") + + return weights_sd + + +def load_safetensors_with_lora_and_fp8( + model_files: Union[str, List[str]], + lora_weights_list: Optional[Dict[str, torch.Tensor]], + lora_multipliers: Optional[List[float]], + fp8_optimization: bool, + calc_device: torch.device, + move_to_device: bool = False, + dit_weight_dtype: Optional[torch.dtype] = None, + target_keys: Optional[List[str]] = None, + exclude_keys: Optional[List[str]] = None, +) -> dict[str, torch.Tensor]: + """ + Merge LoRA weights into the state dict of a model with fp8 optimization if needed. + + Args: + model_files (Union[str, List[str]]): Path to the model file or list of paths. If the path matches a pattern like `00001-of-00004`, it will load all files with the same prefix. 
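+            For example, a hypothetical "model-00001-of-00004.safetensors" would also pull in
+            parts 00002 through 00004 of the same prefix from the same directory; a missing
+            part raises FileNotFoundError.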
+ lora_weights_list (Optional[Dict[str, torch.Tensor]]): Dictionary of LoRA weight tensors to load. + lora_multipliers (Optional[List[float]]): List of multipliers for LoRA weights. + fp8_optimization (bool): Whether to apply FP8 optimization. + calc_device (torch.device): Device to calculate on. + move_to_device (bool): Whether to move tensors to the calculation device after loading. + target_keys (Optional[List[str]]): Keys to target for optimization. + exclude_keys (Optional[List[str]]): Keys to exclude from optimization. + """ + + # if the file name ends with 00001-of-00004 etc, we need to load the files with the same prefix + if isinstance(model_files, str): + model_files = [model_files] + + extended_model_files = [] + for model_file in model_files: + basename = os.path.basename(model_file) + match = re.match(r"^(.*?)(\d+)-of-(\d+)\.safetensors$", basename) + if match: + prefix = basename[: match.start(2)] + count = int(match.group(3)) + state_dict = {} + for i in range(count): + filename = f"{prefix}{i+1:05d}-of-{count:05d}.safetensors" + filepath = os.path.join(os.path.dirname(model_file), filename) + if os.path.exists(filepath): + extended_model_files.append(filepath) + else: + raise FileNotFoundError(f"File {filepath} not found") + else: + extended_model_files.append(model_file) + model_files = extended_model_files + logger.info(f"Loading model files: {model_files}") + + # load LoRA weights + weight_hook = None + if lora_weights_list is None or len(lora_weights_list) == 0: + lora_weights_list = [] + lora_multipliers = [] + list_of_lora_weight_keys = [] + else: + list_of_lora_weight_keys = [] + for lora_sd in lora_weights_list: + lora_weight_keys = set(lora_sd.keys()) + list_of_lora_weight_keys.append(lora_weight_keys) + + if lora_multipliers is None: + lora_multipliers = [1.0] * len(lora_weights_list) + while len(lora_multipliers) < len(lora_weights_list): + lora_multipliers.append(1.0) + if len(lora_multipliers) > len(lora_weights_list): + lora_multipliers = lora_multipliers[: len(lora_weights_list)] + + # Merge LoRA weights into the state dict + logger.info(f"Merging LoRA weights into state dict. 
multipliers: {lora_multipliers}") + + # make hook for LoRA merging + def weight_hook_func(model_weight_key, model_weight): + nonlocal list_of_lora_weight_keys, lora_weights_list, lora_multipliers, calc_device + + if not model_weight_key.endswith(".weight"): + return model_weight + + original_device = model_weight.device + if original_device != calc_device: + model_weight = model_weight.to(calc_device) # to make calculation faster + + for lora_weight_keys, lora_sd, multiplier in zip(list_of_lora_weight_keys, lora_weights_list, lora_multipliers): + # check if this weight has LoRA weights + lora_name = model_weight_key.rsplit(".", 1)[0] # remove trailing ".weight" + lora_name = "lora_unet_" + lora_name.replace(".", "_") + down_key = lora_name + ".lora_down.weight" + up_key = lora_name + ".lora_up.weight" + alpha_key = lora_name + ".alpha" + if down_key not in lora_weight_keys or up_key not in lora_weight_keys: + continue + + # get LoRA weights + down_weight = lora_sd[down_key] + up_weight = lora_sd[up_key] + + dim = down_weight.size()[0] + alpha = lora_sd.get(alpha_key, dim) + scale = alpha / dim + + down_weight = down_weight.to(calc_device) + up_weight = up_weight.to(calc_device) + + # W <- W + U * D + if len(model_weight.size()) == 2: + # linear + if len(up_weight.size()) == 4: # use linear projection mismatch + up_weight = up_weight.squeeze(3).squeeze(2) + down_weight = down_weight.squeeze(3).squeeze(2) + model_weight = model_weight + multiplier * (up_weight @ down_weight) * scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + model_weight = ( + model_weight + + multiplier + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + # logger.info(conved.size(), weight.size(), module.stride, module.padding) + model_weight = model_weight + multiplier * conved * scale + + # remove LoRA keys from set + lora_weight_keys.remove(down_key) + lora_weight_keys.remove(up_key) + if alpha_key in lora_weight_keys: + lora_weight_keys.remove(alpha_key) + + model_weight = model_weight.to(original_device) # move back to original device + return model_weight + + weight_hook = weight_hook_func + + state_dict = load_safetensors_with_fp8_optimization_and_hook( + model_files, + fp8_optimization, + calc_device, + move_to_device, + dit_weight_dtype, + target_keys, + exclude_keys, + weight_hook=weight_hook, + ) + + for lora_weight_keys in list_of_lora_weight_keys: + # check if all LoRA keys are used + if len(lora_weight_keys) > 0: + # if there are still LoRA keys left, it means they are not used in the model + # this is a warning, not an error + logger.warning(f"Warning: not all LoRA keys are used: {', '.join(lora_weight_keys)}") + + return state_dict + + +def load_safetensors_with_fp8_optimization_and_hook( + model_files: list[str], + fp8_optimization: bool, + calc_device: torch.device, + move_to_device: bool = False, + dit_weight_dtype: Optional[torch.dtype] = None, + target_keys: Optional[List[str]] = None, + exclude_keys: Optional[List[str]] = None, + weight_hook: callable = None, +) -> dict[str, torch.Tensor]: + """ + Load state dict from safetensors files and merge LoRA weights into the state dict with fp8 optimization if needed. + """ + if fp8_optimization: + logger.info( + f"Loading state dict with FP8 optimization. 
Dtype of weight: {dit_weight_dtype}, hook enabled: {weight_hook is not None}" + ) + # dit_weight_dtype is not used because we use fp8 optimization + state_dict = load_safetensors_with_fp8_optimization( + model_files, calc_device, target_keys, exclude_keys, move_to_device=move_to_device, weight_hook=weight_hook + ) + else: + logger.info( + f"Loading state dict without FP8 optimization. Dtype of weight: {dit_weight_dtype}, hook enabled: {weight_hook is not None}" + ) + state_dict = {} + for model_file in model_files: + with MemoryEfficientSafeOpen(model_file) as f: + for key in tqdm(f.keys(), desc=f"Loading {os.path.basename(model_file)}", leave=False): + value = f.get_tensor(key) + if weight_hook is not None: + value = weight_hook(key, value) + if move_to_device: + if dit_weight_dtype is None: + value = value.to(calc_device, non_blocking=True) + else: + value = value.to(calc_device, dtype=dit_weight_dtype, non_blocking=True) + elif dit_weight_dtype is not None: + value = value.to(dit_weight_dtype) + + state_dict[key] = value + + if move_to_device: + synchronize_device(calc_device) + + return state_dict diff --git a/networks/lora_hunyuan_image.py b/networks/lora_hunyuan_image.py new file mode 100644 index 000000000..e9ad5f68d --- /dev/null +++ b/networks/lora_hunyuan_image.py @@ -0,0 +1,1444 @@ +# temporary minimum implementation of LoRA +# FLUX doesn't have Conv2d, so we ignore it +# TODO commonize with the original implementation + +# LoRA network module +# reference: +# https://github.com/microsoft/LoRA/blob/main/loralib/layers.py +# https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py + +import math +import os +from contextlib import contextmanager +from typing import Dict, List, Optional, Tuple, Type, Union +from diffusers import AutoencoderKL +from transformers import CLIPTextModel +import numpy as np +import torch +from torch import Tensor +import re +from library.utils import setup_logging +from library.sdxl_original_unet import SdxlUNet2DConditionModel + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +NUM_DOUBLE_BLOCKS = 19 +NUM_SINGLE_BLOCKS = 38 + + +class LoRAModule(torch.nn.Module): + """ + replaces forward method of the original Linear, instead of replacing the original Linear module. + """ + + def __init__( + self, + lora_name, + org_module: torch.nn.Module, + multiplier=1.0, + lora_dim=4, + alpha=1, + dropout=None, + rank_dropout=None, + module_dropout=None, + split_dims: Optional[List[int]] = None, + ggpo_beta: Optional[float] = None, + ggpo_sigma: Optional[float] = None, + ): + """ + if alpha == 0 or None, alpha is rank (no scaling). 
+ + split_dims is used to mimic the split qkv of FLUX as same as Diffusers + """ + super().__init__() + self.lora_name = lora_name + + if org_module.__class__.__name__ == "Conv2d": + in_dim = org_module.in_channels + out_dim = org_module.out_channels + else: + in_dim = org_module.in_features + out_dim = org_module.out_features + + self.lora_dim = lora_dim + self.split_dims = split_dims + + if split_dims is None: + if org_module.__class__.__name__ == "Conv2d": + kernel_size = org_module.kernel_size + stride = org_module.stride + padding = org_module.padding + self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) + self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) + else: + self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) + self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) + + torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) + torch.nn.init.zeros_(self.lora_up.weight) + else: + # conv2d not supported + assert sum(split_dims) == out_dim, "sum of split_dims must be equal to out_dim" + assert org_module.__class__.__name__ == "Linear", "split_dims is only supported for Linear" + # print(f"split_dims: {split_dims}") + self.lora_down = torch.nn.ModuleList( + [torch.nn.Linear(in_dim, self.lora_dim, bias=False) for _ in range(len(split_dims))] + ) + self.lora_up = torch.nn.ModuleList([torch.nn.Linear(self.lora_dim, split_dim, bias=False) for split_dim in split_dims]) + for lora_down in self.lora_down: + torch.nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5)) + for lora_up in self.lora_up: + torch.nn.init.zeros_(lora_up.weight) + + if type(alpha) == torch.Tensor: + alpha = alpha.detach().float().numpy() # without casting, bf16 causes error + alpha = self.lora_dim if alpha is None or alpha == 0 else alpha + self.scale = alpha / self.lora_dim + self.register_buffer("alpha", torch.tensor(alpha)) # 定数として扱える + + # same as microsoft's + self.multiplier = multiplier + self.org_module = org_module # remove in applying + self.dropout = dropout + self.rank_dropout = rank_dropout + self.module_dropout = module_dropout + + self.ggpo_sigma = ggpo_sigma + self.ggpo_beta = ggpo_beta + + if self.ggpo_beta is not None and self.ggpo_sigma is not None: + self.combined_weight_norms = None + self.grad_norms = None + self.perturbation_norm_factor = 1.0 / math.sqrt(org_module.weight.shape[0]) + self.initialize_norm_cache(org_module.weight) + self.org_module_shape: tuple[int] = org_module.weight.shape + + def apply_to(self): + self.org_forward = self.org_module.forward + self.org_module.forward = self.forward + + del self.org_module + + def forward(self, x): + org_forwarded = self.org_forward(x) + + # module dropout + if self.module_dropout is not None and self.training: + if torch.rand(1) < self.module_dropout: + return org_forwarded + + if self.split_dims is None: + lx = self.lora_down(x) + + # normal dropout + if self.dropout is not None and self.training: + lx = torch.nn.functional.dropout(lx, p=self.dropout) + + # rank dropout + if self.rank_dropout is not None and self.training: + mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout + if len(lx.size()) == 3: + mask = mask.unsqueeze(1) # for Text Encoder + elif len(lx.size()) == 4: + mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d + lx = lx * mask + + # scaling for rank dropout: treat as if the rank is changed + # maskから計算することも考えられるが、augmentation的な効果を期待してrank_dropoutを用いる + scale = self.scale 
* (1.0 / (1.0 - self.rank_dropout)) # redundant for readability + else: + scale = self.scale + + lx = self.lora_up(lx) + + # LoRA Gradient-Guided Perturbation Optimization + if ( + self.training + and self.ggpo_sigma is not None + and self.ggpo_beta is not None + and self.combined_weight_norms is not None + and self.grad_norms is not None + ): + with torch.no_grad(): + perturbation_scale = (self.ggpo_sigma * torch.sqrt(self.combined_weight_norms**2)) + ( + self.ggpo_beta * (self.grad_norms**2) + ) + perturbation_scale_factor = (perturbation_scale * self.perturbation_norm_factor).to(self.device) + perturbation = torch.randn(self.org_module_shape, dtype=self.dtype, device=self.device) + perturbation.mul_(perturbation_scale_factor) + perturbation_output = x @ perturbation.T # Result: (batch × n) + return org_forwarded + (self.multiplier * scale * lx) + perturbation_output + else: + return org_forwarded + lx * self.multiplier * scale + else: + lxs = [lora_down(x) for lora_down in self.lora_down] + + # normal dropout + if self.dropout is not None and self.training: + lxs = [torch.nn.functional.dropout(lx, p=self.dropout) for lx in lxs] + + # rank dropout + if self.rank_dropout is not None and self.training: + masks = [torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout for lx in lxs] + for i in range(len(lxs)): + if len(lx.size()) == 3: + masks[i] = masks[i].unsqueeze(1) + elif len(lx.size()) == 4: + masks[i] = masks[i].unsqueeze(-1).unsqueeze(-1) + lxs[i] = lxs[i] * masks[i] + + # scaling for rank dropout: treat as if the rank is changed + scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability + else: + scale = self.scale + + lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] + + return org_forwarded + torch.cat(lxs, dim=-1) * self.multiplier * scale + + @torch.no_grad() + def initialize_norm_cache(self, org_module_weight: Tensor): + # Choose a reasonable sample size + n_rows = org_module_weight.shape[0] + sample_size = min(1000, n_rows) # Cap at 1000 samples or use all if smaller + + # Sample random indices across all rows + indices = torch.randperm(n_rows)[:sample_size] + + # Convert to a supported data type first, then index + # Use float32 for indexing operations + weights_float32 = org_module_weight.to(dtype=torch.float32) + sampled_weights = weights_float32[indices].to(device=self.device) + + # Calculate sampled norms + sampled_norms = torch.norm(sampled_weights, dim=1, keepdim=True) + + # Store the mean norm as our estimate + self.org_weight_norm_estimate = sampled_norms.mean() + + # Optional: store standard deviation for confidence intervals + self.org_weight_norm_std = sampled_norms.std() + + # Free memory + del sampled_weights, weights_float32 + + @torch.no_grad() + def validate_norm_approximation(self, org_module_weight: Tensor, verbose=True): + # Calculate the true norm (this will be slow but it's just for validation) + true_norms = [] + chunk_size = 1024 # Process in chunks to avoid OOM + + for i in range(0, org_module_weight.shape[0], chunk_size): + end_idx = min(i + chunk_size, org_module_weight.shape[0]) + chunk = org_module_weight[i:end_idx].to(device=self.device, dtype=self.dtype) + chunk_norms = torch.norm(chunk, dim=1, keepdim=True) + true_norms.append(chunk_norms.cpu()) + del chunk + + true_norms = torch.cat(true_norms, dim=0) + true_mean_norm = true_norms.mean().item() + + # Compare with our estimate + estimated_norm = self.org_weight_norm_estimate.item() + + # Calculate error metrics + absolute_error = 
abs(true_mean_norm - estimated_norm) + relative_error = absolute_error / true_mean_norm * 100 # as percentage + + if verbose: + logger.info(f"True mean norm: {true_mean_norm:.6f}") + logger.info(f"Estimated norm: {estimated_norm:.6f}") + logger.info(f"Absolute error: {absolute_error:.6f}") + logger.info(f"Relative error: {relative_error:.2f}%") + + return { + "true_mean_norm": true_mean_norm, + "estimated_norm": estimated_norm, + "absolute_error": absolute_error, + "relative_error": relative_error, + } + + @torch.no_grad() + def update_norms(self): + # Not running GGPO so not currently running update norms + if self.ggpo_beta is None or self.ggpo_sigma is None: + return + + # only update norms when we are training + if self.training is False: + return + + module_weights = self.lora_up.weight @ self.lora_down.weight + module_weights.mul(self.scale) + + self.weight_norms = torch.norm(module_weights, dim=1, keepdim=True) + self.combined_weight_norms = torch.sqrt( + (self.org_weight_norm_estimate**2) + torch.sum(module_weights**2, dim=1, keepdim=True) + ) + + @torch.no_grad() + def update_grad_norms(self): + if self.training is False: + print(f"skipping update_grad_norms for {self.lora_name}") + return + + lora_down_grad = None + lora_up_grad = None + + for name, param in self.named_parameters(): + if name == "lora_down.weight": + lora_down_grad = param.grad + elif name == "lora_up.weight": + lora_up_grad = param.grad + + # Calculate gradient norms if we have both gradients + if lora_down_grad is not None and lora_up_grad is not None: + with torch.autocast(self.device.type): + approx_grad = self.scale * ((self.lora_up.weight @ lora_down_grad) + (lora_up_grad @ self.lora_down.weight)) + self.grad_norms = torch.norm(approx_grad, dim=1, keepdim=True) + + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + + +class LoRAInfModule(LoRAModule): + def __init__( + self, + lora_name, + org_module: torch.nn.Module, + multiplier=1.0, + lora_dim=4, + alpha=1, + **kwargs, + ): + # no dropout for inference + super().__init__(lora_name, org_module, multiplier, lora_dim, alpha) + + self.org_module_ref = [org_module] # 後から参照できるように + self.enabled = True + self.network: LoRANetwork = None + + def set_network(self, network): + self.network = network + + # freezeしてマージする + def merge_to(self, sd, dtype, device): + # extract weight from org_module + org_sd = self.org_module.state_dict() + weight = org_sd["weight"] + org_dtype = weight.dtype + org_device = weight.device + weight = weight.to(torch.float) # calc in float + + if dtype is None: + dtype = org_dtype + if device is None: + device = org_device + + if self.split_dims is None: + # get up/down weight + down_weight = sd["lora_down.weight"].to(torch.float).to(device) + up_weight = sd["lora_up.weight"].to(torch.float).to(device) + + # merge weight + if len(weight.size()) == 2: + # linear + weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + weight + + self.multiplier + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * self.scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + # logger.info(conved.size(), weight.size(), module.stride, module.padding) + weight = weight + self.multiplier * conved * self.scale + + # set weight to org_module + 
org_sd["weight"] = weight.to(dtype) + self.org_module.load_state_dict(org_sd) + else: + # split_dims + total_dims = sum(self.split_dims) + for i in range(len(self.split_dims)): + # get up/down weight + down_weight = sd[f"lora_down.{i}.weight"].to(torch.float).to(device) # (rank, in_dim) + up_weight = sd[f"lora_up.{i}.weight"].to(torch.float).to(device) # (split dim, rank) + + # pad up_weight -> (total_dims, rank) + padded_up_weight = torch.zeros((total_dims, up_weight.size(0)), device=device, dtype=torch.float) + padded_up_weight[sum(self.split_dims[:i]) : sum(self.split_dims[: i + 1])] = up_weight + + # merge weight + weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale + + # set weight to org_module + org_sd["weight"] = weight.to(dtype) + self.org_module.load_state_dict(org_sd) + + # 復元できるマージのため、このモジュールのweightを返す + def get_weight(self, multiplier=None): + if multiplier is None: + multiplier = self.multiplier + + # get up/down weight from module + up_weight = self.lora_up.weight.to(torch.float) + down_weight = self.lora_down.weight.to(torch.float) + + # pre-calculated weight + if len(down_weight.size()) == 2: + # linear + weight = self.multiplier * (up_weight @ down_weight) * self.scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + self.multiplier + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * self.scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + weight = self.multiplier * conved * self.scale + + return weight + + def set_region(self, region): + self.region = region + self.region_mask = None + + def default_forward(self, x): + # logger.info(f"default_forward {self.lora_name} {x.size()}") + if self.split_dims is None: + lx = self.lora_down(x) + lx = self.lora_up(lx) + return self.org_forward(x) + lx * self.multiplier * self.scale + else: + lxs = [lora_down(x) for lora_down in self.lora_down] + lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] + return self.org_forward(x) + torch.cat(lxs, dim=-1) * self.multiplier * self.scale + + def forward(self, x): + if not self.enabled: + return self.org_forward(x) + return self.default_forward(x) + + +def create_network( + multiplier: float, + network_dim: Optional[int], + network_alpha: Optional[float], + ae: AutoencoderKL, + text_encoders: List[CLIPTextModel], + flux, + neuron_dropout: Optional[float] = None, + **kwargs, +): + if network_dim is None: + network_dim = 4 # default + if network_alpha is None: + network_alpha = 1.0 + + # extract dim/alpha for conv2d, and block dim + conv_dim = kwargs.get("conv_dim", None) + conv_alpha = kwargs.get("conv_alpha", None) + if conv_dim is not None: + conv_dim = int(conv_dim) + if conv_alpha is None: + conv_alpha = 1.0 + else: + conv_alpha = float(conv_alpha) + + # attn dim, mlp dim: only for DoubleStreamBlock. 
SingleStreamBlock is not supported because of combined qkv + img_attn_dim = kwargs.get("img_attn_dim", None) + txt_attn_dim = kwargs.get("txt_attn_dim", None) + img_mlp_dim = kwargs.get("img_mlp_dim", None) + txt_mlp_dim = kwargs.get("txt_mlp_dim", None) + img_mod_dim = kwargs.get("img_mod_dim", None) + txt_mod_dim = kwargs.get("txt_mod_dim", None) + single_dim = kwargs.get("single_dim", None) # SingleStreamBlock + single_mod_dim = kwargs.get("single_mod_dim", None) # SingleStreamBlock + if img_attn_dim is not None: + img_attn_dim = int(img_attn_dim) + if txt_attn_dim is not None: + txt_attn_dim = int(txt_attn_dim) + if img_mlp_dim is not None: + img_mlp_dim = int(img_mlp_dim) + if txt_mlp_dim is not None: + txt_mlp_dim = int(txt_mlp_dim) + if img_mod_dim is not None: + img_mod_dim = int(img_mod_dim) + if txt_mod_dim is not None: + txt_mod_dim = int(txt_mod_dim) + if single_dim is not None: + single_dim = int(single_dim) + if single_mod_dim is not None: + single_mod_dim = int(single_mod_dim) + type_dims = [img_attn_dim, txt_attn_dim, img_mlp_dim, txt_mlp_dim, img_mod_dim, txt_mod_dim, single_dim, single_mod_dim] + if all([d is None for d in type_dims]): + type_dims = None + + # in_dims [img, time, vector, guidance, txt] + in_dims = kwargs.get("in_dims", None) + if in_dims is not None: + in_dims = in_dims.strip() + if in_dims.startswith("[") and in_dims.endswith("]"): + in_dims = in_dims[1:-1] + in_dims = [int(d) for d in in_dims.split(",")] # is it better to use ast.literal_eval? + assert len(in_dims) == 5, f"invalid in_dims: {in_dims}, must be 5 dimensions (img, time, vector, guidance, txt)" + + # double/single train blocks + def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: + """ + Parse a block selection string and return a list of booleans. + + Args: + selection (str): A string specifying which blocks to select. + total_blocks (int): The total number of blocks available. + + Returns: + List[bool]: A list of booleans indicating which blocks are selected. 
+ """ + if selection == "all": + return [True] * total_blocks + if selection == "none" or selection == "": + return [False] * total_blocks + + selected = [False] * total_blocks + ranges = selection.split(",") + + for r in ranges: + if "-" in r: + start, end = map(str.strip, r.split("-")) + start = int(start) + end = int(end) + assert 0 <= start < total_blocks, f"invalid start index: {start}" + assert 0 <= end < total_blocks, f"invalid end index: {end}" + assert start <= end, f"invalid range: {start}-{end}" + for i in range(start, end + 1): + selected[i] = True + else: + index = int(r) + assert 0 <= index < total_blocks, f"invalid index: {index}" + selected[index] = True + + return selected + + train_double_block_indices = kwargs.get("train_double_block_indices", None) + train_single_block_indices = kwargs.get("train_single_block_indices", None) + if train_double_block_indices is not None: + train_double_block_indices = parse_block_selection(train_double_block_indices, NUM_DOUBLE_BLOCKS) + if train_single_block_indices is not None: + train_single_block_indices = parse_block_selection(train_single_block_indices, NUM_SINGLE_BLOCKS) + + # rank/module dropout + rank_dropout = kwargs.get("rank_dropout", None) + if rank_dropout is not None: + rank_dropout = float(rank_dropout) + module_dropout = kwargs.get("module_dropout", None) + if module_dropout is not None: + module_dropout = float(module_dropout) + + # single or double blocks + train_blocks = kwargs.get("train_blocks", None) # None (default), "all" (same as None), "single", "double" + if train_blocks is not None: + assert train_blocks in ["all", "single", "double"], f"invalid train_blocks: {train_blocks}" + + # split qkv + split_qkv = kwargs.get("split_qkv", False) + if split_qkv is not None: + split_qkv = True if split_qkv == "True" else False + + ggpo_beta = kwargs.get("ggpo_beta", None) + ggpo_sigma = kwargs.get("ggpo_sigma", None) + + if ggpo_beta is not None: + ggpo_beta = float(ggpo_beta) + + if ggpo_sigma is not None: + ggpo_sigma = float(ggpo_sigma) + + # train T5XXL + train_t5xxl = kwargs.get("train_t5xxl", False) + if train_t5xxl is not None: + train_t5xxl = True if train_t5xxl == "True" else False + + # verbose + verbose = kwargs.get("verbose", False) + if verbose is not None: + verbose = True if verbose == "True" else False + + # regex-specific learning rates + def parse_kv_pairs(kv_pair_str: str, is_int: bool) -> Dict[str, float]: + """ + Parse a string of key-value pairs separated by commas. 
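+        For example, with is_int=False, "attn=1e-4,mlp=5e-5" parses to
+        {"attn": 0.0001, "mlp": 5e-05}; malformed entries are skipped with a warning.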
+ """ + pairs = {} + for pair in kv_pair_str.split(","): + pair = pair.strip() + if not pair: + continue + if "=" not in pair: + logger.warning(f"Invalid format: {pair}, expected 'key=value'") + continue + key, value = pair.split("=", 1) + key = key.strip() + value = value.strip() + try: + pairs[key] = int(value) if is_int else float(value) + except ValueError: + logger.warning(f"Invalid value for {key}: {value}") + return pairs + + # parse regular expression based learning rates + network_reg_lrs = kwargs.get("network_reg_lrs", None) + if network_reg_lrs is not None: + reg_lrs = parse_kv_pairs(network_reg_lrs, is_int=False) + else: + reg_lrs = None + + # regex-specific dimensions (ranks) + network_reg_dims = kwargs.get("network_reg_dims", None) + if network_reg_dims is not None: + reg_dims = parse_kv_pairs(network_reg_dims, is_int=True) + else: + reg_dims = None + + # すごく引数が多いな ( ^ω^)・・・ + network = LoRANetwork( + text_encoders, + flux, + multiplier=multiplier, + lora_dim=network_dim, + alpha=network_alpha, + dropout=neuron_dropout, + rank_dropout=rank_dropout, + module_dropout=module_dropout, + conv_lora_dim=conv_dim, + conv_alpha=conv_alpha, + train_blocks=train_blocks, + split_qkv=split_qkv, + train_t5xxl=train_t5xxl, + type_dims=type_dims, + in_dims=in_dims, + train_double_block_indices=train_double_block_indices, + train_single_block_indices=train_single_block_indices, + reg_dims=reg_dims, + ggpo_beta=ggpo_beta, + ggpo_sigma=ggpo_sigma, + reg_lrs=reg_lrs, + verbose=verbose, + ) + + loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) + loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None) + loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None) + loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None + loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None + loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None + if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None: + network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) + + return network + + +# Create network from weights for inference, weights are not loaded here (because can be merged) +def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weights_sd=None, for_inference=False, **kwargs): + if weights_sd is None: + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file, safe_open + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + # get dim/alpha mapping, and train t5xxl + modules_dim = {} + modules_alpha = {} + train_t5xxl = None + for key, value in weights_sd.items(): + if "." 
not in key: + continue + + lora_name = key.split(".")[0] + if "alpha" in key: + modules_alpha[lora_name] = value + elif "lora_down" in key: + dim = value.size()[0] + modules_dim[lora_name] = dim + # logger.info(lora_name, value.size(), dim) + + if train_t5xxl is None or train_t5xxl is False: + train_t5xxl = "lora_te3" in lora_name + + if train_t5xxl is None: + train_t5xxl = False + + split_qkv = False # split_qkv is not needed to care, because state_dict is qkv combined + + module_class = LoRAInfModule if for_inference else LoRAModule + + network = LoRANetwork( + text_encoders, + flux, + multiplier=multiplier, + modules_dim=modules_dim, + modules_alpha=modules_alpha, + module_class=module_class, + split_qkv=split_qkv, + train_t5xxl=train_t5xxl, + ) + return network, weights_sd + + +class LoRANetwork(torch.nn.Module): + # FLUX_TARGET_REPLACE_MODULE = ["DoubleStreamBlock", "SingleStreamBlock"] + FLUX_TARGET_REPLACE_MODULE_DOUBLE = ["DoubleStreamBlock"] + FLUX_TARGET_REPLACE_MODULE_SINGLE = ["SingleStreamBlock"] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP", "T5Attention", "T5DenseGatedActDense"] + LORA_PREFIX_FLUX = "lora_unet" # make ComfyUI compatible + LORA_PREFIX_TEXT_ENCODER_CLIP = "lora_te1" + LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te3" # make ComfyUI compatible + + def __init__( + self, + text_encoders: Union[List[CLIPTextModel], CLIPTextModel], + unet, + multiplier: float = 1.0, + lora_dim: int = 4, + alpha: float = 1, + dropout: Optional[float] = None, + rank_dropout: Optional[float] = None, + module_dropout: Optional[float] = None, + conv_lora_dim: Optional[int] = None, + conv_alpha: Optional[float] = None, + module_class: Type[object] = LoRAModule, + modules_dim: Optional[Dict[str, int]] = None, + modules_alpha: Optional[Dict[str, int]] = None, + train_blocks: Optional[str] = None, + split_qkv: bool = False, + train_t5xxl: bool = False, + type_dims: Optional[List[int]] = None, + in_dims: Optional[List[int]] = None, + train_double_block_indices: Optional[List[bool]] = None, + train_single_block_indices: Optional[List[bool]] = None, + reg_dims: Optional[Dict[str, int]] = None, + ggpo_beta: Optional[float] = None, + ggpo_sigma: Optional[float] = None, + reg_lrs: Optional[Dict[str, float]] = None, + verbose: Optional[bool] = False, + ) -> None: + super().__init__() + self.multiplier = multiplier + + self.lora_dim = lora_dim + self.alpha = alpha + self.conv_lora_dim = conv_lora_dim + self.conv_alpha = conv_alpha + self.dropout = dropout + self.rank_dropout = rank_dropout + self.module_dropout = module_dropout + self.train_blocks = train_blocks if train_blocks is not None else "all" + self.split_qkv = split_qkv + self.train_t5xxl = train_t5xxl + + self.type_dims = type_dims + self.in_dims = in_dims + self.train_double_block_indices = train_double_block_indices + self.train_single_block_indices = train_single_block_indices + self.reg_dims = reg_dims + self.reg_lrs = reg_lrs + + self.loraplus_lr_ratio = None + self.loraplus_unet_lr_ratio = None + self.loraplus_text_encoder_lr_ratio = None + + if modules_dim is not None: + logger.info(f"create LoRA network from weights") + self.in_dims = [0] * 5 # create in_dims + # verbose = True + else: + logger.info(f"create LoRA network. 
base dim (rank): {lora_dim}, alpha: {alpha}") + logger.info( + f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" + ) + # if self.conv_lora_dim is not None: + # logger.info( + # f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" + # ) + + if ggpo_beta is not None and ggpo_sigma is not None: + logger.info(f"LoRA-GGPO training sigma: {ggpo_sigma} beta: {ggpo_beta}") + + if self.split_qkv: + logger.info(f"split qkv for LoRA") + if self.train_blocks is not None: + logger.info(f"train {self.train_blocks} blocks only") + + if train_t5xxl: + logger.info(f"train T5XXL as well") + + # create module instances + def create_modules( + is_flux: bool, + text_encoder_idx: Optional[int], + root_module: torch.nn.Module, + target_replace_modules: List[str], + filter: Optional[str] = None, + default_dim: Optional[int] = None, + ) -> List[LoRAModule]: + prefix = ( + self.LORA_PREFIX_FLUX + if is_flux + else (self.LORA_PREFIX_TEXT_ENCODER_CLIP if text_encoder_idx == 0 else self.LORA_PREFIX_TEXT_ENCODER_T5) + ) + + loras = [] + skipped = [] + for name, module in root_module.named_modules(): + if target_replace_modules is None or module.__class__.__name__ in target_replace_modules: + if target_replace_modules is None: # dirty hack for all modules + module = root_module # search all modules + + for child_name, child_module in module.named_modules(): + is_linear = child_module.__class__.__name__ == "Linear" + is_conv2d = child_module.__class__.__name__ == "Conv2d" + is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) + + if is_linear or is_conv2d: + lora_name = prefix + "." + (name + "." if name else "") + child_name + lora_name = lora_name.replace(".", "_") + + if filter is not None and not filter in lora_name: + continue + + dim = None + alpha = None + + if modules_dim is not None: + # モジュール指定あり + if lora_name in modules_dim: + dim = modules_dim[lora_name] + alpha = modules_alpha[lora_name] + elif self.reg_dims is not None: + for reg, d in self.reg_dims.items(): + if re.search(reg, lora_name): + dim = d + alpha = self.alpha + logger.info(f"LoRA {lora_name} matched with regex {reg}, using dim: {dim}") + break + + # if modules_dim is None, we use default lora_dim. if modules_dim is not None, we use the specified dim (no default) + if dim is None and modules_dim is None: + if is_linear or is_conv2d_1x1: + dim = default_dim if default_dim is not None else self.lora_dim + alpha = self.alpha + + if is_flux and type_dims is not None: + identifier = [ + ("img_attn",), + ("txt_attn",), + ("img_mlp",), + ("txt_mlp",), + ("img_mod",), + ("txt_mod",), + ("single_blocks", "linear"), + ("modulation",), + ] + for i, d in enumerate(type_dims): + if d is not None and all([id in lora_name for id in identifier[i]]): + dim = d # may be 0 for skip + break + + if ( + is_flux + and dim + and ( + self.train_double_block_indices is not None + or self.train_single_block_indices is not None + ) + and ("double" in lora_name or "single" in lora_name) + ): + # "lora_unet_double_blocks_0_..." or "lora_unet_single_blocks_0_..." 
+ block_index = int(lora_name.split("_")[4]) # bit dirty + if ( + "double" in lora_name + and self.train_double_block_indices is not None + and not self.train_double_block_indices[block_index] + ): + dim = 0 + elif ( + "single" in lora_name + and self.train_single_block_indices is not None + and not self.train_single_block_indices[block_index] + ): + dim = 0 + + elif self.conv_lora_dim is not None: + dim = self.conv_lora_dim + alpha = self.conv_alpha + + if dim is None or dim == 0: + # skipした情報を出力 + if is_linear or is_conv2d_1x1 or (self.conv_lora_dim is not None): + skipped.append(lora_name) + continue + + # qkv split + split_dims = None + if is_flux and split_qkv: + if "double" in lora_name and "qkv" in lora_name: + split_dims = [3072] * 3 + elif "single" in lora_name and "linear1" in lora_name: + split_dims = [3072] * 3 + [12288] + + lora = module_class( + lora_name, + child_module, + self.multiplier, + dim, + alpha, + dropout=dropout, + rank_dropout=rank_dropout, + module_dropout=module_dropout, + split_dims=split_dims, + ggpo_beta=ggpo_beta, + ggpo_sigma=ggpo_sigma, + ) + loras.append(lora) + + if target_replace_modules is None: + break # all modules are searched + return loras, skipped + + # create LoRA for text encoder + # 毎回すべてのモジュールを作るのは無駄なので要検討 + self.text_encoder_loras: List[Union[LoRAModule, LoRAInfModule]] = [] + skipped_te = [] + for i, text_encoder in enumerate(text_encoders): + index = i + if text_encoder is None: + logger.info(f"Text Encoder {index+1} is None, skipping LoRA creation for this encoder.") + continue + if not train_t5xxl and index > 0: # 0: CLIP, 1: T5XXL, so we skip T5XXL if train_t5xxl is False + break + + logger.info(f"create LoRA for Text Encoder {index+1}:") + + text_encoder_loras, skipped = create_modules(False, index, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) + logger.info(f"create LoRA for Text Encoder {index+1}: {len(text_encoder_loras)} modules.") + self.text_encoder_loras.extend(text_encoder_loras) + skipped_te += skipped + + # create LoRA for U-Net + if self.train_blocks == "all": + target_replace_modules = LoRANetwork.FLUX_TARGET_REPLACE_MODULE_DOUBLE + LoRANetwork.FLUX_TARGET_REPLACE_MODULE_SINGLE + elif self.train_blocks == "single": + target_replace_modules = LoRANetwork.FLUX_TARGET_REPLACE_MODULE_SINGLE + elif self.train_blocks == "double": + target_replace_modules = LoRANetwork.FLUX_TARGET_REPLACE_MODULE_DOUBLE + + self.unet_loras: List[Union[LoRAModule, LoRAInfModule]] + self.unet_loras, skipped_un = create_modules(True, None, unet, target_replace_modules) + + # img, time, vector, guidance, txt + if self.in_dims: + for filter, in_dim in zip(["_img_in", "_time_in", "_vector_in", "_guidance_in", "_txt_in"], self.in_dims): + loras, _ = create_modules(True, None, unet, None, filter=filter, default_dim=in_dim) + self.unet_loras.extend(loras) + + logger.info(f"create LoRA for FLUX {self.train_blocks} blocks: {len(self.unet_loras)} modules.") + if verbose: + for lora in self.unet_loras: + logger.info(f"\t{lora.lora_name:50} {lora.lora_dim}, {lora.alpha}") + + skipped = skipped_te + skipped_un + if verbose and len(skipped) > 0: + logger.warning( + f"because dim (rank) is 0, {len(skipped)} LoRA modules are skipped / dim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:" + ) + for name in skipped: + logger.info(f"\t{name}") + + # assertion + names = set() + for lora in self.text_encoder_loras + self.unet_loras: + assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" + names.add(lora.lora_name) + + def 
set_multiplier(self, multiplier): + self.multiplier = multiplier + for lora in self.text_encoder_loras + self.unet_loras: + lora.multiplier = self.multiplier + + def set_enabled(self, is_enabled): + for lora in self.text_encoder_loras + self.unet_loras: + lora.enabled = is_enabled + + def update_norms(self): + for lora in self.text_encoder_loras + self.unet_loras: + lora.update_norms() + + def update_grad_norms(self): + for lora in self.text_encoder_loras + self.unet_loras: + lora.update_grad_norms() + + def grad_norms(self) -> Tensor | None: + grad_norms = [] + for lora in self.text_encoder_loras + self.unet_loras: + if hasattr(lora, "grad_norms") and lora.grad_norms is not None: + grad_norms.append(lora.grad_norms.mean(dim=0)) + return torch.stack(grad_norms) if len(grad_norms) > 0 else None + + def weight_norms(self) -> Tensor | None: + weight_norms = [] + for lora in self.text_encoder_loras + self.unet_loras: + if hasattr(lora, "weight_norms") and lora.weight_norms is not None: + weight_norms.append(lora.weight_norms.mean(dim=0)) + return torch.stack(weight_norms) if len(weight_norms) > 0 else None + + def combined_weight_norms(self) -> Tensor | None: + combined_weight_norms = [] + for lora in self.text_encoder_loras + self.unet_loras: + if hasattr(lora, "combined_weight_norms") and lora.combined_weight_norms is not None: + combined_weight_norms.append(lora.combined_weight_norms.mean(dim=0)) + return torch.stack(combined_weight_norms) if len(combined_weight_norms) > 0 else None + + def load_weights(self, file): + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + info = self.load_state_dict(weights_sd, False) + return info + + def load_state_dict(self, state_dict, strict=True): + # override to convert original weight to split qkv + if not self.split_qkv: + return super().load_state_dict(state_dict, strict) + + # split qkv + for key in list(state_dict.keys()): + if "double" in key and "qkv" in key: + split_dims = [3072] * 3 + elif "single" in key and "linear1" in key: + split_dims = [3072] * 3 + [12288] + else: + continue + + weight = state_dict[key] + lora_name = key.split(".")[0] + if "lora_down" in key and "weight" in key: + # dense weight (rank*3, in_dim) + split_weight = torch.chunk(weight, len(split_dims), dim=0) + for i, split_w in enumerate(split_weight): + state_dict[f"{lora_name}.lora_down.{i}.weight"] = split_w + + del state_dict[key] + # print(f"split {key}: {weight.shape} to {[w.shape for w in split_weight]}") + elif "lora_up" in key and "weight" in key: + # sparse weight (out_dim=sum(split_dims), rank*3) + rank = weight.size(1) // len(split_dims) + i = 0 + for j in range(len(split_dims)): + state_dict[f"{lora_name}.lora_up.{j}.weight"] = weight[i : i + split_dims[j], j * rank : (j + 1) * rank] + i += split_dims[j] + del state_dict[key] + + # # check is sparse + # i = 0 + # is_zero = True + # for j in range(len(split_dims)): + # for k in range(len(split_dims)): + # if j == k: + # continue + # is_zero = is_zero and torch.all(weight[i : i + split_dims[j], k * rank : (k + 1) * rank] == 0) + # i += split_dims[j] + # if not is_zero: + # logger.warning(f"weight is not sparse: {key}") + # else: + # logger.info(f"weight is sparse: {key}") + + # print( + # f"split {key}: {weight.shape} to {[state_dict[k].shape for k in [f'{lora_name}.lora_up.{j}.weight' for j in range(len(split_dims))]]}" + # ) + + # alpha is unchanged + + return 
super().load_state_dict(state_dict, strict) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + if not self.split_qkv: + return super().state_dict(destination, prefix, keep_vars) + + # merge qkv + state_dict = super().state_dict(destination, prefix, keep_vars) + new_state_dict = {} + for key in list(state_dict.keys()): + if "double" in key and "qkv" in key: + split_dims = [3072] * 3 + elif "single" in key and "linear1" in key: + split_dims = [3072] * 3 + [12288] + else: + new_state_dict[key] = state_dict[key] + continue + + if key not in state_dict: + continue # already merged + + lora_name = key.split(".")[0] + + # (rank, in_dim) * 3 + down_weights = [state_dict.pop(f"{lora_name}.lora_down.{i}.weight") for i in range(len(split_dims))] + # (split dim, rank) * 3 + up_weights = [state_dict.pop(f"{lora_name}.lora_up.{i}.weight") for i in range(len(split_dims))] + + alpha = state_dict.pop(f"{lora_name}.alpha") + + # merge down weight + down_weight = torch.cat(down_weights, dim=0) # (rank, split_dim) * 3 -> (rank*3, sum of split_dim) + + # merge up weight (sum of split_dim, rank*3) + rank = up_weights[0].size(1) + up_weight = torch.zeros((sum(split_dims), down_weight.size(0)), device=down_weight.device, dtype=down_weight.dtype) + i = 0 + for j in range(len(split_dims)): + up_weight[i : i + split_dims[j], j * rank : (j + 1) * rank] = up_weights[j] + i += split_dims[j] + + new_state_dict[f"{lora_name}.lora_down.weight"] = down_weight + new_state_dict[f"{lora_name}.lora_up.weight"] = up_weight + new_state_dict[f"{lora_name}.alpha"] = alpha + + # print( + # f"merged {lora_name}: {lora_name}, {[w.shape for w in down_weights]}, {[w.shape for w in up_weights]} to {down_weight.shape}, {up_weight.shape}" + # ) + print(f"new key: {lora_name}.lora_down.weight, {lora_name}.lora_up.weight, {lora_name}.alpha") + + return new_state_dict + + def apply_to(self, text_encoders, flux, apply_text_encoder=True, apply_unet=True): + if apply_text_encoder: + logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules") + else: + self.text_encoder_loras = [] + + if apply_unet: + logger.info(f"enable LoRA for U-Net: {len(self.unet_loras)} modules") + else: + self.unet_loras = [] + + for lora in self.text_encoder_loras + self.unet_loras: + lora.apply_to() + self.add_module(lora.lora_name, lora) + + # マージできるかどうかを返す + def is_mergeable(self): + return True + + # TODO refactor to common function with apply_to + def merge_to(self, text_encoders, flux, weights_sd, dtype=None, device=None): + apply_text_encoder = apply_unet = False + for key in weights_sd.keys(): + if key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_CLIP) or key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_T5): + apply_text_encoder = True + elif key.startswith(LoRANetwork.LORA_PREFIX_FLUX): + apply_unet = True + + if apply_text_encoder: + logger.info("enable LoRA for text encoder") + else: + self.text_encoder_loras = [] + + if apply_unet: + logger.info("enable LoRA for U-Net") + else: + self.unet_loras = [] + + for lora in self.text_encoder_loras + self.unet_loras: + sd_for_lora = {} + for key in weights_sd.keys(): + if key.startswith(lora.lora_name): + sd_for_lora[key[len(lora.lora_name) + 1 :]] = weights_sd[key] + lora.merge_to(sd_for_lora, dtype, device) + + logger.info(f"weights are merged") + + def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): + self.loraplus_lr_ratio = loraplus_lr_ratio + self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio + 
self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio + + logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}") + logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}") + + def prepare_optimizer_params_with_multiple_te_lrs(self, text_encoder_lr, unet_lr, default_lr): + # make sure text_encoder_lr as list of two elements + # if float, use the same value for both text encoders + if text_encoder_lr is None or (isinstance(text_encoder_lr, list) and len(text_encoder_lr) == 0): + text_encoder_lr = [default_lr, default_lr] + elif isinstance(text_encoder_lr, float) or isinstance(text_encoder_lr, int): + text_encoder_lr = [float(text_encoder_lr), float(text_encoder_lr)] + elif len(text_encoder_lr) == 1: + text_encoder_lr = [text_encoder_lr[0], text_encoder_lr[0]] + + self.requires_grad_(True) + + all_params = [] + lr_descriptions = [] + + reg_lrs_list = list(self.reg_lrs.items()) if self.reg_lrs is not None else [] + + def assemble_params(loras, lr, loraplus_ratio): + param_groups = {"lora": {}, "plus": {}} + # regular expression param groups: {"reg_lr_0": {"lora": {}, "plus": {}}, ...} + reg_groups = {} + + for lora in loras: + # check if this lora matches any regex learning rate + matched_reg_lr = None + for i, (regex_str, reg_lr) in enumerate(reg_lrs_list): + try: + if re.search(regex_str, lora.lora_name): + matched_reg_lr = (i, reg_lr) + logger.info(f"Module {lora.lora_name} matched regex '{regex_str}' -> LR {reg_lr}") + break + except re.error: + # regex error should have been caught during parsing, but just in case + continue + + for name, param in lora.named_parameters(): + param_key = f"{lora.lora_name}.{name}" + is_plus = loraplus_ratio is not None and "lora_up" in name + + if matched_reg_lr is not None: + # use regex-specific learning rate + reg_idx, reg_lr = matched_reg_lr + group_key = f"reg_lr_{reg_idx}" + if group_key not in reg_groups: + reg_groups[group_key] = {"lora": {}, "plus": {}, "lr": reg_lr} + + if is_plus: + reg_groups[group_key]["plus"][param_key] = param + else: + reg_groups[group_key]["lora"][param_key] = param + else: + # use default learning rate + if is_plus: + param_groups["plus"][param_key] = param + else: + param_groups["lora"][param_key] = param + + params = [] + descriptions = [] + + # process regex-specific groups first (higher priority) + for group_key in sorted(reg_groups.keys()): + group = reg_groups[group_key] + reg_lr = group["lr"] + + for param_type in ["lora", "plus"]: + if len(group[param_type]) == 0: + continue + + param_data = {"params": group[param_type].values()} + + if param_type == "plus" and loraplus_ratio is not None: + param_data["lr"] = reg_lr * loraplus_ratio + else: + param_data["lr"] = reg_lr + + if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: + continue + + params.append(param_data) + desc = f"reg_lr_{group_key.split('_')[-1]}" + if param_type == "plus": + desc += " plus" + descriptions.append(desc) + + # process default groups + for key in param_groups.keys(): + param_data = {"params": param_groups[key].values()} + + if len(param_data["params"]) == 0: + continue + + if lr is not None: + if key == "plus": + param_data["lr"] = lr * loraplus_ratio + else: + param_data["lr"] = lr + + if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: + logger.info("NO LR skipping!") + continue + + params.append(param_data) + descriptions.append("plus" if key == "plus" else "") + + return params, 
descriptions + + if self.text_encoder_loras: + loraplus_lr_ratio = self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio + + # split text encoder loras for te1 and te3 + te1_loras = [lora for lora in self.text_encoder_loras if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER_CLIP)] + te3_loras = [lora for lora in self.text_encoder_loras if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER_T5)] + if len(te1_loras) > 0: + logger.info(f"Text Encoder 1 (CLIP-L): {len(te1_loras)} modules, LR {text_encoder_lr[0]}") + params, descriptions = assemble_params(te1_loras, text_encoder_lr[0], loraplus_lr_ratio) + all_params.extend(params) + lr_descriptions.extend(["textencoder 1 " + (" " + d if d else "") for d in descriptions]) + if len(te3_loras) > 0: + logger.info(f"Text Encoder 2 (T5XXL): {len(te3_loras)} modules, LR {text_encoder_lr[1]}") + params, descriptions = assemble_params(te3_loras, text_encoder_lr[1], loraplus_lr_ratio) + all_params.extend(params) + lr_descriptions.extend(["textencoder 2 " + (" " + d if d else "") for d in descriptions]) + + if self.unet_loras: + params, descriptions = assemble_params( + self.unet_loras, + unet_lr if unet_lr is not None else default_lr, + self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio, + ) + all_params.extend(params) + lr_descriptions.extend(["unet" + (" " + d if d else "") for d in descriptions]) + + return all_params, lr_descriptions + + def enable_gradient_checkpointing(self): + # not supported + pass + + def prepare_grad_etc(self, text_encoder, unet): + self.requires_grad_(True) + + def on_epoch_start(self, text_encoder, unet): + self.train() + + def get_trainable_params(self): + return self.parameters() + + def save_weights(self, file, dtype, metadata): + if metadata is not None and len(metadata) == 0: + metadata = None + + state_dict = self.state_dict() + + if dtype is not None: + for key in list(state_dict.keys()): + v = state_dict[key] + v = v.detach().clone().to("cpu").to(dtype) + state_dict[key] = v + + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import save_file + from library import train_util + + # Precalculate model hashes to save time on indexing + if metadata is None: + metadata = {} + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + save_file(state_dict, file, metadata) + else: + torch.save(state_dict, file) + + def backup_weights(self): + # 重みのバックアップを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + if not hasattr(org_module, "_lora_org_weight"): + sd = org_module.state_dict() + org_module._lora_org_weight = sd["weight"].detach().clone() + org_module._lora_restored = True + + def restore_weights(self): + # 重みのリストアを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + if not org_module._lora_restored: + sd = org_module.state_dict() + sd["weight"] = org_module._lora_org_weight + org_module.load_state_dict(sd) + org_module._lora_restored = True + + def pre_calculation(self): + # 事前計算を行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + sd = org_module.state_dict() + + org_weight = sd["weight"] + lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype) + sd["weight"] = org_weight + 
lora_weight + assert sd["weight"].shape == org_weight.shape + org_module.load_state_dict(sd) + + org_module._lora_restored = False + lora.enabled = False + + def apply_max_norm_regularization(self, max_norm_value, device): + downkeys = [] + upkeys = [] + alphakeys = [] + norms = [] + keys_scaled = 0 + + state_dict = self.state_dict() + for key in state_dict.keys(): + if "lora_down" in key and "weight" in key: + downkeys.append(key) + upkeys.append(key.replace("lora_down", "lora_up")) + alphakeys.append(key.replace("lora_down.weight", "alpha")) + + for i in range(len(downkeys)): + down = state_dict[downkeys[i]].to(device) + up = state_dict[upkeys[i]].to(device) + alpha = state_dict[alphakeys[i]].to(device) + dim = down.shape[0] + scale = alpha / dim + + if up.shape[2:] == (1, 1) and down.shape[2:] == (1, 1): + updown = (up.squeeze(2).squeeze(2) @ down.squeeze(2).squeeze(2)).unsqueeze(2).unsqueeze(3) + elif up.shape[2:] == (3, 3) or down.shape[2:] == (3, 3): + updown = torch.nn.functional.conv2d(down.permute(1, 0, 2, 3), up).permute(1, 0, 2, 3) + else: + updown = up @ down + + updown *= scale + + norm = updown.norm().clamp(min=max_norm_value / 2) + desired = torch.clamp(norm, max=max_norm_value) + ratio = desired.cpu() / norm.cpu() + sqrt_ratio = ratio**0.5 + if ratio != 1: + keys_scaled += 1 + state_dict[upkeys[i]] *= sqrt_ratio + state_dict[downkeys[i]] *= sqrt_ratio + scalednorm = updown.norm() * ratio + norms.append(scalednorm.item()) + + return keys_scaled, sum(norms) / len(norms), max(norms) From 7f983c558de540c26b888de66da8a0acfbdc45b6 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 11 Sep 2025 22:15:22 +0900 Subject: [PATCH 546/582] feat: block swap for inference and initial impl for HunyuanImage LoRA (not working) --- _typos.toml | 9 +- hunyuan_image_minimal_inference.py | 45 +- hunyuan_image_train_network.py | 640 ++++++++++++++ library/custom_offloading_utils.py | 155 +++- library/device_utils.py | 10 + library/hunyuan_image_models.py | 82 ++ library/hunyuan_image_modules.py | 56 +- library/hunyuan_image_text_encoder.py | 145 ++-- library/hunyuan_image_utils.py | 40 +- library/lora_utils.py | 2 +- library/sai_model_spec.py | 113 ++- library/strategy_hunyuan_image.py | 187 ++++ library/train_util.py | 24 +- networks/lora_flux.py | 18 +- networks/lora_hunyuan_image.py | 1137 +------------------------ train_network.py | 5 +- 16 files changed, 1364 insertions(+), 1304 deletions(-) create mode 100644 hunyuan_image_train_network.py create mode 100644 library/strategy_hunyuan_image.py diff --git a/_typos.toml b/_typos.toml index bbf7728f4..75f0bf055 100644 --- a/_typos.toml +++ b/_typos.toml @@ -29,7 +29,10 @@ koo="koo" yos="yos" wn="wn" hime="hime" +OT="OT" - -[files] -extend-exclude = ["_typos.toml", "venv"] +# [files] +# # Extend the default list of files to check +# extend-exclude = [ +# "library/hunyuan_image_text_encoder.py", +# ] diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index 8a956f491..ba8ca78e6 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -7,8 +7,8 @@ import re import time import copy -from types import ModuleType -from typing import Tuple, Optional, List, Any, Dict +from types import ModuleType, SimpleNamespace +from typing import Tuple, Optional, List, Any, Dict, Union import numpy as np import torch @@ -21,7 +21,7 @@ from library import hunyuan_image_models, hunyuan_image_text_encoder, hunyuan_image_utils from library import 
hunyuan_image_vae from library.hunyuan_image_vae import HunyuanVAE2D -from library.device_utils import clean_memory_on_device +from library.device_utils import clean_memory_on_device, synchronize_device from networks import lora_hunyuan_image @@ -29,7 +29,6 @@ if lycoris_available: from lycoris.kohya import create_network_from_weights -from library.custom_offloading_utils import synchronize_device from library.utils import mem_eff_save_file, setup_logging setup_logging() @@ -513,10 +512,11 @@ def move_models_to_device_if_needed(): else: move_models_to_device_if_needed() - embed, mask = hunyuan_image_text_encoder.get_qwen_prompt_embeds(tokenizer_vlm, text_encoder_vlm, prompt) - ocr_mask, embed_byt5, mask_byt5 = hunyuan_image_text_encoder.get_glyph_prompt_embeds( - tokenizer_byt5, text_encoder_byt5, prompt - ) + with torch.no_grad(): + embed, mask = hunyuan_image_text_encoder.get_qwen_prompt_embeds(tokenizer_vlm, text_encoder_vlm, prompt) + ocr_mask, embed_byt5, mask_byt5 = hunyuan_image_text_encoder.get_glyph_prompt_embeds( + tokenizer_byt5, text_encoder_byt5, prompt + ) embed = embed.cpu() mask = mask.cpu() embed_byt5 = embed_byt5.cpu() @@ -531,12 +531,13 @@ def move_models_to_device_if_needed(): else: move_models_to_device_if_needed() - negative_embed, negative_mask = hunyuan_image_text_encoder.get_qwen_prompt_embeds( - tokenizer_vlm, text_encoder_vlm, negative_prompt - ) - negative_ocr_mask, negative_embed_byt5, negative_mask_byt5 = hunyuan_image_text_encoder.get_glyph_prompt_embeds( - tokenizer_byt5, text_encoder_byt5, negative_prompt - ) + with torch.no_grad(): + negative_embed, negative_mask = hunyuan_image_text_encoder.get_qwen_prompt_embeds( + tokenizer_vlm, text_encoder_vlm, negative_prompt + ) + negative_ocr_mask, negative_embed_byt5, negative_mask_byt5 = hunyuan_image_text_encoder.get_glyph_prompt_embeds( + tokenizer_byt5, text_encoder_byt5, negative_prompt + ) negative_embed = negative_embed.cpu() negative_mask = negative_mask.cpu() negative_embed_byt5 = negative_embed_byt5.cpu() @@ -617,6 +618,18 @@ def generate( # model.move_to_device_except_swap_blocks(device) # Handles block swap correctly # model.prepare_block_swap_before_forward() + return generate_body(args, model, context, context_null, device, seed) + + +def generate_body( + args: Union[argparse.Namespace, SimpleNamespace], + model: hunyuan_image_models.HYImageDiffusionTransformer, + context: Dict[str, Any], + context_null: Optional[Dict[str, Any]], + device: torch.device, + seed: int, +) -> torch.Tensor: + # set random generator seed_g = torch.Generator(device="cpu") seed_g.manual_seed(seed) @@ -633,6 +646,10 @@ def generate( embed_byt5 = context["embed_byt5"].to(device, dtype=torch.bfloat16) mask_byt5 = context["mask_byt5"].to(device, dtype=torch.bfloat16) ocr_mask = context["ocr_mask"] # list of bool + + if context_null is None: + context_null = context # dummy for unconditional + negative_embed = context_null["embed"].to(device, dtype=torch.bfloat16) negative_mask = context_null["mask"].to(device, dtype=torch.bfloat16) negative_embed_byt5 = context_null["embed_byt5"].to(device, dtype=torch.bfloat16) diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py new file mode 100644 index 000000000..b1281fa01 --- /dev/null +++ b/hunyuan_image_train_network.py @@ -0,0 +1,640 @@ +import argparse +import copy +from typing import Any, Optional, Union +import argparse +import os +import time +from types import SimpleNamespace + +import numpy as np +import torch +import torch.nn as nn +from PIL import Image 
+from accelerate import Accelerator, PartialState
+
+from library import hunyuan_image_models, hunyuan_image_vae, strategy_base, train_util
+from library.device_utils import clean_memory_on_device, init_ipex
+
+init_ipex()
+
+import train_network
+from library import (
+    flux_train_utils,
+    hunyuan_image_models,
+    hunyuan_image_text_encoder,
+    hunyuan_image_utils,
+    hunyuan_image_vae,
+    sai_model_spec,
+    sd3_train_utils,
+    strategy_base,
+    strategy_hunyuan_image,
+    train_util,
+)
+from library.utils import setup_logging
+
+setup_logging()
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+# region sampling
+
+
+# TODO commonize with flux_utils
+def sample_images(
+    accelerator: Accelerator,
+    args: argparse.Namespace,
+    epoch,
+    steps,
+    dit,
+    vae,
+    text_encoders,
+    sample_prompts_te_outputs,
+    prompt_replacement=None,
+):
+    if steps == 0:
+        if not args.sample_at_first:
+            return
+    else:
+        if args.sample_every_n_steps is None and args.sample_every_n_epochs is None:
+            return
+        if args.sample_every_n_epochs is not None:
+            # sample_every_n_steps is ignored in this case
+            if epoch is None or epoch % args.sample_every_n_epochs != 0:
+                return
+        else:
+            if steps % args.sample_every_n_steps != 0 or epoch is not None:  # steps is not divisible or end of epoch
+                return
+
+    logger.info("")
+    logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}")
+    if not os.path.isfile(args.sample_prompts) and sample_prompts_te_outputs is None:
+        logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}")
+        return
+
+    distributed_state = PartialState()  # for multi gpu distributed inference. this is a singleton, so it's safe to use it here
+
+    # unwrap the DiT and text encoder(s)
+    dit = accelerator.unwrap_model(dit)
+    if text_encoders is not None:
+        text_encoders = [(accelerator.unwrap_model(te) if te is not None else None) for te in text_encoders]
+    # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders])
+
+    prompts = train_util.load_prompts(args.sample_prompts)
+
+    save_dir = args.output_dir + "/sample"
+    os.makedirs(save_dir, exist_ok=True)
+
+    # save random state to restore later
+    rng_state = torch.get_rng_state()
+    cuda_rng_state = None
+    try:
+        cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
+    except Exception:
+        pass
+
+    if distributed_state.num_processes <= 1:
+        # If only one device is available, just use the original prompt list. We don't need to care about the distribution of prompts.
+        with torch.no_grad(), accelerator.autocast():
+            for prompt_dict in prompts:
+                sample_image_inference(
+                    accelerator,
+                    args,
+                    dit,
+                    text_encoders,
+                    vae,
+                    save_dir,
+                    prompt_dict,
+                    epoch,
+                    steps,
+                    sample_prompts_te_outputs,
+                    prompt_replacement,
+                )
+    else:
+        # Creating list with N elements, where each element is a list of prompt_dicts, and N is the number of processes available (number of devices available)
+        # prompt_dicts are assigned to lists based on order of processes, to attempt to time the image creation time to match enum order. Probably only works when steps and sampler are identical. 
+ per_process_prompts = [] # list of lists + for i in range(distributed_state.num_processes): + per_process_prompts.append(prompts[i :: distributed_state.num_processes]) + + with torch.no_grad(): + with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists: + for prompt_dict in prompt_dict_lists[0]: + sample_image_inference( + accelerator, + args, + dit, + text_encoders, + vae, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_te_outputs, + prompt_replacement, + ) + + torch.set_rng_state(rng_state) + if cuda_rng_state is not None: + torch.cuda.set_rng_state(cuda_rng_state) + + clean_memory_on_device(accelerator.device) + + +def sample_image_inference( + accelerator: Accelerator, + args: argparse.Namespace, + dit: hunyuan_image_models.HYImageDiffusionTransformer, + text_encoders: Optional[list[nn.Module]], + vae: hunyuan_image_vae.HunyuanVAE2D, + save_dir, + prompt_dict, + epoch, + steps, + sample_prompts_te_outputs, + prompt_replacement, +): + assert isinstance(prompt_dict, dict) + negative_prompt = prompt_dict.get("negative_prompt") + sample_steps = prompt_dict.get("sample_steps", 20) + width = prompt_dict.get("width", 512) + height = prompt_dict.get("height", 512) + cfg_scale = prompt_dict.get("scale", 1.0) + seed = prompt_dict.get("seed") + prompt: str = prompt_dict.get("prompt", "") + flow_shift: float = prompt_dict.get("flow_shift", 4.0) + # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) + + if prompt_replacement is not None: + prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1]) + if negative_prompt is not None: + negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1]) + + if seed is not None: + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + else: + # True random sample image generation + torch.seed() + torch.cuda.seed() + + if negative_prompt is None: + negative_prompt = "" + height = max(64, height - height % 16) # round to divisible by 16 + width = max(64, width - width % 16) # round to divisible by 16 + logger.info(f"prompt: {prompt}") + if cfg_scale != 1.0: + logger.info(f"negative_prompt: {negative_prompt}") + elif negative_prompt != "": + logger.info(f"negative prompt is ignored because scale is 1.0") + logger.info(f"height: {height}") + logger.info(f"width: {width}") + logger.info(f"sample_steps: {sample_steps}") + if cfg_scale != 1.0: + logger.info(f"CFG scale: {cfg_scale}") + logger.info(f"flow_shift: {flow_shift}") + # logger.info(f"sample_sampler: {sampler_name}") + if seed is not None: + logger.info(f"seed: {seed}") + + # encode prompts + tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy() + encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy() + + def encode_prompt(prpt): + text_encoder_conds = [] + if sample_prompts_te_outputs and prpt in sample_prompts_te_outputs: + text_encoder_conds = sample_prompts_te_outputs[prpt] + print(f"Using cached text encoder outputs for prompt: {prpt}") + if text_encoders is not None: + print(f"Encoding prompt: {prpt}") + tokens_and_masks = tokenize_strategy.tokenize(prpt) + # strategy has apply_t5_attn_mask option + encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) + + # if text_encoder_conds is not cached, use encoded_text_encoder_conds + if len(text_encoder_conds) == 0: + text_encoder_conds = encoded_text_encoder_conds + else: + # if encoded_text_encoder_conds is not None, update cached text_encoder_conds + for i in 
range(len(encoded_text_encoder_conds)):
+                        if encoded_text_encoder_conds[i] is not None:
+                            text_encoder_conds[i] = encoded_text_encoder_conds[i]
+        return text_encoder_conds
+
+    vl_embed, vl_mask, byt5_embed, byt5_mask, ocr_mask = encode_prompt(prompt)
+    arg_c = {
+        "embed": vl_embed,
+        "mask": vl_mask,
+        "embed_byt5": byt5_embed,
+        "mask_byt5": byt5_mask,
+        "ocr_mask": ocr_mask,
+        "prompt": prompt,
+    }
+
+    # encode negative prompts
+    if cfg_scale != 1.0:
+        neg_vl_embed, neg_vl_mask, neg_byt5_embed, neg_byt5_mask, neg_ocr_mask = encode_prompt(negative_prompt)
+        arg_c_null = {
+            "embed": neg_vl_embed,
+            "mask": neg_vl_mask,
+            "embed_byt5": neg_byt5_embed,
+            "mask_byt5": neg_byt5_mask,
+            "ocr_mask": neg_ocr_mask,
+            "prompt": negative_prompt,
+        }
+    else:
+        arg_c_null = None
+
+    gen_args = SimpleNamespace(
+        image_size=(height, width), infer_steps=sample_steps, flow_shift=flow_shift, guidance_scale=cfg_scale
+    )
+
+    from hunyuan_image_minimal_inference import generate_body  # import here to avoid circular import
+
+    latents = generate_body(gen_args, dit, arg_c, arg_c_null, accelerator.device, seed)
+
+    # latent to image
+    clean_memory_on_device(accelerator.device)
+    org_vae_device = vae.device  # will be on cpu
+    vae.to(accelerator.device)  # distributed_state.device is same as accelerator.device
+    with torch.autocast(accelerator.device.type, vae.dtype, enabled=True), torch.no_grad():
+        x = latents / hunyuan_image_vae.VAE_SCALE_FACTOR
+        x = vae.decode(x)
+    vae.to(org_vae_device)
+    clean_memory_on_device(accelerator.device)
+
+    x = x.clamp(-1, 1)
+    x = x.permute(0, 2, 3, 1)
+    image = Image.fromarray((127.5 * (x + 1.0)).float().cpu().numpy().astype(np.uint8)[0])
+
+    # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are saved in the same order as the original prompt list
+    # but adding 'enum' to the filename should be enough
+
+    ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
+    num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
+    seed_suffix = "" if seed is None else f"_{seed}"
+    i: int = prompt_dict["enum"]
+    img_filename = f"{'' if args.output_name is None else args.output_name + '_'}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png"
+    image.save(os.path.join(save_dir, img_filename))
+
+    # send images to wandb if enabled
+    if "wandb" in [tracker.name for tracker in accelerator.trackers]:
+        wandb_tracker = accelerator.get_tracker("wandb")
+
+        import wandb
+
+        # not to commit images to avoid inconsistency between training and logging steps
+        wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False)  # positive prompt as a caption
+
+
+# endregion
+
+
+class HunyuanImageNetworkTrainer(train_network.NetworkTrainer):
+    def __init__(self):
+        super().__init__()
+        self.sample_prompts_te_outputs = None
+        self.is_swapping_blocks: bool = False
+
+    def assert_extra_args(
+        self,
+        args,
+        train_dataset_group: Union[train_util.DatasetGroup, train_util.MinimalDataset],
+        val_dataset_group: Optional[train_util.DatasetGroup],
+    ):
+        super().assert_extra_args(args, train_dataset_group, val_dataset_group)
+        # sdxl_train_util.verify_sdxl_training_args(args)
+
+        if args.mixed_precision == "fp16":
+            logger.warning(
+                "mixed_precision bf16 is recommended for HunyuanImage-2.1 / HunyuanImage-2.1ではmixed_precision bf16が推奨されます"
+            )
+
+        if (args.fp8_base or args.fp8_base_unet) and not args.fp8_scaled:
+            logger.warning(
+                "fp8_base and fp8_base_unet are not supported. 
Use fp8_scaled instead / fp8_baseとfp8_base_unetはサポートされていません。代わりにfp8_scaledを使用してください" + ) + if args.fp8_scaled and (args.fp8_base or args.fp8_base_unet): + logger.info( + "fp8_scaled is used, so fp8_base and fp8_base_unet are ignored / fp8_scaledが使われているので、fp8_baseとfp8_base_unetは無視されます" + ) + + if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: + logger.warning( + "cache_text_encoder_outputs_to_disk is enabled, so cache_text_encoder_outputs is also enabled / cache_text_encoder_outputs_to_diskが有効になっているため、cache_text_encoder_outputsも有効になります" + ) + args.cache_text_encoder_outputs = True + + if args.cache_text_encoder_outputs: + assert ( + train_dataset_group.is_text_encoder_output_cacheable() + ), "when caching Text Encoder output, either caption_dropout_rate, shuffle_caption, token_warmup_step or caption_tag_dropout_rate cannot be used / Text Encoderの出力をキャッシュするときはcaption_dropout_rate, shuffle_caption, token_warmup_step, caption_tag_dropout_rateは使えません" + + train_dataset_group.verify_bucket_reso_steps(32) + if val_dataset_group is not None: + val_dataset_group.verify_bucket_reso_steps(32) + + def load_target_model(self, args, weight_dtype, accelerator): + self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 + + # currently offload to cpu for some models + loading_dtype = None if args.fp8_scaled else weight_dtype + loading_device = "cpu" if self.is_swapping_blocks else accelerator.device + split_attn = True + + attn_mode = "torch" + + model = hunyuan_image_models.load_hunyuan_image_model( + accelerator.device, + args.pretrained_model_name_or_path, + attn_mode, + split_attn, + loading_device, + loading_dtype, + args.fp8_scaled, + ) + + if self.is_swapping_blocks: + # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. 
+ logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") + model.enable_block_swap(args.blocks_to_swap, accelerator.device) + + vl_dtype = torch.bfloat16 + vl_device = "cpu" + _, text_encoder_vlm = hunyuan_image_text_encoder.load_qwen2_5_vl( + args.text_encoder, dtype=vl_dtype, device=vl_device, disable_mmap=args.disable_mmap_load_safetensors + ) + _, text_encoder_byt5 = hunyuan_image_text_encoder.load_byt5( + args.byt5, dtype=torch.float16, device=vl_device, disable_mmap=args.disable_mmap_load_safetensors + ) + + vae = hunyuan_image_vae.load_vae(args.vae, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + + model_version = hunyuan_image_utils.MODEL_VERSION_2_1 + return model_version, [text_encoder_vlm, text_encoder_byt5], vae, model + + def get_tokenize_strategy(self, args): + return strategy_hunyuan_image.HunyuanImageTokenizeStrategy(args.tokenizer_cache_dir) + + def get_tokenizers(self, tokenize_strategy: strategy_hunyuan_image.HunyuanImageTokenizeStrategy): + return [tokenize_strategy.vlm_tokenizer, tokenize_strategy.byt5_tokenizer] + + def get_latents_caching_strategy(self, args): + return strategy_hunyuan_image.HunyuanImageLatentsCachingStrategy(args.cache_latents_to_disk, args.vae_batch_size, False) + + def get_text_encoding_strategy(self, args): + return strategy_hunyuan_image.HunyuanImageTextEncodingStrategy() + + def post_process_network(self, args, accelerator, network, text_encoders, unet): + pass + + def get_models_for_text_encoding(self, args, accelerator, text_encoders): + if args.cache_text_encoder_outputs: + return None # no text encoders are needed for encoding because both are cached + else: + return text_encoders + + def get_text_encoders_train_flags(self, args, text_encoders): + # HunyuanImage-2.1 does not support training VLM or byT5 + return [False, False] + + def get_text_encoder_outputs_caching_strategy(self, args): + if args.cache_text_encoder_outputs: + # if the text encoders is trained, we need tokenization, so is_partial is True + return strategy_hunyuan_image.HunyuanImageTextEncoderOutputsCachingStrategy( + args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, args.skip_cache_check, False + ) + else: + return None + + def cache_text_encoder_outputs_if_needed( + self, args, accelerator: Accelerator, unet, vae, text_encoders, dataset: train_util.DatasetGroup, weight_dtype + ): + if args.cache_text_encoder_outputs: + if not args.lowram: + # メモリ消費を減らす + logger.info("move vae and unet to cpu to save memory") + org_vae_device = vae.device + org_unet_device = unet.device + vae.to("cpu") + unet.to("cpu") + clean_memory_on_device(accelerator.device) + + logger.info("move text encoders to gpu") + text_encoders[0].to(accelerator.device) + text_encoders[1].to(accelerator.device) + + # VLM (bf16) and byT5 (fp16) are used for encoding, so we cannot use autocast here + dataset.new_cache_text_encoder_outputs(text_encoders, accelerator) + + # cache sample prompts + if args.sample_prompts is not None: + logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}") + + tokenize_strategy: strategy_hunyuan_image.HunyuanImageTokenizeStrategy = ( + strategy_base.TokenizeStrategy.get_strategy() + ) + text_encoding_strategy: strategy_hunyuan_image.HunyuanImageTextEncodingStrategy = ( + strategy_base.TextEncodingStrategy.get_strategy() + ) + + prompts = train_util.load_prompts(args.sample_prompts) + sample_prompts_te_outputs = {} # key: prompt, value: text encoder outputs + with accelerator.autocast(), torch.no_grad(): + 
for prompt_dict in prompts: + for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]: + if p not in sample_prompts_te_outputs: + logger.info(f"cache Text Encoder outputs for prompt: {p}") + tokens_and_masks = tokenize_strategy.tokenize(p) + sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens( + tokenize_strategy, text_encoders, tokens_and_masks + ) + self.sample_prompts_te_outputs = sample_prompts_te_outputs + + accelerator.wait_for_everyone() + + # move back to cpu + logger.info("move VLM back to cpu") + text_encoders[0].to("cpu") + logger.info("move byT5 back to cpu") + text_encoders[1].to("cpu") + clean_memory_on_device(accelerator.device) + + if not args.lowram: + logger.info("move vae and unet back to original device") + vae.to(org_vae_device) + unet.to(org_unet_device) + else: + # Text Encoderから毎回出力を取得するので、GPUに乗せておく + text_encoders[0].to(accelerator.device) + text_encoders[1].to(accelerator.device) + + def sample_images(self, accelerator, args, epoch, global_step, device, ae, tokenizer, text_encoder, flux): + text_encoders = text_encoder # for compatibility + text_encoders = self.get_models_for_text_encoding(args, accelerator, text_encoders) + + flux_train_utils.sample_images( + accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs + ) + + def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: + noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) + self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) + return noise_scheduler + + def encode_images_to_latents(self, args, vae, images): + return vae.encode(images) + + def shift_scale_latents(self, args, latents): + # for encoding, we need to scale the latents + return latents * hunyuan_image_vae.VAE_SCALE_FACTOR + + def get_noise_pred_and_target( + self, + args, + accelerator, + noise_scheduler, + latents, + batch, + text_encoder_conds, + unet: hunyuan_image_models.HYImageDiffusionTransformer, + network, + weight_dtype, + train_unet, + is_train=True, + ): + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + + # get noisy model input and timesteps + noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps( + args, noise_scheduler, latents, noise, accelerator.device, weight_dtype + ) + + if args.gradient_checkpointing: + noisy_model_input.requires_grad_(True) + for t in text_encoder_conds: + if t is not None and t.dtype.is_floating_point: + t.requires_grad_(True) + + # Predict the noise residual + # ocr_mask is for inference only, so it is not used here + vlm_embed, vlm_mask, byt5_embed, byt5_mask, ocr_mask = text_encoder_conds + + with torch.set_grad_enabled(is_train), accelerator.autocast(): + model_pred = unet(noisy_model_input, timesteps / 1000, vlm_embed, vlm_mask, byt5_embed, byt5_mask) + + # model prediction and weighting is omitted for HunyuanImage-2.1 currently + + # flow matching loss + target = noise - latents + + # differential output preservation is not used for HunyuanImage-2.1 currently + + return model_pred, target, timesteps, None + + def post_process_loss(self, loss, args, timesteps, noise_scheduler): + return loss + + def get_sai_model_spec(self, args): + # if self.model_type != "chroma": + # model_description = "schnell" if self.is_schnell else "dev" + # else: + # model_description = "chroma" + # return train_util.get_sai_model_spec(None, 
args, False, True, False, flux=model_description)
+        return train_util.get_sai_model_spec_dataclass(None, args, False, True, False, hunyuan_image="2.1")
+
+    def update_metadata(self, metadata, args):
+        metadata["ss_model_type"] = args.model_type
+        metadata["ss_logit_mean"] = args.logit_mean
+        metadata["ss_logit_std"] = args.logit_std
+        metadata["ss_mode_scale"] = args.mode_scale
+        metadata["ss_timestep_sampling"] = args.timestep_sampling
+        metadata["ss_sigmoid_scale"] = args.sigmoid_scale
+        metadata["ss_model_prediction_type"] = args.model_prediction_type
+        metadata["ss_discrete_flow_shift"] = args.discrete_flow_shift
+
+    def is_text_encoder_not_needed_for_training(self, args):
+        return args.cache_text_encoder_outputs and not self.is_train_text_encoder(args)
+
+    def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder):
+        # do not support text encoder training for HunyuanImage-2.1
+        pass
+
+    def cast_text_encoder(self):
+        return False  # VLM is bf16, byT5 is fp16, so do not cast to other dtype
+
+    def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype):
+        # fp8 text encoder for HunyuanImage-2.1 is not supported currently
+        pass
+
+    def on_validation_step_end(self, args, accelerator, network, text_encoders, unet, batch, weight_dtype):
+        if self.is_swapping_blocks:
+            # prepare for next forward: because backward pass is not called, we need to prepare it here
+            accelerator.unwrap_model(unet).prepare_block_swap_before_forward()
+
+    def prepare_unet_with_accelerator(
+        self, args: argparse.Namespace, accelerator: Accelerator, unet: torch.nn.Module
+    ) -> torch.nn.Module:
+        if not self.is_swapping_blocks:
+            return super().prepare_unet_with_accelerator(args, accelerator, unet)
+
+        # block swap is enabled, so let accelerate skip device placement and move non-swapped blocks ourselves
+        model: hunyuan_image_models.HYImageDiffusionTransformer = unet
+        model = accelerator.prepare(model, device_placement=[not self.is_swapping_blocks])
+        accelerator.unwrap_model(model).move_to_device_except_swap_blocks(accelerator.device)  # reduce peak memory usage
+        accelerator.unwrap_model(model).prepare_block_swap_before_forward()
+
+        return model
+
+
+def setup_parser() -> argparse.ArgumentParser:
+    parser = train_network.setup_parser()
+    train_util.add_dit_training_arguments(parser)
+
+    parser.add_argument(
+        "--timestep_sampling",
+        choices=["sigma", "uniform", "sigmoid", "shift", "flux_shift"],
+        default="sigma",
+        help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal, shift of sigmoid and FLUX.1 shifting."
+        " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト、FLUX.1のシフト。",
+    )
+    parser.add_argument(
+        "--sigmoid_scale",
+        type=float,
+        default=1.0,
+        help='Scale factor for sigmoid timestep sampling (only used when timestep-sampling is "sigmoid"). / sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"の場合のみ有効)。',
+    )
+    parser.add_argument(
+        "--model_prediction_type",
+        choices=["raw", "additive", "sigma_scaled"],
+        default="sigma_scaled",
+        help="How to interpret and process the model prediction: "
+        "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)."
+        " / モデル予測の解釈と処理方法:"
+        "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。",
+    )
+    parser.add_argument(
+        "--discrete_flow_shift",
+        type=float,
+        default=3.0,
+        help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. 
/ Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。", + ) + + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + train_util.verify_command_line_training_args(args) + args = train_util.read_config_from_file(args, parser) + + trainer = HunyuanImageNetworkTrainer() + trainer.train(args) diff --git a/library/custom_offloading_utils.py b/library/custom_offloading_utils.py index 55ff08b64..4fbea542a 100644 --- a/library/custom_offloading_utils.py +++ b/library/custom_offloading_utils.py @@ -1,19 +1,12 @@ from concurrent.futures import ThreadPoolExecutor import time -from typing import Optional, Union, Callable, Tuple +from typing import Any, Optional, Union, Callable, Tuple import torch import torch.nn as nn -from library.device_utils import clean_memory_on_device +from library.device_utils import clean_memory_on_device, synchronize_device - -def synchronize_device(device: torch.device): - if device.type == "cuda": - torch.cuda.synchronize() - elif device.type == "xpu": - torch.xpu.synchronize() - elif device.type == "mps": - torch.mps.synchronize() +# region block swap utils def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module): @@ -71,7 +64,6 @@ def swap_weight_devices_no_cuda(device: torch.device, layer_to_cpu: nn.Module, l if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) - # device to cpu for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True) @@ -97,7 +89,8 @@ class Offloader: common offloading class """ - def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): + def __init__(self, block_type: str, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): + self.block_type = block_type self.num_blocks = num_blocks self.blocks_to_swap = blocks_to_swap self.device = device @@ -117,12 +110,16 @@ def _submit_move_blocks(self, blocks, block_idx_to_cpu, block_idx_to_cuda): def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): if self.debug: start_time = time.perf_counter() - print(f"Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to {'CUDA' if self.cuda_available else 'device'}") + print( + f"[{self.block_type}] Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to {'CUDA' if self.cuda_available else 'device'}" + ) self.swap_weight_devices(block_to_cpu, block_to_cuda) if self.debug: - print(f"Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter()-start_time:.2f}s") + print( + f"[{self.block_type}] Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter() - start_time:.2f}s" + ) return bidx_to_cpu, bidx_to_cuda # , event block_to_cpu = blocks[block_idx_to_cpu] @@ -137,7 +134,7 @@ def _wait_blocks_move(self, block_idx): return if self.debug: - print(f"Wait for block {block_idx}") + print(f"[{self.block_type}] Wait for block {block_idx}") start_time = time.perf_counter() future = self.futures.pop(block_idx) @@ -146,33 +143,41 @@ def _wait_blocks_move(self, block_idx): assert block_idx == bidx_to_cuda, f"Block index mismatch: {block_idx} != {bidx_to_cuda}" if self.debug: - print(f"Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s") + print(f"[{self.block_type}] Waited for block {block_idx}: {time.perf_counter() - 
start_time:.2f}s") -# Gradient tensors -_grad_t = Union[tuple[torch.Tensor, ...], torch.Tensor] - class ModelOffloader(Offloader): """ supports forward offloading """ - def __init__(self, blocks: Union[list[nn.Module], nn.ModuleList], blocks_to_swap: int, device: torch.device, debug: bool = False): - super().__init__(len(blocks), blocks_to_swap, device, debug) + def __init__( + self, blocks: list[nn.Module], blocks_to_swap: int, supports_backward: bool, device: torch.device, debug: bool = False + ): + block_type = f"{blocks[0].__class__.__name__}" if len(blocks) > 0 else "Unknown" + super().__init__(block_type, len(blocks), blocks_to_swap, device, debug) + + self.supports_backward = supports_backward + self.forward_only = not supports_backward # forward only offloading: can be changed to True for inference - # register backward hooks - self.remove_handles = [] - for i, block in enumerate(blocks): - hook = self.create_backward_hook(blocks, i) - if hook is not None: - handle = block.register_full_backward_hook(hook) - self.remove_handles.append(handle) + if self.supports_backward: + # register backward hooks + self.remove_handles = [] + for i, block in enumerate(blocks): + hook = self.create_backward_hook(blocks, i) + if hook is not None: + handle = block.register_full_backward_hook(hook) + self.remove_handles.append(handle) + + def set_forward_only(self, forward_only: bool): + self.forward_only = forward_only def __del__(self): - for handle in self.remove_handles: - handle.remove() + if self.supports_backward: + for handle in self.remove_handles: + handle.remove() - def create_backward_hook(self, blocks: Union[list[nn.Module], nn.ModuleList], block_index: int) -> Optional[Callable[[nn.Module, _grad_t, _grad_t], Union[None, _grad_t]]]: + def create_backward_hook(self, blocks: list[nn.Module], block_index: int) -> Optional[callable]: # -1 for 0-based index num_blocks_propagated = self.num_blocks - block_index - 1 swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap @@ -186,7 +191,7 @@ def create_backward_hook(self, blocks: Union[list[nn.Module], nn.ModuleList], bl block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated block_idx_to_wait = block_index - 1 - def backward_hook(module: nn.Module, grad_input: _grad_t, grad_output: _grad_t): + def backward_hook(module, grad_input, grad_output): if self.debug: print(f"Backward hook for block {block_index}") @@ -198,20 +203,20 @@ def backward_hook(module: nn.Module, grad_input: _grad_t, grad_output: _grad_t): return backward_hook - def prepare_block_devices_before_forward(self, blocks: Union[list[nn.Module], nn.ModuleList]): + def prepare_block_devices_before_forward(self, blocks: list[nn.Module]): if self.blocks_to_swap is None or self.blocks_to_swap == 0: return if self.debug: - print("Prepare block devices before forward") + print(f"[{self.block_type}] Prepare block devices before forward") for b in blocks[0 : self.num_blocks - self.blocks_to_swap]: b.to(self.device) weighs_to_device(b, self.device) # make sure weights are on device for b in blocks[self.num_blocks - self.blocks_to_swap :]: - b.to(self.device) # move block to device first - weighs_to_device(b, torch.device("cpu")) # make sure weights are on cpu + b.to(self.device) # move block to device first. 
this makes sure that buffers (non weights) are on the device + weighs_to_device(b, "cpu") # make sure weights are on cpu synchronize_device(self.device) clean_memory_on_device(self.device) @@ -221,11 +226,85 @@ def wait_for_block(self, block_idx: int): return self._wait_blocks_move(block_idx) - def submit_move_blocks(self, blocks: Union[list[nn.Module], nn.ModuleList], block_idx: int): + def submit_move_blocks(self, blocks: list[nn.Module], block_idx: int): + # check if blocks_to_swap is enabled if self.blocks_to_swap is None or self.blocks_to_swap == 0: return - if block_idx >= self.blocks_to_swap: + + # if backward is enabled, we do not swap blocks in forward pass more than blocks_to_swap, because it should be on GPU + if not self.forward_only and block_idx >= self.blocks_to_swap: return + block_idx_to_cpu = block_idx block_idx_to_cuda = self.num_blocks - self.blocks_to_swap + block_idx + block_idx_to_cuda = block_idx_to_cuda % self.num_blocks # this works for forward-only offloading self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda) + + +# endregion + +# region cpu offload utils + + +def to_device(x: Any, device: torch.device) -> Any: + if isinstance(x, torch.Tensor): + return x.to(device) + elif isinstance(x, list): + return [to_device(elem, device) for elem in x] + elif isinstance(x, tuple): + return tuple(to_device(elem, device) for elem in x) + elif isinstance(x, dict): + return {k: to_device(v, device) for k, v in x.items()} + else: + return x + + +def to_cpu(x: Any) -> Any: + """ + Recursively moves torch.Tensor objects (and containers thereof) to CPU. + + Args: + x: A torch.Tensor, or a (possibly nested) list, tuple, or dict containing tensors. + + Returns: + The same structure as x, with all torch.Tensor objects moved to CPU. + Non-tensor objects are returned unchanged. + """ + if isinstance(x, torch.Tensor): + return x.cpu() + elif isinstance(x, list): + return [to_cpu(elem) for elem in x] + elif isinstance(x, tuple): + return tuple(to_cpu(elem) for elem in x) + elif isinstance(x, dict): + return {k: to_cpu(v) for k, v in x.items()} + else: + return x + + +def create_cpu_offloading_wrapper(func: Callable, device: torch.device) -> Callable: + """ + Create a wrapper function that offloads inputs to CPU before calling the original function + and moves outputs back to the specified device. + + Args: + func: The original function to wrap. + device: The device to move outputs back to. + + Returns: + A wrapped function that offloads inputs to CPU and moves outputs back to the specified device. 
+ """ + + def wrapper(orig_func: Callable) -> Callable: + def custom_forward(*inputs): + nonlocal device, orig_func + cuda_inputs = to_device(inputs, device) + outputs = orig_func(*cuda_inputs) + return to_cpu(outputs) + + return custom_forward + + return wrapper(func) + + +# endregion diff --git a/library/device_utils.py b/library/device_utils.py index d2e197450..deffa9af8 100644 --- a/library/device_utils.py +++ b/library/device_utils.py @@ -2,6 +2,7 @@ import gc import torch + try: # intel gpu support for pytorch older than 2.5 # ipex is not needed after pytorch 2.5 @@ -51,6 +52,15 @@ def clean_memory_on_device(device: torch.device): torch.mps.empty_cache() +def synchronize_device(device: torch.device): + if device.type == "cuda": + torch.cuda.synchronize() + elif device.type == "xpu": + torch.xpu.synchronize() + elif device.type == "mps": + torch.mps.synchronize() + + @functools.lru_cache(maxsize=None) def get_preferred_device() -> torch.device: r""" diff --git a/library/hunyuan_image_models.py b/library/hunyuan_image_models.py index 5bd08c5ca..9847c55ee 100644 --- a/library/hunyuan_image_models.py +++ b/library/hunyuan_image_models.py @@ -7,6 +7,7 @@ import torch.nn as nn from accelerate import init_empty_weights +from library import custom_offloading_utils from library.fp8_optimization_utils import apply_fp8_monkey_patch from library.lora_utils import load_safetensors_with_lora_and_fp8 from library.utils import setup_logging @@ -132,6 +133,74 @@ def __init__(self, attn_mode: str = "torch"): self.final_layer = FinalLayer(self.hidden_size, self.patch_size, self.out_channels, nn.SiLU) + self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False + self.blocks_to_swap = None + + self.offloader_double = None + self.offloader_single = None + self.num_double_blocks = len(self.double_blocks) + self.num_single_blocks = len(self.single_blocks) + + def enable_gradient_checkpointing(self, cpu_offload: bool = False): + self.gradient_checkpointing = True + self.cpu_offload_checkpointing = cpu_offload + + for block in self.double_blocks + self.single_blocks: + block.enable_gradient_checkpointing(cpu_offload=cpu_offload) + + print(f"HunyuanImage-2.1: Gradient checkpointing enabled. CPU offload: {cpu_offload}") + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + self.cpu_offload_checkpointing = False + + for block in self.double_blocks + self.single_blocks: + block.disable_gradient_checkpointing() + + print("HunyuanImage-2.1: Gradient checkpointing disabled.") + + def enable_block_swap(self, num_blocks: int, device: torch.device, supports_backward: bool = False): + self.blocks_to_swap = num_blocks + double_blocks_to_swap = num_blocks // 2 + single_blocks_to_swap = (num_blocks - double_blocks_to_swap) * 2 + + assert double_blocks_to_swap <= self.num_double_blocks - 2 and single_blocks_to_swap <= self.num_single_blocks - 2, ( + f"Cannot swap more than {self.num_double_blocks - 2} double blocks and {self.num_single_blocks - 2} single blocks. " + f"Requested {double_blocks_to_swap} double blocks and {single_blocks_to_swap} single blocks." + ) + + self.offloader_double = custom_offloading_utils.ModelOffloader( + self.double_blocks, double_blocks_to_swap, supports_backward, device + ) + self.offloader_single = custom_offloading_utils.ModelOffloader( + self.single_blocks, single_blocks_to_swap, supports_backward, device + ) + # , debug=True + print( + f"HunyuanImage-2.1: Block swap enabled. 
Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." + ) + + def move_to_device_except_swap_blocks(self, device: torch.device): + # assume model is on cpu. do not move blocks to device to reduce temporary memory usage + if self.blocks_to_swap: + save_double_blocks = self.double_blocks + save_single_blocks = self.single_blocks + self.double_blocks = nn.ModuleList() + self.single_blocks = nn.ModuleList() + + self.to(device) + + if self.blocks_to_swap: + self.double_blocks = save_double_blocks + self.single_blocks = save_single_blocks + + def prepare_block_swap_before_forward(self): + if self.blocks_to_swap is None or self.blocks_to_swap == 0: + return + self.offloader_double.prepare_block_devices_before_forward(self.double_blocks) + self.offloader_single.prepare_block_devices_before_forward(self.single_blocks) + def get_rotary_pos_embed(self, rope_sizes): """ Generate 2D rotary position embeddings for image tokens. @@ -255,16 +324,29 @@ def forward( txt = txt[:, :max_txt_len, :] txt_seq_len = txt.shape[1] + input_device = img.device + # Process through double-stream blocks (separate image/text attention) for index, block in enumerate(self.double_blocks): + if self.blocks_to_swap: + self.offloader_double.wait_for_block(index) img, txt = block(img, txt, vec, freqs_cis, seq_lens) + if self.blocks_to_swap: + self.offloader_double.submit_move_blocks(self.double_blocks, index) # Concatenate image and text tokens for joint processing x = torch.cat((img, txt), 1) # Process through single-stream blocks (joint attention) for index, block in enumerate(self.single_blocks): + if self.blocks_to_swap: + self.offloader_single.wait_for_block(index) x = block(x, vec, txt_seq_len, freqs_cis, seq_lens) + if self.blocks_to_swap: + self.offloader_single.submit_move_blocks(self.single_blocks, index) + + x = x.to(input_device) + vec = vec.to(input_device) img = x[:, :img_seq_len, ...] 
diff --git a/library/hunyuan_image_modules.py b/library/hunyuan_image_modules.py
index b4ded4c53..633cd310d 100644
--- a/library/hunyuan_image_modules.py
+++ b/library/hunyuan_image_modules.py
@@ -6,6 +6,7 @@
 import torch.nn as nn
 from einops import rearrange
 
+from library import custom_offloading_utils
 from library.attention import attention
 from library.hunyuan_image_utils import timestep_embedding, apply_rotary_emb, _to_tuple, apply_gate, modulate
 from library.attention import attention
@@ -608,7 +609,18 @@ def __init__(
         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.txt_mlp = MLP(hidden_size, mlp_hidden_dim, act_layer=lambda: nn.GELU(approximate="tanh"), bias=True)
 
-    def forward(
+        self.gradient_checkpointing = False
+        self.cpu_offload_checkpointing = False
+
+    def enable_gradient_checkpointing(self, cpu_offload: bool = False):
+        self.gradient_checkpointing = True
+        self.cpu_offload_checkpointing = cpu_offload
+
+    def disable_gradient_checkpointing(self):
+        self.gradient_checkpointing = False
+        self.cpu_offload_checkpointing = False
+
+    def _forward(
         self, img: torch.Tensor, txt: torch.Tensor, vec: torch.Tensor, freqs_cis: tuple = None, seq_lens: list[int] = None
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Extract modulation parameters for image and text streams
@@ -688,6 +700,18 @@ def forward(
 
         return img, txt
 
+    def forward(
+        self, img: torch.Tensor, txt: torch.Tensor, vec: torch.Tensor, freqs_cis: tuple = None, seq_lens: list[int] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.gradient_checkpointing and self.training:
+            forward_fn = self._forward
+            if self.cpu_offload_checkpointing:
+                forward_fn = custom_offloading_utils.create_cpu_offloading_wrapper(forward_fn, self.img_attn_qkv.weight.device)
+
+            return torch.utils.checkpoint.checkpoint(forward_fn, img, txt, vec, freqs_cis, seq_lens, use_reentrant=False)
+        else:
+            return self._forward(img, txt, vec, freqs_cis, seq_lens)
+
 
 class MMSingleStreamBlock(nn.Module):
     """
@@ -748,7 +772,18 @@ def __init__(
         self.mlp_act = nn.GELU(approximate="tanh")
         self.modulation = ModulateDiT(hidden_size, factor=3, act_layer=nn.SiLU)
 
-    def forward(
+        self.gradient_checkpointing = False
+        self.cpu_offload_checkpointing = False
+
+    def enable_gradient_checkpointing(self, cpu_offload: bool = False):
+        self.gradient_checkpointing = True
+        self.cpu_offload_checkpointing = cpu_offload
+
+    def disable_gradient_checkpointing(self):
+        self.gradient_checkpointing = False
+        self.cpu_offload_checkpointing = False
+
+    def _forward(
         self,
         x: torch.Tensor,
         vec: torch.Tensor,
@@ -800,5 +835,22 @@ def forward(
 
         return x + apply_gate(output, gate=mod_gate)
 
+    def forward(
+        self,
+        x: torch.Tensor,
+        vec: torch.Tensor,
+        txt_len: int,
+        freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
+        seq_lens: list[int] = None,
+    ) -> torch.Tensor:
+        if self.gradient_checkpointing and self.training:
+            forward_fn = self._forward
+            if self.cpu_offload_checkpointing:
+                forward_fn = custom_offloading_utils.create_cpu_offloading_wrapper(forward_fn, self.linear1.weight.device)
+
+            return torch.utils.checkpoint.checkpoint(forward_fn, x, vec, txt_len, freqs_cis, seq_lens, use_reentrant=False)
+        else:
+            return self._forward(x, vec, txt_len, freqs_cis, seq_lens)
+
 
 # endregion
diff --git a/library/hunyuan_image_text_encoder.py b/library/hunyuan_image_text_encoder.py
index 85bdaa43e..1300b39b7 100644
--- a/library/hunyuan_image_text_encoder.py
+++ b/library/hunyuan_image_text_encoder.py
@@ -24,7 +24,7 @@
 
 BYT5_TOKENIZER_PATH = "google/byt5-small"
-QWEN_2_5_VL_IMAGE_ID ="Qwen/Qwen2.5-VL-7B-Instruct" +QWEN_2_5_VL_IMAGE_ID = "Qwen/Qwen2.5-VL-7B-Instruct" # Copy from Glyph-SDXL-V2 @@ -228,6 +228,7 @@ def load_byt5( info = byt5_text_encoder.load_state_dict(sd, strict=True, assign=True) byt5_text_encoder.to(device) + byt5_text_encoder.eval() logger.info(f"BYT5 text encoder loaded with info: {info}") return byt5_tokenizer, byt5_text_encoder @@ -404,6 +405,7 @@ def load_qwen2_5_vl( info = qwen2_5_vl.load_state_dict(sd, strict=True, assign=True) logger.info(f"Loaded Qwen2.5-VL: {info}") qwen2_5_vl.to(device) + qwen2_5_vl.eval() if dtype is not None: if dtype.itemsize == 1: # fp8 @@ -494,43 +496,59 @@ def forward( # Load tokenizer logger.info(f"Loading tokenizer from {QWEN_2_5_VL_IMAGE_ID}") - tokenizer = Qwen2Tokenizer.from_pretrained(QWEN_2_5_VL_IMAGE_ID) + tokenizer = Qwen2Tokenizer.from_pretrained(QWEN_2_5_VL_IMAGE_ID) return tokenizer, qwen2_5_vl +TOKENIZER_MAX_LENGTH = 1024 +PROMPT_TEMPLATE_ENCODE_START_IDX = 34 + + def get_qwen_prompt_embeds( tokenizer: Qwen2Tokenizer, vlm: Qwen2_5_VLForConditionalGeneration, prompt: Union[str, list[str]] = None -): - tokenizer_max_length = 1024 +) -> Tuple[torch.Tensor, torch.Tensor]: + input_ids, mask = get_qwen_tokens(tokenizer, prompt) + return get_qwen_prompt_embeds_from_tokens(vlm, input_ids, mask) + + +def get_qwen_tokens(tokenizer: Qwen2Tokenizer, prompt: Union[str, list[str]] = None) -> Tuple[torch.Tensor, torch.Tensor]: + tokenizer_max_length = TOKENIZER_MAX_LENGTH # HunyuanImage-2.1 does not use "<|im_start|>assistant\n" in the prompt template prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>" # \n<|im_start|>assistant\n" - prompt_template_encode_start_idx = 34 + prompt_template_encode_start_idx = PROMPT_TEMPLATE_ENCODE_START_IDX # default_sample_size = 128 - device = vlm.device - dtype = vlm.dtype - prompt = [prompt] if isinstance(prompt, str) else prompt template = prompt_template_encode drop_idx = prompt_template_encode_start_idx txt = [template.format(e) for e in prompt] - txt_tokens = tokenizer(txt, max_length=tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt").to( - device - ) + txt_tokens = tokenizer(txt, max_length=tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt") + return txt_tokens.input_ids, txt_tokens.attention_mask + + +def get_qwen_prompt_embeds_from_tokens( + vlm: Qwen2_5_VLForConditionalGeneration, input_ids: torch.Tensor, attention_mask: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + tokenizer_max_length = TOKENIZER_MAX_LENGTH + drop_idx = PROMPT_TEMPLATE_ENCODE_START_IDX + + device = vlm.device + dtype = vlm.dtype + + input_ids = input_ids.to(device=device) + attention_mask = attention_mask.to(device=device) if dtype.itemsize == 1: # fp8 + # TODO dtype should be vlm.dtype? 
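        # Reviewer note (not part of the patch): float8 dtypes are not valid autocast dtypes,
        # so when the VLM weights were loaded as fp8 (itemsize == 1) the forward pass below runs
        # under bfloat16 autocast; otherwise the model's own dtype is used as the autocast dtype.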
with torch.no_grad(), torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=True): - encoder_hidden_states = vlm( - input_ids=txt_tokens.input_ids, attention_mask=txt_tokens.attention_mask, output_hidden_states=True - ) + encoder_hidden_states = vlm(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) else: with torch.no_grad(), torch.autocast(device_type=device.type, dtype=dtype, enabled=True): - encoder_hidden_states = vlm( - input_ids=txt_tokens.input_ids, attention_mask=txt_tokens.attention_mask, output_hidden_states=True - ) + encoder_hidden_states = vlm(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) + hidden_states = encoder_hidden_states.hidden_states[-3] # use the 3rd last layer's hidden states for HunyuanImage-2.1 if hidden_states.shape[1] > tokenizer_max_length + drop_idx: logger.warning(f"Hidden states shape {hidden_states.shape} exceeds max length {tokenizer_max_length + drop_idx}") @@ -545,7 +563,7 @@ def get_qwen_prompt_embeds( # ---------------------------------------------------------- prompt_embeds = hidden_states[:, drop_idx:, :] - encoder_attention_mask = txt_tokens.attention_mask[:, drop_idx:] + encoder_attention_mask = attention_mask[:, drop_idx:] prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) return prompt_embeds, encoder_attention_mask @@ -565,17 +583,42 @@ def format_prompt(texts, styles): return prompt +BYT5_MAX_LENGTH = 128 + + def get_glyph_prompt_embeds( - tokenizer: T5Tokenizer, text_encoder: T5Stack, prompt: Union[str, list[str]] = None + tokenizer: T5Tokenizer, text_encoder: T5Stack, prompt: Optional[str] = None ) -> Tuple[list[bool], torch.Tensor, torch.Tensor]: - byt5_max_length = 128 - if not prompt: + byt5_tokens, byt5_text_mask = get_byt5_text_tokens(tokenizer, prompt) + return get_byt5_prompt_embeds_from_tokens(text_encoder, byt5_tokens, byt5_text_mask) + + +def get_byt5_prompt_embeds_from_tokens( + text_encoder: T5Stack, byt5_text_ids: Optional[torch.Tensor], byt5_text_mask: Optional[torch.Tensor] +) -> Tuple[list[bool], torch.Tensor, torch.Tensor]: + byt5_max_length = BYT5_MAX_LENGTH + + if byt5_text_ids is None or byt5_text_mask is None: return ( [False], torch.zeros((1, byt5_max_length, 1472), device=text_encoder.device), torch.zeros((1, byt5_max_length), device=text_encoder.device, dtype=torch.int64), ) + byt5_text_ids = byt5_text_ids.to(device=text_encoder.device) + byt5_text_mask = byt5_text_mask.to(device=text_encoder.device) + + with torch.no_grad(), torch.autocast(device_type=text_encoder.device.type, dtype=text_encoder.dtype, enabled=True): + byt5_prompt_embeds = text_encoder(byt5_text_ids, attention_mask=byt5_text_mask.float()) + byt5_emb = byt5_prompt_embeds[0] + + return [True], byt5_emb, byt5_text_mask + + +def get_byt5_text_tokens(tokenizer, prompt): + if not prompt: + return None, None + try: text_prompt_texts = [] # pattern_quote_single = r"\'(.*?)\'" @@ -594,56 +637,26 @@ def get_glyph_prompt_embeds( text_prompt_texts.extend(matches_quote_chinese_double) if not text_prompt_texts: - return ( - [False], - torch.zeros((1, byt5_max_length, 1472), device=text_encoder.device), - torch.zeros((1, byt5_max_length), device=text_encoder.device, dtype=torch.int64), - ) + return None, None text_prompt_style_list = [{"color": None, "font-family": None} for _ in range(len(text_prompt_texts))] glyph_text_formatted = format_prompt(text_prompt_texts, text_prompt_style_list) + logger.info(f"Glyph text formatted: {glyph_text_formatted}") + + byt5_text_inputs = 
tokenizer( + glyph_text_formatted, + padding="max_length", + max_length=BYT5_MAX_LENGTH, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) - byt5_text_ids, byt5_text_mask = get_byt5_text_tokens(tokenizer, byt5_max_length, glyph_text_formatted) - - byt5_text_ids = byt5_text_ids.to(device=text_encoder.device) - byt5_text_mask = byt5_text_mask.to(device=text_encoder.device) - - byt5_prompt_embeds = text_encoder(byt5_text_ids, attention_mask=byt5_text_mask.float()) - byt5_emb = byt5_prompt_embeds[0] + byt5_text_ids = byt5_text_inputs.input_ids + byt5_text_mask = byt5_text_inputs.attention_mask - return [True], byt5_emb, byt5_text_mask + return byt5_text_ids, byt5_text_mask except Exception as e: logger.warning(f"Warning: Error in glyph encoding, using fallback: {e}") - return ( - [False], - torch.zeros((1, byt5_max_length, 1472), device=text_encoder.device), - torch.zeros((1, byt5_max_length), device=text_encoder.device, dtype=torch.int64), - ) - - -def get_byt5_text_tokens(tokenizer, max_length, text_list): - """ - Get byT5 text tokens. - - Args: - tokenizer: The tokenizer object - max_length: Maximum token length - text_list: List or string of text - - Returns: - Tuple of (byt5_text_ids, byt5_text_mask) - """ - if isinstance(text_list, list): - text_prompt = " ".join(text_list) - else: - text_prompt = text_list - - byt5_text_inputs = tokenizer( - text_prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, return_tensors="pt" - ) - - byt5_text_ids = byt5_text_inputs.input_ids - byt5_text_mask = byt5_text_inputs.attention_mask - - return byt5_text_ids, byt5_text_mask + return None, None diff --git a/library/hunyuan_image_utils.py b/library/hunyuan_image_utils.py index 17847104a..79756dd7e 100644 --- a/library/hunyuan_image_utils.py +++ b/library/hunyuan_image_utils.py @@ -5,6 +5,18 @@ from typing import Tuple, Union, Optional import torch +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +MODEL_VERSION_2_1 = "hunyuan-image-2.1" + +# region model + def _to_tuple(x, dim=2): """ @@ -206,7 +218,7 @@ def reshape_for_broadcast( x.shape[1], x.shape[-1], ), f"Frequency tensor shape {freqs_cis[0].shape} incompatible with target shape {x.shape}" - + shape = [d if i == 1 or i == x.ndim - 1 else 1 for i, d in enumerate(x.shape)] return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape) @@ -248,7 +260,7 @@ def apply_rotary_emb( cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first) cos, sin = cos.to(device), sin.to(device) - + # Apply rotation: x' = x * cos + rotate_half(x) * sin xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).to(dtype) xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).to(dtype) @@ -256,6 +268,11 @@ def apply_rotary_emb( return xq_out, xk_out +# endregion + +# region inference + + def get_timesteps_sigmas(sampling_steps: int, shift: float, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]: """ Generate timesteps and sigmas for diffusion sampling. @@ -291,6 +308,9 @@ def step(latents, noise_pred, sigmas, step_i): return latents.float() - (sigmas[step_i] - sigmas[step_i + 1]) * noise_pred.float() +# endregion + + # region AdaptiveProjectedGuidance @@ -298,6 +318,7 @@ class MomentumBuffer: """ Exponential moving average buffer for APG momentum. 
""" + def __init__(self, momentum: float): self.momentum = momentum self.running_average = 0 @@ -318,10 +339,10 @@ def normalized_guidance_apg( ): """ Apply normalized adaptive projected guidance. - + Projects the guidance vector to reduce over-saturation while maintaining directional control by decomposing into parallel and orthogonal components. - + Args: pred_cond: Conditional prediction. pred_uncond: Unconditional prediction. @@ -330,7 +351,7 @@ def normalized_guidance_apg( eta: Scaling factor for parallel component. norm_threshold: Maximum norm for guidance vector clipping. use_original_formulation: Whether to use original APG formulation. - + Returns: Guided prediction tensor. """ @@ -366,10 +387,11 @@ def normalized_guidance_apg( class AdaptiveProjectedGuidance: """ Adaptive Projected Guidance for classifier-free guidance. - + Implements APG which projects the guidance vector to reduce over-saturation while maintaining directional control. """ + def __init__( self, guidance_scale: float = 7.5, @@ -406,9 +428,6 @@ def __call__(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] return pred -# endregion - - def apply_classifier_free_guidance( noise_pred_text: torch.Tensor, noise_pred_uncond: torch.Tensor, @@ -459,3 +478,6 @@ def apply_classifier_free_guidance( noise_pred = cfg_guider(noise_pred_text, noise_pred_uncond, step=step) return noise_pred + + +# endregion diff --git a/library/lora_utils.py b/library/lora_utils.py index db0046229..468fb01ad 100644 --- a/library/lora_utils.py +++ b/library/lora_utils.py @@ -7,7 +7,7 @@ from tqdm import tqdm -from library.custom_offloading_utils import synchronize_device +from library.device_utils import synchronize_device from library.fp8_optimization_utils import load_safetensors_with_fp8_optimization from library.utils import MemoryEfficientSafeOpen, setup_logging diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py index 24b958dd0..32a4fd7bf 100644 --- a/library/sai_model_spec.py +++ b/library/sai_model_spec.py @@ -37,18 +37,16 @@ BASE_METADATA = { # === MUST === - "modelspec.sai_model_spec": "1.0.1", + "modelspec.sai_model_spec": "1.0.1", "modelspec.architecture": None, "modelspec.implementation": None, "modelspec.title": None, "modelspec.resolution": None, - # === SHOULD === "modelspec.description": None, "modelspec.author": None, "modelspec.date": None, "modelspec.hash_sha256": None, - # === CAN=== "modelspec.implementation_version": None, "modelspec.license": None, @@ -81,6 +79,8 @@ ARCH_FLUX_1_UNKNOWN = "flux-1" ARCH_LUMINA_2 = "lumina-2" ARCH_LUMINA_UNKNOWN = "lumina" +ARCH_HUNYUAN_IMAGE_2_1 = "hunyuan-image-2.1" +ARCH_HUNYUAN_IMAGE_UNKNOWN = "hunyuan-image" ADAPTER_LORA = "lora" ADAPTER_TEXTUAL_INVERSION = "textual-inversion" @@ -91,6 +91,7 @@ IMPL_FLUX = "https://github.com/black-forest-labs/flux" IMPL_CHROMA = "https://huggingface.co/lodestones/Chroma" IMPL_LUMINA = "https://github.com/Alpha-VLLM/Lumina-Image-2.0" +IMPL_HUNYUAN_IMAGE = "https://github.com/Tencent-Hunyuan/HunyuanImage-2.1" PRED_TYPE_EPSILON = "epsilon" PRED_TYPE_V = "v" @@ -102,20 +103,20 @@ class ModelSpecMetadata: ModelSpec 1.0.1 compliant metadata for safetensors models. All fields correspond to modelspec.* keys in the final metadata. 
""" - + # === MUST === architecture: str implementation: str title: str resolution: str sai_model_spec: str = "1.0.1" - + # === SHOULD === description: str | None = None author: str | None = None date: str | None = None hash_sha256: str | None = None - + # === CAN === implementation_version: str | None = None license: str | None = None @@ -131,14 +132,14 @@ class ModelSpecMetadata: is_negative_embedding: str | None = None unet_dtype: str | None = None vae_dtype: str | None = None - + # === Additional metadata === additional_fields: dict[str, str] = field(default_factory=dict) - + def to_metadata_dict(self) -> dict[str, str]: """Convert dataclass to metadata dictionary with modelspec. prefixes.""" metadata = {} - + # Add all non-None fields with modelspec prefix for field_name, value in self.__dict__.items(): if field_name == "additional_fields": @@ -150,14 +151,14 @@ def to_metadata_dict(self) -> dict[str, str]: metadata[f"modelspec.{key}"] = val elif value is not None: metadata[f"modelspec.{field_name}"] = value - + return metadata - + @classmethod def from_args(cls, args, **kwargs) -> "ModelSpecMetadata": """Create ModelSpecMetadata from argparse Namespace, extracting metadata_* fields.""" metadata_fields = {} - + # Extract all metadata_* attributes from args for attr_name in dir(args): if attr_name.startswith("metadata_") and not attr_name.startswith("metadata___"): @@ -166,7 +167,7 @@ def from_args(cls, args, **kwargs) -> "ModelSpecMetadata": # Remove metadata_ prefix field_name = attr_name[9:] # len("metadata_") = 9 metadata_fields[field_name] = value - + # Handle known standard fields standard_fields = { "author": metadata_fields.pop("author", None), @@ -174,30 +175,25 @@ def from_args(cls, args, **kwargs) -> "ModelSpecMetadata": "license": metadata_fields.pop("license", None), "tags": metadata_fields.pop("tags", None), } - + # Remove None values standard_fields = {k: v for k, v in standard_fields.items() if v is not None} - + # Merge with kwargs and remaining metadata fields all_fields = {**standard_fields, **kwargs} if metadata_fields: all_fields["additional_fields"] = metadata_fields - + return cls(**all_fields) def determine_architecture( - v2: bool, - v_parameterization: bool, - sdxl: bool, - lora: bool, - textual_inversion: bool, - model_config: dict[str, str] | None = None + v2: bool, v_parameterization: bool, sdxl: bool, lora: bool, textual_inversion: bool, model_config: dict[str, str] | None = None ) -> str: """Determine model architecture string from parameters.""" - + model_config = model_config or {} - + if sdxl: arch = ARCH_SD_XL_V1_BASE elif "sd3" in model_config: @@ -218,17 +214,23 @@ def determine_architecture( arch = ARCH_LUMINA_2 else: arch = ARCH_LUMINA_UNKNOWN + elif "hunyuan_image" in model_config: + hunyuan_image_type = model_config["hunyuan_image"] + if hunyuan_image_type == "2.1": + arch = ARCH_HUNYUAN_IMAGE_2_1 + else: + arch = ARCH_HUNYUAN_IMAGE_UNKNOWN elif v2: arch = ARCH_SD_V2_768_V if v_parameterization else ARCH_SD_V2_512 else: arch = ARCH_SD_V1 - + # Add adapter suffix if lora: arch += f"/{ADAPTER_LORA}" elif textual_inversion: arch += f"/{ADAPTER_TEXTUAL_INVERSION}" - + return arch @@ -237,12 +239,12 @@ def determine_implementation( textual_inversion: bool, sdxl: bool, model_config: dict[str, str] | None = None, - is_stable_diffusion_ckpt: bool | None = None + is_stable_diffusion_ckpt: bool | None = None, ) -> str: """Determine implementation string from parameters.""" - + model_config = model_config or {} - + if "flux" in model_config: if 
model_config["flux"] == "chroma": return IMPL_CHROMA @@ -265,16 +267,16 @@ def get_implementation_version() -> str: capture_output=True, text=True, cwd=os.path.dirname(os.path.dirname(__file__)), # Go up to sd-scripts root - timeout=5 + timeout=5, ) - + if result.returncode == 0: commit_hash = result.stdout.strip() return f"sd-scripts/{commit_hash}" else: logger.warning("Failed to get git commit hash, using fallback") return "sd-scripts/unknown" - + except (subprocess.TimeoutExpired, subprocess.SubprocessError, FileNotFoundError) as e: logger.warning(f"Could not determine git commit: {e}") return "sd-scripts/unknown" @@ -284,19 +286,19 @@ def file_to_data_url(file_path: str) -> str: """Convert a file path to a data URL for embedding in metadata.""" if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") - + # Get MIME type mime_type, _ = mimetypes.guess_type(file_path) if mime_type is None: # Default to binary if we can't detect mime_type = "application/octet-stream" - + # Read file and encode as base64 with open(file_path, "rb") as f: file_data = f.read() - + encoded_data = base64.b64encode(file_data).decode("ascii") - + return f"data:{mime_type};base64,{encoded_data}" @@ -305,12 +307,12 @@ def determine_resolution( sdxl: bool = False, model_config: dict[str, str] | None = None, v2: bool = False, - v_parameterization: bool = False + v_parameterization: bool = False, ) -> str: """Determine resolution string from parameters.""" - + model_config = model_config or {} - + if reso is not None: # Handle comma separated string if isinstance(reso, str): @@ -318,21 +320,18 @@ def determine_resolution( # Handle single int if isinstance(reso, int): reso = (reso, reso) - # Handle single-element tuple + # Handle single-element tuple if len(reso) == 1: reso = (reso[0], reso[0]) else: # Determine default resolution based on model type - if (sdxl or - "sd3" in model_config or - "flux" in model_config or - "lumina" in model_config): + if sdxl or "sd3" in model_config or "flux" in model_config or "lumina" in model_config: reso = (1024, 1024) elif v2 and v_parameterization: reso = (768, 768) else: reso = (512, 512) - + return f"{reso[0]}x{reso[1]}" @@ -388,23 +387,19 @@ def build_metadata_dataclass( ) -> ModelSpecMetadata: """ Build ModelSpec 1.0.1 compliant metadata dataclass. - + Args: model_config: Dict containing model type info, e.g. 
{"flux": "dev"}, {"sd3": "large"} optional_metadata: Dict of additional metadata fields to include """ - + # Use helper functions for complex logic - architecture = determine_architecture( - v2, v_parameterization, sdxl, lora, textual_inversion, model_config - ) + architecture = determine_architecture(v2, v_parameterization, sdxl, lora, textual_inversion, model_config) if not lora and not textual_inversion and is_stable_diffusion_ckpt is None: is_stable_diffusion_ckpt = True # default is stable diffusion ckpt if not lora and not textual_inversion - implementation = determine_implementation( - lora, textual_inversion, sdxl, model_config, is_stable_diffusion_ckpt - ) + implementation = determine_implementation(lora, textual_inversion, sdxl, model_config, is_stable_diffusion_ckpt) if title is None: if lora: @@ -421,9 +416,7 @@ def build_metadata_dataclass( date = datetime.datetime.fromtimestamp(int_ts).isoformat() # Use helper function for resolution - resolution = determine_resolution( - reso, sdxl, model_config, v2, v_parameterization - ) + resolution = determine_resolution(reso, sdxl, model_config, v2, v_parameterization) # Handle prediction type - Flux models don't use prediction_type model_config = model_config or {} @@ -488,7 +481,7 @@ def build_metadata_dataclass( prediction_type=prediction_type, timestep_range=timestep_range, encoder_layer=encoder_layer, - additional_fields=processed_optional_metadata + additional_fields=processed_optional_metadata, ) return metadata @@ -518,7 +511,7 @@ def build_metadata( """ Build ModelSpec 1.0.1 compliant metadata for safetensors models. Legacy function that returns dict - prefer build_metadata_dataclass for new code. - + Args: model_config: Dict containing model type info, e.g. {"flux": "dev"}, {"sd3": "large"} optional_metadata: Dict of additional metadata fields to include @@ -545,7 +538,7 @@ def build_metadata( model_config=model_config, optional_metadata=optional_metadata, ) - + return metadata_obj.to_metadata_dict() @@ -581,7 +574,7 @@ def get_title(model: str): def add_model_spec_arguments(parser: argparse.ArgumentParser): """Add all ModelSpec metadata arguments to the parser.""" - + parser.add_argument( "--metadata_title", type=str, diff --git a/library/strategy_hunyuan_image.py b/library/strategy_hunyuan_image.py new file mode 100644 index 000000000..2188ed371 --- /dev/null +++ b/library/strategy_hunyuan_image.py @@ -0,0 +1,187 @@ +import os +from typing import Any, List, Optional, Tuple, Union +import torch +import numpy as np +from transformers import AutoTokenizer, Qwen2Tokenizer + +from library import hunyuan_image_text_encoder, hunyuan_image_vae, train_util +from library.strategy_base import LatentsCachingStrategy, TextEncodingStrategy, TokenizeStrategy, TextEncoderOutputsCachingStrategy + +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +class HunyuanImageTokenizeStrategy(TokenizeStrategy): + def __init__(self, tokenizer_cache_dir: Optional[str] = None) -> None: + self.vlm_tokenizer = self._load_tokenizer( + Qwen2Tokenizer, hunyuan_image_text_encoder.QWEN_2_5_VL_IMAGE_ID, tokenizer_cache_dir=tokenizer_cache_dir + ) + self.byt5_tokenizer = self._load_tokenizer( + AutoTokenizer, hunyuan_image_text_encoder.BYT5_TOKENIZER_PATH, tokenizer_cache_dir=tokenizer_cache_dir + ) + + def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: + text = [text] if isinstance(text, str) else text + + vlm_tokens, vlm_mask = 
hunyuan_image_text_encoder.get_qwen_tokens(self.vlm_tokenizer, text) + byt5_tokens, byt5_mask = hunyuan_image_text_encoder.get_byt5_text_tokens(self.byt5_tokenizer, text) + + return [vlm_tokens, vlm_mask, byt5_tokens, byt5_mask] + + +class HunyuanImageTextEncodingStrategy(TextEncodingStrategy): + def __init__(self) -> None: + pass + + def encode_tokens( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], tokens: List[torch.Tensor] + ) -> List[torch.Tensor]: + vlm_tokens, vlm_mask, byt5_tokens, byt5_mask = tokens + + qwen2vlm, byt5 = models + + # autocast and no_grad are handled in hunyuan_image_text_encoder + vlm_embed, vlm_mask = hunyuan_image_text_encoder.get_qwen_prompt_embeds_from_tokens(qwen2vlm, vlm_tokens, vlm_mask) + ocr_mask, byt5_embed, byt5_mask = hunyuan_image_text_encoder.get_byt5_prompt_embeds_from_tokens( + byt5, byt5_tokens, byt5_mask + ) + + return [vlm_embed, vlm_mask, byt5_embed, byt5_mask, ocr_mask] + + +class HunyuanImageTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): + HUNYUAN_IMAGE_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX = "_hi_te.npz" + + def __init__( + self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool, is_partial: bool = False + ) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check, is_partial) + + def get_outputs_npz_path(self, image_abs_path: str) -> str: + return ( + os.path.splitext(image_abs_path)[0] + + HunyuanImageTextEncoderOutputsCachingStrategy.HUNYUAN_IMAGE_TEXT_ENCODER_OUTPUTS_NPZ_SUFFIX + ) + + def is_disk_cached_outputs_expected(self, npz_path: str): + if not self.cache_to_disk: + return False + if not os.path.exists(npz_path): + return False + if self.skip_disk_cache_validity_check: + return True + + try: + npz = np.load(npz_path) + if "vlm_embed" not in npz: + return False + if "vlm_mask" not in npz: + return False + if "byt5_embed" not in npz: + return False + if "byt5_mask" not in npz: + return False + if "ocr_mask" not in npz: + return False + except Exception as e: + logger.error(f"Error loading file: {npz_path}") + raise e + + return True + + def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]: + data = np.load(npz_path) + vln_embed = data["vlm_embed"] + vlm_mask = data["vlm_mask"] + byt5_embed = data["byt5_embed"] + byt5_mask = data["byt5_mask"] + ocr_mask = data["ocr_mask"] + return [vln_embed, vlm_mask, byt5_embed, byt5_mask, ocr_mask] + + def cache_batch_outputs( + self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, infos: List + ): + huyuan_image_text_encoding_strategy: HunyuanImageTextEncodingStrategy = text_encoding_strategy + captions = [info.caption for info in infos] + + tokens_and_masks = tokenize_strategy.tokenize(captions) + with torch.no_grad(): + # attn_mask is applied in text_encoding_strategy.encode_tokens if apply_t5_attn_mask is True + vlm_embed, vlm_mask, byt5_embed, byt5_mask, ocr_mask = huyuan_image_text_encoding_strategy.encode_tokens( + tokenize_strategy, models, tokens_and_masks + ) + + if vlm_embed.dtype == torch.bfloat16: + vlm_embed = vlm_embed.float() + if byt5_embed.dtype == torch.bfloat16: + byt5_embed = byt5_embed.float() + + vlm_embed = vlm_embed.cpu().numpy() + vlm_mask = vlm_mask.cpu().numpy() + byt5_embed = byt5_embed.cpu().numpy() + byt5_mask = byt5_mask.cpu().numpy() + ocr_mask = np.array(ocr_mask, dtype=bool) + + for i, info in enumerate(infos): + vlm_embed_i = vlm_embed[i] + vlm_mask_i = vlm_mask[i] + byt5_embed_i = byt5_embed[i] + byt5_mask_i = 
byt5_mask[i] + ocr_mask_i = ocr_mask[i] + + if self.cache_to_disk: + np.savez( + info.text_encoder_outputs_npz, + vlm_embed=vlm_embed_i, + vlm_mask=vlm_mask_i, + byt5_embed=byt5_embed_i, + byt5_mask=byt5_mask_i, + ocr_mask=ocr_mask_i, + ) + else: + info.text_encoder_outputs = (vlm_embed_i, vlm_mask_i, byt5_embed_i, byt5_mask_i, ocr_mask_i) + + +class HunyuanImageLatentsCachingStrategy(LatentsCachingStrategy): + HUNYUAN_IMAGE_LATENTS_NPZ_SUFFIX = "_hi.npz" + + def __init__(self, cache_to_disk: bool, batch_size: int, skip_disk_cache_validity_check: bool) -> None: + super().__init__(cache_to_disk, batch_size, skip_disk_cache_validity_check) + + @property + def cache_suffix(self) -> str: + return HunyuanImageLatentsCachingStrategy.HUNYUAN_IMAGE_LATENTS_NPZ_SUFFIX + + def get_latents_npz_path(self, absolute_path: str, image_size: Tuple[int, int]) -> str: + return ( + os.path.splitext(absolute_path)[0] + + f"_{image_size[0]:04d}x{image_size[1]:04d}" + + HunyuanImageLatentsCachingStrategy.HUNYUAN_IMAGE_LATENTS_NPZ_SUFFIX + ) + + def is_disk_cached_latents_expected(self, bucket_reso: Tuple[int, int], npz_path: str, flip_aug: bool, alpha_mask: bool): + return self._default_is_disk_cached_latents_expected(32, bucket_reso, npz_path, flip_aug, alpha_mask, multi_resolution=True) + + def load_latents_from_disk( + self, npz_path: str, bucket_reso: Tuple[int, int] + ) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: + return self._default_load_latents_from_disk(32, npz_path, bucket_reso) # support multi-resolution + + # TODO remove circular dependency for ImageInfo + def cache_batch_latents( + self, vae: hunyuan_image_vae.HunyuanVAE2D, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool + ): + encode_by_vae = lambda img_tensor: vae.encode(img_tensor).sample() + vae_device = vae.device + vae_dtype = vae.dtype + + self._default_cache_batch_latents( + encode_by_vae, vae_device, vae_dtype, image_infos, flip_aug, alpha_mask, random_crop, multi_resolution=True + ) + + if not train_util.HIGH_VRAM: + train_util.clean_memory_on_device(vae.device) diff --git a/library/train_util.py b/library/train_util.py index b432d0b62..8cd43463c 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3588,6 +3588,7 @@ def get_sai_model_spec_dataclass( sd3: str = None, flux: str = None, lumina: str = None, + hunyuan_image: str = None, optional_metadata: dict[str, str] | None = None, ) -> sai_model_spec.ModelSpecMetadata: """ @@ -3617,6 +3618,8 @@ def get_sai_model_spec_dataclass( model_config["flux"] = flux if lumina is not None: model_config["lumina"] = lumina + if hunyuan_image is not None: + model_config["hunyuan_image"] = hunyuan_image # Use the dataclass function directly return sai_model_spec.build_metadata_dataclass( @@ -3987,11 +3990,21 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: choices=["no", "fp16", "bf16"], help="use mixed precision / 混合精度を使う場合、その精度", ) - parser.add_argument("--full_fp16", action="store_true", help="fp16 training including gradients / 勾配も含めてfp16で学習する") parser.add_argument( - "--full_bf16", action="store_true", help="bf16 training including gradients / 勾配も含めてbf16で学習する" + "--full_fp16", + action="store_true", + help="fp16 training including gradients, some models are not supported / 勾配も含めてfp16で学習する、一部のモデルではサポートされていません", + ) + parser.add_argument( + "--full_bf16", + action="store_true", + help="bf16 training including gradients, some models are not supported / 
勾配も含めてbf16で学習する、一部のモデルではサポートされていません", ) # TODO move to SDXL training, because it is not supported by SD1/2 - parser.add_argument("--fp8_base", action="store_true", help="use fp8 for base model / base modelにfp8を使う") + parser.add_argument( + "--fp8_base", + action="store_true", + help="use fp8 for base model, some models are not supported / base modelにfp8を使う、一部のモデルではサポートされていません", + ) parser.add_argument( "--ddp_timeout", @@ -6305,6 +6318,11 @@ def line_to_prompt_dict(line: str) -> dict: prompt_dict["renorm_cfg"] = float(m.group(1)) continue + m = re.match(r"fs (.+)", parg, re.IGNORECASE) + if m: + prompt_dict["flow_shift"] = m.group(1) + continue + except ValueError as ex: logger.error(f"Exception in parsing / 解析エラー: {parg}") logger.error(ex) diff --git a/networks/lora_flux.py b/networks/lora_flux.py index e9ad5f68d..d74d01728 100644 --- a/networks/lora_flux.py +++ b/networks/lora_flux.py @@ -713,6 +713,10 @@ class LoRANetwork(torch.nn.Module): LORA_PREFIX_TEXT_ENCODER_CLIP = "lora_te1" LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te3" # make ComfyUI compatible + @classmethod + def get_qkv_mlp_split_dims(cls) -> List[int]: + return [3072] * 3 + [12288] + def __init__( self, text_encoders: Union[List[CLIPTextModel], CLIPTextModel], @@ -842,7 +846,7 @@ def create_modules( break # if modules_dim is None, we use default lora_dim. if modules_dim is not None, we use the specified dim (no default) - if dim is None and modules_dim is None: + if dim is None and modules_dim is None: if is_linear or is_conv2d_1x1: dim = default_dim if default_dim is not None else self.lora_dim alpha = self.alpha @@ -901,9 +905,9 @@ def create_modules( split_dims = None if is_flux and split_qkv: if "double" in lora_name and "qkv" in lora_name: - split_dims = [3072] * 3 + (split_dims,) = self.get_qkv_mlp_split_dims()[:3] # qkv only elif "single" in lora_name and "linear1" in lora_name: - split_dims = [3072] * 3 + [12288] + split_dims = self.get_qkv_mlp_split_dims() # qkv + mlp lora = module_class( lora_name, @@ -1036,9 +1040,9 @@ def load_state_dict(self, state_dict, strict=True): # split qkv for key in list(state_dict.keys()): if "double" in key and "qkv" in key: - split_dims = [3072] * 3 + split_dims = self.get_qkv_mlp_split_dims()[:3] # qkv only elif "single" in key and "linear1" in key: - split_dims = [3072] * 3 + [12288] + split_dims = self.get_qkv_mlp_split_dims() # qkv + mlp else: continue @@ -1092,9 +1096,9 @@ def state_dict(self, destination=None, prefix="", keep_vars=False): new_state_dict = {} for key in list(state_dict.keys()): if "double" in key and "qkv" in key: - split_dims = [3072] * 3 + split_dims = self.get_qkv_mlp_split_dims()[:3] # qkv only elif "single" in key and "linear1" in key: - split_dims = [3072] * 3 + [12288] + split_dims = self.get_qkv_mlp_split_dims() # qkv + mlp else: new_state_dict[key] = state_dict[key] continue diff --git a/networks/lora_hunyuan_image.py b/networks/lora_hunyuan_image.py index e9ad5f68d..b0edde575 100644 --- a/networks/lora_hunyuan_image.py +++ b/networks/lora_hunyuan_image.py @@ -7,18 +7,17 @@ # https://github.com/microsoft/LoRA/blob/main/loralib/layers.py # https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py -import math import os -from contextlib import contextmanager -from typing import Dict, List, Optional, Tuple, Type, Union -from diffusers import AutoencoderKL -from transformers import CLIPTextModel -import numpy as np +from typing import Dict, List, Optional, Type, Union import torch +import torch.nn as nn from torch import Tensor import re + +from 
networks import lora_flux +from library.hunyuan_image_vae import HunyuanVAE2D + from library.utils import setup_logging -from library.sdxl_original_unet import SdxlUNet2DConditionModel setup_logging() import logging @@ -26,423 +25,16 @@ logger = logging.getLogger(__name__) -NUM_DOUBLE_BLOCKS = 19 -NUM_SINGLE_BLOCKS = 38 - - -class LoRAModule(torch.nn.Module): - """ - replaces forward method of the original Linear, instead of replacing the original Linear module. - """ - - def __init__( - self, - lora_name, - org_module: torch.nn.Module, - multiplier=1.0, - lora_dim=4, - alpha=1, - dropout=None, - rank_dropout=None, - module_dropout=None, - split_dims: Optional[List[int]] = None, - ggpo_beta: Optional[float] = None, - ggpo_sigma: Optional[float] = None, - ): - """ - if alpha == 0 or None, alpha is rank (no scaling). - - split_dims is used to mimic the split qkv of FLUX as same as Diffusers - """ - super().__init__() - self.lora_name = lora_name - - if org_module.__class__.__name__ == "Conv2d": - in_dim = org_module.in_channels - out_dim = org_module.out_channels - else: - in_dim = org_module.in_features - out_dim = org_module.out_features - - self.lora_dim = lora_dim - self.split_dims = split_dims - - if split_dims is None: - if org_module.__class__.__name__ == "Conv2d": - kernel_size = org_module.kernel_size - stride = org_module.stride - padding = org_module.padding - self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) - self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) - else: - self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) - self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) - - torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) - torch.nn.init.zeros_(self.lora_up.weight) - else: - # conv2d not supported - assert sum(split_dims) == out_dim, "sum of split_dims must be equal to out_dim" - assert org_module.__class__.__name__ == "Linear", "split_dims is only supported for Linear" - # print(f"split_dims: {split_dims}") - self.lora_down = torch.nn.ModuleList( - [torch.nn.Linear(in_dim, self.lora_dim, bias=False) for _ in range(len(split_dims))] - ) - self.lora_up = torch.nn.ModuleList([torch.nn.Linear(self.lora_dim, split_dim, bias=False) for split_dim in split_dims]) - for lora_down in self.lora_down: - torch.nn.init.kaiming_uniform_(lora_down.weight, a=math.sqrt(5)) - for lora_up in self.lora_up: - torch.nn.init.zeros_(lora_up.weight) - - if type(alpha) == torch.Tensor: - alpha = alpha.detach().float().numpy() # without casting, bf16 causes error - alpha = self.lora_dim if alpha is None or alpha == 0 else alpha - self.scale = alpha / self.lora_dim - self.register_buffer("alpha", torch.tensor(alpha)) # 定数として扱える - - # same as microsoft's - self.multiplier = multiplier - self.org_module = org_module # remove in applying - self.dropout = dropout - self.rank_dropout = rank_dropout - self.module_dropout = module_dropout - - self.ggpo_sigma = ggpo_sigma - self.ggpo_beta = ggpo_beta - - if self.ggpo_beta is not None and self.ggpo_sigma is not None: - self.combined_weight_norms = None - self.grad_norms = None - self.perturbation_norm_factor = 1.0 / math.sqrt(org_module.weight.shape[0]) - self.initialize_norm_cache(org_module.weight) - self.org_module_shape: tuple[int] = org_module.weight.shape - - def apply_to(self): - self.org_forward = self.org_module.forward - self.org_module.forward = self.forward - - del self.org_module - - def forward(self, x): - 
org_forwarded = self.org_forward(x) - - # module dropout - if self.module_dropout is not None and self.training: - if torch.rand(1) < self.module_dropout: - return org_forwarded - - if self.split_dims is None: - lx = self.lora_down(x) - - # normal dropout - if self.dropout is not None and self.training: - lx = torch.nn.functional.dropout(lx, p=self.dropout) - - # rank dropout - if self.rank_dropout is not None and self.training: - mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout - if len(lx.size()) == 3: - mask = mask.unsqueeze(1) # for Text Encoder - elif len(lx.size()) == 4: - mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d - lx = lx * mask - - # scaling for rank dropout: treat as if the rank is changed - # maskから計算することも考えられるが、augmentation的な効果を期待してrank_dropoutを用いる - scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability - else: - scale = self.scale - - lx = self.lora_up(lx) - - # LoRA Gradient-Guided Perturbation Optimization - if ( - self.training - and self.ggpo_sigma is not None - and self.ggpo_beta is not None - and self.combined_weight_norms is not None - and self.grad_norms is not None - ): - with torch.no_grad(): - perturbation_scale = (self.ggpo_sigma * torch.sqrt(self.combined_weight_norms**2)) + ( - self.ggpo_beta * (self.grad_norms**2) - ) - perturbation_scale_factor = (perturbation_scale * self.perturbation_norm_factor).to(self.device) - perturbation = torch.randn(self.org_module_shape, dtype=self.dtype, device=self.device) - perturbation.mul_(perturbation_scale_factor) - perturbation_output = x @ perturbation.T # Result: (batch × n) - return org_forwarded + (self.multiplier * scale * lx) + perturbation_output - else: - return org_forwarded + lx * self.multiplier * scale - else: - lxs = [lora_down(x) for lora_down in self.lora_down] - - # normal dropout - if self.dropout is not None and self.training: - lxs = [torch.nn.functional.dropout(lx, p=self.dropout) for lx in lxs] - - # rank dropout - if self.rank_dropout is not None and self.training: - masks = [torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout for lx in lxs] - for i in range(len(lxs)): - if len(lx.size()) == 3: - masks[i] = masks[i].unsqueeze(1) - elif len(lx.size()) == 4: - masks[i] = masks[i].unsqueeze(-1).unsqueeze(-1) - lxs[i] = lxs[i] * masks[i] - - # scaling for rank dropout: treat as if the rank is changed - scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability - else: - scale = self.scale - - lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] - - return org_forwarded + torch.cat(lxs, dim=-1) * self.multiplier * scale - - @torch.no_grad() - def initialize_norm_cache(self, org_module_weight: Tensor): - # Choose a reasonable sample size - n_rows = org_module_weight.shape[0] - sample_size = min(1000, n_rows) # Cap at 1000 samples or use all if smaller - - # Sample random indices across all rows - indices = torch.randperm(n_rows)[:sample_size] - - # Convert to a supported data type first, then index - # Use float32 for indexing operations - weights_float32 = org_module_weight.to(dtype=torch.float32) - sampled_weights = weights_float32[indices].to(device=self.device) - - # Calculate sampled norms - sampled_norms = torch.norm(sampled_weights, dim=1, keepdim=True) - - # Store the mean norm as our estimate - self.org_weight_norm_estimate = sampled_norms.mean() - - # Optional: store standard deviation for confidence intervals - self.org_weight_norm_std = sampled_norms.std() - - # 
Free memory - del sampled_weights, weights_float32 - - @torch.no_grad() - def validate_norm_approximation(self, org_module_weight: Tensor, verbose=True): - # Calculate the true norm (this will be slow but it's just for validation) - true_norms = [] - chunk_size = 1024 # Process in chunks to avoid OOM - - for i in range(0, org_module_weight.shape[0], chunk_size): - end_idx = min(i + chunk_size, org_module_weight.shape[0]) - chunk = org_module_weight[i:end_idx].to(device=self.device, dtype=self.dtype) - chunk_norms = torch.norm(chunk, dim=1, keepdim=True) - true_norms.append(chunk_norms.cpu()) - del chunk - - true_norms = torch.cat(true_norms, dim=0) - true_mean_norm = true_norms.mean().item() - - # Compare with our estimate - estimated_norm = self.org_weight_norm_estimate.item() - - # Calculate error metrics - absolute_error = abs(true_mean_norm - estimated_norm) - relative_error = absolute_error / true_mean_norm * 100 # as percentage - - if verbose: - logger.info(f"True mean norm: {true_mean_norm:.6f}") - logger.info(f"Estimated norm: {estimated_norm:.6f}") - logger.info(f"Absolute error: {absolute_error:.6f}") - logger.info(f"Relative error: {relative_error:.2f}%") - - return { - "true_mean_norm": true_mean_norm, - "estimated_norm": estimated_norm, - "absolute_error": absolute_error, - "relative_error": relative_error, - } - - @torch.no_grad() - def update_norms(self): - # Not running GGPO so not currently running update norms - if self.ggpo_beta is None or self.ggpo_sigma is None: - return - - # only update norms when we are training - if self.training is False: - return - - module_weights = self.lora_up.weight @ self.lora_down.weight - module_weights.mul(self.scale) - - self.weight_norms = torch.norm(module_weights, dim=1, keepdim=True) - self.combined_weight_norms = torch.sqrt( - (self.org_weight_norm_estimate**2) + torch.sum(module_weights**2, dim=1, keepdim=True) - ) - - @torch.no_grad() - def update_grad_norms(self): - if self.training is False: - print(f"skipping update_grad_norms for {self.lora_name}") - return - - lora_down_grad = None - lora_up_grad = None - - for name, param in self.named_parameters(): - if name == "lora_down.weight": - lora_down_grad = param.grad - elif name == "lora_up.weight": - lora_up_grad = param.grad - - # Calculate gradient norms if we have both gradients - if lora_down_grad is not None and lora_up_grad is not None: - with torch.autocast(self.device.type): - approx_grad = self.scale * ((self.lora_up.weight @ lora_down_grad) + (lora_up_grad @ self.lora_down.weight)) - self.grad_norms = torch.norm(approx_grad, dim=1, keepdim=True) - - @property - def device(self): - return next(self.parameters()).device - - @property - def dtype(self): - return next(self.parameters()).dtype - - -class LoRAInfModule(LoRAModule): - def __init__( - self, - lora_name, - org_module: torch.nn.Module, - multiplier=1.0, - lora_dim=4, - alpha=1, - **kwargs, - ): - # no dropout for inference - super().__init__(lora_name, org_module, multiplier, lora_dim, alpha) - - self.org_module_ref = [org_module] # 後から参照できるように - self.enabled = True - self.network: LoRANetwork = None - - def set_network(self, network): - self.network = network - - # freezeしてマージする - def merge_to(self, sd, dtype, device): - # extract weight from org_module - org_sd = self.org_module.state_dict() - weight = org_sd["weight"] - org_dtype = weight.dtype - org_device = weight.device - weight = weight.to(torch.float) # calc in float - - if dtype is None: - dtype = org_dtype - if device is None: - device = org_device - - 
if self.split_dims is None: - # get up/down weight - down_weight = sd["lora_down.weight"].to(torch.float).to(device) - up_weight = sd["lora_up.weight"].to(torch.float).to(device) - - # merge weight - if len(weight.size()) == 2: - # linear - weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale - elif down_weight.size()[2:4] == (1, 1): - # conv2d 1x1 - weight = ( - weight - + self.multiplier - * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) - * self.scale - ) - else: - # conv2d 3x3 - conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) - # logger.info(conved.size(), weight.size(), module.stride, module.padding) - weight = weight + self.multiplier * conved * self.scale - - # set weight to org_module - org_sd["weight"] = weight.to(dtype) - self.org_module.load_state_dict(org_sd) - else: - # split_dims - total_dims = sum(self.split_dims) - for i in range(len(self.split_dims)): - # get up/down weight - down_weight = sd[f"lora_down.{i}.weight"].to(torch.float).to(device) # (rank, in_dim) - up_weight = sd[f"lora_up.{i}.weight"].to(torch.float).to(device) # (split dim, rank) - - # pad up_weight -> (total_dims, rank) - padded_up_weight = torch.zeros((total_dims, up_weight.size(0)), device=device, dtype=torch.float) - padded_up_weight[sum(self.split_dims[:i]) : sum(self.split_dims[: i + 1])] = up_weight - - # merge weight - weight = weight + self.multiplier * (up_weight @ down_weight) * self.scale - - # set weight to org_module - org_sd["weight"] = weight.to(dtype) - self.org_module.load_state_dict(org_sd) - - # 復元できるマージのため、このモジュールのweightを返す - def get_weight(self, multiplier=None): - if multiplier is None: - multiplier = self.multiplier - - # get up/down weight from module - up_weight = self.lora_up.weight.to(torch.float) - down_weight = self.lora_down.weight.to(torch.float) - - # pre-calculated weight - if len(down_weight.size()) == 2: - # linear - weight = self.multiplier * (up_weight @ down_weight) * self.scale - elif down_weight.size()[2:4] == (1, 1): - # conv2d 1x1 - weight = ( - self.multiplier - * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) - * self.scale - ) - else: - # conv2d 3x3 - conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) - weight = self.multiplier * conved * self.scale - - return weight - - def set_region(self, region): - self.region = region - self.region_mask = None - - def default_forward(self, x): - # logger.info(f"default_forward {self.lora_name} {x.size()}") - if self.split_dims is None: - lx = self.lora_down(x) - lx = self.lora_up(lx) - return self.org_forward(x) + lx * self.multiplier * self.scale - else: - lxs = [lora_down(x) for lora_down in self.lora_down] - lxs = [lora_up(lx) for lora_up, lx in zip(self.lora_up, lxs)] - return self.org_forward(x) + torch.cat(lxs, dim=-1) * self.multiplier * self.scale - - def forward(self, x): - if not self.enabled: - return self.org_forward(x) - return self.default_forward(x) +NUM_DOUBLE_BLOCKS = 20 +NUM_SINGLE_BLOCKS = 40 def create_network( multiplier: float, network_dim: Optional[int], network_alpha: Optional[float], - ae: AutoencoderKL, - text_encoders: List[CLIPTextModel], + vae: HunyuanVAE2D, + text_encoders: List[nn.Module], flux, neuron_dropout: Optional[float] = None, **kwargs, @@ -462,88 +54,6 @@ def create_network( else: conv_alpha = float(conv_alpha) - # attn dim, mlp dim: only for DoubleStreamBlock. 
SingleStreamBlock is not supported because of combined qkv - img_attn_dim = kwargs.get("img_attn_dim", None) - txt_attn_dim = kwargs.get("txt_attn_dim", None) - img_mlp_dim = kwargs.get("img_mlp_dim", None) - txt_mlp_dim = kwargs.get("txt_mlp_dim", None) - img_mod_dim = kwargs.get("img_mod_dim", None) - txt_mod_dim = kwargs.get("txt_mod_dim", None) - single_dim = kwargs.get("single_dim", None) # SingleStreamBlock - single_mod_dim = kwargs.get("single_mod_dim", None) # SingleStreamBlock - if img_attn_dim is not None: - img_attn_dim = int(img_attn_dim) - if txt_attn_dim is not None: - txt_attn_dim = int(txt_attn_dim) - if img_mlp_dim is not None: - img_mlp_dim = int(img_mlp_dim) - if txt_mlp_dim is not None: - txt_mlp_dim = int(txt_mlp_dim) - if img_mod_dim is not None: - img_mod_dim = int(img_mod_dim) - if txt_mod_dim is not None: - txt_mod_dim = int(txt_mod_dim) - if single_dim is not None: - single_dim = int(single_dim) - if single_mod_dim is not None: - single_mod_dim = int(single_mod_dim) - type_dims = [img_attn_dim, txt_attn_dim, img_mlp_dim, txt_mlp_dim, img_mod_dim, txt_mod_dim, single_dim, single_mod_dim] - if all([d is None for d in type_dims]): - type_dims = None - - # in_dims [img, time, vector, guidance, txt] - in_dims = kwargs.get("in_dims", None) - if in_dims is not None: - in_dims = in_dims.strip() - if in_dims.startswith("[") and in_dims.endswith("]"): - in_dims = in_dims[1:-1] - in_dims = [int(d) for d in in_dims.split(",")] # is it better to use ast.literal_eval? - assert len(in_dims) == 5, f"invalid in_dims: {in_dims}, must be 5 dimensions (img, time, vector, guidance, txt)" - - # double/single train blocks - def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: - """ - Parse a block selection string and return a list of booleans. - - Args: - selection (str): A string specifying which blocks to select. - total_blocks (int): The total number of blocks available. - - Returns: - List[bool]: A list of booleans indicating which blocks are selected. 
- """ - if selection == "all": - return [True] * total_blocks - if selection == "none" or selection == "": - return [False] * total_blocks - - selected = [False] * total_blocks - ranges = selection.split(",") - - for r in ranges: - if "-" in r: - start, end = map(str.strip, r.split("-")) - start = int(start) - end = int(end) - assert 0 <= start < total_blocks, f"invalid start index: {start}" - assert 0 <= end < total_blocks, f"invalid end index: {end}" - assert start <= end, f"invalid range: {start}-{end}" - for i in range(start, end + 1): - selected[i] = True - else: - index = int(r) - assert 0 <= index < total_blocks, f"invalid index: {index}" - selected[index] = True - - return selected - - train_double_block_indices = kwargs.get("train_double_block_indices", None) - train_single_block_indices = kwargs.get("train_single_block_indices", None) - if train_double_block_indices is not None: - train_double_block_indices = parse_block_selection(train_double_block_indices, NUM_DOUBLE_BLOCKS) - if train_single_block_indices is not None: - train_single_block_indices = parse_block_selection(train_single_block_indices, NUM_SINGLE_BLOCKS) - # rank/module dropout rank_dropout = kwargs.get("rank_dropout", None) if rank_dropout is not None: @@ -552,11 +62,6 @@ def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: if module_dropout is not None: module_dropout = float(module_dropout) - # single or double blocks - train_blocks = kwargs.get("train_blocks", None) # None (default), "all" (same as None), "single", "double" - if train_blocks is not None: - assert train_blocks in ["all", "single", "double"], f"invalid train_blocks: {train_blocks}" - # split qkv split_qkv = kwargs.get("split_qkv", False) if split_qkv is not None: @@ -571,11 +76,6 @@ def parse_block_selection(selection: str, total_blocks: int) -> List[bool]: if ggpo_sigma is not None: ggpo_sigma = float(ggpo_sigma) - # train T5XXL - train_t5xxl = kwargs.get("train_t5xxl", False) - if train_t5xxl is not None: - train_t5xxl = True if train_t5xxl == "True" else False - # verbose verbose = kwargs.get("verbose", False) if verbose is not None: @@ -617,8 +117,8 @@ def parse_kv_pairs(kv_pair_str: str, is_int: bool) -> Dict[str, float]: else: reg_dims = None - # すごく引数が多いな ( ^ω^)・・・ - network = LoRANetwork( + # Too many arguments ( ^ω^)・・・ + network = HunyuanImageLoRANetwork( text_encoders, flux, multiplier=multiplier, @@ -629,13 +129,7 @@ def parse_kv_pairs(kv_pair_str: str, is_int: bool) -> Dict[str, float]: module_dropout=module_dropout, conv_lora_dim=conv_dim, conv_alpha=conv_alpha, - train_blocks=train_blocks, split_qkv=split_qkv, - train_t5xxl=train_t5xxl, - type_dims=type_dims, - in_dims=in_dims, - train_double_block_indices=train_double_block_indices, - train_single_block_indices=train_single_block_indices, reg_dims=reg_dims, ggpo_beta=ggpo_beta, ggpo_sigma=ggpo_sigma, @@ -668,7 +162,6 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh # get dim/alpha mapping, and train t5xxl modules_dim = {} modules_alpha = {} - train_t5xxl = None for key, value in weights_sd.items(): if "." 
not in key: continue @@ -681,17 +174,11 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh modules_dim[lora_name] = dim # logger.info(lora_name, value.size(), dim) - if train_t5xxl is None or train_t5xxl is False: - train_t5xxl = "lora_te3" in lora_name - - if train_t5xxl is None: - train_t5xxl = False - split_qkv = False # split_qkv is not needed to care, because state_dict is qkv combined - module_class = LoRAInfModule if for_inference else LoRAModule + module_class = lora_flux.LoRAInfModule if for_inference else lora_flux.LoRAModule - network = LoRANetwork( + network = HunyuanImageLoRANetwork( text_encoders, flux, multiplier=multiplier, @@ -699,23 +186,23 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh modules_alpha=modules_alpha, module_class=module_class, split_qkv=split_qkv, - train_t5xxl=train_t5xxl, ) return network, weights_sd -class LoRANetwork(torch.nn.Module): +class HunyuanImageLoRANetwork(lora_flux.LoRANetwork): # FLUX_TARGET_REPLACE_MODULE = ["DoubleStreamBlock", "SingleStreamBlock"] FLUX_TARGET_REPLACE_MODULE_DOUBLE = ["DoubleStreamBlock"] FLUX_TARGET_REPLACE_MODULE_SINGLE = ["SingleStreamBlock"] - TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPSdpaAttention", "CLIPMLP", "T5Attention", "T5DenseGatedActDense"] - LORA_PREFIX_FLUX = "lora_unet" # make ComfyUI compatible - LORA_PREFIX_TEXT_ENCODER_CLIP = "lora_te1" - LORA_PREFIX_TEXT_ENCODER_T5 = "lora_te3" # make ComfyUI compatible + LORA_PREFIX_HUNYUAN_IMAGE_DIT = "lora_unet" # make ComfyUI compatible + + @classmethod + def get_qkv_mlp_split_dims(cls) -> List[int]: + return [3584] * 3 + [14336] def __init__( self, - text_encoders: Union[List[CLIPTextModel], CLIPTextModel], + text_encoders: list[nn.Module], unet, multiplier: float = 1.0, lora_dim: int = 4, @@ -725,16 +212,10 @@ def __init__( module_dropout: Optional[float] = None, conv_lora_dim: Optional[int] = None, conv_alpha: Optional[float] = None, - module_class: Type[object] = LoRAModule, + module_class: Type[object] = lora_flux.LoRAModule, modules_dim: Optional[Dict[str, int]] = None, modules_alpha: Optional[Dict[str, int]] = None, - train_blocks: Optional[str] = None, split_qkv: bool = False, - train_t5xxl: bool = False, - type_dims: Optional[List[int]] = None, - in_dims: Optional[List[int]] = None, - train_double_block_indices: Optional[List[bool]] = None, - train_single_block_indices: Optional[List[bool]] = None, reg_dims: Optional[Dict[str, int]] = None, ggpo_beta: Optional[float] = None, ggpo_sigma: Optional[float] = None, @@ -751,14 +232,7 @@ def __init__( self.dropout = dropout self.rank_dropout = rank_dropout self.module_dropout = module_dropout - self.train_blocks = train_blocks if train_blocks is not None else "all" self.split_qkv = split_qkv - self.train_t5xxl = train_t5xxl - - self.type_dims = type_dims - self.in_dims = in_dims - self.train_double_block_indices = train_double_block_indices - self.train_single_block_indices = train_single_block_indices self.reg_dims = reg_dims self.reg_lrs = reg_lrs @@ -788,23 +262,18 @@ def __init__( if self.train_blocks is not None: logger.info(f"train {self.train_blocks} blocks only") - if train_t5xxl: - logger.info(f"train T5XXL as well") - # create module instances def create_modules( - is_flux: bool, + is_dit: bool, text_encoder_idx: Optional[int], root_module: torch.nn.Module, target_replace_modules: List[str], filter: Optional[str] = None, default_dim: Optional[int] = None, - ) -> List[LoRAModule]: - prefix = ( - self.LORA_PREFIX_FLUX - 
if is_flux - else (self.LORA_PREFIX_TEXT_ENCODER_CLIP if text_encoder_idx == 0 else self.LORA_PREFIX_TEXT_ENCODER_T5) - ) + ) -> List[lora_flux.LoRAModule]: + assert is_dit, "only DIT is supported now" + + prefix = self.LORA_PREFIX_HUNYUAN_IMAGE_DIT loras = [] skipped = [] @@ -842,51 +311,10 @@ def create_modules( break # if modules_dim is None, we use default lora_dim. if modules_dim is not None, we use the specified dim (no default) - if dim is None and modules_dim is None: + if dim is None and modules_dim is None: if is_linear or is_conv2d_1x1: dim = default_dim if default_dim is not None else self.lora_dim alpha = self.alpha - - if is_flux and type_dims is not None: - identifier = [ - ("img_attn",), - ("txt_attn",), - ("img_mlp",), - ("txt_mlp",), - ("img_mod",), - ("txt_mod",), - ("single_blocks", "linear"), - ("modulation",), - ] - for i, d in enumerate(type_dims): - if d is not None and all([id in lora_name for id in identifier[i]]): - dim = d # may be 0 for skip - break - - if ( - is_flux - and dim - and ( - self.train_double_block_indices is not None - or self.train_single_block_indices is not None - ) - and ("double" in lora_name or "single" in lora_name) - ): - # "lora_unet_double_blocks_0_..." or "lora_unet_single_blocks_0_..." - block_index = int(lora_name.split("_")[4]) # bit dirty - if ( - "double" in lora_name - and self.train_double_block_indices is not None - and not self.train_double_block_indices[block_index] - ): - dim = 0 - elif ( - "single" in lora_name - and self.train_single_block_indices is not None - and not self.train_single_block_indices[block_index] - ): - dim = 0 - elif self.conv_lora_dim is not None: dim = self.conv_lora_dim alpha = self.conv_alpha @@ -899,11 +327,11 @@ def create_modules( # qkv split split_dims = None - if is_flux and split_qkv: + if is_dit and split_qkv: if "double" in lora_name and "qkv" in lora_name: - split_dims = [3072] * 3 + split_dims = self.get_qkv_mlp_split_dims()[:3] # qkv only elif "single" in lora_name and "linear1" in lora_name: - split_dims = [3072] * 3 + [12288] + split_dims = self.get_qkv_mlp_split_dims() # qkv + mlp lora = module_class( lora_name, @@ -924,48 +352,21 @@ def create_modules( break # all modules are searched return loras, skipped - # create LoRA for text encoder - # 毎回すべてのモジュールを作るのは無駄なので要検討 - self.text_encoder_loras: List[Union[LoRAModule, LoRAInfModule]] = [] - skipped_te = [] - for i, text_encoder in enumerate(text_encoders): - index = i - if text_encoder is None: - logger.info(f"Text Encoder {index+1} is None, skipping LoRA creation for this encoder.") - continue - if not train_t5xxl and index > 0: # 0: CLIP, 1: T5XXL, so we skip T5XXL if train_t5xxl is False - break - - logger.info(f"create LoRA for Text Encoder {index+1}:") - - text_encoder_loras, skipped = create_modules(False, index, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) - logger.info(f"create LoRA for Text Encoder {index+1}: {len(text_encoder_loras)} modules.") - self.text_encoder_loras.extend(text_encoder_loras) - skipped_te += skipped - # create LoRA for U-Net - if self.train_blocks == "all": - target_replace_modules = LoRANetwork.FLUX_TARGET_REPLACE_MODULE_DOUBLE + LoRANetwork.FLUX_TARGET_REPLACE_MODULE_SINGLE - elif self.train_blocks == "single": - target_replace_modules = LoRANetwork.FLUX_TARGET_REPLACE_MODULE_SINGLE - elif self.train_blocks == "double": - target_replace_modules = LoRANetwork.FLUX_TARGET_REPLACE_MODULE_DOUBLE + target_replace_modules = ( + HunyuanImageLoRANetwork.FLUX_TARGET_REPLACE_MODULE_DOUBLE + 
HunyuanImageLoRANetwork.FLUX_TARGET_REPLACE_MODULE_SINGLE + ) - self.unet_loras: List[Union[LoRAModule, LoRAInfModule]] + self.unet_loras: List[Union[lora_flux.LoRAModule, lora_flux.LoRAInfModule]] self.unet_loras, skipped_un = create_modules(True, None, unet, target_replace_modules) - - # img, time, vector, guidance, txt - if self.in_dims: - for filter, in_dim in zip(["_img_in", "_time_in", "_vector_in", "_guidance_in", "_txt_in"], self.in_dims): - loras, _ = create_modules(True, None, unet, None, filter=filter, default_dim=in_dim) - self.unet_loras.extend(loras) + self.text_encoder_loras = [] logger.info(f"create LoRA for FLUX {self.train_blocks} blocks: {len(self.unet_loras)} modules.") if verbose: for lora in self.unet_loras: logger.info(f"\t{lora.lora_name:50} {lora.lora_dim}, {lora.alpha}") - skipped = skipped_te + skipped_un + skipped = skipped_un if verbose and len(skipped) > 0: logger.warning( f"because dim (rank) is 0, {len(skipped)} LoRA modules are skipped / dim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:" @@ -978,467 +379,3 @@ def create_modules( for lora in self.text_encoder_loras + self.unet_loras: assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" names.add(lora.lora_name) - - def set_multiplier(self, multiplier): - self.multiplier = multiplier - for lora in self.text_encoder_loras + self.unet_loras: - lora.multiplier = self.multiplier - - def set_enabled(self, is_enabled): - for lora in self.text_encoder_loras + self.unet_loras: - lora.enabled = is_enabled - - def update_norms(self): - for lora in self.text_encoder_loras + self.unet_loras: - lora.update_norms() - - def update_grad_norms(self): - for lora in self.text_encoder_loras + self.unet_loras: - lora.update_grad_norms() - - def grad_norms(self) -> Tensor | None: - grad_norms = [] - for lora in self.text_encoder_loras + self.unet_loras: - if hasattr(lora, "grad_norms") and lora.grad_norms is not None: - grad_norms.append(lora.grad_norms.mean(dim=0)) - return torch.stack(grad_norms) if len(grad_norms) > 0 else None - - def weight_norms(self) -> Tensor | None: - weight_norms = [] - for lora in self.text_encoder_loras + self.unet_loras: - if hasattr(lora, "weight_norms") and lora.weight_norms is not None: - weight_norms.append(lora.weight_norms.mean(dim=0)) - return torch.stack(weight_norms) if len(weight_norms) > 0 else None - - def combined_weight_norms(self) -> Tensor | None: - combined_weight_norms = [] - for lora in self.text_encoder_loras + self.unet_loras: - if hasattr(lora, "combined_weight_norms") and lora.combined_weight_norms is not None: - combined_weight_norms.append(lora.combined_weight_norms.mean(dim=0)) - return torch.stack(combined_weight_norms) if len(combined_weight_norms) > 0 else None - - def load_weights(self, file): - if os.path.splitext(file)[1] == ".safetensors": - from safetensors.torch import load_file - - weights_sd = load_file(file) - else: - weights_sd = torch.load(file, map_location="cpu") - - info = self.load_state_dict(weights_sd, False) - return info - - def load_state_dict(self, state_dict, strict=True): - # override to convert original weight to split qkv - if not self.split_qkv: - return super().load_state_dict(state_dict, strict) - - # split qkv - for key in list(state_dict.keys()): - if "double" in key and "qkv" in key: - split_dims = [3072] * 3 - elif "single" in key and "linear1" in key: - split_dims = [3072] * 3 + [12288] - else: - continue - - weight = state_dict[key] - lora_name = key.split(".")[0] - if "lora_down" in key and "weight" in key: - # 
dense weight (rank*3, in_dim) - split_weight = torch.chunk(weight, len(split_dims), dim=0) - for i, split_w in enumerate(split_weight): - state_dict[f"{lora_name}.lora_down.{i}.weight"] = split_w - - del state_dict[key] - # print(f"split {key}: {weight.shape} to {[w.shape for w in split_weight]}") - elif "lora_up" in key and "weight" in key: - # sparse weight (out_dim=sum(split_dims), rank*3) - rank = weight.size(1) // len(split_dims) - i = 0 - for j in range(len(split_dims)): - state_dict[f"{lora_name}.lora_up.{j}.weight"] = weight[i : i + split_dims[j], j * rank : (j + 1) * rank] - i += split_dims[j] - del state_dict[key] - - # # check is sparse - # i = 0 - # is_zero = True - # for j in range(len(split_dims)): - # for k in range(len(split_dims)): - # if j == k: - # continue - # is_zero = is_zero and torch.all(weight[i : i + split_dims[j], k * rank : (k + 1) * rank] == 0) - # i += split_dims[j] - # if not is_zero: - # logger.warning(f"weight is not sparse: {key}") - # else: - # logger.info(f"weight is sparse: {key}") - - # print( - # f"split {key}: {weight.shape} to {[state_dict[k].shape for k in [f'{lora_name}.lora_up.{j}.weight' for j in range(len(split_dims))]]}" - # ) - - # alpha is unchanged - - return super().load_state_dict(state_dict, strict) - - def state_dict(self, destination=None, prefix="", keep_vars=False): - if not self.split_qkv: - return super().state_dict(destination, prefix, keep_vars) - - # merge qkv - state_dict = super().state_dict(destination, prefix, keep_vars) - new_state_dict = {} - for key in list(state_dict.keys()): - if "double" in key and "qkv" in key: - split_dims = [3072] * 3 - elif "single" in key and "linear1" in key: - split_dims = [3072] * 3 + [12288] - else: - new_state_dict[key] = state_dict[key] - continue - - if key not in state_dict: - continue # already merged - - lora_name = key.split(".")[0] - - # (rank, in_dim) * 3 - down_weights = [state_dict.pop(f"{lora_name}.lora_down.{i}.weight") for i in range(len(split_dims))] - # (split dim, rank) * 3 - up_weights = [state_dict.pop(f"{lora_name}.lora_up.{i}.weight") for i in range(len(split_dims))] - - alpha = state_dict.pop(f"{lora_name}.alpha") - - # merge down weight - down_weight = torch.cat(down_weights, dim=0) # (rank, split_dim) * 3 -> (rank*3, sum of split_dim) - - # merge up weight (sum of split_dim, rank*3) - rank = up_weights[0].size(1) - up_weight = torch.zeros((sum(split_dims), down_weight.size(0)), device=down_weight.device, dtype=down_weight.dtype) - i = 0 - for j in range(len(split_dims)): - up_weight[i : i + split_dims[j], j * rank : (j + 1) * rank] = up_weights[j] - i += split_dims[j] - - new_state_dict[f"{lora_name}.lora_down.weight"] = down_weight - new_state_dict[f"{lora_name}.lora_up.weight"] = up_weight - new_state_dict[f"{lora_name}.alpha"] = alpha - - # print( - # f"merged {lora_name}: {lora_name}, {[w.shape for w in down_weights]}, {[w.shape for w in up_weights]} to {down_weight.shape}, {up_weight.shape}" - # ) - print(f"new key: {lora_name}.lora_down.weight, {lora_name}.lora_up.weight, {lora_name}.alpha") - - return new_state_dict - - def apply_to(self, text_encoders, flux, apply_text_encoder=True, apply_unet=True): - if apply_text_encoder: - logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules") - else: - self.text_encoder_loras = [] - - if apply_unet: - logger.info(f"enable LoRA for U-Net: {len(self.unet_loras)} modules") - else: - self.unet_loras = [] - - for lora in self.text_encoder_loras + self.unet_loras: - lora.apply_to() - 
self.add_module(lora.lora_name, lora) - - # マージできるかどうかを返す - def is_mergeable(self): - return True - - # TODO refactor to common function with apply_to - def merge_to(self, text_encoders, flux, weights_sd, dtype=None, device=None): - apply_text_encoder = apply_unet = False - for key in weights_sd.keys(): - if key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_CLIP) or key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER_T5): - apply_text_encoder = True - elif key.startswith(LoRANetwork.LORA_PREFIX_FLUX): - apply_unet = True - - if apply_text_encoder: - logger.info("enable LoRA for text encoder") - else: - self.text_encoder_loras = [] - - if apply_unet: - logger.info("enable LoRA for U-Net") - else: - self.unet_loras = [] - - for lora in self.text_encoder_loras + self.unet_loras: - sd_for_lora = {} - for key in weights_sd.keys(): - if key.startswith(lora.lora_name): - sd_for_lora[key[len(lora.lora_name) + 1 :]] = weights_sd[key] - lora.merge_to(sd_for_lora, dtype, device) - - logger.info(f"weights are merged") - - def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): - self.loraplus_lr_ratio = loraplus_lr_ratio - self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio - self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio - - logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}") - logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}") - - def prepare_optimizer_params_with_multiple_te_lrs(self, text_encoder_lr, unet_lr, default_lr): - # make sure text_encoder_lr as list of two elements - # if float, use the same value for both text encoders - if text_encoder_lr is None or (isinstance(text_encoder_lr, list) and len(text_encoder_lr) == 0): - text_encoder_lr = [default_lr, default_lr] - elif isinstance(text_encoder_lr, float) or isinstance(text_encoder_lr, int): - text_encoder_lr = [float(text_encoder_lr), float(text_encoder_lr)] - elif len(text_encoder_lr) == 1: - text_encoder_lr = [text_encoder_lr[0], text_encoder_lr[0]] - - self.requires_grad_(True) - - all_params = [] - lr_descriptions = [] - - reg_lrs_list = list(self.reg_lrs.items()) if self.reg_lrs is not None else [] - - def assemble_params(loras, lr, loraplus_ratio): - param_groups = {"lora": {}, "plus": {}} - # regular expression param groups: {"reg_lr_0": {"lora": {}, "plus": {}}, ...} - reg_groups = {} - - for lora in loras: - # check if this lora matches any regex learning rate - matched_reg_lr = None - for i, (regex_str, reg_lr) in enumerate(reg_lrs_list): - try: - if re.search(regex_str, lora.lora_name): - matched_reg_lr = (i, reg_lr) - logger.info(f"Module {lora.lora_name} matched regex '{regex_str}' -> LR {reg_lr}") - break - except re.error: - # regex error should have been caught during parsing, but just in case - continue - - for name, param in lora.named_parameters(): - param_key = f"{lora.lora_name}.{name}" - is_plus = loraplus_ratio is not None and "lora_up" in name - - if matched_reg_lr is not None: - # use regex-specific learning rate - reg_idx, reg_lr = matched_reg_lr - group_key = f"reg_lr_{reg_idx}" - if group_key not in reg_groups: - reg_groups[group_key] = {"lora": {}, "plus": {}, "lr": reg_lr} - - if is_plus: - reg_groups[group_key]["plus"][param_key] = param - else: - reg_groups[group_key]["lora"][param_key] = param - else: - # use default learning rate - if is_plus: - param_groups["plus"][param_key] = param - else: - param_groups["lora"][param_key] = param 
- - params = [] - descriptions = [] - - # process regex-specific groups first (higher priority) - for group_key in sorted(reg_groups.keys()): - group = reg_groups[group_key] - reg_lr = group["lr"] - - for param_type in ["lora", "plus"]: - if len(group[param_type]) == 0: - continue - - param_data = {"params": group[param_type].values()} - - if param_type == "plus" and loraplus_ratio is not None: - param_data["lr"] = reg_lr * loraplus_ratio - else: - param_data["lr"] = reg_lr - - if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: - continue - - params.append(param_data) - desc = f"reg_lr_{group_key.split('_')[-1]}" - if param_type == "plus": - desc += " plus" - descriptions.append(desc) - - # process default groups - for key in param_groups.keys(): - param_data = {"params": param_groups[key].values()} - - if len(param_data["params"]) == 0: - continue - - if lr is not None: - if key == "plus": - param_data["lr"] = lr * loraplus_ratio - else: - param_data["lr"] = lr - - if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: - logger.info("NO LR skipping!") - continue - - params.append(param_data) - descriptions.append("plus" if key == "plus" else "") - - return params, descriptions - - if self.text_encoder_loras: - loraplus_lr_ratio = self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio - - # split text encoder loras for te1 and te3 - te1_loras = [lora for lora in self.text_encoder_loras if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER_CLIP)] - te3_loras = [lora for lora in self.text_encoder_loras if lora.lora_name.startswith(self.LORA_PREFIX_TEXT_ENCODER_T5)] - if len(te1_loras) > 0: - logger.info(f"Text Encoder 1 (CLIP-L): {len(te1_loras)} modules, LR {text_encoder_lr[0]}") - params, descriptions = assemble_params(te1_loras, text_encoder_lr[0], loraplus_lr_ratio) - all_params.extend(params) - lr_descriptions.extend(["textencoder 1 " + (" " + d if d else "") for d in descriptions]) - if len(te3_loras) > 0: - logger.info(f"Text Encoder 2 (T5XXL): {len(te3_loras)} modules, LR {text_encoder_lr[1]}") - params, descriptions = assemble_params(te3_loras, text_encoder_lr[1], loraplus_lr_ratio) - all_params.extend(params) - lr_descriptions.extend(["textencoder 2 " + (" " + d if d else "") for d in descriptions]) - - if self.unet_loras: - params, descriptions = assemble_params( - self.unet_loras, - unet_lr if unet_lr is not None else default_lr, - self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio, - ) - all_params.extend(params) - lr_descriptions.extend(["unet" + (" " + d if d else "") for d in descriptions]) - - return all_params, lr_descriptions - - def enable_gradient_checkpointing(self): - # not supported - pass - - def prepare_grad_etc(self, text_encoder, unet): - self.requires_grad_(True) - - def on_epoch_start(self, text_encoder, unet): - self.train() - - def get_trainable_params(self): - return self.parameters() - - def save_weights(self, file, dtype, metadata): - if metadata is not None and len(metadata) == 0: - metadata = None - - state_dict = self.state_dict() - - if dtype is not None: - for key in list(state_dict.keys()): - v = state_dict[key] - v = v.detach().clone().to("cpu").to(dtype) - state_dict[key] = v - - if os.path.splitext(file)[1] == ".safetensors": - from safetensors.torch import save_file - from library import train_util - - # Precalculate model hashes to save time on indexing - if metadata is None: - metadata = {} - model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) - 
metadata["sshs_model_hash"] = model_hash - metadata["sshs_legacy_hash"] = legacy_hash - - save_file(state_dict, file, metadata) - else: - torch.save(state_dict, file) - - def backup_weights(self): - # 重みのバックアップを行う - loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras - for lora in loras: - org_module = lora.org_module_ref[0] - if not hasattr(org_module, "_lora_org_weight"): - sd = org_module.state_dict() - org_module._lora_org_weight = sd["weight"].detach().clone() - org_module._lora_restored = True - - def restore_weights(self): - # 重みのリストアを行う - loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras - for lora in loras: - org_module = lora.org_module_ref[0] - if not org_module._lora_restored: - sd = org_module.state_dict() - sd["weight"] = org_module._lora_org_weight - org_module.load_state_dict(sd) - org_module._lora_restored = True - - def pre_calculation(self): - # 事前計算を行う - loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras - for lora in loras: - org_module = lora.org_module_ref[0] - sd = org_module.state_dict() - - org_weight = sd["weight"] - lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype) - sd["weight"] = org_weight + lora_weight - assert sd["weight"].shape == org_weight.shape - org_module.load_state_dict(sd) - - org_module._lora_restored = False - lora.enabled = False - - def apply_max_norm_regularization(self, max_norm_value, device): - downkeys = [] - upkeys = [] - alphakeys = [] - norms = [] - keys_scaled = 0 - - state_dict = self.state_dict() - for key in state_dict.keys(): - if "lora_down" in key and "weight" in key: - downkeys.append(key) - upkeys.append(key.replace("lora_down", "lora_up")) - alphakeys.append(key.replace("lora_down.weight", "alpha")) - - for i in range(len(downkeys)): - down = state_dict[downkeys[i]].to(device) - up = state_dict[upkeys[i]].to(device) - alpha = state_dict[alphakeys[i]].to(device) - dim = down.shape[0] - scale = alpha / dim - - if up.shape[2:] == (1, 1) and down.shape[2:] == (1, 1): - updown = (up.squeeze(2).squeeze(2) @ down.squeeze(2).squeeze(2)).unsqueeze(2).unsqueeze(3) - elif up.shape[2:] == (3, 3) or down.shape[2:] == (3, 3): - updown = torch.nn.functional.conv2d(down.permute(1, 0, 2, 3), up).permute(1, 0, 2, 3) - else: - updown = up @ down - - updown *= scale - - norm = updown.norm().clamp(min=max_norm_value / 2) - desired = torch.clamp(norm, max=max_norm_value) - ratio = desired.cpu() / norm.cpu() - sqrt_ratio = ratio**0.5 - if ratio != 1: - keys_scaled += 1 - state_dict[upkeys[i]] *= sqrt_ratio - state_dict[downkeys[i]] *= sqrt_ratio - scalednorm = updown.norm() * ratio - norms.append(scalednorm.item()) - - return keys_scaled, sum(norms) / len(norms), max(norms) diff --git a/train_network.py b/train_network.py index 3dedb574c..00118877b 100644 --- a/train_network.py +++ b/train_network.py @@ -475,6 +475,9 @@ def process_batch( return loss.mean() + def cast_text_encoder(self): + return True # default for other than HunyuanImage + def train(self, args): session_id = random.randint(0, 2**32) training_started_at = time.time() @@ -832,7 +835,7 @@ def train(self, args): t_enc.requires_grad_(False) # in case of cpu, dtype is already set to fp32 because cpu does not support fp8/fp16/bf16 - if t_enc.device.type != "cpu": + if t_enc.device.type != "cpu" and self.cast_text_encoder(): t_enc.to(dtype=te_weight_dtype) # nn.Embedding not support FP8 From a0f0afbb4603290bf1b9bd7a3c9a6bf6d8a6a568 Mon Sep 17 00:00:00 2001 From: Kohya S 
<52813779+kohya-ss@users.noreply.github.com> Date: Thu, 11 Sep 2025 22:27:00 +0900 Subject: [PATCH 547/582] fix: revert constructor signature update --- library/custom_offloading_utils.py | 27 +++++++++++++-------------- library/hunyuan_image_models.py | 4 ++-- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/library/custom_offloading_utils.py b/library/custom_offloading_utils.py index 4fbea542a..8699b3448 100644 --- a/library/custom_offloading_utils.py +++ b/library/custom_offloading_utils.py @@ -89,8 +89,7 @@ class Offloader: common offloading class """ - def __init__(self, block_type: str, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): - self.block_type = block_type + def __init__(self, num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False): self.num_blocks = num_blocks self.blocks_to_swap = blocks_to_swap self.device = device @@ -110,16 +109,12 @@ def _submit_move_blocks(self, blocks, block_idx_to_cpu, block_idx_to_cuda): def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda): if self.debug: start_time = time.perf_counter() - print( - f"[{self.block_type}] Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to {'CUDA' if self.cuda_available else 'device'}" - ) + print(f"Move block {bidx_to_cpu} to CPU and block {bidx_to_cuda} to {'CUDA' if self.cuda_available else 'device'}") self.swap_weight_devices(block_to_cpu, block_to_cuda) if self.debug: - print( - f"[{self.block_type}] Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter() - start_time:.2f}s" - ) + print(f"Moved blocks {bidx_to_cpu} and {bidx_to_cuda} in {time.perf_counter() - start_time:.2f}s") return bidx_to_cpu, bidx_to_cuda # , event block_to_cpu = blocks[block_idx_to_cpu] @@ -134,7 +129,7 @@ def _wait_blocks_move(self, block_idx): return if self.debug: - print(f"[{self.block_type}] Wait for block {block_idx}") + print(f"Wait for block {block_idx}") start_time = time.perf_counter() future = self.futures.pop(block_idx) @@ -143,7 +138,7 @@ def _wait_blocks_move(self, block_idx): assert block_idx == bidx_to_cuda, f"Block index mismatch: {block_idx} != {bidx_to_cuda}" if self.debug: - print(f"[{self.block_type}] Waited for block {block_idx}: {time.perf_counter() - start_time:.2f}s") + print(f"Waited for block {block_idx}: {time.perf_counter() - start_time:.2f}s") class ModelOffloader(Offloader): @@ -152,10 +147,14 @@ class ModelOffloader(Offloader): """ def __init__( - self, blocks: list[nn.Module], blocks_to_swap: int, supports_backward: bool, device: torch.device, debug: bool = False + self, + blocks: list[nn.Module], + blocks_to_swap: int, + device: torch.device, + supports_backward: bool = True, + debug: bool = False, ): - block_type = f"{blocks[0].__class__.__name__}" if len(blocks) > 0 else "Unknown" - super().__init__(block_type, len(blocks), blocks_to_swap, device, debug) + super().__init__(len(blocks), blocks_to_swap, device, debug) self.supports_backward = supports_backward self.forward_only = not supports_backward # forward only offloading: can be changed to True for inference @@ -208,7 +207,7 @@ def prepare_block_devices_before_forward(self, blocks: list[nn.Module]): return if self.debug: - print(f"[{self.block_type}] Prepare block devices before forward") + print(f"Prepare block devices before forward") for b in blocks[0 : self.num_blocks - self.blocks_to_swap]: b.to(self.device) diff --git a/library/hunyuan_image_models.py b/library/hunyuan_image_models.py index 9847c55ee..9e3a00e8b 100644 --- 
a/library/hunyuan_image_models.py +++ b/library/hunyuan_image_models.py @@ -171,10 +171,10 @@ def enable_block_swap(self, num_blocks: int, device: torch.device, supports_back ) self.offloader_double = custom_offloading_utils.ModelOffloader( - self.double_blocks, double_blocks_to_swap, supports_backward, device + self.double_blocks, double_blocks_to_swap, device, supports_backward=supports_backward ) self.offloader_single = custom_offloading_utils.ModelOffloader( - self.single_blocks, single_blocks_to_swap, supports_backward, device + self.single_blocks, single_blocks_to_swap, device, supports_backward=supports_backward ) # , debug=True print( From cbc9e1a3b18b191fca2582ad18e9282381368360 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 11 Sep 2025 22:27:08 +0900 Subject: [PATCH 548/582] feat: add byt5 to the list of recognized words in typos configuration --- _typos.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/_typos.toml b/_typos.toml index 75f0bf055..cc167eaaa 100644 --- a/_typos.toml +++ b/_typos.toml @@ -30,6 +30,7 @@ yos="yos" wn="wn" hime="hime" OT="OT" +byt5="byt5" # [files] # # Extend the default list of files to check From 209c02dbb6952e1006a625c2cdd653a91db25bd0 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Fri, 12 Sep 2025 21:40:42 +0900 Subject: [PATCH 549/582] feat: HunyuanImage LoRA training --- _typos.toml | 2 +- hunyuan_image_minimal_inference.py | 30 +++-- hunyuan_image_train_network.py | 164 ++++++++++++++++---------- library/attention.py | 46 +++++++- library/hunyuan_image_models.py | 27 ++++- library/hunyuan_image_modules.py | 56 ++++++--- library/hunyuan_image_text_encoder.py | 2 +- library/hunyuan_image_vae.py | 4 +- library/strategy_hunyuan_image.py | 49 ++++++-- library/train_util.py | 34 +++++- networks/lora_hunyuan_image.py | 13 +- train_network.py | 74 +++++++----- 12 files changed, 352 insertions(+), 149 deletions(-) diff --git a/_typos.toml b/_typos.toml index cc167eaaa..362ba8a60 100644 --- a/_typos.toml +++ b/_typos.toml @@ -30,7 +30,7 @@ yos="yos" wn="wn" hime="hime" OT="OT" -byt5="byt5" +byt="byt" # [files] # # Extend the default list of files to check diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index ba8ca78e6..3de0b1cd4 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -66,7 +66,7 @@ def parse_args() -> argparse.Namespace: # inference parser.add_argument( - "--guidance_scale", type=float, default=4.0, help="Guidance scale for classifier free guidance. Default is 4.0." + "--guidance_scale", type=float, default=5.0, help="Guidance scale for classifier free guidance. Default is 5.0." 
) parser.add_argument("--prompt", type=str, default=None, help="prompt for generation") parser.add_argument("--negative_prompt", type=str, default="", help="negative prompt for generation, default is empty string") @@ -508,7 +508,7 @@ def move_models_to_device_if_needed(): prompt = args.prompt cache_key = prompt if cache_key in conds_cache: - embed, mask = conds_cache[cache_key] + embed, mask, embed_byt5, mask_byt5, ocr_mask = conds_cache[cache_key] else: move_models_to_device_if_needed() @@ -527,7 +527,7 @@ def move_models_to_device_if_needed(): negative_prompt = args.negative_prompt cache_key = negative_prompt if cache_key in conds_cache: - negative_embed, negative_mask = conds_cache[cache_key] + negative_embed, negative_mask, negative_embed_byt5, negative_mask_byt5, negative_ocr_mask = conds_cache[cache_key] else: move_models_to_device_if_needed() @@ -614,9 +614,10 @@ def generate( shared_models["model"] = model else: # use shared model + logger.info("Using shared DiT model.") model: hunyuan_image_models.HYImageDiffusionTransformer = shared_models["model"] - # model.move_to_device_except_swap_blocks(device) # Handles block swap correctly - # model.prepare_block_swap_before_forward() + model.move_to_device_except_swap_blocks(device) # Handles block swap correctly + model.prepare_block_swap_before_forward() return generate_body(args, model, context, context_null, device, seed) @@ -678,9 +679,18 @@ def generate_body( # Denoising loop do_cfg = args.guidance_scale != 1.0 + # print(f"embed shape: {embed.shape}, mean: {embed.mean()}, std: {embed.std()}") + # print(f"embed_byt5 shape: {embed_byt5.shape}, mean: {embed_byt5.mean()}, std: {embed_byt5.std()}") + # print(f"negative_embed shape: {negative_embed.shape}, mean: {negative_embed.mean()}, std: {negative_embed.std()}") + # print(f"negative_embed_byt5 shape: {negative_embed_byt5.shape}, mean: {negative_embed_byt5.mean()}, std: {negative_embed_byt5.std()}") + # print(f"latents shape: {latents.shape}, mean: {latents.mean()}, std: {latents.std()}") + # print(f"mask shape: {mask.shape}, sum: {mask.sum()}") + # print(f"mask_byt5 shape: {mask_byt5.shape}, sum: {mask_byt5.sum()}") + # print(f"negative_mask shape: {negative_mask.shape}, sum: {negative_mask.sum()}") + # print(f"negative_mask_byt5 shape: {negative_mask_byt5.shape}, sum: {negative_mask_byt5.sum()}") with tqdm(total=len(timesteps), desc="Denoising steps") as pbar: for i, t in enumerate(timesteps): - t_expand = t.expand(latents.shape[0]).to(latents.dtype) + t_expand = t.expand(latents.shape[0]).to(torch.int64) with torch.no_grad(): noise_pred = model(latents, t_expand, embed, mask, embed_byt5, mask_byt5) @@ -1040,6 +1050,9 @@ def process_interactive(args: argparse.Namespace) -> None: shared_models = load_shared_models(args) shared_models["conds_cache"] = {} # Initialize empty cache for interactive mode + vae = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True) + vae.eval() + print("Interactive mode. Enter prompts (Ctrl+D or Ctrl+Z (Windows) to exit):") try: @@ -1059,9 +1072,6 @@ def input_line(prompt: str) -> str: def input_line(prompt: str) -> str: return input(prompt) - vae = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True) - vae.eval() - try: while True: try: @@ -1088,7 +1098,7 @@ def input_line(prompt: str) -> str: # Save latent and video # returned_vae from generate will be used for decoding here. - save_output(prompt_args, vae, latent[0], device) + save_output(prompt_args, vae, latent, device) except KeyboardInterrupt: print("\nInterrupted. 
Continue (Ctrl+D or Ctrl+Z (Windows) to exit)") diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index b1281fa01..291d5132f 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -1,5 +1,6 @@ import argparse import copy +import gc from typing import Any, Optional, Union import argparse import os @@ -12,7 +13,7 @@ from PIL import Image from accelerate import Accelerator, PartialState -from library import hunyuan_image_models, hunyuan_image_vae, strategy_base, train_util +from library import flux_utils, hunyuan_image_models, hunyuan_image_vae, strategy_base, train_util from library.device_utils import clean_memory_on_device, init_ipex init_ipex() @@ -24,7 +25,6 @@ hunyuan_image_text_encoder, hunyuan_image_utils, hunyuan_image_vae, - sai_model_spec, sd3_train_utils, strategy_base, strategy_hunyuan_image, @@ -79,8 +79,6 @@ def sample_images( dit = accelerator.unwrap_model(dit) if text_encoders is not None: text_encoders = [(accelerator.unwrap_model(te) if te is not None else None) for te in text_encoders] - if controlnet is not None: - controlnet = accelerator.unwrap_model(controlnet) # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) prompts = train_util.load_prompts(args.sample_prompts) @@ -162,10 +160,10 @@ def sample_image_inference( sample_steps = prompt_dict.get("sample_steps", 20) width = prompt_dict.get("width", 512) height = prompt_dict.get("height", 512) - cfg_scale = prompt_dict.get("scale", 1.0) + cfg_scale = prompt_dict.get("scale", 3.5) seed = prompt_dict.get("seed") prompt: str = prompt_dict.get("prompt", "") - flow_shift: float = prompt_dict.get("flow_shift", 4.0) + flow_shift: float = prompt_dict.get("flow_shift", 5.0) # sampler_name: str = prompt_dict.get("sample_sampler", args.sample_sampler) if prompt_replacement is not None: @@ -208,11 +206,10 @@ def encode_prompt(prpt): text_encoder_conds = [] if sample_prompts_te_outputs and prpt in sample_prompts_te_outputs: text_encoder_conds = sample_prompts_te_outputs[prpt] - print(f"Using cached text encoder outputs for prompt: {prpt}") + # print(f"Using cached text encoder outputs for prompt: {prpt}") if text_encoders is not None: - print(f"Encoding prompt: {prpt}") + # print(f"Encoding prompt: {prpt}") tokens_and_masks = tokenize_strategy.tokenize(prpt) - # strategy has apply_t5_attn_mask option encoded_text_encoder_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks) # if text_encoder_conds is not cached, use encoded_text_encoder_conds @@ -255,16 +252,21 @@ def encode_prompt(prpt): from hunyuan_image_minimal_inference import generate_body # import here to avoid circular import - latents = generate_body(gen_args, dit, arg_c, arg_c_null, accelerator.device, seed) + dit_is_training = dit.training + dit.eval() + x = generate_body(gen_args, dit, arg_c, arg_c_null, accelerator.device, seed) + if dit_is_training: + dit.train() + clean_memory_on_device(accelerator.device) # latent to image - clean_memory_on_device(accelerator.device) org_vae_device = vae.device # will be on cpu vae.to(accelerator.device) # distributed_state.device is same as accelerator.device - with torch.autocast(accelerator.device.type, vae.dtype, enabled=True), torch.no_grad(): - x = x / hunyuan_image_vae.VAE_SCALE_FACTOR - x = vae.decode(x) + with torch.no_grad(): + x = x / vae.scaling_factor + x = vae.decode(x.to(vae.device, dtype=vae.dtype)) vae.to(org_vae_device) + clean_memory_on_device(accelerator.device) x = 
x.clamp(-1, 1) @@ -299,6 +301,7 @@ def __init__(self): super().__init__() self.sample_prompts_te_outputs = None self.is_swapping_blocks: bool = False + self.rotary_pos_emb_cache = {} def assert_extra_args( self, @@ -341,12 +344,42 @@ def assert_extra_args( def load_target_model(self, args, weight_dtype, accelerator): self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 - # currently offload to cpu for some models + vl_dtype = torch.float8_e4m3fn if args.fp8_vl else torch.bfloat16 + vl_device = "cpu" + _, text_encoder_vlm = hunyuan_image_text_encoder.load_qwen2_5_vl( + args.text_encoder, dtype=vl_dtype, device=vl_device, disable_mmap=args.disable_mmap_load_safetensors + ) + _, text_encoder_byt5 = hunyuan_image_text_encoder.load_byt5( + args.byt5, dtype=torch.float16, device=vl_device, disable_mmap=args.disable_mmap_load_safetensors + ) + + vae = hunyuan_image_vae.load_vae(args.vae, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + vae.to(dtype=torch.float16) # VAE is always fp16 + vae.eval() + if args.vae_enable_tiling: + vae.enable_tiling() + logger.info("VAE tiling is enabled") + + model_version = hunyuan_image_utils.MODEL_VERSION_2_1 + return model_version, [text_encoder_vlm, text_encoder_byt5], vae, None # unet will be loaded later + + def load_unet_lazily(self, args, weight_dtype, accelerator, text_encoders) -> tuple[nn.Module, list[nn.Module]]: + if args.cache_text_encoder_outputs: + logger.info("Replace text encoders with dummy models to save memory") + + # This doesn't free memory, so we move text encoders to meta device in cache_text_encoder_outputs_if_needed + text_encoders = [flux_utils.dummy_clip_l() for _ in text_encoders] + clean_memory_on_device(accelerator.device) + gc.collect() + loading_dtype = None if args.fp8_scaled else weight_dtype loading_device = "cpu" if self.is_swapping_blocks else accelerator.device split_attn = True attn_mode = "torch" + if args.xformers: + attn_mode = "xformers" + logger.info("xformers is enabled for attention") model = hunyuan_image_models.load_hunyuan_image_model( accelerator.device, @@ -363,19 +396,7 @@ def load_target_model(self, args, weight_dtype, accelerator): logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") model.enable_block_swap(args.blocks_to_swap, accelerator.device) - vl_dtype = torch.bfloat16 - vl_device = "cpu" - _, text_encoder_vlm = hunyuan_image_text_encoder.load_qwen2_5_vl( - args.text_encoder, dtype=vl_dtype, device=vl_device, disable_mmap=args.disable_mmap_load_safetensors - ) - _, text_encoder_byt5 = hunyuan_image_text_encoder.load_byt5( - args.byt5, dtype=torch.float16, device=vl_device, disable_mmap=args.disable_mmap_load_safetensors - ) - - vae = hunyuan_image_vae.load_vae(args.vae, "cpu", disable_mmap=args.disable_mmap_load_safetensors) - - model_version = hunyuan_image_utils.MODEL_VERSION_2_1 - return model_version, [text_encoder_vlm, text_encoder_byt5], vae, model + return model, text_encoders def get_tokenize_strategy(self, args): return strategy_hunyuan_image.HunyuanImageTokenizeStrategy(args.tokenizer_cache_dir) @@ -404,7 +425,6 @@ def get_text_encoders_train_flags(self, args, text_encoders): def get_text_encoder_outputs_caching_strategy(self, args): if args.cache_text_encoder_outputs: - # if the text encoders is trained, we need tokenization, so is_partial is True return strategy_hunyuan_image.HunyuanImageTextEncoderOutputsCachingStrategy( args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, args.skip_cache_check, False ) @@ -417,11 
+437,9 @@ def cache_text_encoder_outputs_if_needed( if args.cache_text_encoder_outputs: if not args.lowram: # メモリ消費を減らす - logger.info("move vae and unet to cpu to save memory") + logger.info("move vae to cpu to save memory") org_vae_device = vae.device - org_unet_device = unet.device vae.to("cpu") - unet.to("cpu") clean_memory_on_device(accelerator.device) logger.info("move text encoders to gpu") @@ -457,17 +475,14 @@ def cache_text_encoder_outputs_if_needed( accelerator.wait_for_everyone() - # move back to cpu - logger.info("move VLM back to cpu") - text_encoders[0].to("cpu") - logger.info("move byT5 back to cpu") - text_encoders[1].to("cpu") + # text encoders are not needed for training, so we move to meta device + logger.info("move text encoders to meta device to save memory") + text_encoders = [te.to("meta") for te in text_encoders] clean_memory_on_device(accelerator.device) if not args.lowram: - logger.info("move vae and unet back to original device") + logger.info("move vae back to original device") vae.to(org_vae_device) - unet.to(org_unet_device) else: # Text Encoderから毎回出力を取得するので、GPUに乗せておく text_encoders[0].to(accelerator.device) @@ -477,21 +492,19 @@ def sample_images(self, accelerator, args, epoch, global_step, device, ae, token text_encoders = text_encoder # for compatibility text_encoders = self.get_models_for_text_encoding(args, accelerator, text_encoders) - flux_train_utils.sample_images( - accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs - ) + sample_images(accelerator, args, epoch, global_step, flux, ae, text_encoders, self.sample_prompts_te_outputs) def get_noise_scheduler(self, args: argparse.Namespace, device: torch.device) -> Any: noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=args.discrete_flow_shift) self.noise_scheduler_copy = copy.deepcopy(noise_scheduler) return noise_scheduler - def encode_images_to_latents(self, args, vae, images): - return vae.encode(images) + def encode_images_to_latents(self, args, vae: hunyuan_image_vae.HunyuanVAE2D, images): + return vae.encode(images).sample() def shift_scale_latents(self, args, latents): # for encoding, we need to scale the latents - return latents * hunyuan_image_vae.VAE_SCALE_FACTOR + return latents * hunyuan_image_vae.LATENT_SCALING_FACTOR def get_noise_pred_and_target( self, @@ -509,12 +522,16 @@ def get_noise_pred_and_target( ): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) - bsz = latents.shape[0] # get noisy model input and timesteps - noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps( + noisy_model_input, _, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps( args, noise_scheduler, latents, noise, accelerator.device, weight_dtype ) + # bfloat16 is too low precision for 0-1000 TODO fix get_noisy_model_input_and_timesteps + timesteps = (sigmas[:, 0, 0, 0] * 1000).to(torch.int64) + # print( + # f"timestep: {timesteps}, noisy_model_input shape: {noisy_model_input.shape}, mean: {noisy_model_input.mean()}, std: {noisy_model_input.std()}" + # ) if args.gradient_checkpointing: noisy_model_input.requires_grad_(True) @@ -526,31 +543,33 @@ def get_noise_pred_and_target( # ocr_mask is for inference only, so it is not used here vlm_embed, vlm_mask, byt5_embed, byt5_mask, ocr_mask = text_encoder_conds + # print(f"embed shape: {vlm_embed.shape}, mean: {vlm_embed.mean()}, std: {vlm_embed.std()}") + # print(f"embed_byt5 shape: {byt5_embed.shape}, mean: 
{byt5_embed.mean()}, std: {byt5_embed.std()}") + # print(f"latents shape: {latents.shape}, mean: {latents.mean()}, std: {latents.std()}") + # print(f"mask shape: {vlm_mask.shape}, sum: {vlm_mask.sum()}") + # print(f"mask_byt5 shape: {byt5_mask.shape}, sum: {byt5_mask.sum()}") with torch.set_grad_enabled(is_train), accelerator.autocast(): - model_pred = unet(noisy_model_input, timesteps / 1000, vlm_embed, vlm_mask, byt5_embed, byt5_mask) + model_pred = unet( + noisy_model_input, timesteps, vlm_embed, vlm_mask, byt5_embed, byt5_mask # , self.rotary_pos_emb_cache + ) - # model prediction and weighting is omitted for HunyuanImage-2.1 currently + # apply model prediction type + model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas) # flow matching loss target = noise - latents # differential output preservation is not used for HunyuanImage-2.1 currently - return model_pred, target, timesteps, None + return model_pred, target, timesteps, weighting def post_process_loss(self, loss, args, timesteps, noise_scheduler): return loss def get_sai_model_spec(self, args): - # if self.model_type != "chroma": - # model_description = "schnell" if self.is_schnell else "dev" - # else: - # model_description = "chroma" - # return train_util.get_sai_model_spec(None, args, False, True, False, flux=model_description) - train_util.get_sai_model_spec_dataclass(None, args, False, True, False, hunyuan_image="2.1") + return train_util.get_sai_model_spec_dataclass(None, args, False, True, False, hunyuan_image="2.1").to_metadata_dict() def update_metadata(self, metadata, args): - metadata["ss_model_type"] = args.model_type metadata["ss_logit_mean"] = args.logit_mean metadata["ss_logit_std"] = args.logit_std metadata["ss_mode_scale"] = args.mode_scale @@ -569,6 +588,9 @@ def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): def cast_text_encoder(self): return False # VLM is bf16, byT5 is fp16, so do not cast to other dtype + def cast_vae(self): + return False # VAE is fp16, so do not cast to other dtype + def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): # fp8 text encoder for HunyuanImage-2.1 is not supported currently pass @@ -597,6 +619,17 @@ def setup_parser() -> argparse.ArgumentParser: parser = train_network.setup_parser() train_util.add_dit_training_arguments(parser) + parser.add_argument( + "--text_encoder", + type=str, + help="path to Qwen2.5-VL (*.sft or *.safetensors), should be bfloat16 / Qwen2.5-VLのパス(*.sftまたは*.safetensors)、bfloat16が前提", + ) + parser.add_argument( + "--byt5", + type=str, + help="path to byt5 (*.sft or *.safetensors), should be float16 / byt5のパス(*.sftまたは*.safetensors)、float16が前提", + ) + parser.add_argument( "--timestep_sampling", choices=["sigma", "uniform", "sigmoid", "shift", "flux_shift"], @@ -613,17 +646,24 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--model_prediction_type", choices=["raw", "additive", "sigma_scaled"], - default="sigma_scaled", + default="raw", help="How to interpret and process the model prediction: " - "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)." + "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling). Default is raw unlike FLUX.1." 
" / モデル予測の解釈と処理方法:" - "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。", + "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。デフォルトはFLUX.1とは異なりrawです。", ) parser.add_argument( "--discrete_flow_shift", type=float, - default=3.0, - help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。", + default=5.0, + help="Discrete flow shift for the Euler Discrete Scheduler, default is 5.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは5.0。", + ) + parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT / DiTにスケーリングされたfp8を使う") + parser.add_argument("--fp8_vl", action="store_true", help="use fp8 for VLM text encoder / VLMテキストエンコーダにfp8を使用する") + parser.add_argument( + "--vae_enable_tiling", + action="store_true", + help="Enable tiling for VAE decoding and encoding / VAEデコーディングとエンコーディングのタイルを有効にする", ) return parser diff --git a/library/attention.py b/library/attention.py index 10a096143..f1e7c0b0c 100644 --- a/library/attention.py +++ b/library/attention.py @@ -1,9 +1,19 @@ import torch -from typing import Optional +from typing import Optional, Union + +try: + import xformers.ops as xops +except ImportError: + xops = None def attention( - q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seq_lens: list[int], attn_mode: str = "torch", drop_rate: float = 0.0 + qkv_or_q: Union[torch.Tensor, list], + k: Optional[torch.Tensor] = None, + v: Optional[torch.Tensor] = None, + seq_lens: Optional[list[int]] = None, + attn_mode: str = "torch", + drop_rate: float = 0.0, ) -> torch.Tensor: """ Compute scaled dot-product attention with variable sequence lengths. @@ -12,7 +22,7 @@ def attention( processing each sequence individually. Args: - q: Query tensor [B, L, H, D]. + qkv_or_q: Query tensor [B, L, H, D]. or list of such tensors. k: Key tensor [B, L, H, D]. v: Value tensor [B, L, H, D]. seq_lens: Valid sequence length for each batch element. @@ -22,6 +32,17 @@ def attention( Returns: Attention output tensor [B, L, H*D]. 
""" + if isinstance(qkv_or_q, list): + q, k, v = qkv_or_q + qkv_or_q.clear() + del qkv_or_q + else: + q = qkv_or_q + del qkv_or_q + assert k is not None and v is not None, "k and v must be provided if qkv_or_q is a tensor" + if seq_lens is None: + seq_lens = [q.shape[1]] * q.shape[0] + # Determine tensor layout based on attention implementation if attn_mode == "torch" or attn_mode == "sageattn": transpose_fn = lambda x: x.transpose(1, 2) # [B, H, L, D] for SDPA @@ -29,6 +50,7 @@ def attention( transpose_fn = lambda x: x # [B, L, H, D] for other implementations # Process each batch element with its valid sequence length + q_seq_len = q.shape[1] q = [transpose_fn(q[i : i + 1, : seq_lens[i]]) for i in range(len(q))] k = [transpose_fn(k[i : i + 1, : seq_lens[i]]) for i in range(len(k))] v = [transpose_fn(v[i : i + 1, : seq_lens[i]]) for i in range(len(v))] @@ -40,10 +62,24 @@ def attention( q[i] = None k[i] = None v[i] = None - x.append(x_i) + x.append(torch.nn.functional.pad(x_i, (0, 0, 0, q_seq_len - x_i.shape[2]), value=0)) # Pad to max seq len, B, H, L, D x = torch.cat(x, dim=0) del q, k, v - # Currently only PyTorch SDPA is implemented + + elif attn_mode == "xformers": + x = [] + for i in range(len(q)): + x_i = xops.memory_efficient_attention(q[i], k[i], v[i], p=drop_rate) + q[i] = None + k[i] = None + v[i] = None + x.append(torch.nn.functional.pad(x_i, (0, 0, 0, 0, 0, q_seq_len - x_i.shape[1]), value=0)) # B, L, H, D + x = torch.cat(x, dim=0) + del q, k, v + + else: + # Currently only PyTorch SDPA and xformers are implemented + raise ValueError(f"Unsupported attention mode: {attn_mode}") x = transpose_fn(x) # [B, L, H, D] x = x.reshape(x.shape[0], x.shape[1], -1) # [B, L, H*D] diff --git a/library/hunyuan_image_models.py b/library/hunyuan_image_models.py index 9e3a00e8b..ce2d23ddc 100644 --- a/library/hunyuan_image_models.py +++ b/library/hunyuan_image_models.py @@ -30,11 +30,7 @@ from library.hunyuan_image_utils import get_nd_rotary_pos_embed FP8_OPTIMIZATION_TARGET_KEYS = ["double_blocks", "single_blocks"] -FP8_OPTIMIZATION_EXCLUDE_KEYS = [ - "norm", - "_mod", - "modulation", -] +FP8_OPTIMIZATION_EXCLUDE_KEYS = ["norm", "_mod", "modulation", "_emb"] # region DiT Model @@ -142,6 +138,14 @@ def __init__(self, attn_mode: str = "torch"): self.num_double_blocks = len(self.double_blocks) self.num_single_blocks = len(self.single_blocks) + @property + def device(self): + return next(self.parameters()).device + + @property + def dtype(self): + return next(self.parameters()).dtype + def enable_gradient_checkpointing(self, cpu_offload: bool = False): self.gradient_checkpointing = True self.cpu_offload_checkpointing = cpu_offload @@ -273,6 +277,7 @@ def forward( encoder_attention_mask: torch.Tensor, byt5_text_states: Optional[torch.Tensor] = None, byt5_text_mask: Optional[torch.Tensor] = None, + rotary_pos_emb_cache: Optional[Dict[Tuple[int, int], Tuple[torch.Tensor, torch.Tensor]]] = None, ) -> torch.Tensor: """ Forward pass through the HunyuanImage diffusion transformer. 
@@ -296,7 +301,15 @@ def forward( # Calculate spatial dimensions for rotary position embeddings _, _, oh, ow = x.shape th, tw = oh, ow # Height and width (patch_size=[1,1] means no spatial downsampling) - freqs_cis = self.get_rotary_pos_embed((th, tw)) + if rotary_pos_emb_cache is not None: + if (th, tw) in rotary_pos_emb_cache: + freqs_cis = rotary_pos_emb_cache[(th, tw)] + freqs_cis = (freqs_cis[0].to(img.device), freqs_cis[1].to(img.device)) + else: + freqs_cis = self.get_rotary_pos_embed((th, tw)) + rotary_pos_emb_cache[(th, tw)] = (freqs_cis[0].cpu(), freqs_cis[1].cpu()) + else: + freqs_cis = self.get_rotary_pos_embed((th, tw)) # Reshape image latents to sequence format: [B, C, H, W] -> [B, H*W, C] img = self.img_in(img) @@ -349,9 +362,11 @@ def forward( vec = vec.to(input_device) img = x[:, :img_seq_len, ...] + del x # Apply final projection to output space img = self.final_layer(img, vec) + del vec # Reshape from sequence to spatial format: [B, L, C] -> [B, C, H, W] img = self.unpatchify_2d(img, th, tw) diff --git a/library/hunyuan_image_modules.py b/library/hunyuan_image_modules.py index 633cd310d..ef4d5e5d7 100644 --- a/library/hunyuan_image_modules.py +++ b/library/hunyuan_image_modules.py @@ -50,7 +50,7 @@ def forward(self, x): Returns: Transformed embeddings [..., out_dim1]. """ - residual = x + residual = x if self.use_residual else None x = self.layernorm(x) x = self.fc1(x) x = self.act_fn(x) @@ -411,6 +411,7 @@ def forward(self, x: torch.Tensor, t: torch.LongTensor, txt_lens: list[int]) -> context_aware_representations = self.c_embedder(context_aware_representations) c = timestep_aware_representations + context_aware_representations + del timestep_aware_representations, context_aware_representations x = self.input_embedder(x) x = self.individual_token_refiner(x, c, txt_lens) return x @@ -447,6 +448,7 @@ def __init__(self, hidden_size, patch_size, out_channels, act_layer): def forward(self, x, c): shift, scale = self.adaLN_modulation(c).chunk(2, dim=1) x = modulate(self.norm_final(x), shift=shift, scale=scale) + del shift, scale, c x = self.linear(x) return x @@ -494,6 +496,7 @@ def forward(self, x): Normalized and scaled tensor. 
""" output = self._norm(x.float()).type_as(x) + del x output = output * self.weight return output @@ -634,8 +637,10 @@ def _forward( # Process image stream for attention img_modulated = self.img_norm1(img) img_modulated = modulate(img_modulated, shift=img_mod1_shift, scale=img_mod1_scale) + del img_mod1_shift, img_mod1_scale img_qkv = self.img_attn_qkv(img_modulated) + del img_modulated img_q, img_k, img_v = img_qkv.chunk(3, dim=-1) del img_qkv @@ -649,17 +654,15 @@ def _forward( # Apply rotary position embeddings to image tokens if freqs_cis is not None: - img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False) - assert ( - img_qq.shape == img_q.shape and img_kk.shape == img_k.shape - ), f"RoPE output shape mismatch: got {img_qq.shape}, {img_kk.shape}, expected {img_q.shape}, {img_k.shape}" - img_q, img_k = img_qq, img_kk + img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False) + del freqs_cis # Process text stream for attention txt_modulated = self.txt_norm1(txt) txt_modulated = modulate(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale) txt_qkv = self.txt_attn_qkv(txt_modulated) + del txt_modulated txt_q, txt_k, txt_v = txt_qkv.chunk(3, dim=-1) del txt_qkv @@ -672,31 +675,44 @@ def _forward( txt_k = self.txt_attn_k_norm(txt_k).to(txt_v) # Concatenate image and text tokens for joint attention + img_seq_len = img.shape[1] q = torch.cat([img_q, txt_q], dim=1) + del img_q, txt_q k = torch.cat([img_k, txt_k], dim=1) + del img_k, txt_k v = torch.cat([img_v, txt_v], dim=1) - attn = attention(q, k, v, seq_lens=seq_lens, attn_mode=self.attn_mode) + del img_v, txt_v + + qkv = [q, k, v] + del q, k, v + attn = attention(qkv, seq_lens=seq_lens, attn_mode=self.attn_mode) + del qkv # Split attention outputs back to separate streams - img_attn, txt_attn = (attn[:, : img_q.shape[1]].contiguous(), attn[:, img_q.shape[1] :].contiguous()) + img_attn, txt_attn = (attn[:, : img_seq_len].contiguous(), attn[:, img_seq_len :].contiguous()) + del attn # Apply attention projection and residual connection for image stream img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate) + del img_attn, img_mod1_gate # Apply MLP and residual connection for image stream img = img + apply_gate( self.img_mlp(modulate(self.img_norm2(img), shift=img_mod2_shift, scale=img_mod2_scale)), gate=img_mod2_gate, ) + del img_mod2_shift, img_mod2_scale, img_mod2_gate # Apply attention projection and residual connection for text stream txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate) + del txt_attn, txt_mod1_gate # Apply MLP and residual connection for text stream txt = txt + apply_gate( self.txt_mlp(modulate(self.txt_norm2(txt), shift=txt_mod2_shift, scale=txt_mod2_scale)), gate=txt_mod2_gate, ) + del txt_mod2_shift, txt_mod2_scale, txt_mod2_gate return img, txt @@ -797,6 +813,7 @@ def _forward( # Compute Q, K, V, and MLP input qkv_mlp = self.linear1(x_mod) + del x_mod q, k, v, mlp = qkv_mlp.split([self.hidden_size, self.hidden_size, self.hidden_size, self.mlp_hidden_dim], dim=-1) del qkv_mlp @@ -810,27 +827,34 @@ def _forward( # Separate image and text tokens img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :] + del q img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :] - img_v, txt_v = v[:, :-txt_len, :, :], v[:, -txt_len:, :, :] + del k + # img_v, txt_v = v[:, :-txt_len, :, :], v[:, -txt_len:, :, :] + # del v # Apply rotary position embeddings only to image tokens - img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, 
head_first=False) - assert ( - img_qq.shape == img_q.shape and img_kk.shape == img_k.shape - ), f"RoPE output shape mismatch: got {img_qq.shape}, {img_kk.shape}, expected {img_q.shape}, {img_k.shape}" - img_q, img_k = img_qq, img_kk + img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False) + del freqs_cis # Recombine and compute joint attention q = torch.cat([img_q, txt_q], dim=1) + del img_q, txt_q k = torch.cat([img_k, txt_k], dim=1) - v = torch.cat([img_v, txt_v], dim=1) - attn = attention(q, k, v, seq_lens=seq_lens, attn_mode=self.attn_mode) + del img_k, txt_k + # v = torch.cat([img_v, txt_v], dim=1) + # del img_v, txt_v + qkv = [q, k, v] + del q, k, v + attn = attention(qkv, seq_lens=seq_lens, attn_mode=self.attn_mode) + del qkv # Combine attention and MLP outputs, apply gating # output = self.linear2(attn, self.mlp_act(mlp)) mlp = self.mlp_act(mlp) output = torch.cat([attn, mlp], dim=2).contiguous() + del attn, mlp output = self.linear2(output) return x + apply_gate(output, gate=mod_gate) diff --git a/library/hunyuan_image_text_encoder.py b/library/hunyuan_image_text_encoder.py index 1300b39b7..960f14b37 100644 --- a/library/hunyuan_image_text_encoder.py +++ b/library/hunyuan_image_text_encoder.py @@ -598,7 +598,7 @@ def get_byt5_prompt_embeds_from_tokens( ) -> Tuple[list[bool], torch.Tensor, torch.Tensor]: byt5_max_length = BYT5_MAX_LENGTH - if byt5_text_ids is None or byt5_text_mask is None: + if byt5_text_ids is None or byt5_text_mask is None or byt5_text_mask.sum() == 0: return ( [False], torch.zeros((1, byt5_max_length, 1472), device=text_encoder.device), diff --git a/library/hunyuan_image_vae.py b/library/hunyuan_image_vae.py index 6eb035c38..570d4caa6 100644 --- a/library/hunyuan_image_vae.py +++ b/library/hunyuan_image_vae.py @@ -17,6 +17,8 @@ VAE_SCALE_FACTOR = 32 # 32x spatial compression +LATENT_SCALING_FACTOR = 0.75289 # Latent scaling factor for Hunyuan Image-2.1 + def swish(x: Tensor) -> Tensor: """Swish activation function: x * sigmoid(x).""" @@ -378,7 +380,7 @@ def __init__(self): layers_per_block = 2 ffactor_spatial = 32 # 32x spatial compression sample_size = 384 # Minimum sample size for tiling - scaling_factor = 0.75289 # Latent scaling factor + scaling_factor = LATENT_SCALING_FACTOR # 0.75289 # Latent scaling factor self.ffactor_spatial = ffactor_spatial self.scaling_factor = scaling_factor diff --git a/library/strategy_hunyuan_image.py b/library/strategy_hunyuan_image.py index 2188ed371..5c704728f 100644 --- a/library/strategy_hunyuan_image.py +++ b/library/strategy_hunyuan_image.py @@ -21,14 +21,27 @@ def __init__(self, tokenizer_cache_dir: Optional[str] = None) -> None: Qwen2Tokenizer, hunyuan_image_text_encoder.QWEN_2_5_VL_IMAGE_ID, tokenizer_cache_dir=tokenizer_cache_dir ) self.byt5_tokenizer = self._load_tokenizer( - AutoTokenizer, hunyuan_image_text_encoder.BYT5_TOKENIZER_PATH, tokenizer_cache_dir=tokenizer_cache_dir + AutoTokenizer, hunyuan_image_text_encoder.BYT5_TOKENIZER_PATH, subfolder="", tokenizer_cache_dir=tokenizer_cache_dir ) def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]: text = [text] if isinstance(text, str) else text vlm_tokens, vlm_mask = hunyuan_image_text_encoder.get_qwen_tokens(self.vlm_tokenizer, text) - byt5_tokens, byt5_mask = hunyuan_image_text_encoder.get_byt5_text_tokens(self.byt5_tokenizer, text) + + # byt5_tokens, byt5_mask = hunyuan_image_text_encoder.get_byt5_text_tokens(self.byt5_tokenizer, text) + byt5_tokens = [] + byt5_mask = [] + for t in text: + tokens, mask = 
hunyuan_image_text_encoder.get_byt5_text_tokens(self.byt5_tokenizer, t) + if tokens is None: + tokens = torch.zeros((1, 1), dtype=torch.long) + mask = torch.zeros((1, 1), dtype=torch.long) + byt5_tokens.append(tokens) + byt5_mask.append(mask) + max_len = max([m.shape[1] for m in byt5_mask]) + byt5_tokens = torch.cat([torch.nn.functional.pad(t, (0, max_len - t.shape[1]), value=0) for t in byt5_tokens], dim=0) + byt5_mask = torch.cat([torch.nn.functional.pad(m, (0, max_len - m.shape[1]), value=0) for m in byt5_mask], dim=0) return [vlm_tokens, vlm_mask, byt5_tokens, byt5_mask] @@ -46,11 +59,24 @@ def encode_tokens( # autocast and no_grad are handled in hunyuan_image_text_encoder vlm_embed, vlm_mask = hunyuan_image_text_encoder.get_qwen_prompt_embeds_from_tokens(qwen2vlm, vlm_tokens, vlm_mask) - ocr_mask, byt5_embed, byt5_mask = hunyuan_image_text_encoder.get_byt5_prompt_embeds_from_tokens( - byt5, byt5_tokens, byt5_mask - ) - return [vlm_embed, vlm_mask, byt5_embed, byt5_mask, ocr_mask] + # ocr_mask, byt5_embed, byt5_mask = hunyuan_image_text_encoder.get_byt5_prompt_embeds_from_tokens( + # byt5, byt5_tokens, byt5_mask + # ) + ocr_mask, byt5_embed, byt5_updated_mask = [], [], [] + for i in range(byt5_tokens.shape[0]): + ocr_m, byt5_e, byt5_m = hunyuan_image_text_encoder.get_byt5_prompt_embeds_from_tokens( + byt5, byt5_tokens[i : i + 1], byt5_mask[i : i + 1] + ) + ocr_mask.append(torch.zeros((1,), dtype=torch.long) + (1 if ocr_m[0] else 0)) # 1 or 0 + byt5_embed.append(byt5_e) + byt5_updated_mask.append(byt5_m) + + ocr_mask = torch.cat(ocr_mask, dim=0).to(torch.bool) # [B] + byt5_embed = torch.cat(byt5_embed, dim=0) + byt5_updated_mask = torch.cat(byt5_updated_mask, dim=0) + + return [vlm_embed, vlm_mask, byt5_embed, byt5_updated_mask, ocr_mask] class HunyuanImageTextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy): @@ -110,7 +136,6 @@ def cache_batch_outputs( tokens_and_masks = tokenize_strategy.tokenize(captions) with torch.no_grad(): - # attn_mask is applied in text_encoding_strategy.encode_tokens if apply_t5_attn_mask is True vlm_embed, vlm_mask, byt5_embed, byt5_mask, ocr_mask = huyuan_image_text_encoding_strategy.encode_tokens( tokenize_strategy, models, tokens_and_masks ) @@ -124,7 +149,7 @@ def cache_batch_outputs( vlm_mask = vlm_mask.cpu().numpy() byt5_embed = byt5_embed.cpu().numpy() byt5_mask = byt5_mask.cpu().numpy() - ocr_mask = np.array(ocr_mask, dtype=bool) + ocr_mask = ocr_mask.cpu().numpy() for i, info in enumerate(infos): vlm_embed_i = vlm_embed[i] @@ -175,7 +200,13 @@ def load_latents_from_disk( def cache_batch_latents( self, vae: hunyuan_image_vae.HunyuanVAE2D, image_infos: List, flip_aug: bool, alpha_mask: bool, random_crop: bool ): - encode_by_vae = lambda img_tensor: vae.encode(img_tensor).sample() + # encode_by_vae = lambda img_tensor: vae.encode(img_tensor).sample() + def encode_by_vae(img_tensor): + # no_grad is handled in _default_cache_batch_latents + nonlocal vae + with torch.autocast(device_type=vae.device.type, dtype=vae.dtype): + return vae.encode(img_tensor).sample() + vae_device = vae.device vae_dtype = vae.dtype diff --git a/library/train_util.py b/library/train_util.py index 8cd43463c..756d88b1c 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1744,7 +1744,39 @@ def none_or_stack_elements(tensors_list, converter): # [[clip_l, clip_g, t5xxl], [clip_l, clip_g, t5xxl], ...] 
-> [torch.stack(clip_l), torch.stack(clip_g), torch.stack(t5xxl)] if len(tensors_list) == 0 or tensors_list[0] == None or len(tensors_list[0]) == 0 or tensors_list[0][0] is None: return None - return [torch.stack([converter(x[i]) for x in tensors_list]) for i in range(len(tensors_list[0]))] + + # old implementation without padding: all elements must have same length + # return [torch.stack([converter(x[i]) for x in tensors_list]) for i in range(len(tensors_list[0]))] + + # new implementation with padding support + result = [] + for i in range(len(tensors_list[0])): + tensors = [x[i] for x in tensors_list] + if tensors[0].ndim == 0: + # scalar value: e.g. ocr mask + result.append(torch.stack([converter(x[i]) for x in tensors_list])) + continue + + min_len = min([len(x) for x in tensors]) + max_len = max([len(x) for x in tensors]) + + if min_len == max_len: + # no padding + result.append(torch.stack([converter(x) for x in tensors])) + else: + # padding + tensors = [converter(x) for x in tensors] + if tensors[0].ndim == 1: + # input_ids or mask + result.append( + torch.stack([(torch.nn.functional.pad(x, (0, max_len - x.shape[0]))) for x in tensors]) + ) + else: + # text encoder outputs + result.append( + torch.stack([(torch.nn.functional.pad(x, (0, 0, 0, max_len - x.shape[0]))) for x in tensors]) + ) + return result # set example example = {} diff --git a/networks/lora_hunyuan_image.py b/networks/lora_hunyuan_image.py index b0edde575..3e801f950 100644 --- a/networks/lora_hunyuan_image.py +++ b/networks/lora_hunyuan_image.py @@ -191,9 +191,8 @@ def create_network_from_weights(multiplier, file, ae, text_encoders, flux, weigh class HunyuanImageLoRANetwork(lora_flux.LoRANetwork): - # FLUX_TARGET_REPLACE_MODULE = ["DoubleStreamBlock", "SingleStreamBlock"] - FLUX_TARGET_REPLACE_MODULE_DOUBLE = ["DoubleStreamBlock"] - FLUX_TARGET_REPLACE_MODULE_SINGLE = ["SingleStreamBlock"] + TARGET_REPLACE_MODULE_DOUBLE = ["MMDoubleStreamBlock"] + TARGET_REPLACE_MODULE_SINGLE = ["MMSingleStreamBlock"] LORA_PREFIX_HUNYUAN_IMAGE_DIT = "lora_unet" # make ComfyUI compatible @classmethod @@ -222,7 +221,7 @@ def __init__( reg_lrs: Optional[Dict[str, float]] = None, verbose: Optional[bool] = False, ) -> None: - super().__init__() + nn.Module.__init__(self) self.multiplier = multiplier self.lora_dim = lora_dim @@ -259,8 +258,6 @@ def __init__( if self.split_qkv: logger.info(f"split qkv for LoRA") - if self.train_blocks is not None: - logger.info(f"train {self.train_blocks} blocks only") # create module instances def create_modules( @@ -354,14 +351,14 @@ def create_modules( # create LoRA for U-Net target_replace_modules = ( - HunyuanImageLoRANetwork.FLUX_TARGET_REPLACE_MODULE_DOUBLE + HunyuanImageLoRANetwork.FLUX_TARGET_REPLACE_MODULE_SINGLE + HunyuanImageLoRANetwork.TARGET_REPLACE_MODULE_DOUBLE + HunyuanImageLoRANetwork.TARGET_REPLACE_MODULE_SINGLE ) self.unet_loras: List[Union[lora_flux.LoRAModule, lora_flux.LoRAInfModule]] self.unet_loras, skipped_un = create_modules(True, None, unet, target_replace_modules) self.text_encoder_loras = [] - logger.info(f"create LoRA for FLUX {self.train_blocks} blocks: {len(self.unet_loras)} modules.") + logger.info(f"create LoRA for HunyuanImage-2.1: {len(self.unet_loras)} modules.") if verbose: for lora in self.unet_loras: logger.info(f"\t{lora.lora_name:50} {lora.lora_dim}, {lora.alpha}") diff --git a/train_network.py b/train_network.py index 00118877b..c03c5fa09 100644 --- a/train_network.py +++ b/train_network.py @@ -1,3 +1,4 @@ +import gc import importlib import argparse import math 
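The `none_or_stack_elements` change in `library/train_util.py` above pads variable-length per-sample text encoder outputs to a common length before stacking them into a batch. Below is a minimal, standalone sketch of that padding path; the shapes are illustrative only (1472 is the byT5 embedding width used elsewhere in this patch):

```python
import torch
import torch.nn.functional as F

# Two per-sample byT5 embeddings with different token counts (illustrative shapes).
sample_a = torch.randn(3, 1472)  # 3 tokens
sample_b = torch.randn(5, 1472)  # 5 tokens
tensors = [sample_a, sample_b]

max_len = max(t.shape[0] for t in tensors)
# 2D text encoder outputs: right-pad the sequence dimension only, as in the new code path.
padded = [F.pad(t, (0, 0, 0, max_len - t.shape[0])) for t in tensors]
batch = torch.stack(padded)
print(batch.shape)  # torch.Size([2, 5, 1472])
```

1D tensors such as input ids or masks follow the same idea with a single-sided pad, and scalar entries such as the per-sample OCR flag are stacked directly.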
@@ -10,11 +11,11 @@ import json from multiprocessing import Value import numpy as np -import toml from tqdm import tqdm import torch +import torch.nn as nn from torch.types import Number from library.device_utils import init_ipex, clean_memory_on_device @@ -175,7 +176,7 @@ def assert_extra_args( if val_dataset_group is not None: val_dataset_group.verify_bucket_reso_steps(64) - def load_target_model(self, args, weight_dtype, accelerator) -> tuple: + def load_target_model(self, args, weight_dtype, accelerator) -> tuple[str, nn.Module, nn.Module, Optional[nn.Module]]: text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) # モデルに xformers とか memory efficient attention を組み込む @@ -185,6 +186,9 @@ def load_target_model(self, args, weight_dtype, accelerator) -> tuple: return model_util.get_model_version_str_for_sd1_sd2(args.v2, args.v_parameterization), text_encoder, vae, unet + def load_unet_lazily(self, args, weight_dtype, accelerator, text_encoders) -> tuple[nn.Module, List[nn.Module]]: + raise NotImplementedError() + def get_tokenize_strategy(self, args): return strategy_sd.SdTokenizeStrategy(args.v2, args.max_token_length, args.tokenizer_cache_dir) @@ -476,8 +480,11 @@ def process_batch( return loss.mean() def cast_text_encoder(self): - return True # default for other than HunyuanImage + return True # default for other than HunyuanImage + def cast_vae(self): + return True # default for other than HunyuanImage + def train(self, args): session_id = random.randint(0, 2**32) training_started_at = time.time() @@ -586,37 +593,18 @@ def train(self, args): # mixed precisionに対応した型を用意しておき適宜castする weight_dtype, save_dtype = train_util.prepare_dtype(args) - vae_dtype = torch.float32 if args.no_half_vae else weight_dtype + vae_dtype = (torch.float32 if args.no_half_vae else weight_dtype) if self.cast_vae() else None - # モデルを読み込む + # load target models: unet may be None for lazy loading model_version, text_encoder, vae, unet = self.load_target_model(args, weight_dtype, accelerator) + if vae_dtype is None: + vae_dtype = vae.dtype + logger.info(f"vae_dtype is set to {vae_dtype} by the model since cast_vae() is false") # text_encoder is List[CLIPTextModel] or CLIPTextModel text_encoders = text_encoder if isinstance(text_encoder, list) else [text_encoder] - # 差分追加学習のためにモデルを読み込む - sys.path.append(os.path.dirname(__file__)) - accelerator.print("import network module:", args.network_module) - network_module = importlib.import_module(args.network_module) - - if args.base_weights is not None: - # base_weights が指定されている場合は、指定された重みを読み込みマージする - for i, weight_path in enumerate(args.base_weights): - if args.base_weights_multiplier is None or len(args.base_weights_multiplier) <= i: - multiplier = 1.0 - else: - multiplier = args.base_weights_multiplier[i] - - accelerator.print(f"merging module: {weight_path} with multiplier {multiplier}") - - module, weights_sd = network_module.create_network_from_weights( - multiplier, weight_path, vae, text_encoder, unet, for_inference=True - ) - module.merge_to(text_encoder, unet, weights_sd, weight_dtype, accelerator.device if args.lowram else "cpu") - - accelerator.print(f"all weights merged: {', '.join(args.base_weights)}") - - # 学習を準備する + # prepare dataset for latents caching if needed if cache_latents: vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) @@ -643,6 +631,32 @@ def train(self, args): if val_dataset_group is not None: self.cache_text_encoder_outputs_if_needed(args, accelerator, unet, vae, text_encoders, 
val_dataset_group, weight_dtype) + if unet is None: + # lazy load unet if needed. text encoders may be freed or replaced with dummy models for saving memory + unet, text_encoders = self.load_unet_lazily(args, weight_dtype, accelerator, text_encoders) + + # 差分追加学習のためにモデルを読み込む + sys.path.append(os.path.dirname(__file__)) + accelerator.print("import network module:", args.network_module) + network_module = importlib.import_module(args.network_module) + + if args.base_weights is not None: + # base_weights が指定されている場合は、指定された重みを読み込みマージする + for i, weight_path in enumerate(args.base_weights): + if args.base_weights_multiplier is None or len(args.base_weights_multiplier) <= i: + multiplier = 1.0 + else: + multiplier = args.base_weights_multiplier[i] + + accelerator.print(f"merging module: {weight_path} with multiplier {multiplier}") + + module, weights_sd = network_module.create_network_from_weights( + multiplier, weight_path, vae, text_encoder, unet, for_inference=True + ) + module.merge_to(text_encoder, unet, weights_sd, weight_dtype, accelerator.device if args.lowram else "cpu") + + accelerator.print(f"all weights merged: {', '.join(args.base_weights)}") + # prepare network net_kwargs = {} if args.network_args is not None: @@ -672,7 +686,7 @@ def train(self, args): return network_has_multiplier = hasattr(network, "set_multiplier") - # TODO remove `hasattr`s by setting up methods if not defined in the network like (hacky but works): + # TODO remove `hasattr` by setting up methods if not defined in the network like below (hacky but will work): # if not hasattr(network, "prepare_network"): # network.prepare_network = lambda args: None @@ -1305,6 +1319,8 @@ def remove_model(old_ckpt_name): del t_enc text_encoders = [] text_encoder = None + gc.collect() + clean_memory_on_device(accelerator.device) # For --sample_at_first optimizer_eval_fn() From 7a651efd4dab281acf8dc66200ade8620c5138dd Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Fri, 12 Sep 2025 22:00:41 +0900 Subject: [PATCH 550/582] feat: add 'tak' to recognized words and update block swap method to support backward pass --- _typos.toml | 1 + hunyuan_image_train_network.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/_typos.toml b/_typos.toml index 362ba8a60..bf0292e50 100644 --- a/_typos.toml +++ b/_typos.toml @@ -31,6 +31,7 @@ wn="wn" hime="hime" OT="OT" byt="byt" +tak="tak" # [files] # # Extend the default list of files to check diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index 291d5132f..40c1f2fe9 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -394,7 +394,7 @@ def load_unet_lazily(self, args, weight_dtype, accelerator, text_encoders) -> tu if self.is_swapping_blocks: # Swap blocks between CPU and GPU to reduce memory usage, in forward and backward passes. 
logger.info(f"enable block swap: blocks_to_swap={args.blocks_to_swap}") - model.enable_block_swap(args.blocks_to_swap, accelerator.device) + model.enable_block_swap(args.blocks_to_swap, accelerator.device, supports_backward=True) return model, text_encoders From 9a61d61b22e942f3ed8101550470c2029d4204c2 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Fri, 12 Sep 2025 22:18:29 +0900 Subject: [PATCH 551/582] feat: avoid unet type casting when fp8_scaled --- hunyuan_image_train_network.py | 9 +++++++-- train_network.py | 19 ++++++++++++------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index 40c1f2fe9..7167ce4c2 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -325,6 +325,8 @@ def assert_extra_args( logger.info( "fp8_scaled is used, so fp8_base and fp8_base_unet are ignored / fp8_scaledが使われているので、fp8_baseとfp8_base_unetは無視されます" ) + args.fp8_base = False + args.fp8_base_unet = False if args.cache_text_encoder_outputs_to_disk and not args.cache_text_encoder_outputs: logger.warning( @@ -585,12 +587,15 @@ def prepare_text_encoder_grad_ckpt_workaround(self, index, text_encoder): # do not support text encoder training for HunyuanImage-2.1 pass - def cast_text_encoder(self): + def cast_text_encoder(self, args): return False # VLM is bf16, byT5 is fp16, so do not cast to other dtype - def cast_vae(self): + def cast_vae(self, args): return False # VAE is fp16, so do not cast to other dtype + def cast_unet(self, args): + return not args.fp8_scaled # if fp8_scaled is used, do not cast to other dtype + def prepare_text_encoder_fp8(self, index, text_encoder, te_weight_dtype, weight_dtype): # fp8 text encoder for HunyuanImage-2.1 is not supported currently pass diff --git a/train_network.py b/train_network.py index c03c5fa09..6cebf5fc7 100644 --- a/train_network.py +++ b/train_network.py @@ -479,10 +479,13 @@ def process_batch( return loss.mean() - def cast_text_encoder(self): + def cast_text_encoder(self, args): return True # default for other than HunyuanImage - - def cast_vae(self): + + def cast_vae(self, args): + return True # default for other than HunyuanImage + + def cast_unet(self, args): return True # default for other than HunyuanImage def train(self, args): @@ -593,7 +596,7 @@ def train(self, args): # mixed precisionに対応した型を用意しておき適宜castする weight_dtype, save_dtype = train_util.prepare_dtype(args) - vae_dtype = (torch.float32 if args.no_half_vae else weight_dtype) if self.cast_vae() else None + vae_dtype = (torch.float32 if args.no_half_vae else weight_dtype) if self.cast_vae(args) else None # load target models: unet may be None for lazy loading model_version, text_encoder, vae, unet = self.load_target_model(args, weight_dtype, accelerator) @@ -844,12 +847,13 @@ def train(self, args): unet.to(dtype=unet_weight_dtype) # do not move to device because unet is not prepared by accelerator unet.requires_grad_(False) - unet.to(dtype=unet_weight_dtype) + if self.cast_unet(args): + unet.to(dtype=unet_weight_dtype) for i, t_enc in enumerate(text_encoders): t_enc.requires_grad_(False) # in case of cpu, dtype is already set to fp32 because cpu does not support fp8/fp16/bf16 - if t_enc.device.type != "cpu" and self.cast_text_encoder(): + if t_enc.device.type != "cpu" and self.cast_text_encoder(args): t_enc.to(dtype=te_weight_dtype) # nn.Embedding not support FP8 @@ -875,7 +879,8 @@ def train(self, args): # default implementation is: unet = 
accelerator.prepare(unet) unet = self.prepare_unet_with_accelerator(args, accelerator, unet) # accelerator does some magic here else: - unet.to(accelerator.device, dtype=unet_weight_dtype) # move to device because unet is not prepared by accelerator + # move to device because unet is not prepared by accelerator + unet.to(accelerator.device, dtype=unet_weight_dtype if self.cast_unet(args) else None) if train_text_encoder: text_encoders = [ (accelerator.prepare(t_enc) if flag else t_enc) From 8783f8aed395e82678e0f7a48b0415b95e819484 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 13 Sep 2025 19:51:38 +0900 Subject: [PATCH 552/582] feat: faster safetensors load and split safetensor utils --- flux_minimal_inference.py | 8 +- library/custom_offloading_utils.py | 37 ++- library/device_utils.py | 22 +- library/flux_train_utils.py | 3 +- library/flux_utils.py | 4 +- library/lumina_train_util.py | 3 +- library/lumina_util.py | 2 +- library/safetensors_utils.py | 352 ++++++++++++++++++++++++++ library/sd3_utils.py | 4 +- library/utils.py | 221 ++-------------- networks/flux_merge_lora.py | 3 +- sd3_minimal_inference.py | 2 +- sd3_train.py | 5 +- sd3_train_network.py | 3 +- tests/test_custom_offloading_utils.py | 18 +- tools/convert_diffusers_to_flux.py | 3 +- tools/merge_sd3_safetensors.py | 3 +- 17 files changed, 459 insertions(+), 234 deletions(-) create mode 100644 library/safetensors_utils.py diff --git a/flux_minimal_inference.py b/flux_minimal_inference.py index d5f2d8d98..0664b3c78 100644 --- a/flux_minimal_inference.py +++ b/flux_minimal_inference.py @@ -456,13 +456,13 @@ def is_fp8(dt): # load clip_l (skip for chroma model) if args.model_type == "flux": logger.info(f"Loading clip_l from {args.clip_l}...") - clip_l = flux_utils.load_clip_l(args.clip_l, clip_l_dtype, loading_device) + clip_l = flux_utils.load_clip_l(args.clip_l, clip_l_dtype, loading_device, disable_mmap=True) clip_l.eval() else: clip_l = None logger.info(f"Loading t5xxl from {args.t5xxl}...") - t5xxl = flux_utils.load_t5xxl(args.t5xxl, t5xxl_dtype, loading_device) + t5xxl = flux_utils.load_t5xxl(args.t5xxl, t5xxl_dtype, loading_device, disable_mmap=True) t5xxl.eval() # if is_fp8(clip_l_dtype): @@ -471,7 +471,9 @@ def is_fp8(dt): # t5xxl = accelerator.prepare(t5xxl) # DiT - is_schnell, model = flux_utils.load_flow_model(args.ckpt_path, None, loading_device, model_type=args.model_type) + is_schnell, model = flux_utils.load_flow_model( + args.ckpt_path, None, loading_device, disable_mmap=True, model_type=args.model_type + ) model.eval() logger.info(f"Casting model to {flux_dtype}") model.to(flux_dtype) # make sure model is dtype diff --git a/library/custom_offloading_utils.py b/library/custom_offloading_utils.py index 55ff08b64..fce3747e5 100644 --- a/library/custom_offloading_utils.py +++ b/library/custom_offloading_utils.py @@ -1,13 +1,28 @@ from concurrent.futures import ThreadPoolExecutor +import gc import time from typing import Optional, Union, Callable, Tuple import torch import torch.nn as nn -from library.device_utils import clean_memory_on_device +# Keep these functions here for portability, and private to avoid confusion with the ones in device_utils.py +def _clean_memory_on_device(device: torch.device): + r""" + Clean memory on the specified device, will be called from training scripts. 
+ """ + gc.collect() + + # device may "cuda" or "cuda:0", so we need to check the type of device + if device.type == "cuda": + torch.cuda.empty_cache() + if device.type == "xpu": + torch.xpu.empty_cache() + if device.type == "mps": + torch.mps.empty_cache() -def synchronize_device(device: torch.device): + +def _synchronize_device(device: torch.device): if device.type == "cuda": torch.cuda.synchronize() elif device.type == "xpu": @@ -71,19 +86,18 @@ def swap_weight_devices_no_cuda(device: torch.device, layer_to_cpu: nn.Module, l if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None: weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data)) - # device to cpu for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True) - synchronize_device(device) + _synchronize_device(device) # cpu to device for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs: cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True) module_to_cuda.weight.data = cuda_data_view - synchronize_device(device) + _synchronize_device(device) def weighs_to_device(layer: nn.Module, device: torch.device): @@ -152,12 +166,15 @@ def _wait_blocks_move(self, block_idx): # Gradient tensors _grad_t = Union[tuple[torch.Tensor, ...], torch.Tensor] + class ModelOffloader(Offloader): """ supports forward offloading """ - def __init__(self, blocks: Union[list[nn.Module], nn.ModuleList], blocks_to_swap: int, device: torch.device, debug: bool = False): + def __init__( + self, blocks: Union[list[nn.Module], nn.ModuleList], blocks_to_swap: int, device: torch.device, debug: bool = False + ): super().__init__(len(blocks), blocks_to_swap, device, debug) # register backward hooks @@ -172,7 +189,9 @@ def __del__(self): for handle in self.remove_handles: handle.remove() - def create_backward_hook(self, blocks: Union[list[nn.Module], nn.ModuleList], block_index: int) -> Optional[Callable[[nn.Module, _grad_t, _grad_t], Union[None, _grad_t]]]: + def create_backward_hook( + self, blocks: Union[list[nn.Module], nn.ModuleList], block_index: int + ) -> Optional[Callable[[nn.Module, _grad_t, _grad_t], Union[None, _grad_t]]]: # -1 for 0-based index num_blocks_propagated = self.num_blocks - block_index - 1 swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap @@ -213,8 +232,8 @@ def prepare_block_devices_before_forward(self, blocks: Union[list[nn.Module], nn b.to(self.device) # move block to device first weighs_to_device(b, torch.device("cpu")) # make sure weights are on cpu - synchronize_device(self.device) - clean_memory_on_device(self.device) + _synchronize_device(self.device) + _clean_memory_on_device(self.device) def wait_for_block(self, block_idx: int): if self.blocks_to_swap is None or self.blocks_to_swap == 0: diff --git a/library/device_utils.py b/library/device_utils.py index d2e197450..9d7757ed1 100644 --- a/library/device_utils.py +++ b/library/device_utils.py @@ -1,7 +1,9 @@ import functools import gc +from typing import Optional, Union import torch + try: # intel gpu support for pytorch older than 2.5 # ipex is not needed after pytorch 2.5 @@ -36,12 +38,15 @@ def clean_memory(): torch.mps.empty_cache() -def clean_memory_on_device(device: torch.device): +def clean_memory_on_device(device: Optional[Union[str, torch.device]]): r""" Clean memory on the specified device, will be called from training scripts. 
""" gc.collect() - + if device is None: + return + if isinstance(device, str): + device = torch.device(device) # device may "cuda" or "cuda:0", so we need to check the type of device if device.type == "cuda": torch.cuda.empty_cache() @@ -51,6 +56,19 @@ def clean_memory_on_device(device: torch.device): torch.mps.empty_cache() +def synchronize_device(device: Optional[Union[str, torch.device]]): + if device is None: + return + if isinstance(device, str): + device = torch.device(device) + if device.type == "cuda": + torch.cuda.synchronize() + elif device.type == "xpu": + torch.xpu.synchronize() + elif device.type == "mps": + torch.mps.synchronize() + + @functools.lru_cache(maxsize=None) def get_preferred_device() -> torch.device: r""" diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py index f3eb81992..06fe0b953 100644 --- a/library/flux_train_utils.py +++ b/library/flux_train_utils.py @@ -16,10 +16,11 @@ from library import flux_models, flux_utils, strategy_base, train_util from library.device_utils import init_ipex, clean_memory_on_device +from library.safetensors_utils import mem_eff_save_file init_ipex() -from .utils import setup_logging, mem_eff_save_file +from .utils import setup_logging setup_logging() import logging diff --git a/library/flux_utils.py b/library/flux_utils.py index 220548547..410b34ce2 100644 --- a/library/flux_utils.py +++ b/library/flux_utils.py @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) from library import flux_models -from library.utils import load_safetensors +from library.safetensors_utils import load_safetensors MODEL_VERSION_FLUX_V1 = "flux1" MODEL_NAME_DEV = "dev" @@ -124,7 +124,7 @@ def load_flow_model( logger.info(f"Loading state dict from {ckpt_path}") sd = {} for ckpt_path in ckpt_paths: - sd.update(load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)) + sd.update(load_safetensors(ckpt_path, device=device, disable_mmap=disable_mmap, dtype=dtype)) # convert Diffusers to BFL if is_diffusers: diff --git a/library/lumina_train_util.py b/library/lumina_train_util.py index 0645a8ae0..d5d5db05f 100644 --- a/library/lumina_train_util.py +++ b/library/lumina_train_util.py @@ -18,10 +18,11 @@ from library.flux_models import AutoEncoder from library.device_utils import init_ipex, clean_memory_on_device from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler +from library.safetensors_utils import mem_eff_save_file init_ipex() -from .utils import setup_logging, mem_eff_save_file +from .utils import setup_logging setup_logging() import logging diff --git a/library/lumina_util.py b/library/lumina_util.py index 87853ef62..f7f3c8231 100644 --- a/library/lumina_util.py +++ b/library/lumina_util.py @@ -12,7 +12,7 @@ from library.utils import setup_logging from library import lumina_models, flux_models -from library.utils import load_safetensors +from library.safetensors_utils import load_safetensors import logging setup_logging() diff --git a/library/safetensors_utils.py b/library/safetensors_utils.py new file mode 100644 index 000000000..dcd2309e1 --- /dev/null +++ b/library/safetensors_utils.py @@ -0,0 +1,352 @@ +import os +import re +import numpy as np +import torch +import json +import struct +from typing import Dict, Any, Union, Optional + +from safetensors.torch import load_file + +from library.device_utils import synchronize_device + + +def mem_eff_save_file(tensors: Dict[str, torch.Tensor], filename: str, metadata: Dict[str, Any] = None): + """ + memory efficient save file + """ + + _TYPES = 
{ + torch.float64: "F64", + torch.float32: "F32", + torch.float16: "F16", + torch.bfloat16: "BF16", + torch.int64: "I64", + torch.int32: "I32", + torch.int16: "I16", + torch.int8: "I8", + torch.uint8: "U8", + torch.bool: "BOOL", + getattr(torch, "float8_e5m2", None): "F8_E5M2", + getattr(torch, "float8_e4m3fn", None): "F8_E4M3", + } + _ALIGN = 256 + + def validate_metadata(metadata: Dict[str, Any]) -> Dict[str, str]: + validated = {} + for key, value in metadata.items(): + if not isinstance(key, str): + raise ValueError(f"Metadata key must be a string, got {type(key)}") + if not isinstance(value, str): + print(f"Warning: Metadata value for key '{key}' is not a string. Converting to string.") + validated[key] = str(value) + else: + validated[key] = value + return validated + + # print(f"Using memory efficient save file: {filename}") + + header = {} + offset = 0 + if metadata: + header["__metadata__"] = validate_metadata(metadata) + for k, v in tensors.items(): + if v.numel() == 0: # empty tensor + header[k] = {"dtype": _TYPES[v.dtype], "shape": list(v.shape), "data_offsets": [offset, offset]} + else: + size = v.numel() * v.element_size() + header[k] = {"dtype": _TYPES[v.dtype], "shape": list(v.shape), "data_offsets": [offset, offset + size]} + offset += size + + hjson = json.dumps(header).encode("utf-8") + hjson += b" " * (-(len(hjson) + 8) % _ALIGN) + + with open(filename, "wb") as f: + f.write(struct.pack(" Dict[str, str]: + """Get metadata from the file. + + Returns: + Dict[str, str]: Metadata dictionary. + """ + return self.header.get("__metadata__", {}) + + def _read_header(self): + """Read and parse the header from the safetensors file. + + Returns: + tuple: (header_dict, header_size) containing parsed header and its size. + """ + # Read header size (8 bytes, little-endian unsigned long long) + header_size = struct.unpack("10MB) and the target device is CUDA, memory mapping with numpy.memmap is used to avoid intermediate copies. + + Args: + key (str): Name of the tensor to load. + device (Optional[torch.device]): Target device for the tensor. + dtype (Optional[torch.dtype]): Target dtype for the tensor. + + Returns: + torch.Tensor: The loaded tensor. + + Raises: + KeyError: If the tensor key is not found in the file. + """ + if key not in self.header: + raise KeyError(f"Tensor '{key}' not found in the file") + + metadata = self.header[key] + offset_start, offset_end = metadata["data_offsets"] + num_bytes = offset_end - offset_start + + original_dtype = self._get_torch_dtype(metadata["dtype"]) + target_dtype = dtype if dtype is not None else original_dtype + + # Handle empty tensors + if num_bytes == 0: + return torch.empty(metadata["shape"], dtype=target_dtype, device=device) + + # Determine if we should use pinned memory for GPU transfer + non_blocking = device is not None and device.type == "cuda" + + # Calculate absolute file offset + tensor_offset = self.header_size + 8 + offset_start # adjust offset by header size + + # Memory mapping strategy for large tensors to GPU + # Use memmap for large tensors to avoid intermediate copies. + # If device is cpu, tensor is not copied to gpu, so using memmap locks the file, which is not desired. + # So we only use memmap if device is not cpu. 
+ if num_bytes > 10 * 1024 * 1024 and device is not None and device.type != "cpu": + # Create memory map for zero-copy reading + mm = np.memmap(self.filename, mode="c", dtype=np.uint8, offset=tensor_offset, shape=(num_bytes,)) + byte_tensor = torch.from_numpy(mm) # zero copy + del mm + + # Deserialize tensor (view and reshape) + cpu_tensor = self._deserialize_tensor(byte_tensor, metadata) # view and reshape + del byte_tensor + + # Transfer to target device and dtype + gpu_tensor = cpu_tensor.to(device=device, dtype=target_dtype, non_blocking=non_blocking) + del cpu_tensor + return gpu_tensor + + # Standard file reading strategy for smaller tensors or CPU target + # seek to the specified position + self.file.seek(tensor_offset) + + # read directly into a numpy array by numpy.fromfile without intermediate copy + numpy_array = np.fromfile(self.file, dtype=np.uint8, count=num_bytes) + byte_tensor = torch.from_numpy(numpy_array) + del numpy_array + + # deserialize (view and reshape) + deserialized_tensor = self._deserialize_tensor(byte_tensor, metadata) + del byte_tensor + + # cast to target dtype and move to device + return deserialized_tensor.to(device=device, dtype=target_dtype, non_blocking=non_blocking) + + def _deserialize_tensor(self, byte_tensor: torch.Tensor, metadata: Dict): + """Deserialize byte tensor to the correct shape and dtype. + + Args: + byte_tensor (torch.Tensor): Raw byte tensor from file. + metadata (Dict): Tensor metadata containing dtype and shape info. + + Returns: + torch.Tensor: Deserialized tensor with correct shape and dtype. + """ + dtype = self._get_torch_dtype(metadata["dtype"]) + shape = metadata["shape"] + + # Handle special float8 types + if metadata["dtype"] in ["F8_E5M2", "F8_E4M3"]: + return self._convert_float8(byte_tensor, metadata["dtype"], shape) + + # Standard conversion: view as target dtype and reshape + return byte_tensor.view(dtype).reshape(shape) + + @staticmethod + def _get_torch_dtype(dtype_str): + """Convert string dtype to PyTorch dtype. + + Args: + dtype_str (str): String representation of the dtype. + + Returns: + torch.dtype: Corresponding PyTorch dtype. + """ + # Standard dtype mappings + dtype_map = { + "F64": torch.float64, + "F32": torch.float32, + "F16": torch.float16, + "BF16": torch.bfloat16, + "I64": torch.int64, + "I32": torch.int32, + "I16": torch.int16, + "I8": torch.int8, + "U8": torch.uint8, + "BOOL": torch.bool, + } + # Add float8 types if available in PyTorch version + if hasattr(torch, "float8_e5m2"): + dtype_map["F8_E5M2"] = torch.float8_e5m2 + if hasattr(torch, "float8_e4m3fn"): + dtype_map["F8_E4M3"] = torch.float8_e4m3fn + return dtype_map.get(dtype_str) + + @staticmethod + def _convert_float8(byte_tensor, dtype_str, shape): + """Convert byte tensor to float8 format if supported. + + Args: + byte_tensor (torch.Tensor): Raw byte tensor. + dtype_str (str): Float8 dtype string ("F8_E5M2" or "F8_E4M3"). + shape (tuple): Target tensor shape. + + Returns: + torch.Tensor: Tensor with float8 dtype. + + Raises: + ValueError: If float8 type is not supported in current PyTorch version. 
+ """ + # Convert to specific float8 types if available + if dtype_str == "F8_E5M2" and hasattr(torch, "float8_e5m2"): + return byte_tensor.view(torch.float8_e5m2).reshape(shape) + elif dtype_str == "F8_E4M3" and hasattr(torch, "float8_e4m3fn"): + return byte_tensor.view(torch.float8_e4m3fn).reshape(shape) + else: + # Float8 not supported in this PyTorch version + raise ValueError(f"Unsupported float8 type: {dtype_str} (upgrade PyTorch to support float8 types)") + + +def load_safetensors( + path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = None +) -> dict[str, torch.Tensor]: + if disable_mmap: + # return safetensors.torch.load(open(path, "rb").read()) + # use experimental loader + # logger.info(f"Loading without mmap (experimental)") + state_dict = {} + device = torch.device(device) if device is not None else None + with MemoryEfficientSafeOpen(path) as f: + for key in f.keys(): + state_dict[key] = f.get_tensor(key, device=device, dtype=dtype) + synchronize_device(device) + return state_dict + else: + try: + state_dict = load_file(path, device=device) + except: + state_dict = load_file(path) # prevent device invalid Error + if dtype is not None: + for key in state_dict.keys(): + state_dict[key] = state_dict[key].to(dtype=dtype) + return state_dict + + +def load_split_weights( + file_path: str, device: Union[str, torch.device] = "cpu", disable_mmap: bool = False, dtype: Optional[torch.dtype] = None +) -> Dict[str, torch.Tensor]: + """ + Load split weights from a file. If the file name ends with 00001-of-00004 etc, it will load all files with the same prefix. + dtype is as is, no conversion is done. + """ + device = torch.device(device) + + # if the file name ends with 00001-of-00004 etc, we need to load the files with the same prefix + basename = os.path.basename(file_path) + match = re.match(r"^(.*?)(\d+)-of-(\d+)\.safetensors$", basename) + if match: + prefix = basename[: match.start(2)] + count = int(match.group(3)) + state_dict = {} + for i in range(count): + filename = f"{prefix}{i + 1:05d}-of-{count:05d}.safetensors" + filepath = os.path.join(os.path.dirname(file_path), filename) + if os.path.exists(filepath): + state_dict.update(load_safetensors(filepath, device=device, disable_mmap=disable_mmap, dtype=dtype)) + else: + raise FileNotFoundError(f"File {filepath} not found") + else: + state_dict = load_safetensors(file_path, device=device, disable_mmap=disable_mmap, dtype=dtype) + return state_dict + + +def find_key(safetensors_file: str, starts_with: Optional[str] = None, ends_with: Optional[str] = None) -> Optional[str]: + """ + Find a key in a safetensors file that starts with `starts_with` and ends with `ends_with`. + If `starts_with` is None, it will match any key. + If `ends_with` is None, it will match any key. + Returns the first matching key or None if no key matches. 
+ """ + with MemoryEfficientSafeOpen(safetensors_file) as f: + for key in f.keys(): + if (starts_with is None or key.startswith(starts_with)) and (ends_with is None or key.endswith(ends_with)): + return key + return None diff --git a/library/sd3_utils.py b/library/sd3_utils.py index d2ea6fffe..5fbaa4c3e 100644 --- a/library/sd3_utils.py +++ b/library/sd3_utils.py @@ -23,7 +23,7 @@ # region models # TODO remove dependency on flux_utils -from library.utils import load_safetensors +from library.safetensors_utils import load_safetensors from library.flux_utils import load_t5xxl as flux_utils_load_t5xxl @@ -246,7 +246,7 @@ def load_vae( vae_sd = {} if vae_path: logger.info(f"Loading VAE from {vae_path}...") - vae_sd = load_safetensors(vae_path, device, disable_mmap) + vae_sd = load_safetensors(vae_path, device, disable_mmap, dtype=vae_dtype) else: # remove prefix "first_stage_model." vae_sd = {} diff --git a/library/utils.py b/library/utils.py index d0586b84a..296fc4151 100644 --- a/library/utils.py +++ b/library/utils.py @@ -2,8 +2,6 @@ import sys import threading from typing import * -import json -import struct import torch import torch.nn as nn @@ -14,7 +12,7 @@ import cv2 from PIL import Image import numpy as np -from safetensors.torch import load_file + def fire_in_thread(f, *args, **kwargs): threading.Thread(target=f, args=args, kwargs=kwargs).start() @@ -88,6 +86,7 @@ def setup_logging(args=None, log_level=None, reset=False): logger = logging.getLogger(__name__) logger.info(msg_init) + setup_logging() logger = logging.getLogger(__name__) @@ -190,190 +189,6 @@ def str_to_dtype(s: Optional[str], default_dtype: Optional[torch.dtype] = None) raise ValueError(f"Unsupported dtype: {s}") -def mem_eff_save_file(tensors: Dict[str, torch.Tensor], filename: str, metadata: Dict[str, Any] = None): - """ - memory efficient save file - """ - - _TYPES = { - torch.float64: "F64", - torch.float32: "F32", - torch.float16: "F16", - torch.bfloat16: "BF16", - torch.int64: "I64", - torch.int32: "I32", - torch.int16: "I16", - torch.int8: "I8", - torch.uint8: "U8", - torch.bool: "BOOL", - getattr(torch, "float8_e5m2", None): "F8_E5M2", - getattr(torch, "float8_e4m3fn", None): "F8_E4M3", - } - _ALIGN = 256 - - def validate_metadata(metadata: Dict[str, Any]) -> Dict[str, str]: - validated = {} - for key, value in metadata.items(): - if not isinstance(key, str): - raise ValueError(f"Metadata key must be a string, got {type(key)}") - if not isinstance(value, str): - print(f"Warning: Metadata value for key '{key}' is not a string. 
Converting to string.") - validated[key] = str(value) - else: - validated[key] = value - return validated - - print(f"Using memory efficient save file: {filename}") - - header = {} - offset = 0 - if metadata: - header["__metadata__"] = validate_metadata(metadata) - for k, v in tensors.items(): - if v.numel() == 0: # empty tensor - header[k] = {"dtype": _TYPES[v.dtype], "shape": list(v.shape), "data_offsets": [offset, offset]} - else: - size = v.numel() * v.element_size() - header[k] = {"dtype": _TYPES[v.dtype], "shape": list(v.shape), "data_offsets": [offset, offset + size]} - offset += size - - hjson = json.dumps(header).encode("utf-8") - hjson += b" " * (-(len(hjson) + 8) % _ALIGN) - - with open(filename, "wb") as f: - f.write(struct.pack(" Dict[str, str]: - return self.header.get("__metadata__", {}) - - def get_tensor(self, key): - if key not in self.header: - raise KeyError(f"Tensor '{key}' not found in the file") - - metadata = self.header[key] - offset_start, offset_end = metadata["data_offsets"] - - if offset_start == offset_end: - tensor_bytes = None - else: - # adjust offset by header size - self.file.seek(self.header_size + 8 + offset_start) - tensor_bytes = self.file.read(offset_end - offset_start) - - return self._deserialize_tensor(tensor_bytes, metadata) - - def _read_header(self): - header_size = struct.unpack(" dict[str, torch.Tensor]: - if disable_mmap: - # return safetensors.torch.load(open(path, "rb").read()) - # use experimental loader - # logger.info(f"Loading without mmap (experimental)") - state_dict = {} - with MemoryEfficientSafeOpen(path) as f: - for key in f.keys(): - state_dict[key] = f.get_tensor(key).to(device, dtype=dtype) - return state_dict - else: - try: - state_dict = load_file(path, device=device) - except: - state_dict = load_file(path) # prevent device invalid Error - if dtype is not None: - for key in state_dict.keys(): - state_dict[key] = state_dict[key].to(dtype=dtype) - return state_dict - - # endregion # region Image utils @@ -398,7 +213,14 @@ def pil_resize(image, size, interpolation): return resized_cv2 -def resize_image(image: np.ndarray, width: int, height: int, resized_width: int, resized_height: int, resize_interpolation: Optional[str] = None): +def resize_image( + image: np.ndarray, + width: int, + height: int, + resized_width: int, + resized_height: int, + resize_interpolation: Optional[str] = None, +): """ Resize image with resize interpolation. Default interpolation to AREA if image is smaller, else LANCZOS. @@ -449,29 +271,30 @@ def get_cv2_interpolation(interpolation: Optional[str]) -> Optional[int]: https://docs.opencv.org/3.4/da/d54/group__imgproc__transform.html#ga5bb5a1fea74ea38e1a5445ca803ff121 """ if interpolation is None: - return None + return None if interpolation == "lanczos" or interpolation == "lanczos4": - # Lanczos interpolation over 8x8 neighborhood + # Lanczos interpolation over 8x8 neighborhood return cv2.INTER_LANCZOS4 elif interpolation == "nearest": - # Bit exact nearest neighbor interpolation. This will produce same results as the nearest neighbor method in PIL, scikit-image or Matlab. + # Bit exact nearest neighbor interpolation. This will produce same results as the nearest neighbor method in PIL, scikit-image or Matlab. 
return cv2.INTER_NEAREST_EXACT elif interpolation == "bilinear" or interpolation == "linear": # bilinear interpolation return cv2.INTER_LINEAR elif interpolation == "bicubic" or interpolation == "cubic": - # bicubic interpolation + # bicubic interpolation return cv2.INTER_CUBIC elif interpolation == "area": - # resampling using pixel area relation. It may be a preferred method for image decimation, as it gives moire'-free results. But when the image is zoomed, it is similar to the INTER_NEAREST method. + # resampling using pixel area relation. It may be a preferred method for image decimation, as it gives moire'-free results. But when the image is zoomed, it is similar to the INTER_NEAREST method. return cv2.INTER_AREA elif interpolation == "box": - # resampling using pixel area relation. It may be a preferred method for image decimation, as it gives moire'-free results. But when the image is zoomed, it is similar to the INTER_NEAREST method. + # resampling using pixel area relation. It may be a preferred method for image decimation, as it gives moire'-free results. But when the image is zoomed, it is similar to the INTER_NEAREST method. return cv2.INTER_AREA else: return None + def get_pil_interpolation(interpolation: Optional[str]) -> Optional[Image.Resampling]: """ Convert interpolation value to PIL interpolation @@ -479,7 +302,7 @@ def get_pil_interpolation(interpolation: Optional[str]) -> Optional[Image.Resamp https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-filters """ if interpolation is None: - return None + return None if interpolation == "lanczos": return Image.Resampling.LANCZOS @@ -493,7 +316,7 @@ def get_pil_interpolation(interpolation: Optional[str]) -> Optional[Image.Resamp # For resize calculate the output pixel value using cubic interpolation on all pixels that may contribute to the output value. For other transformations cubic interpolation over a 4x4 environment in the input image is used. return Image.Resampling.BICUBIC elif interpolation == "area": - # Image.Resampling.BOX may be more appropriate if upscaling + # Image.Resampling.BOX may be more appropriate if upscaling # Area interpolation is related to cv2.INTER_AREA # Produces a sharper image than Resampling.BILINEAR, doesn’t have dislocations on local level like with Resampling.BOX. 
return Image.Resampling.HAMMING @@ -503,12 +326,14 @@ def get_pil_interpolation(interpolation: Optional[str]) -> Optional[Image.Resamp else: return None + def validate_interpolation_fn(interpolation_str: str) -> bool: """ Check if a interpolation function is supported """ return interpolation_str in ["lanczos", "nearest", "bilinear", "linear", "bicubic", "cubic", "area", "box"] + # endregion # TODO make inf_utils.py @@ -642,7 +467,9 @@ def step( elif self.config.prediction_type == "sample": raise NotImplementedError("prediction_type not implemented yet: sample") else: - raise ValueError(f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`") + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) sigma_from = self.sigmas[self.step_index] sigma_to = self.sigmas[self.step_index + 1] diff --git a/networks/flux_merge_lora.py b/networks/flux_merge_lora.py index 5e100a3ba..855c0ed98 100644 --- a/networks/flux_merge_lora.py +++ b/networks/flux_merge_lora.py @@ -9,7 +9,8 @@ from safetensors.torch import load_file, save_file from tqdm import tqdm -from library.utils import setup_logging, str_to_dtype, MemoryEfficientSafeOpen, mem_eff_save_file +from library.utils import setup_logging, str_to_dtype +from library.safetensors_utils import MemoryEfficientSafeOpen, mem_eff_save_file setup_logging() import logging diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py index 86dba246d..d7b97a59f 100644 --- a/sd3_minimal_inference.py +++ b/sd3_minimal_inference.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) from library import sd3_models, sd3_utils, strategy_sd3 -from library.utils import load_safetensors +from library.safetensors_utils import load_safetensors def get_noise(seed, latent, device="cpu"): diff --git a/sd3_train.py b/sd3_train.py index 355e13dd2..c6a2fdd8d 100644 --- a/sd3_train.py +++ b/sd3_train.py @@ -14,6 +14,7 @@ import torch from library import utils from library.device_utils import init_ipex, clean_memory_on_device +from library.safetensors_utils import load_safetensors init_ipex() @@ -206,7 +207,7 @@ def train(args): # t5xxl_dtype = weight_dtype model_dtype = match_mixed_precision(args, weight_dtype) # None (default) or fp16/bf16 (full_xxxx) if args.clip_l is None: - sd3_state_dict = utils.load_safetensors( + sd3_state_dict = load_safetensors( args.pretrained_model_name_or_path, "cpu", args.disable_mmap_load_safetensors, model_dtype ) else: @@ -322,7 +323,7 @@ def train(args): # load VAE for caching latents if sd3_state_dict is None: logger.info(f"load state dict for MMDiT and VAE from {args.pretrained_model_name_or_path}") - sd3_state_dict = utils.load_safetensors( + sd3_state_dict = load_safetensors( args.pretrained_model_name_or_path, "cpu", args.disable_mmap_load_safetensors, model_dtype ) diff --git a/sd3_train_network.py b/sd3_train_network.py index cdb7aa4e3..c9b06a38a 100644 --- a/sd3_train_network.py +++ b/sd3_train_network.py @@ -8,6 +8,7 @@ from accelerate import Accelerator from library import sd3_models, strategy_sd3, utils from library.device_utils import init_ipex, clean_memory_on_device +from library.safetensors_utils import load_safetensors init_ipex() @@ -77,7 +78,7 @@ def load_target_model(self, args, weight_dtype, accelerator): loading_dtype = None if args.fp8_base else weight_dtype # if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future - state_dict = utils.load_safetensors( + state_dict = 
load_safetensors( args.pretrained_model_name_or_path, "cpu", disable_mmap=args.disable_mmap_load_safetensors, dtype=loading_dtype ) mmdit = sd3_utils.load_mmdit(state_dict, loading_dtype, "cpu") diff --git a/tests/test_custom_offloading_utils.py b/tests/test_custom_offloading_utils.py index 5fa40b768..8c23bdf55 100644 --- a/tests/test_custom_offloading_utils.py +++ b/tests/test_custom_offloading_utils.py @@ -4,7 +4,7 @@ from unittest.mock import patch, MagicMock from library.custom_offloading_utils import ( - synchronize_device, + _synchronize_device, swap_weight_devices_cuda, swap_weight_devices_no_cuda, weighs_to_device, @@ -50,21 +50,21 @@ def device(self): @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_cuda_synchronize(mock_cuda_sync): device = torch.device('cuda') - synchronize_device(device) + _synchronize_device(device) mock_cuda_sync.assert_called_once() @patch('torch.xpu.synchronize') @pytest.mark.skipif(not torch.xpu.is_available(), reason="XPU not available") def test_xpu_synchronize(mock_xpu_sync): device = torch.device('xpu') - synchronize_device(device) + _synchronize_device(device) mock_xpu_sync.assert_called_once() @patch('torch.mps.synchronize') @pytest.mark.skipif(not torch.xpu.is_available(), reason="MPS not available") def test_mps_synchronize(mock_mps_sync): device = torch.device('mps') - synchronize_device(device) + _synchronize_device(device) mock_mps_sync.assert_called_once() @@ -111,7 +111,7 @@ def test_swap_weight_devices_cuda(): -@patch('library.custom_offloading_utils.synchronize_device') +@patch('library.custom_offloading_utils._synchronize_device') def test_swap_weight_devices_no_cuda(mock_sync_device): device = torch.device('cpu') layer_to_cpu = SimpleModel() @@ -121,7 +121,7 @@ def test_swap_weight_devices_no_cuda(mock_sync_device): with patch('torch.Tensor.copy_'): swap_weight_devices_no_cuda(device, layer_to_cpu, layer_to_cuda) - # Verify synchronize_device was called twice + # Verify _synchronize_device was called twice assert mock_sync_device.call_count == 2 @@ -279,8 +279,8 @@ def test_backward_hook_execution(mock_wait, mock_submit): @patch('library.custom_offloading_utils.weighs_to_device') -@patch('library.custom_offloading_utils.synchronize_device') -@patch('library.custom_offloading_utils.clean_memory_on_device') +@patch('library.custom_offloading_utils._synchronize_device') +@patch('library.custom_offloading_utils._clean_memory_on_device') def test_prepare_block_devices_before_forward(mock_clean, mock_sync, mock_weights_to_device, model_offloader): model = SimpleModel(4) blocks = model.blocks @@ -291,7 +291,7 @@ def test_prepare_block_devices_before_forward(mock_clean, mock_sync, mock_weight # Check that weighs_to_device was called for each block assert mock_weights_to_device.call_count == 4 - # Check that synchronize_device and clean_memory_on_device were called + # Check that _synchronize_device and _clean_memory_on_device were called mock_sync.assert_called_once_with(model_offloader.device) mock_clean.assert_called_once_with(model_offloader.device) diff --git a/tools/convert_diffusers_to_flux.py b/tools/convert_diffusers_to_flux.py index 65ba7321a..a11093c92 100644 --- a/tools/convert_diffusers_to_flux.py +++ b/tools/convert_diffusers_to_flux.py @@ -30,7 +30,8 @@ from tqdm import tqdm from library import flux_utils -from library.utils import setup_logging, str_to_dtype, MemoryEfficientSafeOpen, mem_eff_save_file +from library.utils import setup_logging, str_to_dtype +from library.safetensors_utils import 
MemoryEfficientSafeOpen, mem_eff_save_file setup_logging() import logging diff --git a/tools/merge_sd3_safetensors.py b/tools/merge_sd3_safetensors.py index 6bc1003ec..6ec045ddc 100644 --- a/tools/merge_sd3_safetensors.py +++ b/tools/merge_sd3_safetensors.py @@ -6,7 +6,8 @@ from safetensors.torch import safe_open from library.utils import setup_logging -from library.utils import load_safetensors, mem_eff_save_file, str_to_dtype +from library.utils import str_to_dtype +from library.safetensors_utils import load_safetensors, mem_eff_save_file setup_logging() import logging From e1c666e97f99f50e381ab88b8878392ca26870bb Mon Sep 17 00:00:00 2001 From: "Kohya S." <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 13 Sep 2025 20:03:55 +0900 Subject: [PATCH 553/582] Update library/safetensors_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- library/safetensors_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/library/safetensors_utils.py b/library/safetensors_utils.py index dcd2309e1..c65cdfabe 100644 --- a/library/safetensors_utils.py +++ b/library/safetensors_utils.py @@ -44,7 +44,6 @@ def validate_metadata(metadata: Dict[str, Any]) -> Dict[str, str]: validated[key] = value return validated - # print(f"Using memory efficient save file: {filename}") header = {} offset = 0 From 4568631b43f348ea4360b021315a3da8064f3d7b Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 13 Sep 2025 20:05:39 +0900 Subject: [PATCH 554/582] docs: update README to reflect improved loading speed of .safetensors files --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 843cf71b9..da38a2416 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,13 @@ For RTX 50 series GPUs, PyTorch 2.8.0 with CUDA 12.8/9 should be used. `requirem If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed` (appropriate version is not confirmed yet). -- [FLUX.1 training](#flux1-training) -- [SD3 training](#sd3-training) - ### Recent Updates +Sep 13, 2025: +- The loading speed of `.safetensors` files has been improved for SD3, FLUX.1 and Lumina. See [PR #2200](https://github.com/kohya-ss/sd-scripts/pull/2200) for more details. + - Model loading can be up to 1.5 times faster. + - This is a wide-ranging update, so there may be bugs. Please let us know if you encounter any issues. 
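As a rough illustration of the faster loading path referred to above, assuming the repository layout in this patch (the signature comes from the new `library/safetensors_utils.py`; the checkpoint path and dtype are placeholders):

```python
import torch
from library.safetensors_utils import load_safetensors

# disable_mmap=True selects the memory-efficient loader (MemoryEfficientSafeOpen).
# Per the new module, large tensors bound for the GPU are read through numpy.memmap
# to avoid intermediate copies.
state_dict = load_safetensors(
    "/path/to/model.safetensors",  # placeholder path
    device="cuda",
    disable_mmap=True,
    dtype=torch.bfloat16,
)
print(f"loaded {len(state_dict)} tensors")
```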
+ Sep 4, 2025: - The information about FLUX.1 and SD3/SD3.5 training that was described in the README has been organized and divided into the following documents: - [LoRA Training Overview](./docs/train_network.md) From d831c8883214f3af757ae13848e33c49c29ff89b Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 13 Sep 2025 21:06:04 +0900 Subject: [PATCH 555/582] fix: sample generation doesn't work with block swap --- hunyuan_image_train_network.py | 7 +++++-- library/hunyuan_image_models.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index 7167ce4c2..a3c0cd898 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -1,7 +1,7 @@ import argparse import copy import gc -from typing import Any, Optional, Union +from typing import Any, Optional, Union, cast import argparse import os import time @@ -47,7 +47,7 @@ def sample_images( args: argparse.Namespace, epoch, steps, - dit, + dit: hunyuan_image_models.HYImageDiffusionTransformer, vae, text_encoders, sample_prompts_te_outputs, @@ -77,6 +77,8 @@ def sample_images( # unwrap unet and text_encoder(s) dit = accelerator.unwrap_model(dit) + dit = cast(hunyuan_image_models.HYImageDiffusionTransformer, dit) + dit.switch_block_swap_for_inference() if text_encoders is not None: text_encoders = [(accelerator.unwrap_model(te) if te is not None else None) for te in text_encoders] # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders]) @@ -139,6 +141,7 @@ def sample_images( if cuda_rng_state is not None: torch.cuda.set_rng_state(cuda_rng_state) + dit.switch_block_swap_for_training() clean_memory_on_device(accelerator.device) diff --git a/library/hunyuan_image_models.py b/library/hunyuan_image_models.py index ce2d23ddc..2a6092ea3 100644 --- a/library/hunyuan_image_models.py +++ b/library/hunyuan_image_models.py @@ -185,6 +185,20 @@ def enable_block_swap(self, num_blocks: int, device: torch.device, supports_back f"HunyuanImage-2.1: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}." ) + def switch_block_swap_for_inference(self): + if self.blocks_to_swap: + self.offloader_double.set_forward_only(True) + self.offloader_single.set_forward_only(True) + self.prepare_block_swap_before_forward() + print(f"HunyuanImage-2.1: Block swap set to forward only.") + + def switch_block_swap_for_training(self): + if self.blocks_to_swap: + self.offloader_double.set_forward_only(False) + self.offloader_single.set_forward_only(False) + self.prepare_block_swap_before_forward() + print(f"HunyuanImage-2.1: Block swap set to forward and backward.") + def move_to_device_except_swap_blocks(self, device: torch.device): # assume model is on cpu. 
do not move blocks to device to reduce temporary memory usage if self.blocks_to_swap: From 4e2a80a6caa546f44a3667a7d9dec6a2c6378591 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 13 Sep 2025 21:07:11 +0900 Subject: [PATCH 556/582] refactor: update imports to use safetensors_utils for memory-efficient operations --- hunyuan_image_minimal_inference.py | 3 ++- library/custom_offloading_utils.py | 15 ++++++++------- library/fp8_optimization_utils.py | 3 ++- library/hunyuan_image_text_encoder.py | 4 ++-- library/hunyuan_image_vae.py | 3 ++- library/lora_utils.py | 3 ++- networks/flux_extract_lora.py | 5 ++--- 7 files changed, 20 insertions(+), 16 deletions(-) diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index 3de0b1cd4..7db490cd1 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -22,6 +22,7 @@ from library import hunyuan_image_vae from library.hunyuan_image_vae import HunyuanVAE2D from library.device_utils import clean_memory_on_device, synchronize_device +from library.safetensors_utils import mem_eff_save_file from networks import lora_hunyuan_image @@ -29,7 +30,7 @@ if lycoris_available: from lycoris.kohya import create_network_from_weights -from library.utils import mem_eff_save_file, setup_logging +from library.utils import setup_logging setup_logging() import logging diff --git a/library/custom_offloading_utils.py b/library/custom_offloading_utils.py index 1b7bbc143..fe7e59d2b 100644 --- a/library/custom_offloading_utils.py +++ b/library/custom_offloading_utils.py @@ -173,14 +173,12 @@ class ModelOffloader(Offloader): """ def __init__( - self, blocks: Union[list[nn.Module], nn.ModuleList], blocks_to_swap: int, device: torch.device, supports_backward: bool = True, debug: bool = False, - ): super().__init__(len(blocks), blocks_to_swap, device, debug) @@ -220,7 +218,7 @@ def create_backward_hook( block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated block_idx_to_wait = block_index - 1 - def backward_hook(module, grad_input, grad_output): + def backward_hook(module: nn.Module, grad_input: _grad_t, grad_output: _grad_t): if self.debug: print(f"Backward hook for block {block_index}") @@ -232,7 +230,7 @@ def backward_hook(module, grad_input, grad_output): return backward_hook - def prepare_block_devices_before_forward(self, blocks: list[nn.Module]): + def prepare_block_devices_before_forward(self, blocks: Union[list[nn.Module], nn.ModuleList]): if self.blocks_to_swap is None or self.blocks_to_swap == 0: return @@ -245,7 +243,7 @@ def prepare_block_devices_before_forward(self, blocks: list[nn.Module]): for b in blocks[self.num_blocks - self.blocks_to_swap :]: b.to(self.device) # move block to device first. 
this makes sure that buffers (non weights) are on the device - weighs_to_device(b, "cpu") # make sure weights are on cpu + weighs_to_device(b, torch.device("cpu")) # make sure weights are on cpu _synchronize_device(self.device) _clean_memory_on_device(self.device) @@ -255,7 +253,7 @@ def wait_for_block(self, block_idx: int): return self._wait_blocks_move(block_idx) - def submit_move_blocks(self, blocks: list[nn.Module], block_idx: int): + def submit_move_blocks(self, blocks: Union[list[nn.Module], nn.ModuleList], block_idx: int): # check if blocks_to_swap is enabled if self.blocks_to_swap is None or self.blocks_to_swap == 0: return @@ -266,7 +264,10 @@ def submit_move_blocks(self, blocks: list[nn.Module], block_idx: int): block_idx_to_cpu = block_idx block_idx_to_cuda = self.num_blocks - self.blocks_to_swap + block_idx - block_idx_to_cuda = block_idx_to_cuda % self.num_blocks # this works for forward-only offloading + + # this works for forward-only offloading. move upstream blocks to cuda + block_idx_to_cuda = block_idx_to_cuda % self.num_blocks + self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda) diff --git a/library/fp8_optimization_utils.py b/library/fp8_optimization_utils.py index a91eb4e4c..ed7d3f764 100644 --- a/library/fp8_optimization_utils.py +++ b/library/fp8_optimization_utils.py @@ -9,7 +9,8 @@ from tqdm import tqdm from library.device_utils import clean_memory_on_device -from library.utils import MemoryEfficientSafeOpen, setup_logging +from library.safetensors_utils import MemoryEfficientSafeOpen +from library.utils import setup_logging setup_logging() import logging diff --git a/library/hunyuan_image_text_encoder.py b/library/hunyuan_image_text_encoder.py index 960f14b37..509f9bd2f 100644 --- a/library/hunyuan_image_text_encoder.py +++ b/library/hunyuan_image_text_encoder.py @@ -14,8 +14,8 @@ from transformers.models.t5.modeling_t5 import T5Stack from accelerate import init_empty_weights -from library import model_util -from library.utils import load_safetensors, setup_logging +from library.safetensors_utils import load_safetensors +from library.utils import setup_logging setup_logging() import logging diff --git a/library/hunyuan_image_vae.py b/library/hunyuan_image_vae.py index 570d4caa6..6f6eea22d 100644 --- a/library/hunyuan_image_vae.py +++ b/library/hunyuan_image_vae.py @@ -7,7 +7,8 @@ from torch.nn import Conv2d from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution -from library.utils import load_safetensors, setup_logging +from library.safetensors_utils import load_safetensors +from library.utils import setup_logging setup_logging() import logging diff --git a/library/lora_utils.py b/library/lora_utils.py index 468fb01ad..b93eb9af3 100644 --- a/library/lora_utils.py +++ b/library/lora_utils.py @@ -9,7 +9,8 @@ from library.device_utils import synchronize_device from library.fp8_optimization_utils import load_safetensors_with_fp8_optimization -from library.utils import MemoryEfficientSafeOpen, setup_logging +from library.safetensors_utils import MemoryEfficientSafeOpen +from library.utils import setup_logging setup_logging() import logging diff --git a/networks/flux_extract_lora.py b/networks/flux_extract_lora.py index 63ab2960c..657287029 100644 --- a/networks/flux_extract_lora.py +++ b/networks/flux_extract_lora.py @@ -10,9 +10,8 @@ from safetensors.torch import load_file, save_file from safetensors import safe_open from tqdm import tqdm -from library import flux_utils, sai_model_spec, model_util, sdxl_model_util -import lora 
-from library.utils import MemoryEfficientSafeOpen +from library import flux_utils, sai_model_spec +from library.safetensors_utils import MemoryEfficientSafeOpen from library.utils import setup_logging from networks import lora_flux From 29b0500e70011785b99ac3c76cd5bb6bc4c29a02 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 13 Sep 2025 21:18:50 +0900 Subject: [PATCH 557/582] fix: restore files section in _typos.toml for exclusion configuration --- _typos.toml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/_typos.toml b/_typos.toml index bf0292e50..686da4af2 100644 --- a/_typos.toml +++ b/_typos.toml @@ -33,8 +33,5 @@ OT="OT" byt="byt" tak="tak" -# [files] -# # Extend the default list of files to check -# extend-exclude = [ -# "library/hunyuan_image_text_encoder.py", -# ] +[files] +extend-exclude = ["_typos.toml", "venv"] From e04b9f0497f921b8ce857ae3a2850cf89669a9c8 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 13 Sep 2025 22:06:10 +0900 Subject: [PATCH 558/582] docs: add LoRA training guide for HunyuanImage-2.1 model (by Gemini CLI) --- docs/hunyuan_image_train_network.md | 406 ++++++++++++++++++++++++++++ 1 file changed, 406 insertions(+) create mode 100644 docs/hunyuan_image_train_network.md diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md new file mode 100644 index 000000000..c48148006 --- /dev/null +++ b/docs/hunyuan_image_train_network.md @@ -0,0 +1,406 @@ +Status: reviewed + +# LoRA Training Guide for HunyuanImage-2.1 using `hunyuan_image_train_network.py` / `hunyuan_image_train_network.py` を用いたHunyuanImage-2.1モデルのLoRA学習ガイド + +This document explains how to train LoRA models for the HunyuanImage-2.1 model using `hunyuan_image_train_network.py` included in the `sd-scripts` repository. + +
+日本語 + +このドキュメントでは、`sd-scripts`リポジトリに含まれる`hunyuan_image_train_network.py`を使用して、HunyuanImage-2.1モデルに対するLoRA (Low-Rank Adaptation) モデルを学習する基本的な手順について解説します。 + +
+ +## 1. Introduction / はじめに + +`hunyuan_image_train_network.py` trains additional networks such as LoRA on the HunyuanImage-2.1 model, which uses a transformer-based architecture (DiT) different from Stable Diffusion. Two text encoders, Qwen2.5-VL and byT5, and a dedicated VAE are used. + +This guide assumes you know the basics of LoRA training. For common options see [train_network.py](train_network.md) and [sdxl_train_network.py](sdxl_train_network.md). + +**Prerequisites:** + +* The repository is cloned and the Python environment is ready. +* A training dataset is prepared. See the dataset configuration guide. + +
+日本語 + +`hunyuan_image_train_network.py`はHunyuanImage-2.1モデルに対してLoRAなどの追加ネットワークを学習させるためのスクリプトです。HunyuanImage-2.1はStable Diffusionとは異なるDiT (Diffusion Transformer) アーキテクチャを持つ画像生成モデルであり、このスクリプトを使用することで、特定のキャラクターや画風を再現するLoRAモデルを作成できます。 + +このガイドは、基本的なLoRA学習の手順を理解しているユーザーを対象としています。基本的な使い方や共通のオプションについては、[`train_network.py`のガイド](train_network.md)を参照してください。また一部のパラメータは [`sdxl_train_network.py`](sdxl_train_network.md) や [`flux_train_network.py`](flux_train_network.md) と同様のものがあるため、そちらも参考にしてください。 + +**前提条件:** + +* `sd-scripts`リポジトリのクローンとPython環境のセットアップが完了していること。 +* 学習用データセットの準備が完了していること。(データセットの準備については[データセット設定ガイド](config_README-ja.md)を参照してください) + +
+ +## 2. Differences from `train_network.py` / `train_network.py` との違い + +`hunyuan_image_train_network.py` is based on `train_network.py` but adapted for HunyuanImage-2.1. Main differences include: + +* **Target model:** HunyuanImage-2.1 model. +* **Model structure:** HunyuanImage-2.1 uses a Transformer-based architecture (DiT). It uses two text encoders (Qwen2.5-VL and byT5) and a dedicated VAE. +* **Required arguments:** Additional arguments for the DiT model, Qwen2.5-VL, byT5, and VAE model files. +* **Incompatible options:** Some Stable Diffusion-specific arguments (e.g., `--v2`, `--clip_skip`, `--max_token_length`) are not used. +* **HunyuanImage-2.1-specific arguments:** Additional arguments for specific training parameters like flow matching. + +
+日本語 + +`hunyuan_image_train_network.py`は`train_network.py`をベースに、HunyuanImage-2.1モデルに対応するための変更が加えられています。主な違いは以下の通りです。 + +* **対象モデル:** HunyuanImage-2.1モデルを対象とします。 +* **モデル構造:** HunyuanImage-2.1はDiTベースのアーキテクチャを持ちます。Text EncoderとしてQwen2.5-VLとbyT5の二つを使用し、専用のVAEを使用します。 +* **必須の引数:** DiTモデル、Qwen2.5-VL、byT5、VAEの各モデルファイルを指定する引数が追加されています。 +* **一部引数の非互換性:** Stable Diffusion向けの引数の一部(例: `--v2`, `--clip_skip`, `--max_token_length`)は使用されません。 +* **HunyuanImage-2.1特有の引数:** Flow Matchingなど、特有の学習パラメータを指定する引数が追加されています。 + +
+ +## 3. Preparation / 準備 + +Before starting training you need: + +1. **Training script:** `hunyuan_image_train_network.py` +2. **HunyuanImage-2.1 DiT model file:** Base DiT model `.safetensors` file. +3. **Text Encoder model files:** + - Qwen2.5-VL model file (`--text_encoder`). + - byT5 model file (`--byt5`). +4. **VAE model file:** HunyuanImage-2.1-compatible VAE model `.safetensors` file (`--vae`). +5. **Dataset definition file (.toml):** TOML format file describing training dataset configuration. + +### Downloading Required Models + +You need to download the model files from the official Hugging Face repositories (e.g., `Tencent-Hunyuan/HunyuanDiT`). Ensure you download the `.safetensors` files, not the Diffusers format directories. + +
+日本語 + +学習を開始する前に、以下のファイルが必要です。 + +1. **学習スクリプト:** `hunyuan_image_train_network.py` +2. **HunyuanImage-2.1 DiTモデルファイル:** 学習のベースとなるDiTモデルの`.safetensors`ファイル。 +3. **Text Encoderモデルファイル:** + - Qwen2.5-VLモデルファイル (`--text_encoder`)。 + - byT5モデルファイル (`--byt5`)。 +4. **VAEモデルファイル:** HunyuanImage-2.1に対応するVAEモデルの`.safetensors`ファイル (`--vae`)。 +5. **データセット定義ファイル (.toml):** 学習データセットの設定を記述したTOML形式のファイル。(詳細は[データセット設定ガイド](config_README-ja.md)を参照してください)。 + +**必要なモデルのダウンロード** + +公式のHugging Faceリポジトリ(例: `Tencent-Hunyuan/HunyuanDiT`)からモデルファイルをダウンロードする必要があります。Diffusers形式のディレクトリではなく、`.safetensors`形式のファイルをダウンロードしてください。 + +
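For reference, a minimal single-resolution version of the dataset definition file used in the next section (`my_hunyuan_dataset_config.toml`) might look like the sketch below. The image directory and caption settings are placeholders to replace with your own data; the multi-resolution example in section 6.4 follows the same schema.

```toml
[general]
caption_extension = ".txt"
shuffle_caption = false

[[datasets]]
batch_size = 1
enable_bucket = true
resolution = [1024, 1024]

  [[datasets.subsets]]
  image_dir = "path/to/image/directory"
  num_repeats = 1
```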
+
+## 4. Running the Training / 学習の実行
+
+Run `hunyuan_image_train_network.py` from the terminal with HunyuanImage-2.1 specific arguments. Here's a basic command example:
+
+```bash
+accelerate launch --num_cpu_threads_per_process 1 hunyuan_image_train_network.py \
+  --pretrained_model_name_or_path="<path to DiT model>" \
+  --text_encoder="<path to Qwen2.5-VL model>" \
+  --byt5="<path to byT5 model>" \
+  --vae="<path to VAE model>" \
+  --dataset_config="my_hunyuan_dataset_config.toml" \
+  --output_dir="<output directory>" \
+  --output_name="my_hunyuan_lora" \
+  --save_model_as=safetensors \
+  --network_module=networks.lora_hunyuan_image \
+  --network_dim=16 \
+  --network_alpha=1 \
+  --learning_rate=1e-4 \
+  --optimizer_type="AdamW8bit" \
+  --lr_scheduler="constant" \
+  --sdpa \
+  --max_train_epochs=10 \
+  --save_every_n_epochs=1 \
+  --mixed_precision="bf16" \
+  --gradient_checkpointing \
+  --model_prediction_type="raw" \
+  --discrete_flow_shift=5.0 \
+  --blocks_to_swap=18 \
+  --cache_text_encoder_outputs \
+  --cache_latents
+```
+
+日本語 + +学習は、ターミナルから`hunyuan_image_train_network.py`を実行することで開始します。基本的なコマンドラインの構造は`train_network.py`と同様ですが、HunyuanImage-2.1特有の引数を指定する必要があります。 + +コマンドラインの例は英語のドキュメントを参照してください。 + +
+ +### 4.1. Explanation of Key Options / 主要なコマンドライン引数の解説 + +The script adds HunyuanImage-2.1 specific arguments. For common arguments (like `--output_dir`, `--output_name`, `--network_module`, etc.), see the [`train_network.py` guide](train_network.md). + +#### Model-related [Required] + +* `--pretrained_model_name_or_path=""` **[Required]** + - Specifies the path to the base DiT model `.safetensors` file. +* `--text_encoder=""` **[Required]** + - Specifies the path to the Qwen2.5-VL Text Encoder model file. Should be `bfloat16`. +* `--byt5=""` **[Required]** + - Specifies the path to the byT5 Text Encoder model file. Should be `float16`. +* `--vae=""` **[Required]** + - Specifies the path to the HunyuanImage-2.1-compatible VAE model `.safetensors` file. + +#### HunyuanImage-2.1 Training Parameters + +* `--discrete_flow_shift=` + - Specifies the shift value for the scheduler used in Flow Matching. Default is `5.0`. +* `--model_prediction_type=` + - Specifies what the model predicts. Choose from `raw`, `additive`, `sigma_scaled`. Default and recommended is `raw`. +* `--timestep_sampling=` + - Specifies the sampling method for timesteps (noise levels) during training. Choose from `sigma`, `uniform`, `sigmoid`, `shift`, `flux_shift`. Default is `sigma`. +* `--sigmoid_scale=` + - Scale factor when `timestep_sampling` is set to `sigmoid`, `shift`, or `flux_shift`. Default is `1.0`. + +#### Memory/Speed Related + +* `--fp8_scaled` + - Enables training the DiT model in scaled FP8 format. This can significantly reduce VRAM usage, but the training results may vary. +* `--fp8_vl` + - Use FP8 for the VLM (Qwen2.5-VL) text encoder. +* `--blocks_to_swap=` **[Experimental Feature]** + - Setting to reduce VRAM usage by swapping parts of the model (Transformer blocks) between CPU and GPU. Specify the number of blocks to swap as an integer (e.g., `18`). Larger values reduce VRAM usage but decrease training speed. Adjust according to your GPU's VRAM capacity. Can be used with `gradient_checkpointing`. +* `--cache_text_encoder_outputs` + - Caches the outputs of Qwen2.5-VL and byT5. This reduces memory usage. +* `--cache_latents`, `--cache_latents_to_disk` + - Caches the outputs of VAE. Similar functionality to [sdxl_train_network.py](sdxl_train_network.md). +* `--vae_enable_tiling` + - Enables tiling for VAE encoding and decoding to reduce VRAM usage. + +
+日本語 + +[`train_network.py`のガイド](train_network.md)で説明されている引数に加え、以下のHunyuanImage-2.1特有の引数を指定します。共通の引数(`--output_dir`, `--output_name`, `--network_module`, `--network_dim`, `--network_alpha`, `--learning_rate`など)については、上記ガイドを参照してください。 + +コマンドラインの例と詳細な引数の説明は英語のドキュメントを参照してください。 + +
+ +## 5. Using the Trained Model / 学習済みモデルの利用 + +After training, a LoRA model file is saved in `output_dir` and can be used in inference environments supporting HunyuanImage-2.1. + +
+日本語 + +学習が完了すると、指定した`output_dir`にLoRAモデルファイル(例: `my_hunyuan_lora.safetensors`)が保存されます。このファイルは、HunyuanImage-2.1モデルに対応した推論環境で使用できます。 + +
+ +## 6. Advanced Settings / 高度な設定 + +### 6.1. VRAM Usage Optimization / VRAM使用量の最適化 + +HunyuanImage-2.1 is a large model, so GPUs without sufficient VRAM require optimization. + +#### Key VRAM Reduction Options + +- **`--fp8_scaled`**: Enables training the DiT in scaled FP8 format. +- **`--fp8_vl`**: Use FP8 for the VLM text encoder. +- **`--blocks_to_swap `**: Swaps blocks between CPU and GPU to reduce VRAM usage. Higher numbers save more VRAM but reduce training speed. +- **`--cpu_offload_checkpointing`**: Offloads gradient checkpoints to CPU. Can reduce VRAM usage but decreases training speed. Cannot be used with `--blocks_to_swap`. +- **Using Adafactor optimizer**: Can reduce VRAM usage more than 8bit AdamW: + ``` + --optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" --lr_scheduler constant_with_warmup --max_grad_norm 0.0 + ``` + +
+日本語 + +HunyuanImage-2.1は大きなモデルであるため、十分なVRAMを持たないGPUでは工夫が必要です。VRAM使用量を削減するための設定の詳細は英語のドキュメントを参照してください。 + +主要なVRAM削減オプション: +- `--fp8_scaled`: DiTをスケールされたFP8形式で学習 +- `--fp8_vl`: VLMテキストエンコーダにFP8を使用 +- `--blocks_to_swap`: CPUとGPU間でブロックをスワップ +- `--cpu_offload_checkpointing`: 勾配チェックポイントをCPUにオフロード +- Adafactorオプティマイザの使用 + +
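Putting the options above together, the memory-related portion of a command line for a constrained GPU could look roughly like the following fragment. The block-swap count is only an example to adjust for your hardware, and it is combined with the model, dataset, and network arguments shown in section 4:

```bash
--fp8_scaled --fp8_vl --blocks_to_swap=32 \
--gradient_checkpointing --cache_latents_to_disk --cache_text_encoder_outputs \
--optimizer_type adafactor --optimizer_args "relative_step=False" "scale_parameter=False" "warmup_init=False" \
--lr_scheduler constant_with_warmup --max_grad_norm 0.0
```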
+
+### 6.2. Important HunyuanImage-2.1 LoRA Training Settings / HunyuanImage-2.1 LoRA学習の重要な設定
+
+HunyuanImage-2.1 training has several settings that can be specified with arguments:
+
+#### Timestep Sampling Methods
+
+The `--timestep_sampling` option specifies how timesteps (0-1) are sampled:
+
+- `sigma`: Sigma-based sampling, as in SD3 (default)
+- `uniform`: Uniform random sampling
+- `sigmoid`: Sigmoid of a normally distributed random value
+- `shift`: Sigmoid of a normally distributed random value, with an additional shift applied
+- `flux_shift`: Like `shift`, but the shift amount is adjusted according to the image resolution
+
+#### Model Prediction Processing
+
+The `--model_prediction_type` option specifies how to interpret and process model predictions:
+
+- `raw`: Use as-is **[Recommended, Default]**
+- `additive`: Add to noise input
+- `sigma_scaled`: Apply sigma scaling
+
+#### Recommended Settings
+
+Based on experiments, the default settings work well:
+```
+--model_prediction_type raw --discrete_flow_shift 5.0
+```
+
+日本語 + +HunyuanImage-2.1の学習には、引数で指定できるいくつかの設定があります。詳細な説明とコマンドラインの例は英語のドキュメントを参照してください。 + +主要な設定オプション: +- タイムステップのサンプリング方法(`--timestep_sampling`) +- モデル予測の処理方法(`--model_prediction_type`) +- 推奨設定の組み合わせ + +
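For intuition, shift-style timestep sampling of the kind listed above is commonly implemented by squashing a normal sample through a sigmoid and then applying the discrete flow shift. The following is a rough illustrative sketch, not the script's exact code:

```python
import torch

def sample_shifted_timesteps(batch_size: int, shift: float = 5.0, sigmoid_scale: float = 1.0) -> torch.Tensor:
    # Normal sample -> sigmoid squashes it into (0, 1); sigmoid_scale widens or narrows the spread.
    t = torch.sigmoid(torch.randn(batch_size) * sigmoid_scale)
    # A shift > 1 biases the samples toward noisier timesteps.
    t = (t * shift) / (1.0 + (shift - 1.0) * t)
    return t  # values in (0, 1)
```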
+ +### 6.3. Regular Expression-based Rank/LR Configuration / 正規表現によるランク・学習率の指定 + +You can specify ranks (dims) and learning rates for LoRA modules using regular expressions. This allows for more flexible and fine-grained control. + +These settings are specified via the `network_args` argument. + +* `network_reg_dims`: Specify ranks for modules matching a regular expression. The format is a comma-separated string of `pattern=rank`. + * Example: `--network_args "network_reg_dims=attn.*.q_proj=4,attn.*.k_proj=4"` +* `network_reg_lrs`: Specify learning rates for modules matching a regular expression. The format is a comma-separated string of `pattern=lr`. + * Example: `--network_args "network_reg_lrs=down_blocks.1=1e-4,up_blocks.2=2e-4"` + +**Notes:** + +* To find the correct module names for the patterns, you may need to inspect the model structure. +* Settings via `network_reg_dims` and `network_reg_lrs` take precedence over the global `--network_dim` and `--learning_rate` settings. +* If a module name matches multiple patterns, the setting from the last matching pattern in the string will be applied. + +
+日本語 + +正規表現を用いて、LoRAのモジュールごとにランク(dim)や学習率を指定することができます。これにより、柔軟できめ細やかな制御が可能になります。 + +これらの設定は `network_args` 引数で指定します。 + +* `network_reg_dims`: 正規表現にマッチするモジュールに対してランクを指定します。 +* `network_reg_lrs`: 正規表現にマッチするモジュールに対して学習率を指定します。 + +**注意点:** + +* パターンのための正確なモジュール名を見つけるには、モデルの構造を調べる必要があるかもしれません。 +* `network_reg_dims` および `network_reg_lrs` での設定は、全体設定である `--network_dim` や `--learning_rate` よりも優先されます。 +* あるモジュール名が複数のパターンにマッチした場合、文字列の中で後方にあるパターンの設定が適用されます。 + +
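As an illustration, both settings can be passed in a single run. The module-name patterns below are made-up placeholders (inspect the network's actual module names before reusing them), and anything matching neither pattern falls back to the global `--network_dim` and `--learning_rate`:

```bash
--network_dim=16 --learning_rate=1e-4 \
--network_args \
  "network_reg_dims=double_blocks.*img_attn=32,single_blocks.*linear1=8" \
  "network_reg_lrs=double_blocks=2e-4,single_blocks=5e-5"
```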
+ +### 6.4. Multi-Resolution Training / マルチ解像度トレーニング + +You can define multiple resolutions in the dataset configuration file, with different batch sizes for each resolution. + +**Note:** This feature is available, but it is **not recommended** as the HunyuanImage-2.1 base model was not trained with multi-resolution capabilities. Using it may lead to unexpected results. + +Configuration file example: +```toml +[general] +shuffle_caption = true +caption_extension = ".txt" + +[[datasets]] +batch_size = 2 +enable_bucket = true +resolution = [1024, 1024] + + [[datasets.subsets]] + image_dir = "path/to/image/directory" + num_repeats = 1 + +[[datasets]] +batch_size = 1 +enable_bucket = true +resolution = [1280, 768] + + [[datasets.subsets]] + image_dir = "path/to/another/directory" + num_repeats = 1 +``` + +
+日本語 + +データセット設定ファイルで複数の解像度を定義できます。各解像度に対して異なるバッチサイズを指定することができます。 + +**注意:** この機能は利用可能ですが、HunyuanImage-2.1のベースモデルはマルチ解像度で学習されていないため、**非推奨**です。使用すると予期しない結果になる可能性があります。 + +設定ファイルの例は英語のドキュメントを参照してください。 + +
+ +### 6.5. Validation / 検証 + +You can calculate validation loss during training using a validation dataset to evaluate model generalization performance. This feature works the same as in other training scripts. For details, please refer to the [Validation Guide](validation.md). + +
+日本語 + +学習中に検証データセットを使用して損失 (Validation Loss) を計算し、モデルの汎化性能を評価できます。この機能は他の学習スクリプトと同様に動作します。詳細は[検証ガイド](validation.md)を参照してください。 + +
+ +## 7. Other Training Options / その他の学習オプション + +- **`--ip_noise_gamma`**: Use `--ip_noise_gamma` and `--ip_noise_gamma_random_strength` to adjust Input Perturbation noise gamma values during training. See Stable Diffusion 3 training options for details. + +- **`--loss_type`**: Specifies the loss function for training. The default is `l2`. + - `l1`: L1 loss. + - `l2`: L2 loss (mean squared error). + - `huber`: Huber loss. + - `smooth_l1`: Smooth L1 loss. + +- **`--huber_schedule`**, **`--huber_c`**, **`--huber_scale`**: These are parameters for Huber loss. They are used when `--loss_type` is `huber` or `smooth_l1`. + +- **`--weighting_scheme`**, **`--logit_mean`**, **`--logit_std`**, **`--mode_scale`**: These options allow you to adjust the loss weighting for each timestep. For details, refer to the [`sd3_train_network.md` guide](sd3_train_network.md). + +- **`--fused_backward_pass`**: Fuses the backward pass and optimizer step to reduce VRAM usage. + +
+日本語 + +- **`--ip_noise_gamma`**: Input Perturbationノイズのガンマ値を調整します。 +- **`--loss_type`**: 学習に用いる損失関数を指定します。 +- **`--huber_schedule`**, **`--huber_c`**, **`--huber_scale`**: Huber損失のパラメータです。 +- **`--weighting_scheme`**, **`--logit_mean`**, **`--logit_std`**, **`--mode_scale`**: 各タイムステップの損失の重み付けを調整します。 +- **`--fused_backward_pass`**: バックワードパスとオプティマイザステップを融合してVRAM使用量を削減します。 + +
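To make the `--loss_type` choices concrete, they correspond to standard elementwise losses. The sketch below is a simplified illustration that ignores the Huber scheduling options; mapping `--huber_c` onto PyTorch's `delta`/`beta` parameters is an assumption made only for this example:

```python
import torch
import torch.nn.functional as F

def elementwise_loss(loss_type: str, pred: torch.Tensor, target: torch.Tensor, huber_c: float = 0.1) -> torch.Tensor:
    # Per-element losses; the training scripts further weight, schedule, and average these.
    if loss_type == "l2":
        return F.mse_loss(pred, target, reduction="none")
    if loss_type == "l1":
        return F.l1_loss(pred, target, reduction="none")
    if loss_type == "huber":
        return F.huber_loss(pred, target, reduction="none", delta=huber_c)
    if loss_type == "smooth_l1":
        return F.smooth_l1_loss(pred, target, reduction="none", beta=huber_c)
    raise ValueError(f"unknown loss_type: {loss_type}")
```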
+ +## 8. Related Tools / 関連ツール + +- **`hunyuan_image_minimal_inference.py`**: Simple inference script for generating images with trained LoRA models. + +
+日本語 + +- **`hunyuan_image_minimal_inference.py`**: 学習した LoRA モデルを適用して画像を生成するシンプルな推論スクリプト。 + +
+ +## 9. Others / その他 + +`hunyuan_image_train_network.py` includes many features common with `train_network.py`, such as sample image generation (`--sample_prompts`, etc.) and detailed optimizer settings. For these features, refer to the [`train_network.py` guide](train_network.md#5-other-features--その他の機能) or the script help (`python hunyuan_image_train_network.py --help`). + +
+日本語 + +`hunyuan_image_train_network.py`には、サンプル画像の生成 (`--sample_prompts`など) や詳細なオプティマイザ設定など、`train_network.py`と共通の機能も多く存在します。これらについては、[`train_network.py`のガイド](train_network.md#5-other-features--その他の機能)やスクリプトのヘルプ (`python hunyuan_image_train_network.py --help`) を参照してください。 + +
From 1a73b5e8a540a2ab91f5eed7379d75a6c93e153c Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 14 Sep 2025 20:49:20 +0900 Subject: [PATCH 559/582] feat: add script to convert LoRA format to ComfyUI format --- .../convert_hunyuan_image_lora_to_comfy.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 networks/convert_hunyuan_image_lora_to_comfy.py diff --git a/networks/convert_hunyuan_image_lora_to_comfy.py b/networks/convert_hunyuan_image_lora_to_comfy.py new file mode 100644 index 000000000..65da2da45 --- /dev/null +++ b/networks/convert_hunyuan_image_lora_to_comfy.py @@ -0,0 +1,68 @@ +import argparse +from safetensors.torch import save_file +from safetensors import safe_open +import torch + + +from library import train_util +from library.utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def main(args): + # load source safetensors + logger.info(f"Loading source file {args.src_path}") + state_dict = {} + with safe_open(args.src_path, framework="pt") as f: + metadata = f.metadata() + for k in f.keys(): + state_dict[k] = f.get_tensor(k) + + logger.info(f"Converting...") + + keys = list(state_dict.keys()) + count = 0 + for k in keys: + if "double_blocks" in k: + new_k = ( + k.replace("img_mlp_fc1", "img_mlp_0").replace("img_mlp_fc2", "img_mlp_2").replace("img_mod_linear", "img_mod_lin") + ) + new_k = ( + new_k.replace("txt_mlp_fc1", "txt_mlp_0") + .replace("txt_mlp_fc2", "txt_mlp_2") + .replace("txt_mod_linear", "txt_mod_lin") + ) + if new_k != k: + state_dict[new_k] = state_dict.pop(k) + count += 1 + # print(f"Renamed {k} to {new_k}") + elif "single_blocks" in k: + new_k = k.replace("modulation_linear", "modulation_lin") + if new_k != k: + state_dict[new_k] = state_dict.pop(k) + count += 1 + # print(f"Renamed {k} to {new_k}") + logger.info(f"Converted {count} keys") + + # Calculate hash + if metadata is not None: + logger.info(f"Calculating hashes and creating metadata...") + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + # save destination safetensors + logger.info(f"Saving destination file {args.dst_path}") + save_file(state_dict, args.dst_path, metadata=metadata) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert LoRA format") + parser.add_argument("src_path", type=str, default=None, help="source path, sd-scripts format") + parser.add_argument("dst_path", type=str, default=None, help="destination path, ComfyUI format") + args = parser.parse_args() + main(args) From 39458ec0e3b938fb5f21c1769a4bde046ed924c9 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:17:21 +0900 Subject: [PATCH 560/582] fix: update default values for guidance_scale, image_size, infer_steps, and flow_shift in argument parser --- hunyuan_image_minimal_inference.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index 7db490cd1..04ab1aac1 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -67,12 +67,12 @@ def parse_args() -> argparse.Namespace: # inference parser.add_argument( - "--guidance_scale", type=float, default=5.0, help="Guidance scale for classifier free guidance. Default is 5.0." 
+ "--guidance_scale", type=float, default=3.5, help="Guidance scale for classifier free guidance. Default is 3.5." ) parser.add_argument("--prompt", type=str, default=None, help="prompt for generation") parser.add_argument("--negative_prompt", type=str, default="", help="negative prompt for generation, default is empty string") - parser.add_argument("--image_size", type=int, nargs=2, default=[256, 256], help="image size, height and width") - parser.add_argument("--infer_steps", type=int, default=25, help="number of inference steps, default is 25") + parser.add_argument("--image_size", type=int, nargs=2, default=[2048, 2048], help="image size, height and width") + parser.add_argument("--infer_steps", type=int, default=50, help="number of inference steps, default is 50") parser.add_argument("--save_path", type=str, required=True, help="path to save generated video") parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.") @@ -80,8 +80,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--flow_shift", type=float, - default=None, - help="Shift factor for flow matching schedulers. Default is None (default).", + default=5.0, + help="Shift factor for flow matching schedulers. Default is 5.0.", ) parser.add_argument("--fp8", action="store_true", help="use fp8 for DiT model") From f318ddaeea7c491c81788c79062530f9d203f9ed Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:18:01 +0900 Subject: [PATCH 561/582] docs: update HunyuanImage-2.1 training guide with model download instructions and VRAM optimization settings (by Claude) --- docs/hunyuan_image_train_network.md | 98 +++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 11 deletions(-) diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index c48148006..c4c93d8d5 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -73,7 +73,13 @@ Before starting training you need: ### Downloading Required Models -You need to download the model files from the official Hugging Face repositories (e.g., `Tencent-Hunyuan/HunyuanDiT`). Ensure you download the `.safetensors` files, not the Diffusers format directories. +To train HunyuanImage-2.1 models, you need to download the following model files: + +- **DiT Model**: Download from the [Tencent HunyuanImage-2.1](https://huggingface.co/tencent/HunyuanImage-2.1/) repository. Use `dit/hunyuanimage2.1.safetensors`. +- **Text Encoders and VAE**: Download from the [Comfy-Org/HunyuanImage_2.1_ComfyUI](https://huggingface.co/Comfy-Org/HunyuanImage_2.1_ComfyUI) repository: + - Qwen2.5-VL: `split_files/text_encoders/qwen_2.5_vl_7b.safetensors` + - byT5: `split_files/text_encoders/byt5_small_glyphxl_fp16.safetensors` + - VAE: `split_files/vae/hunyuan_image_2.1_vae_fp16.safetensors`
日本語 @@ -90,7 +96,13 @@ You need to download the model files from the official Hugging Face repositories **必要なモデルのダウンロード** -公式のHugging Faceリポジトリ(例: `Tencent-Hunyuan/HunyuanDiT`)からモデルファイルをダウンロードする必要があります。Diffusers形式のディレクトリではなく、`.safetensors`形式のファイルをダウンロードしてください。 +HunyuanImage-2.1モデルを学習するためには、以下のモデルファイルをダウンロードする必要があります: + +- **DiTモデル**: [Tencent HunyuanImage-2.1](https://huggingface.co/tencent/HunyuanImage-2.1/) リポジトリから `dit/hunyuanimage2.1.safetensors` をダウンロードします。 +- **Text EncoderとVAE**: [Comfy-Org/HunyuanImage_2.1_ComfyUI](https://huggingface.co/Comfy-Org/HunyuanImage_2.1_ComfyUI) リポジトリから以下をダウンロードします: + - Qwen2.5-VL: `split_files/text_encoders/qwen_2.5_vl_7b.safetensors` + - byT5: `split_files/text_encoders/byt5_small_glyphxl_fp16.safetensors` + - VAE: `split_files/vae/hunyuan_image_2.1_vae_fp16.safetensors`
@@ -164,7 +176,7 @@ The script adds HunyuanImage-2.1 specific arguments. For common arguments (like #### Memory/Speed Related * `--fp8_scaled` - - Enables training the DiT model in scaled FP8 format. This can significantly reduce VRAM usage, but the training results may vary. + - Enables training the DiT model in scaled FP8 format. This can significantly reduce VRAM usage (can run with as little as 8GB VRAM when combined with `--blocks_to_swap`), but the training results may vary. This is a newer alternative to the unsupported `--fp8_base` option. * `--fp8_vl` - Use FP8 for the VLM (Qwen2.5-VL) text encoder. * `--blocks_to_swap=` **[Experimental Feature]** @@ -202,11 +214,22 @@ After training, a LoRA model file is saved in `output_dir` and can be used in in HunyuanImage-2.1 is a large model, so GPUs without sufficient VRAM require optimization. +#### Recommended Settings by GPU Memory + +Based on testing with the pull request, here are recommended VRAM optimization settings: + +| GPU Memory | Recommended Settings | +|------------|---------------------| +| 40GB+ VRAM | Standard settings (no special optimization needed) | +| 24GB VRAM | `--fp8_scaled --blocks_to_swap 9` | +| 12GB VRAM | `--fp8_scaled --blocks_to_swap 32` | +| 8GB VRAM | `--fp8_scaled --blocks_to_swap 37` | + #### Key VRAM Reduction Options -- **`--fp8_scaled`**: Enables training the DiT in scaled FP8 format. -- **`--fp8_vl`**: Use FP8 for the VLM text encoder. -- **`--blocks_to_swap `**: Swaps blocks between CPU and GPU to reduce VRAM usage. Higher numbers save more VRAM but reduce training speed. +- **`--fp8_scaled`**: Enables training the DiT in scaled FP8 format. This is the recommended FP8 option for HunyuanImage-2.1, replacing the unsupported `--fp8_base` option. Essential for <40GB VRAM environments. +- **`--fp8_vl`**: Use FP8 for the VLM (Qwen2.5-VL) text encoder. +- **`--blocks_to_swap `**: Swaps blocks between CPU and GPU to reduce VRAM usage. Higher numbers save more VRAM but reduce training speed. Up to 37 blocks can be swapped for HunyuanImage-2.1. - **`--cpu_offload_checkpointing`**: Offloads gradient checkpoints to CPU. Can reduce VRAM usage but decreases training speed. Cannot be used with `--blocks_to_swap`. - **Using Adafactor optimizer**: Can reduce VRAM usage more than 8bit AdamW: ``` @@ -216,12 +239,23 @@ HunyuanImage-2.1 is a large model, so GPUs without sufficient VRAM require optim
日本語 -HunyuanImage-2.1は大きなモデルであるため、十分なVRAMを持たないGPUでは工夫が必要です。VRAM使用量を削減するための設定の詳細は英語のドキュメントを参照してください。 +HunyuanImage-2.1は大きなモデルであるため、十分なVRAMを持たないGPUでは工夫が必要です。 + +#### GPU別推奨設定 + +Pull Requestのテスト結果に基づく推奨VRAM最適化設定: + +| GPU Memory | 推奨設定 | +|------------|---------| +| 40GB+ VRAM | 標準設定(特別な最適化不要) | +| 24GB VRAM | `--fp8_scaled --blocks_to_swap 9` | +| 12GB VRAM | `--fp8_scaled --blocks_to_swap 32` | +| 8GB VRAM | `--fp8_scaled --blocks_to_swap 37` | 主要なVRAM削減オプション: -- `--fp8_scaled`: DiTをスケールされたFP8形式で学習 +- `--fp8_scaled`: DiTをスケールされたFP8形式で学習(推奨されるFP8オプション、40GB VRAM未満の環境では必須) - `--fp8_vl`: VLMテキストエンコーダにFP8を使用 -- `--blocks_to_swap`: CPUとGPU間でブロックをスワップ +- `--blocks_to_swap`: CPUとGPU間でブロックをスワップ(最大37ブロック) - `--cpu_offload_checkpointing`: 勾配チェックポイントをCPUにオフロード - Adafactorオプティマイザの使用 @@ -383,7 +417,49 @@ You can calculate validation loss during training using a validation dataset to
-## 8. Related Tools / 関連ツール +## 8. Using the Inference Script / 推論スクリプトの使用法 + +The `hunyuan_image_minimal_inference.py` script allows you to generate images using trained LoRA models. Here's a basic usage example: + +```bash +python hunyuan_image_minimal_inference.py \ + --dit "" \ + --text_encoder "" \ + --byt5 "" \ + --vae "" \ + --lora_weight "" \ + --lora_multiplier 1.0 \ + --prompt "A cute cartoon penguin in a snowy landscape" \ + --image_size 2048 2048 \ + --infer_steps 50 \ + --guidance_scale 3.5 \ + --flow_shift 5.0 \ + --seed 542017 \ + --save_path "output_image.png" +``` + +**Key Options:** +- `--fp8_scaled`: Use scaled FP8 format for reduced VRAM usage during inference +- `--blocks_to_swap`: Swap blocks to CPU to reduce VRAM usage +- `--image_size`: Resolution (inference is most stable at 2048x2048) +- `--guidance_scale`: CFG scale (default: 3.5) +- `--flow_shift`: Flow matching shift parameter (default: 5.0) + +
+日本語 + +`hunyuan_image_minimal_inference.py`スクリプトを使用して、学習したLoRAモデルで画像を生成できます。基本的な使用例は英語のドキュメントを参照してください。 + +**主要なオプション:** +- `--fp8_scaled`: VRAM使用量削減のためのスケールFP8形式 +- `--blocks_to_swap`: VRAM使用量削減のためのブロックスワップ +- `--image_size`: 解像度(2048x2048で最も安定) +- `--guidance_scale`: CFGスケール(推奨: 3.5) +- `--flow_shift`: Flow Matchingシフトパラメータ(デフォルト: 5.0) + +
+ +## 9. Related Tools / 関連ツール - **`hunyuan_image_minimal_inference.py`**: Simple inference script for generating images with trained LoRA models. @@ -394,7 +470,7 @@ You can calculate validation loss during training using a validation dataset to
-## 9. Others / その他 +## 10. Others / その他 `hunyuan_image_train_network.py` includes many features common with `train_network.py`, such as sample image generation (`--sample_prompts`, etc.) and detailed optimizer settings. For these features, refer to the [`train_network.py` guide](train_network.md#5-other-features--その他の機能) or the script help (`python hunyuan_image_train_network.py --help`). From cbe2a9da45fd99068faf1511b7f6bf1e641dd6d4 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:48:47 +0900 Subject: [PATCH 562/582] feat: add conversion script for LoRA models to ComfyUI format with reverse option --- docs/hunyuan_image_train_network.md | 20 ++++++- .../convert_hunyuan_image_lora_to_comfy.py | 54 +++++++++++++------ 2 files changed, 55 insertions(+), 19 deletions(-) diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index c4c93d8d5..3d49fbdfb 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -461,12 +461,28 @@ python hunyuan_image_minimal_inference.py \ ## 9. Related Tools / 関連ツール -- **`hunyuan_image_minimal_inference.py`**: Simple inference script for generating images with trained LoRA models. +### `networks/convert_hunyuan_image_lora_to_comfy.py` + +A script to convert LoRA models to ComfyUI-compatible format. The formats differ slightly, so conversion is necessary. You can convert from the sd-scripts format to ComfyUI format with: + +```bash +python networks/convert_hunyuan_image_lora_to_comfy.py path/to/source.safetensors path/to/destination.safetensors +``` + +Using the `--reverse` option allows conversion in the opposite direction (ComfyUI format to sd-scripts format). However, reverse conversion is only possible for LoRAs converted by this script. LoRAs created with other training tools cannot be converted.
日本語 -- **`hunyuan_image_minimal_inference.py`**: 学習した LoRA モデルを適用して画像を生成するシンプルな推論スクリプト。 +**`networks/convert_hunyuan_image_lora_to_comfy.py`** + +LoRAモデルをComfyUI互換形式に変換するスクリプト。わずかに形式が異なるため、変換が必要です。以下の指定で、sd-scriptsの形式からComfyUI形式に変換できます。 + +```bash +python networks/convert_hunyuan_image_lora_to_comfy.py path/to/source.safetensors path/to/destination.safetensors +``` + +`--reverse`オプションを付けると、逆変換(ComfyUI形式からsd-scripts形式)も可能です。ただし、逆変換ができるのはこのスクリプトで変換したLoRAに限ります。他の学習ツールで作成したLoRAは変換できません。
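As a concrete illustration, a round trip through the converter might look like this (file names are placeholders):

```bash
# sd-scripts format -> ComfyUI format
python networks/convert_hunyuan_image_lora_to_comfy.py my_hunyuan_lora.safetensors my_hunyuan_lora_comfy.safetensors

# ComfyUI format -> sd-scripts format (only for LoRAs previously converted by this script)
python networks/convert_hunyuan_image_lora_to_comfy.py --reverse my_hunyuan_lora_comfy.safetensors my_hunyuan_lora_back.safetensors
```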
diff --git a/networks/convert_hunyuan_image_lora_to_comfy.py b/networks/convert_hunyuan_image_lora_to_comfy.py index 65da2da45..df12897df 100644 --- a/networks/convert_hunyuan_image_lora_to_comfy.py +++ b/networks/convert_hunyuan_image_lora_to_comfy.py @@ -24,28 +24,47 @@ def main(args): logger.info(f"Converting...") + # Key mapping tables: (sd-scripts format, ComfyUI format) + double_blocks_mappings = [ + ("img_mlp_fc1", "img_mlp_0"), + ("img_mlp_fc2", "img_mlp_2"), + ("img_mod_linear", "img_mod_lin"), + ("txt_mlp_fc1", "txt_mlp_0"), + ("txt_mlp_fc2", "txt_mlp_2"), + ("txt_mod_linear", "txt_mod_lin"), + ] + + single_blocks_mappings = [ + ("modulation_linear", "modulation_lin"), + ] + keys = list(state_dict.keys()) count = 0 + for k in keys: + new_k = k + if "double_blocks" in k: - new_k = ( - k.replace("img_mlp_fc1", "img_mlp_0").replace("img_mlp_fc2", "img_mlp_2").replace("img_mod_linear", "img_mod_lin") - ) - new_k = ( - new_k.replace("txt_mlp_fc1", "txt_mlp_0") - .replace("txt_mlp_fc2", "txt_mlp_2") - .replace("txt_mod_linear", "txt_mod_lin") - ) - if new_k != k: - state_dict[new_k] = state_dict.pop(k) - count += 1 - # print(f"Renamed {k} to {new_k}") + mappings = double_blocks_mappings elif "single_blocks" in k: - new_k = k.replace("modulation_linear", "modulation_lin") - if new_k != k: - state_dict[new_k] = state_dict.pop(k) - count += 1 - # print(f"Renamed {k} to {new_k}") + mappings = single_blocks_mappings + else: + continue + + # Apply mappings based on conversion direction + for src_key, dst_key in mappings: + if args.reverse: + # ComfyUI to sd-scripts: swap src and dst + new_k = new_k.replace(dst_key, src_key) + else: + # sd-scripts to ComfyUI: use as-is + new_k = new_k.replace(src_key, dst_key) + + if new_k != k: + state_dict[new_k] = state_dict.pop(k) + count += 1 + # print(f"Renamed {k} to {new_k}") + logger.info(f"Converted {count} keys") # Calculate hash @@ -64,5 +83,6 @@ def main(args): parser = argparse.ArgumentParser(description="Convert LoRA format") parser.add_argument("src_path", type=str, default=None, help="source path, sd-scripts format") parser.add_argument("dst_path", type=str, default=None, help="destination path, ComfyUI format") + parser.add_argument("--reverse", action="store_true", help="reverse conversion direction") args = parser.parse_args() main(args) From f5b004009e8f4fe7dd5bedb5f35c795868d41a8d Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Wed, 17 Sep 2025 21:54:25 +0900 Subject: [PATCH 563/582] fix: correct tensor indexing in HunyuanVAE2D class for blending and encoding functions --- library/hunyuan_image_vae.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/library/hunyuan_image_vae.py b/library/hunyuan_image_vae.py index 6f6eea22d..b66854e5e 100644 --- a/library/hunyuan_image_vae.py +++ b/library/hunyuan_image_vae.py @@ -449,7 +449,7 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch. """ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent) for x in range(blend_extent): - b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent) + b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) return b def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: @@ -467,7 +467,7 @@ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch. 
""" blend_extent = min(a.shape[-2], b.shape[-2], blend_extent) for y in range(blend_extent): - b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent) + b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) return b def spatial_tiled_encode(self, x: torch.Tensor) -> torch.Tensor: @@ -478,9 +478,14 @@ def spatial_tiled_encode(self, x: torch.Tensor) -> torch.Tensor: Parameters ---------- x : torch.Tensor - Input tensor of shape (B, C, T, H, W). + Input tensor of shape (B, C, T, H, W) or (B, C, H, W). """ - B, C, T, H, W = x.shape + # Handle 5D input (B, C, T, H, W) by removing time dimension + original_ndim = x.ndim + if original_ndim == 5: + x = x.squeeze(2) + + B, C, H, W = x.shape overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) row_limit = self.tile_latent_min_size - blend_extent @@ -489,7 +494,7 @@ def spatial_tiled_encode(self, x: torch.Tensor) -> torch.Tensor: for i in range(0, H, overlap_size): row = [] for j in range(0, W, overlap_size): - tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] + tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] tile = self.encoder(tile) row.append(tile) rows.append(row) @@ -502,7 +507,7 @@ def spatial_tiled_encode(self, x: torch.Tensor) -> torch.Tensor: tile = self.blend_v(rows[i - 1][j], tile, blend_extent) if j > 0: tile = self.blend_h(row[j - 1], tile, blend_extent) - result_row.append(tile[:, :, :, :row_limit, :row_limit]) + result_row.append(tile[:, :, :row_limit, :row_limit]) result_rows.append(torch.cat(result_row, dim=-1)) moments = torch.cat(result_rows, dim=-2) From 2ce506e187cb30f6e8abfda6bf89719aded06d88 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 18 Sep 2025 21:20:08 +0900 Subject: [PATCH 564/582] fix: fp8 casting not working --- hunyuan_image_minimal_inference.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index 04ab1aac1..00356a37d 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -284,7 +284,7 @@ def load_dit_model( # if no blocks to swap, we can move the weights to GPU after optimization on GPU (omit redundant CPU->GPU copy) move_to_device = args.blocks_to_swap == 0 # if blocks_to_swap > 0, we will keep the model on CPU - state_dict = model.fp8_optimization(state_dict, device, move_to_device, use_scaled_mm=args.fp8_fast) + state_dict = model.fp8_optimization(state_dict, device, move_to_device, use_scaled_mm=False) # args.fp8_fast) info = model.load_state_dict(state_dict, strict=True, assign=True) logger.info(f"Loaded FP8 optimized weights: {info}") @@ -689,15 +689,18 @@ def generate_body( # print(f"mask_byt5 shape: {mask_byt5.shape}, sum: {mask_byt5.sum()}") # print(f"negative_mask shape: {negative_mask.shape}, sum: {negative_mask.sum()}") # print(f"negative_mask_byt5 shape: {negative_mask_byt5.shape}, sum: {negative_mask_byt5.sum()}") + + autocast_enabled = args.fp8 + with tqdm(total=len(timesteps), desc="Denoising steps") as pbar: for i, t in enumerate(timesteps): t_expand = t.expand(latents.shape[0]).to(torch.int64) - with torch.no_grad(): + with torch.no_grad(), torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=autocast_enabled): 
noise_pred = model(latents, t_expand, embed, mask, embed_byt5, mask_byt5) if do_cfg: - with torch.no_grad(): + with torch.no_grad(), torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=autocast_enabled): uncond_noise_pred = model( latents, t_expand, negative_embed, negative_mask, negative_embed_byt5, negative_mask_byt5 ) From f6b4bdc83fc2c290db4788ac0062f2728fb1e618 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 18 Sep 2025 21:20:54 +0900 Subject: [PATCH 565/582] feat: block-wise fp8 quantization --- library/fp8_optimization_utils.py | 237 ++++++++++++++++++++---------- library/hunyuan_image_models.py | 7 +- library/hunyuan_image_modules.py | 6 +- library/lora_utils.py | 30 ++-- 4 files changed, 182 insertions(+), 98 deletions(-) diff --git a/library/fp8_optimization_utils.py b/library/fp8_optimization_utils.py index ed7d3f764..82ec6bfc7 100644 --- a/library/fp8_optimization_utils.py +++ b/library/fp8_optimization_utils.py @@ -1,5 +1,5 @@ import os -from typing import List, Union +from typing import List, Optional, Union import torch import torch.nn as nn import torch.nn.functional as F @@ -21,7 +21,7 @@ def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1): """ Calculate the maximum representable value in FP8 format. - Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign). + Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign). Only supports E4M3 and E5M2 with sign bit. Args: exp_bits (int): Number of exponent bits @@ -32,73 +32,73 @@ def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1): float: Maximum value representable in FP8 format """ assert exp_bits + mantissa_bits + sign_bits == 8, "Total bits must be 8" + if exp_bits == 4 and mantissa_bits == 3 and sign_bits == 1: + return torch.finfo(torch.float8_e4m3fn).max + elif exp_bits == 5 and mantissa_bits == 2 and sign_bits == 1: + return torch.finfo(torch.float8_e5m2).max + else: + raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits} with sign_bits={sign_bits}") + - # Calculate exponent bias - bias = 2 ** (exp_bits - 1) - 1 +# The following is a manual calculation method (wrong implementation for E5M2), kept for reference. +""" +# Calculate exponent bias +bias = 2 ** (exp_bits - 1) - 1 - # Calculate maximum mantissa value - mantissa_max = 1.0 - for i in range(mantissa_bits - 1): - mantissa_max += 2 ** -(i + 1) +# Calculate maximum mantissa value +mantissa_max = 1.0 +for i in range(mantissa_bits - 1): + mantissa_max += 2 ** -(i + 1) - # Calculate maximum value - max_value = mantissa_max * (2 ** (2**exp_bits - 1 - bias)) +# Calculate maximum value +max_value = mantissa_max * (2 ** (2**exp_bits - 1 - bias)) - return max_value +return max_value +""" -def quantize_tensor_to_fp8(tensor, scale, exp_bits=4, mantissa_bits=3, sign_bits=1, max_value=None, min_value=None): +def quantize_fp8(tensor, scale, fp8_dtype, max_value, min_value): """ - Quantize a tensor to FP8 format. + Quantize a tensor to FP8 format using PyTorch's native FP8 dtype support. 
Args: tensor (torch.Tensor): Tensor to quantize scale (float or torch.Tensor): Scale factor - exp_bits (int): Number of exponent bits - mantissa_bits (int): Number of mantissa bits - sign_bits (int): Number of sign bits + fp8_dtype (torch.dtype): Target FP8 dtype (torch.float8_e4m3fn or torch.float8_e5m2) + max_value (float): Maximum representable value in FP8 + min_value (float): Minimum representable value in FP8 Returns: - tuple: (quantized_tensor, scale_factor) + torch.Tensor: Quantized tensor in FP8 format """ - # Create scaled tensor - scaled_tensor = tensor / scale + tensor = tensor.to(torch.float32) # ensure tensor is in float32 for division - # Calculate FP8 parameters - bias = 2 ** (exp_bits - 1) - 1 - - if max_value is None: - # Calculate max and min values - max_value = calculate_fp8_maxval(exp_bits, mantissa_bits, sign_bits) - min_value = -max_value if sign_bits > 0 else 0.0 + # Create scaled tensor + tensor = torch.div(tensor, scale).nan_to_num_(0.0) # handle NaN values, equivalent to nonzero_mask in previous function # Clamp tensor to range - clamped_tensor = torch.clamp(scaled_tensor, min_value, max_value) - - # Quantization process - abs_values = torch.abs(clamped_tensor) - nonzero_mask = abs_values > 0 - - # Calculate log scales (only for non-zero elements) - log_scales = torch.zeros_like(clamped_tensor) - if nonzero_mask.any(): - log_scales[nonzero_mask] = torch.floor(torch.log2(abs_values[nonzero_mask]) + bias).detach() + tensor = tensor.clamp_(min=min_value, max=max_value) - # Limit log scales and calculate quantization factor - log_scales = torch.clamp(log_scales, min=1.0) - quant_factor = 2.0 ** (log_scales - mantissa_bits - bias) + # Convert to FP8 dtype + tensor = tensor.to(fp8_dtype) - # Quantize and dequantize - quantized = torch.round(clamped_tensor / quant_factor) * quant_factor - - return quantized, scale + return tensor def optimize_state_dict_with_fp8( - state_dict, calc_device, target_layer_keys=None, exclude_layer_keys=None, exp_bits=4, mantissa_bits=3, move_to_device=False + state_dict: dict, + calc_device: Union[str, torch.device], + target_layer_keys: Optional[list[str]] = None, + exclude_layer_keys: Optional[list[str]] = None, + exp_bits: int = 4, + mantissa_bits: int = 3, + move_to_device: bool = False, + quantization_mode: str = "block", + block_size: Optional[int] = 64, ): """ - Optimize Linear layer weights in a model's state dict to FP8 format. + Optimize Linear layer weights in a model's state dict to FP8 format. The state dict is modified in-place. + This function is a static version of load_safetensors_with_fp8_optimization without loading from files. 
Args: state_dict (dict): State dict to optimize, replaced in-place @@ -149,23 +149,17 @@ def optimize_state_dict_with_fp8( if calc_device is not None: value = value.to(calc_device) - # Calculate scale factor - scale = torch.max(torch.abs(value.flatten())) / max_value - # print(f"Optimizing {key} with scale: {scale}") - - # Quantize weight to FP8 - quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value) + quantized_weight, scale_tensor = quantize_weight(key, value, fp8_dtype, max_value, min_value, quantization_mode, block_size) # Add to state dict using original key for weight and new key for scale fp8_key = key # Maintain original key scale_key = key.replace(".weight", ".scale_weight") - quantized_weight = quantized_weight.to(fp8_dtype) - if not move_to_device: quantized_weight = quantized_weight.to(original_device) - scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device) + # keep scale shape: [1] or [out,1] or [out, num_blocks, 1]. We can determine the quantization mode from the shape of scale_weight in the patched model. + scale_tensor = scale_tensor.to(dtype=original_dtype, device=quantized_weight.device) state_dict[fp8_key] = quantized_weight state_dict[scale_key] = scale_tensor @@ -180,6 +174,70 @@ def optimize_state_dict_with_fp8( return state_dict +def quantize_weight( + key: str, + tensor: torch.Tensor, + fp8_dtype: torch.dtype, + max_value: float, + min_value: float, + quantization_mode: str = "block", + block_size: int = 64, +): + original_shape = tensor.shape + + # Determine quantization mode + if quantization_mode == "block": + if tensor.ndim != 2: + quantization_mode = "tensor" # fallback to per-tensor + else: + out_features, in_features = tensor.shape + if in_features % block_size != 0: + quantization_mode = "channel" # fallback to per-channel + logger.warning( + f"Layer {key} with shape {tensor.shape} is not divisible by block_size {block_size}, fallback to per-channel quantization." 
+ ) + else: + num_blocks = in_features // block_size + tensor = tensor.contiguous().view(out_features, num_blocks, block_size) # [out, num_blocks, block_size] + elif quantization_mode == "channel": + if tensor.ndim != 2: + quantization_mode = "tensor" # fallback to per-tensor + + # Calculate scale factor (per-tensor or per-output-channel with percentile or max) + # value shape is expected to be [out_features, in_features] for Linear weights + if quantization_mode == "channel" or quantization_mode == "block": + # row-wise percentile to avoid being dominated by outliers + # result shape: [out_features, 1] or [out_features, num_blocks, 1] + scale_dim = 1 if quantization_mode == "channel" else 2 + abs_w = torch.abs(tensor) + + # shape: [out_features, 1] or [out_features, num_blocks, 1] + row_max = torch.max(abs_w, dim=scale_dim, keepdim=True).values + scale = row_max / max_value + + else: + # per-tensor + tensor_max = torch.max(torch.abs(tensor).view(-1)) + scale = tensor_max / max_value + + # Calculate scale factor + scale = torch.max(torch.abs(tensor.flatten())) / max_value + # print(f"Optimizing {key} with scale: {scale}") + + # numerical safety + scale = torch.clamp(scale, min=1e-8) + scale = scale.to(torch.float32) # ensure scale is in float32 for division + + # Quantize weight to FP8 (scale can be scalar or [out,1], broadcasting works) + quantized_weight = quantize_fp8(tensor, scale, fp8_dtype, max_value, min_value) + + # If block-wise, restore original shape + if quantization_mode == "block": + quantized_weight = quantized_weight.view(original_shape) # restore to original shape [out, in] + + return quantized_weight, scale + + def load_safetensors_with_fp8_optimization( model_files: List[str], calc_device: Union[str, torch.device], @@ -189,7 +247,9 @@ def load_safetensors_with_fp8_optimization( mantissa_bits=3, move_to_device=False, weight_hook=None, -): + quantization_mode: str = "block", + block_size: Optional[int] = 64, +) -> dict: """ Load weight tensors from safetensors files and merge LoRA weights into the state dict with explicit FP8 optimization. 
@@ -202,6 +262,8 @@ def load_safetensors_with_fp8_optimization( mantissa_bits (int): Number of mantissa bits move_to_device (bool): Move optimized tensors to the calculating device weight_hook (callable, optional): Function to apply to each weight tensor before optimization + quantization_mode (str): Quantization mode, "tensor", "channel", or "block" + block_size (int, optional): Block size for block-wise quantization (used if quantization_mode is "block") Returns: dict: FP8 optimized state dict @@ -234,40 +296,39 @@ def is_target_key(key): keys = f.keys() for key in tqdm(keys, desc=f"Loading {os.path.basename(model_file)}", unit="key"): value = f.get_tensor(key) + + # Save original device + original_device = value.device # usually cpu + if weight_hook is not None: # Apply weight hook if provided - value = weight_hook(key, value) + value = weight_hook(key, value, keep_on_calc_device=(calc_device is not None)) if not is_target_key(key): + target_device = calc_device if (calc_device is not None and move_to_device) else original_device + value = value.to(target_device) state_dict[key] = value continue - # Save original device and dtype - original_device = value.device - original_dtype = value.dtype - # Move to calculation device if calc_device is not None: value = value.to(calc_device) - # Calculate scale factor - scale = torch.max(torch.abs(value.flatten())) / max_value - # print(f"Optimizing {key} with scale: {scale}") - - # Quantize weight to FP8 - quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value) + original_dtype = value.dtype + quantized_weight, scale_tensor = quantize_weight( + key, value, fp8_dtype, max_value, min_value, quantization_mode, block_size + ) # Add to state dict using original key for weight and new key for scale fp8_key = key # Maintain original key scale_key = key.replace(".weight", ".scale_weight") assert fp8_key != scale_key, "FP8 key and scale key must be different" - quantized_weight = quantized_weight.to(fp8_dtype) - if not move_to_device: quantized_weight = quantized_weight.to(original_device) - scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device) + # keep scale shape: [1] or [out,1] or [out, num_blocks, 1]. We can determine the quantization mode from the shape of scale_weight in the patched model. 
+ scale_tensor = scale_tensor.to(dtype=original_dtype, device=quantized_weight.device) state_dict[fp8_key] = quantized_weight state_dict[scale_key] = scale_tensor @@ -296,12 +357,15 @@ def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value= torch.Tensor: Result of linear transformation """ if use_scaled_mm: + # **not tested** + # _scaled_mm only works for per-tensor scale for now (per-channel scale does not work in certain cases) + if self.scale_weight.ndim != 1: + raise ValueError("scaled_mm only supports per-tensor scale_weight for now.") + input_dtype = x.dtype original_weight_dtype = self.scale_weight.dtype - weight_dtype = self.weight.dtype - target_dtype = torch.float8_e5m2 - assert weight_dtype == torch.float8_e4m3fn, "Only FP8 E4M3FN format is supported" - assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)" + target_dtype = self.weight.dtype + # assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)" if max_value is None: # no input quantization @@ -311,10 +375,12 @@ def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value= scale_x = (torch.max(torch.abs(x.flatten())) / max_value).to(torch.float32) # quantize input tensor to FP8: this seems to consume a lot of memory - x, _ = quantize_tensor_to_fp8(x, scale_x, 5, 2, 1, max_value, -max_value) + fp8_max_value = torch.finfo(target_dtype).max + fp8_min_value = torch.finfo(target_dtype).min + x = quantize_fp8(x, scale_x, target_dtype, fp8_max_value, fp8_min_value) original_shape = x.shape - x = x.reshape(-1, x.shape[2]).to(target_dtype) + x = x.reshape(-1, x.shape[-1]).to(target_dtype) weight = self.weight.t() scale_weight = self.scale_weight.to(torch.float32) @@ -325,12 +391,21 @@ def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value= else: o = torch._scaled_mm(x, weight, out_dtype=input_dtype, scale_a=scale_x, scale_b=scale_weight) - return o.reshape(original_shape[0], original_shape[1], -1).to(input_dtype) + o = o.reshape(original_shape[0], original_shape[1], -1) if x.ndim == 3 else o.reshape(original_shape[0], -1) + return o.to(input_dtype) else: # Dequantize the weight original_dtype = self.scale_weight.dtype - dequantized_weight = self.weight.to(original_dtype) * self.scale_weight + if self.scale_weight.ndim < 3: + # per-tensor or per-channel quantization, we can broadcast + dequantized_weight = self.weight.to(original_dtype) * self.scale_weight + else: + # block-wise quantization, need to reshape weight to match scale shape for broadcasting + out_features, num_blocks, _ = self.scale_weight.shape + dequantized_weight = self.weight.to(original_dtype).contiguous().view(out_features, num_blocks, -1) + dequantized_weight = dequantized_weight * self.scale_weight + dequantized_weight = dequantized_weight.view(self.weight.shape) # Perform linear transformation if self.bias is not None: @@ -362,11 +437,15 @@ def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False): # Enumerate patched layers patched_module_paths = set() + scale_shape_info = {} for scale_key in scale_keys: # Extract module path from scale key (remove .scale_weight) module_path = scale_key.rsplit(".scale_weight", 1)[0] patched_module_paths.add(module_path) + # Store scale shape information + scale_shape_info[module_path] = optimized_state_dict[scale_key].shape + patched_count = 0 # Apply monkey patch to each layer with FP8 weights @@ -377,7 +456,9 @@ def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False): # 
Apply patch if it's a Linear layer with FP8 scale if isinstance(module, nn.Linear) and has_scale: # register the scale_weight as a buffer to load the state_dict - module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype)) + # module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype)) + scale_shape = scale_shape_info[name] + module.register_buffer("scale_weight", torch.ones(scale_shape, dtype=module.weight.dtype)) # Create a new forward method with the patched version. def new_forward(self, x): diff --git a/library/hunyuan_image_models.py b/library/hunyuan_image_models.py index 2a6092ea3..356ce4b42 100644 --- a/library/hunyuan_image_models.py +++ b/library/hunyuan_image_models.py @@ -30,7 +30,12 @@ from library.hunyuan_image_utils import get_nd_rotary_pos_embed FP8_OPTIMIZATION_TARGET_KEYS = ["double_blocks", "single_blocks"] -FP8_OPTIMIZATION_EXCLUDE_KEYS = ["norm", "_mod", "modulation", "_emb"] +# FP8_OPTIMIZATION_EXCLUDE_KEYS = ["norm", "_mod", "_emb"] # , "modulation" +FP8_OPTIMIZATION_EXCLUDE_KEYS = ["norm", "_emb"] # , "modulation", "_mod" + +# full exclude 24.2GB +# norm and _emb 19.7GB +# fp8 cast 19.7GB # region DiT Model diff --git a/library/hunyuan_image_modules.py b/library/hunyuan_image_modules.py index ef4d5e5d7..555cb4871 100644 --- a/library/hunyuan_image_modules.py +++ b/library/hunyuan_image_modules.py @@ -497,7 +497,9 @@ def forward(self, x): """ output = self._norm(x.float()).type_as(x) del x - output = output * self.weight + # output = output * self.weight + # fp8 support + output = output * self.weight.to(output.dtype) return output @@ -689,7 +691,7 @@ def _forward( del qkv # Split attention outputs back to separate streams - img_attn, txt_attn = (attn[:, : img_seq_len].contiguous(), attn[:, img_seq_len :].contiguous()) + img_attn, txt_attn = (attn[:, :img_seq_len].contiguous(), attn[:, img_seq_len:].contiguous()) del attn # Apply attention projection and residual connection for image stream diff --git a/library/lora_utils.py b/library/lora_utils.py index b93eb9af3..6f0fc2285 100644 --- a/library/lora_utils.py +++ b/library/lora_utils.py @@ -1,12 +1,8 @@ -# copy from Musubi Tuner - import os import re from typing import Dict, List, Optional, Union import torch - from tqdm import tqdm - from library.device_utils import synchronize_device from library.fp8_optimization_utils import load_safetensors_with_fp8_optimization from library.safetensors_utils import MemoryEfficientSafeOpen @@ -84,7 +80,7 @@ def load_safetensors_with_lora_and_fp8( count = int(match.group(3)) state_dict = {} for i in range(count): - filename = f"{prefix}{i+1:05d}-of-{count:05d}.safetensors" + filename = f"{prefix}{i + 1:05d}-of-{count:05d}.safetensors" filepath = os.path.join(os.path.dirname(model_file), filename) if os.path.exists(filepath): extended_model_files.append(filepath) @@ -118,7 +114,7 @@ def load_safetensors_with_lora_and_fp8( logger.info(f"Merging LoRA weights into state dict. 
multipliers: {lora_multipliers}") # make hook for LoRA merging - def weight_hook_func(model_weight_key, model_weight): + def weight_hook_func(model_weight_key, model_weight, keep_on_calc_device=False): nonlocal list_of_lora_weight_keys, lora_weights_list, lora_multipliers, calc_device if not model_weight_key.endswith(".weight"): @@ -176,7 +172,8 @@ def weight_hook_func(model_weight_key, model_weight): if alpha_key in lora_weight_keys: lora_weight_keys.remove(alpha_key) - model_weight = model_weight.to(original_device) # move back to original device + if not keep_on_calc_device and original_device != calc_device: + model_weight = model_weight.to(original_device) # move back to original device return model_weight weight_hook = weight_hook_func @@ -231,19 +228,18 @@ def load_safetensors_with_fp8_optimization_and_hook( for model_file in model_files: with MemoryEfficientSafeOpen(model_file) as f: for key in tqdm(f.keys(), desc=f"Loading {os.path.basename(model_file)}", leave=False): - value = f.get_tensor(key) - if weight_hook is not None: - value = weight_hook(key, value) - if move_to_device: - if dit_weight_dtype is None: - value = value.to(calc_device, non_blocking=True) - else: + if weight_hook is None and move_to_device: + value = f.get_tensor(key, device=calc_device, dtype=dit_weight_dtype) + else: + value = f.get_tensor(key) # we cannot directly load to device because get_tensor does non-blocking transfer + if weight_hook is not None: + value = weight_hook(key, value, keep_on_calc_device=move_to_device) + if move_to_device: value = value.to(calc_device, dtype=dit_weight_dtype, non_blocking=True) - elif dit_weight_dtype is not None: - value = value.to(dit_weight_dtype) + elif dit_weight_dtype is not None: + value = value.to(dit_weight_dtype) state_dict[key] = value - if move_to_device: synchronize_device(calc_device) From f834b2e0d46c2b3629fe41d186ec15caf11118be Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Thu, 18 Sep 2025 23:46:18 +0900 Subject: [PATCH 566/582] fix: --fp8_vl to work --- hunyuan_image_train_network.py | 2 +- library/hunyuan_image_text_encoder.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index a3c0cd898..60aa2178f 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -250,7 +250,7 @@ def encode_prompt(prpt): arg_c_null = None gen_args = SimpleNamespace( - image_size=(height, width), infer_steps=sample_steps, flow_shift=flow_shift, guidance_scale=cfg_scale + image_size=(height, width), infer_steps=sample_steps, flow_shift=flow_shift, guidance_scale=cfg_scale, fp8=args.fp8_scaled ) from hunyuan_image_minimal_inference import generate_body # import here to avoid circular import diff --git a/library/hunyuan_image_text_encoder.py b/library/hunyuan_image_text_encoder.py index 509f9bd2f..2171b4101 100644 --- a/library/hunyuan_image_text_encoder.py +++ b/library/hunyuan_image_text_encoder.py @@ -15,7 +15,7 @@ from accelerate import init_empty_weights from library.safetensors_utils import load_safetensors -from library.utils import setup_logging +from library.utils import setup_logging setup_logging() import logging @@ -542,7 +542,6 @@ def get_qwen_prompt_embeds_from_tokens( attention_mask = attention_mask.to(device=device) if dtype.itemsize == 1: # fp8 - # TODO dtype should be vlm.dtype? 
with torch.no_grad(), torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=True): encoder_hidden_states = vlm(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) else: @@ -564,7 +563,7 @@ def get_qwen_prompt_embeds_from_tokens( prompt_embeds = hidden_states[:, drop_idx:, :] encoder_attention_mask = attention_mask[:, drop_idx:] - prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + prompt_embeds = prompt_embeds.to(device=device) return prompt_embeds, encoder_attention_mask From b090d15f7d72324ba81575cb453002a935f5bcce Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 20 Sep 2025 19:45:33 +0900 Subject: [PATCH 567/582] feat: add multi backend attention and related update for HI2.1 models and scripts --- docs/hunyuan_image_train_network.md | 12 +- hunyuan_image_minimal_inference.py | 7 +- hunyuan_image_train_network.py | 23 ++- library/attention.py | 248 +++++++++++++++++++++++----- library/hunyuan_image_models.py | 27 ++- library/hunyuan_image_modules.py | 75 ++++----- 6 files changed, 288 insertions(+), 104 deletions(-) diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index 3d49fbdfb..667b4fec1 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -126,7 +126,8 @@ accelerate launch --num_cpu_threads_per_process 1 hunyuan_image_train_network.py --learning_rate=1e-4 \ --optimizer_type="AdamW8bit" \ --lr_scheduler="constant" \ - --sdpa \ + --attn_mode="torch" \ + --split_attn \ --max_train_epochs=10 \ --save_every_n_epochs=1 \ --mixed_precision="bf16" \ @@ -175,6 +176,10 @@ The script adds HunyuanImage-2.1 specific arguments. For common arguments (like #### Memory/Speed Related +* `--attn_mode=` + - Specifies the attention implementation to use. Options are `torch`, `xformers`, `flash`, `sageattn`. Default is `torch` (use scaled dot product attention). Each library must be installed separately other than `torch`. If using `xformers`, also specify `--split_attn` if the batch size is more than 1. +* `--split_attn` + - Splits the batch during attention computation to process one item at a time, reducing VRAM usage by avoiding attention mask computation. Can improve speed when using `torch`. Required when using `xformers` with batch size greater than 1. * `--fp8_scaled` - Enables training the DiT model in scaled FP8 format. This can significantly reduce VRAM usage (can run with as little as 8GB VRAM when combined with `--blocks_to_swap`), but the training results may vary. This is a newer alternative to the unsupported `--fp8_base` option. * `--fp8_vl` @@ -429,6 +434,7 @@ python hunyuan_image_minimal_inference.py \ --vae "" \ --lora_weight "" \ --lora_multiplier 1.0 \ + --attn_mode "torch" \ --prompt "A cute cartoon penguin in a snowy landscape" \ --image_size 2048 2048 \ --infer_steps 50 \ @@ -445,6 +451,8 @@ python hunyuan_image_minimal_inference.py \ - `--guidance_scale`: CFG scale (default: 3.5) - `--flow_shift`: Flow matching shift parameter (default: 5.0) +`--split_attn` is not supported (since inference is done one at a time). +
日本語 @@ -457,6 +465,8 @@ python hunyuan_image_minimal_inference.py \ - `--guidance_scale`: CFGスケール(推奨: 3.5) - `--flow_shift`: Flow Matchingシフトパラメータ(デフォルト: 5.0) +`--split_attn`はサポートされていません(1件ずつ推論するため)。 +
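The following is a minimal sketch of the idea behind `--split_attn`: each sample in the batch is attended over only its valid tokens, one at a time, so no batched attention mask has to be materialized. The function name, the `[B, H, L, D]` layout and the padding step are illustrative assumptions, not the trainer's actual API.

```python
# Minimal sketch of split attention (illustrative; not the actual library code).
# q, k, v: [B, H, L, D]; seq_lens[i] is the number of valid tokens for sample i.
import torch
import torch.nn.functional as F

def split_attention(q, k, v, seq_lens, max_len):
    outs = []
    for i, n in enumerate(seq_lens):
        # attend over the valid tokens of one sample only -> no [B, 1, L, L] mask needed
        o = F.scaled_dot_product_attention(q[i : i + 1, :, :n], k[i : i + 1, :, :n], v[i : i + 1, :, :n])
        outs.append(F.pad(o, (0, 0, 0, max_len - n)))  # pad the sequence dim back to max_len
    return torch.cat(outs, dim=0)
```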
## 9. Related Tools / 関連ツール diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index 00356a37d..850233837 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -96,7 +96,7 @@ def parse_args() -> argparse.Namespace: "--attn_mode", type=str, default="torch", - choices=["flash", "torch", "sageattn", "xformers", "sdpa"], # "flash2", "flash3", + choices=["flash", "torch", "sageattn", "xformers", "sdpa"], # "sdpa" for backward compatibility help="attention mode", ) parser.add_argument("--blocks_to_swap", type=int, default=0, help="number of blocks to swap in the model") @@ -130,6 +130,9 @@ def parse_args() -> argparse.Namespace: if args.lycoris and not lycoris_available: raise ValueError("install lycoris: https://github.com/KohakuBlueleaf/LyCORIS") + if args.attn_mode == "sdpa": + args.attn_mode = "torch" # backward compatibility + return args @@ -265,7 +268,7 @@ def load_dit_model( device, args.dit, args.attn_mode, - False, + True, # enable split_attn to trim masked tokens loading_device, loading_weight_dtype, args.fp8_scaled and not args.lycoris, diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index 60aa2178f..6b102a9a3 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -379,18 +379,19 @@ def load_unet_lazily(self, args, weight_dtype, accelerator, text_encoders) -> tu loading_dtype = None if args.fp8_scaled else weight_dtype loading_device = "cpu" if self.is_swapping_blocks else accelerator.device - split_attn = True attn_mode = "torch" if args.xformers: attn_mode = "xformers" - logger.info("xformers is enabled for attention") + if args.attn_mode is not None: + attn_mode = args.attn_mode + logger.info(f"Loading DiT model with attn_mode: {attn_mode}, split_attn: {args.split_attn}, fp8_scaled: {args.fp8_scaled}") model = hunyuan_image_models.load_hunyuan_image_model( accelerator.device, args.pretrained_model_name_or_path, attn_mode, - split_attn, + args.split_attn, loading_device, loading_dtype, args.fp8_scaled, @@ -674,6 +675,19 @@ def setup_parser() -> argparse.ArgumentParser: help="Enable tiling for VAE decoding and encoding / VAEデコーディングとエンコーディングのタイルを有効にする", ) + parser.add_argument( + "--attn_mode", + choices=["torch", "xformers", "flash", "sageattn", "sdpa"], # "sdpa" is for backward compatibility + default=None, + help="Attention implementation to use. Default is None (torch). xformers requires --split_attn. sageattn does not support training (inference only). This option overrides --xformers or --sdpa." 
+ " / 使用するAttentionの実装。デフォルトはNone(torch)です。xformersは--split_attnの指定が必要です。sageattnはトレーニングをサポートしていません(推論のみ)。このオプションは--xformersまたは--sdpaを上書きします。", + ) + parser.add_argument( + "--split_attn", + action="store_true", + help="split attention computation to reduce memory usage / メモリ使用量を減らすためにattention時にバッチを分割する", + ) + return parser @@ -684,5 +698,8 @@ def setup_parser() -> argparse.ArgumentParser: train_util.verify_command_line_training_args(args) args = train_util.read_config_from_file(args, parser) + if args.attn_mode == "sdpa": + args.attn_mode = "torch" # backward compatibility + trainer = HunyuanImageNetworkTrainer() trainer.train(args) diff --git a/library/attention.py b/library/attention.py index f1e7c0b0c..d3b8441e2 100644 --- a/library/attention.py +++ b/library/attention.py @@ -1,18 +1,88 @@ +# Unified attention function supporting various implementations + +from dataclasses import dataclass import torch from typing import Optional, Union +try: + import flash_attn + from flash_attn.flash_attn_interface import _flash_attn_forward + from flash_attn.flash_attn_interface import flash_attn_varlen_func + from flash_attn.flash_attn_interface import flash_attn_func +except ImportError: + flash_attn = None + flash_attn_varlen_func = None + _flash_attn_forward = None + flash_attn_func = None + +try: + from sageattention import sageattn_varlen, sageattn +except ImportError: + sageattn_varlen = None + sageattn = None + try: import xformers.ops as xops except ImportError: xops = None +@dataclass +class AttentionParams: + attn_mode: Optional[str] = None + split_attn: bool = False + img_len: Optional[int] = None + attention_mask: Optional[torch.Tensor] = None + seqlens: Optional[torch.Tensor] = None + cu_seqlens: Optional[torch.Tensor] = None + max_seqlen: Optional[int] = None + + @staticmethod + def create_attention_params(attn_mode: Optional[str], split_attn: bool) -> "AttentionParams": + return AttentionParams(attn_mode, split_attn) + + @staticmethod + def create_attention_params_from_mask( + attn_mode: Optional[str], split_attn: bool, img_len: Optional[int], attention_mask: Optional[torch.Tensor] + ) -> "AttentionParams": + if attention_mask is None: + # No attention mask provided: assume all tokens are valid + return AttentionParams(attn_mode, split_attn, None, None, None, None, None) + else: + # Note: attention_mask is only for text tokens, not including image tokens + seqlens = attention_mask.sum(dim=1).to(torch.int32) + img_len # [B] + max_seqlen = attention_mask.shape[1] + img_len + + if split_attn: + # cu_seqlens is not needed for split attention + return AttentionParams(attn_mode, split_attn, img_len, attention_mask, seqlens, None, max_seqlen) + + # Convert attention mask to cumulative sequence lengths for flash attention + batch_size = attention_mask.shape[0] + cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=attention_mask.device) + for i in range(batch_size): + cu_seqlens[2 * i + 1] = i * max_seqlen + seqlens[i] # end of valid tokens for query + cu_seqlens[2 * i + 2] = (i + 1) * max_seqlen # end of all tokens for query + + # Expand attention mask to include image tokens + attention_mask = torch.nn.functional.pad(attention_mask, (img_len, 0), value=1) # [B, img_len + L] + + if attn_mode == "xformers": + seqlens_list = seqlens.cpu().tolist() + attention_mask = xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens( + seqlens_list, seqlens_list, device=attention_mask.device + ) + elif attn_mode == "torch": + attention_mask = attention_mask[:, None, None, 
:].to(torch.bool) # [B, 1, 1, img_len + L] + + return AttentionParams(attn_mode, split_attn, img_len, attention_mask, seqlens, cu_seqlens, max_seqlen) + + def attention( qkv_or_q: Union[torch.Tensor, list], k: Optional[torch.Tensor] = None, v: Optional[torch.Tensor] = None, - seq_lens: Optional[list[int]] = None, - attn_mode: str = "torch", + attn_params: Optional[AttentionParams] = None, drop_rate: float = 0.0, ) -> torch.Tensor: """ @@ -25,8 +95,7 @@ def attention( qkv_or_q: Query tensor [B, L, H, D]. or list of such tensors. k: Key tensor [B, L, H, D]. v: Value tensor [B, L, H, D]. - seq_lens: Valid sequence length for each batch element. - attn_mode: Attention implementation ("torch" or "sageattn"). + attn_param: Attention parameters including mask and sequence lengths. drop_rate: Attention dropout rate. Returns: @@ -34,53 +103,158 @@ def attention( """ if isinstance(qkv_or_q, list): q, k, v = qkv_or_q + q: torch.Tensor = q qkv_or_q.clear() del qkv_or_q else: - q = qkv_or_q + q: torch.Tensor = qkv_or_q del qkv_or_q assert k is not None and v is not None, "k and v must be provided if qkv_or_q is a tensor" - if seq_lens is None: - seq_lens = [q.shape[1]] * q.shape[0] + if attn_params is None: + attn_params = AttentionParams.create_attention_params("torch", False) + + # If split attn is False, attention mask is provided and all sequence lengths are same, we can trim the sequence + seqlen_trimmed = False + if not attn_params.split_attn and attn_params.attention_mask is not None and attn_params.seqlens is not None: + if torch.all(attn_params.seqlens == attn_params.seqlens[0]): + seqlen = attn_params.seqlens[0].item() + q = q[:, :seqlen] + k = k[:, :seqlen] + v = v[:, :seqlen] + max_seqlen = attn_params.max_seqlen + attn_params = AttentionParams.create_attention_params(attn_params.attn_mode, False) # do not in-place modify + attn_params.max_seqlen = max_seqlen # keep max_seqlen for padding + seqlen_trimmed = True # Determine tensor layout based on attention implementation - if attn_mode == "torch" or attn_mode == "sageattn": - transpose_fn = lambda x: x.transpose(1, 2) # [B, H, L, D] for SDPA + if attn_params.attn_mode == "torch" or ( + attn_params.attn_mode == "sageattn" and (attn_params.split_attn or attn_params.cu_seqlens is None) + ): + transpose_fn = lambda x: x.transpose(1, 2) # [B, H, L, D] for SDPA and sageattn with fixed length + # pad on sequence length dimension + pad_fn = lambda x, pad_to: torch.nn.functional.pad(x, (0, 0, 0, pad_to - x.shape[-2]), value=0) else: transpose_fn = lambda x: x # [B, L, H, D] for other implementations + # pad on sequence length dimension + pad_fn = lambda x, pad_to: torch.nn.functional.pad(x, (0, 0, 0, 0, 0, pad_to - x.shape[-3]), value=0) + + # Process each batch element with its valid sequence lengths + if attn_params.split_attn: + if attn_params.seqlens is None: + # If no seqlens provided, assume all tokens are valid + attn_params = AttentionParams.create_attention_params(attn_params.attn_mode, True) # do not in-place modify + attn_params.seqlens = torch.tensor([q.shape[1]] * q.shape[0], device=q.device) + attn_params.max_seqlen = q.shape[1] + q = [transpose_fn(q[i : i + 1, : attn_params.seqlens[i]]) for i in range(len(q))] + k = [transpose_fn(k[i : i + 1, : attn_params.seqlens[i]]) for i in range(len(k))] + v = [transpose_fn(v[i : i + 1, : attn_params.seqlens[i]]) for i in range(len(v))] + else: + q = transpose_fn(q) + k = transpose_fn(k) + v = transpose_fn(v) + + if attn_params.attn_mode == "torch": + if attn_params.split_attn: + x = [] + for i 
in range(len(q)): + x_i = torch.nn.functional.scaled_dot_product_attention(q[i], k[i], v[i], dropout_p=drop_rate) + q[i] = None + k[i] = None + v[i] = None + x.append(pad_fn(x_i, attn_params.max_seqlen)) # B, H, L, D + x = torch.cat(x, dim=0) + del q, k, v + + else: + x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_params.attention_mask, dropout_p=drop_rate) + del q, k, v + + elif attn_params.attn_mode == "xformers": + if attn_params.split_attn: + x = [] + for i in range(len(q)): + x_i = xops.memory_efficient_attention(q[i], k[i], v[i], p=drop_rate) + q[i] = None + k[i] = None + v[i] = None + x.append(pad_fn(x_i, attn_params.max_seqlen)) # B, L, H, D + x = torch.cat(x, dim=0) + del q, k, v + + else: + x = xops.memory_efficient_attention(q, k, v, attn_bias=attn_params.attention_mask, p=drop_rate) + del q, k, v - # Process each batch element with its valid sequence length - q_seq_len = q.shape[1] - q = [transpose_fn(q[i : i + 1, : seq_lens[i]]) for i in range(len(q))] - k = [transpose_fn(k[i : i + 1, : seq_lens[i]]) for i in range(len(k))] - v = [transpose_fn(v[i : i + 1, : seq_lens[i]]) for i in range(len(v))] - - if attn_mode == "torch": - x = [] - for i in range(len(q)): - x_i = torch.nn.functional.scaled_dot_product_attention(q[i], k[i], v[i], dropout_p=drop_rate) - q[i] = None - k[i] = None - v[i] = None - x.append(torch.nn.functional.pad(x_i, (0, 0, 0, q_seq_len - x_i.shape[2]), value=0)) # Pad to max seq len, B, H, L, D - x = torch.cat(x, dim=0) - del q, k, v - - elif attn_mode == "xformers": - x = [] - for i in range(len(q)): - x_i = xops.memory_efficient_attention(q[i], k[i], v[i], p=drop_rate) - q[i] = None - k[i] = None - v[i] = None - x.append(torch.nn.functional.pad(x_i, (0, 0, 0, 0, 0, q_seq_len - x_i.shape[1]), value=0)) # B, L, H, D - x = torch.cat(x, dim=0) - del q, k, v + elif attn_params.attn_mode == "sageattn": + if attn_params.split_attn: + x = [] + for i in range(len(q)): + # HND seems to cause an error + x_i = sageattn(q[i], k[i], v[i]) # B, H, L, D. No dropout support + q[i] = None + k[i] = None + v[i] = None + x.append(pad_fn(x_i, attn_params.max_seqlen)) # B, H, L, D + x = torch.cat(x, dim=0) + del q, k, v + elif attn_params.cu_seqlens is None: # all tokens are valid + x = sageattn(q, k, v) # B, L, H, D. No dropout support + del q, k, v + else: + # Reshape to [(bxs), a, d] + batch_size, seqlen = q.shape[0], q.shape[1] + q = q.view(q.shape[0] * q.shape[1], *q.shape[2:]) # [B*L, H, D] + k = k.view(k.shape[0] * k.shape[1], *k.shape[2:]) # [B*L, H, D] + v = v.view(v.shape[0] * v.shape[1], *v.shape[2:]) # [B*L, H, D] + + # Assume cu_seqlens_q == cu_seqlens_kv and max_seqlen_q == max_seqlen_kv. 
No dropout support + x = sageattn_varlen( + q, k, v, attn_params.cu_seqlens, attn_params.cu_seqlens, attn_params.max_seqlen, attn_params.max_seqlen + ) + del q, k, v + + # Reshape x with shape [(bxs), a, d] to [b, s, a, d] + x = x.view(batch_size, seqlen, x.shape[-2], x.shape[-1]) # B, L, H, D + + elif attn_params.attn_mode == "flash": + if attn_params.split_attn: + x = [] + for i in range(len(q)): + # HND seems to cause an error + x_i = flash_attn_func(q[i], k[i], v[i], drop_rate) # B, L, H, D + q[i] = None + k[i] = None + v[i] = None + x.append(pad_fn(x_i, attn_params.max_seqlen)) # B, L, H, D + x = torch.cat(x, dim=0) + del q, k, v + elif attn_params.cu_seqlens is None: # all tokens are valid + x = flash_attn_func(q, k, v, drop_rate) # B, L, H, D + del q, k, v + else: + # Reshape to [(bxs), a, d] + batch_size, seqlen = q.shape[0], q.shape[1] + q = q.view(q.shape[0] * q.shape[1], *q.shape[2:]) # [B*L, H, D] + k = k.view(k.shape[0] * k.shape[1], *k.shape[2:]) # [B*L, H, D] + v = v.view(v.shape[0] * v.shape[1], *v.shape[2:]) # [B*L, H, D] + + # Assume cu_seqlens_q == cu_seqlens_kv and max_seqlen_q == max_seqlen_kv + x = flash_attn_varlen_func( + q, k, v, attn_params.cu_seqlens, attn_params.cu_seqlens, attn_params.max_seqlen, attn_params.max_seqlen, drop_rate + ) + del q, k, v + + # Reshape x with shape [(bxs), a, d] to [b, s, a, d] + x = x.view(batch_size, seqlen, x.shape[-2], x.shape[-1]) # B, L, H, D else: # Currently only PyTorch SDPA and xformers are implemented - raise ValueError(f"Unsupported attention mode: {attn_mode}") + raise ValueError(f"Unsupported attention mode: {attn_params.attn_mode}") x = transpose_fn(x) # [B, L, H, D] x = x.reshape(x.shape[0], x.shape[1], -1) # [B, L, H*D] + + if seqlen_trimmed: + x = torch.nn.functional.pad(x, (0, 0, 0, attn_params.max_seqlen - x.shape[1]), value=0) # pad back to max_seqlen + return x diff --git a/library/hunyuan_image_models.py b/library/hunyuan_image_models.py index 356ce4b42..fc320dfc1 100644 --- a/library/hunyuan_image_models.py +++ b/library/hunyuan_image_models.py @@ -8,6 +8,7 @@ from accelerate import init_empty_weights from library import custom_offloading_utils +from library.attention import AttentionParams from library.fp8_optimization_utils import apply_fp8_monkey_patch from library.lora_utils import load_safetensors_with_lora_and_fp8 from library.utils import setup_logging @@ -50,7 +51,7 @@ class HYImageDiffusionTransformer(nn.Module): attn_mode: Attention implementation mode ("torch" or "sageattn"). 
""" - def __init__(self, attn_mode: str = "torch"): + def __init__(self, attn_mode: str = "torch", split_attn: bool = False): super().__init__() # Fixed architecture parameters for HunyuanImage-2.1 @@ -80,6 +81,7 @@ def __init__(self, attn_mode: str = "torch"): qk_norm_type: str = "rms" # RMS normalization type self.attn_mode = attn_mode + self.split_attn = split_attn # ByT5 character-level text encoder mapping self.byt5_in = ByT5Mapper(in_dim=1472, out_dim=2048, hidden_dim=2048, out_dim1=self.hidden_size, use_residual=False) @@ -88,7 +90,7 @@ def __init__(self, attn_mode: str = "torch"): self.img_in = PatchEmbed2D(self.patch_size, self.in_channels, self.hidden_size) # Text token refinement with cross-attention - self.txt_in = SingleTokenRefiner(text_states_dim, self.hidden_size, self.heads_num, depth=2, attn_mode=self.attn_mode) + self.txt_in = SingleTokenRefiner(text_states_dim, self.hidden_size, self.heads_num, depth=2) # Timestep embedding for diffusion process self.time_in = TimestepEmbedder(self.hidden_size, nn.SiLU) @@ -110,7 +112,6 @@ def __init__(self, attn_mode: str = "torch"): qk_norm=qk_norm, qk_norm_type=qk_norm_type, qkv_bias=qkv_bias, - attn_mode=self.attn_mode, ) for _ in range(mm_double_blocks_depth) ] @@ -126,7 +127,6 @@ def __init__(self, attn_mode: str = "torch"): mlp_act_type=mlp_act_type, qk_norm=qk_norm, qk_norm_type=qk_norm_type, - attn_mode=self.attn_mode, ) for _ in range(mm_single_blocks_depth) ] @@ -339,22 +339,21 @@ def forward( # MeanFlow and guidance embedding not used in this configuration # Process text tokens through refinement layers - txt_lens = text_mask.to(torch.bool).sum(dim=1).tolist() - txt = self.txt_in(txt, t, txt_lens) + txt_attn_params = AttentionParams.create_attention_params_from_mask(self.attn_mode, self.split_attn, 0, text_mask) + txt = self.txt_in(txt, t, txt_attn_params) # Integrate character-level ByT5 features with word-level tokens # Use variable length sequences with sequence lengths byt5_txt = self.byt5_in(byt5_text_states) - txt, _, txt_lens = self.reorder_txt_token(byt5_txt, txt, byt5_text_mask, text_mask) + txt, text_mask, txt_lens = self.reorder_txt_token(byt5_txt, txt, byt5_text_mask, text_mask) # Trim sequences to maximum length in the batch img_seq_len = img.shape[1] - # print(f"img_seq_len: {img_seq_len}, txt_lens: {txt_lens}") - seq_lens = [img_seq_len + l for l in txt_lens] max_txt_len = max(txt_lens) - # print(f"max_txt_len: {max_txt_len}, seq_lens: {seq_lens}, txt.shape: {txt.shape}") txt = txt[:, :max_txt_len, :] - txt_seq_len = txt.shape[1] + text_mask = text_mask[:, :max_txt_len] + + attn_params = AttentionParams.create_attention_params_from_mask(self.attn_mode, self.split_attn, img_seq_len, text_mask) input_device = img.device @@ -362,7 +361,7 @@ def forward( for index, block in enumerate(self.double_blocks): if self.blocks_to_swap: self.offloader_double.wait_for_block(index) - img, txt = block(img, txt, vec, freqs_cis, seq_lens) + img, txt = block(img, txt, vec, freqs_cis, attn_params) if self.blocks_to_swap: self.offloader_double.submit_move_blocks(self.double_blocks, index) @@ -373,7 +372,7 @@ def forward( for index, block in enumerate(self.single_blocks): if self.blocks_to_swap: self.offloader_single.wait_for_block(index) - x = block(x, vec, txt_seq_len, freqs_cis, seq_lens) + x = block(x, vec, freqs_cis, attn_params) if self.blocks_to_swap: self.offloader_single.submit_move_blocks(self.single_blocks, index) @@ -417,7 +416,7 @@ def unpatchify_2d(self, x, h, w): def create_model(attn_mode: str, split_attn: bool, 
dtype: Optional[torch.dtype]) -> HYImageDiffusionTransformer: with init_empty_weights(): - model = HYImageDiffusionTransformer(attn_mode=attn_mode) + model = HYImageDiffusionTransformer(attn_mode=attn_mode, split_attn=split_attn) if dtype is not None: model.to(dtype) return model diff --git a/library/hunyuan_image_modules.py b/library/hunyuan_image_modules.py index 555cb4871..1953a783e 100644 --- a/library/hunyuan_image_modules.py +++ b/library/hunyuan_image_modules.py @@ -7,7 +7,7 @@ from einops import rearrange from library import custom_offloading_utils -from library.attention import attention +from library.attention import AttentionParams, attention from library.hunyuan_image_utils import timestep_embedding, apply_rotary_emb, _to_tuple, apply_gate, modulate from library.attention import attention @@ -213,7 +213,6 @@ class IndividualTokenRefinerBlock(nn.Module): qk_norm: QK normalization flag (must be False). qk_norm_type: QK normalization type (only "layer" supported). qkv_bias: Use bias in QKV projections. - attn_mode: Attention implementation mode. """ def __init__( @@ -226,15 +225,12 @@ def __init__( qk_norm: bool = False, qk_norm_type: str = "layer", qkv_bias: bool = True, - attn_mode: str = "torch", ): super().__init__() assert qk_norm_type == "layer", "Only layer normalization supported for QK norm." assert act_type == "silu", "Only SiLU activation supported." assert not qk_norm, "QK normalization must be disabled." - self.attn_mode = attn_mode - self.heads_num = heads_num mlp_hidden_dim = int(hidden_size * mlp_width_ratio) @@ -253,19 +249,14 @@ def __init__( nn.Linear(hidden_size, 2 * hidden_size, bias=True), ) - def forward( - self, - x: torch.Tensor, - c: torch.Tensor, # Combined timestep and context conditioning - txt_lens: list[int], - ) -> torch.Tensor: + def forward(self, x: torch.Tensor, c: torch.Tensor, attn_params: AttentionParams) -> torch.Tensor: """ Apply self-attention and MLP with adaptive conditioning. Args: x: Input token embeddings [B, L, C]. c: Combined conditioning vector [B, C]. - txt_lens: Valid sequence lengths for each batch element. + attn_params: Attention parameters including sequence lengths. Returns: Refined token embeddings [B, L, C]. @@ -273,10 +264,14 @@ def forward( gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1) norm_x = self.norm1(x) qkv = self.self_attn_qkv(norm_x) + del norm_x q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num) + del qkv q = self.self_attn_q_norm(q).to(v) k = self.self_attn_k_norm(k).to(v) - attn = attention(q, k, v, seq_lens=txt_lens, attn_mode=self.attn_mode) + qkv = [q, k, v] + del q, k, v + attn = attention(qkv, attn_params=attn_params) x = x + apply_gate(self.self_attn_proj(attn), gate_msa) x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp) @@ -299,7 +294,6 @@ class IndividualTokenRefiner(nn.Module): qk_norm: QK normalization flag. qk_norm_type: QK normalization type. qkv_bias: Use bias in QKV projections. - attn_mode: Attention implementation mode. 
""" def __init__( @@ -313,7 +307,6 @@ def __init__( qk_norm: bool = False, qk_norm_type: str = "layer", qkv_bias: bool = True, - attn_mode: str = "torch", ): super().__init__() self.blocks = nn.ModuleList( @@ -327,26 +320,25 @@ def __init__( qk_norm=qk_norm, qk_norm_type=qk_norm_type, qkv_bias=qkv_bias, - attn_mode=attn_mode, ) for _ in range(depth) ] ) - def forward(self, x: torch.Tensor, c: torch.LongTensor, txt_lens: list[int]) -> torch.Tensor: + def forward(self, x: torch.Tensor, c: torch.LongTensor, attn_params: AttentionParams) -> torch.Tensor: """ Apply sequential token refinement. Args: x: Input token embeddings [B, L, C]. c: Combined conditioning vector [B, C]. - txt_lens: Valid sequence lengths for each batch element. + attn_params: Attention parameters including sequence lengths. Returns: Refined token embeddings [B, L, C]. """ for block in self.blocks: - x = block(x, c, txt_lens) + x = block(x, c, attn_params) return x @@ -362,10 +354,9 @@ class SingleTokenRefiner(nn.Module): hidden_size: Transformer hidden dimension. heads_num: Number of attention heads. depth: Number of refinement blocks. - attn_mode: Attention implementation mode. """ - def __init__(self, in_channels: int, hidden_size: int, heads_num: int, depth: int, attn_mode: str = "torch"): + def __init__(self, in_channels: int, hidden_size: int, heads_num: int, depth: int): # Fixed architecture parameters for HunyuanImage-2.1 mlp_drop_rate: float = 0.0 # No MLP dropout act_type: str = "silu" # SiLU activation @@ -389,17 +380,16 @@ def __init__(self, in_channels: int, hidden_size: int, heads_num: int, depth: in qk_norm=qk_norm, qk_norm_type=qk_norm_type, qkv_bias=qkv_bias, - attn_mode=attn_mode, ) - def forward(self, x: torch.Tensor, t: torch.LongTensor, txt_lens: list[int]) -> torch.Tensor: + def forward(self, x: torch.Tensor, t: torch.LongTensor, attn_params: AttentionParams) -> torch.Tensor: """ Refine text embeddings with timestep conditioning. Args: x: Input text embeddings [B, L, in_channels]. t: Diffusion timestep [B]. - txt_lens: Valid sequence lengths for each batch element. + attn_params: Attention parameters including sequence lengths. Returns: Refined embeddings [B, L, hidden_size]. @@ -407,13 +397,14 @@ def forward(self, x: torch.Tensor, t: torch.LongTensor, txt_lens: list[int]) -> timestep_aware_representations = self.t_embedder(t) # Compute context-aware representations by averaging valid tokens + txt_lens = attn_params.seqlens # img_len is not used for SingleTokenRefiner context_aware_representations = torch.stack([x[i, : txt_lens[i]].mean(dim=0) for i in range(x.shape[0])], dim=0) # [B, C] context_aware_representations = self.c_embedder(context_aware_representations) c = timestep_aware_representations + context_aware_representations del timestep_aware_representations, context_aware_representations x = self.input_embedder(x) - x = self.individual_token_refiner(x, c, txt_lens) + x = self.individual_token_refiner(x, c, attn_params) return x @@ -564,7 +555,6 @@ class MMDoubleStreamBlock(nn.Module): qk_norm: QK normalization flag (must be True). qk_norm_type: QK normalization type (only "rms" supported). qkv_bias: Use bias in QKV projections. - attn_mode: Attention implementation mode. """ def __init__( @@ -576,7 +566,6 @@ def __init__( qk_norm: bool = True, qk_norm_type: str = "rms", qkv_bias: bool = False, - attn_mode: str = "torch", ): super().__init__() @@ -584,7 +573,6 @@ def __init__( assert qk_norm_type == "rms", "Only RMS normalization supported." assert qk_norm, "QK normalization must be enabled." 
- self.attn_mode = attn_mode self.heads_num = heads_num head_dim = hidden_size // heads_num mlp_hidden_dim = int(hidden_size * mlp_width_ratio) @@ -626,7 +614,7 @@ def disable_gradient_checkpointing(self): self.cpu_offload_checkpointing = False def _forward( - self, img: torch.Tensor, txt: torch.Tensor, vec: torch.Tensor, freqs_cis: tuple = None, seq_lens: list[int] = None + self, img: torch.Tensor, txt: torch.Tensor, vec: torch.Tensor, freqs_cis: tuple = None, attn_params: AttentionParams = None ) -> Tuple[torch.Tensor, torch.Tensor]: # Extract modulation parameters for image and text streams (img_mod1_shift, img_mod1_scale, img_mod1_gate, img_mod2_shift, img_mod2_scale, img_mod2_gate) = self.img_mod(vec).chunk( @@ -687,7 +675,7 @@ def _forward( qkv = [q, k, v] del q, k, v - attn = attention(qkv, seq_lens=seq_lens, attn_mode=self.attn_mode) + attn = attention(qkv, attn_params=attn_params) del qkv # Split attention outputs back to separate streams @@ -719,16 +707,16 @@ def _forward( return img, txt def forward( - self, img: torch.Tensor, txt: torch.Tensor, vec: torch.Tensor, freqs_cis: tuple = None, seq_lens: list[int] = None + self, img: torch.Tensor, txt: torch.Tensor, vec: torch.Tensor, freqs_cis: tuple = None, attn_params: AttentionParams = None ) -> Tuple[torch.Tensor, torch.Tensor]: if self.gradient_checkpointing and self.training: forward_fn = self._forward if self.cpu_offload_checkpointing: forward_fn = custom_offloading_utils.cpu_offload_wrapper(forward_fn, self.img_attn_qkv.weight.device) - return torch.utils.checkpoint.checkpoint(forward_fn, img, txt, vec, freqs_cis, seq_lens, use_reentrant=False) + return torch.utils.checkpoint.checkpoint(forward_fn, img, txt, vec, freqs_cis, attn_params, use_reentrant=False) else: - return self._forward(img, txt, vec, freqs_cis, seq_lens) + return self._forward(img, txt, vec, freqs_cis, attn_params) class MMSingleStreamBlock(nn.Module): @@ -746,7 +734,6 @@ class MMSingleStreamBlock(nn.Module): qk_norm: QK normalization flag (must be True). qk_norm_type: QK normalization type (only "rms" supported). qk_scale: Attention scaling factor (computed automatically if None). - attn_mode: Attention implementation mode. """ def __init__( @@ -758,7 +745,6 @@ def __init__( qk_norm: bool = True, qk_norm_type: str = "rms", qk_scale: float = None, - attn_mode: str = "torch", ): super().__init__() @@ -766,7 +752,6 @@ def __init__( assert qk_norm_type == "rms", "Only RMS normalization supported." assert qk_norm, "QK normalization must be enabled." 
- self.attn_mode = attn_mode self.hidden_size = hidden_size self.heads_num = heads_num head_dim = hidden_size // heads_num @@ -805,9 +790,8 @@ def _forward( self, x: torch.Tensor, vec: torch.Tensor, - txt_len: int, freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None, - seq_lens: list[int] = None, + attn_params: AttentionParams = None, ) -> torch.Tensor: # Extract modulation parameters mod_shift, mod_scale, mod_gate = self.modulation(vec).chunk(3, dim=-1) @@ -828,12 +812,10 @@ def _forward( k = self.k_norm(k).to(v) # Separate image and text tokens - img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :] + img_q, txt_q = q[:, : attn_params.img_len, :, :], q[:, attn_params.img_len :, :, :] del q - img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :] + img_k, txt_k = k[:, : attn_params.img_len, :, :], k[:, attn_params.img_len :, :, :] del k - # img_v, txt_v = v[:, :-txt_len, :, :], v[:, -txt_len:, :, :] - # del v # Apply rotary position embeddings only to image tokens img_q, img_k = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False) @@ -848,7 +830,7 @@ def _forward( # del img_v, txt_v qkv = [q, k, v] del q, k, v - attn = attention(qkv, seq_lens=seq_lens, attn_mode=self.attn_mode) + attn = attention(qkv, attn_params=attn_params) del qkv # Combine attention and MLP outputs, apply gating @@ -865,18 +847,17 @@ def forward( self, x: torch.Tensor, vec: torch.Tensor, - txt_len: int, freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None, - seq_lens: list[int] = None, + attn_params: AttentionParams = None, ) -> torch.Tensor: if self.gradient_checkpointing and self.training: forward_fn = self._forward if self.cpu_offload_checkpointing: forward_fn = custom_offloading_utils.create_cpu_offloading_wrapper(forward_fn, self.linear1.weight.device) - return torch.utils.checkpoint.checkpoint(forward_fn, x, vec, txt_len, freqs_cis, seq_lens, use_reentrant=False) + return torch.utils.checkpoint.checkpoint(forward_fn, x, vec, freqs_cis, attn_params, use_reentrant=False) else: - return self._forward(x, vec, txt_len, freqs_cis, seq_lens) + return self._forward(x, vec, freqs_cis, attn_params) # endregion From 8f20c379490906ea4db86b068ddf003738ebbd91 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sat, 20 Sep 2025 20:26:20 +0900 Subject: [PATCH 568/582] feat: add --text_encoder_cpu option to reduce VRAM usage by running text encoders on CPU for training --- docs/hunyuan_image_train_network.md | 8 ++++++-- hunyuan_image_train_network.py | 20 ++++++++++++-------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index 667b4fec1..d31ff867f 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -184,6 +184,8 @@ The script adds HunyuanImage-2.1 specific arguments. For common arguments (like - Enables training the DiT model in scaled FP8 format. This can significantly reduce VRAM usage (can run with as little as 8GB VRAM when combined with `--blocks_to_swap`), but the training results may vary. This is a newer alternative to the unsupported `--fp8_base` option. * `--fp8_vl` - Use FP8 for the VLM (Qwen2.5-VL) text encoder. +* `--text_encoder_cpu` + - Runs the text encoders on CPU to reduce VRAM usage. This is useful when VRAM is insufficient (less than 12GB). Encoding one text may take a few minutes (depending on CPU). 
It is highly recommended to use this option with `--cache_text_encoder_outputs_to_disk` to avoid repeated encoding every time training starts. * `--blocks_to_swap=` **[Experimental Feature]** - Setting to reduce VRAM usage by swapping parts of the model (Transformer blocks) between CPU and GPU. Specify the number of blocks to swap as an integer (e.g., `18`). Larger values reduce VRAM usage but decrease training speed. Adjust according to your GPU's VRAM capacity. Can be used with `gradient_checkpointing`. * `--cache_text_encoder_outputs` @@ -450,8 +452,9 @@ python hunyuan_image_minimal_inference.py \ - `--image_size`: Resolution (inference is most stable at 2048x2048) - `--guidance_scale`: CFG scale (default: 3.5) - `--flow_shift`: Flow matching shift parameter (default: 5.0) +- `--text_encoder_cpu`: Run the text encoders on CPU to reduce VRAM usage -`--split_attn` is not supported (since inference is done one at a time). +`--split_attn` is not supported (since inference is done one at a time). `--fp8_vl` is not supported, please use CPU for the text encoder if VRAM is insufficient.
日本語 @@ -464,8 +467,9 @@ python hunyuan_image_minimal_inference.py \ - `--image_size`: 解像度(2048x2048で最も安定) - `--guidance_scale`: CFGスケール(推奨: 3.5) - `--flow_shift`: Flow Matchingシフトパラメータ(デフォルト: 5.0) +- `--text_encoder_cpu`: テキストエンコーダをCPUで実行してVRAM使用量削減 -`--split_attn`はサポートされていません(1件ずつ推論するため)。 +`--split_attn`はサポートされていません(1件ずつ推論するため)。`--fp8_vl`もサポートされていません。VRAMが不足する場合はテキストエンコーダをCPUで実行してください。
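As a rough illustration of why `--text_encoder_cpu` pairs well with caching text encoder outputs to disk: the expensive VLM/byT5 forward passes only need to run once if the embeddings are stored, so the slow CPU encoding is a one-time cost. The helper below is a hypothetical sketch; `encode_fn` and the cache layout are assumptions, not the trainer's real caching strategy.

```python
# Hypothetical sketch: encode prompts once on CPU and cache the embeddings to disk,
# so later runs do not need the text encoders in VRAM (or at all).
import os
import torch

def cache_prompt_embeddings(text_encoders, prompts, encode_fn, cache_dir, use_cpu=True):
    device = torch.device("cpu") if use_cpu else torch.device("cuda")
    for te in text_encoders:
        te.to(device)
    os.makedirs(cache_dir, exist_ok=True)
    with torch.no_grad():
        for i, prompt in enumerate(prompts):
            path = os.path.join(cache_dir, f"{i:06d}.pt")
            if not os.path.exists(path):                # skip prompts already cached
                emb = encode_fn(text_encoders, prompt)  # slow on CPU, but runs only once
                torch.save(emb, path)
```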
diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index 6b102a9a3..07e072e7a 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -350,7 +350,7 @@ def load_target_model(self, args, weight_dtype, accelerator): self.is_swapping_blocks = args.blocks_to_swap is not None and args.blocks_to_swap > 0 vl_dtype = torch.float8_e4m3fn if args.fp8_vl else torch.bfloat16 - vl_device = "cpu" + vl_device = "cpu" # loading to cpu and move to gpu later in cache_text_encoder_outputs_if_needed _, text_encoder_vlm = hunyuan_image_text_encoder.load_qwen2_5_vl( args.text_encoder, dtype=vl_dtype, device=vl_device, disable_mmap=args.disable_mmap_load_safetensors ) @@ -440,6 +440,7 @@ def get_text_encoder_outputs_caching_strategy(self, args): def cache_text_encoder_outputs_if_needed( self, args, accelerator: Accelerator, unet, vae, text_encoders, dataset: train_util.DatasetGroup, weight_dtype ): + vlm_device = "cpu" if args.text_encoder_cpu else accelerator.device if args.cache_text_encoder_outputs: if not args.lowram: # メモリ消費を減らす @@ -448,9 +449,9 @@ def cache_text_encoder_outputs_if_needed( vae.to("cpu") clean_memory_on_device(accelerator.device) - logger.info("move text encoders to gpu") - text_encoders[0].to(accelerator.device) - text_encoders[1].to(accelerator.device) + logger.info(f"move text encoders to {vlm_device} to encode and cache text encoder outputs") + text_encoders[0].to(vlm_device) + text_encoders[1].to(vlm_device) # VLM (bf16) and byT5 (fp16) are used for encoding, so we cannot use autocast here dataset.new_cache_text_encoder_outputs(text_encoders, accelerator) @@ -491,8 +492,8 @@ def cache_text_encoder_outputs_if_needed( vae.to(org_vae_device) else: # Text Encoderから毎回出力を取得するので、GPUに乗せておく - text_encoders[0].to(accelerator.device) - text_encoders[1].to(accelerator.device) + text_encoders[0].to(vlm_device) + text_encoders[1].to(vlm_device) def sample_images(self, accelerator, args, epoch, global_step, device, ae, tokenizer, text_encoder, flux): text_encoders = text_encoder # for compatibility @@ -667,8 +668,11 @@ def setup_parser() -> argparse.ArgumentParser: default=5.0, help="Discrete flow shift for the Euler Discrete Scheduler, default is 5.0. 
/ Euler Discrete Schedulerの離散フローシフト、デフォルトは5.0。", ) - parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT / DiTにスケーリングされたfp8を使う") - parser.add_argument("--fp8_vl", action="store_true", help="use fp8 for VLM text encoder / VLMテキストエンコーダにfp8を使用する") + parser.add_argument("--fp8_scaled", action="store_true", help="Use scaled fp8 for DiT / DiTにスケーリングされたfp8を使う") + parser.add_argument("--fp8_vl", action="store_true", help="Use fp8 for VLM text encoder / VLMテキストエンコーダにfp8を使用する") + parser.add_argument( + "--text_encoder_cpu", action="store_true", help="Inference on CPU for Text Encoders / テキストエンコーダをCPUで推論する" + ) parser.add_argument( "--vae_enable_tiling", action="store_true", From f41e9e2b587e6700edbd98ddf03624612cfcf445 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 21 Sep 2025 11:09:37 +0900 Subject: [PATCH 569/582] feat: add vae_chunk_size argument for memory-efficient VAE decoding and processing --- hunyuan_image_minimal_inference.py | 34 ++--- hunyuan_image_train_network.py | 15 +-- library/hunyuan_image_vae.py | 191 ++++++++++++++++++++++++----- library/strategy_base.py | 1 + 4 files changed, 185 insertions(+), 56 deletions(-) diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index 850233837..711e911f5 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -88,7 +88,13 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--fp8_scaled", action="store_true", help="use scaled fp8 for DiT, only for fp8") parser.add_argument("--text_encoder_cpu", action="store_true", help="Inference on CPU for Text Encoders") - parser.add_argument("--vae_enable_tiling", action="store_true", help="Enable tiling for VAE decoding") + parser.add_argument( + "--vae_chunk_size", + type=int, + default=None, # default is None (no chunking) + help="Chunk size for VAE decoding to reduce memory usage. Default is None (no chunking). 16 is recommended if enabled" + " / メモリ使用量を減らすためのVAEデコードのチャンクサイズ。デフォルトはNone(チャンクなし)。有効にする場合は16程度を推奨。", + ) parser.add_argument( "--device", type=str, default=None, help="device to use for inference. If None, use CUDA if available, otherwise use CPU" ) @@ -431,14 +437,10 @@ def merge_lora_weights( # endregion -def decode_latent(vae: HunyuanVAE2D, latent: torch.Tensor, device: torch.device, enable_tiling: bool = False) -> torch.Tensor: +def decode_latent(vae: HunyuanVAE2D, latent: torch.Tensor, device: torch.device) -> torch.Tensor: logger.info(f"Decoding image. 
Latent shape {latent.shape}, device {device}") vae.to(device) - if enable_tiling: - vae.enable_tiling() - else: - vae.disable_tiling() with torch.no_grad(): latent = latent / vae.scaling_factor # scale latent back to original range pixels = vae.decode(latent.to(device, dtype=vae.dtype)) @@ -807,7 +809,7 @@ def save_output( vae: HunyuanVAE2D, latent: torch.Tensor, device: torch.device, - original_base_names: Optional[List[str]] = None, + original_base_name: Optional[str] = None, ) -> None: """save output @@ -816,7 +818,7 @@ def save_output( vae: VAE model latent: latent tensor device: device to use - original_base_names: original base names (if latents are loaded from files) + original_base_name: original base name (if latents are loaded from files) """ height, width = latent.shape[-2], latent.shape[-1] # BCTHW height *= hunyuan_image_vae.VAE_SCALE_FACTOR @@ -839,14 +841,14 @@ def save_output( 1, vae.latent_channels, height // hunyuan_image_vae.VAE_SCALE_FACTOR, width // hunyuan_image_vae.VAE_SCALE_FACTOR ) - image = decode_latent(vae, latent, device, args.vae_enable_tiling) + image = decode_latent(vae, latent, device) if args.output_type == "images" or args.output_type == "latent_images": # save images - if original_base_names is None or len(original_base_names) == 0: + if original_base_name is None: original_name = "" else: - original_name = f"_{original_base_names[0]}" + original_name = f"_{original_base_name}" save_images(image, args, original_name) @@ -919,7 +921,7 @@ def process_batch_prompts(prompts_data: List[Dict], args: argparse.Namespace) -> # 1. Prepare VAE logger.info("Loading VAE for batch generation...") - vae_for_batch = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True) + vae_for_batch = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True, chunk_size=args.vae_chunk_size) vae_for_batch.eval() all_prompt_args_list = [apply_overrides(args, pd) for pd in prompts_data] # Create all arg instances first @@ -1057,7 +1059,7 @@ def process_interactive(args: argparse.Namespace) -> None: shared_models = load_shared_models(args) shared_models["conds_cache"] = {} # Initialize empty cache for interactive mode - vae = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True) + vae = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True, chunk_size=args.vae_chunk_size) vae.eval() print("Interactive mode. 
Enter prompts (Ctrl+D or Ctrl+Z (Windows) to exit):") @@ -1185,9 +1187,9 @@ def main(): for i, latent in enumerate(latents_list): args.seed = seeds[i] - vae = hunyuan_image_vae.load_vae(args.vae, device=device, disable_mmap=True) + vae = hunyuan_image_vae.load_vae(args.vae, device=device, disable_mmap=True, chunk_size=args.vae_chunk_size) vae.eval() - save_output(args, vae, latent, device, original_base_names) + save_output(args, vae, latent, device, original_base_names[i]) elif args.from_file: # Batch mode from file @@ -1220,7 +1222,7 @@ def main(): clean_memory_on_device(device) # Save latent and video - vae = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True) + vae = hunyuan_image_vae.load_vae(args.vae, device="cpu", disable_mmap=True, chunk_size=args.vae_chunk_size) vae.eval() save_output(args, vae, latent, device) diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index 07e072e7a..228c9dbc1 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -358,12 +358,11 @@ def load_target_model(self, args, weight_dtype, accelerator): args.byt5, dtype=torch.float16, device=vl_device, disable_mmap=args.disable_mmap_load_safetensors ) - vae = hunyuan_image_vae.load_vae(args.vae, "cpu", disable_mmap=args.disable_mmap_load_safetensors) + vae = hunyuan_image_vae.load_vae( + args.vae, "cpu", disable_mmap=args.disable_mmap_load_safetensors, chunk_size=args.vae_chunk_size + ) vae.to(dtype=torch.float16) # VAE is always fp16 vae.eval() - if args.vae_enable_tiling: - vae.enable_tiling() - logger.info("VAE tiling is enabled") model_version = hunyuan_image_utils.MODEL_VERSION_2_1 return model_version, [text_encoder_vlm, text_encoder_byt5], vae, None # unet will be loaded later @@ -674,9 +673,11 @@ def setup_parser() -> argparse.ArgumentParser: "--text_encoder_cpu", action="store_true", help="Inference on CPU for Text Encoders / テキストエンコーダをCPUで推論する" ) parser.add_argument( - "--vae_enable_tiling", - action="store_true", - help="Enable tiling for VAE decoding and encoding / VAEデコーディングとエンコーディングのタイルを有効にする", + "--vae_chunk_size", + type=int, + default=None, # default is None (no chunking) + help="Chunk size for VAE decoding to reduce memory usage. Default is None (no chunking). 
16 is recommended if enabled" + " / メモリ使用量を減らすためのVAEデコードのチャンクサイズ。デフォルトはNone(チャンクなし)。有効にする場合は16程度を推奨。", ) parser.add_argument( diff --git a/library/hunyuan_image_vae.py b/library/hunyuan_image_vae.py index b66854e5e..a6ed1e811 100644 --- a/library/hunyuan_image_vae.py +++ b/library/hunyuan_image_vae.py @@ -29,14 +29,20 @@ def swish(x: Tensor) -> Tensor: class AttnBlock(nn.Module): """Self-attention block using scaled dot-product attention.""" - def __init__(self, in_channels: int): + def __init__(self, in_channels: int, chunk_size: Optional[int] = None): super().__init__() self.in_channels = in_channels self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) - self.q = Conv2d(in_channels, in_channels, kernel_size=1) - self.k = Conv2d(in_channels, in_channels, kernel_size=1) - self.v = Conv2d(in_channels, in_channels, kernel_size=1) - self.proj_out = Conv2d(in_channels, in_channels, kernel_size=1) + if chunk_size is None or chunk_size <= 0: + self.q = Conv2d(in_channels, in_channels, kernel_size=1) + self.k = Conv2d(in_channels, in_channels, kernel_size=1) + self.v = Conv2d(in_channels, in_channels, kernel_size=1) + self.proj_out = Conv2d(in_channels, in_channels, kernel_size=1) + else: + self.q = ChunkedConv2d(in_channels, in_channels, kernel_size=1, chunk_size=chunk_size) + self.k = ChunkedConv2d(in_channels, in_channels, kernel_size=1, chunk_size=chunk_size) + self.v = ChunkedConv2d(in_channels, in_channels, kernel_size=1, chunk_size=chunk_size) + self.proj_out = ChunkedConv2d(in_channels, in_channels, kernel_size=1, chunk_size=chunk_size) def attention(self, x: Tensor) -> Tensor: x = self.norm(x) @@ -56,6 +62,87 @@ def forward(self, x: Tensor) -> Tensor: return x + self.proj_out(self.attention(x)) +class ChunkedConv2d(nn.Conv2d): + """ + Convolutional layer that processes input in chunks to reduce memory usage. + + Parameters + ---------- + chunk_size : int, optional + Size of chunks to process at a time. Default is 64. + """ + + def __init__(self, *args, **kwargs): + if "chunk_size" in kwargs: + self.chunk_size = kwargs.pop("chunk_size", 64) + super().__init__(*args, **kwargs) + assert self.padding_mode == "zeros", "Only 'zeros' padding mode is supported." + assert self.dilation == (1, 1) and self.stride == (1, 1), "Only dilation=1 and stride=1 are supported." + assert self.groups == 1, "Only groups=1 is supported." + assert self.kernel_size[0] == self.kernel_size[1], "Only square kernels are supported." + assert ( + self.padding[0] == self.padding[1] and self.padding[0] == self.kernel_size[0] // 2 + ), "Only kernel_size//2 padding is supported." + self.original_padding = self.padding + self.padding = (0, 0) # We handle padding manually in forward + + def forward(self, x: Tensor) -> Tensor: + # If chunking is not needed, process normally. We chunk only along height dimension. 
+ if self.chunk_size is None or x.shape[1] <= self.chunk_size: + self.padding = self.original_padding + x = super().forward(x) + self.padding = (0, 0) + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return x + + # Process input in chunks to reduce memory usage + org_shape = x.shape + + # If kernel size is not 1, we need to use overlapping chunks + overlap = self.kernel_size[0] // 2 # 1 for kernel size 3 + step = self.chunk_size - overlap + y = torch.zeros((org_shape[0], self.out_channels, org_shape[2], org_shape[3]), dtype=x.dtype, device=x.device) + yi = 0 + i = 0 + while i < org_shape[2]: + si = i if i == 0 else i - overlap + ei = i + self.chunk_size + + # Check last chunk. If remaining part is small, include it in last chunk + if ei > org_shape[2] or ei + step // 4 > org_shape[2]: + ei = org_shape[2] + + chunk = x[:, :, : ei - si, :] + x = x[:, :, ei - si - overlap * 2 :, :] + + # Pad chunk if needed: This is as the original Conv2d with padding + if i == 0: # First chunk + # Pad except bottom + chunk = torch.nn.functional.pad(chunk, (overlap, overlap, overlap, 0), mode="constant", value=0) + elif ei == org_shape[2]: # Last chunk + # Pad except top + chunk = torch.nn.functional.pad(chunk, (overlap, overlap, 0, overlap), mode="constant", value=0) + else: + # Pad left and right only + chunk = torch.nn.functional.pad(chunk, (overlap, overlap), mode="constant", value=0) + + chunk = super().forward(chunk) + y[:, :, yi : yi + chunk.shape[2], :] = chunk + yi += chunk.shape[2] + del chunk + + if ei == org_shape[2]: + break + i += step + + assert yi == org_shape[2], f"yi={yi}, org_shape[2]={org_shape[2]}" + + if torch.cuda.is_available(): + torch.cuda.empty_cache() # This helps reduce peak memory usage, but slows down a bit + return y + + class ResnetBlock(nn.Module): """ Residual block with two convolutions, group normalization, and swish activation. @@ -69,19 +156,29 @@ class ResnetBlock(nn.Module): Number of output channels. 
""" - def __init__(self, in_channels: int, out_channels: int): + def __init__(self, in_channels: int, out_channels: int, chunk_size: Optional[int] = None): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) - self.conv1 = Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True) - self.conv2 = Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if chunk_size is None or chunk_size <= 0: + self.conv1 = Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.conv2 = Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) - # Skip connection projection for channel dimension mismatch - if self.in_channels != self.out_channels: - self.nin_shortcut = Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + # Skip connection projection for channel dimension mismatch + if self.in_channels != self.out_channels: + self.nin_shortcut = Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + else: + self.conv1 = ChunkedConv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, chunk_size=chunk_size) + self.conv2 = ChunkedConv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, chunk_size=chunk_size) + + # Skip connection projection for channel dimension mismatch + if self.in_channels != self.out_channels: + self.nin_shortcut = ChunkedConv2d( + in_channels, out_channels, kernel_size=1, stride=1, padding=0, chunk_size=chunk_size + ) def forward(self, x: Tensor) -> Tensor: h = x @@ -113,12 +210,17 @@ class Downsample(nn.Module): Number of output channels (must be divisible by 4). """ - def __init__(self, in_channels: int, out_channels: int): + def __init__(self, in_channels: int, out_channels: int, chunk_size: Optional[int] = None): super().__init__() factor = 4 # 2x2 spatial reduction factor assert out_channels % factor == 0 - self.conv = Conv2d(in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1) + if chunk_size is None or chunk_size <= 0: + self.conv = Conv2d(in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1) + else: + self.conv = ChunkedConv2d( + in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1, chunk_size=chunk_size + ) self.group_size = factor * in_channels // out_channels def forward(self, x: Tensor) -> Tensor: @@ -147,10 +249,15 @@ class Upsample(nn.Module): Number of output channels. 
""" - def __init__(self, in_channels: int, out_channels: int): + def __init__(self, in_channels: int, out_channels: int, chunk_size: Optional[int] = None): super().__init__() factor = 4 # 2x2 spatial expansion factor - self.conv = Conv2d(in_channels, out_channels * factor, kernel_size=3, stride=1, padding=1) + + if chunk_size is None or chunk_size <= 0: + self.conv = Conv2d(in_channels, out_channels * factor, kernel_size=3, stride=1, padding=1) + else: + self.conv = ChunkedConv2d(in_channels, out_channels * factor, kernel_size=3, stride=1, padding=1, chunk_size=chunk_size) + self.repeats = factor * out_channels // in_channels def forward(self, x: Tensor) -> Tensor: @@ -191,6 +298,7 @@ def __init__( block_out_channels: Tuple[int, ...], num_res_blocks: int, ffactor_spatial: int, + chunk_size: Optional[int] = None, ): super().__init__() assert block_out_channels[-1] % (2 * z_channels) == 0 @@ -199,7 +307,12 @@ def __init__( self.block_out_channels = block_out_channels self.num_res_blocks = num_res_blocks - self.conv_in = Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1) + if chunk_size is None or chunk_size <= 0: + self.conv_in = Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1) + else: + self.conv_in = ChunkedConv2d( + in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1, chunk_size=chunk_size + ) self.down = nn.ModuleList() block_in = block_out_channels[0] @@ -211,7 +324,7 @@ def __init__( # Add residual blocks for this level for _ in range(self.num_res_blocks): - block.append(ResnetBlock(in_channels=block_in, out_channels=block_out)) + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, chunk_size=chunk_size)) block_in = block_out down = nn.Module() @@ -222,20 +335,23 @@ def __init__( if add_spatial_downsample: assert i_level < len(block_out_channels) - 1 block_out = block_out_channels[i_level + 1] - down.downsample = Downsample(block_in, block_out) + down.downsample = Downsample(block_in, block_out, chunk_size=chunk_size) block_in = block_out self.down.append(down) # Middle blocks with attention self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in) - self.mid.attn_1 = AttnBlock(block_in) - self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in) + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, chunk_size=chunk_size) + self.mid.attn_1 = AttnBlock(block_in, chunk_size=chunk_size) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, chunk_size=chunk_size) # Output layers self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) - self.conv_out = Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1) + if chunk_size is None or chunk_size <= 0: + self.conv_out = Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1) + else: + self.conv_out = ChunkedConv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1, chunk_size=chunk_size) def forward(self, x: Tensor) -> Tensor: # Initial convolution @@ -291,6 +407,7 @@ def __init__( block_out_channels: Tuple[int, ...], num_res_blocks: int, ffactor_spatial: int, + chunk_size: Optional[int] = None, ): super().__init__() assert block_out_channels[0] % z_channels == 0 @@ -300,13 +417,16 @@ def __init__( self.num_res_blocks = num_res_blocks block_in = block_out_channels[0] - self.conv_in = Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + if chunk_size is None 
or chunk_size <= 0: + self.conv_in = Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) + else: + self.conv_in = ChunkedConv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1, chunk_size=chunk_size) # Middle blocks with attention self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in) - self.mid.attn_1 = AttnBlock(block_in) - self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in) + self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, chunk_size=chunk_size) + self.mid.attn_1 = AttnBlock(block_in, chunk_size=chunk_size) + self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, chunk_size=chunk_size) # Build upsampling blocks self.up = nn.ModuleList() @@ -316,7 +436,7 @@ def __init__( # Add residual blocks for this level (extra block for decoder) for _ in range(self.num_res_blocks + 1): - block.append(ResnetBlock(in_channels=block_in, out_channels=block_out)) + block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, chunk_size=chunk_size)) block_in = block_out up = nn.Module() @@ -327,14 +447,17 @@ def __init__( if add_spatial_upsample: assert i_level < len(block_out_channels) - 1 block_out = block_out_channels[i_level + 1] - up.upsample = Upsample(block_in, block_out) + up.upsample = Upsample(block_in, block_out, chunk_size=chunk_size) block_in = block_out self.up.append(up) # Output layers self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) - self.conv_out = Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1) + if chunk_size is None or chunk_size <= 0: + self.conv_out = Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.conv_out = ChunkedConv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1, chunk_size=chunk_size) def forward(self, z: Tensor) -> Tensor: # Initial processing with skip connection @@ -370,7 +493,7 @@ class HunyuanVAE2D(nn.Module): with 32x spatial compression and optional memory-efficient tiling for large images. 
""" - def __init__(self): + def __init__(self, chunk_size: Optional[int] = None): super().__init__() # Fixed configuration for Hunyuan Image-2.1 @@ -392,6 +515,7 @@ def __init__(self): block_out_channels=block_out_channels, num_res_blocks=layers_per_block, ffactor_spatial=ffactor_spatial, + chunk_size=chunk_size, ) self.decoder = Decoder( @@ -400,6 +524,7 @@ def __init__(self): block_out_channels=list(reversed(block_out_channels)), num_res_blocks=layers_per_block, ffactor_spatial=ffactor_spatial, + chunk_size=chunk_size, ) # Spatial tiling configuration for memory efficiency @@ -617,9 +742,9 @@ def decode(self, z: Tensor): return decoded -def load_vae(vae_path: str, device: torch.device, disable_mmap: bool = False) -> HunyuanVAE2D: - logger.info("Initializing VAE") - vae = HunyuanVAE2D() +def load_vae(vae_path: str, device: torch.device, disable_mmap: bool = False, chunk_size: Optional[int] = None) -> HunyuanVAE2D: + logger.info(f"Initializing VAE with chunk_size={chunk_size}") + vae = HunyuanVAE2D(chunk_size=chunk_size) logger.info(f"Loading VAE from {vae_path}") state_dict = load_safetensors(vae_path, device=device, disable_mmap=disable_mmap) diff --git a/library/strategy_base.py b/library/strategy_base.py index fad79682f..e88d273fc 100644 --- a/library/strategy_base.py +++ b/library/strategy_base.py @@ -626,6 +626,7 @@ def save_latents_to_disk( for key in npz.files: kwargs[key] = npz[key] + # TODO float() is needed if vae is in bfloat16. Remove it if vae is float16. kwargs["latents" + key_reso_suffix] = latents_tensor.float().cpu().numpy() kwargs["original_size" + key_reso_suffix] = np.array(original_size) kwargs["crop_ltrb" + key_reso_suffix] = np.array(crop_ltrb) From e7b8e9a7784c042a83a15aba76d05c8b186db6d8 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 21 Sep 2025 11:13:26 +0900 Subject: [PATCH 570/582] doc: add --vae_chunk_size option for training and inference --- docs/hunyuan_image_train_network.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index d31ff867f..658a7beb5 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -192,8 +192,8 @@ The script adds HunyuanImage-2.1 specific arguments. For common arguments (like - Caches the outputs of Qwen2.5-VL and byT5. This reduces memory usage. * `--cache_latents`, `--cache_latents_to_disk` - Caches the outputs of VAE. Similar functionality to [sdxl_train_network.py](sdxl_train_network.md). -* `--vae_enable_tiling` - - Enables tiling for VAE encoding and decoding to reduce VRAM usage. +* `--vae_chunk_size=` + - Enables chunked processing in the VAE to reduce VRAM usage during encoding and decoding. Specify the chunk size as an integer (e.g., `16`). Larger values use more VRAM but are faster. Default is `None` (no chunking). This option is useful when VRAM is limited (e.g., 8GB or 12GB).
日本語 @@ -453,6 +453,7 @@ python hunyuan_image_minimal_inference.py \ - `--guidance_scale`: CFG scale (default: 3.5) - `--flow_shift`: Flow matching shift parameter (default: 5.0) - `--text_encoder_cpu`: Run the text encoders on CPU to reduce VRAM usage +- `--vae_chunk_size`: Chunk size for VAE decoding to reduce memory usage (default: None, no chunking). 16 is recommended if enabled. `--split_attn` is not supported (since inference is done one at a time). `--fp8_vl` is not supported, please use CPU for the text encoder if VRAM is insufficient. @@ -468,6 +469,7 @@ python hunyuan_image_minimal_inference.py \ - `--guidance_scale`: CFGスケール(推奨: 3.5) - `--flow_shift`: Flow Matchingシフトパラメータ(デフォルト: 5.0) - `--text_encoder_cpu`: テキストエンコーダをCPUで実行してVRAM使用量削減 +- `--vae_chunk_size`: VAEデコーディングのチャンクサイズ(デフォルト: None、チャンク処理なし)。有効にする場合は16を推奨。 `--split_attn`はサポートされていません(1件ずつ推論するため)。`--fp8_vl`もサポートされていません。VRAMが不足する場合はテキストエンコーダをCPUで実行してください。 From 9621d9d637c69140200a4c0310b2fc95b6a1efd9 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 21 Sep 2025 12:34:40 +0900 Subject: [PATCH 571/582] feat: add Adaptive Projected Guidance parameters and noise rescaling --- hunyuan_image_minimal_inference.py | 21 ++++++++++++++ library/hunyuan_image_utils.py | 45 ++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index 711e911f5..d0184feb0 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -69,6 +69,24 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--guidance_scale", type=float, default=3.5, help="Guidance scale for classifier free guidance. Default is 3.5." ) + parser.add_argument( + "--apg_start_step_ocr", + type=int, + default=38, + help="Starting step for Adaptive Projected Guidance (APG) for image with text. Default is 38. Should be less than infer_steps, usually near the end.", + ) + parser.add_argument( + "--apg_start_step_general", + type=int, + default=5, + help="Starting step for Adaptive Projected Guidance (APG) for general image. Default is 5. Should be less than infer_steps, usually near the beginning.", + ) + parser.add_argument( + "--guidance_rescale", + type=float, + default=0.0, + help="Guidance rescale factor for steps without APG, 0.0 to 1.0. Default is 0.0 (no rescale)." + ) parser.add_argument("--prompt", type=str, default=None, help="prompt for generation") parser.add_argument("--negative_prompt", type=str, default="", help="negative prompt for generation, default is empty string") parser.add_argument("--image_size", type=int, nargs=2, default=[2048, 2048], help="image size, height and width") @@ -715,8 +733,11 @@ def generate_body( ocr_mask[0], args.guidance_scale, i, + apg_start_step_ocr=args.apg_start_step_ocr, + apg_start_step_general=args.apg_start_step_general, cfg_guider_ocr=cfg_guider_ocr, cfg_guider_general=cfg_guider_general, + guidance_rescale=args.guidance_rescale, ) # ensure latents dtype is consistent diff --git a/library/hunyuan_image_utils.py b/library/hunyuan_image_utils.py index 79756dd7e..a1e7d4e95 100644 --- a/library/hunyuan_image_utils.py +++ b/library/hunyuan_image_utils.py @@ -428,16 +428,52 @@ def __call__(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] return pred +def rescale_noise_cfg(guided_noise, conditional_noise, rescale_factor=0.0): + """ + Rescale guided noise prediction to prevent overexposure and improve image quality. 
+ + This implementation addresses the overexposure issue described in "Common Diffusion Noise + Schedules and Sample Steps are Flawed" (https://arxiv.org/pdf/2305.08891.pdf) (Section 3.4). + The rescaling preserves the statistical properties of the conditional prediction while reducing artifacts. + + Args: + guided_noise (torch.Tensor): Noise prediction from classifier-free guidance. + conditional_noise (torch.Tensor): Noise prediction from conditional model. + rescale_factor (float): Interpolation factor between original and rescaled predictions. + 0.0 = no rescaling, 1.0 = full rescaling. + + Returns: + torch.Tensor: Rescaled noise prediction with reduced overexposure. + """ + if rescale_factor == 0.0: + return guided_noise + + # Calculate standard deviation across spatial dimensions for both predictions + spatial_dims = list(range(1, conditional_noise.ndim)) + conditional_std = conditional_noise.std(dim=spatial_dims, keepdim=True) + guided_std = guided_noise.std(dim=spatial_dims, keepdim=True) + + # Rescale guided noise to match conditional noise statistics + std_ratio = conditional_std / guided_std + rescaled_prediction = guided_noise * std_ratio + + # Interpolate between original and rescaled predictions + final_prediction = rescale_factor * rescaled_prediction + (1.0 - rescale_factor) * guided_noise + + return final_prediction + + def apply_classifier_free_guidance( noise_pred_text: torch.Tensor, noise_pred_uncond: torch.Tensor, is_ocr: bool, guidance_scale: float, step: int, - apg_start_step_ocr: int = 75, - apg_start_step_general: int = 10, + apg_start_step_ocr: int = 38, + apg_start_step_general: int = 5, cfg_guider_ocr: AdaptiveProjectedGuidance = None, cfg_guider_general: AdaptiveProjectedGuidance = None, + guidance_rescale: float = 0.0, ): """ Apply classifier-free guidance with OCR-aware APG for batch_size=1. @@ -471,6 +507,11 @@ def apply_classifier_free_guidance( if step <= apg_start_step: # Standard classifier-free guidance noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale) + # Initialize APG guider state _ = cfg_guider(noise_pred_text, noise_pred_uncond, step=step) else: From 040d976597fc29416780b54bb9cd85f082e709b3 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 21 Sep 2025 13:03:14 +0900 Subject: [PATCH 572/582] feat: add guidance rescale options for Adaptive Projected Guidance in inference --- docs/hunyuan_image_train_network.md | 6 ++++++ hunyuan_image_minimal_inference.py | 20 +++++++++++++++++--- library/hunyuan_image_utils.py | 6 ++++-- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index 658a7beb5..165c3df40 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -454,6 +454,9 @@ python hunyuan_image_minimal_inference.py \ - `--flow_shift`: Flow matching shift parameter (default: 5.0) - `--text_encoder_cpu`: Run the text encoders on CPU to reduce VRAM usage - `--vae_chunk_size`: Chunk size for VAE decoding to reduce memory usage (default: None, no chunking). 16 is recommended if enabled. +- `--apg_start_step_general` and `--apg_start_step_ocr`: Start steps for APG (Adaptive Projected Guidance) if using APG during inference. `5` and `38` are the official recommended values for 50 steps. 
If this value exceeds `--infer_steps`, APG will not be applied. +- `--guidance_rescale`: Rescales the guidance for steps before APG starts. Default is `0.0` (no rescaling). If you use this option, a value around `0.5` might be good starting point. +- `--guidance_rescale_apg`: Rescales the guidance for APG. Default is `0.0` (no rescaling). This option doesn't seem to have a large effect, but if you use it, a value around `0.5` might be a good starting point. `--split_attn` is not supported (since inference is done one at a time). `--fp8_vl` is not supported, please use CPU for the text encoder if VRAM is insufficient. @@ -470,6 +473,9 @@ python hunyuan_image_minimal_inference.py \ - `--flow_shift`: Flow Matchingシフトパラメータ(デフォルト: 5.0) - `--text_encoder_cpu`: テキストエンコーダをCPUで実行してVRAM使用量削減 - `--vae_chunk_size`: VAEデコーディングのチャンクサイズ(デフォルト: None、チャンク処理なし)。有効にする場合は16を推奨。 +- `--apg_start_step_general` と `--apg_start_step_ocr`: 推論中にAPGを使用する場合の開始ステップ。50ステップの場合、公式推奨値はそれぞれ5と38です。この値が`--infer_steps`を超えると、APGは適用されません。 +- `--guidance_rescale`: APG開始前のステップに対するガイダンスのリスケーリング。デフォルトは0.0(リスケーリングなし)。使用する場合、0.5程度から始めて調整してください。 +- `--guidance_rescale_apg`: APGに対するガイダンスのリスケーリング。デフォルトは0.0(リスケーリングなし)。このオプションは大きな効果はないようですが、使用する場合は0.5程度から始めて調整してください。 `--split_attn`はサポートされていません(1件ずつ推論するため)。`--fp8_vl`もサポートされていません。VRAMが不足する場合はテキストエンコーダをCPUで実行してください。 diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index d0184feb0..3f63270bb 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -85,7 +85,13 @@ def parse_args() -> argparse.Namespace: "--guidance_rescale", type=float, default=0.0, - help="Guidance rescale factor for steps without APG, 0.0 to 1.0. Default is 0.0 (no rescale)." + help="Guidance rescale factor for steps without APG, 0.0 to 1.0. Default is 0.0 (no rescale).", + ) + parser.add_argument( + "--guidance_rescale_apg", + type=float, + default=0.0, + help="Guidance rescale factor for steps with APG, 0.0 to 1.0. Default is 0.0 (no rescale).", ) parser.add_argument("--prompt", type=str, default=None, help="prompt for generation") parser.add_argument("--negative_prompt", type=str, default="", help="negative prompt for generation, default is empty string") @@ -695,10 +701,18 @@ def generate_body( # Prepare Guider cfg_guider_ocr = hunyuan_image_utils.AdaptiveProjectedGuidance( - guidance_scale=10.0, eta=0.0, adaptive_projected_guidance_rescale=10.0, adaptive_projected_guidance_momentum=-0.5 + guidance_scale=10.0, + eta=0.0, + adaptive_projected_guidance_rescale=10.0, + adaptive_projected_guidance_momentum=-0.5, + guidance_rescale=args.guidance_rescale_apg, ) cfg_guider_general = hunyuan_image_utils.AdaptiveProjectedGuidance( - guidance_scale=10.0, eta=0.0, adaptive_projected_guidance_rescale=10.0, adaptive_projected_guidance_momentum=-0.5 + guidance_scale=10.0, + eta=0.0, + adaptive_projected_guidance_rescale=10.0, + adaptive_projected_guidance_momentum=-0.5, + guidance_rescale=args.guidance_rescale_apg, ) # Denoising loop diff --git a/library/hunyuan_image_utils.py b/library/hunyuan_image_utils.py index a1e7d4e95..3b0d68fdb 100644 --- a/library/hunyuan_image_utils.py +++ b/library/hunyuan_image_utils.py @@ -401,8 +401,6 @@ def __init__( guidance_rescale: float = 0.0, use_original_formulation: bool = False, ): - assert guidance_rescale == 0.0, "guidance_rescale > 0.0 not supported." 
- self.guidance_scale = guidance_scale self.adaptive_projected_guidance_momentum = adaptive_projected_guidance_momentum self.adaptive_projected_guidance_rescale = adaptive_projected_guidance_rescale @@ -425,6 +423,10 @@ def __call__(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] self.use_original_formulation, ) + if self.guidance_rescale > 0.0: + print(f"Applying guidance rescale with factor {self.guidance_rescale} at step {step}") + pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale) + return pred From 3876343fad5b710a11fcc381569927b89ba42904 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 21 Sep 2025 13:09:38 +0900 Subject: [PATCH 573/582] fix: remove print statement for guidance rescale in AdaptiveProjectedGuidance --- library/hunyuan_image_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/library/hunyuan_image_utils.py b/library/hunyuan_image_utils.py index 3b0d68fdb..8e95925ca 100644 --- a/library/hunyuan_image_utils.py +++ b/library/hunyuan_image_utils.py @@ -424,7 +424,6 @@ def __call__(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] ) if self.guidance_rescale > 0.0: - print(f"Applying guidance rescale with factor {self.guidance_rescale} at step {step}") pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale) return pred From 806d535ef1f906d0a85a79fe71d11a22e18957dc Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 21 Sep 2025 13:10:41 +0900 Subject: [PATCH 574/582] fix: block-wise scaling is overwritten by per-tensor scaling --- library/fp8_optimization_utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/library/fp8_optimization_utils.py b/library/fp8_optimization_utils.py index 82ec6bfc7..02f99ab6d 100644 --- a/library/fp8_optimization_utils.py +++ b/library/fp8_optimization_utils.py @@ -220,10 +220,6 @@ def quantize_weight( tensor_max = torch.max(torch.abs(tensor).view(-1)) scale = tensor_max / max_value - # Calculate scale factor - scale = torch.max(torch.abs(tensor.flatten())) / max_value - # print(f"Optimizing {key} with scale: {scale}") - # numerical safety scale = torch.clamp(scale, min=1e-8) scale = scale.to(torch.float32) # ensure scale is in float32 for division From e7b89826c5c516ad51a52326eca1ed97d7634d98 Mon Sep 17 00:00:00 2001 From: "Kohya S." <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 21 Sep 2025 13:29:58 +0900 Subject: [PATCH 575/582] Update library/custom_offloading_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- library/custom_offloading_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/library/custom_offloading_utils.py b/library/custom_offloading_utils.py index fe7e59d2b..0681dcdcb 100644 --- a/library/custom_offloading_utils.py +++ b/library/custom_offloading_utils.py @@ -264,10 +264,8 @@ def submit_move_blocks(self, blocks: Union[list[nn.Module], nn.ModuleList], bloc block_idx_to_cpu = block_idx block_idx_to_cuda = self.num_blocks - self.blocks_to_swap + block_idx - # this works for forward-only offloading. move upstream blocks to cuda block_idx_to_cuda = block_idx_to_cuda % self.num_blocks - self._submit_move_blocks(blocks, block_idx_to_cpu, block_idx_to_cuda) From 753c794549ac660cc39b1059605253ce4a575cef Mon Sep 17 00:00:00 2001 From: "Kohya S." 
<52813779+kohya-ss@users.noreply.github.com> Date: Sun, 21 Sep 2025 13:30:22 +0900 Subject: [PATCH 576/582] Update hunyuan_image_train_network.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- hunyuan_image_train_network.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index 228c9dbc1..a67e931d5 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -2,7 +2,6 @@ import copy import gc from typing import Any, Optional, Union, cast -import argparse import os import time from types import SimpleNamespace From 31f7df3b3adcbfdc5174b3d3109dcb64ee17e6c6 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Tue, 23 Sep 2025 18:53:36 +0900 Subject: [PATCH 577/582] doc: add --network_train_unet_only option for HunyuanImage-2.1 training --- docs/hunyuan_image_train_network.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index 165c3df40..b2bf113d6 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -123,6 +123,7 @@ accelerate launch --num_cpu_threads_per_process 1 hunyuan_image_train_network.py --network_module=networks.lora_hunyuan_image \ --network_dim=16 \ --network_alpha=1 \ + --network_train_unet_only \ --learning_rate=1e-4 \ --optimizer_type="AdamW8bit" \ --lr_scheduler="constant" \ @@ -139,6 +140,8 @@ accelerate launch --num_cpu_threads_per_process 1 hunyuan_image_train_network.py --cache_latents ``` +**HunyuanImage-2.1 training does not support LoRA modules for Text Encoders, so `--network_train_unet_only` is required.** +
日本語 @@ -165,6 +168,8 @@ The script adds HunyuanImage-2.1 specific arguments. For common arguments (like #### HunyuanImage-2.1 Training Parameters +* `--network_train_unet_only` **[Required]** + - Specifies that only the DiT model will be trained. LoRA modules for Text Encoders are not supported. * `--discrete_flow_shift=` - Specifies the shift value for the scheduler used in Flow Matching. Default is `5.0`. * `--model_prediction_type=` @@ -181,7 +186,7 @@ The script adds HunyuanImage-2.1 specific arguments. For common arguments (like * `--split_attn` - Splits the batch during attention computation to process one item at a time, reducing VRAM usage by avoiding attention mask computation. Can improve speed when using `torch`. Required when using `xformers` with batch size greater than 1. * `--fp8_scaled` - - Enables training the DiT model in scaled FP8 format. This can significantly reduce VRAM usage (can run with as little as 8GB VRAM when combined with `--blocks_to_swap`), but the training results may vary. This is a newer alternative to the unsupported `--fp8_base` option. + - Enables training the DiT model in scaled FP8 format. This can significantly reduce VRAM usage (can run with as little as 8GB VRAM when combined with `--blocks_to_swap`), but the training results may vary. This is a newer alternative to the unsupported `--fp8_base` option. See [Musubi Tuner's documentation](https://github.com/kohya-ss/musubi-tuner/blob/main/docs/advanced_config.md#fp8-weight-optimization-for-models--%E3%83%A2%E3%83%87%E3%83%AB%E3%81%AE%E9%87%8D%E3%81%BF%E3%81%AEfp8%E3%81%B8%E3%81%AE%E6%9C%80%E9%81%A9%E5%8C%96) for details. * `--fp8_vl` - Use FP8 for the VLM (Qwen2.5-VL) text encoder. * `--text_encoder_cpu` @@ -449,7 +454,7 @@ python hunyuan_image_minimal_inference.py \ **Key Options:** - `--fp8_scaled`: Use scaled FP8 format for reduced VRAM usage during inference - `--blocks_to_swap`: Swap blocks to CPU to reduce VRAM usage -- `--image_size`: Resolution (inference is most stable at 2048x2048) +- `--image_size`: Resolution in **height width** (inference is most stable at 2560x1536, 2304x1792, 2048x2048, 1792x2304, 1536x2560 according to the official repo) - `--guidance_scale`: CFG scale (default: 3.5) - `--flow_shift`: Flow matching shift parameter (default: 5.0) - `--text_encoder_cpu`: Run the text encoders on CPU to reduce VRAM usage From 58df9dffa447fc4e3614baf9bd961fa844f40fd7 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Tue, 23 Sep 2025 18:59:02 +0900 Subject: [PATCH 578/582] doc: update README with HunyuanImage-2.1 LoRA training details and requirements --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index da38a2416..c70dc257d 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,13 @@ If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed ### Recent Updates +Sep 23, 2025: +- HunyuanImage-2.1 LoRA training is supported. [PR #2198](https://github.com/kohya-ss/sd-scripts/pull/2198) for details. + - Please see [HunyuanImage-2.1 Training](./docs/hunyuan_image_train_network.md) for details. + - __HunyuanImage-2.1 training does not support LoRA modules for Text Encoders, so `--network_train_unet_only` is required.__ + - The training script is `hunyuan_image_train_network.py`. + - This includes changes to `train_network.py`, the base of the training script. Please let us know if you encounter any issues. 
+ Sep 13, 2025: - The loading speed of `.safetensors` files has been improved for SD3, FLUX.1 and Lumina. See [PR #2200](https://github.com/kohya-ss/sd-scripts/pull/2200) for more details. - Model loading can be up to 1.5 times faster. From 4b79d73504b8dbe28fa4b308dd20303457bf0772 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Wed, 24 Sep 2025 21:15:37 +0900 Subject: [PATCH 579/582] fix: update metadata construction to include model_config for flux --- networks/flux_extract_lora.py | 4 +++- networks/flux_merge_lora.py | 22 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/networks/flux_extract_lora.py b/networks/flux_extract_lora.py index 657287029..f1ae8f965 100644 --- a/networks/flux_extract_lora.py +++ b/networks/flux_extract_lora.py @@ -139,7 +139,9 @@ def str_to_dtype(p): if not no_metadata: title = os.path.splitext(os.path.basename(save_to))[0] - sai_metadata = sai_model_spec.build_metadata(lora_sd, False, False, False, True, False, time.time(), title, flux="dev") + sai_metadata = sai_model_spec.build_metadata( + lora_sd, False, False, False, True, False, time.time(), title, model_config={"flux": "dev"} + ) metadata.update(sai_metadata) save_to_file(save_to, lora_sd, metadata, save_dtype) diff --git a/networks/flux_merge_lora.py b/networks/flux_merge_lora.py index 855c0ed98..45ff67497 100644 --- a/networks/flux_merge_lora.py +++ b/networks/flux_merge_lora.py @@ -619,7 +619,16 @@ def merge(args): merged_from = sai_model_spec.build_merged_from([args.flux_model] + args.models) title = os.path.splitext(os.path.basename(args.save_to))[0] sai_metadata = sai_model_spec.build_metadata( - None, False, False, False, False, False, time.time(), title=title, merged_from=merged_from, flux="dev" + None, + False, + False, + False, + False, + False, + time.time(), + title=title, + merged_from=merged_from, + model_config={"flux": "dev"}, ) if flux_state_dict is not None and len(flux_state_dict) > 0: @@ -647,7 +656,16 @@ def merge(args): merged_from = sai_model_spec.build_merged_from(args.models) title = os.path.splitext(os.path.basename(args.save_to))[0] sai_metadata = sai_model_spec.build_metadata( - flux_state_dict, False, False, False, True, False, time.time(), title=title, merged_from=merged_from, flux="dev" + flux_state_dict, + False, + False, + False, + True, + False, + time.time(), + title=title, + merged_from=merged_from, + model_config={"flux": "dev"}, ) metadata.update(sai_metadata) From 6a826d21b1dfc631a02e517198fac83f793b2f90 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:06:17 +0900 Subject: [PATCH 580/582] feat: add new parameters for sample image inference configuration --- hunyuan_image_train_network.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/hunyuan_image_train_network.py b/hunyuan_image_train_network.py index a67e931d5..9ab351ea2 100644 --- a/hunyuan_image_train_network.py +++ b/hunyuan_image_train_network.py @@ -249,7 +249,15 @@ def encode_prompt(prpt): arg_c_null = None gen_args = SimpleNamespace( - image_size=(height, width), infer_steps=sample_steps, flow_shift=flow_shift, guidance_scale=cfg_scale, fp8=args.fp8_scaled + image_size=(height, width), + infer_steps=sample_steps, + flow_shift=flow_shift, + guidance_scale=cfg_scale, + fp8=args.fp8_scaled, + apg_start_step_ocr=38, + apg_start_step_general=5, + guidance_rescale=0.0, + guidance_rescale_apg=0.0, ) from hunyuan_image_minimal_inference import 
generate_body # import here to avoid circular import From a0c26a0efac8c56905153bee8870bcfbb6f96731 Mon Sep 17 00:00:00 2001 From: kohya-ss <52813779+kohya-ss@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:21:25 +0900 Subject: [PATCH 581/582] docs: enhance text encoder CPU usage instructions for HunyuanImage-2.1 training --- docs/hunyuan_image_train_network.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/hunyuan_image_train_network.md b/docs/hunyuan_image_train_network.md index b2bf113d6..b0e9cdd98 100644 --- a/docs/hunyuan_image_train_network.md +++ b/docs/hunyuan_image_train_network.md @@ -190,7 +190,7 @@ The script adds HunyuanImage-2.1 specific arguments. For common arguments (like * `--fp8_vl` - Use FP8 for the VLM (Qwen2.5-VL) text encoder. * `--text_encoder_cpu` - - Runs the text encoders on CPU to reduce VRAM usage. This is useful when VRAM is insufficient (less than 12GB). Encoding one text may take a few minutes (depending on CPU). It is highly recommended to use this option with `--cache_text_encoder_outputs_to_disk` to avoid repeated encoding every time training starts. + - Runs the text encoders on CPU to reduce VRAM usage. This is useful when VRAM is insufficient (less than 12GB). Encoding one text may take a few minutes (depending on CPU). It is highly recommended to use this option with `--cache_text_encoder_outputs_to_disk` to avoid repeated encoding every time training starts. **In addition, increasing `--num_cpu_threads_per_process` in the `accelerate launch` command, like `--num_cpu_threads_per_process=8` or `16`, can speed up encoding in some environments.** * `--blocks_to_swap=` **[Experimental Feature]** - Setting to reduce VRAM usage by swapping parts of the model (Transformer blocks) between CPU and GPU. Specify the number of blocks to swap as an integer (e.g., `18`). Larger values reduce VRAM usage but decrease training speed. Adjust according to your GPU's VRAM capacity. Can be used with `gradient_checkpointing`. * `--cache_text_encoder_outputs` From a33cad714edf97749d817bb4f0d141f3104ec223 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Wed, 15 Oct 2025 21:57:11 +0900 Subject: [PATCH 582/582] fix: error on batch generation closes #2209 --- hunyuan_image_minimal_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hunyuan_image_minimal_inference.py b/hunyuan_image_minimal_inference.py index 3f63270bb..8c14cf6f1 100644 --- a/hunyuan_image_minimal_inference.py +++ b/hunyuan_image_minimal_inference.py @@ -1001,7 +1001,7 @@ def process_batch_prompts(prompts_data: List[Dict], args: argparse.Namespace) -> all_precomputed_text_data.append(text_data) # Models should be removed from device after prepare_text_inputs - del tokenizer_batch, text_encoder_batch, temp_shared_models_txt, conds_cache_batch + del tokenizer_vlm, text_encoder_vlm_batch, tokenizer_byt5, text_encoder_byt5_batch, temp_shared_models_txt, conds_cache_batch gc.collect() # Force cleanup of Text Encoder from GPU memory clean_memory_on_device(device) @@ -1075,7 +1075,7 @@ def process_batch_prompts(prompts_data: List[Dict], args: argparse.Namespace) -> # save_output expects latent to be [BCTHW] or [CTHW]. generate returns [BCTHW] (batch size 1). # latent[0] is correct if generate returns it with batch dim. 
# The latent from generate is (1, C, T, H, W) - save_output(current_args, vae_for_batch, latent[0], device) # Pass vae_for_batch + save_output(current_args, vae_for_batch, latent, device) # Pass vae_for_batch vae_for_batch.to("cpu") # Move VAE back to CPU
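
Taken together, patches 571–573 add two related knobs around classifier-free guidance: `--guidance_rescale` applies the standard-deviation rescale from arXiv:2305.08891 (Section 3.4) to the plain-CFG steps taken before APG starts (`step <= apg_start_step`), while `--guidance_rescale_apg` feeds the same rescale into the APG guiders. The following self-contained sketch restates that rescale with illustrative random tensors to show its effect on prediction statistics:

```python
import torch


def cfg_with_rescale(pred_text: torch.Tensor, pred_uncond: torch.Tensor,
                     guidance_scale: float, rescale: float = 0.0) -> torch.Tensor:
    """Plain CFG followed by the std-matching rescale of arXiv:2305.08891, Section 3.4,
    mirroring rescale_noise_cfg() added in these patches."""
    guided = pred_uncond + guidance_scale * (pred_text - pred_uncond)
    if rescale == 0.0:
        return guided
    dims = list(range(1, pred_text.ndim))
    std_text = pred_text.std(dim=dims, keepdim=True)
    std_guided = guided.std(dim=dims, keepdim=True)
    rescaled = guided * (std_text / std_guided)  # match the conditional prediction's statistics
    return rescale * rescaled + (1.0 - rescale) * guided  # interpolate back toward plain CFG


# Illustrative values only: CFG inflates the prediction's standard deviation,
# and rescaling pulls it back toward the conditional prediction's std (~1.0 here).
pred_text = torch.randn(1, 4, 64, 64)
pred_uncond = torch.randn(1, 4, 64, 64)
plain = cfg_with_rescale(pred_text, pred_uncond, guidance_scale=3.5)
fixed = cfg_with_rescale(pred_text, pred_uncond, guidance_scale=3.5, rescale=0.5)
print(f"plain CFG std: {plain.std().item():.2f}, rescale=0.5 std: {fixed.std().item():.2f}")
```

With `guidance_scale=3.5` the raw CFG prediction's standard deviation is inflated well above that of the conditional prediction; a rescale factor of 0.5 pulls it roughly halfway back, consistent with the documentation's suggestion of around `0.5` as a starting point.
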