AlphaVAE: Unified End-to-End RGBA Image Reconstruction and Generation with Alpha-Aware Representation Learning
This is the official repository for "AlphaVAE: Unified End-to-End RGBA Image Reconstruction and Generation with Alpha-Aware Representation Learning".
conda create -n AlphaVAE python=3.10
conda activate AlphaVAE
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
pip install -r requirements.txt
pip install taming-transformers-rom1504 # A portable, easy-to-install packaging of taming-transformers (CompVis)
# Evaluation dependencies
pip install pyiqa
pip install tokenizers==0.21.1 transformers==4.51.1

Pretrained model checkpoints are available at:
models/
├── FLUX.1-dev/
├── finetune_VAE/
│   ├── config.json
│   ├── diffusion_pytorch_model.safetensors
│   └── finetune_diffusion/
│       └── pytorch_lora_weights.safetensors
└── convert.py

To run inference using pretrained models:
# VAE
bash inference/infer_vae.sh
# T2I
bash inference/infer_t2i.sh

Run the following commands to start training:
# convert the original VAE to support 4-channel RGBA input
python models/convert.py --src models/FLUX.1-dev/vae --dst models/FLUX.1-dev/rgba_vae
# VAE
bash train/train_vae.sh
# Diffusion
bash train/train_diffusion_lora.sh

📂 Before running evaluation, please download the dataset from Hugging Face.
tar -xzvf data.tar.gz
To evaluate model performance:
# VAE
bash validation_pipeline/vae_eval_pipeline.sh
bash validation_pipeline/vae_generation_and_eval_pipeline.sh
# Diffusion
bash validation_pipeline/t2i_eval_pipeline.sh
bash validation_pipeline/diffusion_generation_and_eval_pipeline.sh

@misc{wang2025alphavaeunifiedendtoendrgba,
title={AlphaVAE: Unified End-to-End RGBA Image Reconstruction and Generation with Alpha-Aware Representation Learning},
author={Zile Wang and Hao Yu and Jiabo Zhan and Chun Yuan},
year={2025},
eprint={2507.09308},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2507.09308},
}
@article{yu2025omnialpha0,
title = {OmniAlpha: A Sequence-to-Sequence Framework for Unified Multi-Task RGBA Generation},
author = {Hao Yu and Jiabo Zhan and Zile Wang and Jinglin Wang and Huaisong Zhang and Hongyu Li and Xinrui Chen and Yongxian Wei and Chun Yuan},
year = {2025},
journal = {arXiv preprint arXiv:2511.20211}
}