Description
My code:
import os

os.system(f'python dataset_tool.py --source={source_data} '
          f'--dest={train_data_2} '
          f'--resolution=1024x1024 '
          f'--max-images={img_num}')
!python train.py --outdir=/kaggle/working/training_run_512 --cfg=stylegan3-r --data=/kaggle/working/train_data_512.zip \
    --gpus=2 --batch=16 --gamma=6.6 --mirror=1 --kimg=5000 --snap=5 \
    --resume=https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhqu-1024x1024.pkl
Error message:
Training options:
{
  "G_kwargs": {
    "class_name": "training.networks_stylegan3.Generator",
    "z_dim": 512,
    "w_dim": 512,
    "mapping_kwargs": {
      "num_layers": 2
    },
    "channel_base": 65536,
    "channel_max": 1024,
    "magnitude_ema_beta": 0.9994456359721023,
    "conv_kernel": 1,
    "use_radial_filters": true
  },
  "D_kwargs": {
    "class_name": "training.networks_stylegan2.Discriminator",
    "block_kwargs": {
      "freeze_layers": 0
    },
    "mapping_kwargs": {},
    "epilogue_kwargs": {
      "mbstd_group_size": 4
    },
    "channel_base": 32768,
    "channel_max": 512
  },
  "G_opt_kwargs": {
    "class_name": "torch.optim.Adam",
    "betas": [
      0,
      0.99
    ],
    "eps": 1e-08,
    "lr": 0.0025
  },
  "D_opt_kwargs": {
    "class_name": "torch.optim.Adam",
    "betas": [
      0,
      0.99
    ],
    "eps": 1e-08,
    "lr": 0.002
  },
  "loss_kwargs": {
    "class_name": "training.loss.StyleGAN2Loss",
    "r1_gamma": 6.6,
    "blur_init_sigma": 0,
    "blur_fade_kimg": 100.0
  },
  "data_loader_kwargs": {
    "pin_memory": true,
    "prefetch_factor": 2,
    "num_workers": 3
  },
  "training_set_kwargs": {
    "class_name": "training.dataset.ImageFolderDataset",
    "path": "/kaggle/working/train_data_512.zip",
    "use_labels": false,
    "max_size": 2000,
    "xflip": true,
    "resolution": 1024,
    "random_seed": 0
  },
  "num_gpus": 2,
  "batch_size": 16,
  "batch_gpu": 8,
  "metrics": [
    "fid50k_full"
  ],
  "total_kimg": 5000,
  "kimg_per_tick": 4,
  "image_snapshot_ticks": 5,
  "network_snapshot_ticks": 5,
  "random_seed": 0,
  "ema_kimg": 5.0,
  "augment_kwargs": {
    "class_name": "training.augment.AugmentPipe",
    "xflip": 1,
    "rotate90": 1,
    "xint": 1,
    "scale": 1,
    "rotate": 1,
    "aniso": 1,
    "xfrac": 1,
    "brightness": 1,
    "contrast": 1,
    "lumaflip": 1,
    "hue": 1,
    "saturation": 1
  },
  "ada_target": 0.6,
  "resume_pkl": "https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhqu-1024x1024.pkl",
  "ada_kimg": 100,
  "ema_rampup": null,
  "run_dir": "/kaggle/working/training_run_512/00000-stylegan3-r-train_data_512-gpus2-batch16-gamma6.6"
}
Output directory: /kaggle/working/training_run_512/00000-stylegan3-r-train_data_512-gpus2-batch16-gamma6.6
Number of GPUs: 2
Batch size: 16 images
Training duration: 5000 kimg
Dataset path: /kaggle/working/train_data_512.zip
Dataset size: 2000 images
Dataset resolution: 1024
Dataset labels: False
Dataset x-flips: True
Creating output directory...
Launching processes...
Loading training set...
/usr/local/lib/python3.11/dist-packages/torch/utils/data/sampler.py:77: UserWarning: data_source argument is not used and will be removed in 2.2.0.You may still have custom implementation that utilizes it.
warnings.warn(
/usr/local/lib/python3.11/dist-packages/torch/utils/data/sampler.py:77: UserWarning: data_source argument is not used and will be removed in 2.2.0.You may still have custom implementation that utilizes it.
warnings.warn(
Num images: 4000
Image shape: [3, 1024, 1024]
Label shape: [0]
Constructing networks...
Resuming from "https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhqu-1024x1024.pkl"
Downloading https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhqu-1024x1024.pkl ... done
Setting up PyTorch plugin "bias_act_plugin"... /usr/local/lib/python3.11/dist-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
Done.
Setting up PyTorch plugin "filtered_lrelu_plugin"... /usr/local/lib/python3.11/dist-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
Done.
Generator Parameters Buffers Output shape Datatype
mapping.fc0 262656 - [8, 512] float32
mapping.fc1 262656 - [8, 512] float32
mapping - 512 [8, 16, 512] float32
synthesis.input.affine 2052 - [8, 4] float32
synthesis.input 1048576 3081 [8, 1024, 36, 36] float32
synthesis.L0_36_1024.affine 525312 - [8, 1024] float32
synthesis.L0_36_1024 1049600 157 [8, 1024, 36, 36] float32
synthesis.L1_36_1024.affine 525312 - [8, 1024] float32
synthesis.L1_36_1024 1049600 157 [8, 1024, 36, 36] float32
synthesis.L2_52_1024.affine 525312 - [8, 1024] float32
synthesis.L2_52_1024 1049600 169 [8, 1024, 52, 52] float32
synthesis.L3_52_1024.affine 525312 - [8, 1024] float32
synthesis.L3_52_1024 1049600 157 [8, 1024, 52, 52] float32
synthesis.L4_84_1024.affine 525312 - [8, 1024] float32
synthesis.L4_84_1024 1049600 169 [8, 1024, 84, 84] float32
synthesis.L5_148_1024.affine 525312 - [8, 1024] float32
synthesis.L5_148_1024 1049600 169 [8, 1024, 148, 148] float16
synthesis.L6_148_1024.affine 525312 - [8, 1024] float32
synthesis.L6_148_1024 1049600 157 [8, 1024, 148, 148] float16
synthesis.L7_276_645.affine 525312 - [8, 1024] float32
synthesis.L7_276_645 661125 169 [8, 645, 276, 276] float16
synthesis.L8_276_406.affine 330885 - [8, 645] float32
synthesis.L8_276_406 262276 157 [8, 406, 276, 276] float16
synthesis.L9_532_256.affine 208278 - [8, 406] float32
synthesis.L9_532_256 104192 169 [8, 256, 532, 532] float16
synthesis.L10_1044_161.affine 131328 - [8, 256] float32
synthesis.L10_1044_161 41377 169 [8, 161, 1044, 1044] float16
synthesis.L11_1044_102.affine 82593 - [8, 161] float32
synthesis.L11_1044_102 16524 157 [8, 102, 1044, 1044] float16
synthesis.L12_1044_64.affine 52326 - [8, 102] float32
synthesis.L12_1044_64 6592 25 [8, 64, 1044, 1044] float16
synthesis.L13_1024_64.affine 32832 - [8, 64] float32
synthesis.L13_1024_64 4160 25 [8, 64, 1024, 1024] float16
synthesis.L14_1024_3.affine 32832 - [8, 64] float32
synthesis.L14_1024_3 195 1 [8, 3, 1024, 1024] float16
synthesis - - [8, 3, 1024, 1024] float32
Total 15093151 5600 - -
Setting up PyTorch plugin "upfirdn2d_plugin"... /usr/local/lib/python3.11/dist-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
Done.
Discriminator Parameters Buffers Output shape Datatype
b1024.fromrgb 128 16 [8, 32, 1024, 1024] float16
b1024.skip 2048 16 [8, 64, 512, 512] float16
b1024.conv0 9248 16 [8, 32, 1024, 1024] float16
b1024.conv1 18496 16 [8, 64, 512, 512] float16
b1024 - 16 [8, 64, 512, 512] float16
b512.skip 8192 16 [8, 128, 256, 256] float16
b512.conv0 36928 16 [8, 64, 512, 512] float16
b512.conv1 73856 16 [8, 128, 256, 256] float16
b512 - 16 [8, 128, 256, 256] float16
b256.skip 32768 16 [8, 256, 128, 128] float16
b256.conv0 147584 16 [8, 128, 256, 256] float16
b256.conv1 295168 16 [8, 256, 128, 128] float16
b256 - 16 [8, 256, 128, 128] float16
b128.skip 131072 16 [8, 512, 64, 64] float16
b128.conv0 590080 16 [8, 256, 128, 128] float16
b128.conv1 1180160 16 [8, 512, 64, 64] float16
b128 - 16 [8, 512, 64, 64] float16
b64.skip 262144 16 [8, 512, 32, 32] float32
b64.conv0 2359808 16 [8, 512, 64, 64] float32
b64.conv1 2359808 16 [8, 512, 32, 32] float32
b64 - 16 [8, 512, 32, 32] float32
b32.skip 262144 16 [8, 512, 16, 16] float32
b32.conv0 2359808 16 [8, 512, 32, 32] float32
b32.conv1 2359808 16 [8, 512, 16, 16] float32
b32 - 16 [8, 512, 16, 16] float32
b16.skip 262144 16 [8, 512, 8, 8] float32
b16.conv0 2359808 16 [8, 512, 16, 16] float32
b16.conv1 2359808 16 [8, 512, 8, 8] float32
b16 - 16 [8, 512, 8, 8] float32
b8.skip 262144 16 [8, 512, 4, 4] float32
b8.conv0 2359808 16 [8, 512, 8, 8] float32
b8.conv1 2359808 16 [8, 512, 4, 4] float32
b8 - 16 [8, 512, 4, 4] float32
b4.mbstd - - [8, 513, 4, 4] float32
b4.conv 2364416 16 [8, 512, 4, 4] float32
b4.fc 4194816 - [8, 512] float32
b4.out 513 - [8, 1] float32
Total 29012513 544 - -
Setting up augmentation...
Distributing across 2 GPUs...
Setting up training phases...
[rank0]:[W1111 15:17:42.853614781 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
W1111 15:17:43.083000 1457 torch/multiprocessing/spawn.py:169] Terminating process 1464 via signal SIGTERM
Traceback (most recent call last):
File "/kaggle/working/stylegan3/train.py", line 286, in
main() # pylint: disable=no-value-for-parameter
^^^^^^
File "/usr/local/lib/python3.11/dist-packages/click/core.py", line 1462, in call
return self.main(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/click/core.py", line 1383, in main
rv = self.invoke(ctx)
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/click/core.py", line 1246, in invoke
return ctx.invoke(self.callback, **ctx.params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/click/core.py", line 814, in invoke
return callback(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/kaggle/working/stylegan3/train.py", line 281, in main
launch_training(c=c, desc=desc, outdir=opts.outdir, dry_run=opts.dry_run)
File "/kaggle/working/stylegan3/train.py", line 98, in launch_training
torch.multiprocessing.spawn(fn=subprocess_fn, args=(c, temp_dir), nprocs=c.num_gpus)
File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/spawn.py", line 340, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/spawn.py", line 296, in start_processes
while not context.join():
^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/spawn.py", line 215, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/spawn.py", line 90, in _wrap
fn(i, *args)
File "/kaggle/working/stylegan3/train.py", line 47, in subprocess_fn
training_loop.training_loop(rank=rank, **c)
File "/kaggle/working/stylegan3/training/training_loop.py", line 197, in training_loop
opt = dnnlib.util.construct_class_by_name(params=module.parameters(), **opt_kwargs) # subclass of torch.optim.Optimizer
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/kaggle/working/stylegan3/dnnlib/util.py", line 303, in construct_class_by_name
return call_func_by_name(*args, func_name=class_name, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/kaggle/working/stylegan3/dnnlib/util.py", line 298, in call_func_by_name
return func_obj(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/optim/adam.py", line 71, in init
raise ValueError("betas must be either both floats or both Tensors")
ValueError: betas must be either both floats or both Tensors
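
For anyone hitting the same error: recent PyTorch releases type-check Adam's betas at construction time (the adam.py line 71 check in the traceback), and the training options above show the mixed pair "betas": [0, 0.99], an int next to a float. A minimal workaround sketch follows; it assumes stylegan3's train.py still builds the optimizer kwargs as in the comment below (that exact line is an assumption about the upstream source). Making both betas floats satisfies the check:

import torch

# Assumption: stylegan3/train.py defines the optimizer kwargs roughly as
#   c.G_opt_kwargs = dnnlib.EasyDict(class_name='torch.optim.Adam', betas=[0, 0.99], eps=1e-8)
# Changing the integer beta to a float there (and in the matching D_opt_kwargs
# line) should avoid the ValueError:
#   betas=[0.0, 0.99]
#
# Standalone check that the type validation is what fails:
param = torch.nn.Parameter(torch.zeros(1))
try:
    torch.optim.Adam([param], betas=[0, 0.99])    # int + float: rejected by newer torch
except ValueError as e:
    print(e)  # betas must be either both floats or both Tensors
torch.optim.Adam([param], betas=[0.0, 0.99])      # both floats: constructs fine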