Skip to content

Can't run python train.py - betas must be either both floats or both Tensors #656

@MonoPT

Description

@MonoPT

My code:
os.system(f'python dataset_tool.py --source={source_data} '
f'--dest={train_data_2} '
f'--resolution=1024x1024 '
f'--max-images={img_num} '
)

!python train.py --outdir=/kaggle/working/training_run_512 --cfg=stylegan3-r --data=/kaggle/working/train_data_512.zip
--gpus=2 --batch=16 --gamma=6.6 --mirror=1 --kimg=5000 --snap=5
--resume=https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhqu-1024x1024.pkl

Error message:
Training options:
{
"G_kwargs": {
"class_name": "training.networks_stylegan3.Generator",
"z_dim": 512,
"w_dim": 512,
"mapping_kwargs": {
"num_layers": 2
},
"channel_base": 65536,
"channel_max": 1024,
"magnitude_ema_beta": 0.9994456359721023,
"conv_kernel": 1,
"use_radial_filters": true
},
"D_kwargs": {
"class_name": "training.networks_stylegan2.Discriminator",
"block_kwargs": {
"freeze_layers": 0
},
"mapping_kwargs": {},
"epilogue_kwargs": {
"mbstd_group_size": 4
},
"channel_base": 32768,
"channel_max": 512
},
"G_opt_kwargs": {
"class_name": "torch.optim.Adam",
"betas": [
0,
0.99
],
"eps": 1e-08,
"lr": 0.0025
},
"D_opt_kwargs": {
"class_name": "torch.optim.Adam",
"betas": [
0,
0.99
],
"eps": 1e-08,
"lr": 0.002
},
"loss_kwargs": {
"class_name": "training.loss.StyleGAN2Loss",
"r1_gamma": 6.6,
"blur_init_sigma": 0,
"blur_fade_kimg": 100.0
},
"data_loader_kwargs": {
"pin_memory": true,
"prefetch_factor": 2,
"num_workers": 3
},
"training_set_kwargs": {
"class_name": "training.dataset.ImageFolderDataset",
"path": "/kaggle/working/train_data_512.zip",
"use_labels": false,
"max_size": 2000,
"xflip": true,
"resolution": 1024,
"random_seed": 0
},
"num_gpus": 2,
"batch_size": 16,
"batch_gpu": 8,
"metrics": [
"fid50k_full"
],
"total_kimg": 5000,
"kimg_per_tick": 4,
"image_snapshot_ticks": 5,
"network_snapshot_ticks": 5,
"random_seed": 0,
"ema_kimg": 5.0,
"augment_kwargs": {
"class_name": "training.augment.AugmentPipe",
"xflip": 1,
"rotate90": 1,
"xint": 1,
"scale": 1,
"rotate": 1,
"aniso": 1,
"xfrac": 1,
"brightness": 1,
"contrast": 1,
"lumaflip": 1,
"hue": 1,
"saturation": 1
},
"ada_target": 0.6,
"resume_pkl": "https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhqu-1024x1024.pkl",
"ada_kimg": 100,
"ema_rampup": null,
"run_dir": "/kaggle/working/training_run_512/00000-stylegan3-r-train_data_512-gpus2-batch16-gamma6.6"
}

Output directory: /kaggle/working/training_run_512/00000-stylegan3-r-train_data_512-gpus2-batch16-gamma6.6
Number of GPUs: 2
Batch size: 16 images
Training duration: 5000 kimg
Dataset path: /kaggle/working/train_data_512.zip
Dataset size: 2000 images
Dataset resolution: 1024
Dataset labels: False
Dataset x-flips: True

Creating output directory...
Launching processes...
Loading training set...
/usr/local/lib/python3.11/dist-packages/torch/utils/data/sampler.py:77: UserWarning: data_source argument is not used and will be removed in 2.2.0.You may still have custom implementation that utilizes it.
warnings.warn(
/usr/local/lib/python3.11/dist-packages/torch/utils/data/sampler.py:77: UserWarning: data_source argument is not used and will be removed in 2.2.0.You may still have custom implementation that utilizes it.
warnings.warn(

Num images: 4000
Image shape: [3, 1024, 1024]
Label shape: [0]

Constructing networks...
Resuming from "https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhqu-1024x1024.pkl"
Downloading https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhqu-1024x1024.pkl ... done
Setting up PyTorch plugin "bias_act_plugin"... /usr/local/lib/python3.11/dist-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
Done.
Setting up PyTorch plugin "filtered_lrelu_plugin"... /usr/local/lib/python3.11/dist-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
Done.

Generator Parameters Buffers Output shape Datatype


mapping.fc0 262656 - [8, 512] float32
mapping.fc1 262656 - [8, 512] float32
mapping - 512 [8, 16, 512] float32
synthesis.input.affine 2052 - [8, 4] float32
synthesis.input 1048576 3081 [8, 1024, 36, 36] float32
synthesis.L0_36_1024.affine 525312 - [8, 1024] float32
synthesis.L0_36_1024 1049600 157 [8, 1024, 36, 36] float32
synthesis.L1_36_1024.affine 525312 - [8, 1024] float32
synthesis.L1_36_1024 1049600 157 [8, 1024, 36, 36] float32
synthesis.L2_52_1024.affine 525312 - [8, 1024] float32
synthesis.L2_52_1024 1049600 169 [8, 1024, 52, 52] float32
synthesis.L3_52_1024.affine 525312 - [8, 1024] float32
synthesis.L3_52_1024 1049600 157 [8, 1024, 52, 52] float32
synthesis.L4_84_1024.affine 525312 - [8, 1024] float32
synthesis.L4_84_1024 1049600 169 [8, 1024, 84, 84] float32
synthesis.L5_148_1024.affine 525312 - [8, 1024] float32
synthesis.L5_148_1024 1049600 169 [8, 1024, 148, 148] float16
synthesis.L6_148_1024.affine 525312 - [8, 1024] float32
synthesis.L6_148_1024 1049600 157 [8, 1024, 148, 148] float16
synthesis.L7_276_645.affine 525312 - [8, 1024] float32
synthesis.L7_276_645 661125 169 [8, 645, 276, 276] float16
synthesis.L8_276_406.affine 330885 - [8, 645] float32
synthesis.L8_276_406 262276 157 [8, 406, 276, 276] float16
synthesis.L9_532_256.affine 208278 - [8, 406] float32
synthesis.L9_532_256 104192 169 [8, 256, 532, 532] float16
synthesis.L10_1044_161.affine 131328 - [8, 256] float32
synthesis.L10_1044_161 41377 169 [8, 161, 1044, 1044] float16
synthesis.L11_1044_102.affine 82593 - [8, 161] float32
synthesis.L11_1044_102 16524 157 [8, 102, 1044, 1044] float16
synthesis.L12_1044_64.affine 52326 - [8, 102] float32
synthesis.L12_1044_64 6592 25 [8, 64, 1044, 1044] float16
synthesis.L13_1024_64.affine 32832 - [8, 64] float32
synthesis.L13_1024_64 4160 25 [8, 64, 1024, 1024] float16
synthesis.L14_1024_3.affine 32832 - [8, 64] float32
synthesis.L14_1024_3 195 1 [8, 3, 1024, 1024] float16
synthesis - - [8, 3, 1024, 1024] float32


Total 15093151 5600 - -

Setting up PyTorch plugin "upfirdn2d_plugin"... /usr/local/lib/python3.11/dist-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
Done.

Discriminator Parameters Buffers Output shape Datatype


b1024.fromrgb 128 16 [8, 32, 1024, 1024] float16
b1024.skip 2048 16 [8, 64, 512, 512] float16
b1024.conv0 9248 16 [8, 32, 1024, 1024] float16
b1024.conv1 18496 16 [8, 64, 512, 512] float16
b1024 - 16 [8, 64, 512, 512] float16
b512.skip 8192 16 [8, 128, 256, 256] float16
b512.conv0 36928 16 [8, 64, 512, 512] float16
b512.conv1 73856 16 [8, 128, 256, 256] float16
b512 - 16 [8, 128, 256, 256] float16
b256.skip 32768 16 [8, 256, 128, 128] float16
b256.conv0 147584 16 [8, 128, 256, 256] float16
b256.conv1 295168 16 [8, 256, 128, 128] float16
b256 - 16 [8, 256, 128, 128] float16
b128.skip 131072 16 [8, 512, 64, 64] float16
b128.conv0 590080 16 [8, 256, 128, 128] float16
b128.conv1 1180160 16 [8, 512, 64, 64] float16
b128 - 16 [8, 512, 64, 64] float16
b64.skip 262144 16 [8, 512, 32, 32] float32
b64.conv0 2359808 16 [8, 512, 64, 64] float32
b64.conv1 2359808 16 [8, 512, 32, 32] float32
b64 - 16 [8, 512, 32, 32] float32
b32.skip 262144 16 [8, 512, 16, 16] float32
b32.conv0 2359808 16 [8, 512, 32, 32] float32
b32.conv1 2359808 16 [8, 512, 16, 16] float32
b32 - 16 [8, 512, 16, 16] float32
b16.skip 262144 16 [8, 512, 8, 8] float32
b16.conv0 2359808 16 [8, 512, 16, 16] float32
b16.conv1 2359808 16 [8, 512, 8, 8] float32
b16 - 16 [8, 512, 8, 8] float32
b8.skip 262144 16 [8, 512, 4, 4] float32
b8.conv0 2359808 16 [8, 512, 8, 8] float32
b8.conv1 2359808 16 [8, 512, 4, 4] float32
b8 - 16 [8, 512, 4, 4] float32
b4.mbstd - - [8, 513, 4, 4] float32
b4.conv 2364416 16 [8, 512, 4, 4] float32
b4.fc 4194816 - [8, 512] float32
b4.out 513 - [8, 1] float32


Total 29012513 544 - -

Setting up augmentation...
Distributing across 2 GPUs...
Setting up training phases...
[rank0]:[W1111 15:17:42.853614781 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
W1111 15:17:43.083000 1457 torch/multiprocessing/spawn.py:169] Terminating process 1464 via signal SIGTERM
Traceback (most recent call last):
File "/kaggle/working/stylegan3/train.py", line 286, in <module>
main() # pylint: disable=no-value-for-parameter
^^^^^^
File "/usr/local/lib/python3.11/dist-packages/click/core.py", line 1462, in __call__
return self.main(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/click/core.py", line 1383, in main
rv = self.invoke(ctx)
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/click/core.py", line 1246, in invoke
return ctx.invoke(self.callback, **ctx.params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/click/core.py", line 814, in invoke
return callback(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/kaggle/working/stylegan3/train.py", line 281, in main
launch_training(c=c, desc=desc, outdir=opts.outdir, dry_run=opts.dry_run)
File "/kaggle/working/stylegan3/train.py", line 98, in launch_training
torch.multiprocessing.spawn(fn=subprocess_fn, args=(c, temp_dir), nprocs=c.num_gpus)
File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/spawn.py", line 340, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/spawn.py", line 296, in start_processes
while not context.join():
^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/spawn.py", line 215, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/torch/multiprocessing/spawn.py", line 90, in _wrap
fn(i, *args)
File "/kaggle/working/stylegan3/train.py", line 47, in subprocess_fn
training_loop.training_loop(rank=rank, **c)
File "/kaggle/working/stylegan3/training/training_loop.py", line 197, in training_loop
opt = dnnlib.util.construct_class_by_name(params=module.parameters(), **opt_kwargs) # subclass of torch.optim.Optimizer
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/kaggle/working/stylegan3/dnnlib/util.py", line 303, in construct_class_by_name
return call_func_by_name(*args, func_name=class_name, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/kaggle/working/stylegan3/dnnlib/util.py", line 298, in call_func_by_name
return func_obj(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/optim/adam.py", line 71, in __init__
raise ValueError("betas must be either both floats or both Tensors")
ValueError: betas must be either both floats or both Tensors

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions