The program goes wrong once I use 2 GPUs. Can anyone please help? #277

Open
DJstepbystep opened this issue Mar 2, 2023 · 3 comments

Comments

@DJstepbystep

Training options:
{
  "num_gpus": 2,
  "image_snapshot_ticks": 10,
  "network_snapshot_ticks": 10,
  "metrics": [
    "fid50k_full"
  ],
  "random_seed": 0,
  "training_set_kwargs": {
    "class_name": "training.dataset.ImageFolderDataset",
    "path": "/root/autodl-tmp/stylegan/img/NEW20230111allrotate.zip",
    "use_labels": false,
    "max_size": 13276,
    "xflip": false,
    "resolution": 512
  },
  "data_loader_kwargs": {
    "pin_memory": true,
    "num_workers": 3,
    "prefetch_factor": 2
  },
  "G_kwargs": {
    "class_name": "training.networks.Generator",
    "z_dim": 512,
    "w_dim": 512,
    "mapping_kwargs": {
      "num_layers": 8
    },
    "synthesis_kwargs": {
      "channel_base": 32768,
      "channel_max": 512,
      "num_fp16_res": 4,
      "conv_clamp": 256
    }
  },
  "D_kwargs": {
    "class_name": "training.networks.Discriminator",
    "block_kwargs": {},
    "mapping_kwargs": {},
    "epilogue_kwargs": {
      "mbstd_group_size": 4
    },
    "channel_base": 32768,
    "channel_max": 512,
    "num_fp16_res": 4,
    "conv_clamp": 256
  },
  "G_opt_kwargs": {
    "class_name": "torch.optim.Adam",
    "lr": 0.002,
    "betas": [
      0,
      0.99
    ],
    "eps": 1e-08
  },
  "D_opt_kwargs": {
    "class_name": "torch.optim.Adam",
    "lr": 0.002,
    "betas": [
      0,
      0.99
    ],
    "eps": 1e-08
  },
  "loss_kwargs": {
    "class_name": "training.loss.StyleGAN2Loss",
    "r1_gamma": 6.6
  },
  "total_kimg": 3000,
  "batch_size": 4,
  "batch_gpu": 2,
  "ema_kimg": 10,
  "ema_rampup": null,
  "ada_target": 0.6,
  "augment_kwargs": {
    "class_name": "training.augment.AugmentPipe",
    "xflip": 1,
    "rotate90": 1,
    "xint": 1,
    "scale": 1,
    "rotate": 1,
    "aniso": 1,
    "xfrac": 1,
    "brightness": 1,
    "contrast": 1,
    "lumaflip": 1,
    "hue": 1,
    "saturation": 1
  },
  "run_dir": "./training-runs/00013-NEW20230111allrotate-stylegan2-gamma6.6-kimg3000-batch4"
}
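
For context, each *_kwargs block above is consumed by dnnlib.util.construct_class_by_name, which resolves class_name and forwards the remaining keys to the constructor. A simplified sketch of the relevant lines in training/training_loop.py, assuming G_kwargs and D_kwargs are the dicts from the JSON above and filling in the dataset-dependent values from this run:

```python
import torch
import dnnlib

device = torch.device('cuda')

# Dataset-dependent arguments for this run: unconditional, 512x512 RGB images.
common_kwargs = dict(c_dim=0, img_resolution=512, img_channels=3)

# class_name selects training.networks.Generator / Discriminator;
# every other key in the JSON block becomes a constructor argument.
G = dnnlib.util.construct_class_by_name(**G_kwargs, **common_kwargs).train().requires_grad_(False).to(device)
D = dnnlib.util.construct_class_by_name(**D_kwargs, **common_kwargs).train().requires_grad_(False).to(device)
```
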

Output directory: ./training-runs/00013-NEW20230111allrotate-stylegan2-gamma6.6-kimg3000-batch4
Training data: /root/autodl-tmp/stylegan/img/NEW20230111allrotate.zip
Training duration: 3000 kimg
Number of GPUs: 2
Number of images: 13276
Image resolution: 512
Conditional model: False
Dataset x-flips: False

Creating output directory...
Launching processes...
Loading training set...

Num images: 13276
Image shape: [3, 512, 512]
Label shape: [0]

Constructing networks...
Setting up PyTorch plugin "bias_act_plugin"... Done.
Setting up PyTorch plugin "upfirdn2d_plugin"... Done.

Generator Parameters Buffers Output shape Datatype


mapping.fc0 262656 - [2, 512] float32
mapping.fc1 262656 - [2, 512] float32
mapping.fc2 262656 - [2, 512] float32
mapping.fc3 262656 - [2, 512] float32
mapping.fc4 262656 - [2, 512] float32
mapping.fc5 262656 - [2, 512] float32
mapping.fc6 262656 - [2, 512] float32
mapping.fc7 262656 - [2, 512] float32
mapping - 512 [2, 16, 512] float32
synthesis.b4.conv1 2622465 32 [2, 512, 4, 4] float32
synthesis.b4.torgb 264195 - [2, 3, 4, 4] float32
synthesis.b4:0 8192 16 [2, 512, 4, 4] float32
synthesis.b4:1 - - [2, 512, 4, 4] float32
synthesis.b8.conv0 2622465 80 [2, 512, 8, 8] float32
synthesis.b8.conv1 2622465 80 [2, 512, 8, 8] float32
synthesis.b8.torgb 264195 - [2, 3, 8, 8] float32
synthesis.b8:0 - 16 [2, 512, 8, 8] float32
synthesis.b8:1 - - [2, 512, 8, 8] float32
synthesis.b16.conv0 2622465 272 [2, 512, 16, 16] float32
synthesis.b16.conv1 2622465 272 [2, 512, 16, 16] float32
synthesis.b16.torgb 264195 - [2, 3, 16, 16] float32
synthesis.b16:0 - 16 [2, 512, 16, 16] float32
synthesis.b16:1 - - [2, 512, 16, 16] float32
synthesis.b32.conv0 2622465 1040 [2, 512, 32, 32] float32
synthesis.b32.conv1 2622465 1040 [2, 512, 32, 32] float32
synthesis.b32.torgb 264195 - [2, 3, 32, 32] float32
synthesis.b32:0 - 16 [2, 512, 32, 32] float32
synthesis.b32:1 - - [2, 512, 32, 32] float32
synthesis.b64.conv0 2622465 4112 [2, 512, 64, 64] float16
synthesis.b64.conv1 2622465 4112 [2, 512, 64, 64] float16
synthesis.b64.torgb 264195 - [2, 3, 64, 64] float16
synthesis.b64:0 - 16 [2, 512, 64, 64] float16
synthesis.b64:1 - - [2, 512, 64, 64] float32
synthesis.b128.conv0 1442561 16400 [2, 256, 128, 128] float16
synthesis.b128.conv1 721409 16400 [2, 256, 128, 128] float16
synthesis.b128.torgb 132099 - [2, 3, 128, 128] float16
synthesis.b128:0 - 16 [2, 256, 128, 128] float16
synthesis.b128:1 - - [2, 256, 128, 128] float32
synthesis.b256.conv0 426369 65552 [2, 128, 256, 256] float16
synthesis.b256.conv1 213249 65552 [2, 128, 256, 256] float16
synthesis.b256.torgb 66051 - [2, 3, 256, 256] float16
synthesis.b256:0 - 16 [2, 128, 256, 256] float16
synthesis.b256:1 - - [2, 128, 256, 256] float32
synthesis.b512.conv0 139457 262160 [2, 64, 512, 512] float16
synthesis.b512.conv1 69761 262160 [2, 64, 512, 512] float16
synthesis.b512.torgb 33027 - [2, 3, 512, 512] float16
synthesis.b512:0 - 16 [2, 64, 512, 512] float16
synthesis.b512:1 - - [2, 64, 512, 512] float32


Total 30276583 699904 - -

Discriminator Parameters Buffers Output shape Datatype


b512.fromrgb 256 16 [2, 64, 512, 512] float16
b512.skip 8192 16 [2, 128, 256, 256] float16
b512.conv0 36928 16 [2, 64, 512, 512] float16
b512.conv1 73856 16 [2, 128, 256, 256] float16
b512 - 16 [2, 128, 256, 256] float16
b256.skip 32768 16 [2, 256, 128, 128] float16
b256.conv0 147584 16 [2, 128, 256, 256] float16
b256.conv1 295168 16 [2, 256, 128, 128] float16
b256 - 16 [2, 256, 128, 128] float16
b128.skip 131072 16 [2, 512, 64, 64] float16
b128.conv0 590080 16 [2, 256, 128, 128] float16
b128.conv1 1180160 16 [2, 512, 64, 64] float16
b128 - 16 [2, 512, 64, 64] float16
b64.skip 262144 16 [2, 512, 32, 32] float16
b64.conv0 2359808 16 [2, 512, 64, 64] float16
b64.conv1 2359808 16 [2, 512, 32, 32] float16
b64 - 16 [2, 512, 32, 32] float16
b32.skip 262144 16 [2, 512, 16, 16] float32
b32.conv0 2359808 16 [2, 512, 32, 32] float32
b32.conv1 2359808 16 [2, 512, 16, 16] float32
b32 - 16 [2, 512, 16, 16] float32
b16.skip 262144 16 [2, 512, 8, 8] float32
b16.conv0 2359808 16 [2, 512, 16, 16] float32
b16.conv1 2359808 16 [2, 512, 8, 8] float32
b16 - 16 [2, 512, 8, 8] float32
b8.skip 262144 16 [2, 512, 4, 4] float32
b8.conv0 2359808 16 [2, 512, 8, 8] float32
b8.conv1 2359808 16 [2, 512, 4, 4] float32
b8 - 16 [2, 512, 4, 4] float32
b4.mbstd - - [2, 513, 4, 4] float32
b4.conv 2364416 16 [2, 512, 4, 4] float32
b4.fc 4194816 - [2, 512] float32
b4.out 513 - [2, 1] float32


Total 28982849 480 - -

Setting up augmentation...
Distributing across 2 GPUs...
Setting up training phases...
Exporting sample images...
Initializing logs...
Skipping tfevents export: No module named 'tensorboard'
Training for 3000 kimg...

tick 0 kimg 0.0 time 30s sec/tick 2.4 sec/kimg 601.84 maintenance 27.9 cpumem 4.56 gpumem 5.39 augment 0.000
Traceback (most recent call last):
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/train.py", line 538, in
main() # pylint: disable=no-value-for-parameter
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/core.py", line 1128, in call
return self.main(*args, **kwargs)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/core.py", line 1053, in main
rv = self.invoke(ctx)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/core.py", line 1395, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/core.py", line 754, in invoke
return __callback(*args, **kwargs)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/click/decorators.py", line 26, in new_func
return f(get_current_context(), *args, **kwargs)
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/train.py", line 533, in main
torch.multiprocessing.spawn(fn=subprocess_fn, args=(args, temp_dir), nprocs=args.num_gpus)
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/root/miniconda3/envs/stylegan/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/train.py", line 383, in subprocess_fn
training_loop.training_loop(rank=rank, **args)
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/training/training_loop.py", line 409, in training_loop
misc.check_ddp_consistency(module, ignore_regex=r'.*\.w_avg')
File "/root/autodl-tmp/stylegan/stylegan2-ada-pytorch/torch_utils/misc.py", line 187, in check_ddp_consistency
assert (nan_to_num(tensor) == nan_to_num(other)).all(), fullname
AssertionError: Discriminator.b512.fromrgb.weight
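
The assertion comes from check_ddp_consistency in torch_utils/misc.py, which runs after setup and requires every parameter and buffer to be bit-identical across ranks by comparing each tensor against rank 0's broadcast copy; here Discriminator.b512.fromrgb.weight already differs between the two processes before training starts. A simplified sketch of what the check does (not the verbatim source):

```python
import re
import torch

def check_ddp_consistency_sketch(module, ignore_regex=None):
    # For every parameter/buffer, fetch rank 0's copy and require exact equality.
    for name, tensor in list(module.named_parameters()) + list(module.named_buffers()):
        fullname = type(module).__name__ + '.' + name
        if ignore_regex is not None and re.fullmatch(ignore_regex, fullname):
            continue  # e.g. r'.*\.w_avg' is excluded by the training loop
        tensor = tensor.detach()
        other = tensor.clone()
        torch.distributed.broadcast(tensor=other, src=0)  # rank 0's version of the tensor
        assert (torch.nan_to_num(tensor) == torch.nan_to_num(other)).all(), fullname
```
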

The command-line arguments passed to train.py were:
--outdir=./training-runs
--data=/root/autodl-tmp/stylegan/img/NEW20230111allrotate.zip
--gpus=2
--batch=4
--gamma=6.6
--cfg=stylegan2
--kimg=3000
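
For reference, with these options the total batch is split evenly across the GPUs, so each GPU processes 2 images per forward/backward pass, which matches the leading dimension of 2 in the network summary tables above. A small illustration using the numbers from this run (the accumulation-round formula mirrors how the training loop slices the batch; treat it as an illustration rather than a quote of the source):

```python
# Numbers from the training options above (--gpus=2, --batch=4).
num_gpus = 2
batch_size = 4                                               # total images per optimizer step
batch_gpu = batch_size // num_gpus                           # images per GPU per forward pass
accumulation_rounds = batch_size // (batch_gpu * num_gpus)   # micro-batches per optimizer step
print(batch_gpu, accumulation_rounds)                        # -> 2 1
```
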

@saberpoi

Got the same problem.

@Lanxin1011

I've got the same problem. Do you have any solutions? Thank you so much!

@Zyriix

Zyriix commented Apr 2, 2024

Make sure you call misc.print_module_summary for both G and D. This function makes the parameters on every node the same (sketched below).
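
In the stock training_loop.py, the dummy forward pass that misc.print_module_summary performs is guarded by `if rank == 0:`, so it only runs in one process. One reading of this suggestion is to run the same dummy forward on every rank, so that any state created lazily during the first forward pass is initialized identically in all processes. A sketch against the network-summary section of training/training_loop.py, reusing the variables (rank, batch_gpu, device, G, D, misc) that exist there; whether this is what resolves the mismatch in a given setup is an assumption worth verifying:

```python
# training/training_loop.py, network-summary section: run the dummy forward on every rank,
# printing the summary tables only from rank 0 to keep the log readable.
z = torch.empty([batch_gpu, G.z_dim], device=device)   # dummy latent codes
c = torch.empty([batch_gpu, G.c_dim], device=device)   # dummy labels (c_dim == 0 in this run)
if rank == 0:
    img = misc.print_module_summary(G, [z, c])          # prints the tables, returns G's output
    misc.print_module_summary(D, [img, c])
else:
    img = G(z, c)                                        # same dummy forward, without printing
    D(img, c)
```
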
