-
Notifications
You must be signed in to change notification settings - Fork 100
/
sanity_checks2024.sh
50 lines (35 loc) · 12.3 KB
/
sanity_checks2024.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env bash
# Sanity checks for pytorch issue https://github.com/pytorch/pytorch/issues/96693
#
# Sweep over torch.compile inductor modes / determinism / batch-size ramp /
# scheduler settings for a crammed-bert pretraining run on the pre-tokenized
# Pile dataset. Each run differs only in the Hydra overrides noted in its name.
#
# NOTE(review): deliberately no `set -e` — one failed run should not abort the
# rest of the sweep. Confirm this is the intended behavior.
#
# Earlier short runs, kept for reference:
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_base arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_simplecomp arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_gemm arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_gemm=True seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_pw arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_pointwise=True seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_default arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_base_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_simplecomp_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_gemm_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_gemm=True impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_pw_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_pointwise=True impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_default_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233
# all follow the same curve:

# --- 40k-step deterministic runs, no batch-size ramp, triangle2 schedule ---
python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2
python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2
python pretrain.py name=DA6000amp_b8192_cb_o4_premade_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=default seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2
python pretrain.py name=DA6000amp_b8192_cb_o4_premade_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=reduce-overhead seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2

# --- same sweep, non-deterministic, inductor caches force-disabled, pinned to separate GPUs ---
CUDA_VISIBLE_DEVICES=3 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_max_autotune_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233 train.steps=40000 budget=2.4 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2
CUDA_VISIBLE_DEVICES=4 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_max_autotune_no_cudagraphs_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233 train.steps=40000 budget=2.4 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2
CUDA_VISIBLE_DEVICES=5 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=default seed=233 train.steps=40000 budget=2.4 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2
CUDA_VISIBLE_DEVICES=6 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=reduce-overhead seed=233 train.steps=40000 budget=2.4 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2

# --- same sweep, deterministic, with 20k-step batch-size ramp ---
CUDA_VISIBLE_DEVICES=5 python pretrain.py name=DA6000amp_b8192_cb_o4_with_ramp_max_autotune_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=20000 train.scheduler=triangle2
CUDA_VISIBLE_DEVICES=5 python pretrain.py name=DA6000amp_b8192_cb_o4_with_ramp_max_autotune_no_cudagraphs_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=20000 train.scheduler=triangle2
CUDA_VISIBLE_DEVICES=6 python pretrain.py name=DA6000amp_b8192_cb_o4_with_ramp_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=default seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=20000 train.scheduler=triangle2
CUDA_VISIBLE_DEVICES=6 python pretrain.py name=DA6000amp_b8192_cb_o4_with_ramp_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=reduce-overhead seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=20000 train.scheduler=triangle2

# --- budget=100 runs: invoke cache skip + cudagraphs via explicit inductor vars ---
# (+impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True)
CUDA_VISIBLE_DEVICES=3 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_det_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2 +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True
# NOTE(review): renamed from ..._with_nondet_reduce_overhead_40k — that name was already
# used by the budget=2.4 mode=reduce-overhead run above and the two runs would
# clobber each other's outputs. Confirm the new name is acceptable downstream.
CUDA_VISIBLE_DEVICES=4 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_reduce_overhead_cudagraphs_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2 +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True
CUDA_VISIBLE_DEVICES=5 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_const_with_det_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=constant +impl._inductor_vars.autotune_local_cache=False
CUDA_VISIBLE_DEVICES=5 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_const_with_det_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=constant +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True
CUDA_VISIBLE_DEVICES=6 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_const_with_nondet_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=constant +impl._inductor_vars.autotune_local_cache=False
CUDA_VISIBLE_DEVICES=6 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_const_with_nondet_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=constant +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True

# Open questions about resetting compile state between runs:
# torch._dynamo.reset()?
# torch.compiler.reset?