"""Example launcher for a hyperparameter search on SLURM.
This example shows how to use gpus on SLURM with PyTorch.
"""
import torch
from test_tube import Experiment, HyperOptArgumentParser, SlurmCluster
def train(hparams, *args):
    """Train your awesome model.

    :param hparams: The arguments to run the model with.
    """
    # Initialize the experiment and track all the hyperparameters.
    exp = Experiment(
        name=hparams.test_tube_exp_name,
        # Location to save the metrics.
        save_dir=hparams.log_path,
        # Buffer metrics in memory; they are written once by the explicit
        # exp.save() below instead of on every log call.
        autosave=False,
    )
    exp.argparse(hparams)

    # Pretend to train: multiply a (1, x_val) row by an (x_val, 1) column
    # to get a scalar "error" each step.
    x = torch.rand((1, hparams.x_val))
    for _ in range(100):  # loop index was unused; 100 fake training steps
        y = torch.rand((hparams.x_val, 1))
        out = x.mm(y)
        exp.log({'fake_err': out.item()})

    # Persist all buffered metrics (autosave was disabled above).
    exp.save()
if __name__ == '__main__':
    # Build the argument parser; x_val and y_val are sampled by random search.
    parser = HyperOptArgumentParser(strategy='random_search')
    parser.add_argument('--test_tube_exp_name', default='my_test')
    parser.add_argument('--log_path', default='/some/path/to/log')
    parser.opt_list('--y_val', default=12, options=[1, 2, 3, 4, 5, 6],
                    tunable=True)
    parser.opt_list('--x_val', default=12, options=[20, 12, 30, 45],
                    tunable=True)
    hparams = parser.parse_args()

    # Wire the parsed hyperparameters into a SLURM cluster runner.
    cluster = SlurmCluster(
        hyperparam_optimizer=hparams,
        log_path=hparams.log_path,
        python_cmd='python3',
        test_tube_exp_name=hparams.test_tube_exp_name
    )

    # Ask SLURM to email when jobs finish or fail (if the HPC supports it).
    cluster.notify_job_status(
        email='[email protected]', on_done=True, on_fail=True)

    # Environment modules loaded on the cluster before the job runs.
    cluster.load_modules(['python-3', 'anaconda3'])

    # Shell commands for the non-SLURM portion of the generated script.
    cluster.add_command('source activate myCondaEnv')

    # Custom #SBATCH directives can be added like so:
    # cluster.add_slurm_cmd(
    #     cmd='cpus-per-task', value='1', comment='CPUS per task.')

    # Compute requested PER hyperparameter combination:
    # 2 nodes x 4 gpus = 8 gpus per trial.
    cluster.per_experiment_nb_gpus = 4
    cluster.per_experiment_nb_nodes = 2
    cluster.gpu_type = '1080ti'

    # Launch the search: 24 trials, each running `train`, queued under the
    # given SLURM job name.
    cluster.optimize_parallel_cluster_gpu(
        train,
        nb_trials=24,
        job_name='first_tt_job')