This repository has been archived by the owner on May 9, 2024. It is now read-only.
forked from pytorch/vision
-
Notifications
You must be signed in to change notification settings - Fork 21
/
run_torchvision_classification_v2.sh
executable file
·55 lines (43 loc) · 2.16 KB
/
run_torchvision_classification_v2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env bash
# torch.fx based model surgery and training.
#
# Launches ./references/classification/train.py on ImageNet with the model,
# pretrained weights and evaluation sizes selected below.
# All paths are relative to the current working directory.

# PYTHONPATH must start with a : to be able to load local modules
export PYTHONPATH=":${PYTHONPATH:-}"

# Date/time in YYYYMMDD-HHMMSS format; embedded in output_dir so every
# invocation writes to its own directory.
DATE_TIME="$(date +'%Y%m%d-%H%M%S')"

#=========================================================================================
# sample models that can be used
#model=resnet50
#model=mobilenet_v2
#model=resnet18
#model=regnetx200mf
#model=regnetx400mf
#model=regnetx800mf
#model=regnetx1p6gf

# these lite models are created using model surgery from models in torchvision
# these lite models will be available only if the --model-surgery <argument> is set to one of these:
# --model-surgery 1: legacy module based surgery
# --model-surgery 2: advanced model surgery with torch.fx (to be released)
#model=mobilenet_v3_large_lite
#model=mobilenet_v3_small_lite
model=mobilenet_v2_lite

#=========================================================================================
# set the appropriate pretrained weights for the above model
#model_weights="ResNet50_Weights.IMAGENET1K_V1"
#model_weights="MobileNet_V2_Weights.IMAGENET1K_V1"
model_weights="MobileNet_V2_Weights.IMAGENET1K_V2"

output_dir="./data/checkpoints/torchvision/${DATE_TIME}_imagenet_classification_${model}"

val_resize_size=232 #256 #232
val_crop_size=224

#=========================================================================================
# Training script and its arguments, shared by the train and test invocations below.
# Kept as an array (one element per argument) so that expanding it never depends on
# word-splitting or globbing of the values.
command=(
  ./references/classification/train.py
  --data-path=./data/datasets/imagenet
  --epochs=150
  --batch-size=64
  --wd=4e-5
  --lr=0.05
  --lr-scheduler=cosineannealinglr
  --lr-warmup-epochs=5
  "--model=${model}"
  --model-surgery=2
  --opset-version=18
  "--val-resize-size=${val_resize_size}"
  "--val-crop-size=${val_crop_size}"
)

# training: single GPU (--device=cuda:0) or CPU (--device=cpu) run
# python3 "${command[@]}" --weights="${model_weights}" --output-dir="${output_dir}"

# training: multi-gpu distributed data parallel
torchrun --nproc_per_node 4 "${command[@]}" --weights="${model_weights}" --output-dir="${output_dir}"

# testing after the training
# NOTE: output_dir embeds DATE_TIME, which changes on every invocation; when testing
# in a separate run, point output_dir at the directory of the training run first.
# torchrun --nproc_per_node 4 "${command[@]}" --test-only --weights="${output_dir}/checkpoint.pth" --output-dir="${output_dir}/test"