#!/bin/bash
#BSUB -nnodes 129
#BSUB -W 00:20
#BSUB -P STF006
#BSUB -J exatensor
#BSUB -o exa.out.%J
#BSUB -e exa.err.%J
###BSUB -alloc_flags "gpumps smt1"
# Load modules:
module load gcc/8.1.1
module load essl
module load cuda
module load git
module load cmake
# Set full path to the executable:
export EXA_PATH=/ccs/home/div/src/ExaTensor_Summit_gcc8
#ExaTENSOR specific:
export QF_NUM_PROCS=258 #total number of MPI processes
export QF_PROCS_PER_NODE=2 #number of MPI processes per logical node (logical nodes are created by node resource isolation)
export QF_CORES_PER_PROCESS=21 #number of physical CPU cores per MPI process (no less than 1)
export QF_MEM_PER_PROCESS=130000 #host RAM limit per MPI process in MB
export QF_NVMEM_PER_PROCESS=0 #non-volatile memory limit per MPI process in MB
export QF_HOST_BUFFER_SIZE=180000 #host buffer size per MPI process in MB (must be less than QF_MEM_PER_PROCESS)
export QF_GPUS_PER_PROCESS=3 #number of discrete NVIDIA GPUs per MPI process (optional)
export QF_MICS_PER_PROCESS=0 #number of discrete Intel Xeon Phis per MPI process (optional)
export QF_AMDS_PER_PROCESS=0 #number of discrete AMD GPUs per MPI process (optional)
export QF_NUM_THREADS=21 #initial number of CPU threads per MPI process (kept equal to QF_CORES_PER_PROCESS here)
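#Optional sanity checks (a sketch; warnings only, so the job still runs if a check trips):
if [ $((QF_PROCS_PER_NODE * 129)) -ne "$QF_NUM_PROCS" ]; then #129 = node count requested via "#BSUB -nnodes" above
  echo "WARNING: QF_NUM_PROCS ($QF_NUM_PROCS) does not equal QF_PROCS_PER_NODE x 129 nodes" >&2
fi
if [ "$QF_HOST_BUFFER_SIZE" -ge "$QF_MEM_PER_PROCESS" ]; then
  echo "WARNING: QF_HOST_BUFFER_SIZE ($QF_HOST_BUFFER_SIZE MB) is not less than QF_MEM_PER_PROCESS ($QF_MEM_PER_PROCESS MB)" >&2
fi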
#OpenMP generic:
export OMP_NUM_THREADS=$QF_NUM_THREADS #initial number of OpenMP threads per MPI process
export OMP_DYNAMIC=false #no OpenMP dynamic threading
export OMP_NESTED=true #OpenMP nested parallelism is mandatory
export OMP_MAX_ACTIVE_LEVELS=3 #max number of OpenMP nesting levels (at least 3)
export OMP_THREAD_LIMIT=256 #max total number of OpenMP threads per process
export OMP_WAIT_POLICY=PASSIVE #idle thread behavior
#export OMP_STACKSIZE=200M #stack size per thread
#export OMP_DISPLAY_ENV=VERBOSE #display OpenMP environment variables
#export GOMP_DEBUG=1 #GNU OpenMP debugging
#export LOMP_DEBUG=1 #IBM XL OpenMP debugging
#OpenMP thread binding:
export OMP_PLACES_DEFAULT=cores #default thread binding to CPU logical cores
export OMP_PLACES_EOS="{1},{3},{5},{7,9},{0:16:2},{11},{13},{15}" #Eos 16-core hyperthreaded Intel Xeon thread binding (even logical cores do computing)
export OMP_PLACES_TITAN="{1},{3},{5},{7,9},{0:8:2},{11},{13},{15}" #Titan 16-core 8-FPU AMD thread binding (even logical cores do computing)
#export OMP_PLACES_POWER9="{0:4},{4:4},{8:4},{12:4},{28:56},{16:4},{20:4},{24:4}" #Summit 21-core SMT4 Power9 socket thread binding (even logical cores do computing)
export OMP_PLACES_POWER9="{0},{4},{8},{12},{28:56},{16},{20},{24}" #Summit 21-core SMT4 Power9 socket thread binding (even logical cores do computing)
export OMP_PLACES_KNL="{1},{3},{5},{7,9},{0:128:2},{11},{13},{15}" #Percival 64-core SMT4 KNL thread binding (even logical cores do computing)
#export OMP_PLACES=$OMP_PLACES_POWER9
export OMP_PROC_BIND="close,spread,spread" #nest1: Functional threads (DSVU)
#nest2: TAVP-WRK:Dispatcher spawns coarse-grain Executors
#nest3: TAVP-WRK:Dispatcher:Executor spawns execution threads in CP-TAL kernels
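#A sketch (kept commented out, like the OMP_PLACES selection above) for picking the thread
#binding per machine; the hostname prefixes below are assumptions, adjust to the actual system names:
#case "$(hostname)" in
#  summit*)   export OMP_PLACES=$OMP_PLACES_POWER9 ;;
#  eos*)      export OMP_PLACES=$OMP_PLACES_EOS ;;
#  titan*)    export OMP_PLACES=$OMP_PLACES_TITAN ;;
#  percival*) export OMP_PLACES=$OMP_PLACES_KNL ;;
#  *)         export OMP_PLACES=$OMP_PLACES_DEFAULT ;;
#esac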
#MKL specific:
export MKL_NUM_THREADS_DEFAULT=1 #keep consistent with chosen OMP_PLACES!
export MKL_NUM_THREADS_EOS=16 #keep consistent with chosen OMP_PLACES!
export MKL_NUM_THREADS_TITAN=8 #keep consistent with chosen OMP_PLACES!
export MKL_NUM_THREADS_POWER9=56 #keep consistent with chosen OMP_PLACES!
export MKL_NUM_THREADS_KNL=128 #keep consistent with chosen OMP_PLACES!
export MKL_NUM_THREADS=$MKL_NUM_THREADS_DEFAULT #number of Intel MKL threads per process
export MKL_DYNAMIC=false
#Intel MIC specific:
#export KMP_AFFINITY="verbose,granularity=core,compact" #Intel CPU thread affinity
#export MIC_PREFIX=MIC #mandatory when using MIC
#export MIC_ENV_PREFIX=MIC #mandatory when using MIC
#export MIC_OMP_PREFIX=MIC #mandatory when using MIC
#export MIC_OMP_NUM_THREADS=256 #mandatory when using MIC
#export MIC_MKL_NUM_THREADS=$MIC_OMP_NUM_THREADS #mandatory when using MIC (Intel MIC MKL)
#export MIC_KMP_PLACE_THREADS="64c,4t" #Intel MIC thread placement
#export MIC_KMP_AFFINITY="verbose,granularity=fine,compact" #Intel MIC thread affinity
#export MIC_USE_2MB_BUFFERS=64K #Intel MIC only
#export MKL_MIC_ENABLE=0 #Intel MIC MKL auto-offloading
#export OFFLOAD_REPORT=2
#Cray/MPICH specific:
#export CRAY_OMP_CHECK_AFFINITY=TRUE #CRAY: Show thread placement
export MPICH_MAX_THREAD_SAFETY=multiple #CRAY: Required for MPI asynchronous progress
export MPICH_NEMESIS_ASYNC_PROGRESS="MC" #CRAY: Activate MPI asynchronous progress thread {"SC","MC"}
#export MPICH_RMA_OVER_DMAPP=1 #CRAY: DMAPP backend for CRAY-MPICH
#export MPICH_GNI_ASYNC_PROGRESS_TIMEOUT=0 #CRAY:
#export MPICH_GNI_MALLOC_FALLBACK=enabled #CRAY:
#export MPICH_ALLOC_MEM_HUGE_PAGES=1 #CRAY: Huge pages
#export MPICH_ALLOC_MEM_HUGEPG_SZ=2M #CRAY: Huge page size
#export _DMAPPI_NDREG_ENTRIES=16384 #CRAY: Max number of entries in UDREG memory registration cache
#export MPICH_ENV_DISPLAY=1
#export MPICH_GNI_MEM_DEBUG_FNAME=MPICH.memdebug
#export MPICH_RANK_REORDER_DISPLAY=1
#Summit specific:
export PAMI_IBV_ADAPTER_AFFINITY=1
export PAMI_IBV_DEVICE_NAME="mlx5_0:1,mlx5_3:1"
export PAMI_IBV_DEVICE_NAME_1="mlx5_3:1,mlx5_0:1"
export PAMI_IBV_ENABLE_OOO_AR=1 #adaptive routing is default
export PAMI_ENABLE_STRIPING=1 #increases network bandwidth, but also increases latency
export PAMI_IBV_DISABLE_ODP=0 #ODP (requires CAPI for performance)
#export PAMI_IBV_ENABLE_TAG_MATCHING=1 #hardware tag matching
export PAMI_IBV_ENABLE_DCT=1 #reduces MPI_Init() time at large scale
#unset PAMI_IBV_ENABLE_DCT
#export PAMI_IBV_QP_SERVICE_LEVEL=8
#export PAMI_PMIX_DATACACHE=1
#export PAMI_IBV_DEBUG_CQE=1 #CQE error debugging
#export PAMI_IBV_DEBUG_QP_TIMEOUT=22
#export PAMI_IBV_DEBUG_RNR_RETRY=9
#export OMPI_LD_PRELOAD_POSTPEND=$OLCF_SPECTRUM_MPI_ROOT/lib/libmpitrace.so
ulimit -s unlimited
rm -f core.* *.tmp *.log *.out *.x #clean up leftovers from previous runs (-f suppresses errors when nothing matches)
cp $EXA_PATH/Qforce.x ./
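#Optional guard (a sketch): abort early if the executable is missing so the jsrun launch
#below does not fail after the allocation has already started:
if [ ! -x ./Qforce.x ]; then
  echo "ERROR: Qforce.x not found or not executable under $EXA_PATH" >&2
  exit 1
fi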
#One MPI process per node (on only one socket):
#jsrun --smpiargs='-mca common_pami_use_odp 1' -D PAMI_IBV_DISABLE_ODP=0 -n $QF_NUM_PROCS -r $QF_PROCS_PER_NODE -a 1 -c $QF_CORES_PER_PROCESS -g $QF_GPUS_PER_PROCESS -bnone ./Qforce.x
#Two MPI processes per node (one MPI process per socket):
#jsrun --smpiargs='-mca common_pami_use_odp 1' -D PAMI_IBV_DISABLE_ODP=0 -n $QF_NUM_PROCS -r $QF_PROCS_PER_NODE -a 1 -c $QF_CORES_PER_PROCESS -g $QF_GPUS_PER_PROCESS -bnone ./exec_exa.sh
#jsrun --smpiargs='-mca pml_pami_use_2sided 1' -E PAMI_IBV_ENABLE_TAG_MATCHING=1 --smpiargs='-mca common_pami_use_odp 1' -D PAMI_IBV_DISABLE_ODP=0 -n $QF_NUM_PROCS -r $QF_PROCS_PER_NODE -a 1 -c $QF_CORES_PER_PROCESS -g $QF_GPUS_PER_PROCESS -bnone ./exec_exa.sh
jsrun --smpiargs='-async' --smpiargs='-mca common_pami_use_odp 1' -D PAMI_IBV_DISABLE_ODP=0 -n $QF_NUM_PROCS -r $QF_PROCS_PER_NODE -a 1 -c $QF_CORES_PER_PROCESS -g $QF_GPUS_PER_PROCESS -bnone ./exec_exa.sh
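#Typical submission on Summit (LSF parses the embedded #BSUB directives):
# bsub submit_summit.pbs     #or: bsub < submit_summit.pbs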