forked from tensorflow/models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.sh
executable file
·101 lines (83 loc) · 2.56 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/bin/bash
#
# Prepares the MovieLens ml-20m dataset and then runs the NCF training script
# five times in a row, writing one log per run plus a summary log into a
# timestamped directory.
#
# Usage:
#   run.sh [keras]
#
# Arguments:
#   keras   Optional. Run ncf_keras_main.py instead of ncf_estimator_main.py.
#
# Environment:
#   BUCKET  Optional. Root location for the dataset and model directories
#           (presumably a GCS bucket path -- confirm). Defaults to /tmp.
#           When set, log files are still written under /tmp/MLPerf_NCF.
#   TPU     Optional. Forwarded to the training script as --tpu. When empty
#           the training script is started with --num_gpus -1 instead.
set -e

# Prime sudo credentials up front when not already running as root. The
# message says root is needed to clear caches; nothing in this script clears
# caches itself, so presumably the Python entry point does -- TODO confirm.
if [[ "$(id -u)" != "0" ]]; then
  echo "Calling sudo to gain root for this shell. (Needed to clear caches.)"
  sudo echo "Success"
fi
# Directory holding this script; the Python entry points are resolved
# relative to it.
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")
# NOTE: this replaces (does not extend) any PYTHONPATH inherited from the
# caller.
export PYTHONPATH="${SCRIPT_DIR}/../../"

# Default entry point and dataset name.
MAIN_SCRIPT="ncf_estimator_main.py"
DATASET="ml-20m"

# ROOT_DIR holds the dataset and model directories. It lives under BUCKET when
# the caller provides one and under /tmp otherwise.
BUCKET=${BUCKET:-""}
ROOT_DIR="${BUCKET:-/tmp}/MLPerf_NCF"
echo "Root directory: ${ROOT_DIR}"

# LOCAL_ROOT is where log files go. Without a bucket it is simply ROOT_DIR;
# with one, logs stay on the local disk under /tmp.
if [[ -z "${BUCKET}" ]]; then
  LOCAL_ROOT="${ROOT_DIR}"
else
  LOCAL_ROOT="/tmp/MLPerf_NCF"
  mkdir -p -- "${LOCAL_ROOT}"
  echo "Local root (for files which cannot use GCS): ${LOCAL_ROOT}"
fi

# Every invocation gets its own timestamped directory so that repeated
# executions do not overwrite each other's output.
DATE=$(date '+%Y-%m-%d_%H:%M:%S')
TEST_DIR="${ROOT_DIR}/${DATE}"
LOCAL_TEST_DIR="${LOCAL_ROOT}/${DATE}"
mkdir -p -- "${LOCAL_TEST_DIR}"
# Choose the accelerator flags handed to the training script. DEVICE_FLAG is
# a plain string holding several words; it is expanded unquoted where the
# training script is invoked so that each word becomes its own argument.
TPU=${TPU:-""}
if [[ -z "${TPU}" ]]; then
  DEVICE_FLAG="--num_gpus -1"  # Disabled extra flag: --use_xla_for_gpu
else
  DEVICE_FLAG="--tpu ${TPU} --num_gpus 0"
fi

# Prepare the dataset under ROOT_DIR before any training run starts.
DATA_DIR="${ROOT_DIR}/movielens_data"
python "${SCRIPT_DIR}/movielens.py" --data_dir "${DATA_DIR}" --dataset "${DATASET}"

# An optional first argument of "keras" switches to the Keras entry point.
# ${1:-} keeps the comparison well-formed when no argument is given.
if [[ "${1:-}" == "keras" ]]; then
  MAIN_SCRIPT="ncf_keras_main.py"
  BATCH_SIZE=99000
  # NOTE(review): this replaces the TPU/GPU selection made above, so a TPU
  # value supplied by the caller is ignored in keras mode -- confirm intended.
  DEVICE_FLAG="--num_gpus 1"
else
  BATCH_SIZE=98340
fi
# Run the training script five times (runs 0 through 4). Everything printed
# inside the braces, stderr included, is mirrored into summary.log. Because
# the group is one side of a pipeline it executes in a subshell, so variables
# assigned here are not visible after the closing brace.
{
  for i in {0..4}; do
    START_TIME=$(date +%s)
    MODEL_DIR="${TEST_DIR}/model_dir_${i}"
    RUN_LOG="${LOCAL_TEST_DIR}/run_${i}.log"
    # Exported so that the Python process can read them from its environment.
    export COMPLIANCE_FILE="${LOCAL_TEST_DIR}/run_${i}_compliance_raw.log"
    export STITCHED_COMPLIANCE_FILE="${LOCAL_TEST_DIR}/run_${i}_compliance_submission.log"

    echo ""
    echo "Beginning run ${i}"
    echo " Complete output logs are in ${RUN_LOG}"
    echo " Compliance logs: (submission log is created after run.)"
    echo " ${COMPLIANCE_FILE}"
    echo " ${STITCHED_COMPLIANCE_FILE}"

    # To reduce variation set the seed flag:
    # --seed ${i}

    # The full output goes to RUN_LOG; only the lines matched by grep reach
    # the console and summary.log. errexit is suspended around the pipeline
    # because its status is grep's, and grep exits 1 when nothing matched,
    # which would otherwise silently abort every remaining run.
    # DEVICE_FLAG is deliberately left unquoted: it carries several words.
    set +e
    # shellcheck disable=SC2086
    python -u "${SCRIPT_DIR}/${MAIN_SCRIPT}" \
      --model_dir "${MODEL_DIR}" \
      --data_dir "${DATA_DIR}" \
      --dataset "${DATASET}" --hooks "" \
      ${DEVICE_FLAG} \
      --clean \
      --train_epochs 14 \
      --batch_size "${BATCH_SIZE}" \
      --eval_batch_size 160000 \
      --learning_rate 0.00382059 \
      --beta1 0.783529 \
      --beta2 0.909003 \
      --epsilon 1.45439e-07 \
      --layers 256,256,128,64 --num_factors 64 \
      --hr_threshold 0.635 \
      --ml_perf \
      |& tee "${RUN_LOG}" \
      | grep --line-buffered -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+, Loss = [0-9\.]+)|(pipeline_hash)|(MLPerf time:)"
    # Element 0 is the exit status of the python process itself.
    PIPELINE_STATUS=("${PIPESTATUS[@]}")
    set -e

    END_TIME=$(date +%s)
    # Surface a failed run instead of letting it pass unnoticed; the loop
    # still continues with the next run.
    if (( PIPELINE_STATUS[0] != 0 )); then
      echo "WARNING: run ${i} exited with status ${PIPELINE_STATUS[0]}; see ${RUN_LOG}" >&2
    fi
    echo "Run ${i} complete: $(( END_TIME - START_TIME )) seconds."

    # Don't fill up the local hard drive.
    if [[ -z "${BUCKET}" ]]; then
      echo "Removing model directory to save space."
      # -f: a run that failed before creating the directory must not abort
      # the loop. :? refuses to expand an empty path.
      rm -rf -- "${MODEL_DIR:?}"
    fi
  done
} |& tee "${LOCAL_TEST_DIR}/summary.log"