forked from xxisxuxuqq/byconity-tpcds
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_data.sh
executable file
·65 lines (54 loc) · 1.98 KB
/
gen_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/bin/bash
#
# Copyright (2022) Bytedance Ltd. and/or its affiliates
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Abort on the first unhandled command failure.
set -e

# Pull in project configuration and shared helpers; both paths are relative
# to the current working directory.
. ./config.sh
. ./helper.sh

# First positional argument: requested data size.
SIZE=$1

# When PARALLEL is not already set (or is empty), default it to half the
# number of "processor" entries in /proc/cpuinfo, with a floor of 2.
if [[ -z "${PARALLEL}" ]]; then
  PARALLEL=$(( $(grep -c ^processor /proc/cpuinfo) / 2 ))
  (( PARALLEL >= 2 )) || PARALLEL=2
fi
log "set PARALLEL ${PARALLEL}"
#######################################
# Print the command-line synopsis of this script.
# Arguments: none
# Outputs:   two lines on stdout, each mentioning the script name ($0)
#######################################
usage() {
  printf 'usage: %s <size(gb)> <flag>\n' "$0"
  printf ' if you want to replace empty cell, use %s <size> 1\n' "$0"
}
#######################################
# Generate the raw TPC-DS data files by running dsdgen.
#
# Globals (read):
#   SCRIPTPATH, SIZE  - output goes to ${SCRIPTPATH}/data_tpcds_${SIZE}
#   DATASIZE          - value passed to dsdgen -SCALE
#   SEED              - value passed to dsdgen -RNGSEED
#   TOOLS_PATH        - directory holding the dsdgen binary and tpcds.idx
#   OUTPUT_LOG        - file that all generator output is appended to
#   PARALLEL          - number of generator processes (1 if unset or empty)
# Arguments: none
# Outputs:   generator stdout and stderr are echoed and appended to
#            OUTPUT_LOG through tee.
# Returns:   0 on success; mkdir's status if the output directory cannot be
#            created; otherwise the first non-zero exit status among the
#            pipeline stages (seq / xargs+dsdgen / tee).
#######################################
gen_data() {
  local csv_dir="${SCRIPTPATH}/data_tpcds_${SIZE}"
  local PARALLEL="${PARALLEL:-1}"
  local -a stage_status=()
  local rc

  # -p makes this a no-op when the directory already exists, while a real
  # failure (permissions, missing parent, ...) is reported instead of ignored.
  mkdir -p -- "${csv_dir}" || return

  log "Generating ${DATASIZE}G TPCDS data in ${csv_dir}, with PARALLEL ${PARALLEL}, RNGSEED ${SEED}..."

  if [ "${PARALLEL}" == 1 ]; then
    # Single process: dsdgen is run directly, without -PARALLEL.
    # NOTE(review): "-CHILD __" is passed literally here (there is no xargs
    # placeholder substitution in this branch); kept unchanged — confirm that
    # dsdgen tolerates or ignores it.
    "${TOOLS_PATH}/dsdgen" -SCALE "${DATASIZE}" -DELIMITER \| -CHILD __ -TERMINATE N \
      -RNGSEED "${SEED}" -DISTRIBUTIONS "${TOOLS_PATH}/tpcds.idx" -DIR "${csv_dir}" 2>&1 \
      | tee -a "${OUTPUT_LOG}"
    stage_status=("${PIPESTATUS[@]}")
  else
    # One dsdgen per child index 1..PARALLEL, all running concurrently;
    # xargs replaces the "__" placeholder with the child index.
    seq 1 "${PARALLEL}" \
      | xargs -t -P"${PARALLEL}" -I__ \
        "${TOOLS_PATH}/dsdgen" -SCALE "${DATASIZE}" -DELIMITER \| -PARALLEL "${PARALLEL}" -CHILD __ -TERMINATE N \
        -RNGSEED "${SEED}" -DISTRIBUTIONS "${TOOLS_PATH}/tpcds.idx" -DIR "${csv_dir}" 2>&1 \
      | tee -a "${OUTPUT_LOG}"
    stage_status=("${PIPESTATUS[@]}")
  fi

  # Without pipefail the pipeline's own status is tee's, which would hide a
  # failed generator run; inspect every stage explicitly.
  for rc in "${stage_status[@]}"; do
    (( rc == 0 )) || return "${rc}"
  done
  return 0
}
# ---- main flow ----

# A size argument is mandatory: print the synopsis and stop without it.
if [ -z "${SIZE}" ]; then
  usage
  exit 1
fi

# Scale handed to dsdgen; falls back to 1 when $1 is empty.
DATASIZE=${1:-1}

# Scale 1 uses its own RNG seed; every other scale uses the second value.
if [ "${DATASIZE}" -eq 1 ]; then
  SEED=42
else
  SEED=19620718
fi

gen_data

log "Splitting generated data..."
# NOTE(review): gen_data writes to ${SCRIPTPATH}/data_tpcds_${SIZE} while the
# paths below are relative to the current directory — presumably the script is
# always run from SCRIPTPATH; confirm.
orig_dir="data_tpcds_${SIZE}_orig"
mv -- "data_tpcds_${SIZE}" "${orig_dir}"
cd -- "${orig_dir}"
../split.sh "../data_tpcds_${SIZE}" | tee -a "${OUTPUT_LOG}"
# The pipeline's status is tee's; read split.sh's own status so that the
# un-split originals are never deleted after a failed split.
split_status=${PIPESTATUS[0]}
cd ..
if (( split_status != 0 )); then
  log "split.sh failed with status ${split_status}; keeping ${orig_dir}"
  exit "${split_status}"
fi
rm -r -- "${orig_dir}"