#!/bin/bash
#
# Copyright (2022) Bytedance Ltd. and/or its affiliates
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
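# populate_data.sh: populate a TPC-DS database on a ByConity (CNCH) cluster:
# create the tables, import the generated .dat files in parallel, and verify
# the row counts. Example invocation (scale and path are illustrative):
#   ./populate_data.sh 100 ./data_tpcds_100
#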
source ./config.sh
source ./helper.sh
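# $1: TPC-DS scale factor (defaults to 1); $2: directory containing the
# generated .dat files (defaults to data_${SUITE}_${DATASIZE} under SCRIPTPATH).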
[ -z "$1" ] && DATASIZE=1 || DATASIZE=$1
[ -z "$2" ] && CSVPATH="$SCRIPTPATH/data_${SUITE}_${DATASIZE}" || CSVPATH=$2
DATABASE=${DATABASE:-${DB_PREFIX}${SUITE}${DATASIZE}}
EXT=dat
DELIM="|"
if [ ! -d "$CSVPATH" ]; then
    log "$CSVPATH directory not found."
    exit 1
fi
set -e
log "Create tables for ${DATABASE}..."
CNCH_DDL=$(sed "s|${SUITE}|${DATABASE}|g" $SCRIPTPATH/ddl/tpcds.sql)
clickhouse_client "${CNCH_DDL}"
log "Import dataset from ${CSVPATH}..."
TABLES=$(show_tables "$DATABASE")
log "Tables created: $TABLES"
FILE_TABLES=()
FILE_NAMES=()
for TABLE in $TABLES; do
    FILE=${CSVPATH}/${TABLE}.${EXT}
    if [ -f "${FILE}" ]; then
        FILE_TABLES+=("${TABLE}")
        FILE_NAMES+=("${FILE}")
    else
        for f in $(find "${CSVPATH}/" -regex "${CSVPATH}/${TABLE}_[0-9_]+\.${EXT}"); do
            FILE_TABLES+=("${TABLE}")
            FILE_NAMES+=("${f}")
        done
    fi
done
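# Client flags for the import: fill omitted columns with their defaults, treat
# '|' as the CSV delimiter, and enable parallel parsing.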
ARGS=" -d $DATABASE --input_format_defaults_for_omitted_fields=1 --format_csv_delimiter='$DELIM' --input_format_parallel_parsing=1"
CMDS=()
for i in "${!FILE_NAMES[@]}"; do
    SQL="INSERT INTO ${FILE_TABLES[$i]} FORMAT CSV"
    CMD=$(clickhouse_client_cmd "$SQL" "$ARGS < ${FILE_NAMES[$i]}")
    if [ -n "$ENABLE_TRACE" ]; then
        trace "$CMD"
    fi
    CMDS+=("${CMD} && echo uploaded ${FILE_NAMES[$i]} || exit 1")
done
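# Default parallelism: half of the logical CPUs reported by /proc/cpuinfo, but
# at least 2.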
if [ -z "$PARALLEL" ]; then
PARALLEL=$(($(grep -c ^processor /proc/cpuinfo)/2))
if (( PARALLEL < 2 )); then
PARALLEL=2
fi
fi
log "set PARALLEL ${PARALLEL}"
SECONDS=0
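# Run the import commands concurrently; NUL-terminate each command so xargs
# hands every full command line to sh intact.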
printf "%s\n" "${CMDS[@]}" | tr '\n' '\0' | xargs -0 -P${PARALLEL} -n 1 -I {} sh -c "{}"
log "Used ${SECONDS}s to import ${DATABASE}."
# check row counts
select_count() {
    clickhouse_client "SELECT COUNT(*) FROM $1"
}
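# Expected row counts for the supported scale factors; each entry is
# <table>-<rows>.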
case $DATASIZE in
    1)
        ROW_CNT=(call_center-6 catalog_page-11718 catalog_returns-143974 catalog_sales-1440839 customer-100000 customer_address-50000 customer_demographics-1920800 date_dim-73049 household_demographics-7200 income_band-20 inventory-11745000 item-18000 promotion-300 reason-35 ship_mode-20 store-12 store_returns-287777 store_sales-2880029 time_dim-86400 warehouse-5 web_page-60 web_returns-71937 web_sales-720791 web_site-30)
        ;;
    100)
        ROW_CNT=(call_center-30 catalog_page-20400 catalog_returns-14404374 catalog_sales-143997065 customer-2000000 customer_address-1000000 customer_demographics-1920800 date_dim-73049 household_demographics-7200 income_band-20 inventory-399330000 item-204000 promotion-1000 reason-55 ship_mode-20 store-402 store_returns-28795080 store_sales-287997024 time_dim-86400 warehouse-15 web_page-2040 web_returns-7197670 web_sales-72001237 web_site-24)
        ;;
    1000)
        ROW_CNT=(call_center-42 catalog_page-30000 catalog_returns-143996756 catalog_sales-1439980416 customer-12000000 customer_address-6000000 customer_demographics-1920800 date_dim-73049 household_demographics-7200 income_band-20 inventory-783000000 item-300000 promotion-1500 reason-65 ship_mode-20 store-1002 store_returns-287999764 store_sales-2879987999 time_dim-86400 warehouse-20 web_page-3000 web_returns-71997522 web_sales-720000376 web_site-54)
        ;;
esac
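# Compare the imported row count of each table against the expected value.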
FAILURE=0
for CNT in "${ROW_CNT[@]}"; do
    IFS=- read -r TABLE ROW <<< "${CNT}" && unset IFS
    IMPORTED_COUNT=$(clickhouse_client "SELECT COUNT(*) FROM ${TABLE}" -d "${DATABASE}")
    if [[ "${IMPORTED_COUNT}" != "${ROW}" ]]; then
        # As long as one table fails, we trigger FAILURE.
        log "${TABLE} should have ${ROW} rows but has ${IMPORTED_COUNT}."
        FAILURE=1
    else
        log "${TABLE} count matches expectation."
    fi
done
if [[ "${FAILURE}" == "1" ]]; then
    log "Imported rows do not match the required amount. Terminating."
    exit 1
fi
log "All imported tables match the expected row counts."