Merge remote-tracking branch 'apache/main' into alamb/backout_bigdecimal
alamb committed Jul 12, 2023
2 parents f5620f9 + d6985b0 commit a9ac070
Showing 20 changed files with 1,211 additions and 931 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev_pr.yml
@@ -46,7 +46,7 @@ jobs:
github.event_name == 'pull_request_target' &&
(github.event.action == 'opened' ||
github.event.action == 'synchronize')
- uses: actions/labeler@v4.2.0
+ uses: actions/labeler@v4.3.0
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
configuration-path: .github/workflows/dev_pr/labeler.yml
93 changes: 69 additions & 24 deletions benchmarks/bench.sh
@@ -66,6 +66,8 @@
compare: Compares results from benchmark runs
all(default): Data/Run/Compare for all benchmarks
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
+ tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table
+ tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
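The two new suites are invoked by name like the existing ones. A usage sketch, assuming the script's data/run subcommands that feed the case statements below (invocations are illustrative):

./bench.sh data tpch10       # generate SF10 data under $DATA_DIR/tpch_sf10
./bench.sh run tpch10        # single parquet file per table, read from disk
./bench.sh run tpch_mem10    # same data, queried from memory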
@@ -124,14 +126,22 @@ main() {
echo "***************************"
case "$BENCHMARK" in
all)
- data_tpch
+ data_tpch "1"
+ data_tpch "10"
;;
tpch)
- data_tpch
+ data_tpch "1"
;;
tpch_mem)
- # same data for tpch_mem
- data_tpch
+ # same data as for tpch
+ data_tpch "1"
+ ;;
+ tpch10)
+ data_tpch "10"
+ ;;
+ tpch_mem10)
+ # same data as for tpch10
+ data_tpch "10"
;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
@@ -162,16 +172,24 @@ main() {
mkdir -p "${RESULTS_DIR}"
case "$BENCHMARK" in
all)
- run_tpch
- run_tpch_mem
+ run_tpch "1"
+ run_tpch_mem "1"
+ run_tpch "10"
+ run_tpch_mem "10"
run_parquet
run_sort
;;
tpch)
- run_tpch
+ run_tpch "1"
;;
tpch_mem)
- run_tpch_mem
+ run_tpch_mem "1"
;;
+ tpch10)
+ run_tpch "10"
+ ;;
+ tpch_mem10)
+ run_tpch_mem "10"
+ ;;
parquet)
run_parquet
@@ -201,76 +219,103 @@ main() {



- # Creates TPCH data if it doesn't already exist
+ # Creates TPCH data at a certain scale factor, if it doesn't already
+ # exist
+ #
+ # call like: data_tpch($scale_factor)
+ #
+ # Creates data in $DATA_DIR/tpch_sf1 for scale factor 1
+ # Creates data in $DATA_DIR/tpch_sf10 for scale factor 10
+ # etc
data_tpch() {
echo "Creating tpch dataset..."
SCALE_FACTOR=$1
if [ -z "$SCALE_FACTOR" ] ; then
echo "Internal error: Scale factor not specified"
exit 1
fi

TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."

# Ensure the target data directory exists
mkdir -p "${DATA_DIR}"
mkdir -p "${TPCH_DIR}"

# Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
- SCALE_FACTOR=1
- FILE="${DATA_DIR}/supplier.tbl"
+ FILE="${TPCH_DIR}/supplier.tbl"
if test -f "${FILE}"; then
echo " tbl files exist ($FILE exists)."
else
echo " creating tbl files with tpch_dbgen..."
docker run -v "${DATA_DIR}":/data -it --rm ghcr.io/databloom-ai/tpch-docker:main -vf -s ${SCALE_FACTOR}
docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/databloom-ai/tpch-docker:main -vf -s ${SCALE_FACTOR}
fi

# Copy expected answers into the ./data/answers directory if it does not already exist
FILE="${DATA_DIR}/answers/q1.out"
FILE="${TPCH_DIR}/answers/q1.out"
if test -f "${FILE}"; then
echo " Expected answers exist (${FILE} exists)."
else
echo " Copying answers to ${DATA_DIR}/answers"
mkdir -p "${DATA_DIR}/answers"
docker run -v "${DATA_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/databloom-ai/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
echo " Copying answers to ${TPCH_DIR}/answers"
mkdir -p "${TPCH_DIR}/answers"
docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/databloom-ai/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
fi

# Create 'parquet' files from tbl
FILE="${DATA_DIR}/supplier"
FILE="${TPCH_DIR}/supplier"
if test -d "${FILE}"; then
echo " parquet files exist ($FILE exists)."
else
echo " creating parquet files using benchmark binary ..."
pushd "${SCRIPT_DIR}" > /dev/null
$CARGO_COMMAND --bin tpch -- convert --input "${DATA_DIR}" --output "${DATA_DIR}" --format parquet
$CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
popd > /dev/null
fi
}
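For orientation, a sketch of what data_tpch "10" leaves behind, assuming DATA_DIR resolves to ./data; the three paths mirror the existence checks above, and the listing is not exhaustive:

data/tpch_sf10/supplier.tbl      # 'tbl' (CSV) data written by tpch_dbgen
data/tpch_sf10/answers/q1.out    # expected answers copied from the docker image
data/tpch_sf10/supplier/         # parquet produced by the convert step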

# Runs the tpch benchmark
run_tpch() {
+ SCALE_FACTOR=$1
+ if [ -z "$SCALE_FACTOR" ] ; then
+ echo "Internal error: Scale factor not specified"
+ exit 1
+ fi
+ TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
+
RESULTS_FILE="${RESULTS_DIR}/tpch.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch benchmark..."
- $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" --format parquet -o ${RESULTS_FILE}
+ $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 10 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE}
}

# Runs the tpch in memory
run_tpch_mem() {
+ SCALE_FACTOR=$1
+ if [ -z "$SCALE_FACTOR" ] ; then
+ echo "Internal error: Scale factor not specified"
+ exit 1
+ fi
+ TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
+
RESULTS_FILE="${RESULTS_DIR}/tpch_mem.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch_mem benchmark..."
# -m means in memory
- $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" -m --format parquet -o ${RESULTS_FILE}
+ $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 10 --path "${TPCH_DIR}" -m --format parquet -o ${RESULTS_FILE}
}

# Runs the parquet filter benchmark
run_parquet() {
RESULTS_FILE="${RESULTS_DIR}/parquet.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running parquet filter benchmark..."
- $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
+ $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE}
}

# Runs the sort benchmark
run_sort() {
RESULTS_FILE="${RESULTS_DIR}/sort.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort benchmark..."
- $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
+ $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE}
}
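Each run function writes a single JSON results file. After an 'all' pass, a quick sanity check could look like this (a sketch; the RESULTS_DIR location is whatever the mkdir -p "${RESULTS_DIR}" above created):

ls "${RESULTS_DIR}"    # expect tpch.json, tpch_mem.json, parquet.json, sort.json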

compare_benchmarks() {
7 changes: 4 additions & 3 deletions datafusion/common/src/utils.rs
@@ -177,9 +177,10 @@ where
Ok(low)
}

- /// This function finds the partition points according to `partition_columns`.
- /// If there are no sort columns, then the result will be a single element
- /// vector containing one partition range spanning all data.
+ /// Given a list of 0 or more already sorted columns, finds the
+ /// partition ranges that would partition equally across columns.
+ ///
+ /// See [`lexicographical_partition_ranges`] for more details.
pub fn evaluate_partition_ranges(
num_rows: usize,
partition_columns: &[SortColumn],
6 changes: 6 additions & 0 deletions datafusion/core/src/datasource/listing/mod.rs
@@ -101,6 +101,12 @@ impl PartitionedFile {
extensions: None,
}
}

+ /// Return a file reference from the given path
+ pub fn from_path(path: String) -> Result<Self> {
+ let size = std::fs::metadata(path.clone())?.len();
+ Ok(Self::new(path, size))
+ }
}

impl From<ObjectMeta> for PartitionedFile {