Skip to content

Commit

Permalink
Merge remote-tracking branch 'apache/main' into alamb/unnest_api_try3
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb committed Aug 8, 2023
2 parents 8aaf69d + 627abd7 commit 132a4e7
Show file tree
Hide file tree
Showing 180 changed files with 8,254 additions and 3,099 deletions.
4 changes: 3 additions & 1 deletion .asf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ github:
features:
issues: true
protected_branches:
main: { }
main:
required_pull_request_reviews:
required_approving_review_count: 1

# publishes the content of the `asf-site` branch to
# https://arrow.apache.org/datafusion/
Expand Down
10 changes: 5 additions & 5 deletions .github/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
# Which issue does this PR close?
## Which issue does this PR close?

<!--
We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123.
-->

Closes #.

# Rationale for this change
## Rationale for this change

<!--
Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes.
-->

# What changes are included in this PR?
## What changes are included in this PR?

<!--
There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR.
-->

# Are these changes tested?
## Are these changes tested?

<!--
We typically require tests for all PRs in order to:
Expand All @@ -29,7 +29,7 @@ We typically require tests for all PRs in order to:
If tests are not included in your PR, please explain why (for example, are they covered by existing tests)?
-->

# Are there any user-facing changes?
## Are there any user-facing changes?

<!--
If there are user-facing changes then we may require documentation to be updated before approving the PR.
Expand Down
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

[workspace]
exclude = ["datafusion-cli"]
members = ["datafusion/common", "datafusion/core", "datafusion/expr", "datafusion/execution", "datafusion/optimizer", "datafusion/physical-expr", "datafusion/proto", "datafusion/proto/gen", "datafusion/sql", "datafusion/substrait", "datafusion-examples", "test-utils", "benchmarks",
members = ["datafusion/common", "datafusion/core", "datafusion/expr", "datafusion/execution", "datafusion/optimizer", "datafusion/physical-expr", "datafusion/proto", "datafusion/proto/gen", "datafusion/sql", "datafusion/sqllogictest", "datafusion/substrait", "datafusion-examples", "test-utils", "benchmarks",
]
resolver = "2"

Expand Down Expand Up @@ -56,4 +56,4 @@ lto = false
opt-level = 3
overflow-checks = false
panic = 'unwind'
rpath = false
rpath = false
18 changes: 0 additions & 18 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -272,24 +272,6 @@ SUBCOMMANDS:

```

## NYC Taxi Benchmark

These benchmarks are based on the [New York Taxi and Limousine Commission][2] data set.

```bash
cargo run --release --bin nyctaxi -- --iterations 3 --path /mnt/nyctaxi/csv --format csv --batch-size 4096
```

Example output:

```bash
Running benchmarks with the following options: Opt { debug: false, iterations: 3, batch_size: 4096, path: "/mnt/nyctaxi/csv", file_format: "csv" }
Executing 'fare_amt_by_passenger'
Query 'fare_amt_by_passenger' iteration 0 took 7138 ms
Query 'fare_amt_by_passenger' iteration 1 took 7599 ms
Query 'fare_amt_by_passenger' iteration 2 took 7969 ms
```
## h2o benchmarks

```bash
Expand Down
12 changes: 6 additions & 6 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ run_tpch() {
RESULTS_FILE="${RESULTS_DIR}/tpch.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch benchmark..."
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 10 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE}
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE}
}

# Runs the tpch in memory
Expand All @@ -319,23 +319,23 @@ run_tpch_mem() {
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch_mem benchmark..."
# -m means in memory
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 10 --path "${TPCH_DIR}" -m --format parquet -o ${RESULTS_FILE}
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" -m --format parquet -o ${RESULTS_FILE}
}

# Runs the parquet filter benchmark
run_parquet() {
RESULTS_FILE="${RESULTS_DIR}/parquet.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running parquet filter benchmark..."
$CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE}
$CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
}

# Runs the sort benchmark
run_sort() {
RESULTS_FILE="${RESULTS_DIR}/sort.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort benchmark..."
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE}
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
}


Expand Down Expand Up @@ -389,15 +389,15 @@ run_clickbench_1() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) benchmark..."
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 10 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
}

# Runs the clickbench benchmark with a single large parquet file
run_clickbench_partitioned() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (partitioned, 100 files) benchmark..."
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 10 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
}

compare_benchmarks() {
Expand Down
166 changes: 0 additions & 166 deletions benchmarks/src/bin/nyctaxi.rs

This file was deleted.

10 changes: 3 additions & 7 deletions benchmarks/src/tpch/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
use arrow::datatypes::SchemaBuilder;
use datafusion::{
arrow::datatypes::{DataType, Field, Schema},
common::plan_err,
error::{DataFusionError, Result},
};
use std::fs;

mod run;
pub use run::RunOpt;

Expand Down Expand Up @@ -158,13 +158,9 @@ pub fn get_query_sql(query: usize) -> Result<Vec<String>> {
Err(e) => errors.push(format!("{filename}: {e}")),
};
}
Err(DataFusionError::Plan(format!(
"invalid query. Could not find query: {errors:?}"
)))
plan_err!("invalid query. Could not find query: {:?}", errors)
} else {
Err(DataFusionError::Plan(
"invalid query. Expected value between 1 and 22".to_owned(),
))
plan_err!("invalid query. Expected value between 1 and 22")
}
}

Expand Down
Loading

0 comments on commit 132a4e7

Please sign in to comment.