Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade to arrow 20.0.0 #3007

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 73 additions & 3 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,16 @@ jobs:
- uses: actions/checkout@v2
with:
submodules: true
- name: Install protobuf compiler
shell: bash
run: |
mkdir -p $HOME/d/protoc
cd $HOME/d/protoc
export PROTO_ZIP="protoc-21.4-linux-x86_64.zip"
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP
unzip $PROTO_ZIP
export PATH=$PATH:$HOME/d/protoc/bin
protoc --version
- name: Cache Cargo
uses: actions/cache@v3
with:
Expand All @@ -94,6 +104,7 @@ jobs:
rust-version: ${{ matrix.rust }}
- name: Run tests
run: |
export PATH=$PATH:$HOME/d/protoc/bin
cargo test --features avro,jit,scheduler,json
# test datafusion-sql examples
cargo run --example sql
Expand Down Expand Up @@ -159,17 +170,65 @@ jobs:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres

windows-and-macos:
name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }}
windows:
name: Test on Windows Rust ${{ matrix.rust }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest, macos-latest]
os: [windows-latest]
rust: [stable]
steps:
- uses: actions/checkout@v2
with:
submodules: true
- name: Install protobuf compiler
shell: bash
run: |
mkdir -p $HOME/d/protoc
cd $HOME/d/protoc
export PROTO_ZIP="protoc-21.4-win64.zip"
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP
unzip $PROTO_ZIP
export PATH=$PATH:$HOME/d/protoc/bin
protoc.exe --version
# TODO: this won't cache anything, which is expensive. Setup this action
# with a OS-dependent path.
- name: Setup Rust toolchain
run: |
rustup toolchain install ${{ matrix.rust }}
rustup default ${{ matrix.rust }}
rustup component add rustfmt
- name: Run tests
shell: bash
run: |
export PATH=$PATH:$HOME/d/protoc/bin
cargo test
env:
# do not produce debug symbols to keep memory usage down
RUSTFLAGS: "-C debuginfo=0"

macos:
name: Test on MacOS Rust ${{ matrix.rust }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [macos-latest]
rust: [stable]
steps:
- uses: actions/checkout@v2
with:
submodules: true
- name: Install protobuf compiler
shell: bash
run: |
mkdir -p $HOME/d/protoc
cd $HOME/d/protoc
export PROTO_ZIP="protoc-21.4-osx-x86_64.zip"
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP
unzip $PROTO_ZIP
echo "$HOME/d/protoc/bin" >> $GITHUB_PATH
export PATH=$PATH:$HOME/d/protoc/bin
protoc --version
# TODO: this won't cache anything, which is expensive. Setup this action
# with a OS-dependent path.
- name: Setup Rust toolchain
Expand Down Expand Up @@ -250,6 +309,16 @@ jobs:
- uses: actions/checkout@v2
with:
submodules: true
- name: Install protobuf compiler
shell: bash
run: |
mkdir -p $HOME/d/protoc
cd $HOME/d/protoc
export PROTO_ZIP="protoc-21.4-linux-x86_64.zip"
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/$PROTO_ZIP
unzip $PROTO_ZIP
export PATH=$PATH:$HOME/d/protoc/bin
protoc --version
- name: Setup Rust toolchain
run: |
rustup toolchain install ${{ matrix.rust }}
Expand All @@ -263,6 +332,7 @@ jobs:
key: cargo-coverage-cache3-
- name: Run coverage
run: |
export PATH=$PATH:$HOME/d/protoc/bin
rustup toolchain install stable
rustup default stable
cargo install --version 0.20.1 cargo-tarpaulin
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ simd = ["datafusion/simd"]
snmalloc = ["snmalloc-rs"]

[dependencies]
datafusion = { path = "../datafusion/core" }
datafusion = { path = "../datafusion/core", features = [], optional = false }
env_logger = "0.9"
futures = "0.3"
mimalloc = { version = "0.1", optional = true, default-features = false }
Expand Down
4 changes: 2 additions & 2 deletions datafusion-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ rust-version = "1.59"
readme = "README.md"

[dependencies]
arrow = { version = "19.0.0" }
arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "30c94dbf1c422f81f8520b9956e96ab7b53c3f47", features = [], optional = false }
clap = { version = "3", features = ["derive", "cargo"] }
datafusion = { path = "../datafusion/core", version = "10.0.0" }
datafusion = { path = "../datafusion/core", features = [], optional = false }
dirs = "4.0.0"
env_logger = "0.9"
mimalloc = { version = "0.1", default-features = false }
Expand Down
8 changes: 4 additions & 4 deletions datafusion-examples/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,13 @@ path = "examples/avro_sql.rs"
required-features = ["datafusion/avro"]

[dev-dependencies]
arrow-flight = { version = "19.0.0" }
arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "30c94dbf1c422f81f8520b9956e96ab7b53c3f47", features = [], optional = false }
async-trait = "0.1.41"
datafusion = { path = "../datafusion/core" }
datafusion = { path = "../datafusion/core", features = [], optional = false }
futures = "0.3"
num_cpus = "1.13.0"
prost = "0.10"
prost = "0.11.0"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.82"
tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot"] }
tonic = "0.7"
tonic = "0.8"
7 changes: 4 additions & 3 deletions datafusion/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ pyarrow = ["pyo3"]

[dependencies]
apache-avro = { version = "0.14", features = ["snappy"], optional = true }
arrow = { version = "19.0.0", features = ["prettyprint"] }
arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "30c94dbf1c422f81f8520b9956e96ab7b53c3f47", features = ["prettyprint"], optional = false }
avro-rs = { version = "0.13", features = ["snappy"], optional = true }
cranelift-module = { version = "0.86.1", optional = true }
object_store = { version = "0.3", optional = true }
object_store = { git = "https://github.com/apache/arrow-rs.git", rev = "30c94dbf1c422f81f8520b9956e96ab7b53c3f47", features = [], optional = true }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't believe we intend to release object_store as part of arrow 20, and so I would back out this change

Copy link
Contributor Author

@avantgardnerio avantgardnerio Aug 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's an alternative version with object_store not upgraded, but I struggled with this too: #3083

ordered-float = "3.0"
parquet = { version = "19.0.0", features = ["arrow"], optional = true }
parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "30c94dbf1c422f81f8520b9956e96ab7b53c3f47", features = ["arrow"], optional = true }
pyo3 = { version = "0.16", optional = true }
serde_json = "1.0"
sqlparser = "0.20"
2 changes: 1 addition & 1 deletion datafusion/common/src/from_slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ where
offsets.push(length_so_far);
values.extend_from_slice(s);
}
let array_data = ArrayData::builder(Self::get_data_type())
let array_data = ArrayData::builder(Self::DATA_TYPE)
.len(slice.len())
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_buffer(Buffer::from_slice_ref(&values));
Expand Down
22 changes: 11 additions & 11 deletions datafusion/common/src/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ use arrow::{
IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, TimeUnit,
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
DECIMAL_MAX_PRECISION,
DECIMAL128_MAX_PRECISION,
},
util::decimal::{BasicDecimal, Decimal128},
};
Expand Down Expand Up @@ -611,7 +611,7 @@ impl ScalarValue {
scale: usize,
) -> Result<Self> {
// make sure the precision and scale is valid
if precision <= DECIMAL_MAX_PRECISION && scale <= precision {
if precision <= DECIMAL128_MAX_PRECISION && scale <= precision {
return Ok(ScalarValue::Decimal128(Some(value), precision, scale));
}
Err(DataFusionError::Internal(format!(
Expand Down Expand Up @@ -654,7 +654,7 @@ impl ScalarValue {
ScalarValue::Int32(_) => DataType::Int32,
ScalarValue::Int64(_) => DataType::Int64,
ScalarValue::Decimal128(_, precision, scale) => {
DataType::Decimal(*precision, *scale)
DataType::Decimal128(*precision, *scale)
}
ScalarValue::TimestampSecond(_, tz_opt) => {
DataType::Timestamp(TimeUnit::Second, tz_opt.clone())
Expand Down Expand Up @@ -935,7 +935,7 @@ impl ScalarValue {
}

let array: ArrayRef = match &data_type {
DataType::Decimal(precision, scale) => {
DataType::Decimal128(precision, scale) => {
let decimal_array =
ScalarValue::iter_to_decimal_array(scalars, precision, scale)?;
Arc::new(decimal_array)
Expand Down Expand Up @@ -1448,7 +1448,7 @@ impl ScalarValue {

Ok(match array.data_type() {
DataType::Null => ScalarValue::Null,
DataType::Decimal(precision, scale) => {
DataType::Decimal128(precision, scale) => {
ScalarValue::get_decimal_value_from_array(array, index, precision, scale)
}
DataType::Boolean => typed_cast!(array, index, BooleanArray, Boolean),
Expand Down Expand Up @@ -1899,7 +1899,7 @@ impl TryFrom<&DataType> for ScalarValue {
DataType::UInt16 => ScalarValue::UInt16(None),
DataType::UInt32 => ScalarValue::UInt32(None),
DataType::UInt64 => ScalarValue::UInt64(None),
DataType::Decimal(precision, scale) => {
DataType::Decimal128(precision, scale) => {
ScalarValue::Decimal128(None, *precision, *scale)
}
DataType::Utf8 => ScalarValue::Utf8(None),
Expand Down Expand Up @@ -2145,7 +2145,7 @@ mod tests {
#[test]
fn scalar_decimal_test() {
let decimal_value = ScalarValue::Decimal128(Some(123), 10, 1);
assert_eq!(DataType::Decimal(10, 1), decimal_value.get_datatype());
assert_eq!(DataType::Decimal128(10, 1), decimal_value.get_datatype());
let try_into_value: i128 = decimal_value.clone().try_into().unwrap();
assert_eq!(123_i128, try_into_value);
assert!(!decimal_value.is_null());
Expand All @@ -2163,14 +2163,14 @@ mod tests {
let array = decimal_value.to_array();
let array = array.as_any().downcast_ref::<Decimal128Array>().unwrap();
assert_eq!(1, array.len());
assert_eq!(DataType::Decimal(10, 1), array.data_type().clone());
assert_eq!(DataType::Decimal128(10, 1), array.data_type().clone());
assert_eq!(123i128, array.value(0).as_i128());

// decimal scalar to array with size
let array = decimal_value.to_array_of_size(10);
let array_decimal = array.as_any().downcast_ref::<Decimal128Array>().unwrap();
assert_eq!(10, array.len());
assert_eq!(DataType::Decimal(10, 1), array.data_type().clone());
assert_eq!(DataType::Decimal128(10, 1), array.data_type().clone());
assert_eq!(123i128, array_decimal.value(0).as_i128());
assert_eq!(123i128, array_decimal.value(9).as_i128());
// test eq array
Expand Down Expand Up @@ -2208,7 +2208,7 @@ mod tests {
// convert the vec to decimal array and check the result
let array = ScalarValue::iter_to_array(decimal_vec.into_iter()).unwrap();
assert_eq!(3, array.len());
assert_eq!(DataType::Decimal(10, 2), array.data_type().clone());
assert_eq!(DataType::Decimal128(10, 2), array.data_type().clone());

let decimal_vec = vec![
ScalarValue::Decimal128(Some(1), 10, 2),
Expand All @@ -2218,7 +2218,7 @@ mod tests {
];
let array = ScalarValue::iter_to_array(decimal_vec.into_iter()).unwrap();
assert_eq!(4, array.len());
assert_eq!(DataType::Decimal(10, 2), array.data_type().clone());
assert_eq!(DataType::Decimal128(10, 2), array.data_type().clone());

assert!(ScalarValue::try_new_decimal128(1, 10, 2)
.unwrap()
Expand Down
22 changes: 11 additions & 11 deletions datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,17 @@ unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion
[dependencies]
ahash = { version = "0.7", default-features = false }
apache-avro = { version = "0.14", optional = true }
arrow = { version = "19.0.0", features = ["prettyprint"] }
arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "30c94dbf1c422f81f8520b9956e96ab7b53c3f47", features = ["prettyprint"], optional = false }
async-trait = "0.1.41"
bytes = "1.1"
chrono = { version = "0.4", default-features = false }
datafusion-common = { path = "../common", version = "10.0.0", features = ["parquet", "object_store"] }
datafusion-expr = { path = "../expr", version = "10.0.0" }
datafusion-jit = { path = "../jit", version = "10.0.0", optional = true }
datafusion-optimizer = { path = "../optimizer", version = "10.0.0" }
datafusion-physical-expr = { path = "../physical-expr", version = "10.0.0" }
datafusion-row = { path = "../row", version = "10.0.0" }
datafusion-sql = { path = "../sql", version = "10.0.0" }
datafusion-common = { path = "../common", features = ["parquet", "object_store"], optional = false }
datafusion-expr = { path = "../expr", features = [], optional = false }
datafusion-jit = { path = "../jit", features = [], optional = true }
datafusion-optimizer = { path = "../optimizer", features = [], optional = false }
datafusion-physical-expr = { path = "../physical-expr", features = [], optional = false }
datafusion-row = { path = "../row", features = [], optional = false }
datafusion-sql = { path = "../sql", features = [], optional = false }
futures = "0.3"
glob = "0.3.0"
hashbrown = { version = "0.12", features = ["raw"] }
Expand All @@ -75,10 +75,10 @@ lazy_static = { version = "^1.4.0" }
log = "^0.4"
num-traits = { version = "0.2", optional = true }
num_cpus = "1.13.0"
object_store = "0.3.0"
object_store = { git = "https://github.com/apache/arrow-rs.git", rev = "30c94dbf1c422f81f8520b9956e96ab7b53c3f47", features = [], optional = false }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto

ordered-float = "3.0"
parking_lot = "0.12"
parquet = { version = "19.0.0", features = ["arrow", "async"] }
parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "30c94dbf1c422f81f8520b9956e96ab7b53c3f47", features = ["arrow", "async"], optional = false }
paste = "^1.0"
pin-project-lite = "^0.2.7"
pyo3 = { version = "0.16", optional = true }
Expand All @@ -98,7 +98,7 @@ csv = "1.1.6"
ctor = "0.1.22"
doc-comment = "0.3"
env_logger = "0.9"
fuzz-utils = { path = "fuzz-utils" }
fuzz-utils = { path = "fuzz-utils", features = [], optional = false }

[[bench]]
harness = false
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/fuzz-utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
arrow = { version = "19.0.0", features = ["prettyprint"] }
arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "30c94dbf1c422f81f8520b9956e96ab7b53c3f47", features = ["prettyprint"], optional = false }
env_logger = "0.9.0"
rand = "0.8"
10 changes: 4 additions & 6 deletions datafusion/core/src/avro_to_arrow/arrow_array_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,10 @@ impl<'a, R: Read> AvroArrowArrayReader<'a, R> {
"Failed to parse avro value: {:?}",
e
))),
other => {
return Err(ArrowError::ParseError(format!(
"Row needs to be of type object, got: {:?}",
other
)))
}
other => Err(ArrowError::ParseError(format!(
"Row needs to be of type object, got: {:?}",
other
))),
})
.collect::<ArrowResult<Vec<Vec<(String, Value)>>>>()?;
if rows.is_empty() {
Expand Down
4 changes: 2 additions & 2 deletions datafusion/core/src/avro_to_arrow/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ fn schema_to_field_with_props(
AvroSchema::Fixed { size, .. } => DataType::FixedSizeBinary(*size as i32),
AvroSchema::Decimal {
precision, scale, ..
} => DataType::Decimal(*precision, *scale),
} => DataType::Decimal128(*precision, *scale),
AvroSchema::Uuid => DataType::FixedSizeBinary(16),
AvroSchema::Date => DataType::Date32,
AvroSchema::TimeMillis => DataType::Time32(TimeUnit::Millisecond),
Expand Down Expand Up @@ -217,7 +217,7 @@ fn default_field_name(dt: &DataType) -> &str {
DataType::Union(_, _, _) => "union",
DataType::Dictionary(_, _) => "map",
DataType::Map(_, _) => unimplemented!("Map support not implemented"),
DataType::Decimal(_, _) => "decimal",
DataType::Decimal128(_, _) => "decimal",
DataType::Decimal256(_, _) => "decimal",
}
}
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/src/catalog/information_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ impl InformationSchemaColumnsBuilder {
Float32 => (Some(24), Some(2), None),
// Numbers from postgres `double` type
Float64 => (Some(24), Some(2), None),
Decimal(precision, scale) => {
Decimal128(precision, scale) => {
(Some(*precision as u64), Some(10), Some(*scale as u64))
}
_ => (None, None, None),
Expand Down
Loading