[FEAT] UTF8 to binary coercion flag #2893

Merged: 13 commits, Sep 24, 2024
7 changes: 7 additions & 0 deletions Cargo.lock

(Generated file; diff not rendered.)

1 change: 1 addition & 0 deletions Cargo.toml
@@ -161,6 +161,7 @@ jaq-std = "1.2.0"
num-derive = "0.3.3"
num-traits = "0.2"
once_cell = "1.19.0"
path_macro = "1.0.0"
pretty_assertions = "1.4.0"
rand = "^0.8"
rayon = "1.10.0"
3 changes: 2 additions & 1 deletion daft/daft/__init__.pyi
@@ -1,7 +1,7 @@
import builtins
import datetime
from enum import Enum
from typing import TYPE_CHECKING, Any, Callable, Iterator
from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal

from daft.dataframe.display import MermaidOptions
from daft.execution import physical_plan
@@ -870,6 +870,7 @@ def read_parquet_into_pyarrow(
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
coerce_int96_timestamp_unit: PyTimeUnit | None = None,
string_encoding: Literal["utf-8"] | Literal["raw"] = "utf-8",
file_timeout_ms: int | None = None,
): ...
def read_parquet_into_pyarrow_bulk(
4 changes: 3 additions & 1 deletion daft/table/table.py
@@ -1,7 +1,7 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Literal

from daft.arrow_utils import ensure_table
from daft.daft import (
@@ -524,6 +524,7 @@ def read_parquet_into_pyarrow(
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
coerce_int96_timestamp_unit: TimeUnit = TimeUnit.ns(),
string_encoding: Literal["utf-8"] | Literal["raw"] = "utf-8",
file_timeout_ms: int | None = 900_000, # 15 minutes
) -> pa.Table:
fields, metadata, columns, num_rows_read = _read_parquet_into_pyarrow(
@@ -535,6 +536,7 @@ def read_parquet_into_pyarrow(
io_config=io_config,
multithreaded_io=multithreaded_io,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit._timeunit,
string_encoding=string_encoding,
file_timeout_ms=file_timeout_ms,
)
schema = pa.schema(fields, metadata=metadata)
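
For orientation, a minimal sketch of how the new flag could be used from the Python side. The file name, column contents, and the assumption that the Parquet path/URI is the first positional argument of `read_parquet_into_pyarrow` are illustrative and not part of this diff:

```python
import pyarrow as pa

from daft.table.table import read_parquet_into_pyarrow

# "events.parquet" is a hypothetical file whose "payload" column is declared
# as a UTF-8 string in the Parquet metadata but may contain arbitrary bytes.
table: pa.Table = read_parquet_into_pyarrow(
    "events.parquet",       # assumed first positional argument (path/URI)
    string_encoding="raw",  # new flag: do not assume UTF-8 for string columns
)

# With string_encoding="raw", columns that would normally infer as pa.string()
# come back as pa.binary(), so malformed UTF-8 never fails decoding.
print(table.schema)
```
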
32 changes: 21 additions & 11 deletions src/arrow2/src/io/parquet/read/schema/convert.rs
@@ -7,24 +7,28 @@ use parquet2::schema::{
Repetition,
};

use crate::datatypes::{DataType, Field, IntervalUnit, TimeUnit};
use crate::io::parquet::read::schema::SchemaInferenceOptions;
use super::StringEncoding;
use crate::{
datatypes::{DataType, Field, IntervalUnit, TimeUnit},
io::parquet::read::schema::SchemaInferenceOptions,
};

/// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain
/// any physical column.
#[allow(dead_code)]
pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec<Field> {
parquet_to_arrow_schema_with_options(fields, &None)
parquet_to_arrow_schema_with_options(fields, None)
}

/// Like [`parquet_to_arrow_schema`] but with configurable options which affect the behavior of schema inference
pub fn parquet_to_arrow_schema_with_options(
fields: &[ParquetType],
options: &Option<SchemaInferenceOptions>,
options: Option<SchemaInferenceOptions>,
) -> Vec<Field> {
let options = options.unwrap_or_default();
fields
.iter()
.filter_map(|f| to_field(f, options.as_ref().unwrap_or(&Default::default())))
.filter_map(|f| to_field(f, &options))
.collect::<Vec<_>>()
}

@@ -145,9 +149,13 @@ fn from_int64(
fn from_byte_array(
logical_type: &Option<PrimitiveLogicalType>,
converted_type: &Option<PrimitiveConvertedType>,
options: &SchemaInferenceOptions,
) -> DataType {
match (logical_type, converted_type) {
(Some(PrimitiveLogicalType::String), _) => DataType::Utf8,
(Some(PrimitiveLogicalType::String), _) => match options.string_encoding {
StringEncoding::Utf8 => DataType::Utf8,
StringEncoding::Raw => DataType::Binary,
},
(Some(PrimitiveLogicalType::Json), _) => DataType::Binary,
(Some(PrimitiveLogicalType::Bson), _) => DataType::Binary,
(Some(PrimitiveLogicalType::Enum), _) => DataType::Binary,
@@ -219,9 +227,11 @@ fn to_primitive_type_inner(
PhysicalType::Int96 => DataType::Timestamp(options.int96_coerce_to_timeunit, None),
PhysicalType::Float => DataType::Float32,
PhysicalType::Double => DataType::Float64,
PhysicalType::ByteArray => {
from_byte_array(&primitive_type.logical_type, &primitive_type.converted_type)
}
PhysicalType::ByteArray => from_byte_array(
&primitive_type.logical_type,
&primitive_type.converted_type,
options,
),
PhysicalType::FixedLenByteArray(length) => from_fixed_len_byte_array(
length,
primitive_type.logical_type,
@@ -440,7 +450,6 @@ mod tests {
use parquet2::metadata::SchemaDescriptor;

use super::*;

use crate::error::Result;

#[test]
@@ -1123,8 +1132,9 @@ mod tests {
let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
let fields = parquet_to_arrow_schema_with_options(
parquet_schema.fields(),
&Some(SchemaInferenceOptions {
Some(SchemaInferenceOptions {
int96_coerce_to_timeunit: tu,
..Default::default()
}),
);
assert_eq!(arrow_fields, fields);
54 changes: 46 additions & 8 deletions src/arrow2/src/io/parquet/read/schema/mod.rs
@@ -1,21 +1,55 @@
//! APIs to handle Parquet <-> Arrow schemas.

use crate::datatypes::{Schema, TimeUnit};
use crate::error::Result;
use std::str::FromStr;

use crate::{
datatypes::{Schema, TimeUnit},
error::{Error, Result},
};

mod convert;
mod metadata;

pub use convert::parquet_to_arrow_schema_with_options;
pub use metadata::{apply_schema_to_fields, read_schema_from_metadata};
pub use parquet2::metadata::{FileMetaData, KeyValue, SchemaDescriptor};
pub use parquet2::schema::types::ParquetType;

pub(crate) use convert::*;
pub use metadata::{apply_schema_to_fields, read_schema_from_metadata};
pub use parquet2::{
metadata::{FileMetaData, KeyValue, SchemaDescriptor},
schema::types::ParquetType,
};
use serde::{Deserialize, Serialize};

use self::metadata::parse_key_value_metadata;

/// String encoding options.
///
/// Each variant of this enum maps to a different interpretation of the underlying binary data:
/// 1. `StringEncoding::Utf8` assumes the underlying binary data is UTF-8 encoded.
/// 2. `StringEncoding::Raw` makes no assumptions about the encoding of the underlying binary data. This variant will change the logical type of the column to `DataType::Binary` in the final schema.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum StringEncoding {
Raw,
#[default]
Utf8,
}

impl FromStr for StringEncoding {
type Err = Error;

fn from_str(value: &str) -> Result<Self> {
match value {
"utf-8" => Ok(Self::Utf8),
"raw" => Ok(Self::Raw),
encoding => Err(Error::InvalidArgumentError(format!(
"Unrecognized encoding: {}",
encoding,
))),
}
}
}

/// Options when inferring schemas from Parquet
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SchemaInferenceOptions {
/// When inferring schemas from the Parquet INT96 timestamp type, this is the corresponding TimeUnit
/// in the inferred Arrow Timestamp type.
@@ -25,12 +59,16 @@ pub struct SchemaInferenceOptions {
/// (e.g. TimeUnit::Milliseconds) will result in loss of precision, but support a larger range of dates
/// without overflowing when parsing the data.
pub int96_coerce_to_timeunit: TimeUnit,

/// The string encoding to assume when inferring the schema from Parquet data.
pub string_encoding: StringEncoding,
}

impl Default for SchemaInferenceOptions {
fn default() -> Self {
SchemaInferenceOptions {
int96_coerce_to_timeunit: TimeUnit::Nanosecond,
string_encoding: StringEncoding::default(),
}
}
}
@@ -42,13 +80,13 @@ impl Default for SchemaInferenceOptions {
/// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded,
/// indicating that the file's arrow metadata was incorrectly written.
pub fn infer_schema(file_metadata: &FileMetaData) -> Result<Schema> {
infer_schema_with_options(file_metadata, &None)
infer_schema_with_options(file_metadata, None)
}

/// Like [`infer_schema`] but with configurable options which affect the behavior of inference
pub fn infer_schema_with_options(
file_metadata: &FileMetaData,
options: &Option<SchemaInferenceOptions>,
options: Option<SchemaInferenceOptions>,
) -> Result<Schema> {
let mut metadata = parse_key_value_metadata(file_metadata.key_value_metadata());
let fields = parquet_to_arrow_schema_with_options(file_metadata.schema().fields(), options);
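
A rough sketch of how the new option is intended to be driven on the arrow2 side, assuming the module paths of the vendored crate above; `infer_with_encoding` is a hypothetical helper, and the `FileMetaData` would come from an earlier parquet2 metadata read:

```rust
use std::str::FromStr;

use arrow2::datatypes::Schema;
use arrow2::error::Result;
use arrow2::io::parquet::read::schema::{
    infer_schema_with_options, FileMetaData, SchemaInferenceOptions, StringEncoding,
};

/// Infer an Arrow schema from Parquet metadata, honoring the requested string
/// encoding: "utf-8" keeps string columns as Utf8, "raw" maps them to Binary.
fn infer_with_encoding(metadata: &FileMetaData, encoding: &str) -> Result<Schema> {
    let options = SchemaInferenceOptions {
        // Unrecognized encodings surface as Error::InvalidArgumentError.
        string_encoding: StringEncoding::from_str(encoding)?,
        ..Default::default()
    };
    infer_schema_with_options(metadata, Some(options))
}
```
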
13 changes: 7 additions & 6 deletions src/daft-micropartition/src/micropartition.rs
@@ -596,9 +596,9 @@ impl MicroPartition {
(
_,
_,
FileFormatConfig::Parquet(ParquetSourceConfig {
&FileFormatConfig::Parquet(ParquetSourceConfig {
coerce_int96_timestamp_unit,
field_id_mapping,
ref field_id_mapping,
chunk_size,
..
}),
@@ -646,12 +646,13 @@ impl MicroPartition {
if scan_task.sources.len() == 1 { 1 } else { 128 }, // Hardcoded to 128 bulk reads
cfg.multithreaded_io,
&ParquetSchemaInferenceOptions {
coerce_int96_timestamp_unit: *coerce_int96_timestamp_unit,
coerce_int96_timestamp_unit,
..Default::default()
},
Some(schema.clone()),
field_id_mapping.clone(),
parquet_metadata,
*chunk_size,
chunk_size,
)
.context(DaftCoreComputeSnafu)
}
@@ -1162,7 +1163,7 @@ pub(crate) fn read_parquet_into_micropartition<T: AsRef<str>>(
let schemas = metadata
.iter()
.map(|m| {
let schema = infer_schema_with_options(m, &Some((*schema_infer_options).into()))?;
let schema = infer_schema_with_options(m, Some((*schema_infer_options).into()))?;
let daft_schema = daft_core::prelude::Schema::try_from(&schema)?;
DaftResult::Ok(Arc::new(daft_schema))
})
@@ -1186,7 +1187,7 @@ pub(crate) fn read_parquet_into_micropartition<T: AsRef<str>>(
let schemas = metadata
.iter()
.map(|m| {
let schema = infer_schema_with_options(m, &Some((*schema_infer_options).into()))?;
let schema = infer_schema_with_options(m, Some((*schema_infer_options).into()))?;
let daft_schema = daft_core::prelude::Schema::try_from(&schema)?;
DaftResult::Ok(Arc::new(daft_schema))
})
1 change: 1 addition & 0 deletions src/daft-parquet/Cargo.toml
@@ -26,6 +26,7 @@ tokio-util = {workspace = true}

[dev-dependencies]
bincode = {workspace = true}
path_macro = {workspace = true}

[features]
python = ["dep:pyo3", "common-error/python", "daft-core/python", "daft-io/python", "daft-table/python", "daft-stats/python", "daft-dsl/python", "common-arrow-ffi/python"]
11 changes: 6 additions & 5 deletions src/daft-parquet/src/file.rs
@@ -259,11 +259,12 @@ impl ParquetReaderBuilder {
}

pub fn build(self) -> super::Result<ParquetFileReader> {
let mut arrow_schema =
infer_schema_with_options(&self.metadata, &Some(self.schema_inference_options.into()))
.context(UnableToParseSchemaFromMetadataSnafu::<String> {
path: self.uri.clone(),
})?;
let options = self.schema_inference_options.into();
let mut arrow_schema = infer_schema_with_options(&self.metadata, Some(options)).context(
UnableToParseSchemaFromMetadataSnafu {
path: self.uri.clone(),
},
)?;

if let Some(names_to_keep) = self.selected_columns {
arrow_schema
3 changes: 3 additions & 0 deletions src/daft-parquet/src/lib.rs
@@ -19,6 +19,9 @@ pub use python::register_modules;

#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("{source}"))]
Arrow2Error { source: arrow2::error::Error },

#[snafu(display("{source}"))]
DaftIOError { source: daft_io::Error },

16 changes: 10 additions & 6 deletions src/daft-parquet/src/python.rs
@@ -10,7 +10,9 @@ pub mod pylib {
use daft_table::python::PyTable;
use pyo3::{pyfunction, types::PyModule, Bound, PyResult, Python};

use crate::read::{ArrowChunk, ParquetSchemaInferenceOptions};
use crate::read::{
ArrowChunk, ParquetSchemaInferenceOptions, ParquetSchemaInferenceOptionsBuilder,
};
#[allow(clippy::too_many_arguments)]
#[pyfunction]
pub fn read_parquet(
@@ -90,6 +92,7 @@
pub fn read_parquet_into_pyarrow(
py: Python,
uri: &str,
string_encoding: String,
columns: Option<Vec<String>>,
start_offset: Option<usize>,
num_rows: Option<usize>,
@@ -99,14 +102,16 @@
coerce_int96_timestamp_unit: Option<PyTimeUnit>,
file_timeout_ms: Option<i64>,
) -> PyResult<PyArrowParquetType> {
let read_parquet_result = py.allow_threads(|| {
let (schema, all_arrays, num_rows) = py.allow_threads(|| {
let io_client = get_io_client(
multithreaded_io.unwrap_or(true),
io_config.unwrap_or_default().config.into(),
)?;
let schema_infer_options = ParquetSchemaInferenceOptions::new(
coerce_int96_timestamp_unit.map(|tu| tu.timeunit),
);
let schema_infer_options = ParquetSchemaInferenceOptionsBuilder {
coerce_int96_timestamp_unit,
string_encoding,
}
.build()?;

crate::read::read_parquet_into_pyarrow(
uri,
@@ -121,7 +126,6 @@ file_timeout_ms: Option<i64>,
file_timeout_ms,
)
})?;
let (schema, all_arrays, num_rows) = read_parquet_result;
let pyarrow = py.import_bound(pyo3::intern!(py, "pyarrow"))?;
convert_pyarrow_parquet_read_result_into_py(py, schema, all_arrays, num_rows, &pyarrow)
}