From 4f7fe8daf3f7d94b66dbe7c325b9bde5383826c8 Mon Sep 17 00:00:00 2001 From: universalmind303 Date: Tue, 27 Aug 2024 20:14:07 -0500 Subject: [PATCH 1/8] remove this file --- .history | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .history diff --git a/.history b/.history deleted file mode 100644 index 59664f5eee..0000000000 --- a/.history +++ /dev/null @@ -1,3 +0,0 @@ -#V2 -create table wiki as select * from '~/datasets/wiki/*.parquet' limit 1000; -select 1 from wiki; From 4b676d401f4a369b19db8cc6b56d1fe9ec3a540d Mon Sep 17 00:00:00 2001 From: universalmind303 Date: Fri, 6 Sep 2024 14:21:33 -0700 Subject: [PATCH 2/8] almost there --- Cargo.lock | 17 + Cargo.toml | 3 + daft/daft.pyi | 5 +- daft/daft/__init__.pyi | 0 daft/daft/image.pyi | 6 + daft/series.py | 10 +- src/daft-core/src/array/image_array.rs | 210 ++++ src/daft-core/src/array/mod.rs | 1 + src/daft-core/src/array/ops/cast.rs | 3 +- src/daft-core/src/array/ops/image.rs | 1029 ----------------- src/daft-core/src/array/ops/mod.rs | 1 - src/daft-core/src/array/ops/repr.rs | 77 +- src/daft-core/src/datatypes/image_mode.rs | 90 +- src/daft-core/src/python/series.rs | 34 - .../src/series/array_impl/data_array.rs | 6 +- .../src/series/array_impl/logical_array.rs | 8 +- .../src/series/array_impl/nested_array.rs | 6 +- src/daft-core/src/series/ops/image.rs | 101 -- src/daft-core/src/series/ops/mod.rs | 1 - src/daft-core/src/series/series_like.rs | 2 +- src/daft-functions/Cargo.toml | 2 + src/daft-functions/src/image/crop.rs | 2 +- src/daft-functions/src/image/decode.rs | 2 +- src/daft-functions/src/image/encode.rs | 2 +- src/daft-functions/src/image/resize.rs | 2 +- src/daft-functions/src/image/to_mode.rs | 2 +- src/daft-image/Cargo.toml | 25 + src/daft-image/src/counting_writer.rs | 45 + src/daft-image/src/image_buffer.rs | 307 +++++ src/daft-image/src/iters.rs | 38 + src/daft-image/src/kernel.rs | 468 ++++++++ src/daft-image/src/lib.rs | 13 + src/daft-image/src/python.rs | 54 + src/daft-image/src/series.rs | 159 +++ src/lib.rs | 41 +- tests/series/test_image.py | 1 - 36 files changed, 1466 insertions(+), 1307 deletions(-) create mode 100644 daft/daft/__init__.pyi create mode 100644 daft/daft/image.pyi create mode 100644 src/daft-core/src/array/image_array.rs delete mode 100644 src/daft-core/src/array/ops/image.rs delete mode 100644 src/daft-core/src/series/ops/image.rs create mode 100644 src/daft-image/Cargo.toml create mode 100644 src/daft-image/src/counting_writer.rs create mode 100644 src/daft-image/src/image_buffer.rs create mode 100644 src/daft-image/src/iters.rs create mode 100644 src/daft-image/src/kernel.rs create mode 100644 src/daft-image/src/lib.rs create mode 100644 src/daft-image/src/python.rs create mode 100644 src/daft-image/src/series.rs diff --git a/Cargo.lock b/Cargo.lock index 8b6b2435a7..836b40d977 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1653,6 +1653,7 @@ dependencies = [ "daft-csv", "daft-dsl", "daft-functions", + "daft-image", "daft-io", "daft-json", "daft-local-execution", @@ -1800,6 +1801,7 @@ dependencies = [ "common-io-config", "daft-core", "daft-dsl", + "daft-image", "daft-io", "futures", "pyo3", @@ -1811,6 +1813,21 @@ dependencies = [ "uuid 1.10.0", ] +[[package]] +name = "daft-image" +version = "0.3.0-dev0" +dependencies = [ + "arrow2", + "base64 0.22.1", + "common-error", + "daft-core", + "image", + "log", + "num-traits", + "pyo3", + "serde", +] + [[package]] name = "daft-io" version = "0.3.0-dev0" diff --git a/Cargo.toml b/Cargo.toml index 4046665e77..4c8c2e572b 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -11,6 +11,7 @@ daft-csv = {path = "src/daft-csv", default-features = false} daft-dsl = {path = "src/daft-dsl", default-features = false} daft-functions = {path = "src/daft-functions", default-features = false} daft-io = {path = "src/daft-io", default-features = false} +daft-image = {path = "src/daft-image", default-features = false} daft-json = {path = "src/daft-json", default-features = false} daft-local-execution = {path = "src/daft-local-execution", default-features = false} daft-micropartition = {path = "src/daft-micropartition", default-features = false} @@ -39,6 +40,7 @@ python = [ "daft-dsl/python", "daft-local-execution/python", "daft-io/python", + "daft-image/python", "daft-json/python", "daft-micropartition/python", "daft-parquet/python", @@ -112,6 +114,7 @@ members = [ "src/daft-core", "src/daft-local-execution", "src/daft-io", + "src/daft-image", "src/daft-parquet", "src/daft-csv", "src/daft-json", diff --git a/daft/daft.pyi b/daft/daft.pyi index 5d740ff51b..d988a133fe 100644 --- a/daft/daft.pyi +++ b/daft/daft.pyi @@ -3,6 +3,7 @@ import datetime from enum import Enum from typing import TYPE_CHECKING, Any, Callable, Iterator + import pyarrow from daft.dataframe.display import MermaidOptions @@ -1385,10 +1386,6 @@ class PySeries: def list_slice(self, start: PySeries, end: PySeries | None = None) -> PySeries: ... def list_sort(self, desc: PySeries) -> PySeries: ... def map_get(self, key: PySeries) -> PySeries: ... - def image_decode(self, raise_error_on_failure: bool, mode: ImageMode | None = None) -> PySeries: ... - def image_encode(self, image_format: ImageFormat) -> PySeries: ... - def image_resize(self, w: int, h: int) -> PySeries: ... - def image_to_mode(self, mode: ImageMode) -> PySeries: ... def if_else(self, other: PySeries, predicate: PySeries) -> PySeries: ... def is_null(self) -> PySeries: ... def not_null(self) -> PySeries: ... diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/daft/daft/image.pyi b/daft/daft/image.pyi new file mode 100644 index 0000000000..df4615c202 --- /dev/null +++ b/daft/daft/image.pyi @@ -0,0 +1,6 @@ +from daft.daft import ImageMode, ImageFormat, PySeries + +def decode(s: PySeries, raise_error_on_failure: bool, mode: ImageMode | None = None) -> PySeries: ... +def encode(s: PySeries, image_format: ImageFormat) -> PySeries: ... +def resize(s: PySeries, w: int, h: int) -> PySeries: ... +def to_mode(s: PySeries, mode: ImageMode) -> PySeries: ... 
\ No newline at end of file diff --git a/daft/series.py b/daft/series.py index e4e562fbf1..97242615a4 100644 --- a/daft/series.py +++ b/daft/series.py @@ -5,7 +5,7 @@ import pyarrow as pa from daft.arrow_utils import ensure_array, ensure_chunked_array -from daft.daft import CountMode, ImageFormat, ImageMode, PySeries +from daft.daft import CountMode, ImageFormat, ImageMode, PySeries, image from daft.datatype import DataType from daft.utils import pyarrow_supports_fixed_shape_tensor @@ -1014,14 +1014,14 @@ def decode( mode = ImageMode.from_mode_string(mode.upper()) if not isinstance(mode, ImageMode): raise ValueError(f"mode must be a string or ImageMode variant, but got: {mode}") - return Series._from_pyseries(self._series.image_decode(raise_error_on_failure=raise_on_error, mode=mode)) + return Series._from_pyseries(image.decode(self._series, raise_error_on_failure=raise_on_error, mode=mode)) def encode(self, image_format: str | ImageFormat) -> Series: if isinstance(image_format, str): image_format = ImageFormat.from_format_string(image_format.upper()) if not isinstance(image_format, ImageFormat): raise ValueError(f"image_format must be a string or ImageFormat variant, but got: {image_format}") - return Series._from_pyseries(self._series.image_encode(image_format)) + return Series._from_pyseries(image.encode(self._series, image_format)) def resize(self, w: int, h: int) -> Series: if not isinstance(w, int): @@ -1029,11 +1029,11 @@ def resize(self, w: int, h: int) -> Series: if not isinstance(h, int): raise TypeError(f"expected int for h but got {type(h)}") - return Series._from_pyseries(self._series.image_resize(w, h)) + return Series._from_pyseries(image.resize(self._series,w, h)) def to_mode(self, mode: str | ImageMode) -> Series: if isinstance(mode, str): mode = ImageMode.from_mode_string(mode.upper()) if not isinstance(mode, ImageMode): raise ValueError(f"mode must be a string or ImageMode variant, but got: {mode}") - return Series._from_pyseries(self._series.image_to_mode(mode)) + return Series._from_pyseries(image.to_mode(self._series, mode)) diff --git a/src/daft-core/src/array/image_array.rs b/src/daft-core/src/array/image_array.rs new file mode 100644 index 0000000000..424c9f759a --- /dev/null +++ b/src/daft-core/src/array/image_array.rs @@ -0,0 +1,210 @@ +use std::io::{Seek, SeekFrom, Write}; +use std::vec; + +use common_error::DaftResult; + +use crate::array::prelude::*; +use crate::datatypes::prelude::*; + +use crate::series::{IntoSeries, Series}; + +#[derive(Clone)] +pub struct BBox(pub u32, pub u32, pub u32, pub u32); + +impl BBox { + pub fn from_u32_arrow_array(arr: &dyn arrow2::array::Array) -> Self { + assert!(arr.len() == 4); + let mut iter = arr + .as_any() + .downcast_ref::() + .unwrap() + .iter(); + BBox( + *iter.next().unwrap().unwrap(), + *iter.next().unwrap().unwrap(), + *iter.next().unwrap().unwrap(), + *iter.next().unwrap().unwrap(), + ) + } +} + +type IOResult = std::result::Result; + +/// A wrapper of a writer that tracks the number of bytes successfully written. +pub struct CountingWriter { + inner: W, + count: u64, +} + +impl CountingWriter { + /// The number of bytes successful written so far. + pub fn count(&self) -> u64 { + self.count + } + + /// Extracts the inner writer, discarding this wrapper. 
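+    ///
+    /// A minimal usage sketch (illustrative only; assumes a `Cursor<Vec<u8>>`
+    /// backing store, which satisfies the `Write + Seek` bounds):
+    ///
+    /// ```ignore
+    /// use std::io::{Cursor, Write};
+    ///
+    /// let mut writer: CountingWriter<Cursor<Vec<u8>>> = Cursor::new(Vec::new()).into();
+    /// writer.write_all(b"abc").unwrap();
+    /// assert_eq!(writer.count(), 3);
+    /// let cursor = writer.into_inner(); // recover the cursor and its bytes
+    /// assert_eq!(cursor.into_inner(), b"abc".to_vec());
+    /// ```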
+ pub fn into_inner(self) -> W { + self.inner + } +} + +impl From for CountingWriter { + fn from(inner: W) -> Self { + Self { inner, count: 0 } + } +} + +impl Write for CountingWriter { + fn write(&mut self, buf: &[u8]) -> IOResult { + let written = self.inner.write(buf)?; + self.count += written as u64; + Ok(written) + } + + fn flush(&mut self) -> IOResult { + self.inner.flush() + } +} + +impl Seek for CountingWriter { + fn seek(&mut self, pos: SeekFrom) -> IOResult { + self.inner.seek(pos) + } +} + +pub struct ImageArraySidecarData { + pub channels: Vec, + pub heights: Vec, + pub widths: Vec, + pub modes: Vec, + pub validity: Option, +} + +impl ImageArray { + pub fn image_mode(&self) -> &Option { + match self.data_type() { + DataType::Image(mode) => mode, + _ => panic!("Expected dtype to be Image"), + } + } + + pub fn data_array(&self) -> &ListArray { + const IMAGE_DATA_IDX: usize = 0; + let array = self.physical.children.get(IMAGE_DATA_IDX).unwrap(); + array.list().unwrap() + } + + pub fn channel_array(&self) -> &arrow2::array::UInt16Array { + const IMAGE_CHANNEL_IDX: usize = 1; + let array = self.physical.children.get(IMAGE_CHANNEL_IDX).unwrap(); + array.u16().unwrap().as_arrow() + } + + pub fn height_array(&self) -> &arrow2::array::UInt32Array { + const IMAGE_HEIGHT_IDX: usize = 2; + let array = self.physical.children.get(IMAGE_HEIGHT_IDX).unwrap(); + array.u32().unwrap().as_arrow() + } + + pub fn width_array(&self) -> &arrow2::array::UInt32Array { + const IMAGE_WIDTH_IDX: usize = 3; + let array = self.physical.children.get(IMAGE_WIDTH_IDX).unwrap(); + array.u32().unwrap().as_arrow() + } + + pub fn mode_array(&self) -> &arrow2::array::UInt8Array { + const IMAGE_MODE_IDX: usize = 4; + let array = self.physical.children.get(IMAGE_MODE_IDX).unwrap(); + array.u8().unwrap().as_arrow() + } + + pub fn from_list_array( + name: &str, + data_type: DataType, + data_array: ListArray, + sidecar_data: ImageArraySidecarData, + ) -> DaftResult { + let values: Vec = vec![ + data_array.into_series().rename("data"), + UInt16Array::from(( + "channel", + Box::new( + arrow2::array::UInt16Array::from_vec(sidecar_data.channels) + .with_validity(sidecar_data.validity.clone()), + ), + )) + .into_series(), + UInt32Array::from(( + "height", + Box::new( + arrow2::array::UInt32Array::from_vec(sidecar_data.heights) + .with_validity(sidecar_data.validity.clone()), + ), + )) + .into_series(), + UInt32Array::from(( + "width", + Box::new( + arrow2::array::UInt32Array::from_vec(sidecar_data.widths) + .with_validity(sidecar_data.validity.clone()), + ), + )) + .into_series(), + UInt8Array::from(( + "mode", + Box::new( + arrow2::array::UInt8Array::from_vec(sidecar_data.modes) + .with_validity(sidecar_data.validity.clone()), + ), + )) + .into_series(), + ]; + let physical_type = data_type.to_physical(); + let struct_array = StructArray::new( + Field::new(name, physical_type), + values, + sidecar_data.validity, + ); + Ok(ImageArray::new(Field::new(name, data_type), struct_array)) + } + + pub fn from_vecs( + name: &str, + data_type: DataType, + data: Vec, + offsets: Vec, + sidecar_data: ImageArraySidecarData, + ) -> DaftResult { + if data.is_empty() { + return Ok(ImageArray::full_null(name, &data_type, offsets.len() - 1)); + } + let offsets = arrow2::offset::OffsetsBuffer::try_from(offsets)?; + let arrow_dtype: arrow2::datatypes::DataType = T::PRIMITIVE.into(); + if let DataType::Image(Some(mode)) = &data_type { + if mode.get_dtype().to_arrow()? 
!= arrow_dtype { + panic!("Inner value dtype of provided dtype {data_type:?} is inconsistent with inferred value dtype {arrow_dtype:?}"); + } + } + let data_array = ListArray::new( + Field::new("data", DataType::List(Box::new((&arrow_dtype).into()))), + Series::try_from(( + "data", + Box::new(arrow2::array::PrimitiveArray::from_vec(data)) + as Box, + ))?, + offsets, + sidecar_data.validity.clone(), + ); + + Self::from_list_array(name, data_type, data_array, sidecar_data) + } +} + +impl FixedShapeImageArray { + pub fn image_mode(&self) -> &ImageMode { + match self.data_type() { + DataType::FixedShapeImage(mode, _, _) => mode, + other => panic!("Expected dtype to be Image, got {other:?}"), + } + } +} diff --git a/src/daft-core/src/array/mod.rs b/src/daft-core/src/array/mod.rs index f56faeebd9..0eb40912e7 100644 --- a/src/daft-core/src/array/mod.rs +++ b/src/daft-core/src/array/mod.rs @@ -2,6 +2,7 @@ mod fixed_size_list_array; pub mod from; pub mod growable; pub mod iterator; +pub mod image_array; mod list_array; pub mod ops; pub mod pseudo_arrow; diff --git a/src/daft-core/src/array/ops/cast.rs b/src/daft-core/src/array/ops/cast.rs index e2d2110456..e39f87cccf 100644 --- a/src/daft-core/src/array/ops/cast.rs +++ b/src/daft-core/src/array/ops/cast.rs @@ -4,7 +4,8 @@ use super::as_arrow::AsArrow; use crate::{ array::{ growable::make_growable, - ops::{from_arrow::FromArrow, full::FullNull, image::ImageArraySidecarData}, + image_array::ImageArraySidecarData, + ops::{from_arrow::FromArrow, full::FullNull}, DataArray, FixedSizeListArray, ListArray, StructArray, }, datatypes::{ diff --git a/src/daft-core/src/array/ops/image.rs b/src/daft-core/src/array/ops/image.rs deleted file mode 100644 index 66dcdaeba5..0000000000 --- a/src/daft-core/src/array/ops/image.rs +++ /dev/null @@ -1,1029 +0,0 @@ -use std::borrow::Cow; -use std::io::{Seek, SeekFrom, Write}; -use std::sync::Arc; -use std::vec; - -use image::{ColorType, DynamicImage, ImageBuffer}; - -use crate::array::prelude::*; -use crate::datatypes::prelude::*; - -use crate::series::{IntoSeries, Series}; -use common_error::{DaftError, DaftResult}; -use image::{Luma, LumaA, Rgb, Rgba}; - -use super::full::FullNull; -use super::{as_arrow::AsArrow, from_arrow::FromArrow}; -use num_traits::FromPrimitive; - -use std::ops::Deref; - -#[derive(Clone)] -pub struct BBox(u32, u32, u32, u32); - -impl BBox { - pub fn from_u32_arrow_array(arr: &dyn arrow2::array::Array) -> Self { - assert!(arr.len() == 4); - let mut iter = arr - .as_any() - .downcast_ref::() - .unwrap() - .iter(); - BBox( - *iter.next().unwrap().unwrap(), - *iter.next().unwrap().unwrap(), - *iter.next().unwrap().unwrap(), - *iter.next().unwrap().unwrap(), - ) - } -} - -#[allow(clippy::upper_case_acronyms, dead_code)] -#[derive(Debug)] -pub enum DaftImageBuffer<'a> { - L(ImageBuffer, Cow<'a, [u8]>>), - LA(ImageBuffer, Cow<'a, [u8]>>), - RGB(ImageBuffer, Cow<'a, [u8]>>), - RGBA(ImageBuffer, Cow<'a, [u8]>>), - L16(ImageBuffer, Cow<'a, [u16]>>), - LA16(ImageBuffer, Cow<'a, [u16]>>), - RGB16(ImageBuffer, Cow<'a, [u16]>>), - RGBA16(ImageBuffer, Cow<'a, [u16]>>), - RGB32F(ImageBuffer, Cow<'a, [f32]>>), - RGBA32F(ImageBuffer, Cow<'a, [f32]>>), -} - -macro_rules! 
with_method_on_image_buffer { - ( - $key_type:expr, $method: ident -) => {{ - use DaftImageBuffer::*; - - match $key_type { - L(img) => img.$method(), - LA(img) => img.$method(), - RGB(img) => img.$method(), - RGBA(img) => img.$method(), - L16(img) => img.$method(), - LA16(img) => img.$method(), - RGB16(img) => img.$method(), - RGBA16(img) => img.$method(), - RGB32F(img) => img.$method(), - RGBA32F(img) => img.$method(), - } - }}; -} - -type IOResult = std::result::Result; - -/// A wrapper of a writer that tracks the number of bytes successfully written. -pub struct CountingWriter { - inner: W, - count: u64, -} - -impl CountingWriter { - /// The number of bytes successful written so far. - pub fn count(&self) -> u64 { - self.count - } - - /// Extracts the inner writer, discarding this wrapper. - pub fn into_inner(self) -> W { - self.inner - } -} - -impl From for CountingWriter { - fn from(inner: W) -> Self { - Self { inner, count: 0 } - } -} - -impl Write for CountingWriter { - fn write(&mut self, buf: &[u8]) -> IOResult { - let written = self.inner.write(buf)?; - self.count += written as u64; - Ok(written) - } - - fn flush(&mut self) -> IOResult { - self.inner.flush() - } -} - -impl Seek for CountingWriter { - fn seek(&mut self, pos: SeekFrom) -> IOResult { - self.inner.seek(pos) - } -} - -impl<'a> DaftImageBuffer<'a> { - pub fn height(&self) -> u32 { - with_method_on_image_buffer!(self, height) - } - - pub fn width(&self) -> u32 { - with_method_on_image_buffer!(self, width) - } - - pub fn as_u8_slice(&'a self) -> &'a [u8] { - use DaftImageBuffer::*; - match self { - L(img) => img.as_raw(), - LA(img) => img.as_raw(), - RGB(img) => img.as_raw(), - RGBA(img) => img.as_raw(), - _ => unimplemented!("unimplemented {self:?}"), - } - } - - pub fn color(&self) -> ColorType { - self.mode().into() - } - - pub fn mode(&self) -> ImageMode { - use DaftImageBuffer::*; - - match self { - L(..) => ImageMode::L, - LA(..) => ImageMode::LA, - RGB(..) => ImageMode::RGB, - RGBA(..) => ImageMode::RGBA, - L16(..) => ImageMode::L16, - LA16(..) => ImageMode::LA16, - RGB16(..) => ImageMode::RGB16, - RGBA16(..) => ImageMode::RGBA16, - RGB32F(..) => ImageMode::RGB32F, - RGBA32F(..) => ImageMode::RGBA32F, - } - } - - pub fn decode(bytes: &[u8]) -> DaftResult { - image::load_from_memory(bytes) - .map(|v| v.into()) - .map_err(|e| DaftError::ValueError(format!("Decoding image from bytes failed: {}", e))) - } - - pub fn encode(&self, image_format: ImageFormat, writer: &mut W) -> DaftResult<()> - where - W: Write + Seek, - { - image::write_buffer_with_format( - writer, - self.as_u8_slice(), - self.width(), - self.height(), - self.color(), - image::ImageFormat::from(image_format), - ) - .map_err(|e| { - DaftError::ValueError(format!( - "Encoding image into file format {} failed: {}", - image_format, e - )) - }) - } - - pub fn fit_to(&self, w: u32, h: u32) -> Self { - // Preserving aspect ratio, resize an image to fit within the specified dimensions. 
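-        // A worked example of the rule below: fitting a 640x480 image into a 128x128 box
-        // gives width_scale = 128/640 = 0.2 and height_scale = 128/480 ≈ 0.267; the smaller
-        // factor wins, so the output is floor(640 * 0.2) x floor(480 * 0.2) = 128x96.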
- let scale_factor = { - let width_scale = w as f64 / self.width() as f64; - let height_scale = h as f64 / self.height() as f64; - width_scale.min(height_scale) - }; - let new_w = self.width() as f64 * scale_factor; - let new_h = self.height() as f64 * scale_factor; - - self.resize(new_w.floor() as u32, new_h.floor() as u32) - } - - pub fn resize(&self, w: u32, h: u32) -> Self { - use DaftImageBuffer::*; - match self { - L(imgbuf) => { - let result = - image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); - DaftImageBuffer::L(image_buffer_vec_to_cow(result)) - } - LA(imgbuf) => { - let result = - image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); - DaftImageBuffer::LA(image_buffer_vec_to_cow(result)) - } - RGB(imgbuf) => { - let result = - image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); - DaftImageBuffer::RGB(image_buffer_vec_to_cow(result)) - } - RGBA(imgbuf) => { - let result = - image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); - DaftImageBuffer::RGBA(image_buffer_vec_to_cow(result)) - } - _ => unimplemented!("Mode {self:?} not implemented"), - } - } - - pub fn crop(&self, bbox: &BBox) -> Self { - // HACK(jay): The `.to_image()` method on SubImage takes in `'static` references for some reason - // This hack will ensure that `&self` adheres to that overly prescriptive bound - let inner = - unsafe { std::mem::transmute::<&DaftImageBuffer<'a>, &DaftImageBuffer<'static>>(self) }; - match inner { - DaftImageBuffer::L(imgbuf) => { - let result = - image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); - DaftImageBuffer::L(image_buffer_vec_to_cow(result)) - } - DaftImageBuffer::LA(imgbuf) => { - let result = - image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); - DaftImageBuffer::LA(image_buffer_vec_to_cow(result)) - } - DaftImageBuffer::RGB(imgbuf) => { - let result = - image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); - DaftImageBuffer::RGB(image_buffer_vec_to_cow(result)) - } - DaftImageBuffer::RGBA(imgbuf) => { - let result = - image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); - DaftImageBuffer::RGBA(image_buffer_vec_to_cow(result)) - } - _ => unimplemented!("Mode {self:?} not implemented"), - } - } - - pub fn into_mode(self, mode: ImageMode) -> Self { - let img: DynamicImage = self.into(); - // I couldn't find a method from the image crate to do this - let img: DynamicImage = match mode { - ImageMode::L => img.into_luma8().into(), - ImageMode::LA => img.into_luma_alpha8().into(), - ImageMode::RGB => img.into_rgb8().into(), - ImageMode::RGBA => img.into_rgba8().into(), - ImageMode::L16 => img.into_luma16().into(), - ImageMode::LA16 => img.into_luma_alpha16().into(), - ImageMode::RGB16 => img.into_rgb16().into(), - ImageMode::RGBA16 => img.into_rgba16().into(), - ImageMode::RGB32F => img.into_rgb32f().into(), - ImageMode::RGBA32F => img.into_rgba32f().into(), - }; - img.into() - } -} - -fn image_buffer_vec_to_cow<'a, P, T>(input: ImageBuffer>) -> ImageBuffer> -where - P: image::Pixel, - Vec: Deref, - T: ToOwned + std::clone::Clone, - [T]: ToOwned, -{ - let h = input.height(); - let w = input.width(); - let owned: Cow<[T]> = input.into_raw().into(); - ImageBuffer::from_raw(w, h, owned).unwrap() -} - -fn image_buffer_cow_to_vec(input: ImageBuffer>) -> ImageBuffer> -where - P: image::Pixel, - Vec: Deref, - T: ToOwned + std::clone::Clone, - [T]: ToOwned, -{ - let h = input.height(); - 
let w = input.width(); - let owned: Vec = input.into_raw().to_vec(); - ImageBuffer::from_raw(w, h, owned).unwrap() -} - -impl<'a> From for DaftImageBuffer<'a> { - fn from(dyn_img: DynamicImage) -> Self { - match dyn_img { - DynamicImage::ImageLuma8(img_buf) => { - DaftImageBuffer::<'a>::L(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageLumaA8(img_buf) => { - DaftImageBuffer::<'a>::LA(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgb8(img_buf) => { - DaftImageBuffer::<'a>::RGB(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgba8(img_buf) => { - DaftImageBuffer::<'a>::RGBA(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageLuma16(img_buf) => { - DaftImageBuffer::<'a>::L16(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageLumaA16(img_buf) => { - DaftImageBuffer::<'a>::LA16(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgb16(img_buf) => { - DaftImageBuffer::<'a>::RGB16(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgba16(img_buf) => { - DaftImageBuffer::<'a>::RGBA16(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgb32F(img_buf) => { - DaftImageBuffer::<'a>::RGB32F(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgba32F(img_buf) => { - DaftImageBuffer::<'a>::RGBA32F(image_buffer_vec_to_cow(img_buf)) - } - _ => unimplemented!("{dyn_img:?} not implemented"), - } - } -} - -impl<'a> From> for DynamicImage { - fn from(daft_buf: DaftImageBuffer<'a>) -> Self { - match daft_buf { - DaftImageBuffer::L(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::LA(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGB(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGBA(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::L16(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::LA16(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGB16(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGBA16(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGB32F(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGBA32F(buf) => image_buffer_cow_to_vec(buf).into(), - } - } -} - -pub struct ImageArraySidecarData { - pub channels: Vec, - pub heights: Vec, - pub widths: Vec, - pub modes: Vec, - pub validity: Option, -} - -pub trait AsImageObj { - fn name(&self) -> &str; - fn len(&self) -> usize; - fn as_image_obj(&self, idx: usize) -> Option>; -} - -pub struct ImageBufferIter<'a, Arr> -where - Arr: AsImageObj, -{ - cursor: usize, - image_array: &'a Arr, -} - -impl<'a, Arr> ImageBufferIter<'a, Arr> -where - Arr: AsImageObj, -{ - pub fn new(image_array: &'a Arr) -> Self { - Self { - cursor: 0usize, - image_array, - } - } -} - -impl<'a, Arr> Iterator for ImageBufferIter<'a, Arr> -where - Arr: AsImageObj, -{ - type Item = Option>; - - fn next(&mut self) -> Option { - if self.cursor >= self.image_array.len() { - None - } else { - let image_obj = self.image_array.as_image_obj(self.cursor); - self.cursor += 1; - Some(image_obj) - } - } -} - -impl ImageArray { - pub fn image_mode(&self) -> &Option { - match self.data_type() { - DataType::Image(mode) => mode, - _ => panic!("Expected dtype to be Image"), - } - } - - pub fn data_array(&self) -> &ListArray { - const IMAGE_DATA_IDX: usize = 0; - let array = self.physical.children.get(IMAGE_DATA_IDX).unwrap(); - array.list().unwrap() - } - - pub fn channel_array(&self) -> &arrow2::array::UInt16Array { - const IMAGE_CHANNEL_IDX: usize = 1; - let array = 
self.physical.children.get(IMAGE_CHANNEL_IDX).unwrap(); - array.u16().unwrap().as_arrow() - } - - pub fn height_array(&self) -> &arrow2::array::UInt32Array { - const IMAGE_HEIGHT_IDX: usize = 2; - let array = self.physical.children.get(IMAGE_HEIGHT_IDX).unwrap(); - array.u32().unwrap().as_arrow() - } - - pub fn width_array(&self) -> &arrow2::array::UInt32Array { - const IMAGE_WIDTH_IDX: usize = 3; - let array = self.physical.children.get(IMAGE_WIDTH_IDX).unwrap(); - array.u32().unwrap().as_arrow() - } - - pub fn mode_array(&self) -> &arrow2::array::UInt8Array { - const IMAGE_MODE_IDX: usize = 4; - let array = self.physical.children.get(IMAGE_MODE_IDX).unwrap(); - array.u8().unwrap().as_arrow() - } - - pub fn from_vecs( - name: &str, - data_type: DataType, - data: Vec, - offsets: Vec, - sidecar_data: ImageArraySidecarData, - ) -> DaftResult { - if data.is_empty() { - return Ok(ImageArray::full_null(name, &data_type, offsets.len() - 1)); - } - let offsets = arrow2::offset::OffsetsBuffer::try_from(offsets)?; - let arrow_dtype: arrow2::datatypes::DataType = T::PRIMITIVE.into(); - if let DataType::Image(Some(mode)) = &data_type { - if mode.get_dtype().to_arrow()? != arrow_dtype { - panic!("Inner value dtype of provided dtype {data_type:?} is inconsistent with inferred value dtype {arrow_dtype:?}"); - } - } - let data_array = ListArray::new( - Field::new("data", DataType::List(Box::new((&arrow_dtype).into()))), - Series::try_from(( - "data", - Box::new(arrow2::array::PrimitiveArray::from_vec(data)) - as Box, - ))?, - offsets, - sidecar_data.validity.clone(), - ); - - Self::from_list_array(name, data_type, data_array, sidecar_data) - } - - pub fn from_list_array( - name: &str, - data_type: DataType, - data_array: ListArray, - sidecar_data: ImageArraySidecarData, - ) -> DaftResult { - let values: Vec = vec![ - data_array.into_series().rename("data"), - UInt16Array::from(( - "channel", - Box::new( - arrow2::array::UInt16Array::from_vec(sidecar_data.channels) - .with_validity(sidecar_data.validity.clone()), - ), - )) - .into_series(), - UInt32Array::from(( - "height", - Box::new( - arrow2::array::UInt32Array::from_vec(sidecar_data.heights) - .with_validity(sidecar_data.validity.clone()), - ), - )) - .into_series(), - UInt32Array::from(( - "width", - Box::new( - arrow2::array::UInt32Array::from_vec(sidecar_data.widths) - .with_validity(sidecar_data.validity.clone()), - ), - )) - .into_series(), - UInt8Array::from(( - "mode", - Box::new( - arrow2::array::UInt8Array::from_vec(sidecar_data.modes) - .with_validity(sidecar_data.validity.clone()), - ), - )) - .into_series(), - ]; - let physical_type = data_type.to_physical(); - let struct_array = StructArray::new( - Field::new(name, physical_type), - values, - sidecar_data.validity, - ); - Ok(ImageArray::new(Field::new(name, data_type), struct_array)) - } - - pub fn encode(&self, image_format: ImageFormat) -> DaftResult { - encode_images(self, image_format) - } - - pub fn resize(&self, w: u32, h: u32) -> DaftResult { - let result = resize_images(self, w, h); - Self::from_daft_image_buffers(self.name(), result.as_slice(), self.image_mode()) - } - - pub fn crop(&self, bboxes: &FixedSizeListArray) -> DaftResult { - let mut bboxes_iterator: Box>> = if bboxes.len() == 1 { - Box::new(std::iter::repeat(bboxes.get(0).map(|bbox| { - BBox::from_u32_arrow_array(bbox.u32().unwrap().data()) - }))) - } else { - Box::new((0..bboxes.len()).map(|i| { - bboxes - .get(i) - .map(|bbox| BBox::from_u32_arrow_array(bbox.u32().unwrap().data())) - })) - }; - let result = 
crop_images(self, &mut bboxes_iterator); - Self::from_daft_image_buffers(self.name(), result.as_slice(), self.image_mode()) - } - - pub fn resize_to_fixed_shape_image_array( - &self, - w: u32, - h: u32, - mode: &ImageMode, - ) -> DaftResult { - let result = resize_images(self, w, h); - FixedShapeImageArray::from_daft_image_buffers(self.name(), result.as_slice(), mode, h, w) - } - - pub fn from_daft_image_buffers( - name: &str, - inputs: &[Option>], - image_mode: &Option, - ) -> DaftResult { - use DaftImageBuffer::*; - let is_all_u8 = inputs - .iter() - .filter_map(|b| b.as_ref()) - .all(|b| matches!(b, L(..) | LA(..) | RGB(..) | RGBA(..))); - assert!(is_all_u8); - - let mut data_ref = Vec::with_capacity(inputs.len()); - let mut heights = Vec::with_capacity(inputs.len()); - let mut channels = Vec::with_capacity(inputs.len()); - let mut modes = Vec::with_capacity(inputs.len()); - let mut widths = Vec::with_capacity(inputs.len()); - let mut offsets = Vec::with_capacity(inputs.len() + 1); - offsets.push(0i64); - let mut validity = arrow2::bitmap::MutableBitmap::with_capacity(inputs.len()); - - for ib in inputs { - validity.push(ib.is_some()); - let (height, width, mode, buffer) = match ib { - Some(ib) => (ib.height(), ib.width(), ib.mode(), ib.as_u8_slice()), - None => (0u32, 0u32, ImageMode::L, &[] as &[u8]), - }; - heights.push(height); - widths.push(width); - modes.push(mode as u8); - channels.push(mode.num_channels()); - data_ref.push(buffer); - offsets.push(offsets.last().unwrap() + buffer.len() as i64); - } - - let data = data_ref.concat(); - let validity: Option = match validity.unset_bits() { - 0 => None, - _ => Some(validity.into()), - }; - Self::from_vecs( - name, - DataType::Image(*image_mode), - data, - offsets, - ImageArraySidecarData { - channels, - heights, - widths, - modes, - validity, - }, - ) - } - - pub fn to_mode(&self, mode: ImageMode) -> DaftResult { - let buffers: Vec> = self - .into_iter() - .map(|img| img.map(|img| img.into_mode(mode))) - .collect(); - Self::from_daft_image_buffers(self.name(), &buffers, &Some(mode)) - } -} - -impl AsImageObj for ImageArray { - fn len(&self) -> usize { - ImageArray::len(self) - } - - fn name(&self) -> &str { - ImageArray::name(self) - } - - fn as_image_obj<'a>(&'a self, idx: usize) -> Option> { - assert!(idx < self.len()); - if !self.physical.is_valid(idx) { - return None; - } - - let da = self.data_array(); - let ca = self.channel_array(); - let ha = self.height_array(); - let wa = self.width_array(); - let ma = self.mode_array(); - - let offsets = da.offsets(); - - let start = *offsets.get(idx).unwrap() as usize; - let end = *offsets.get(idx + 1).unwrap() as usize; - - let values = da - .flat_child - .u8() - .unwrap() - .data() - .as_any() - .downcast_ref::() - .unwrap(); - let slice_data = Cow::Borrowed(&values.values().as_slice()[start..end] as &'a [u8]); - - let c = ca.value(idx); - let h = ha.value(idx); - let w = wa.value(idx); - let m: ImageMode = ImageMode::from_u8(ma.value(idx)).unwrap(); - assert_eq!(m.num_channels(), c); - let result = match m { - ImageMode::L => { - DaftImageBuffer::<'a>::L(ImageBuffer::from_raw(w, h, slice_data).unwrap()) - } - ImageMode::LA => { - DaftImageBuffer::<'a>::LA(ImageBuffer::from_raw(w, h, slice_data).unwrap()) - } - ImageMode::RGB => { - DaftImageBuffer::<'a>::RGB(ImageBuffer::from_raw(w, h, slice_data).unwrap()) - } - ImageMode::RGBA => { - DaftImageBuffer::<'a>::RGBA(ImageBuffer::from_raw(w, h, slice_data).unwrap()) - } - _ => unimplemented!("{m} is currently not implemented!"), - }; - - 
assert_eq!(result.height(), h); - assert_eq!(result.width(), w); - Some(result) - } -} - -impl FixedShapeImageArray { - fn mode(&self) -> ImageMode { - match &self.field.dtype { - DataType::FixedShapeImage(mode, _, _) => *mode, - _ => panic!("FixedShapeImageArray does not have the correct FixedShapeImage dtype"), - } - } - - pub fn from_daft_image_buffers( - name: &str, - inputs: &[Option>], - image_mode: &ImageMode, - height: u32, - width: u32, - ) -> DaftResult { - use DaftImageBuffer::*; - let is_all_u8 = inputs - .iter() - .filter_map(|b| b.as_ref()) - .all(|b| matches!(b, L(..) | LA(..) | RGB(..) | RGBA(..))); - assert!(is_all_u8); - - let num_channels = image_mode.num_channels(); - let mut data_ref = Vec::with_capacity(inputs.len()); - let mut validity = arrow2::bitmap::MutableBitmap::with_capacity(inputs.len()); - let list_size = (height * width * num_channels as u32) as usize; - let null_list = vec![0u8; list_size]; - for ib in inputs.iter() { - validity.push(ib.is_some()); - let buffer = match ib { - Some(ib) => ib.as_u8_slice(), - None => null_list.as_slice(), - }; - data_ref.push(buffer) - } - let data = data_ref.concat(); - let validity: Option = match validity.unset_bits() { - 0 => None, - _ => Some(validity.into()), - }; - - let arrow_dtype = arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new( - "data", - arrow2::datatypes::DataType::UInt8, - true, - )), - list_size, - ); - let arrow_array = Box::new(arrow2::array::FixedSizeListArray::new( - arrow_dtype.clone(), - Box::new(arrow2::array::PrimitiveArray::from_vec(data)), - validity, - )); - let physical_array = FixedSizeListArray::from_arrow( - Arc::new(Field::new(name, (&arrow_dtype).into())), - arrow_array, - )?; - let logical_dtype = DataType::FixedShapeImage(*image_mode, height, width); - Ok(Self::new(Field::new(name, logical_dtype), physical_array)) - } - - pub fn encode(&self, image_format: ImageFormat) -> DaftResult { - encode_images(self, image_format) - } - - pub fn resize(&self, w: u32, h: u32) -> DaftResult { - let result = resize_images(self, w, h); - match &self.data_type() { - DataType::FixedShapeImage(mode, _, _) => Self::from_daft_image_buffers(self.name(), result.as_slice(), mode, h, w), - dt => panic!("FixedShapeImageArray should always have DataType::FixedShapeImage() as it's dtype, but got {}", dt), - } - } - - pub fn crop(&self, bboxes: &FixedSizeListArray) -> DaftResult { - let mut bboxes_iterator: Box>> = if bboxes.len() == 1 { - Box::new(std::iter::repeat(bboxes.get(0).map(|bbox| { - BBox::from_u32_arrow_array(bbox.u32().unwrap().data()) - }))) - } else { - Box::new((0..bboxes.len()).map(|i| { - bboxes - .get(i) - .map(|bbox| BBox::from_u32_arrow_array(bbox.u32().unwrap().data())) - })) - }; - let result = crop_images(self, &mut bboxes_iterator); - ImageArray::from_daft_image_buffers(self.name(), result.as_slice(), &Some(self.mode())) - } - - pub fn to_mode(&self, mode: ImageMode) -> DaftResult { - let buffers: Vec> = self - .into_iter() - .map(|img| img.map(|img| img.into_mode(mode))) - .collect(); - - let (height, width) = match self.data_type() { - DataType::FixedShapeImage(_, h, w) => (h, w), - _ => unreachable!("self should always be a FixedShapeImage"), - }; - Self::from_daft_image_buffers(self.name(), &buffers, &mode, *height, *width) - } -} - -impl AsImageObj for FixedShapeImageArray { - fn len(&self) -> usize { - FixedShapeImageArray::len(self) - } - - fn name(&self) -> &str { - FixedShapeImageArray::name(self) - } - - fn as_image_obj<'a>(&'a self, idx: usize) 
-> Option> { - assert!(idx < self.len()); - if !self.physical.is_valid(idx) { - return None; - } - - match self.data_type() { - DataType::FixedShapeImage(mode, height, width) => { - let arrow_array = self.physical.flat_child.downcast::().unwrap().as_arrow(); - let num_channels = mode.num_channels(); - let size = height * width * num_channels as u32; - let start = idx * size as usize; - let end = (idx + 1) * size as usize; - let slice_data = Cow::Borrowed(&arrow_array.values().as_slice()[start..end] as &'a [u8]); - let result = match mode { - ImageMode::L => { - DaftImageBuffer::<'a>::L(ImageBuffer::from_raw(*width, *height, slice_data).unwrap()) - } - ImageMode::LA => { - DaftImageBuffer::<'a>::LA(ImageBuffer::from_raw(*width, *height, slice_data).unwrap()) - } - ImageMode::RGB => { - DaftImageBuffer::<'a>::RGB(ImageBuffer::from_raw(*width, *height, slice_data).unwrap()) - } - ImageMode::RGBA => { - DaftImageBuffer::<'a>::RGBA(ImageBuffer::from_raw(*width, *height, slice_data).unwrap()) - } - _ => unimplemented!("{mode} is currently not implemented!"), - }; - - assert_eq!(result.height(), *height); - assert_eq!(result.width(), *width); - Some(result) - } - dt => panic!("FixedShapeImageArray should always have DataType::FixedShapeImage() as it's dtype, but got {}", dt), - } - } -} - -impl<'a, T> IntoIterator for &'a LogicalArray -where - T: DaftImageryType, - LogicalArray: AsImageObj, -{ - type Item = Option>; - type IntoIter = ImageBufferIter<'a, LogicalArray>; - - fn into_iter(self) -> Self::IntoIter { - ImageBufferIter::new(self) - } -} - -impl BinaryArray { - pub fn image_decode( - &self, - raise_error_on_failure: bool, - mode: Option, - ) -> DaftResult { - let arrow_array = self - .data() - .as_any() - .downcast_ref::>() - .unwrap(); - let mut img_bufs = Vec::>::with_capacity(arrow_array.len()); - let mut cached_dtype: Option = None; - // Load images from binary buffers. - // Confirm that all images have the same value dtype. - for (index, row) in arrow_array.iter().enumerate() { - let mut img_buf = match row.map(DaftImageBuffer::decode).transpose() { - Ok(val) => val, - Err(err) => { - if raise_error_on_failure { - return Err(err); - } else { - log::warn!( - "Error occurred during image decoding at index: {index} {} (falling back to Null)", - err - ); - None - } - } - }; - if let Some(mode) = mode { - img_buf = img_buf.map(|buf| buf.into_mode(mode)); - } - let dtype = img_buf.as_ref().map(|im| im.mode().get_dtype()); - match (dtype.as_ref(), cached_dtype.as_ref()) { - (Some(t1), Some(t2)) => { - if t1 != t2 { - return Err(DaftError::ValueError(format!("All images in a column must have the same dtype, but got: {:?} and {:?}", t1, t2))); - } - } - (Some(t1), None) => { - cached_dtype = Some(t1.clone()); - } - (None, _) => {} - } - img_bufs.push(img_buf); - } - // Fall back to UInt8 dtype if series is all nulls. 
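-        // (If every row was null, no image was decoded and no value dtype was ever
-        // observed, so default to UInt8.)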
- let cached_dtype = cached_dtype.unwrap_or(DataType::UInt8); - match cached_dtype { - DataType::UInt8 => Ok(ImageArray::from_daft_image_buffers(self.name(), img_bufs.as_slice(), &mode)?), - _ => unimplemented!("Decoding images of dtype {cached_dtype:?} is not supported, only uint8 images are supported."), - } - } -} - -fn encode_images<'a, Arr>(images: &'a Arr, image_format: ImageFormat) -> DaftResult -where - Arr: AsImageObj, - &'a Arr: IntoIterator>, IntoIter = ImageBufferIter<'a, Arr>>, -{ - let arrow_array = match image_format { - ImageFormat::TIFF => { - // NOTE: A single writer/buffer can't be used for TIFF files because the encoder will overwrite the - // IFD offset for the first image instead of writing it for all subsequent images, producing corrupted - // TIFF files. We work around this by writing out a new buffer for each image. - // TODO(Clark): Fix this in the tiff crate. - let values = images - .into_iter() - .map(|img| { - img.map(|img| { - let buf = Vec::new(); - let mut writer: CountingWriter> = - std::io::BufWriter::new(std::io::Cursor::new(buf)).into(); - img.encode(image_format, &mut writer)?; - // NOTE: BufWriter::into_inner() will flush the buffer. - Ok(writer - .into_inner() - .into_inner() - .map_err(|e| { - DaftError::ValueError(format!( - "Encoding image into file format {} failed: {}", - image_format, e - )) - })? - .into_inner()) - }) - .transpose() - }) - .collect::>>()?; - arrow2::array::BinaryArray::::from_iter(values) - } - _ => { - let mut offsets = Vec::with_capacity(images.len() + 1); - offsets.push(0i64); - let mut validity = arrow2::bitmap::MutableBitmap::with_capacity(images.len()); - let buf = Vec::new(); - let mut writer: CountingWriter> = - std::io::BufWriter::new(std::io::Cursor::new(buf)).into(); - images - .into_iter() - .map(|img| { - match img { - Some(img) => { - img.encode(image_format, &mut writer)?; - offsets.push(writer.count() as i64); - validity.push(true); - } - None => { - offsets.push(*offsets.last().unwrap()); - validity.push(false); - } - } - Ok(()) - }) - .collect::>>()?; - // NOTE: BufWriter::into_inner() will flush the buffer. - let values = writer - .into_inner() - .into_inner() - .map_err(|e| { - DaftError::ValueError(format!( - "Encoding image into file format {} failed: {}", - image_format, e - )) - })? 
- .into_inner(); - let encoded_data: arrow2::buffer::Buffer = values.into(); - let offsets_buffer = arrow2::offset::OffsetsBuffer::try_from(offsets)?; - let validity: Option = match validity.unset_bits() { - 0 => None, - _ => Some(validity.into()), - }; - arrow2::array::BinaryArray::::new( - arrow2::datatypes::DataType::LargeBinary, - offsets_buffer, - encoded_data, - validity, - ) - } - }; - BinaryArray::new( - Field::new(images.name(), arrow_array.data_type().into()).into(), - arrow_array.boxed(), - ) -} - -fn resize_images<'a, Arr>(images: &'a Arr, w: u32, h: u32) -> Vec> -where - Arr: AsImageObj, - &'a Arr: IntoIterator>, IntoIter = ImageBufferIter<'a, Arr>>, -{ - images - .into_iter() - .map(|img| img.map(|img| img.resize(w, h))) - .collect::>() -} - -fn crop_images<'a, Arr>( - images: &'a Arr, - bboxes: &mut dyn Iterator>, -) -> Vec>> -where - Arr: AsImageObj, - &'a Arr: IntoIterator>, IntoIter = ImageBufferIter<'a, Arr>>, -{ - images - .into_iter() - .zip(bboxes) - .map(|(img, bbox)| match (img, bbox) { - (None, _) | (_, None) => None, - (Some(img), Some(bbox)) => Some(img.crop(&bbox)), - }) - .collect::>() -} diff --git a/src/daft-core/src/array/ops/mod.rs b/src/daft-core/src/array/ops/mod.rs index fd6dd056b1..faeaab5fae 100644 --- a/src/daft-core/src/array/ops/mod.rs +++ b/src/daft-core/src/array/ops/mod.rs @@ -29,7 +29,6 @@ mod hash; mod hll_merge; mod hll_sketch; mod if_else; -pub(crate) mod image; mod is_in; mod json; mod len; diff --git a/src/daft-core/src/array/ops/repr.rs b/src/daft-core/src/array/ops/repr.rs index f48dc7d000..db8d4e1a3e 100644 --- a/src/daft-core/src/array/ops/repr.rs +++ b/src/daft-core/src/array/ops/repr.rs @@ -1,5 +1,3 @@ -use base64::Engine; - use crate::{ array::{DataArray, FixedSizeListArray, ListArray, StructArray}, datatypes::DataType, @@ -17,8 +15,6 @@ use crate::{ }; use common_error::DaftResult; -use super::image::AsImageObj; - // Default implementation of str_value: format the value with the given format string. macro_rules! 
impl_array_str_value { ($ArrayT:ty, $fmt:expr) => { @@ -418,51 +414,34 @@ where } } -impl ImageArray { - pub fn html_value(&self, idx: usize) -> String { - let maybe_image = self.as_image_obj(idx); - let str_val = self.str_value(idx).unwrap(); - - match maybe_image { - None => "None".to_string(), - Some(image) => { - let thumb = image.fit_to(128, 128); - let mut bytes: Vec = vec![]; - let mut writer = std::io::BufWriter::new(std::io::Cursor::new(&mut bytes)); - thumb.encode(ImageFormat::JPEG, &mut writer).unwrap(); - drop(writer); - format!( - "\"{}\"", - base64::engine::general_purpose::STANDARD.encode(&mut bytes), - str_val, - ) - } - } - } -} - -impl FixedShapeImageArray { - pub fn html_value(&self, idx: usize) -> String { - let maybe_image = self.as_image_obj(idx); - let str_val = self.str_value(idx).unwrap(); - - match maybe_image { - None => "None".to_string(), - Some(image) => { - let thumb = image.fit_to(128, 128); - let mut bytes: Vec = vec![]; - let mut writer = std::io::BufWriter::new(std::io::Cursor::new(&mut bytes)); - thumb.encode(ImageFormat::JPEG, &mut writer).unwrap(); - drop(writer); - format!( - "\"{}\"", - base64::engine::general_purpose::STANDARD.encode(&mut bytes), - str_val, - ) - } - } - } -} +// impl ImageArray { +// pub fn html_value(&self, idx: usize) -> String { + +// } +// } + +// impl FixedShapeImageArray { +// pub fn html_value(&self, idx: usize) -> String { +// let maybe_image = self.as_image_obj(idx); +// let str_val = self.str_value(idx).unwrap(); + +// match maybe_image { +// None => "None".to_string(), +// Some(image) => { +// let thumb = image.fit_to(128, 128); +// let mut bytes: Vec = vec![]; +// let mut writer = std::io::BufWriter::new(std::io::Cursor::new(&mut bytes)); +// thumb.encode(ImageFormat::JPEG, &mut writer).unwrap(); +// drop(writer); +// format!( +// "\"{}\"", +// base64::engine::general_purpose::STANDARD.encode(&mut bytes), +// str_val, +// ) +// } +// } +// } +// } impl FixedShapeTensorArray { pub fn html_value(&self, idx: usize) -> String { diff --git a/src/daft-core/src/datatypes/image_mode.rs b/src/daft-core/src/datatypes/image_mode.rs index b6ee7afd66..7fbaf1283a 100644 --- a/src/daft-core/src/datatypes/image_mode.rs +++ b/src/daft-core/src/datatypes/image_mode.rs @@ -123,51 +123,51 @@ impl ImageMode { } } -impl From for image::ColorType { - fn from(image_mode: ImageMode) -> image::ColorType { - use image::ColorType; - use ImageMode::*; - - match image_mode { - L => ColorType::L8, - LA => ColorType::La8, - RGB => ColorType::Rgb8, - RGBA => ColorType::Rgba8, - L16 => ColorType::L16, - LA16 => ColorType::La16, - RGB16 => ColorType::Rgb16, - RGBA16 => ColorType::Rgba16, - RGB32F => ColorType::Rgb32F, - RGBA32F => ColorType::Rgba32F, - } - } -} - -impl TryFrom for ImageMode { - type Error = DaftError; - - fn try_from(color: image::ColorType) -> DaftResult { - use image::ColorType; - use ImageMode::*; - - match color { - ColorType::L8 => Ok(L), - ColorType::La8 => Ok(LA), - ColorType::Rgb8 => Ok(RGB), - ColorType::Rgba8 => Ok(RGBA), - ColorType::L16 => Ok(L16), - ColorType::La16 => Ok(LA16), - ColorType::Rgb16 => Ok(RGB16), - ColorType::Rgba16 => Ok(RGBA16), - ColorType::Rgb32F => Ok(RGB32F), - ColorType::Rgba32F => Ok(RGBA32F), - _ => Err(DaftError::ValueError(format!( - "Color type {:?} is not supported.", - color - ))), - } - } -} +// impl From for image::ColorType { +// fn from(image_mode: ImageMode) -> image::ColorType { +// use image::ColorType; +// use ImageMode::*; + +// match image_mode { +// L => ColorType::L8, +// LA => 
ColorType::La8, +// RGB => ColorType::Rgb8, +// RGBA => ColorType::Rgba8, +// L16 => ColorType::L16, +// LA16 => ColorType::La16, +// RGB16 => ColorType::Rgb16, +// RGBA16 => ColorType::Rgba16, +// RGB32F => ColorType::Rgb32F, +// RGBA32F => ColorType::Rgba32F, +// } +// } +// } + +// impl TryFrom for ImageMode { +// type Error = DaftError; + +// fn try_from(color: image::ColorType) -> DaftResult { +// use image::ColorType; +// use ImageMode::*; + +// match color { +// ColorType::L8 => Ok(L), +// ColorType::La8 => Ok(LA), +// ColorType::Rgb8 => Ok(RGB), +// ColorType::Rgba8 => Ok(RGBA), +// ColorType::L16 => Ok(L16), +// ColorType::La16 => Ok(LA16), +// ColorType::Rgb16 => Ok(RGB16), +// ColorType::Rgba16 => Ok(RGBA16), +// ColorType::Rgb32F => Ok(RGB32F), +// ColorType::Rgba32F => Ok(RGBA32F), +// _ => Err(DaftError::ValueError(format!( +// "Color type {:?} is not supported.", +// color +// ))), +// } +// } +// } impl FromStr for ImageMode { type Err = DaftError; diff --git a/src/daft-core/src/python/series.rs b/src/daft-core/src/python/series.rs index 59a6682d56..562ff97321 100644 --- a/src/daft-core/src/python/series.rs +++ b/src/daft-core/src/python/series.rs @@ -673,40 +673,6 @@ impl PySeries { Ok(self.series.map_get(&key.series)?.into()) } - pub fn image_decode( - &self, - raise_error_on_failure: bool, - mode: Option, - ) -> PyResult { - Ok(self - .series - .image_decode(raise_error_on_failure, mode)? - .into()) - } - - pub fn image_encode(&self, image_format: ImageFormat) -> PyResult { - Ok(self.series.image_encode(image_format)?.into()) - } - - pub fn image_resize(&self, w: i64, h: i64) -> PyResult { - if w < 0 { - return Err(PyValueError::new_err(format!( - "width can not be negative: {w}" - ))); - } - if h < 0 { - return Err(PyValueError::new_err(format!( - "height can not be negative: {h}" - ))); - } - - Ok(self.series.image_resize(w as u32, h as u32)?.into()) - } - - pub fn image_to_mode(&self, mode: &ImageMode) -> PyResult { - Ok(self.series.image_to_mode(*mode)?.into()) - } - pub fn if_else(&self, other: &Self, predicate: &Self) -> PyResult { Ok(self .series diff --git a/src/daft-core/src/series/array_impl/data_array.rs b/src/daft-core/src/series/array_impl/data_array.rs index e423e17289..b998e4f98c 100644 --- a/src/daft-core/src/series/array_impl/data_array.rs +++ b/src/daft-core/src/series/array_impl/data_array.rs @@ -124,9 +124,9 @@ macro_rules! impl_series_like_for_data_array { fn str_value(&self, idx: usize) -> DaftResult { self.0.str_value(idx) } - fn html_value(&self, idx: usize) -> String { - self.0.html_value(idx) - } + // fn html_value(&self, idx: usize) -> String { + // self.0.html_value(idx) + // } fn take(&self, idx: &Series) -> DaftResult { with_match_integer_daft_types!(idx.data_type(), |$S| { Ok(self diff --git a/src/daft-core/src/series/array_impl/logical_array.rs b/src/daft-core/src/series/array_impl/logical_array.rs index e1dbaa93c5..ea72e9982f 100644 --- a/src/daft-core/src/series/array_impl/logical_array.rs +++ b/src/daft-core/src/series/array_impl/logical_array.rs @@ -125,9 +125,9 @@ macro_rules! 
impl_series_like_for_logical_array { self.0.str_value(idx) } - fn html_value(&self, idx: usize) -> String { - self.0.html_value(idx) - } + // fn html_value(&self, idx: usize) -> String { + // self.0.html_value(idx) + // } fn take(&self, idx: &Series) -> DaftResult { with_match_integer_daft_types!(idx.data_type(), |$S| { @@ -227,8 +227,8 @@ impl_series_like_for_logical_array!(TimeArray); impl_series_like_for_logical_array!(DurationArray); impl_series_like_for_logical_array!(TimestampArray); impl_series_like_for_logical_array!(ImageArray); +impl_series_like_for_logical_array!(FixedShapeImageArray); impl_series_like_for_logical_array!(TensorArray); impl_series_like_for_logical_array!(EmbeddingArray); -impl_series_like_for_logical_array!(FixedShapeImageArray); impl_series_like_for_logical_array!(FixedShapeTensorArray); impl_series_like_for_logical_array!(MapArray); diff --git a/src/daft-core/src/series/array_impl/nested_array.rs b/src/daft-core/src/series/array_impl/nested_array.rs index b9da4c8dc9..8092d4570d 100644 --- a/src/daft-core/src/series/array_impl/nested_array.rs +++ b/src/daft-core/src/series/array_impl/nested_array.rs @@ -148,9 +148,9 @@ macro_rules! impl_series_like_for_nested_arrays { self.0.str_value(idx) } - fn html_value(&self, idx: usize) -> String { - self.0.html_value(idx) - } + // fn html_value(&self, idx: usize) -> String { + // self.0.html_value(idx) + // } fn add(&self, rhs: &Series) -> DaftResult { SeriesBinaryOps::add(self, rhs) diff --git a/src/daft-core/src/series/ops/image.rs b/src/daft-core/src/series/ops/image.rs deleted file mode 100644 index 11dc55ffe8..0000000000 --- a/src/daft-core/src/series/ops/image.rs +++ /dev/null @@ -1,101 +0,0 @@ -use crate::datatypes::logical::{FixedShapeImageArray, ImageArray}; -use crate::datatypes::{DataType, ImageFormat, ImageMode}; - -use crate::series::{IntoSeries, Series}; -use common_error::{DaftError, DaftResult}; - -impl Series { - pub fn image_decode( - &self, - raise_error_on_failure: bool, - mode: Option, - ) -> DaftResult { - match self.data_type() { - DataType::Binary => Ok(self.binary()?.image_decode(raise_error_on_failure, mode)?.into_series()), - dtype => Err(DaftError::ValueError(format!( - "Decoding in-memory data into images is only supported for binary arrays, but got {}", dtype - ))), - } - } - - pub fn image_encode(&self, image_format: ImageFormat) -> DaftResult { - match self.data_type() { - DataType::Image(..) => Ok(self - .downcast::()? - .encode(image_format)? - .into_series()), - DataType::FixedShapeImage(..) => Ok(self - .downcast::()? - .encode(image_format)? - .into_series()), - dtype => Err(DaftError::ValueError(format!( - "Encoding images into bytes is only supported for image arrays, but got {}", - dtype - ))), - } - } - - pub fn image_resize(&self, w: u32, h: u32) -> DaftResult { - match self.data_type() { - DataType::Image(mode) => { - let array = self.downcast::()?; - match mode { - // If the image mode is specified at the type-level (and is therefore guaranteed to be consistent - // across all images across all partitions), store the resized image in a fixed shape image array, - // since we'll have homogeneous modes, heights, and widths after resizing. - Some(mode) => Ok(array - .resize_to_fixed_shape_image_array(w, h, mode)? - .into_series()), - None => Ok(array.resize(w, h)?.into_series()), - } - } - DataType::FixedShapeImage(..) => Ok(self - .downcast::()? - .resize(w, h)? - .into_series()), - _ => Err(DaftError::ValueError(format!( - "datatype: {} does not support Image Resize. 
Occurred while resizing Series: {}", - self.data_type(), - self.name() - ))), - } - } - - pub fn image_crop(&self, bbox: &Series) -> DaftResult { - let bbox_type = DataType::FixedSizeList(Box::new(DataType::UInt32), 4); - let bbox = bbox.cast(&bbox_type)?; - let bbox = bbox.fixed_size_list()?; - - match &self.data_type() { - DataType::Image(_) => self - .downcast::()? - .crop(bbox) - .map(|arr| arr.into_series()), - DataType::FixedShapeImage(..) => self - .fixed_size_image()? - .crop(bbox) - .map(|arr| arr.into_series()), - dt => Err(DaftError::ValueError(format!( - "Expected input to crop to be an Image type, but received: {}", - dt - ))), - } - } - - pub fn image_to_mode(&self, mode: ImageMode) -> DaftResult { - match &self.data_type() { - DataType::Image(_) => self - .downcast::()? - .to_mode(mode) - .map(|arr| arr.into_series()), - DataType::FixedShapeImage(..) => self - .fixed_size_image()? - .to_mode(mode) - .map(|arr| arr.into_series()), - dt => Err(DaftError::ValueError(format!( - "Expected input to crop to be an Image type, but received: {}", - dt - ))), - } - } -} diff --git a/src/daft-core/src/series/ops/mod.rs b/src/daft-core/src/series/ops/mod.rs index 6f1b6e759e..66f7b9b91e 100644 --- a/src/daft-core/src/series/ops/mod.rs +++ b/src/daft-core/src/series/ops/mod.rs @@ -21,7 +21,6 @@ pub mod floor; pub mod groups; pub mod hash; pub mod if_else; -pub mod image; pub mod is_in; pub mod json; pub mod len; diff --git a/src/daft-core/src/series/series_like.rs b/src/daft-core/src/series/series_like.rs index 980e6784fb..56052c81b3 100644 --- a/src/daft-core/src/series/series_like.rs +++ b/src/daft-core/src/series/series_like.rs @@ -34,7 +34,7 @@ pub trait SeriesLike: Send + Sync + Any + std::fmt::Debug { fn slice(&self, start: usize, end: usize) -> DaftResult; fn take(&self, idx: &Series) -> DaftResult; fn str_value(&self, idx: usize) -> DaftResult; - fn html_value(&self, idx: usize) -> String; + // fn html_value(&self, idx: usize) -> String; fn add(&self, rhs: &Series) -> DaftResult; fn sub(&self, rhs: &Series) -> DaftResult; fn mul(&self, rhs: &Series) -> DaftResult; diff --git a/src/daft-functions/Cargo.toml b/src/daft-functions/Cargo.toml index 92b2d1bd1a..bd2e1f9d05 100644 --- a/src/daft-functions/Cargo.toml +++ b/src/daft-functions/Cargo.toml @@ -6,6 +6,7 @@ common-io-config = {path = "../common/io-config", default-features = false} daft-core = {path = "../daft-core", default-features = false} daft-dsl = {path = "../daft-dsl", default-features = false} daft-io = {path = "../daft-io", default-features = false} +daft-image = {path = "../daft-image", default-features = false} futures = {workspace = true} pyo3 = {workspace = true, optional = true} tiktoken-rs = {workspace = true} @@ -22,6 +23,7 @@ python = [ "common-error/python", "daft-core/python", "daft-io/python", + "daft-image/python", "common-io-config/python" ] diff --git a/src/daft-functions/src/image/crop.rs b/src/daft-functions/src/image/crop.rs index d7ed7d6665..3e51f9e6c0 100644 --- a/src/daft-functions/src/image/crop.rs +++ b/src/daft-functions/src/image/crop.rs @@ -71,7 +71,7 @@ impl ScalarUDF for ImageCrop { fn evaluate(&self, inputs: &[Series]) -> DaftResult { match inputs { - [input, bbox] => input.image_crop(bbox), + [input, bbox] => daft_image::series::crop(input, bbox), _ => Err(DaftError::ValueError(format!( "Expected 2 input args, got {}", inputs.len() diff --git a/src/daft-functions/src/image/decode.rs b/src/daft-functions/src/image/decode.rs index 0229102ea5..86b567a780 100644 --- 
a/src/daft-functions/src/image/decode.rs
+++ b/src/daft-functions/src/image/decode.rs
@@ -68,7 +68,7 @@ impl ScalarUDF for ImageDecode {
     fn evaluate(&self, inputs: &[Series]) -> DaftResult<Series> {
         let raise_error_on_failure = self.raise_on_error;
         match inputs {
-            [input] => input.image_decode(raise_error_on_failure, self.mode),
+            [input] => daft_image::series::decode(input, raise_error_on_failure, self.mode),
             _ => Err(DaftError::ValueError(format!(
                 "Expected 1 input arg, got {}",
                 inputs.len()
diff --git a/src/daft-functions/src/image/encode.rs b/src/daft-functions/src/image/encode.rs
index cff7bc9f57..b12a7b3354 100644
--- a/src/daft-functions/src/image/encode.rs
+++ b/src/daft-functions/src/image/encode.rs
@@ -53,7 +53,7 @@ impl ScalarUDF for ImageEncode {
     fn evaluate(&self, inputs: &[Series]) -> DaftResult<Series> {
         match inputs {
-            [input] => input.image_encode(self.image_format),
+            [input] => daft_image::series::encode(input, self.image_format),
             _ => Err(DaftError::ValueError(format!(
                 "Expected 1 input arg, got {}",
                 inputs.len()
diff --git a/src/daft-functions/src/image/resize.rs b/src/daft-functions/src/image/resize.rs
index d71a46a30c..9620fe7aed 100644
--- a/src/daft-functions/src/image/resize.rs
+++ b/src/daft-functions/src/image/resize.rs
@@ -51,7 +51,7 @@ impl ScalarUDF for ImageResize {
     fn evaluate(&self, inputs: &[Series]) -> DaftResult<Series> {
         match inputs {
-            [input] => input.image_resize(self.width, self.height),
+            [input] => daft_image::series::resize(input, self.width, self.height),
             _ => Err(DaftError::ValueError(format!(
                 "Expected 1 input arg, got {}",
                 inputs.len()
diff --git a/src/daft-functions/src/image/to_mode.rs b/src/daft-functions/src/image/to_mode.rs
index 625bb5f71b..1cef26389d 100644
--- a/src/daft-functions/src/image/to_mode.rs
+++ b/src/daft-functions/src/image/to_mode.rs
@@ -53,7 +53,7 @@ impl ScalarUDF for ImageToMode {
     fn evaluate(&self, inputs: &[Series]) -> DaftResult<Series> {
         match inputs {
-            [input] => input.image_to_mode(self.mode),
+            [input] => daft_image::series::to_mode(input, self.mode),
             _ => Err(DaftError::ValueError(format!(
                 "Expected 1 input arg, got {}",
                 inputs.len()
diff --git a/src/daft-image/Cargo.toml b/src/daft-image/Cargo.toml
new file mode 100644
index 0000000000..91ae0c5854
--- /dev/null
+++ b/src/daft-image/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "daft-image"
+edition.workspace = true
+version.workspace = true
+
+[dependencies]
+arrow2 = {workspace = true}
+base64.workspace = true
+common-error = {path = "../common/error", default-features = false}
+daft-core = {path = "../daft-core", default-features = false}
+log = {workspace = true}
+num-traits = "0.2.19"
+pyo3 = {workspace = true, optional = true}
+serde = {workspace = true}
+
+[dependencies.image]
+default-features = false
+features = ["gif", "jpeg", "ico", "png", "tiff", "webp", "bmp", "hdr"]
+version = "0.24.7"
+
+[features]
+python = [
+  "dep:pyo3",
+  "common-error/python"
+]
diff --git a/src/daft-image/src/counting_writer.rs b/src/daft-image/src/counting_writer.rs
new file mode 100644
index 0000000000..d01e65b3d1
--- /dev/null
+++ b/src/daft-image/src/counting_writer.rs
@@ -0,0 +1,45 @@
+use std::io::{Seek, SeekFrom, Write};
+
+type IOResult<T = (), E = std::io::Error> = std::result::Result<T, E>;
+
+/// A wrapper of a writer that tracks the number of bytes successfully written.
+pub struct CountingWriter<W> {
+    inner: W,
+    count: u64,
+}
+
+impl<W> CountingWriter<W> {
+    /// The number of bytes successfully written so far.
+    pub fn count(&self) -> u64 {
+        self.count
+    }
+
+    /// Extracts the inner writer, discarding this wrapper.
+    pub fn into_inner(self) -> W {
+        self.inner
+    }
+}
+
+impl<W> From<W> for CountingWriter<W> {
+    fn from(inner: W) -> Self {
+        Self { inner, count: 0 }
+    }
+}
+
+impl<W: Write> Write for CountingWriter<W> {
+    fn write(&mut self, buf: &[u8]) -> IOResult<usize> {
+        let written = self.inner.write(buf)?;
+        self.count += written as u64;
+        Ok(written)
+    }
+
+    fn flush(&mut self) -> IOResult {
+        self.inner.flush()
+    }
+}
+
+impl<W: Seek> Seek for CountingWriter<W> {
+    fn seek(&mut self, pos: SeekFrom) -> IOResult<u64> {
+        self.inner.seek(pos)
+    }
+}
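The non-TIFF branch of `encode_images` in kernel.rs (further down in this patch) reads `count()` after each encoded image to derive per-row offsets into one shared buffer. A minimal usage sketch, assuming the `CountingWriter` defined above is in scope (the `main` scaffold is illustrative only):

    use std::io::{Cursor, Write};

    fn main() -> std::io::Result<()> {
        // Wrap any writer via the blanket `From` impl; the count starts at zero.
        let mut writer: CountingWriter<Cursor<Vec<u8>>> = Cursor::new(Vec::new()).into();
        writer.write_all(b"hello")?;
        // kernel.rs records this running count as a binary-array offset.
        assert_eq!(writer.count(), 5);
        Ok(())
    }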
diff --git a/src/daft-image/src/image_buffer.rs b/src/daft-image/src/image_buffer.rs
new file mode 100644
index 0000000000..d51ff32e58
--- /dev/null
+++ b/src/daft-image/src/image_buffer.rs
@@ -0,0 +1,307 @@
+use common_error::{DaftError, DaftResult};
+use daft_core::array::image_array::BBox;
+use daft_core::datatypes::prelude::*;
+use image::{ColorType, DynamicImage, ImageBuffer};
+use image::{Luma, LumaA, Rgb, Rgba};
+use std::borrow::Cow;
+use std::io::{Seek, Write};
+use std::ops::Deref;
+
+#[allow(clippy::upper_case_acronyms, dead_code)]
+#[derive(Debug)]
+pub enum DaftImageBuffer<'a> {
+    L(ImageBuffer<Luma<u8>, Cow<'a, [u8]>>),
+    LA(ImageBuffer<LumaA<u8>, Cow<'a, [u8]>>),
+    RGB(ImageBuffer<Rgb<u8>, Cow<'a, [u8]>>),
+    RGBA(ImageBuffer<Rgba<u8>, Cow<'a, [u8]>>),
+    L16(ImageBuffer<Luma<u16>, Cow<'a, [u16]>>),
+    LA16(ImageBuffer<LumaA<u16>, Cow<'a, [u16]>>),
+    RGB16(ImageBuffer<Rgb<u16>, Cow<'a, [u16]>>),
+    RGBA16(ImageBuffer<Rgba<u16>, Cow<'a, [u16]>>),
+    RGB32F(ImageBuffer<Rgb<f32>, Cow<'a, [f32]>>),
+    RGBA32F(ImageBuffer<Rgba<f32>, Cow<'a, [f32]>>),
+}
+
+macro_rules! with_method_on_image_buffer {
+    (
+    $key_type:expr, $method: ident
+) => {{
+        use DaftImageBuffer::*;
+
+        match $key_type {
+            L(img) => img.$method(),
+            LA(img) => img.$method(),
+            RGB(img) => img.$method(),
+            RGBA(img) => img.$method(),
+            L16(img) => img.$method(),
+            LA16(img) => img.$method(),
+            RGB16(img) => img.$method(),
+            RGBA16(img) => img.$method(),
+            RGB32F(img) => img.$method(),
+            RGBA32F(img) => img.$method(),
+        }
+    }};
+}
+
+impl<'a> DaftImageBuffer<'a> {
+    pub fn from_raw(
+        mode: &ImageMode,
+        width: u32,
+        height: u32,
+        data: Cow<'a, [u8]>,
+    ) -> DaftImageBuffer<'a> {
+        use DaftImageBuffer::*;
+        match mode {
+            ImageMode::L => L(ImageBuffer::from_raw(width, height, data).unwrap()),
+            ImageMode::LA => LA(ImageBuffer::from_raw(width, height, data).unwrap()),
+            ImageMode::RGB => RGB(ImageBuffer::from_raw(width, height, data).unwrap()),
+            ImageMode::RGBA => RGBA(ImageBuffer::from_raw(width, height, data).unwrap()),
+            _ => unimplemented!("{mode} is currently not implemented!"),
+        }
+    }
+    pub fn height(&self) -> u32 {
+        with_method_on_image_buffer!(self, height)
+    }
+
+    pub fn width(&self) -> u32 {
+        with_method_on_image_buffer!(self, width)
+    }
+
+    pub fn as_u8_slice(&self) -> &[u8] {
+        use DaftImageBuffer::*;
+        match self {
+            L(img) => img.as_raw(),
+            LA(img) => img.as_raw(),
+            RGB(img) => img.as_raw(),
+            RGBA(img) => img.as_raw(),
+            _ => unimplemented!("unimplemented {self:?}"),
+        }
+    }
+    pub fn mode(&self) -> ImageMode {
+        use DaftImageBuffer::*;
+
+        match self {
+            L(..) => ImageMode::L,
+            LA(..) => ImageMode::LA,
+            RGB(..) => ImageMode::RGB,
+            RGBA(..) => ImageMode::RGBA,
+            L16(..) => ImageMode::L16,
+            LA16(..) => ImageMode::LA16,
+            RGB16(..) => ImageMode::RGB16,
+            RGBA16(..) => ImageMode::RGBA16,
+            RGB32F(..) => ImageMode::RGB32F,
+            RGBA32F(..)
=> ImageMode::RGBA32F, + } + } + pub fn color(&self) -> ColorType { + let mode = DaftImageBuffer::mode(self); + use ImageMode::*; + match mode { + L => ColorType::L8, + LA => ColorType::La8, + RGB => ColorType::Rgb8, + RGBA => ColorType::Rgba8, + L16 => ColorType::L16, + LA16 => ColorType::La16, + RGB16 => ColorType::Rgb16, + RGBA16 => ColorType::Rgba16, + RGB32F => ColorType::Rgb32F, + RGBA32F => ColorType::Rgba32F, + } + } + + pub fn decode(bytes: &[u8]) -> DaftResult { + image::load_from_memory(bytes) + .map(|v| v.into()) + .map_err(|e| DaftError::ValueError(format!("Decoding image from bytes failed: {}", e))) + } + + pub fn encode(&self, image_format: ImageFormat, writer: &mut W) -> DaftResult<()> + where + W: Write + Seek, + { + image::write_buffer_with_format( + writer, + self.as_u8_slice(), + self.width(), + self.height(), + self.color(), + image::ImageFormat::from(image_format), + ) + .map_err(|e| { + DaftError::ValueError(format!( + "Encoding image into file format {} failed: {}", + image_format, e + )) + }) + } + + pub fn fit_to(&self, w: u32, h: u32) -> Self { + // Preserving aspect ratio, resize an image to fit within the specified dimensions. + let scale_factor = { + let width_scale = w as f64 / self.width() as f64; + let height_scale = h as f64 / self.height() as f64; + width_scale.min(height_scale) + }; + let new_w = self.width() as f64 * scale_factor; + let new_h = self.height() as f64 * scale_factor; + + self.resize(new_w.floor() as u32, new_h.floor() as u32) + } + + pub fn resize(&self, w: u32, h: u32) -> Self { + use DaftImageBuffer::*; + match self { + L(imgbuf) => { + let result = + image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); + DaftImageBuffer::L(image_buffer_vec_to_cow(result)) + } + LA(imgbuf) => { + let result = + image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); + DaftImageBuffer::LA(image_buffer_vec_to_cow(result)) + } + RGB(imgbuf) => { + let result = + image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); + DaftImageBuffer::RGB(image_buffer_vec_to_cow(result)) + } + RGBA(imgbuf) => { + let result = + image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); + DaftImageBuffer::RGBA(image_buffer_vec_to_cow(result)) + } + _ => unimplemented!("Mode {self:?} not implemented"), + } + } + + pub fn crop(&self, bbox: &BBox) -> Self { + // HACK(jay): The `.to_image()` method on SubImage takes in `'static` references for some reason + // This hack will ensure that `&self` adheres to that overly prescriptive bound + let inner = + unsafe { std::mem::transmute::<&DaftImageBuffer<'a>, &DaftImageBuffer<'static>>(self) }; + match inner { + DaftImageBuffer::L(imgbuf) => { + let result = + image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); + DaftImageBuffer::L(image_buffer_vec_to_cow(result)) + } + DaftImageBuffer::LA(imgbuf) => { + let result = + image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); + DaftImageBuffer::LA(image_buffer_vec_to_cow(result)) + } + DaftImageBuffer::RGB(imgbuf) => { + let result = + image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); + DaftImageBuffer::RGB(image_buffer_vec_to_cow(result)) + } + DaftImageBuffer::RGBA(imgbuf) => { + let result = + image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); + DaftImageBuffer::RGBA(image_buffer_vec_to_cow(result)) + } + _ => unimplemented!("Mode {self:?} not implemented"), + } + } + + pub fn into_mode(self, 
mode: ImageMode) -> Self {
+        let img: DynamicImage = self.into();
+        // I couldn't find a method from the image crate to do this
+        let img: DynamicImage = match mode {
+            ImageMode::L => img.into_luma8().into(),
+            ImageMode::LA => img.into_luma_alpha8().into(),
+            ImageMode::RGB => img.into_rgb8().into(),
+            ImageMode::RGBA => img.into_rgba8().into(),
+            ImageMode::L16 => img.into_luma16().into(),
+            ImageMode::LA16 => img.into_luma_alpha16().into(),
+            ImageMode::RGB16 => img.into_rgb16().into(),
+            ImageMode::RGBA16 => img.into_rgba16().into(),
+            ImageMode::RGB32F => img.into_rgb32f().into(),
+            ImageMode::RGBA32F => img.into_rgba32f().into(),
+        };
+        img.into()
+    }
+}
+
+fn image_buffer_vec_to_cow<'a, P, T>(input: ImageBuffer<P, Vec<T>>) -> ImageBuffer<P, Cow<'a, [T]>>
+where
+    P: image::Pixel<Subpixel = T>,
+    Vec<T>: Deref<Target = [P::Subpixel]>,
+    T: ToOwned + std::clone::Clone,
+    [T]: ToOwned,
+{
+    let h = input.height();
+    let w = input.width();
+    let owned: Cow<[T]> = input.into_raw().into();
+    ImageBuffer::from_raw(w, h, owned).unwrap()
+}
+
+fn image_buffer_cow_to_vec<P, T>(input: ImageBuffer<P, Cow<[T]>>) -> ImageBuffer<P, Vec<T>>
+where
+    P: image::Pixel<Subpixel = T>,
+    Vec<T>: Deref<Target = [P::Subpixel]>,
+    T: ToOwned + std::clone::Clone,
+    [T]: ToOwned,
+{
+    let h = input.height();
+    let w = input.width();
+    let owned: Vec<T> = input.into_raw().to_vec();
+    ImageBuffer::from_raw(w, h, owned).unwrap()
+}
+
+impl<'a> From<DynamicImage> for DaftImageBuffer<'a> {
+    fn from(dyn_img: DynamicImage) -> Self {
+        match dyn_img {
+            DynamicImage::ImageLuma8(img_buf) => {
+                DaftImageBuffer::<'a>::L(image_buffer_vec_to_cow(img_buf))
+            }
+            DynamicImage::ImageLumaA8(img_buf) => {
+                DaftImageBuffer::<'a>::LA(image_buffer_vec_to_cow(img_buf))
+            }
+            DynamicImage::ImageRgb8(img_buf) => {
+                DaftImageBuffer::<'a>::RGB(image_buffer_vec_to_cow(img_buf))
+            }
+            DynamicImage::ImageRgba8(img_buf) => {
+                DaftImageBuffer::<'a>::RGBA(image_buffer_vec_to_cow(img_buf))
+            }
+            DynamicImage::ImageLuma16(img_buf) => {
+                DaftImageBuffer::<'a>::L16(image_buffer_vec_to_cow(img_buf))
+            }
+            DynamicImage::ImageLumaA16(img_buf) => {
+                DaftImageBuffer::<'a>::LA16(image_buffer_vec_to_cow(img_buf))
+            }
+            DynamicImage::ImageRgb16(img_buf) => {
+                DaftImageBuffer::<'a>::RGB16(image_buffer_vec_to_cow(img_buf))
+            }
+            DynamicImage::ImageRgba16(img_buf) => {
+                DaftImageBuffer::<'a>::RGBA16(image_buffer_vec_to_cow(img_buf))
+            }
+            DynamicImage::ImageRgb32F(img_buf) => {
+                DaftImageBuffer::<'a>::RGB32F(image_buffer_vec_to_cow(img_buf))
+            }
+            DynamicImage::ImageRgba32F(img_buf) => {
+                DaftImageBuffer::<'a>::RGBA32F(image_buffer_vec_to_cow(img_buf))
+            }
+            _ => unimplemented!("{dyn_img:?} not implemented"),
+        }
+    }
+}
+
+impl<'a> From<DaftImageBuffer<'a>> for DynamicImage {
+    fn from(daft_buf: DaftImageBuffer<'a>) -> Self {
+        match daft_buf {
+            DaftImageBuffer::L(buf) => image_buffer_cow_to_vec(buf).into(),
+            DaftImageBuffer::LA(buf) => image_buffer_cow_to_vec(buf).into(),
+            DaftImageBuffer::RGB(buf) => image_buffer_cow_to_vec(buf).into(),
+            DaftImageBuffer::RGBA(buf) => image_buffer_cow_to_vec(buf).into(),
+            DaftImageBuffer::L16(buf) => image_buffer_cow_to_vec(buf).into(),
+            DaftImageBuffer::LA16(buf) => image_buffer_cow_to_vec(buf).into(),
+            DaftImageBuffer::RGB16(buf) => image_buffer_cow_to_vec(buf).into(),
+            DaftImageBuffer::RGBA16(buf) => image_buffer_cow_to_vec(buf).into(),
+            DaftImageBuffer::RGB32F(buf) => image_buffer_cow_to_vec(buf).into(),
+            DaftImageBuffer::RGBA32F(buf) => image_buffer_cow_to_vec(buf).into(),
+        }
+    }
+}
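`DaftImageBuffer` parameterizes every variant over `Cow` so a buffer can either borrow pixel data (e.g. straight out of an Arrow buffer, as `from_raw` does) or own it (as the resize/crop paths do). The same pattern in isolation against the `image` crate (a sketch; names are illustrative, not part of the patch):

    use std::borrow::Cow;
    use image::{ImageBuffer, Luma};

    fn main() {
        // Four owned grayscale pixels standing in for an Arrow buffer.
        let data: Vec<u8> = vec![0, 64, 128, 255];
        // A zero-copy 2x2 view borrowing those bytes, as the `L` variant above does.
        let view: ImageBuffer<Luma<u8>, Cow<[u8]>> =
            ImageBuffer::from_raw(2, 2, Cow::Borrowed(data.as_slice())).unwrap();
        assert_eq!(view.get_pixel(1, 1).0[0], 255);
    }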
diff --git a/src/daft-image/src/iters.rs b/src/daft-image/src/iters.rs
new file mode 100644
index 0000000000..09ef2ce4ce
--- /dev/null
+++ b/src/daft-image/src/iters.rs
@@ -0,0 +1,38 @@
+use crate::{kernel::AsImageObj, DaftImageBuffer};
+
+pub struct ImageBufferIter<'a, Arr>
+where
+    Arr: AsImageObj,
+{
+    cursor: usize,
+    image_array: &'a Arr,
+}
+
+impl<'a, Arr> ImageBufferIter<'a, Arr>
+where
+    Arr: AsImageObj,
+{
+    pub fn new(image_array: &'a Arr) -> Self {
+        Self {
+            cursor: 0usize,
+            image_array,
+        }
+    }
+}
+
+impl<'a, Arr> Iterator for ImageBufferIter<'a, Arr>
+where
+    Arr: AsImageObj,
+{
+    type Item = Option<DaftImageBuffer<'a>>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.cursor >= self.image_array.len() {
+            None
+        } else {
+            let image_obj = self.image_array.as_image_obj(self.cursor);
+            self.cursor += 1;
+            Some(image_obj)
+        }
+    }
+}
diff --git a/src/daft-image/src/kernel.rs b/src/daft-image/src/kernel.rs
new file mode 100644
index 0000000000..bfaa8f226a
--- /dev/null
+++ b/src/daft-image/src/kernel.rs
@@ -0,0 +1,468 @@
+use crate::{iters::*, CountingWriter, DaftImageBuffer};
+use base64::Engine;
+use common_error::{DaftError, DaftResult};
+use daft_core::array::image_array::{BBox, ImageArraySidecarData};
+use daft_core::array::prelude::*;
+use daft_core::datatypes::prelude::*;
+use daft_core::prelude::ImageArray;
+use num_traits::FromPrimitive;
+use std::borrow::Cow;
+use std::sync::Arc;
+
+pub trait AsImageObj {
+    fn name(&self) -> &str;
+    fn len(&self) -> usize;
+    fn as_image_obj(&self, idx: usize) -> Option<DaftImageBuffer<'_>>;
+}
+
+pub trait ImageOps {
+    fn encode(&self, image_format: ImageFormat) -> DaftResult<BinaryArray>;
+    fn resize(&self, w: u32, h: u32) -> DaftResult<Self>
+    where
+        Self: Sized;
+    fn crop(&self, bboxes: &FixedSizeListArray) -> DaftResult<ImageArray>
+    where
+        Self: Sized;
+    fn resize_to_fixed_shape_image_array(
+        &self,
+        w: u32,
+        h: u32,
+        mode: &ImageMode,
+    ) -> DaftResult<FixedShapeImageArray>;
+    fn to_mode(&self, mode: ImageMode) -> DaftResult<Self>
+    where
+        Self: Sized;
+}
+
+pub(crate) fn image_array_from_img_buffers(
+    name: &str,
+    inputs: &[Option<DaftImageBuffer<'_>>],
+    image_mode: &Option<ImageMode>,
+) -> DaftResult<ImageArray> {
+    use DaftImageBuffer::*;
+    let is_all_u8 = inputs
+        .iter()
+        .filter_map(|b| b.as_ref())
+        .all(|b| matches!(b, L(..) | LA(..) | RGB(..) | RGBA(..)));
+    assert!(is_all_u8);
+
+    let mut data_ref = Vec::with_capacity(inputs.len());
+    let mut heights = Vec::with_capacity(inputs.len());
+    let mut channels = Vec::with_capacity(inputs.len());
+    let mut modes = Vec::with_capacity(inputs.len());
+    let mut widths = Vec::with_capacity(inputs.len());
+    let mut offsets = Vec::with_capacity(inputs.len() + 1);
+    offsets.push(0i64);
+    let mut validity = arrow2::bitmap::MutableBitmap::with_capacity(inputs.len());
+
+    for ib in inputs {
+        validity.push(ib.is_some());
+        let (height, width, mode, buffer) = match ib {
+            Some(ib) => (ib.height(), ib.width(), ib.mode(), ib.as_u8_slice()),
+            None => (0u32, 0u32, ImageMode::L, &[] as &[u8]),
+        };
+        heights.push(height);
+        widths.push(width);
+        modes.push(mode as u8);
+        channels.push(mode.num_channels());
+        data_ref.push(buffer);
+        offsets.push(offsets.last().unwrap() + buffer.len() as i64);
+    }
+
+    let data = data_ref.concat();
+    let validity: Option<arrow2::bitmap::Bitmap> = match validity.unset_bits() {
+        0 => None,
+        _ => Some(validity.into()),
+    };
+    ImageArray::from_vecs(
+        name,
+        DataType::Image(*image_mode),
+        data,
+        offsets,
+        ImageArraySidecarData {
+            channels,
+            heights,
+            widths,
+            modes,
+            validity,
+        },
+    )
+}
+
+pub(crate) fn fixed_image_array_from_img_buffers(
+    name: &str,
+    inputs: &[Option<DaftImageBuffer<'_>>],
+    image_mode: &ImageMode,
+    height: u32,
+    width: u32,
+) -> DaftResult<FixedShapeImageArray> {
+    use DaftImageBuffer::*;
+    let is_all_u8 = inputs
+        .iter()
+        .filter_map(|b| b.as_ref())
+        .all(|b| matches!(b, L(..) | LA(..) | RGB(..) | RGBA(..)));
+    assert!(is_all_u8);
+
+    let num_channels = image_mode.num_channels();
+    let mut data_ref = Vec::with_capacity(inputs.len());
+    let mut validity = arrow2::bitmap::MutableBitmap::with_capacity(inputs.len());
+    let list_size = (height * width * num_channels as u32) as usize;
+    let null_list = vec![0u8; list_size];
+    for ib in inputs.iter() {
+        validity.push(ib.is_some());
+        let buffer = match ib {
+            Some(ib) => ib.as_u8_slice(),
+            None => null_list.as_slice(),
+        };
+        data_ref.push(buffer)
+    }
+    let data = data_ref.concat();
+    let validity: Option<arrow2::bitmap::Bitmap> = match validity.unset_bits() {
+        0 => None,
+        _ => Some(validity.into()),
+    };
+
+    let arrow_dtype = arrow2::datatypes::DataType::FixedSizeList(
+        Box::new(arrow2::datatypes::Field::new(
+            "data",
+            arrow2::datatypes::DataType::UInt8,
+            true,
+        )),
+        list_size,
+    );
+    let arrow_array = Box::new(arrow2::array::FixedSizeListArray::new(
+        arrow_dtype.clone(),
+        Box::new(arrow2::array::PrimitiveArray::from_vec(data)),
+        validity,
+    ));
+    let physical_array = FixedSizeListArray::from_arrow(
+        Arc::new(Field::new(name, (&arrow_dtype).into())),
+        arrow_array,
+    )?;
+    let logical_dtype = DataType::FixedShapeImage(*image_mode, height, width);
+    Ok(FixedShapeImageArray::new(
+        Field::new(name, logical_dtype),
+        physical_array,
+    ))
+}
+
+impl ImageOps for ImageArray {
+    fn encode(&self, image_format: ImageFormat) -> DaftResult<BinaryArray> {
+        encode_images(self, image_format)
+    }
+
+    fn resize(&self, w: u32, h: u32) -> DaftResult<Self> {
+        let result = resize_images(self, w, h);
+        image_array_from_img_buffers(self.name(), result.as_slice(), self.image_mode())
+    }
+
+    fn crop(&self, bboxes: &FixedSizeListArray) -> DaftResult<ImageArray> {
+        let mut bboxes_iterator: Box<dyn Iterator<Item = Option<BBox>>> = if bboxes.len() == 1 {
+            Box::new(std::iter::repeat(bboxes.get(0).map(|bbox| {
+                BBox::from_u32_arrow_array(bbox.u32().unwrap().data())
+            })))
+        } else {
+            Box::new((0..bboxes.len()).map(|i| {
+                bboxes
+                    .get(i)
+                    .map(|bbox| BBox::from_u32_arrow_array(bbox.u32().unwrap().data()))
+            }))
+        };
+        let result = crop_images(self, &mut bboxes_iterator);
+        image_array_from_img_buffers(self.name(), result.as_slice(), self.image_mode())
+    }
+
+    fn resize_to_fixed_shape_image_array(
+        &self,
+        w: u32,
+        h: u32,
+        mode: &ImageMode,
+    ) -> DaftResult<FixedShapeImageArray> {
+        let result = resize_images(self, w, h);
+        fixed_image_array_from_img_buffers(self.name(), result.as_slice(), mode, h, w)
+    }
+
+    fn to_mode(&self, mode: ImageMode) -> DaftResult<Self> {
+        let buffers: Vec<Option<DaftImageBuffer>> = ImageBufferIter::new(self)
+            .map(|img| img.map(|img| img.into_mode(mode)))
+            .collect();
+        image_array_from_img_buffers(self.name(), &buffers, &Some(mode))
+    }
+}
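+
+// A caller-side sketch of the trait above (illustrative, not part of this patch;
+// assumes an `ImageArray` named `arr` and `use crate::kernel::ImageOps`):
+//
+//     let thumbs = arr.resize(64, 64)?;  // variable-shape images stay an ImageArray
+//     let fixed = arr.resize_to_fixed_shape_image_array(64, 64, &ImageMode::RGB)?;
+//     let bytes = fixed.encode(ImageFormat::PNG)?;  // one encoded blob per row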
+
+impl ImageOps for FixedShapeImageArray {
+    fn encode(&self, image_format: ImageFormat) -> DaftResult<BinaryArray> {
+        encode_images(self, image_format)
+    }
+
+    fn resize(&self, w: u32, h: u32) -> DaftResult<Self>
+    where
+        Self: Sized,
+    {
+        let result = resize_images(self, w, h);
+        let mode = self.image_mode();
+        fixed_image_array_from_img_buffers(self.name(), result.as_slice(), &mode, h, w)
+    }
+
+    fn crop(&self, bboxes: &FixedSizeListArray) -> DaftResult<ImageArray>
+    where
+        Self: Sized,
+    {
+        let mut bboxes_iterator: Box<dyn Iterator<Item = Option<BBox>>> = if bboxes.len() == 1 {
+            Box::new(std::iter::repeat(bboxes.get(0).map(|bbox| {
+                BBox::from_u32_arrow_array(bbox.u32().unwrap().data())
+            })))
+        } else {
+            Box::new((0..bboxes.len()).map(|i| {
+                bboxes
+                    .get(i)
+                    .map(|bbox| BBox::from_u32_arrow_array(bbox.u32().unwrap().data()))
+            }))
+        };
+        let result = crop_images(self, &mut bboxes_iterator);
+
+        image_array_from_img_buffers(self.name(), result.as_slice(), &Some(self.image_mode().clone()))
+    }
+
+    fn resize_to_fixed_shape_image_array(
+        &self,
+        w: u32,
+        h: u32,
+        mode: &ImageMode,
+    ) -> DaftResult<FixedShapeImageArray> {
+        let result = resize_images(self, w, h);
+        fixed_image_array_from_img_buffers(self.name(), result.as_slice(), mode, h, w)
+    }
+
+    fn to_mode(&self, mode: ImageMode) -> DaftResult<Self>
+    where
+        Self: Sized,
+    {
+        let buffers: Vec<Option<DaftImageBuffer>> = ImageBufferIter::new(self)
+            .map(|img| img.map(|img| img.into_mode(mode)))
+            .collect();
+
+        let (height, width) = match self.data_type() {
+            DataType::FixedShapeImage(_, h, w) => (h, w),
+            _ => unreachable!("self should always be a FixedShapeImage"),
+        };
+        fixed_image_array_from_img_buffers(self.name(), &buffers, &mode, *height, *width)
+    }
+}
+
+impl AsImageObj for ImageArray {
+    fn len(&self) -> usize {
+        ImageArray::len(self)
+    }
+
+    fn name(&self) -> &str {
+        ImageArray::name(self)
+    }
+
+    fn as_image_obj<'a>(&'a self, idx: usize) -> Option<DaftImageBuffer<'a>> {
+        assert!(idx < self.len());
+        if !self.physical.is_valid(idx) {
+            return None;
+        }
+
+        let da = self.data_array();
+        let ca = self.channel_array();
+        let ha = self.height_array();
+        let wa = self.width_array();
+        let ma = self.mode_array();
+
+        let offsets = da.offsets();
+
+        let start = *offsets.get(idx).unwrap() as usize;
+        let end = *offsets.get(idx + 1).unwrap() as usize;
+
+        let values = da
+            .flat_child
+            .u8()
+            .unwrap()
+            .data()
+            .as_any()
+            .downcast_ref::<arrow2::array::UInt8Array>()
+            .unwrap();
+        let slice_data = Cow::Borrowed(&values.values().as_slice()[start..end] as &'a [u8]);
+
+        let c = ca.value(idx);
+        let h = ha.value(idx);
+        let w = wa.value(idx);
+        let m: ImageMode = ImageMode::from_u8(ma.value(idx)).unwrap();
+        assert_eq!(m.num_channels(), c);
+        let result = DaftImageBuffer::from_raw(&m, w, h, slice_data);
+
+        assert_eq!(result.height(), h);
+        assert_eq!(result.width(), w);
+        Some(result)
+    }
+}
+
+impl AsImageObj for FixedShapeImageArray {
+    fn len(&self) -> usize {
+        FixedShapeImageArray::len(self)
+    }
+
+    fn name(&self) -> &str {
+        FixedShapeImageArray::name(self)
+    }
+
+    fn as_image_obj<'a>(&'a self, idx: usize) -> Option<DaftImageBuffer<'a>> {
+        assert!(idx < self.len());
+        if !self.physical.is_valid(idx) {
+            return None;
+        }
+
+        match self.data_type() {
+            DataType::FixedShapeImage(mode, height, width) => {
+                let arrow_array = self.physical.flat_child.downcast::<UInt8Array>().unwrap().as_arrow();
+                let num_channels = mode.num_channels();
+                let size = height * width * num_channels as u32;
+                let start = idx * size as usize;
+                let end = (idx + 1) * size as usize;
+                let slice_data = Cow::Borrowed(&arrow_array.values().as_slice()[start..end] as &'a [u8]);
+                let result = DaftImageBuffer::from_raw(mode, *width, *height, slice_data);
+
+                assert_eq!(result.height(), *height);
+                assert_eq!(result.width(), *width);
+                Some(result)
+            }
+            dt => panic!("FixedShapeImageArray should always have DataType::FixedShapeImage() as its dtype, but got {}", dt),
+        }
+    }
+}
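+
+// Both array flavors surface pixels through `AsImageObj`, so each kernel below is
+// written once over a generic `Arr: AsImageObj` and driven by `ImageBufferIter`.
+// A minimal sketch of the pattern (`count_valid` is illustrative, not part of this patch):
+//
+//     fn count_valid<Arr: AsImageObj>(images: &Arr) -> usize {
+//         ImageBufferIter::new(images).filter(Option::is_some).count()
+//     }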
+
+fn encode_images<'a, Arr>(images: &'a Arr, image_format: ImageFormat) -> DaftResult<BinaryArray>
+where
+    Arr: AsImageObj,
+{
+    let arrow_array = match image_format {
+        ImageFormat::TIFF => {
+            // NOTE: A single writer/buffer can't be used for TIFF files because the encoder will overwrite the
+            // IFD offset for the first image instead of writing it for all subsequent images, producing corrupted
+            // TIFF files. We work around this by writing out a new buffer for each image.
+            // TODO(Clark): Fix this in the tiff crate.
+            let values = ImageBufferIter::new(images)
+                .map(|img| {
+                    img.map(|img| {
+                        let buf = Vec::new();
+                        let mut writer: CountingWriter<std::io::BufWriter<std::io::Cursor<Vec<u8>>>> =
+                            std::io::BufWriter::new(std::io::Cursor::new(buf)).into();
+                        img.encode(image_format, &mut writer)?;
+                        // NOTE: BufWriter::into_inner() will flush the buffer.
+                        Ok(writer
+                            .into_inner()
+                            .into_inner()
+                            .map_err(|e| {
+                                DaftError::ValueError(format!(
+                                    "Encoding image into file format {} failed: {}",
+                                    image_format, e
+                                ))
+                            })?
+                            .into_inner())
+                    })
+                    .transpose()
+                })
+                .collect::<DaftResult<Vec<_>>>()?;
+            arrow2::array::BinaryArray::<i64>::from_iter(values)
+        }
+        _ => {
+            let mut offsets = Vec::with_capacity(images.len() + 1);
+            offsets.push(0i64);
+            let mut validity = arrow2::bitmap::MutableBitmap::with_capacity(images.len());
+            let buf = Vec::new();
+            let mut writer: CountingWriter<std::io::BufWriter<std::io::Cursor<Vec<u8>>>> =
+                std::io::BufWriter::new(std::io::Cursor::new(buf)).into();
+            ImageBufferIter::new(images)
+                .map(|img| {
+                    match img {
+                        Some(img) => {
+                            img.encode(image_format, &mut writer)?;
+                            offsets.push(writer.count() as i64);
+                            validity.push(true);
+                        }
+                        None => {
+                            offsets.push(*offsets.last().unwrap());
+                            validity.push(false);
+                        }
+                    }
+                    Ok(())
+                })
+                .collect::<DaftResult<Vec<_>>>()?;
+            // NOTE: BufWriter::into_inner() will flush the buffer.
+            let values = writer
+                .into_inner()
+                .into_inner()
+                .map_err(|e| {
+                    DaftError::ValueError(format!(
+                        "Encoding image into file format {} failed: {}",
+                        image_format, e
+                    ))
+                })?
+                .into_inner();
+            let encoded_data: arrow2::buffer::Buffer<u8> = values.into();
+            let offsets_buffer = arrow2::offset::OffsetsBuffer::try_from(offsets)?;
+            let validity: Option<arrow2::bitmap::Bitmap> = match validity.unset_bits() {
+                0 => None,
+                _ => Some(validity.into()),
+            };
+            arrow2::array::BinaryArray::<i64>::new(
+                arrow2::datatypes::DataType::LargeBinary,
+                offsets_buffer,
+                encoded_data,
+                validity,
+            )
+        }
+    };
+    BinaryArray::new(
+        Field::new(images.name(), arrow_array.data_type().into()).into(),
+        arrow_array.boxed(),
+    )
+}
+
+fn resize_images<'a, Arr>(images: &'a Arr, w: u32, h: u32) -> Vec<Option<DaftImageBuffer<'a>>>
+where
+    Arr: AsImageObj,
+{
+    ImageBufferIter::new(images)
+        .map(|img| img.map(|img| img.resize(w, h)))
+        .collect::<Vec<_>>()
+}
+
+fn crop_images<'a, Arr>(
+    images: &'a Arr,
+    bboxes: &mut dyn Iterator<Item = Option<BBox>>,
+) -> Vec<Option<DaftImageBuffer<'a>>>
+where
+    Arr: AsImageObj,
+{
+    ImageBufferIter::new(images)
+        .zip(bboxes)
+        .map(|(img, bbox)| match (img, bbox) {
+            (None, _) | (_, None) => None,
+            (Some(img), Some(bbox)) => Some(img.crop(&bbox)),
+        })
+        .collect::<Vec<_>>()
+}
+
+pub fn html_value(arr: &ImageArray, idx: usize) -> String {
+    let maybe_image = arr.as_image_obj(idx);
+    let str_val = arr.str_value(idx).unwrap();
+
+    match maybe_image {
+        None => "None".to_string(),
+        Some(image) => {
+            let thumb = image.fit_to(128, 128);
+            let mut bytes: Vec<u8> = vec![];
+            let mut writer = std::io::BufWriter::new(std::io::Cursor::new(&mut bytes));
+            thumb.encode(ImageFormat::JPEG, &mut writer).unwrap();
+            drop(writer);
+            format!(
+                "<img style=\"max-height:128px;width:auto\" src=\"data:image/jpeg;base64, {}\" alt=\"{}\" />",
+                base64::engine::general_purpose::STANDARD.encode(&mut bytes),
+                str_val,
+            )
+        }
+    }
+}
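`html_value` is the display hook for image cells: it shrinks the image to fit within 128x128, encodes it as JPEG, and inlines the bytes as a base64 data URI inside an `<img>` tag. The URI construction in isolation (a sketch; `data_uri` is an illustrative name, not part of the patch):

    use base64::Engine;

    // Build an <img> `src` value from already-encoded JPEG bytes.
    fn data_uri(jpeg_bytes: &[u8]) -> String {
        format!(
            "data:image/jpeg;base64,{}",
            base64::engine::general_purpose::STANDARD.encode(jpeg_bytes)
        )
    }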
diff --git a/src/daft-image/src/lib.rs b/src/daft-image/src/lib.rs
new file mode 100644
index 0000000000..2113ebd987
--- /dev/null
+++ b/src/daft-image/src/lib.rs
@@ -0,0 +1,13 @@
+mod counting_writer;
+mod image_buffer;
+mod iters;
+pub mod kernel;
+pub use counting_writer::CountingWriter;
+pub use image_buffer::DaftImageBuffer;
+pub mod series;
+
+#[cfg(feature = "python")]
+mod python;
+
+#[cfg(feature = "python")]
+pub use python::*;
\ No newline at end of file
diff --git a/src/daft-image/src/python.rs b/src/daft-image/src/python.rs
new file mode 100644
index 0000000000..e9d24a92ef
--- /dev/null
+++ b/src/daft-image/src/python.rs
@@ -0,0 +1,54 @@
+use daft_core::{
+    prelude::{ImageFormat, ImageMode},
+    python::PySeries,
+};
+use pyo3::{exceptions::PyValueError, prelude::*};
+
+#[pyfunction]
+pub fn decode(
+    s: &PySeries,
+    raise_error_on_failure: bool,
+    mode: Option<ImageMode>,
+) -> PyResult<PySeries> {
+    let s = crate::series::decode(&s.series, raise_error_on_failure, mode)?;
+    Ok(s.into())
+}
+
+#[pyfunction]
+pub fn encode(s: &PySeries, image_format: ImageFormat) -> PyResult<PySeries> {
+    let s = crate::series::encode(&s.series, image_format)?;
+    Ok(s.into())
+}
+
+#[pyfunction]
+pub fn resize(s: &PySeries, w: i64, h: i64) -> PyResult<PySeries> {
+    if w < 0 {
+        return Err(PyValueError::new_err(format!(
+            "width cannot be negative: {w}"
+        )));
+    }
+    if h < 0 {
+        return Err(PyValueError::new_err(format!(
+            "height cannot be negative: {h}"
+        )));
+    }
+    let s = crate::series::resize(&s.series, w as u32, h as u32)?;
+    Ok(s.into())
+}
+
+#[pyfunction]
+pub fn to_mode(s: &PySeries, mode: &ImageMode) -> PyResult<PySeries> {
+    let s = crate::series::to_mode(&s.series, *mode)?;
+    Ok(s.into())
+}
+
+pub fn register_modules(py: Python, parent: &PyModule) -> PyResult<()> {
+    let module = PyModule::new(py, "image")?;
+    module.add_wrapped(wrap_pyfunction!(decode))?;
+    module.add_wrapped(wrap_pyfunction!(encode))?;
+    module.add_wrapped(wrap_pyfunction!(resize))?;
+    module.add_wrapped(wrap_pyfunction!(to_mode))?;
+    parent.add_submodule(module)?;
+    Ok(())
+}
diff --git a/src/daft-image/src/series.rs b/src/daft-image/src/series.rs
new file mode 100644
index 0000000000..20c775bb8a
--- /dev/null
+++ b/src/daft-image/src/series.rs
@@ -0,0 +1,159 @@
+use daft_core::prelude::*;
+
+use common_error::{DaftError, DaftResult};
+
+use crate::{
+    kernel::{image_array_from_img_buffers, ImageOps},
+    DaftImageBuffer,
+};
+fn image_decode_impl(
+    ba: &BinaryArray,
+    raise_error_on_failure: bool,
+    mode: Option<ImageMode>,
+) -> DaftResult<ImageArray> {
+    let arrow_array = ba
+        .data()
+        .as_any()
+        .downcast_ref::<arrow2::array::BinaryArray<i64>>()
+        .unwrap();
+    let mut img_bufs = Vec::<Option<DaftImageBuffer>>::with_capacity(arrow_array.len());
+    let mut cached_dtype: Option<DataType> = None;
+    // Load images from binary buffers.
+    // Confirm that all images have the same value dtype.
+    for (index, row) in arrow_array.iter().enumerate() {
+        let mut img_buf = match row.map(DaftImageBuffer::decode).transpose() {
+            Ok(val) => val,
+            Err(err) => {
+                if raise_error_on_failure {
+                    return Err(err);
+                } else {
+                    log::warn!(
+                        "Error occurred during image decoding at index: {index} {} (falling back to Null)",
+                        err
+                    );
+                    None
+                }
+            }
+        };
+        if let Some(mode) = mode {
+            img_buf = img_buf.map(|buf| buf.into_mode(mode));
+        }
+        let dtype = img_buf.as_ref().map(|im| im.mode().get_dtype());
+        match (dtype.as_ref(), cached_dtype.as_ref()) {
+            (Some(t1), Some(t2)) => {
+                if t1 != t2 {
+                    return Err(DaftError::ValueError(format!(
+                        "All images in a column must have the same dtype, but got: {:?} and {:?}",
+                        t1, t2
+                    )));
+                }
+            }
+            (Some(t1), None) => {
+                cached_dtype = Some(t1.clone());
+            }
+            (None, _) => {}
+        }
+        img_bufs.push(img_buf);
+    }
+    // Fall back to UInt8 dtype if series is all nulls.
+ let cached_dtype = cached_dtype.unwrap_or(DataType::UInt8); + match cached_dtype { + DataType::UInt8 => Ok(image_array_from_img_buffers(ba.name(), img_bufs.as_slice(), &mode)?), + _ => unimplemented!("Decoding images of dtype {cached_dtype:?} is not supported, only uint8 images are supported."), + } +} +pub fn decode( + s: &Series, + raise_error_on_failure: bool, + mode: Option, +) -> DaftResult { + match s.data_type() { + DataType::Binary => image_decode_impl(s.binary()?, raise_error_on_failure, mode) + .map(|arr| arr.into_series()), + dtype => Err(DaftError::ValueError(format!( + "Decoding in-memory data into images is only supported for binary arrays, but got {}", + dtype + ))), + } +} + +pub fn encode(s: &Series, image_format: ImageFormat) -> DaftResult { + match s.data_type() { + DataType::Image(..) => Ok(s + .downcast::()? + .encode(image_format)? + .into_series()), + DataType::FixedShapeImage(..) => Ok(s + .downcast::()? + .encode(image_format)? + .into_series()), + dtype => Err(DaftError::ValueError(format!( + "Encoding images into bytes is only supported for image arrays, but got {}", + dtype + ))), + } +} + +pub fn resize(s: &Series, w: u32, h: u32) -> DaftResult { + match s.data_type() { + DataType::Image(mode) => { + let array = s.downcast::()?; + match mode { + // If the image mode is specified at the type-level (and is therefore guaranteed to be consistent + // across all images across all partitions), store the resized image in a fixed shape image array, + // since we'll have homogeneous modes, heights, and widths after resizing. + Some(mode) => Ok(array + .resize_to_fixed_shape_image_array(w, h, mode)? + .into_series()), + None => Ok(array.resize(w, h)?.into_series()), + } + } + DataType::FixedShapeImage(..) => Ok(s + .downcast::()? + .resize(w, h)? + .into_series()), + _ => Err(DaftError::ValueError(format!( + "datatype: {} does not support Image Resize. Occurred while resizing Series: {}", + s.data_type(), + s.name() + ))), + } +} + +pub fn crop(s: &Series, bbox: &Series) -> DaftResult { + let bbox_type = DataType::FixedSizeList(Box::new(DataType::UInt32), 4); + let bbox = bbox.cast(&bbox_type)?; + let bbox = bbox.fixed_size_list()?; + + match &s.data_type() { + DataType::Image(_) => s + .downcast::()? + .crop(bbox) + .map(|arr| arr.into_series()), + DataType::FixedShapeImage(..) => s + .fixed_size_image()? + .crop(bbox) + .map(|arr| arr.into_series()), + dt => Err(DaftError::ValueError(format!( + "Expected input to crop to be an Image type, but received: {}", + dt + ))), + } +} + +pub fn to_mode(s: &Series, mode: ImageMode) -> DaftResult { + match &s.data_type() { + DataType::Image(_) => s + .downcast::()? + .to_mode(mode) + .map(|arr| arr.into_series()), + DataType::FixedShapeImage(..) => s + .fixed_size_image()? 
+ .to_mode(mode) + .map(|arr| arr.into_series()), + dt => Err(DaftError::ValueError(format!( + "Expected input to crop to be an Image type, but received: {}", + dt + ))), + } +} diff --git a/src/lib.rs b/src/lib.rs index 12e926f2e3..103eb8f0c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -94,32 +94,33 @@ pub mod pylib { } #[pymodule] - fn daft(_py: Python<'_>, m: &PyModule) -> PyResult<()> { - refresh_logger(_py)?; + fn daft(py: Python<'_>, m: &PyModule) -> PyResult<()> { + refresh_logger(py)?; init_tracing(crate::should_enable_chrome_trace()); - common_daft_config::register_modules(_py, m)?; - common_system_info::register_modules(_py, m)?; - common_resource_request::register_modules(_py, m)?; - daft_core::register_modules(_py, m)?; - daft_core::python::register_modules(_py, m)?; - daft_local_execution::register_modules(_py, m)?; - daft_dsl::register_modules(_py, m)?; - daft_table::register_modules(_py, m)?; - daft_io::register_modules(_py, m)?; - daft_parquet::register_modules(_py, m)?; - daft_csv::register_modules(_py, m)?; - daft_json::register_modules(_py, m)?; - daft_plan::register_modules(_py, m)?; - daft_micropartition::register_modules(_py, m)?; - daft_scan::register_modules(_py, m)?; - daft_scheduler::register_modules(_py, m)?; - daft_sql::register_modules(_py, m)?; - daft_functions::register_modules(_py, m)?; + common_daft_config::register_modules(py, m)?; + common_system_info::register_modules(py, m)?; + common_resource_request::register_modules(py, m)?; + daft_core::register_modules(py, m)?; + daft_core::python::register_modules(py, m)?; + daft_local_execution::register_modules(py, m)?; + daft_dsl::register_modules(py, m)?; + daft_table::register_modules(py, m)?; + daft_io::register_modules(py, m)?; + daft_parquet::register_modules(py, m)?; + daft_csv::register_modules(py, m)?; + daft_json::register_modules(py, m)?; + daft_plan::register_modules(py, m)?; + daft_micropartition::register_modules(py, m)?; + daft_scan::register_modules(py, m)?; + daft_scheduler::register_modules(py, m)?; + daft_sql::register_modules(py, m)?; + daft_functions::register_modules(py, m)?; m.add_wrapped(wrap_pyfunction!(version))?; m.add_wrapped(wrap_pyfunction!(build_type))?; m.add_wrapped(wrap_pyfunction!(refresh_logger))?; m.add_wrapped(wrap_pyfunction!(get_max_log_level))?; + daft_image::register_modules(py, m)?; Ok(()) } } diff --git a/tests/series/test_image.py b/tests/series/test_image.py index a86dc49818..5b2bfe5fd4 100644 --- a/tests/series/test_image.py +++ b/tests/series/test_image.py @@ -11,7 +11,6 @@ from daft.datatype import DaftExtension, DataType from daft.series import Series - MODE_TO_NP_DTYPE = { "L": np.uint8, "LA": np.uint8, From d0be3c32e59e42f1122b4aa5fce827a783275d86 Mon Sep 17 00:00:00 2001 From: universalmind303 Date: Fri, 6 Sep 2024 14:59:38 -0700 Subject: [PATCH 3/8] refactor image kernel out of core --- Cargo.lock | 2 +- daft/daft.pyi | 1821 ----------------- daft/daft/__init__.pyi | 1820 ++++++++++++++++ src/daft-core/Cargo.toml | 5 - src/daft-core/src/array/mod.rs | 2 +- src/daft-core/src/array/ops/repr.rs | 31 +- src/daft-core/src/datatypes/image_format.rs | 25 - src/daft-core/src/python/series.rs | 2 +- .../src/series/array_impl/data_array.rs | 4 +- .../src/series/array_impl/logical_array.rs | 4 - .../src/series/array_impl/nested_array.rs | 4 - src/daft-core/src/series/ops/downcast.rs | 13 + src/daft-core/src/series/ops/take.rs | 4 - src/daft-core/src/series/series_like.rs | 1 - src/daft-functions/src/image/decode.rs | 2 +- src/daft-image/src/image_buffer.rs | 12 +- 
src/daft-image/src/kernel.rs | 45 +- src/daft-image/src/lib.rs | 2 +- src/daft-table/Cargo.toml | 9 +- src/daft-table/src/lib.rs | 6 +- src/daft-table/src/repr_html.rs | 139 ++ 21 files changed, 2034 insertions(+), 1919 deletions(-) delete mode 100644 daft/daft.pyi create mode 100644 src/daft-table/src/repr_html.rs diff --git a/Cargo.lock b/Cargo.lock index 836b40d977..0fc3913d57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1707,7 +1707,6 @@ dependencies = [ "fnv", "html-escape", "hyperloglog", - "image", "indexmap 2.3.0", "itertools 0.11.0", "jaq-core", @@ -2134,6 +2133,7 @@ dependencies = [ "common-error", "daft-core", "daft-dsl", + "daft-image", "html-escape", "num-traits", "pyo3", diff --git a/daft/daft.pyi b/daft/daft.pyi deleted file mode 100644 index d988a133fe..0000000000 --- a/daft/daft.pyi +++ /dev/null @@ -1,1821 +0,0 @@ -import builtins -import datetime -from enum import Enum -from typing import TYPE_CHECKING, Any, Callable, Iterator - - -import pyarrow - -from daft.dataframe.display import MermaidOptions -from daft.execution import physical_plan -from daft.io.scan import ScanOperator -from daft.plan_scheduler.physical_plan_scheduler import PartitionT -from daft.runners.partitioning import PartitionCacheEntry -from daft.sql.sql_connection import SQLConnection -from daft.udf import PartialStatefulUDF, PartialStatelessUDF - -if TYPE_CHECKING: - import pyarrow as pa - from pyiceberg.schema import Schema as IcebergSchema - from pyiceberg.table import TableProperties as IcebergTableProperties - -class ImageMode(Enum): - """ - Supported image modes for Daft's image type. - - .. warning:: - Currently, only the 8-bit modes (L, LA, RGB, RGBA) can be stored in a DataFrame. - If your binary image data includes other modes, use the `mode` argument - in `image.decode` to convert the images to a supported mode. - """ - - #: 8-bit grayscale - L: int - - #: 8-bit grayscale + alpha - LA: int - - #: 8-bit RGB - RGB: int - - #: 8-bit RGB + alpha - RGBA: int - - #: 16-bit grayscale - L16: int - - #: 16-bit grayscale + alpha - LA16: int - - #: 16-bit RGB - RGB16: int - - #: 16-bit RGB + alpha - RGBA16: int - - #: 32-bit floating RGB - RGB32F: int - - #: 32-bit floating RGB + alpha - RGBA32F: int - - @staticmethod - def from_mode_string(mode: str) -> ImageMode: - """ - Create an ImageMode from its string representation. - - Args: - mode: String representation of the mode. This is the same as the enum - attribute name, e.g. ``ImageMode.from_mode_string("RGB")`` would - return ``ImageMode.RGB``. - """ - ... - -class ImageFormat(Enum): - """ - Supported image formats for Daft's image I/O. - """ - - PNG: int - JPEG: int - TIFF: int - GIF: int - BMP: int - - @staticmethod - def from_format_string(mode: str) -> ImageFormat: - """ - Create an ImageFormat from its string representation. - """ - ... - -class JoinType(Enum): - """ - Type of a join operation. - """ - - Inner: int - Left: int - Right: int - Outer: int - Semi: int - Anti: int - - @staticmethod - def from_join_type_str(join_type: str) -> JoinType: - """ - Create a JoinType from its string representation. - - Args: - join_type: String representation of the join type. This is the same as the enum - attribute name (but snake-case), e.g. ``JoinType.from_join_type_str("inner")`` would - return ``JoinType.Inner``. - """ - ... - -class JoinStrategy(Enum): - """ - Join strategy (algorithm) to use. 
- """ - - Hash: int - SortMerge: int - Broadcast: int - - @staticmethod - def from_join_strategy_str(join_strategy: str) -> JoinStrategy: - """ - Create a JoinStrategy from its string representation. - - Args: - join_strategy: String representation of the join strategy. This is the same as the enum - attribute name (but snake-case), e.g. ``JoinType.from_join_strategy_str("sort_merge")`` would - return ``JoinStrategy.SortMerge``. - """ - ... - -class CountMode(Enum): - """ - Supported count modes for Daft's count aggregation. - - | All - Count both non-null and null values. - | Valid - Count only valid values. - | Null - Count only null values. - """ - - All: int - Valid: int - Null: int - - @staticmethod - def from_count_mode_str(count_mode: str) -> CountMode: - """ - Create a CountMode from its string representation. - - Args: - count_mode: String representation of the count mode , e.g. "all", "valid", or "null". - """ - ... - -class ResourceRequest: - """ - Resource request for a query fragment task. - """ - - num_cpus: float | None - num_gpus: float | None - memory_bytes: int | None - - def __init__( - self, - num_cpus: float | None = None, - num_gpus: float | None = None, - memory_bytes: int | None = None, - ): ... - @staticmethod - def max_resources(resource_requests: list[ResourceRequest]): - """Take a field-wise max of the list of resource requests.""" - ... - - def with_num_cpus(self, num_cpus: float | None) -> ResourceRequest: ... - def with_num_gpus(self, num_gpus: float | None) -> ResourceRequest: ... - def with_memory_bytes(self, memory_bytes: int | None) -> ResourceRequest: ... - def __mul__(self, factor: float) -> ResourceRequest: ... - def __add__(self, other: ResourceRequest) -> ResourceRequest: ... - def __repr__(self) -> str: ... - def __eq__(self, other: ResourceRequest) -> bool: ... # type: ignore[override] - def __ne__(self, other: ResourceRequest) -> bool: ... # type: ignore[override] - -class FileFormat(Enum): - """ - Format of a file, e.g. Parquet, CSV, and JSON. - """ - - Parquet: int - Csv: int - Json: int - - def ext(self): ... - -class ParquetSourceConfig: - """ - Configuration of a Parquet data source. - """ - - coerce_int96_timestamp_unit: PyTimeUnit | None - field_id_mapping: dict[int, PyField] | None - row_groups: list[list[int]] | None - chunk_size: int | None - - def __init__( - self, - coerce_int96_timestamp_unit: PyTimeUnit | None = None, - field_id_mapping: dict[int, PyField] | None = None, - row_groups: list[list[int]] | None = None, - chunk_size: int | None = None, - ): ... - -class CsvSourceConfig: - """ - Configuration of a CSV data source. - """ - - delimiter: str | None - has_headers: bool - double_quote: bool - quote: str | None - escape_char: str | None - comment: str | None - allow_variable_columns: bool - buffer_size: int | None - chunk_size: int | None - - def __init__( - self, - has_headers: bool, - double_quote: bool, - allow_variable_columns: bool, - delimiter: str | None, - quote: str | None, - escape_char: str | None, - comment: str | None, - buffer_size: int | None = None, - chunk_size: int | None = None, - ): ... - -class JsonSourceConfig: - """ - Configuration of a JSON data source. - """ - - buffer_size: int | None - chunk_size: int | None - - def __init__( - self, - buffer_size: int | None = None, - chunk_size: int | None = None, - ): ... - -class DatabaseSourceConfig: - """ - Configuration of a database data source. - """ - - sql: str - conn: SQLConnection - - def __init__(self, sql: str, conn_factory: SQLConnection): ... 
- -class FileFormatConfig: - """ - Configuration for parsing a particular file format (Parquet, CSV, JSON). - """ - - config: ParquetSourceConfig | CsvSourceConfig | JsonSourceConfig | DatabaseSourceConfig - - @staticmethod - def from_parquet_config(config: ParquetSourceConfig) -> FileFormatConfig: - """ - Create a Parquet file format config. - """ - ... - - @staticmethod - def from_csv_config(config: CsvSourceConfig) -> FileFormatConfig: - """ - Create a CSV file format config. - """ - ... - - @staticmethod - def from_json_config(config: JsonSourceConfig) -> FileFormatConfig: - """ - Create a JSON file format config. - """ - ... - - @staticmethod - def from_database_config(config: DatabaseSourceConfig) -> FileFormatConfig: - """ - Create a database file format config. - """ - ... - - def file_format(self) -> FileFormat: - """ - Get the file format for this config. - """ - ... - - def __eq__(self, other: FileFormatConfig) -> bool: ... # type: ignore[override] - def __ne__(self, other: FileFormatConfig) -> bool: ... # type: ignore[override] - -class CsvConvertOptions: - """ - Options for converting CSV data to Daft data. - """ - - limit: int | None - include_columns: list[str] | None - column_names: list[str] | None - schema: PySchema | None - predicate: PyExpr | None - - def __init__( - self, - limit: int | None = None, - include_columns: list[str] | None = None, - column_names: list[str] | None = None, - schema: PySchema | None = None, - predicate: PyExpr | None = None, - ): ... - -class CsvParseOptions: - """ - Options for parsing CSV files. - """ - - has_header: bool - delimiter: str | None - double_quote: bool - quote: str | None - allow_variable_columns: bool - escape_char: str | None - comment: str | None - - def __init__( - self, - has_header: bool = True, - delimiter: str | None = None, - double_quote: bool = True, - quote: str | None = None, - allow_variable_columns: bool = False, - escape_char: str | None = None, - comment: str | None = None, - ): ... - -class CsvReadOptions: - """ - Options for reading CSV files. - """ - - buffer_size: int | None - chunk_size: int | None - - def __init__( - self, - buffer_size: int | None = None, - chunk_size: int | None = None, - ): ... - -class JsonConvertOptions: - """ - Options for converting JSON data to Daft data. - """ - - limit: int | None - include_columns: list[str] | None - schema: PySchema | None - - def __init__( - self, - limit: int | None = None, - include_columns: list[str] | None = None, - schema: PySchema | None = None, - ): ... - -class JsonParseOptions: - """ - Options for parsing JSON files. - """ - -class JsonReadOptions: - """ - Options for reading JSON files. - """ - - buffer_size: int | None - chunk_size: int | None - - def __init__( - self, - buffer_size: int | None = None, - chunk_size: int | None = None, - ): ... - -class FileInfo: - """ - Metadata for a single file. - """ - - file_path: str - file_size: int | None - num_rows: int | None - -class FileInfos: - """ - Metadata for a collection of files. - """ - - file_paths: list[str] - file_sizes: list[int | None] - num_rows: list[int | None] - - @staticmethod - def from_infos(file_paths: list[str], file_sizes: list[int | None], num_rows: list[int | None]) -> FileInfos: ... - @staticmethod - def from_table(table: PyTable) -> FileInfos: - """ - Create from a Daft table with "path", "size", and "num_rows" columns. - """ - ... - - def extend(self, new_infos: FileInfos) -> FileInfos: - """ - Concatenate two FileInfos together. - """ - ... 
- - def __getitem__(self, idx: int) -> FileInfo: ... - def to_table(self) -> PyTable: - """ - Convert to a Daft table with "path", "size", and "num_rows" columns. - """ - - def __len__(self) -> int: ... - -class HTTPConfig: - """ - I/O configuration for accessing HTTP systems - """ - - bearer_token: str | None - - def __init__(self, bearer_token: str | None = None): ... - -class S3Config: - """ - I/O configuration for accessing an S3-compatible system. - """ - - region_name: str | None - endpoint_url: str | None - key_id: str | None - session_token: str | None - access_key: str | None - credentials_provider: Callable[[], S3Credentials] | None - max_connections: int - retry_initial_backoff_ms: int - connect_timeout_ms: int - read_timeout_ms: int - num_tries: int - retry_mode: str | None - anonymous: bool - use_ssl: bool - verify_ssl: bool - check_hostname_ssl: bool - requester_pays: bool | None - force_virtual_addressing: bool | None - profile_name: str | None - - def __init__( - self, - region_name: str | None = None, - endpoint_url: str | None = None, - key_id: str | None = None, - session_token: str | None = None, - access_key: str | None = None, - credentials_provider: Callable[[], S3Credentials] | None = None, - buffer_time: int | None = None, - max_connections: int | None = None, - retry_initial_backoff_ms: int | None = None, - connect_timeout_ms: int | None = None, - read_timeout_ms: int | None = None, - num_tries: int | None = None, - retry_mode: str | None = None, - anonymous: bool | None = None, - use_ssl: bool | None = None, - verify_ssl: bool | None = None, - check_hostname_ssl: bool | None = None, - requester_pays: bool | None = None, - force_virtual_addressing: bool | None = None, - profile_name: str | None = None, - ): ... - def replace( - self, - region_name: str | None = None, - endpoint_url: str | None = None, - key_id: str | None = None, - session_token: str | None = None, - access_key: str | None = None, - credentials_provider: Callable[[], S3Credentials] | None = None, - max_connections: int | None = None, - retry_initial_backoff_ms: int | None = None, - connect_timeout_ms: int | None = None, - read_timeout_ms: int | None = None, - num_tries: int | None = None, - retry_mode: str | None = None, - anonymous: bool | None = None, - use_ssl: bool | None = None, - verify_ssl: bool | None = None, - check_hostname_ssl: bool | None = None, - requester_pays: bool | None = None, - force_virtual_addressing: bool | None = None, - profile_name: str | None = None, - ) -> S3Config: - """Replaces values if provided, returning a new S3Config""" - ... - - @staticmethod - def from_env() -> S3Config: - """Creates an S3Config, retrieving credentials and configurations from the current environment""" - ... - -class S3Credentials: - key_id: str - access_key: str - session_token: str | None - expiry: datetime.datetime | None - - def __init__( - self, - key_id: str, - access_key: str, - session_token: str | None = None, - expiry: datetime.datetime | None = None, - ): ... - -class AzureConfig: - """ - I/O configuration for accessing Azure Blob Storage. 
- """ - - storage_account: str | None - access_key: str | None - sas_token: str | None - bearer_token: str | None - tenant_id: str | None - client_id: str | None - client_secret: str | None - use_fabric_endpoint: bool | None - anonymous: bool | None - endpoint_url: str | None = None - use_ssl: bool | None = None - - def __init__( - self, - storage_account: str | None = None, - access_key: str | None = None, - sas_token: str | None = None, - bearer_token: str | None = None, - tenant_id: str | None = None, - client_id: str | None = None, - client_secret: str | None = None, - use_fabric_endpoint: bool | None = None, - anonymous: bool | None = None, - endpoint_url: str | None = None, - use_ssl: bool | None = None, - ): ... - def replace( - self, - storage_account: str | None = None, - access_key: str | None = None, - sas_token: str | None = None, - bearer_token: str | None = None, - tenant_id: str | None = None, - client_id: str | None = None, - client_secret: str | None = None, - use_fabric_endpoint: bool | None = None, - anonymous: bool | None = None, - endpoint_url: str | None = None, - use_ssl: bool | None = None, - ) -> AzureConfig: - """Replaces values if provided, returning a new AzureConfig""" - ... - -class GCSConfig: - """ - I/O configuration for accessing Google Cloud Storage. - """ - - project_id: str | None - credentials: str | None - token: str | None - anonymous: bool - - def __init__( - self, - project_id: str | None = None, - credentials: str | None = None, - token: str | None = None, - anonymous: bool | None = None, - ): ... - def replace( - self, - project_id: str | None = None, - credentials: str | None = None, - token: str | None = None, - anonymous: bool | None = None, - ) -> GCSConfig: - """Replaces values if provided, returning a new GCSConfig""" - ... - -class IOConfig: - """ - Configuration for the native I/O layer, e.g. credentials for accessing cloud storage systems. - """ - - s3: S3Config - azure: AzureConfig - gcs: GCSConfig - http: HTTPConfig - - def __init__( - self, - s3: S3Config | None = None, - azure: AzureConfig | None = None, - gcs: GCSConfig | None = None, - http: HTTPConfig | None = None, - ): ... - @staticmethod - def from_json(input: str) -> IOConfig: - """ - Recreate an IOConfig from a JSON string. - """ - ... - - def replace( - self, - s3: S3Config | None = None, - azure: AzureConfig | None = None, - gcs: GCSConfig | None = None, - http: HTTPConfig | None = None, - ) -> IOConfig: - """Replaces values if provided, returning a new IOConfig""" - ... - -class NativeStorageConfig: - """ - Storage configuration for the Rust-native I/O layer. - """ - - # Whether or not to use a multithreaded tokio runtime for processing I/O - multithreaded_io: bool - io_config: IOConfig - - def __init__(self, multithreaded_io: bool, io_config: IOConfig): ... - -class PythonStorageConfig: - """ - Storage configuration for the legacy Python I/O layer. - """ - - io_config: IOConfig - - def __init__(self, io_config: IOConfig): ... - -class StorageConfig: - """ - Configuration for interacting with a particular storage backend, using a particular - I/O layer implementation. - """ - - @staticmethod - def native(config: NativeStorageConfig) -> StorageConfig: - """ - Create from a native storage config. - """ - ... - - @staticmethod - def python(config: PythonStorageConfig) -> StorageConfig: - """ - Create from a Python storage config. - """ - ... - - @property - def config(self) -> NativeStorageConfig | PythonStorageConfig: ... 
- -class ScanTask: - """ - A batch of scan tasks for reading data from an external source. - """ - - def num_rows(self) -> int: - """ - Get number of rows that will be scanned by this ScanTask. - """ - ... - - def estimate_in_memory_size_bytes(self, cfg: PyDaftExecutionConfig) -> int: - """ - Estimate the In Memory Size of this ScanTask. - """ - ... - - @staticmethod - def catalog_scan_task( - file: str, - file_format: FileFormatConfig, - schema: PySchema, - storage_config: StorageConfig, - num_rows: int | None, - size_bytes: int | None, - iceberg_delete_files: list[str] | None, - pushdowns: Pushdowns | None, - partition_values: PyTable | None, - stats: PyTable | None, - ) -> ScanTask | None: - """ - Create a Catalog Scan Task - """ - ... - - @staticmethod - def sql_scan_task( - url: str, - file_format: FileFormatConfig, - schema: PySchema, - num_rows: int | None, - storage_config: StorageConfig, - size_bytes: int | None, - pushdowns: Pushdowns | None, - stats: PyTable | None, - ) -> ScanTask: - """ - Create a SQL Scan Task - """ - ... - - @staticmethod - def python_factory_func_scan_task( - module: str, - func_name: str, - func_args: tuple[Any, ...], - schema: PySchema, - num_rows: int | None, - size_bytes: int | None, - pushdowns: Pushdowns | None, - stats: PyTable | None, - ) -> ScanTask: - """ - Create a Python factory function Scan Task - """ - ... - -class ScanOperatorHandle: - """ - A handle to a scan operator. - """ - - @staticmethod - def anonymous_scan( - files: list[str], - schema: PySchema, - file_format_config: FileFormatConfig, - storage_config: StorageConfig, - ) -> ScanOperatorHandle: ... - @staticmethod - def glob_scan( - glob_path: list[str], - file_format_config: FileFormatConfig, - storage_config: StorageConfig, - infer_schema: bool, - schema: PySchema | None = None, - ) -> ScanOperatorHandle: ... - @staticmethod - def from_python_scan_operator(operator: ScanOperator) -> ScanOperatorHandle: ... - -class PartitionField: - """ - Partitioning Field of a Scan Source such as Hive or Iceberg - """ - - field: PyField - - def __init__( - self, - field: PyField, - source_field: PyField | None = None, - transform: PartitionTransform | None = None, - ) -> None: ... - -class PartitionTransform: - """ - Partitioning Transform from a Data Catalog source field to a Partitioning Columns - """ - - @staticmethod - def identity() -> PartitionTransform: ... - @staticmethod - def year() -> PartitionTransform: ... - @staticmethod - def month() -> PartitionTransform: ... - @staticmethod - def day() -> PartitionTransform: ... - @staticmethod - def hour() -> PartitionTransform: ... - @staticmethod - def iceberg_bucket(n: int) -> PartitionTransform: ... - @staticmethod - def iceberg_truncate(w: int) -> PartitionTransform: ... - -class Pushdowns: - """ - Pushdowns from the query optimizer that can optimize scanning data sources. - """ - - columns: list[str] | None - filters: PyExpr | None - partition_filters: PyExpr | None - limit: int | None - - def filter_required_column_names(self) -> list[str]: - """List of field names that are required by the filter predicate.""" - ... - -def read_parquet( - uri: str, - columns: list[str] | None = None, - start_offset: int | None = None, - num_rows: int | None = None, - row_groups: list[int] | None = None, - predicate: PyExpr | None = None, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, - coerce_int96_timestamp_unit: PyTimeUnit | None = None, -): ... 
-def read_parquet_bulk( - uris: list[str], - columns: list[str] | None = None, - start_offset: int | None = None, - num_rows: int | None = None, - row_groups: list[list[int] | None] | None = None, - predicate: PyExpr | None = None, - io_config: IOConfig | None = None, - num_parallel_tasks: int | None = 128, - multithreaded_io: bool | None = None, - coerce_int96_timestamp_unit: PyTimeUnit | None = None, -): ... -def read_parquet_statistics( - uris: PySeries, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, -): ... -def read_parquet_into_pyarrow( - uri: str, - columns: list[str] | None = None, - start_offset: int | None = None, - num_rows: int | None = None, - row_groups: list[int] | None = None, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, - coerce_int96_timestamp_unit: PyTimeUnit | None = None, - file_timeout_ms: int | None = None, -): ... -def read_parquet_into_pyarrow_bulk( - uris: list[str], - columns: list[str] | None = None, - start_offset: int | None = None, - num_rows: int | None = None, - row_groups: list[list[int] | None] | None = None, - io_config: IOConfig | None = None, - num_parallel_tasks: int | None = 128, - multithreaded_io: bool | None = None, - coerce_int96_timestamp_unit: PyTimeUnit | None = None, -): ... -def read_parquet_schema( - uri: str, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, - coerce_int96_timestamp_unit: PyTimeUnit | None = None, -): ... -def read_csv( - uri: str, - convert_options: CsvConvertOptions | None = None, - parse_options: CsvParseOptions | None = None, - read_options: CsvReadOptions | None = None, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, -): ... -def read_csv_schema( - uri: str, - parse_options: CsvParseOptions | None = None, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, -): ... -def read_json( - uri: str, - convert_options: JsonConvertOptions | None = None, - parse_options: JsonParseOptions | None = None, - read_options: JsonReadOptions | None = None, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, - max_chunks_in_flight: int | None = None, -): ... -def read_json_schema( - uri: str, - parse_options: JsonParseOptions | None = None, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, -): ... - -class PyTimeUnit: - @staticmethod - def nanoseconds() -> PyTimeUnit: ... - @staticmethod - def microseconds() -> PyTimeUnit: ... - @staticmethod - def milliseconds() -> PyTimeUnit: ... - @staticmethod - def seconds() -> PyTimeUnit: ... - -class PyDataType: - @staticmethod - def null() -> PyDataType: ... - @staticmethod - def bool() -> PyDataType: ... - @staticmethod - def int8() -> PyDataType: ... - @staticmethod - def int16() -> PyDataType: ... - @staticmethod - def int32() -> PyDataType: ... - @staticmethod - def int64() -> PyDataType: ... - @staticmethod - def uint8() -> PyDataType: ... - @staticmethod - def uint16() -> PyDataType: ... - @staticmethod - def uint32() -> PyDataType: ... - @staticmethod - def uint64() -> PyDataType: ... - @staticmethod - def float32() -> PyDataType: ... - @staticmethod - def float64() -> PyDataType: ... - @staticmethod - def binary() -> PyDataType: ... - @staticmethod - def fixed_size_binary(size: int) -> PyDataType: ... - @staticmethod - def string() -> PyDataType: ... - @staticmethod - def decimal128(precision: int, size: int) -> PyDataType: ... - @staticmethod - def date() -> PyDataType: ... 
- @staticmethod - def time(time_unit: PyTimeUnit) -> PyDataType: ... - @staticmethod - def timestamp(time_unit: PyTimeUnit, timezone: str | None = None) -> PyDataType: ... - @staticmethod - def duration(time_unit: PyTimeUnit) -> PyDataType: ... - @staticmethod - def list(data_type: PyDataType) -> PyDataType: ... - @staticmethod - def fixed_size_list(data_type: PyDataType, size: int) -> PyDataType: ... - @staticmethod - def map(key_type: PyDataType, value_type: PyDataType) -> PyDataType: ... - @staticmethod - def struct(fields: dict[str, PyDataType]) -> PyDataType: ... - @staticmethod - def extension(name: str, storage_data_type: PyDataType, metadata: str | None = None) -> PyDataType: ... - @staticmethod - def embedding(data_type: PyDataType, size: int) -> PyDataType: ... - @staticmethod - def image( - mode: ImageMode | None = None, - height: int | None = None, - width: int | None = None, - ) -> PyDataType: ... - @staticmethod - def tensor(dtype: PyDataType, shape: tuple[int, ...] | None = None) -> PyDataType: ... - @staticmethod - def python() -> PyDataType: ... - def to_arrow(self, cast_tensor_type_for_ray: builtins.bool | None = None) -> pyarrow.DataType: ... - def is_numeric(self) -> builtins.bool: ... - def is_image(self) -> builtins.bool: ... - def is_fixed_shape_image(self) -> builtins.bool: ... - def is_list(self) -> builtins.bool: ... - def is_tensor(self) -> builtins.bool: ... - def is_fixed_shape_tensor(self) -> builtins.bool: ... - def is_map(self) -> builtins.bool: ... - def is_logical(self) -> builtins.bool: ... - def is_boolean(self) -> builtins.bool: ... - def is_string(self) -> builtins.bool: ... - def is_temporal(self) -> builtins.bool: ... - def is_equal(self, other: Any) -> builtins.bool: ... - @staticmethod - def from_json(serialized: str) -> PyDataType: ... - def __reduce__(self) -> tuple: ... - def __hash__(self) -> int: ... - -class PyField: - def name(self) -> str: ... - @staticmethod - def create(name: str, datatype: PyDataType) -> PyField: ... - def dtype(self) -> PyDataType: ... - def eq(self, other: PyField) -> bool: ... - def __reduce__(self) -> tuple: ... - -class PySchema: - def __getitem__(self, name: str) -> PyField: ... - def names(self) -> list[str]: ... - def union(self, other: PySchema) -> PySchema: ... - def eq(self, other: PySchema) -> bool: ... - def estimate_row_size_bytes(self) -> float: ... - @staticmethod - def from_field_name_and_types(names_and_types: list[tuple[str, PyDataType]]) -> PySchema: ... - @staticmethod - def from_fields(fields: list[PyField]) -> PySchema: ... - def to_pyarrow_schema(self) -> pa.Schema: ... - def __reduce__(self) -> tuple: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - def _truncated_table_html(self) -> str: ... - def _truncated_table_string(self) -> str: ... - def apply_hints(self, hints: PySchema) -> PySchema: ... - -class PyExpr: - def alias(self, name: str) -> PyExpr: ... - def cast(self, dtype: PyDataType) -> PyExpr: ... - def ceil(self) -> PyExpr: ... - def floor(self) -> PyExpr: ... - def sign(self) -> PyExpr: ... - def round(self, decimal: int) -> PyExpr: ... - def sqrt(self) -> PyExpr: ... - def sin(self) -> PyExpr: ... - def cos(self) -> PyExpr: ... - def tan(self) -> PyExpr: ... - def cot(self) -> PyExpr: ... - def arcsin(self) -> PyExpr: ... - def arccos(self) -> PyExpr: ... - def arctan(self) -> PyExpr: ... - def arctan2(self, other: PyExpr) -> PyExpr: ... - def arctanh(self) -> PyExpr: ... - def arccosh(self) -> PyExpr: ... - def arcsinh(self) -> PyExpr: ... 
- def degrees(self) -> PyExpr: ... - def radians(self) -> PyExpr: ... - def log2(self) -> PyExpr: ... - def log10(self) -> PyExpr: ... - def log(self, base: float) -> PyExpr: ... - def ln(self) -> PyExpr: ... - def exp(self) -> PyExpr: ... - def if_else(self, if_true: PyExpr, if_false: PyExpr) -> PyExpr: ... - def count(self, mode: CountMode) -> PyExpr: ... - def sum(self) -> PyExpr: ... - def approx_count_distinct(self) -> PyExpr: ... - def approx_percentiles(self, percentiles: float | list[float]) -> PyExpr: ... - def mean(self) -> PyExpr: ... - def min(self) -> PyExpr: ... - def max(self) -> PyExpr: ... - def any_value(self, ignore_nulls: bool) -> PyExpr: ... - def agg_list(self) -> PyExpr: ... - def agg_concat(self) -> PyExpr: ... - def explode(self) -> PyExpr: ... - def __abs__(self) -> PyExpr: ... - def __add__(self, other: PyExpr) -> PyExpr: ... - def __sub__(self, other: PyExpr) -> PyExpr: ... - def __mul__(self, other: PyExpr) -> PyExpr: ... - def __floordiv__(self, other: PyExpr) -> PyExpr: ... - def __truediv__(self, other: PyExpr) -> PyExpr: ... - def __mod__(self, other: PyExpr) -> PyExpr: ... - def __and__(self, other: PyExpr) -> PyExpr: ... - def __or__(self, other: PyExpr) -> PyExpr: ... - def __xor__(self, other: PyExpr) -> PyExpr: ... - def __invert__(self) -> PyExpr: ... - def __lt__(self, other: PyExpr) -> PyExpr: ... - def __le__(self, other: PyExpr) -> PyExpr: ... - def __gt__(self, other: PyExpr) -> PyExpr: ... - def __ge__(self, other: PyExpr) -> PyExpr: ... - def __eq__(self, other: PyExpr) -> PyExpr: ... # type: ignore[override] - def __ne__(self, other: PyExpr) -> PyExpr: ... # type: ignore[override] - def __rshift__(self, other: PyExpr) -> PyExpr: ... - def __lshift__(self, other: PyExpr) -> PyExpr: ... - def is_null(self) -> PyExpr: ... - def not_null(self) -> PyExpr: ... - def fill_null(self, fill_value: PyExpr) -> PyExpr: ... - def is_in(self, other: PyExpr) -> PyExpr: ... - def between(self, lower: PyExpr, upper: PyExpr) -> PyExpr: ... - def name(self) -> str: ... - def to_field(self, schema: PySchema) -> PyField: ... - def to_sql(self) -> str: ... - def __repr__(self) -> str: ... - def __hash__(self) -> int: ... - def __reduce__(self) -> tuple: ... - def is_nan(self) -> PyExpr: ... - def is_inf(self) -> PyExpr: ... - def not_nan(self) -> PyExpr: ... - def fill_nan(self, fill_value: PyExpr) -> PyExpr: ... - def dt_date(self) -> PyExpr: ... - def dt_day(self) -> PyExpr: ... - def dt_hour(self) -> PyExpr: ... - def dt_minute(self) -> PyExpr: ... - def dt_second(self) -> PyExpr: ... - def dt_time(self) -> PyExpr: ... - def dt_month(self) -> PyExpr: ... - def dt_year(self) -> PyExpr: ... - def dt_day_of_week(self) -> PyExpr: ... - def dt_truncate(self, interval: str, relative_to: PyExpr) -> PyExpr: ... - def utf8_endswith(self, pattern: PyExpr) -> PyExpr: ... - def utf8_startswith(self, pattern: PyExpr) -> PyExpr: ... - def utf8_contains(self, pattern: PyExpr) -> PyExpr: ... - def utf8_match(self, pattern: PyExpr) -> PyExpr: ... - def utf8_split(self, pattern: PyExpr, regex: bool) -> PyExpr: ... - def utf8_extract(self, pattern: PyExpr, index: int) -> PyExpr: ... - def utf8_extract_all(self, pattern: PyExpr, index: int) -> PyExpr: ... - def utf8_replace(self, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ... - def utf8_length(self) -> PyExpr: ... - def utf8_length_bytes(self) -> PyExpr: ... - def utf8_lower(self) -> PyExpr: ... - def utf8_upper(self) -> PyExpr: ... - def utf8_lstrip(self) -> PyExpr: ... - def utf8_rstrip(self) -> PyExpr: ... 
- def utf8_reverse(self) -> PyExpr: ... - def utf8_capitalize(self) -> PyExpr: ... - def utf8_left(self, nchars: PyExpr) -> PyExpr: ... - def utf8_right(self, nchars: PyExpr) -> PyExpr: ... - def utf8_find(self, substr: PyExpr) -> PyExpr: ... - def utf8_rpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ... - def utf8_lpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ... - def utf8_repeat(self, n: PyExpr) -> PyExpr: ... - def utf8_like(self, pattern: PyExpr) -> PyExpr: ... - def utf8_ilike(self, pattern: PyExpr) -> PyExpr: ... - def utf8_substr(self, start: PyExpr, length: PyExpr) -> PyExpr: ... - def utf8_to_date(self, format: str) -> PyExpr: ... - def utf8_to_datetime(self, format: str, timezone: str | None = None) -> PyExpr: ... - def utf8_normalize(self, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool) -> PyExpr: ... - def list_join(self, delimiter: PyExpr) -> PyExpr: ... - def list_count(self, mode: CountMode) -> PyExpr: ... - def list_get(self, idx: PyExpr, default: PyExpr) -> PyExpr: ... - def list_sum(self) -> PyExpr: ... - def list_mean(self) -> PyExpr: ... - def list_min(self) -> PyExpr: ... - def list_max(self) -> PyExpr: ... - def list_slice(self, start: PyExpr, end: PyExpr | None = None) -> PyExpr: ... - def list_chunk(self, size: int) -> PyExpr: ... - def struct_get(self, name: str) -> PyExpr: ... - def map_get(self, key: PyExpr) -> PyExpr: ... - def partitioning_days(self) -> PyExpr: ... - def partitioning_hours(self) -> PyExpr: ... - def partitioning_months(self) -> PyExpr: ... - def partitioning_years(self) -> PyExpr: ... - def partitioning_iceberg_bucket(self, n: int) -> PyExpr: ... - def partitioning_iceberg_truncate(self, w: int) -> PyExpr: ... - def json_query(self, query: str) -> PyExpr: ... - - ### - # Helper methods required by optimizer: - # These should be removed from the Python API for Expressions when logical plans and optimizer are migrated to Rust - ### - def _input_mapping(self) -> builtins.str | None: ... - -def eq(expr1: PyExpr, expr2: PyExpr) -> bool: ... -def col(name: str) -> PyExpr: ... -def lit(item: Any) -> PyExpr: ... -def date_lit(item: int) -> PyExpr: ... -def time_lit(item: int, tu: PyTimeUnit) -> PyExpr: ... -def timestamp_lit(item: int, tu: PyTimeUnit, tz: str | None) -> PyExpr: ... -def decimal_lit(sign: bool, digits: tuple[int, ...], exp: int) -> PyExpr: ... -def series_lit(item: PySeries) -> PyExpr: ... -def stateless_udf( - name: str, - partial_stateless_udf: PartialStatelessUDF, - expressions: list[PyExpr], - return_dtype: PyDataType, - resource_request: ResourceRequest | None, - batch_size: int | None, -) -> PyExpr: ... -def stateful_udf( - name: str, - partial_stateful_udf: PartialStatefulUDF, - expressions: list[PyExpr], - return_dtype: PyDataType, - resource_request: ResourceRequest | None, - init_args: tuple[tuple[Any, ...], dict[str, Any]] | None, - batch_size: int | None, - concurrency: int | None, -) -> PyExpr: ... -def check_column_name_validity(name: str, schema: PySchema): ... -def extract_partial_stateful_udf_py(expression: PyExpr) -> dict[str, PartialStatefulUDF]: ... -def bind_stateful_udfs(expression: PyExpr, initialized_funcs: dict[str, Callable]) -> PyExpr: ... -def resolve_expr(expr: PyExpr, schema: PySchema) -> tuple[PyExpr, PyField]: ... -def hash(expr: PyExpr, seed: Any | None = None) -> PyExpr: ... -def cosine_distance(expr: PyExpr, other: PyExpr) -> PyExpr: ... 
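A short sketch of composing expressions with the module-level constructors above; the column name is illustrative, and operators on PyExpr build a new expression tree rather than evaluating eagerly:

    from daft.daft import col, lit

    # (price * 1.1) AS price_with_tax -- arithmetic and alias per the PyExpr stubs.
    expr = (col("price") * lit(1.1)).alias("price_with_tax")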
-def url_download( - expr: PyExpr, - max_connections: int, - raise_error_on_failure: bool, - multi_thread: bool, - config: IOConfig, -) -> PyExpr: ... -def url_upload( - expr: PyExpr, - folder_location: str, - max_connections: int, - multi_thread: bool, - io_config: IOConfig | None, -) -> PyExpr: ... -def tokenize_encode( - expr: PyExpr, - tokens_path: str, - use_special_tokens: bool, - io_config: IOConfig | None, - pattern: str | None, - special_tokens: str | None, -) -> PyExpr: ... -def tokenize_decode( - expr: PyExpr, - tokens_path: str, - io_config: IOConfig | None, - pattern: str | None, - special_tokens: str | None, -) -> PyExpr: ... -def minhash( - expr: PyExpr, - num_hashes: int, - ngram_size: int, - seed: int = 1, -) -> PyExpr: ... -def sql(sql: str, catalog: PyCatalog, daft_planning_config: PyDaftPlanningConfig) -> LogicalPlanBuilder: ... -def sql_expr(sql: str) -> PyExpr: ... -def utf8_count_matches(expr: PyExpr, patterns: PyExpr, whole_words: bool, case_sensitive: bool) -> PyExpr: ... -def list_sort(expr: PyExpr, desc: PyExpr) -> PyExpr: ... -def cbrt(expr: PyExpr) -> PyExpr: ... -def to_struct(inputs: list[PyExpr]) -> PyExpr: ... - -# --- -# expr.image namespace -# --- -def image_crop(expr: PyExpr, bbox: PyExpr) -> PyExpr: ... -def image_decode(expr: PyExpr, raise_on_error: bool, mode: ImageMode | None = None) -> PyExpr: ... -def image_encode(expr: PyExpr, image_format: ImageFormat) -> PyExpr: ... -def image_resize(expr: PyExpr, w: int, h: int) -> PyExpr: ... -def image_to_mode(expr: PyExpr, mode: ImageMode) -> PyExpr: ... - -class PyCatalog: - @staticmethod - def new() -> PyCatalog: ... - def register_table(self, name: str, logical_plan_builder: LogicalPlanBuilder) -> None: ... - -class PySeries: - @staticmethod - def from_arrow(name: str, pyarrow_array: pyarrow.Array) -> PySeries: ... - @staticmethod - def from_pylist(name: str, pylist: list[Any], pyobj: str) -> PySeries: ... - def to_pylist(self) -> list[Any]: ... - def to_arrow(self) -> pyarrow.Array: ... - def __abs__(self) -> PySeries: ... - def __add__(self, other: PySeries) -> PySeries: ... - def __sub__(self, other: PySeries) -> PySeries: ... - def __mul__(self, other: PySeries) -> PySeries: ... - def __truediv__(self, other: PySeries) -> PySeries: ... - def __mod__(self, other: PySeries) -> PySeries: ... - def __and__(self, other: PySeries) -> PySeries: ... - def __or__(self, other: PySeries) -> PySeries: ... - def __xor__(self, other: PySeries) -> PySeries: ... - def __lt__(self, other: PySeries) -> PySeries: ... - def __le__(self, other: PySeries) -> PySeries: ... - def __gt__(self, other: PySeries) -> PySeries: ... - def __ge__(self, other: PySeries) -> PySeries: ... - def __eq__(self, other: PySeries) -> PySeries: ... # type: ignore[override] - def __ne__(self, other: PySeries) -> PySeries: ... # type: ignore[override] - def __rshift__(self, other: PySeries) -> PySeries: ... - def __lshift__(self, other: PySeries) -> PySeries: ... - def take(self, idx: PySeries) -> PySeries: ... - def slice(self, start: int, end: int) -> PySeries: ... - def filter(self, mask: PySeries) -> PySeries: ... - def sort(self, descending: bool) -> PySeries: ... - def argsort(self, descending: bool) -> PySeries: ... - def hash(self, seed: PySeries | None = None) -> PySeries: ... - def minhash(self, num_hashes: int, ngram_size: int, seed: int = 1) -> PySeries: ... - def __invert__(self) -> PySeries: ... - def count(self, mode: CountMode) -> PySeries: ... - def sum(self) -> PySeries: ... - def mean(self) -> PySeries: ... 
- def min(self) -> PySeries: ... - def max(self) -> PySeries: ... - def agg_list(self) -> PySeries: ... - def cast(self, dtype: PyDataType) -> PySeries: ... - def ceil(self) -> PySeries: ... - def floor(self) -> PySeries: ... - def sign(self) -> PySeries: ... - def round(self, decimal: int) -> PySeries: ... - def sqrt(self) -> PySeries: ... - def cbrt(self) -> PySeries: ... - def sin(self) -> PySeries: ... - def cos(self) -> PySeries: ... - def tan(self) -> PySeries: ... - def cot(self) -> PySeries: ... - def arcsin(self) -> PySeries: ... - def arccos(self) -> PySeries: ... - def arctan(self) -> PySeries: ... - def arctan2(self, other: PySeries) -> PySeries: ... - def arctanh(self) -> PySeries: ... - def arccosh(self) -> PySeries: ... - def arcsinh(self) -> PySeries: ... - def degrees(self) -> PySeries: ... - def radians(self) -> PySeries: ... - def log2(self) -> PySeries: ... - def log10(self) -> PySeries: ... - def log(self, base: float) -> PySeries: ... - def ln(self) -> PySeries: ... - def exp(self) -> PySeries: ... - @staticmethod - def concat(series: list[PySeries]) -> PySeries: ... - def __len__(self) -> int: ... - def size_bytes(self) -> int: ... - def name(self) -> str: ... - def rename(self, name: str) -> PySeries: ... - def data_type(self) -> PyDataType: ... - def utf8_endswith(self, pattern: PySeries) -> PySeries: ... - def utf8_startswith(self, pattern: PySeries) -> PySeries: ... - def utf8_contains(self, pattern: PySeries) -> PySeries: ... - def utf8_match(self, pattern: PySeries) -> PySeries: ... - def utf8_split(self, pattern: PySeries, regex: bool) -> PySeries: ... - def utf8_extract(self, pattern: PySeries, index: int) -> PySeries: ... - def utf8_extract_all(self, pattern: PySeries, index: int) -> PySeries: ... - def utf8_replace(self, pattern: PySeries, replacement: PySeries, regex: bool) -> PySeries: ... - def utf8_length(self) -> PySeries: ... - def utf8_length_bytes(self) -> PySeries: ... - def utf8_lower(self) -> PySeries: ... - def utf8_upper(self) -> PySeries: ... - def utf8_lstrip(self) -> PySeries: ... - def utf8_rstrip(self) -> PySeries: ... - def utf8_reverse(self) -> PySeries: ... - def utf8_capitalize(self) -> PySeries: ... - def utf8_left(self, nchars: PySeries) -> PySeries: ... - def utf8_right(self, nchars: PySeries) -> PySeries: ... - def utf8_find(self, substr: PySeries) -> PySeries: ... - def utf8_rpad(self, length: PySeries, pad: PySeries) -> PySeries: ... - def utf8_lpad(self, length: PySeries, pad: PySeries) -> PySeries: ... - def utf8_repeat(self, n: PySeries) -> PySeries: ... - def utf8_like(self, pattern: PySeries) -> PySeries: ... - def utf8_ilike(self, pattern: PySeries) -> PySeries: ... - def utf8_substr(self, start: PySeries, length: PySeries | None = None) -> PySeries: ... - def utf8_to_date(self, format: str) -> PySeries: ... - def utf8_to_datetime(self, format: str, timezone: str | None = None) -> PySeries: ... - def utf8_normalize(self, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool) -> PySeries: ... - def utf8_count_matches(self, patterns: PySeries, whole_word: bool, case_sensitive: bool) -> PySeries: ... - def is_nan(self) -> PySeries: ... - def is_inf(self) -> PySeries: ... - def not_nan(self) -> PySeries: ... - def fill_nan(self, fill_value: PySeries) -> PySeries: ... - def dt_date(self) -> PySeries: ... - def dt_day(self) -> PySeries: ... - def dt_hour(self) -> PySeries: ... - def dt_minute(self) -> PySeries: ... - def dt_second(self) -> PySeries: ... - def dt_time(self) -> PySeries: ... 
- def dt_month(self) -> PySeries: ... - def dt_year(self) -> PySeries: ... - def dt_day_of_week(self) -> PySeries: ... - def dt_truncate(self, interval: str, relative_to: PySeries) -> PySeries: ... - def partitioning_days(self) -> PySeries: ... - def partitioning_hours(self) -> PySeries: ... - def partitioning_months(self) -> PySeries: ... - def partitioning_years(self) -> PySeries: ... - def partitioning_iceberg_bucket(self, n: int) -> PySeries: ... - def partitioning_iceberg_truncate(self, w: int) -> PySeries: ... - def list_count(self, mode: CountMode) -> PySeries: ... - def list_get(self, idx: PySeries, default: PySeries) -> PySeries: ... - def list_slice(self, start: PySeries, end: PySeries | None = None) -> PySeries: ... - def list_sort(self, desc: PySeries) -> PySeries: ... - def map_get(self, key: PySeries) -> PySeries: ... - def if_else(self, other: PySeries, predicate: PySeries) -> PySeries: ... - def is_null(self) -> PySeries: ... - def not_null(self) -> PySeries: ... - def fill_null(self, fill_value: PySeries) -> PySeries: ... - def murmur3_32(self) -> PySeries: ... - def to_str_values(self) -> PySeries: ... - def _debug_bincode_serialize(self) -> bytes: ... - @staticmethod - def _debug_bincode_deserialize(b: bytes) -> PySeries: ... - -class PyTable: - def schema(self) -> PySchema: ... - def cast_to_schema(self, schema: PySchema) -> PyTable: ... - def eval_expression_list(self, exprs: list[PyExpr]) -> PyTable: ... - def take(self, idx: PySeries) -> PyTable: ... - def filter(self, exprs: list[PyExpr]) -> PyTable: ... - def sort(self, sort_keys: list[PyExpr], descending: list[bool]) -> PyTable: ... - def argsort(self, sort_keys: list[PyExpr], descending: list[bool]) -> PySeries: ... - def agg(self, to_agg: list[PyExpr], group_by: list[PyExpr]) -> PyTable: ... - def pivot( - self, - group_by: list[PyExpr], - pivot_column: PyExpr, - values_column: PyExpr, - names: list[str], - ) -> PyTable: ... - def hash_join( - self, - right: PyTable, - left_on: list[PyExpr], - right_on: list[PyExpr], - how: JoinType, - ) -> PyTable: ... - def sort_merge_join( - self, - right: PyTable, - left_on: list[PyExpr], - right_on: list[PyExpr], - is_sorted: bool, - ) -> PyTable: ... - def explode(self, to_explode: list[PyExpr]) -> PyTable: ... - def head(self, num: int) -> PyTable: ... - def sample_by_fraction(self, fraction: float, with_replacement: bool, seed: int | None) -> PyTable: ... - def sample_by_size(self, size: int, with_replacement: bool, seed: int | None) -> PyTable: ... - def quantiles(self, num: int) -> PyTable: ... - def partition_by_hash(self, exprs: list[PyExpr], num_partitions: int) -> list[PyTable]: ... - def partition_by_random(self, num_partitions: int, seed: int) -> list[PyTable]: ... - def partition_by_range( - self, partition_keys: list[PyExpr], boundaries: PyTable, descending: list[bool] - ) -> list[PyTable]: ... - def partition_by_value(self, partition_keys: list[PyExpr]) -> tuple[list[PyTable], PyTable]: ... - def add_monotonically_increasing_id(self, partition_num: int, column_name: str) -> PyTable: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - def __len__(self) -> int: ... - def size_bytes(self) -> int: ... - def column_names(self) -> list[str]: ... - def get_column(self, name: str) -> PySeries: ... - def get_column_by_index(self, idx: int) -> PySeries: ... - @staticmethod - def concat(tables: list[PyTable]) -> PyTable: ... - def slice(self, start: int, end: int) -> PyTable: ... 
- @staticmethod - def from_arrow_record_batches(record_batches: list[pyarrow.RecordBatch], schema: PySchema) -> PyTable: ... - @staticmethod - def from_pylist_series(dict: dict[str, PySeries]) -> PyTable: ... - def to_arrow_record_batch(self) -> pyarrow.RecordBatch: ... - @staticmethod - def empty(schema: PySchema | None = None) -> PyTable: ... - -class PyMicroPartition: - def schema(self) -> PySchema: ... - def column_names(self) -> list[str]: ... - def get_column(self, name: str) -> PySeries: ... - def size_bytes(self) -> int | None: ... - def _repr_html_(self) -> str: ... - @staticmethod - def empty(schema: PySchema | None = None) -> PyMicroPartition: ... - @staticmethod - def from_scan_task(scan_task: ScanTask) -> PyMicroPartition: ... - @staticmethod - def from_tables(tables: list[PyTable]) -> PyMicroPartition: ... - @staticmethod - def from_arrow_record_batches(record_batches: list[pyarrow.RecordBatch], schema: PySchema) -> PyMicroPartition: ... - @staticmethod - def concat(tables: list[PyMicroPartition]) -> PyMicroPartition: ... - def slice(self, start: int, end: int) -> PyMicroPartition: ... - def to_table(self) -> PyTable: ... - def cast_to_schema(self, schema: PySchema) -> PyMicroPartition: ... - def eval_expression_list(self, exprs: list[PyExpr]) -> PyMicroPartition: ... - def take(self, idx: PySeries) -> PyMicroPartition: ... - def filter(self, exprs: list[PyExpr]) -> PyMicroPartition: ... - def sort(self, sort_keys: list[PyExpr], descending: list[bool]) -> PyMicroPartition: ... - def argsort(self, sort_keys: list[PyExpr], descending: list[bool]) -> PySeries: ... - def agg(self, to_agg: list[PyExpr], group_by: list[PyExpr]) -> PyMicroPartition: ... - def hash_join( - self, - right: PyMicroPartition, - left_on: list[PyExpr], - right_on: list[PyExpr], - how: JoinType, - ) -> PyMicroPartition: ... - def pivot( - self, - group_by: list[PyExpr], - pivot_column: PyExpr, - values_column: PyExpr, - names: list[str], - ) -> PyMicroPartition: ... - def sort_merge_join( - self, - right: PyMicroPartition, - left_on: list[PyExpr], - right_on: list[PyExpr], - is_sorted: bool, - ) -> PyMicroPartition: ... - def explode(self, to_explode: list[PyExpr]) -> PyMicroPartition: ... - def unpivot( - self, - ids: list[PyExpr], - values: list[PyExpr], - variable_name: str, - value_name: str, - ) -> PyMicroPartition: ... - def head(self, num: int) -> PyMicroPartition: ... - def sample_by_fraction(self, fraction: float, with_replacement: bool, seed: int | None) -> PyMicroPartition: ... - def sample_by_size(self, size: int, with_replacement: bool, seed: int | None) -> PyMicroPartition: ... - def quantiles(self, num: int) -> PyMicroPartition: ... - def partition_by_hash(self, exprs: list[PyExpr], num_partitions: int) -> list[PyMicroPartition]: ... - def partition_by_random(self, num_partitions: int, seed: int) -> list[PyMicroPartition]: ... - def partition_by_range( - self, partition_keys: list[PyExpr], boundaries: PyTable, descending: list[bool] - ) -> list[PyMicroPartition]: ... - def partition_by_value(self, exprs: list[PyExpr]) -> tuple[list[PyMicroPartition], PyMicroPartition]: ... - def add_monotonically_increasing_id(self, partition_num: int, column_name: str) -> PyMicroPartition: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... 
- @classmethod - def read_parquet( - cls, - path: str, - columns: list[str] | None = None, - start_offset: int | None = None, - num_rows: int | None = None, - row_groups: list[int] | None = None, - predicate: PyExpr | None = None, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, - coerce_int96_timestamp_unit: PyTimeUnit = PyTimeUnit.nanoseconds(), - ): ... - @classmethod - def read_parquet_bulk( - cls, - uris: list[str], - columns: list[str] | None = None, - start_offset: int | None = None, - num_rows: int | None = None, - row_groups: list[list[int] | None] | None = None, - predicate: PyExpr | None = None, - io_config: IOConfig | None = None, - num_parallel_tasks: int | None = None, - multithreaded_io: bool | None = None, - coerce_int96_timestamp_unit: PyTimeUnit | None = None, - ): ... - @classmethod - def read_csv( - cls, - uri: str, - convert_options: CsvConvertOptions | None = None, - parse_options: CsvParseOptions | None = None, - read_options: CsvReadOptions | None = None, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, - ): ... - @classmethod - def read_json_native( - cls, - uri: str, - convert_options: JsonConvertOptions | None = None, - parse_options: JsonParseOptions | None = None, - read_options: JsonReadOptions | None = None, - io_config: IOConfig | None = None, - multithreaded_io: bool | None = None, - ): ... - -class PhysicalPlanScheduler: - """ - A work scheduler for physical query plans. - """ - - @staticmethod - def from_logical_plan_builder( - logical_plan_builder: LogicalPlanBuilder, - cfg: PyDaftExecutionConfig, - ) -> PhysicalPlanScheduler: ... - def num_partitions(self) -> int: ... - def repr_ascii(self, simple: bool) -> str: ... - def repr_mermaid(self, options: MermaidOptions) -> str: ... - def to_partition_tasks(self, psets: dict[str, list[PartitionT]]) -> physical_plan.InProgressPhysicalPlan: ... - def run(self, psets: dict[str, list[PartitionT]]) -> Iterator[PyMicroPartition]: ... - -class AdaptivePhysicalPlanScheduler: - """ - An adaptive Physical Plan Scheduler. - """ - - @staticmethod - def from_logical_plan_builder( - logical_plan_builder: LogicalPlanBuilder, - cfg: PyDaftExecutionConfig, - ) -> AdaptivePhysicalPlanScheduler: ... - def next(self) -> tuple[int | None, PhysicalPlanScheduler]: ... - def is_done(self) -> bool: ... - # Todo use in memory info here instead - def update( - self, - source_id: int, - partition_key: str, - cache_entry: PartitionCacheEntry, - num_partitions: int, - size_bytes: int, - num_rows: int, - ) -> None: ... - -class LogicalPlanBuilder: - """ - A logical plan builder, which simplifies constructing logical plans via - a fluent interface. E.g., LogicalPlanBuilder.table_scan(..).project(..).filter(..). - - This builder holds the current root (sink) of the logical plan, and the building methods return - a brand new builder holding a new plan; i.e., this is an immutable builder. - """ - - @staticmethod - def in_memory_scan( - partition_key: str, - cache_entry: PartitionCacheEntry, - schema: PySchema, - num_partitions: int, - size_bytes: int, - num_rows: int, - ) -> LogicalPlanBuilder: ... - @staticmethod - def table_scan(scan_operator: ScanOperatorHandle) -> LogicalPlanBuilder: ... - def with_planning_config(self, daft_planning_config: PyDaftPlanningConfig) -> LogicalPlanBuilder: ... - def select(self, to_select: list[PyExpr]) -> LogicalPlanBuilder: ... - def with_columns(self, columns: list[PyExpr]) -> LogicalPlanBuilder: ... 
- def exclude(self, to_exclude: list[str]) -> LogicalPlanBuilder: ... - def filter(self, predicate: PyExpr) -> LogicalPlanBuilder: ... - def limit(self, limit: int, eager: bool) -> LogicalPlanBuilder: ... - def explode(self, to_explode: list[PyExpr]) -> LogicalPlanBuilder: ... - def unpivot( - self, - ids: list[PyExpr], - values: list[PyExpr], - variable_name: str, - value_name: str, - ) -> LogicalPlanBuilder: ... - def sort(self, sort_by: list[PyExpr], descending: list[bool]) -> LogicalPlanBuilder: ... - def hash_repartition( - self, - partition_by: list[PyExpr], - num_partitions: int | None, - ) -> LogicalPlanBuilder: ... - def random_shuffle(self, num_partitions: int | None) -> LogicalPlanBuilder: ... - def into_partitions(self, num_partitions: int) -> LogicalPlanBuilder: ... - def coalesce(self, num_partitions: int) -> LogicalPlanBuilder: ... - def distinct(self) -> LogicalPlanBuilder: ... - def sample(self, fraction: float, with_replacement: bool, seed: int | None) -> LogicalPlanBuilder: ... - def aggregate(self, agg_exprs: list[PyExpr], groupby_exprs: list[PyExpr]) -> LogicalPlanBuilder: ... - def pivot( - self, - groupby_exprs: list[PyExpr], - pivot_expr: PyExpr, - values_expr: PyExpr, - agg_expr: PyExpr, - names: list[str], - ) -> LogicalPlanBuilder: ... - def join( - self, - right: LogicalPlanBuilder, - left_on: list[PyExpr], - right_on: list[PyExpr], - join_type: JoinType, - strategy: JoinStrategy | None = None, - ) -> LogicalPlanBuilder: ... - def concat(self, other: LogicalPlanBuilder) -> LogicalPlanBuilder: ... - def add_monotonically_increasing_id(self, column_name: str | None) -> LogicalPlanBuilder: ... - def table_write( - self, - root_dir: str, - file_format: FileFormat, - partition_cols: list[PyExpr] | None = None, - compression: str | None = None, - io_config: IOConfig | None = None, - ) -> LogicalPlanBuilder: ... - def iceberg_write( - self, - table_name: str, - table_location: str, - spec_id: int, - iceberg_schema: IcebergSchema, - iceberg_properties: IcebergTableProperties, - catalog_columns: list[str], - io_config: IOConfig | None = None, - ) -> LogicalPlanBuilder: ... - def delta_write( - self, - path: str, - columns_name: list[str], - mode: str, - version: int, - large_dtypes: bool, - io_config: IOConfig | None = None, - ) -> LogicalPlanBuilder: ... - def lance_write( - self, - path: str, - columns_name: list[str], - mode: str, - io_config: IOConfig | None = None, - kwargs: dict[str, Any] | None = None, - ) -> LogicalPlanBuilder: ... - def schema(self) -> PySchema: ... - def optimize(self) -> LogicalPlanBuilder: ... - def to_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> PhysicalPlanScheduler: ... - def to_adaptive_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> AdaptivePhysicalPlanScheduler: ... - def repr_ascii(self, simple: bool) -> str: ... - def repr_mermaid(self, options: MermaidOptions) -> str: ... - -class NativeExecutor: - @staticmethod - def from_logical_plan_builder( - logical_plan_builder: LogicalPlanBuilder, - ) -> NativeExecutor: ... - def run( - self, psets: dict[str, list[PartitionT]], cfg: PyDaftExecutionConfig, results_buffer_size: int | None - ) -> Iterator[PyMicroPartition]: ... - -class PyDaftExecutionConfig: - @staticmethod - def from_env() -> PyDaftExecutionConfig: ... 
- def with_config_values( - self, - scan_tasks_min_size_bytes: int | None = None, - scan_tasks_max_size_bytes: int | None = None, - broadcast_join_size_bytes_threshold: int | None = None, - parquet_split_row_groups_max_files: int | None = None, - sort_merge_join_sort_with_aligned_boundaries: bool | None = None, - hash_join_partition_size_leniency: float | None = None, - sample_size_for_sort: int | None = None, - num_preview_rows: int | None = None, - parquet_target_filesize: int | None = None, - parquet_target_row_group_size: int | None = None, - parquet_inflation_factor: float | None = None, - csv_target_filesize: int | None = None, - csv_inflation_factor: float | None = None, - shuffle_aggregation_default_partitions: int | None = None, - read_sql_partition_size_bytes: int | None = None, - enable_aqe: bool | None = None, - enable_native_executor: bool | None = None, - default_morsel_size: int | None = None, - ) -> PyDaftExecutionConfig: ... - @property - def scan_tasks_min_size_bytes(self) -> int: ... - @property - def scan_tasks_max_size_bytes(self) -> int: ... - @property - def broadcast_join_size_bytes_threshold(self) -> int: ... - @property - def sort_merge_join_sort_with_aligned_boundaries(self) -> bool: ... - @property - def hash_join_partition_size_leniency(self) -> float: ... - @property - def sample_size_for_sort(self) -> int: ... - @property - def num_preview_rows(self) -> int: ... - @property - def parquet_target_filesize(self) -> int: ... - @property - def parquet_target_row_group_size(self) -> int: ... - @property - def parquet_inflation_factor(self) -> float: ... - @property - def csv_target_filesize(self) -> int: ... - @property - def csv_inflation_factor(self) -> float: ... - @property - def shuffle_aggregation_default_partitions(self) -> int: ... - @property - def read_sql_partition_size_bytes(self) -> int: ... - @property - def enable_aqe(self) -> bool: ... - @property - def enable_native_executor(self) -> bool: ... - @property - def default_morsel_size(self) -> int: ... - -class PyDaftPlanningConfig: - @staticmethod - def from_env() -> PyDaftPlanningConfig: ... - def with_config_values( - self, - default_io_config: IOConfig | None = None, - enable_actor_pool_projections: bool | None = None, - ) -> PyDaftPlanningConfig: ... - @property - def default_io_config(self) -> IOConfig: ... - @property - def enable_actor_pool_projections(self) -> bool: ... - -def build_type() -> str: ... -def version() -> str: ... -def refresh_logger() -> None: ... -def get_max_log_level() -> str: ... -def __getattr__(name) -> Any: ... -def io_glob( - path: str, - multithreaded_io: bool | None = None, - io_config: IOConfig | None = None, - fanout_limit: int | None = None, - page_size: int | None = None, - limit: int | None = None, -) -> list[dict]: ... - -class SystemInfo: - """ - Accessor for system information. - """ - - def __init__(self) -> None: ... - def total_memory(self) -> int: ... - def cpu_count(self) -> int | None: ... 
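A quick sketch of the introspection helpers stubbed above:

    from daft.daft import SystemInfo, build_type, version

    print(version(), build_type())
    info = SystemInfo()
    print(info.total_memory(), info.cpu_count())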
diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi index e69de29bb2..f78e2ae166 100644 --- a/daft/daft/__init__.pyi +++ b/daft/daft/__init__.pyi @@ -0,0 +1,1820 @@ +import builtins +import datetime +from enum import Enum +from typing import TYPE_CHECKING, Any, Callable, Iterator + +import pyarrow + +from daft.dataframe.display import MermaidOptions +from daft.execution import physical_plan +from daft.io.scan import ScanOperator +from daft.plan_scheduler.physical_plan_scheduler import PartitionT +from daft.runners.partitioning import PartitionCacheEntry +from daft.sql.sql_connection import SQLConnection +from daft.udf import PartialStatefulUDF, PartialStatelessUDF + +if TYPE_CHECKING: + import pyarrow as pa + from pyiceberg.schema import Schema as IcebergSchema + from pyiceberg.table import TableProperties as IcebergTableProperties + +class ImageMode(Enum): + """ + Supported image modes for Daft's image type. + + .. warning:: + Currently, only the 8-bit modes (L, LA, RGB, RGBA) can be stored in a DataFrame. + If your binary image data includes other modes, use the `mode` argument + in `image.decode` to convert the images to a supported mode. + """ + + #: 8-bit grayscale + L: int + + #: 8-bit grayscale + alpha + LA: int + + #: 8-bit RGB + RGB: int + + #: 8-bit RGB + alpha + RGBA: int + + #: 16-bit grayscale + L16: int + + #: 16-bit grayscale + alpha + LA16: int + + #: 16-bit RGB + RGB16: int + + #: 16-bit RGB + alpha + RGBA16: int + + #: 32-bit floating RGB + RGB32F: int + + #: 32-bit floating RGB + alpha + RGBA32F: int + + @staticmethod + def from_mode_string(mode: str) -> ImageMode: + """ + Create an ImageMode from its string representation. + + Args: + mode: String representation of the mode. This is the same as the enum + attribute name, e.g. ``ImageMode.from_mode_string("RGB")`` would + return ``ImageMode.RGB``. + """ + ... + +class ImageFormat(Enum): + """ + Supported image formats for Daft's image I/O. + """ + + PNG: int + JPEG: int + TIFF: int + GIF: int + BMP: int + + @staticmethod + def from_format_string(mode: str) -> ImageFormat: + """ + Create an ImageFormat from its string representation. + """ + ... + +class JoinType(Enum): + """ + Type of a join operation. + """ + + Inner: int + Left: int + Right: int + Outer: int + Semi: int + Anti: int + + @staticmethod + def from_join_type_str(join_type: str) -> JoinType: + """ + Create a JoinType from its string representation. + + Args: + join_type: String representation of the join type. This is the same as the enum + attribute name (but snake-case), e.g. ``JoinType.from_join_type_str("inner")`` would + return ``JoinType.Inner``. + """ + ... + +class JoinStrategy(Enum): + """ + Join strategy (algorithm) to use. + """ + + Hash: int + SortMerge: int + Broadcast: int + + @staticmethod + def from_join_strategy_str(join_strategy: str) -> JoinStrategy: + """ + Create a JoinStrategy from its string representation. + + Args: + join_strategy: String representation of the join strategy. This is the same as the enum + attribute name (but snake-case), e.g. ``JoinStrategy.from_join_strategy_str("sort_merge")`` would + return ``JoinStrategy.SortMerge``. + """ + ... + +class CountMode(Enum): + """ + Supported count modes for Daft's count aggregation. + + | All - Count both non-null and null values. + | Valid - Count only valid values. + | Null - Count only null values. + """ + + All: int + Valid: int + Null: int + + @staticmethod + def from_count_mode_str(count_mode: str) -> CountMode: + """ + Create a CountMode from its string representation.
+ + Args: + count_mode: String representation of the count mode, e.g. "all", "valid", or "null". + """ + ... + +class ResourceRequest: + """ + Resource request for a query fragment task. + """ + + num_cpus: float | None + num_gpus: float | None + memory_bytes: int | None + + def __init__( + self, + num_cpus: float | None = None, + num_gpus: float | None = None, + memory_bytes: int | None = None, + ): ... + @staticmethod + def max_resources(resource_requests: list[ResourceRequest]): + """Take a field-wise max of the list of resource requests.""" + ... + + def with_num_cpus(self, num_cpus: float | None) -> ResourceRequest: ... + def with_num_gpus(self, num_gpus: float | None) -> ResourceRequest: ... + def with_memory_bytes(self, memory_bytes: int | None) -> ResourceRequest: ... + def __mul__(self, factor: float) -> ResourceRequest: ... + def __add__(self, other: ResourceRequest) -> ResourceRequest: ... + def __repr__(self) -> str: ... + def __eq__(self, other: ResourceRequest) -> bool: ... # type: ignore[override] + def __ne__(self, other: ResourceRequest) -> bool: ... # type: ignore[override] + +class FileFormat(Enum): + """ + Format of a file, e.g. Parquet, CSV, and JSON. + """ + + Parquet: int + Csv: int + Json: int + + def ext(self): ... + +class ParquetSourceConfig: + """ + Configuration of a Parquet data source. + """ + + coerce_int96_timestamp_unit: PyTimeUnit | None + field_id_mapping: dict[int, PyField] | None + row_groups: list[list[int]] | None + chunk_size: int | None + + def __init__( + self, + coerce_int96_timestamp_unit: PyTimeUnit | None = None, + field_id_mapping: dict[int, PyField] | None = None, + row_groups: list[list[int]] | None = None, + chunk_size: int | None = None, + ): ... + +class CsvSourceConfig: + """ + Configuration of a CSV data source. + """ + + delimiter: str | None + has_headers: bool + double_quote: bool + quote: str | None + escape_char: str | None + comment: str | None + allow_variable_columns: bool + buffer_size: int | None + chunk_size: int | None + + def __init__( + self, + has_headers: bool, + double_quote: bool, + allow_variable_columns: bool, + delimiter: str | None, + quote: str | None, + escape_char: str | None, + comment: str | None, + buffer_size: int | None = None, + chunk_size: int | None = None, + ): ... + +class JsonSourceConfig: + """ + Configuration of a JSON data source. + """ + + buffer_size: int | None + chunk_size: int | None + + def __init__( + self, + buffer_size: int | None = None, + chunk_size: int | None = None, + ): ... + +class DatabaseSourceConfig: + """ + Configuration of a database data source. + """ + + sql: str + conn: SQLConnection + + def __init__(self, sql: str, conn_factory: SQLConnection): ... + +class FileFormatConfig: + """ + Configuration for parsing a particular file format (Parquet, CSV, JSON). + """ + + config: ParquetSourceConfig | CsvSourceConfig | JsonSourceConfig | DatabaseSourceConfig + + @staticmethod + def from_parquet_config(config: ParquetSourceConfig) -> FileFormatConfig: + """ + Create a Parquet file format config. + """ + ... + + @staticmethod + def from_csv_config(config: CsvSourceConfig) -> FileFormatConfig: + """ + Create a CSV file format config. + """ + ... + + @staticmethod + def from_json_config(config: JsonSourceConfig) -> FileFormatConfig: + """ + Create a JSON file format config. + """ + ... + + @staticmethod + def from_database_config(config: DatabaseSourceConfig) -> FileFormatConfig: + """ + Create a database file format config. + """ + ...
+ + def file_format(self) -> FileFormat: + """ + Get the file format for this config. + """ + ... + + def __eq__(self, other: FileFormatConfig) -> bool: ... # type: ignore[override] + def __ne__(self, other: FileFormatConfig) -> bool: ... # type: ignore[override] + +class CsvConvertOptions: + """ + Options for converting CSV data to Daft data. + """ + + limit: int | None + include_columns: list[str] | None + column_names: list[str] | None + schema: PySchema | None + predicate: PyExpr | None + + def __init__( + self, + limit: int | None = None, + include_columns: list[str] | None = None, + column_names: list[str] | None = None, + schema: PySchema | None = None, + predicate: PyExpr | None = None, + ): ... + +class CsvParseOptions: + """ + Options for parsing CSV files. + """ + + has_header: bool + delimiter: str | None + double_quote: bool + quote: str | None + allow_variable_columns: bool + escape_char: str | None + comment: str | None + + def __init__( + self, + has_header: bool = True, + delimiter: str | None = None, + double_quote: bool = True, + quote: str | None = None, + allow_variable_columns: bool = False, + escape_char: str | None = None, + comment: str | None = None, + ): ... + +class CsvReadOptions: + """ + Options for reading CSV files. + """ + + buffer_size: int | None + chunk_size: int | None + + def __init__( + self, + buffer_size: int | None = None, + chunk_size: int | None = None, + ): ... + +class JsonConvertOptions: + """ + Options for converting JSON data to Daft data. + """ + + limit: int | None + include_columns: list[str] | None + schema: PySchema | None + + def __init__( + self, + limit: int | None = None, + include_columns: list[str] | None = None, + schema: PySchema | None = None, + ): ... + +class JsonParseOptions: + """ + Options for parsing JSON files. + """ + +class JsonReadOptions: + """ + Options for reading JSON files. + """ + + buffer_size: int | None + chunk_size: int | None + + def __init__( + self, + buffer_size: int | None = None, + chunk_size: int | None = None, + ): ... + +class FileInfo: + """ + Metadata for a single file. + """ + + file_path: str + file_size: int | None + num_rows: int | None + +class FileInfos: + """ + Metadata for a collection of files. + """ + + file_paths: list[str] + file_sizes: list[int | None] + num_rows: list[int | None] + + @staticmethod + def from_infos(file_paths: list[str], file_sizes: list[int | None], num_rows: list[int | None]) -> FileInfos: ... + @staticmethod + def from_table(table: PyTable) -> FileInfos: + """ + Create from a Daft table with "path", "size", and "num_rows" columns. + """ + ... + + def extend(self, new_infos: FileInfos) -> FileInfos: + """ + Concatenate two FileInfos together. + """ + ... + + def __getitem__(self, idx: int) -> FileInfo: ... + def to_table(self) -> PyTable: + """ + Convert to a Daft table with "path", "size", and "num_rows" columns. + """ + + def __len__(self) -> int: ... + +class HTTPConfig: + """ + I/O configuration for accessing HTTP systems + """ + + bearer_token: str | None + + def __init__(self, bearer_token: str | None = None): ... + +class S3Config: + """ + I/O configuration for accessing an S3-compatible system. 
+ """ + + region_name: str | None + endpoint_url: str | None + key_id: str | None + session_token: str | None + access_key: str | None + credentials_provider: Callable[[], S3Credentials] | None + max_connections: int + retry_initial_backoff_ms: int + connect_timeout_ms: int + read_timeout_ms: int + num_tries: int + retry_mode: str | None + anonymous: bool + use_ssl: bool + verify_ssl: bool + check_hostname_ssl: bool + requester_pays: bool | None + force_virtual_addressing: bool | None + profile_name: str | None + + def __init__( + self, + region_name: str | None = None, + endpoint_url: str | None = None, + key_id: str | None = None, + session_token: str | None = None, + access_key: str | None = None, + credentials_provider: Callable[[], S3Credentials] | None = None, + buffer_time: int | None = None, + max_connections: int | None = None, + retry_initial_backoff_ms: int | None = None, + connect_timeout_ms: int | None = None, + read_timeout_ms: int | None = None, + num_tries: int | None = None, + retry_mode: str | None = None, + anonymous: bool | None = None, + use_ssl: bool | None = None, + verify_ssl: bool | None = None, + check_hostname_ssl: bool | None = None, + requester_pays: bool | None = None, + force_virtual_addressing: bool | None = None, + profile_name: str | None = None, + ): ... + def replace( + self, + region_name: str | None = None, + endpoint_url: str | None = None, + key_id: str | None = None, + session_token: str | None = None, + access_key: str | None = None, + credentials_provider: Callable[[], S3Credentials] | None = None, + max_connections: int | None = None, + retry_initial_backoff_ms: int | None = None, + connect_timeout_ms: int | None = None, + read_timeout_ms: int | None = None, + num_tries: int | None = None, + retry_mode: str | None = None, + anonymous: bool | None = None, + use_ssl: bool | None = None, + verify_ssl: bool | None = None, + check_hostname_ssl: bool | None = None, + requester_pays: bool | None = None, + force_virtual_addressing: bool | None = None, + profile_name: str | None = None, + ) -> S3Config: + """Replaces values if provided, returning a new S3Config""" + ... + + @staticmethod + def from_env() -> S3Config: + """Creates an S3Config, retrieving credentials and configurations from the current environment""" + ... + +class S3Credentials: + key_id: str + access_key: str + session_token: str | None + expiry: datetime.datetime | None + + def __init__( + self, + key_id: str, + access_key: str, + session_token: str | None = None, + expiry: datetime.datetime | None = None, + ): ... + +class AzureConfig: + """ + I/O configuration for accessing Azure Blob Storage. + """ + + storage_account: str | None + access_key: str | None + sas_token: str | None + bearer_token: str | None + tenant_id: str | None + client_id: str | None + client_secret: str | None + use_fabric_endpoint: bool | None + anonymous: bool | None + endpoint_url: str | None = None + use_ssl: bool | None = None + + def __init__( + self, + storage_account: str | None = None, + access_key: str | None = None, + sas_token: str | None = None, + bearer_token: str | None = None, + tenant_id: str | None = None, + client_id: str | None = None, + client_secret: str | None = None, + use_fabric_endpoint: bool | None = None, + anonymous: bool | None = None, + endpoint_url: str | None = None, + use_ssl: bool | None = None, + ): ... 
+ def replace( + self, + storage_account: str | None = None, + access_key: str | None = None, + sas_token: str | None = None, + bearer_token: str | None = None, + tenant_id: str | None = None, + client_id: str | None = None, + client_secret: str | None = None, + use_fabric_endpoint: bool | None = None, + anonymous: bool | None = None, + endpoint_url: str | None = None, + use_ssl: bool | None = None, + ) -> AzureConfig: + """Replaces values if provided, returning a new AzureConfig""" + ... + +class GCSConfig: + """ + I/O configuration for accessing Google Cloud Storage. + """ + + project_id: str | None + credentials: str | None + token: str | None + anonymous: bool + + def __init__( + self, + project_id: str | None = None, + credentials: str | None = None, + token: str | None = None, + anonymous: bool | None = None, + ): ... + def replace( + self, + project_id: str | None = None, + credentials: str | None = None, + token: str | None = None, + anonymous: bool | None = None, + ) -> GCSConfig: + """Replaces values if provided, returning a new GCSConfig""" + ... + +class IOConfig: + """ + Configuration for the native I/O layer, e.g. credentials for accessing cloud storage systems. + """ + + s3: S3Config + azure: AzureConfig + gcs: GCSConfig + http: HTTPConfig + + def __init__( + self, + s3: S3Config | None = None, + azure: AzureConfig | None = None, + gcs: GCSConfig | None = None, + http: HTTPConfig | None = None, + ): ... + @staticmethod + def from_json(input: str) -> IOConfig: + """ + Recreate an IOConfig from a JSON string. + """ + ... + + def replace( + self, + s3: S3Config | None = None, + azure: AzureConfig | None = None, + gcs: GCSConfig | None = None, + http: HTTPConfig | None = None, + ) -> IOConfig: + """Replaces values if provided, returning a new IOConfig""" + ... + +class NativeStorageConfig: + """ + Storage configuration for the Rust-native I/O layer. + """ + + # Whether or not to use a multithreaded tokio runtime for processing I/O + multithreaded_io: bool + io_config: IOConfig + + def __init__(self, multithreaded_io: bool, io_config: IOConfig): ... + +class PythonStorageConfig: + """ + Storage configuration for the legacy Python I/O layer. + """ + + io_config: IOConfig + + def __init__(self, io_config: IOConfig): ... + +class StorageConfig: + """ + Configuration for interacting with a particular storage backend, using a particular + I/O layer implementation. + """ + + @staticmethod + def native(config: NativeStorageConfig) -> StorageConfig: + """ + Create from a native storage config. + """ + ... + + @staticmethod + def python(config: PythonStorageConfig) -> StorageConfig: + """ + Create from a Python storage config. + """ + ... + + @property + def config(self) -> NativeStorageConfig | PythonStorageConfig: ... + +class ScanTask: + """ + A batch of scan tasks for reading data from an external source. + """ + + def num_rows(self) -> int: + """ + Get number of rows that will be scanned by this ScanTask. + """ + ... + + def estimate_in_memory_size_bytes(self, cfg: PyDaftExecutionConfig) -> int: + """ + Estimate the In Memory Size of this ScanTask. + """ + ... + + @staticmethod + def catalog_scan_task( + file: str, + file_format: FileFormatConfig, + schema: PySchema, + storage_config: StorageConfig, + num_rows: int | None, + size_bytes: int | None, + iceberg_delete_files: list[str] | None, + pushdowns: Pushdowns | None, + partition_values: PyTable | None, + stats: PyTable | None, + ) -> ScanTask | None: + """ + Create a Catalog Scan Task + """ + ... 
+ + @staticmethod + def sql_scan_task( + url: str, + file_format: FileFormatConfig, + schema: PySchema, + num_rows: int | None, + storage_config: StorageConfig, + size_bytes: int | None, + pushdowns: Pushdowns | None, + stats: PyTable | None, + ) -> ScanTask: + """ + Create a SQL Scan Task + """ + ... + + @staticmethod + def python_factory_func_scan_task( + module: str, + func_name: str, + func_args: tuple[Any, ...], + schema: PySchema, + num_rows: int | None, + size_bytes: int | None, + pushdowns: Pushdowns | None, + stats: PyTable | None, + ) -> ScanTask: + """ + Create a Python factory function Scan Task + """ + ... + +class ScanOperatorHandle: + """ + A handle to a scan operator. + """ + + @staticmethod + def anonymous_scan( + files: list[str], + schema: PySchema, + file_format_config: FileFormatConfig, + storage_config: StorageConfig, + ) -> ScanOperatorHandle: ... + @staticmethod + def glob_scan( + glob_path: list[str], + file_format_config: FileFormatConfig, + storage_config: StorageConfig, + infer_schema: bool, + schema: PySchema | None = None, + ) -> ScanOperatorHandle: ... + @staticmethod + def from_python_scan_operator(operator: ScanOperator) -> ScanOperatorHandle: ... + +class PartitionField: + """ + Partitioning Field of a Scan Source such as Hive or Iceberg + """ + + field: PyField + + def __init__( + self, + field: PyField, + source_field: PyField | None = None, + transform: PartitionTransform | None = None, + ) -> None: ... + +class PartitionTransform: + """ + Partitioning Transform from a Data Catalog source field to a Partitioning Column + """ + + @staticmethod + def identity() -> PartitionTransform: ... + @staticmethod + def year() -> PartitionTransform: ... + @staticmethod + def month() -> PartitionTransform: ... + @staticmethod + def day() -> PartitionTransform: ... + @staticmethod + def hour() -> PartitionTransform: ... + @staticmethod + def iceberg_bucket(n: int) -> PartitionTransform: ... + @staticmethod + def iceberg_truncate(w: int) -> PartitionTransform: ... + +class Pushdowns: + """ + Pushdowns from the query optimizer that can optimize scanning data sources. + """ + + columns: list[str] | None + filters: PyExpr | None + partition_filters: PyExpr | None + limit: int | None + + def filter_required_column_names(self) -> list[str]: + """List of field names that are required by the filter predicate.""" + ... + +def read_parquet( + uri: str, + columns: list[str] | None = None, + start_offset: int | None = None, + num_rows: int | None = None, + row_groups: list[int] | None = None, + predicate: PyExpr | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + coerce_int96_timestamp_unit: PyTimeUnit | None = None, +): ... +def read_parquet_bulk( + uris: list[str], + columns: list[str] | None = None, + start_offset: int | None = None, + num_rows: int | None = None, + row_groups: list[list[int] | None] | None = None, + predicate: PyExpr | None = None, + io_config: IOConfig | None = None, + num_parallel_tasks: int | None = 128, + multithreaded_io: bool | None = None, + coerce_int96_timestamp_unit: PyTimeUnit | None = None, +): ... +def read_parquet_statistics( + uris: PySeries, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, +): ...
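A sketch of the PartitionTransform constructors re-added above; the bucket count is illustrative:

    from daft.daft import PartitionTransform

    transforms = [
        PartitionTransform.identity(),
        PartitionTransform.day(),
        PartitionTransform.iceberg_bucket(16),  # hypothetical bucket count
    ]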
+def read_parquet_into_pyarrow( + uri: str, + columns: list[str] | None = None, + start_offset: int | None = None, + num_rows: int | None = None, + row_groups: list[int] | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + coerce_int96_timestamp_unit: PyTimeUnit | None = None, + file_timeout_ms: int | None = None, +): ... +def read_parquet_into_pyarrow_bulk( + uris: list[str], + columns: list[str] | None = None, + start_offset: int | None = None, + num_rows: int | None = None, + row_groups: list[list[int] | None] | None = None, + io_config: IOConfig | None = None, + num_parallel_tasks: int | None = 128, + multithreaded_io: bool | None = None, + coerce_int96_timestamp_unit: PyTimeUnit | None = None, +): ... +def read_parquet_schema( + uri: str, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + coerce_int96_timestamp_unit: PyTimeUnit | None = None, +): ... +def read_csv( + uri: str, + convert_options: CsvConvertOptions | None = None, + parse_options: CsvParseOptions | None = None, + read_options: CsvReadOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, +): ... +def read_csv_schema( + uri: str, + parse_options: CsvParseOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, +): ... +def read_json( + uri: str, + convert_options: JsonConvertOptions | None = None, + parse_options: JsonParseOptions | None = None, + read_options: JsonReadOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + max_chunks_in_flight: int | None = None, +): ... +def read_json_schema( + uri: str, + parse_options: JsonParseOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, +): ... + +class PyTimeUnit: + @staticmethod + def nanoseconds() -> PyTimeUnit: ... + @staticmethod + def microseconds() -> PyTimeUnit: ... + @staticmethod + def milliseconds() -> PyTimeUnit: ... + @staticmethod + def seconds() -> PyTimeUnit: ... + +class PyDataType: + @staticmethod + def null() -> PyDataType: ... + @staticmethod + def bool() -> PyDataType: ... + @staticmethod + def int8() -> PyDataType: ... + @staticmethod + def int16() -> PyDataType: ... + @staticmethod + def int32() -> PyDataType: ... + @staticmethod + def int64() -> PyDataType: ... + @staticmethod + def uint8() -> PyDataType: ... + @staticmethod + def uint16() -> PyDataType: ... + @staticmethod + def uint32() -> PyDataType: ... + @staticmethod + def uint64() -> PyDataType: ... + @staticmethod + def float32() -> PyDataType: ... + @staticmethod + def float64() -> PyDataType: ... + @staticmethod + def binary() -> PyDataType: ... + @staticmethod + def fixed_size_binary(size: int) -> PyDataType: ... + @staticmethod + def string() -> PyDataType: ... + @staticmethod + def decimal128(precision: int, size: int) -> PyDataType: ... + @staticmethod + def date() -> PyDataType: ... + @staticmethod + def time(time_unit: PyTimeUnit) -> PyDataType: ... + @staticmethod + def timestamp(time_unit: PyTimeUnit, timezone: str | None = None) -> PyDataType: ... + @staticmethod + def duration(time_unit: PyTimeUnit) -> PyDataType: ... + @staticmethod + def list(data_type: PyDataType) -> PyDataType: ... + @staticmethod + def fixed_size_list(data_type: PyDataType, size: int) -> PyDataType: ... + @staticmethod + def map(key_type: PyDataType, value_type: PyDataType) -> PyDataType: ... 
+ @staticmethod + def struct(fields: dict[str, PyDataType]) -> PyDataType: ... + @staticmethod + def extension(name: str, storage_data_type: PyDataType, metadata: str | None = None) -> PyDataType: ... + @staticmethod + def embedding(data_type: PyDataType, size: int) -> PyDataType: ... + @staticmethod + def image( + mode: ImageMode | None = None, + height: int | None = None, + width: int | None = None, + ) -> PyDataType: ... + @staticmethod + def tensor(dtype: PyDataType, shape: tuple[int, ...] | None = None) -> PyDataType: ... + @staticmethod + def python() -> PyDataType: ... + def to_arrow(self, cast_tensor_type_for_ray: builtins.bool | None = None) -> pyarrow.DataType: ... + def is_numeric(self) -> builtins.bool: ... + def is_image(self) -> builtins.bool: ... + def is_fixed_shape_image(self) -> builtins.bool: ... + def is_list(self) -> builtins.bool: ... + def is_tensor(self) -> builtins.bool: ... + def is_fixed_shape_tensor(self) -> builtins.bool: ... + def is_map(self) -> builtins.bool: ... + def is_logical(self) -> builtins.bool: ... + def is_boolean(self) -> builtins.bool: ... + def is_string(self) -> builtins.bool: ... + def is_temporal(self) -> builtins.bool: ... + def is_equal(self, other: Any) -> builtins.bool: ... + @staticmethod + def from_json(serialized: str) -> PyDataType: ... + def __reduce__(self) -> tuple: ... + def __hash__(self) -> int: ... + +class PyField: + def name(self) -> str: ... + @staticmethod + def create(name: str, datatype: PyDataType) -> PyField: ... + def dtype(self) -> PyDataType: ... + def eq(self, other: PyField) -> bool: ... + def __reduce__(self) -> tuple: ... + +class PySchema: + def __getitem__(self, name: str) -> PyField: ... + def names(self) -> list[str]: ... + def union(self, other: PySchema) -> PySchema: ... + def eq(self, other: PySchema) -> bool: ... + def estimate_row_size_bytes(self) -> float: ... + @staticmethod + def from_field_name_and_types(names_and_types: list[tuple[str, PyDataType]]) -> PySchema: ... + @staticmethod + def from_fields(fields: list[PyField]) -> PySchema: ... + def to_pyarrow_schema(self) -> pa.Schema: ... + def __reduce__(self) -> tuple: ... + def __repr__(self) -> str: ... + def _repr_html_(self) -> str: ... + def _truncated_table_html(self) -> str: ... + def _truncated_table_string(self) -> str: ... + def apply_hints(self, hints: PySchema) -> PySchema: ... + +class PyExpr: + def alias(self, name: str) -> PyExpr: ... + def cast(self, dtype: PyDataType) -> PyExpr: ... + def ceil(self) -> PyExpr: ... + def floor(self) -> PyExpr: ... + def sign(self) -> PyExpr: ... + def round(self, decimal: int) -> PyExpr: ... + def sqrt(self) -> PyExpr: ... + def sin(self) -> PyExpr: ... + def cos(self) -> PyExpr: ... + def tan(self) -> PyExpr: ... + def cot(self) -> PyExpr: ... + def arcsin(self) -> PyExpr: ... + def arccos(self) -> PyExpr: ... + def arctan(self) -> PyExpr: ... + def arctan2(self, other: PyExpr) -> PyExpr: ... + def arctanh(self) -> PyExpr: ... + def arccosh(self) -> PyExpr: ... + def arcsinh(self) -> PyExpr: ... + def degrees(self) -> PyExpr: ... + def radians(self) -> PyExpr: ... + def log2(self) -> PyExpr: ... + def log10(self) -> PyExpr: ... + def log(self, base: float) -> PyExpr: ... + def ln(self) -> PyExpr: ... + def exp(self) -> PyExpr: ... + def if_else(self, if_true: PyExpr, if_false: PyExpr) -> PyExpr: ... + def count(self, mode: CountMode) -> PyExpr: ... + def sum(self) -> PyExpr: ... + def approx_count_distinct(self) -> PyExpr: ... 
+ def approx_percentiles(self, percentiles: float | list[float]) -> PyExpr: ... + def mean(self) -> PyExpr: ... + def min(self) -> PyExpr: ... + def max(self) -> PyExpr: ... + def any_value(self, ignore_nulls: bool) -> PyExpr: ... + def agg_list(self) -> PyExpr: ... + def agg_concat(self) -> PyExpr: ... + def explode(self) -> PyExpr: ... + def __abs__(self) -> PyExpr: ... + def __add__(self, other: PyExpr) -> PyExpr: ... + def __sub__(self, other: PyExpr) -> PyExpr: ... + def __mul__(self, other: PyExpr) -> PyExpr: ... + def __floordiv__(self, other: PyExpr) -> PyExpr: ... + def __truediv__(self, other: PyExpr) -> PyExpr: ... + def __mod__(self, other: PyExpr) -> PyExpr: ... + def __and__(self, other: PyExpr) -> PyExpr: ... + def __or__(self, other: PyExpr) -> PyExpr: ... + def __xor__(self, other: PyExpr) -> PyExpr: ... + def __invert__(self) -> PyExpr: ... + def __lt__(self, other: PyExpr) -> PyExpr: ... + def __le__(self, other: PyExpr) -> PyExpr: ... + def __gt__(self, other: PyExpr) -> PyExpr: ... + def __ge__(self, other: PyExpr) -> PyExpr: ... + def __eq__(self, other: PyExpr) -> PyExpr: ... # type: ignore[override] + def __ne__(self, other: PyExpr) -> PyExpr: ... # type: ignore[override] + def __rshift__(self, other: PyExpr) -> PyExpr: ... + def __lshift__(self, other: PyExpr) -> PyExpr: ... + def is_null(self) -> PyExpr: ... + def not_null(self) -> PyExpr: ... + def fill_null(self, fill_value: PyExpr) -> PyExpr: ... + def is_in(self, other: PyExpr) -> PyExpr: ... + def between(self, lower: PyExpr, upper: PyExpr) -> PyExpr: ... + def name(self) -> str: ... + def to_field(self, schema: PySchema) -> PyField: ... + def to_sql(self) -> str: ... + def __repr__(self) -> str: ... + def __hash__(self) -> int: ... + def __reduce__(self) -> tuple: ... + def is_nan(self) -> PyExpr: ... + def is_inf(self) -> PyExpr: ... + def not_nan(self) -> PyExpr: ... + def fill_nan(self, fill_value: PyExpr) -> PyExpr: ... + def dt_date(self) -> PyExpr: ... + def dt_day(self) -> PyExpr: ... + def dt_hour(self) -> PyExpr: ... + def dt_minute(self) -> PyExpr: ... + def dt_second(self) -> PyExpr: ... + def dt_time(self) -> PyExpr: ... + def dt_month(self) -> PyExpr: ... + def dt_year(self) -> PyExpr: ... + def dt_day_of_week(self) -> PyExpr: ... + def dt_truncate(self, interval: str, relative_to: PyExpr) -> PyExpr: ... + def utf8_endswith(self, pattern: PyExpr) -> PyExpr: ... + def utf8_startswith(self, pattern: PyExpr) -> PyExpr: ... + def utf8_contains(self, pattern: PyExpr) -> PyExpr: ... + def utf8_match(self, pattern: PyExpr) -> PyExpr: ... + def utf8_split(self, pattern: PyExpr, regex: bool) -> PyExpr: ... + def utf8_extract(self, pattern: PyExpr, index: int) -> PyExpr: ... + def utf8_extract_all(self, pattern: PyExpr, index: int) -> PyExpr: ... + def utf8_replace(self, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ... + def utf8_length(self) -> PyExpr: ... + def utf8_length_bytes(self) -> PyExpr: ... + def utf8_lower(self) -> PyExpr: ... + def utf8_upper(self) -> PyExpr: ... + def utf8_lstrip(self) -> PyExpr: ... + def utf8_rstrip(self) -> PyExpr: ... + def utf8_reverse(self) -> PyExpr: ... + def utf8_capitalize(self) -> PyExpr: ... + def utf8_left(self, nchars: PyExpr) -> PyExpr: ... + def utf8_right(self, nchars: PyExpr) -> PyExpr: ... + def utf8_find(self, substr: PyExpr) -> PyExpr: ... + def utf8_rpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ... + def utf8_lpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ... + def utf8_repeat(self, n: PyExpr) -> PyExpr: ... 
+ def utf8_like(self, pattern: PyExpr) -> PyExpr: ... + def utf8_ilike(self, pattern: PyExpr) -> PyExpr: ... + def utf8_substr(self, start: PyExpr, length: PyExpr) -> PyExpr: ... + def utf8_to_date(self, format: str) -> PyExpr: ... + def utf8_to_datetime(self, format: str, timezone: str | None = None) -> PyExpr: ... + def utf8_normalize(self, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool) -> PyExpr: ... + def list_join(self, delimiter: PyExpr) -> PyExpr: ... + def list_count(self, mode: CountMode) -> PyExpr: ... + def list_get(self, idx: PyExpr, default: PyExpr) -> PyExpr: ... + def list_sum(self) -> PyExpr: ... + def list_mean(self) -> PyExpr: ... + def list_min(self) -> PyExpr: ... + def list_max(self) -> PyExpr: ... + def list_slice(self, start: PyExpr, end: PyExpr | None = None) -> PyExpr: ... + def list_chunk(self, size: int) -> PyExpr: ... + def struct_get(self, name: str) -> PyExpr: ... + def map_get(self, key: PyExpr) -> PyExpr: ... + def partitioning_days(self) -> PyExpr: ... + def partitioning_hours(self) -> PyExpr: ... + def partitioning_months(self) -> PyExpr: ... + def partitioning_years(self) -> PyExpr: ... + def partitioning_iceberg_bucket(self, n: int) -> PyExpr: ... + def partitioning_iceberg_truncate(self, w: int) -> PyExpr: ... + def json_query(self, query: str) -> PyExpr: ... + + ### + # Helper methods required by optimizer: + # These should be removed from the Python API for Expressions when logical plans and optimizer are migrated to Rust + ### + def _input_mapping(self) -> builtins.str | None: ... + +def eq(expr1: PyExpr, expr2: PyExpr) -> bool: ... +def col(name: str) -> PyExpr: ... +def lit(item: Any) -> PyExpr: ... +def date_lit(item: int) -> PyExpr: ... +def time_lit(item: int, tu: PyTimeUnit) -> PyExpr: ... +def timestamp_lit(item: int, tu: PyTimeUnit, tz: str | None) -> PyExpr: ... +def decimal_lit(sign: bool, digits: tuple[int, ...], exp: int) -> PyExpr: ... +def series_lit(item: PySeries) -> PyExpr: ... +def stateless_udf( + name: str, + partial_stateless_udf: PartialStatelessUDF, + expressions: list[PyExpr], + return_dtype: PyDataType, + resource_request: ResourceRequest | None, + batch_size: int | None, +) -> PyExpr: ... +def stateful_udf( + name: str, + partial_stateful_udf: PartialStatefulUDF, + expressions: list[PyExpr], + return_dtype: PyDataType, + resource_request: ResourceRequest | None, + init_args: tuple[tuple[Any, ...], dict[str, Any]] | None, + batch_size: int | None, + concurrency: int | None, +) -> PyExpr: ... +def check_column_name_validity(name: str, schema: PySchema): ... +def extract_partial_stateful_udf_py(expression: PyExpr) -> dict[str, PartialStatefulUDF]: ... +def bind_stateful_udfs(expression: PyExpr, initialized_funcs: dict[str, Callable]) -> PyExpr: ... +def resolve_expr(expr: PyExpr, schema: PySchema) -> tuple[PyExpr, PyField]: ... +def hash(expr: PyExpr, seed: Any | None = None) -> PyExpr: ... +def cosine_distance(expr: PyExpr, other: PyExpr) -> PyExpr: ... +def url_download( + expr: PyExpr, + max_connections: int, + raise_error_on_failure: bool, + multi_thread: bool, + config: IOConfig, +) -> PyExpr: ... +def url_upload( + expr: PyExpr, + folder_location: str, + max_connections: int, + multi_thread: bool, + io_config: IOConfig | None, +) -> PyExpr: ... +def tokenize_encode( + expr: PyExpr, + tokens_path: str, + use_special_tokens: bool, + io_config: IOConfig | None, + pattern: str | None, + special_tokens: str | None, +) -> PyExpr: ... 
+def tokenize_decode( + expr: PyExpr, + tokens_path: str, + io_config: IOConfig | None, + pattern: str | None, + special_tokens: str | None, +) -> PyExpr: ... +def minhash( + expr: PyExpr, + num_hashes: int, + ngram_size: int, + seed: int = 1, +) -> PyExpr: ... +def sql(sql: str, catalog: PyCatalog, daft_planning_config: PyDaftPlanningConfig) -> LogicalPlanBuilder: ... +def sql_expr(sql: str) -> PyExpr: ... +def utf8_count_matches(expr: PyExpr, patterns: PyExpr, whole_words: bool, case_sensitive: bool) -> PyExpr: ... +def list_sort(expr: PyExpr, desc: PyExpr) -> PyExpr: ... +def cbrt(expr: PyExpr) -> PyExpr: ... +def to_struct(inputs: list[PyExpr]) -> PyExpr: ... + +# --- +# expr.image namespace +# --- +def image_crop(expr: PyExpr, bbox: PyExpr) -> PyExpr: ... +def image_decode(expr: PyExpr, raise_on_error: bool, mode: ImageMode | None = None) -> PyExpr: ... +def image_encode(expr: PyExpr, image_format: ImageFormat) -> PyExpr: ... +def image_resize(expr: PyExpr, w: int, h: int) -> PyExpr: ... +def image_to_mode(expr: PyExpr, mode: ImageMode) -> PyExpr: ... + +class PyCatalog: + @staticmethod + def new() -> PyCatalog: ... + def register_table(self, name: str, logical_plan_builder: LogicalPlanBuilder) -> None: ... + +class PySeries: + @staticmethod + def from_arrow(name: str, pyarrow_array: pyarrow.Array) -> PySeries: ... + @staticmethod + def from_pylist(name: str, pylist: list[Any], pyobj: str) -> PySeries: ... + def to_pylist(self) -> list[Any]: ... + def to_arrow(self) -> pyarrow.Array: ... + def __abs__(self) -> PySeries: ... + def __add__(self, other: PySeries) -> PySeries: ... + def __sub__(self, other: PySeries) -> PySeries: ... + def __mul__(self, other: PySeries) -> PySeries: ... + def __truediv__(self, other: PySeries) -> PySeries: ... + def __mod__(self, other: PySeries) -> PySeries: ... + def __and__(self, other: PySeries) -> PySeries: ... + def __or__(self, other: PySeries) -> PySeries: ... + def __xor__(self, other: PySeries) -> PySeries: ... + def __lt__(self, other: PySeries) -> PySeries: ... + def __le__(self, other: PySeries) -> PySeries: ... + def __gt__(self, other: PySeries) -> PySeries: ... + def __ge__(self, other: PySeries) -> PySeries: ... + def __eq__(self, other: PySeries) -> PySeries: ... # type: ignore[override] + def __ne__(self, other: PySeries) -> PySeries: ... # type: ignore[override] + def __rshift__(self, other: PySeries) -> PySeries: ... + def __lshift__(self, other: PySeries) -> PySeries: ... + def take(self, idx: PySeries) -> PySeries: ... + def slice(self, start: int, end: int) -> PySeries: ... + def filter(self, mask: PySeries) -> PySeries: ... + def sort(self, descending: bool) -> PySeries: ... + def argsort(self, descending: bool) -> PySeries: ... + def hash(self, seed: PySeries | None = None) -> PySeries: ... + def minhash(self, num_hashes: int, ngram_size: int, seed: int = 1) -> PySeries: ... + def __invert__(self) -> PySeries: ... + def count(self, mode: CountMode) -> PySeries: ... + def sum(self) -> PySeries: ... + def mean(self) -> PySeries: ... + def min(self) -> PySeries: ... + def max(self) -> PySeries: ... + def agg_list(self) -> PySeries: ... + def cast(self, dtype: PyDataType) -> PySeries: ... + def ceil(self) -> PySeries: ... + def floor(self) -> PySeries: ... + def sign(self) -> PySeries: ... + def round(self, decimal: int) -> PySeries: ... + def sqrt(self) -> PySeries: ... + def cbrt(self) -> PySeries: ... + def sin(self) -> PySeries: ... + def cos(self) -> PySeries: ... + def tan(self) -> PySeries: ... + def cot(self) -> PySeries: ... 
+ def arcsin(self) -> PySeries: ... + def arccos(self) -> PySeries: ... + def arctan(self) -> PySeries: ... + def arctan2(self, other: PySeries) -> PySeries: ... + def arctanh(self) -> PySeries: ... + def arccosh(self) -> PySeries: ... + def arcsinh(self) -> PySeries: ... + def degrees(self) -> PySeries: ... + def radians(self) -> PySeries: ... + def log2(self) -> PySeries: ... + def log10(self) -> PySeries: ... + def log(self, base: float) -> PySeries: ... + def ln(self) -> PySeries: ... + def exp(self) -> PySeries: ... + @staticmethod + def concat(series: list[PySeries]) -> PySeries: ... + def __len__(self) -> int: ... + def size_bytes(self) -> int: ... + def name(self) -> str: ... + def rename(self, name: str) -> PySeries: ... + def data_type(self) -> PyDataType: ... + def utf8_endswith(self, pattern: PySeries) -> PySeries: ... + def utf8_startswith(self, pattern: PySeries) -> PySeries: ... + def utf8_contains(self, pattern: PySeries) -> PySeries: ... + def utf8_match(self, pattern: PySeries) -> PySeries: ... + def utf8_split(self, pattern: PySeries, regex: bool) -> PySeries: ... + def utf8_extract(self, pattern: PySeries, index: int) -> PySeries: ... + def utf8_extract_all(self, pattern: PySeries, index: int) -> PySeries: ... + def utf8_replace(self, pattern: PySeries, replacement: PySeries, regex: bool) -> PySeries: ... + def utf8_length(self) -> PySeries: ... + def utf8_length_bytes(self) -> PySeries: ... + def utf8_lower(self) -> PySeries: ... + def utf8_upper(self) -> PySeries: ... + def utf8_lstrip(self) -> PySeries: ... + def utf8_rstrip(self) -> PySeries: ... + def utf8_reverse(self) -> PySeries: ... + def utf8_capitalize(self) -> PySeries: ... + def utf8_left(self, nchars: PySeries) -> PySeries: ... + def utf8_right(self, nchars: PySeries) -> PySeries: ... + def utf8_find(self, substr: PySeries) -> PySeries: ... + def utf8_rpad(self, length: PySeries, pad: PySeries) -> PySeries: ... + def utf8_lpad(self, length: PySeries, pad: PySeries) -> PySeries: ... + def utf8_repeat(self, n: PySeries) -> PySeries: ... + def utf8_like(self, pattern: PySeries) -> PySeries: ... + def utf8_ilike(self, pattern: PySeries) -> PySeries: ... + def utf8_substr(self, start: PySeries, length: PySeries | None = None) -> PySeries: ... + def utf8_to_date(self, format: str) -> PySeries: ... + def utf8_to_datetime(self, format: str, timezone: str | None = None) -> PySeries: ... + def utf8_normalize(self, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool) -> PySeries: ... + def utf8_count_matches(self, patterns: PySeries, whole_word: bool, case_sensitive: bool) -> PySeries: ... + def is_nan(self) -> PySeries: ... + def is_inf(self) -> PySeries: ... + def not_nan(self) -> PySeries: ... + def fill_nan(self, fill_value: PySeries) -> PySeries: ... + def dt_date(self) -> PySeries: ... + def dt_day(self) -> PySeries: ... + def dt_hour(self) -> PySeries: ... + def dt_minute(self) -> PySeries: ... + def dt_second(self) -> PySeries: ... + def dt_time(self) -> PySeries: ... + def dt_month(self) -> PySeries: ... + def dt_year(self) -> PySeries: ... + def dt_day_of_week(self) -> PySeries: ... + def dt_truncate(self, interval: str, relative_to: PySeries) -> PySeries: ... + def partitioning_days(self) -> PySeries: ... + def partitioning_hours(self) -> PySeries: ... + def partitioning_months(self) -> PySeries: ... + def partitioning_years(self) -> PySeries: ... + def partitioning_iceberg_bucket(self, n: int) -> PySeries: ... + def partitioning_iceberg_truncate(self, w: int) -> PySeries: ... 
+ def list_count(self, mode: CountMode) -> PySeries: ... + def list_get(self, idx: PySeries, default: PySeries) -> PySeries: ... + def list_slice(self, start: PySeries, end: PySeries | None = None) -> PySeries: ... + def list_sort(self, desc: PySeries) -> PySeries: ... + def map_get(self, key: PySeries) -> PySeries: ... + def if_else(self, other: PySeries, predicate: PySeries) -> PySeries: ... + def is_null(self) -> PySeries: ... + def not_null(self) -> PySeries: ... + def fill_null(self, fill_value: PySeries) -> PySeries: ... + def murmur3_32(self) -> PySeries: ... + def to_str_values(self) -> PySeries: ... + def _debug_bincode_serialize(self) -> bytes: ... + @staticmethod + def _debug_bincode_deserialize(b: bytes) -> PySeries: ... + +class PyTable: + def schema(self) -> PySchema: ... + def cast_to_schema(self, schema: PySchema) -> PyTable: ... + def eval_expression_list(self, exprs: list[PyExpr]) -> PyTable: ... + def take(self, idx: PySeries) -> PyTable: ... + def filter(self, exprs: list[PyExpr]) -> PyTable: ... + def sort(self, sort_keys: list[PyExpr], descending: list[bool]) -> PyTable: ... + def argsort(self, sort_keys: list[PyExpr], descending: list[bool]) -> PySeries: ... + def agg(self, to_agg: list[PyExpr], group_by: list[PyExpr]) -> PyTable: ... + def pivot( + self, + group_by: list[PyExpr], + pivot_column: PyExpr, + values_column: PyExpr, + names: list[str], + ) -> PyTable: ... + def hash_join( + self, + right: PyTable, + left_on: list[PyExpr], + right_on: list[PyExpr], + how: JoinType, + ) -> PyTable: ... + def sort_merge_join( + self, + right: PyTable, + left_on: list[PyExpr], + right_on: list[PyExpr], + is_sorted: bool, + ) -> PyTable: ... + def explode(self, to_explode: list[PyExpr]) -> PyTable: ... + def head(self, num: int) -> PyTable: ... + def sample_by_fraction(self, fraction: float, with_replacement: bool, seed: int | None) -> PyTable: ... + def sample_by_size(self, size: int, with_replacement: bool, seed: int | None) -> PyTable: ... + def quantiles(self, num: int) -> PyTable: ... + def partition_by_hash(self, exprs: list[PyExpr], num_partitions: int) -> list[PyTable]: ... + def partition_by_random(self, num_partitions: int, seed: int) -> list[PyTable]: ... + def partition_by_range( + self, partition_keys: list[PyExpr], boundaries: PyTable, descending: list[bool] + ) -> list[PyTable]: ... + def partition_by_value(self, partition_keys: list[PyExpr]) -> tuple[list[PyTable], PyTable]: ... + def add_monotonically_increasing_id(self, partition_num: int, column_name: str) -> PyTable: ... + def __repr__(self) -> str: ... + def _repr_html_(self) -> str: ... + def __len__(self) -> int: ... + def size_bytes(self) -> int: ... + def column_names(self) -> list[str]: ... + def get_column(self, name: str) -> PySeries: ... + def get_column_by_index(self, idx: int) -> PySeries: ... + @staticmethod + def concat(tables: list[PyTable]) -> PyTable: ... + def slice(self, start: int, end: int) -> PyTable: ... + @staticmethod + def from_arrow_record_batches(record_batches: list[pyarrow.RecordBatch], schema: PySchema) -> PyTable: ... + @staticmethod + def from_pylist_series(dict: dict[str, PySeries]) -> PyTable: ... + def to_arrow_record_batch(self) -> pyarrow.RecordBatch: ... + @staticmethod + def empty(schema: PySchema | None = None) -> PyTable: ... + +class PyMicroPartition: + def schema(self) -> PySchema: ... + def column_names(self) -> list[str]: ... + def get_column(self, name: str) -> PySeries: ... + def size_bytes(self) -> int | None: ... + def _repr_html_(self) -> str: ... 
+ @staticmethod + def empty(schema: PySchema | None = None) -> PyMicroPartition: ... + @staticmethod + def from_scan_task(scan_task: ScanTask) -> PyMicroPartition: ... + @staticmethod + def from_tables(tables: list[PyTable]) -> PyMicroPartition: ... + @staticmethod + def from_arrow_record_batches(record_batches: list[pyarrow.RecordBatch], schema: PySchema) -> PyMicroPartition: ... + @staticmethod + def concat(tables: list[PyMicroPartition]) -> PyMicroPartition: ... + def slice(self, start: int, end: int) -> PyMicroPartition: ... + def to_table(self) -> PyTable: ... + def cast_to_schema(self, schema: PySchema) -> PyMicroPartition: ... + def eval_expression_list(self, exprs: list[PyExpr]) -> PyMicroPartition: ... + def take(self, idx: PySeries) -> PyMicroPartition: ... + def filter(self, exprs: list[PyExpr]) -> PyMicroPartition: ... + def sort(self, sort_keys: list[PyExpr], descending: list[bool]) -> PyMicroPartition: ... + def argsort(self, sort_keys: list[PyExpr], descending: list[bool]) -> PySeries: ... + def agg(self, to_agg: list[PyExpr], group_by: list[PyExpr]) -> PyMicroPartition: ... + def hash_join( + self, + right: PyMicroPartition, + left_on: list[PyExpr], + right_on: list[PyExpr], + how: JoinType, + ) -> PyMicroPartition: ... + def pivot( + self, + group_by: list[PyExpr], + pivot_column: PyExpr, + values_column: PyExpr, + names: list[str], + ) -> PyMicroPartition: ... + def sort_merge_join( + self, + right: PyMicroPartition, + left_on: list[PyExpr], + right_on: list[PyExpr], + is_sorted: bool, + ) -> PyMicroPartition: ... + def explode(self, to_explode: list[PyExpr]) -> PyMicroPartition: ... + def unpivot( + self, + ids: list[PyExpr], + values: list[PyExpr], + variable_name: str, + value_name: str, + ) -> PyMicroPartition: ... + def head(self, num: int) -> PyMicroPartition: ... + def sample_by_fraction(self, fraction: float, with_replacement: bool, seed: int | None) -> PyMicroPartition: ... + def sample_by_size(self, size: int, with_replacement: bool, seed: int | None) -> PyMicroPartition: ... + def quantiles(self, num: int) -> PyMicroPartition: ... + def partition_by_hash(self, exprs: list[PyExpr], num_partitions: int) -> list[PyMicroPartition]: ... + def partition_by_random(self, num_partitions: int, seed: int) -> list[PyMicroPartition]: ... + def partition_by_range( + self, partition_keys: list[PyExpr], boundaries: PyTable, descending: list[bool] + ) -> list[PyMicroPartition]: ... + def partition_by_value(self, exprs: list[PyExpr]) -> tuple[list[PyMicroPartition], PyMicroPartition]: ... + def add_monotonically_increasing_id(self, partition_num: int, column_name: str) -> PyMicroPartition: ... + def __repr__(self) -> str: ... + def __len__(self) -> int: ... + @classmethod + def read_parquet( + cls, + path: str, + columns: list[str] | None = None, + start_offset: int | None = None, + num_rows: int | None = None, + row_groups: list[int] | None = None, + predicate: PyExpr | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + coerce_int96_timestamp_unit: PyTimeUnit = PyTimeUnit.nanoseconds(), + ): ... + @classmethod + def read_parquet_bulk( + cls, + uris: list[str], + columns: list[str] | None = None, + start_offset: int | None = None, + num_rows: int | None = None, + row_groups: list[list[int] | None] | None = None, + predicate: PyExpr | None = None, + io_config: IOConfig | None = None, + num_parallel_tasks: int | None = None, + multithreaded_io: bool | None = None, + coerce_int96_timestamp_unit: PyTimeUnit | None = None, + ): ... 
+ @classmethod + def read_csv( + cls, + uri: str, + convert_options: CsvConvertOptions | None = None, + parse_options: CsvParseOptions | None = None, + read_options: CsvReadOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + ): ... + @classmethod + def read_json_native( + cls, + uri: str, + convert_options: JsonConvertOptions | None = None, + parse_options: JsonParseOptions | None = None, + read_options: JsonReadOptions | None = None, + io_config: IOConfig | None = None, + multithreaded_io: bool | None = None, + ): ... + +class PhysicalPlanScheduler: + """ + A work scheduler for physical query plans. + """ + + @staticmethod + def from_logical_plan_builder( + logical_plan_builder: LogicalPlanBuilder, + cfg: PyDaftExecutionConfig, + ) -> PhysicalPlanScheduler: ... + def num_partitions(self) -> int: ... + def repr_ascii(self, simple: bool) -> str: ... + def repr_mermaid(self, options: MermaidOptions) -> str: ... + def to_partition_tasks(self, psets: dict[str, list[PartitionT]]) -> physical_plan.InProgressPhysicalPlan: ... + def run(self, psets: dict[str, list[PartitionT]]) -> Iterator[PyMicroPartition]: ... + +class AdaptivePhysicalPlanScheduler: + """ + An adaptive Physical Plan Scheduler. + """ + + @staticmethod + def from_logical_plan_builder( + logical_plan_builder: LogicalPlanBuilder, + cfg: PyDaftExecutionConfig, + ) -> AdaptivePhysicalPlanScheduler: ... + def next(self) -> tuple[int | None, PhysicalPlanScheduler]: ... + def is_done(self) -> bool: ... + # Todo use in memory info here instead + def update( + self, + source_id: int, + partition_key: str, + cache_entry: PartitionCacheEntry, + num_partitions: int, + size_bytes: int, + num_rows: int, + ) -> None: ... + +class LogicalPlanBuilder: + """ + A logical plan builder, which simplifies constructing logical plans via + a fluent interface. E.g., LogicalPlanBuilder.table_scan(..).project(..).filter(..). + + This builder holds the current root (sink) of the logical plan, and the building methods return + a brand new builder holding a new plan; i.e., this is an immutable builder. + """ + + @staticmethod + def in_memory_scan( + partition_key: str, + cache_entry: PartitionCacheEntry, + schema: PySchema, + num_partitions: int, + size_bytes: int, + num_rows: int, + ) -> LogicalPlanBuilder: ... + @staticmethod + def table_scan(scan_operator: ScanOperatorHandle) -> LogicalPlanBuilder: ... + def with_planning_config(self, daft_planning_config: PyDaftPlanningConfig) -> LogicalPlanBuilder: ... + def select(self, to_select: list[PyExpr]) -> LogicalPlanBuilder: ... + def with_columns(self, columns: list[PyExpr]) -> LogicalPlanBuilder: ... + def exclude(self, to_exclude: list[str]) -> LogicalPlanBuilder: ... + def filter(self, predicate: PyExpr) -> LogicalPlanBuilder: ... + def limit(self, limit: int, eager: bool) -> LogicalPlanBuilder: ... + def explode(self, to_explode: list[PyExpr]) -> LogicalPlanBuilder: ... + def unpivot( + self, + ids: list[PyExpr], + values: list[PyExpr], + variable_name: str, + value_name: str, + ) -> LogicalPlanBuilder: ... + def sort(self, sort_by: list[PyExpr], descending: list[bool]) -> LogicalPlanBuilder: ... + def hash_repartition( + self, + partition_by: list[PyExpr], + num_partitions: int | None, + ) -> LogicalPlanBuilder: ... + def random_shuffle(self, num_partitions: int | None) -> LogicalPlanBuilder: ... + def into_partitions(self, num_partitions: int) -> LogicalPlanBuilder: ... + def coalesce(self, num_partitions: int) -> LogicalPlanBuilder: ... 
+ def distinct(self) -> LogicalPlanBuilder: ... + def sample(self, fraction: float, with_replacement: bool, seed: int | None) -> LogicalPlanBuilder: ... + def aggregate(self, agg_exprs: list[PyExpr], groupby_exprs: list[PyExpr]) -> LogicalPlanBuilder: ... + def pivot( + self, + groupby_exprs: list[PyExpr], + pivot_expr: PyExpr, + values_expr: PyExpr, + agg_expr: PyExpr, + names: list[str], + ) -> LogicalPlanBuilder: ... + def join( + self, + right: LogicalPlanBuilder, + left_on: list[PyExpr], + right_on: list[PyExpr], + join_type: JoinType, + strategy: JoinStrategy | None = None, + ) -> LogicalPlanBuilder: ... + def concat(self, other: LogicalPlanBuilder) -> LogicalPlanBuilder: ... + def add_monotonically_increasing_id(self, column_name: str | None) -> LogicalPlanBuilder: ... + def table_write( + self, + root_dir: str, + file_format: FileFormat, + partition_cols: list[PyExpr] | None = None, + compression: str | None = None, + io_config: IOConfig | None = None, + ) -> LogicalPlanBuilder: ... + def iceberg_write( + self, + table_name: str, + table_location: str, + spec_id: int, + iceberg_schema: IcebergSchema, + iceberg_properties: IcebergTableProperties, + catalog_columns: list[str], + io_config: IOConfig | None = None, + ) -> LogicalPlanBuilder: ... + def delta_write( + self, + path: str, + columns_name: list[str], + mode: str, + version: int, + large_dtypes: bool, + io_config: IOConfig | None = None, + ) -> LogicalPlanBuilder: ... + def lance_write( + self, + path: str, + columns_name: list[str], + mode: str, + io_config: IOConfig | None = None, + kwargs: dict[str, Any] | None = None, + ) -> LogicalPlanBuilder: ... + def schema(self) -> PySchema: ... + def optimize(self) -> LogicalPlanBuilder: ... + def to_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> PhysicalPlanScheduler: ... + def to_adaptive_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> AdaptivePhysicalPlanScheduler: ... + def repr_ascii(self, simple: bool) -> str: ... + def repr_mermaid(self, options: MermaidOptions) -> str: ... + +class NativeExecutor: + @staticmethod + def from_logical_plan_builder( + logical_plan_builder: LogicalPlanBuilder, + ) -> NativeExecutor: ... + def run( + self, psets: dict[str, list[PartitionT]], cfg: PyDaftExecutionConfig, results_buffer_size: int | None + ) -> Iterator[PyMicroPartition]: ... + +class PyDaftExecutionConfig: + @staticmethod + def from_env() -> PyDaftExecutionConfig: ... + def with_config_values( + self, + scan_tasks_min_size_bytes: int | None = None, + scan_tasks_max_size_bytes: int | None = None, + broadcast_join_size_bytes_threshold: int | None = None, + parquet_split_row_groups_max_files: int | None = None, + sort_merge_join_sort_with_aligned_boundaries: bool | None = None, + hash_join_partition_size_leniency: float | None = None, + sample_size_for_sort: int | None = None, + num_preview_rows: int | None = None, + parquet_target_filesize: int | None = None, + parquet_target_row_group_size: int | None = None, + parquet_inflation_factor: float | None = None, + csv_target_filesize: int | None = None, + csv_inflation_factor: float | None = None, + shuffle_aggregation_default_partitions: int | None = None, + read_sql_partition_size_bytes: int | None = None, + enable_aqe: bool | None = None, + enable_native_executor: bool | None = None, + default_morsel_size: int | None = None, + ) -> PyDaftExecutionConfig: ... + @property + def scan_tasks_min_size_bytes(self) -> int: ... + @property + def scan_tasks_max_size_bytes(self) -> int: ... 
+ @property + def broadcast_join_size_bytes_threshold(self) -> int: ... + @property + def sort_merge_join_sort_with_aligned_boundaries(self) -> bool: ... + @property + def hash_join_partition_size_leniency(self) -> float: ... + @property + def sample_size_for_sort(self) -> int: ... + @property + def num_preview_rows(self) -> int: ... + @property + def parquet_target_filesize(self) -> int: ... + @property + def parquet_target_row_group_size(self) -> int: ... + @property + def parquet_inflation_factor(self) -> float: ... + @property + def csv_target_filesize(self) -> int: ... + @property + def csv_inflation_factor(self) -> float: ... + @property + def shuffle_aggregation_default_partitions(self) -> int: ... + @property + def read_sql_partition_size_bytes(self) -> int: ... + @property + def enable_aqe(self) -> bool: ... + @property + def enable_native_executor(self) -> bool: ... + @property + def default_morsel_size(self) -> int: ... + +class PyDaftPlanningConfig: + @staticmethod + def from_env() -> PyDaftPlanningConfig: ... + def with_config_values( + self, + default_io_config: IOConfig | None = None, + enable_actor_pool_projections: bool | None = None, + ) -> PyDaftPlanningConfig: ... + @property + def default_io_config(self) -> IOConfig: ... + @property + def enable_actor_pool_projections(self) -> bool: ... + +def build_type() -> str: ... +def version() -> str: ... +def refresh_logger() -> None: ... +def get_max_log_level() -> str: ... +def __getattr__(name) -> Any: ... +def io_glob( + path: str, + multithreaded_io: bool | None = None, + io_config: IOConfig | None = None, + fanout_limit: int | None = None, + page_size: int | None = None, + limit: int | None = None, +) -> list[dict]: ... + +class SystemInfo: + """ + Accessor for system information. + """ + + def __init__(self) -> None: ... + def total_memory(self) -> int: ... + def cpu_count(self) -> int | None: ... 
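A quick sketch of how the I/O-config stubs above compose in practice; `replace` returns a new object rather than mutating, and fields that are not passed keep their prior values (the project id below is an illustrative placeholder, not part of this patch):

    from daft.daft import GCSConfig, IOConfig

    # Build a base GCS config; "my-project" is a hypothetical value.
    gcs = GCSConfig(project_id="my-project", anonymous=False)
    # replace() only overrides what is passed; project_id carries over.
    gcs_anon = gcs.replace(anonymous=True)
    # IOConfig bundles the per-backend configs; unset backends use defaults.
    io_cfg = IOConfig(gcs=gcs_anon)
    assert io_cfg.gcs.anonymous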
diff --git a/src/daft-core/Cargo.toml b/src/daft-core/Cargo.toml
index 3e448aea4d..af331c7187 100644
--- a/src/daft-core/Cargo.toml
+++ b/src/daft-core/Cargo.toml
@@ -50,11 +50,6 @@ serde_json = {workspace = true}
 sketches-ddsketch = {workspace = true}
 unicode-normalization = "0.1.23"
 
-[dependencies.image]
-default-features = false
-features = ["gif", "jpeg", "ico", "png", "tiff", "webp", "bmp", "hdr"]
-version = "0.24.7"
-
 [dependencies.numpy]
 optional = true
 version = "0.19"
diff --git a/src/daft-core/src/array/mod.rs b/src/daft-core/src/array/mod.rs
index 0eb40912e7..76726e5f20 100644
--- a/src/daft-core/src/array/mod.rs
+++ b/src/daft-core/src/array/mod.rs
@@ -1,8 +1,8 @@
 mod fixed_size_list_array;
 pub mod from;
 pub mod growable;
-pub mod iterator;
 pub mod image_array;
+pub mod iterator;
 mod list_array;
 pub mod ops;
 pub mod pseudo_arrow;
diff --git a/src/daft-core/src/array/ops/repr.rs b/src/daft-core/src/array/ops/repr.rs
index db8d4e1a3e..03ede0b5c9 100644
--- a/src/daft-core/src/array/ops/repr.rs
+++ b/src/daft-core/src/array/ops/repr.rs
@@ -7,7 +7,7 @@ use crate::{
             FixedShapeTensorArray, ImageArray, MapArray, TensorArray, TimeArray, TimestampArray,
         },
         BinaryArray, BooleanArray, DaftNumericType, ExtensionArray, FixedSizeBinaryArray,
-        ImageFormat, NullArray, UInt64Array, Utf8Array,
+        NullArray, UInt64Array, Utf8Array,
     },
     series::Series,
     utils::display_table::{display_date32, display_decimal128, display_time64, display_timestamp},
@@ -414,35 +414,6 @@ where
     }
 }
 
-// impl ImageArray {
-//     pub fn html_value(&self, idx: usize) -> String {
-
-//     }
-// }
-
-// impl FixedShapeImageArray {
-//     pub fn html_value(&self, idx: usize) -> String {
-//         let maybe_image = self.as_image_obj(idx);
-//         let str_val = self.str_value(idx).unwrap();
-
-//         match maybe_image {
-//             None => "None".to_string(),
-//             Some(image) => {
-//                 let thumb = image.fit_to(128, 128);
-//                 let mut bytes: Vec<u8> = vec![];
-//                 let mut writer = std::io::BufWriter::new(std::io::Cursor::new(&mut bytes));
-//                 thumb.encode(ImageFormat::JPEG, &mut writer).unwrap();
-//                 drop(writer);
-//                 format!(
-//                     "<img style=\"max-height:128px;width:auto\" src=\"data:image/png;base64, {}\" alt=\"{}\" />",
-//                     base64::engine::general_purpose::STANDARD.encode(&mut bytes),
-//                     str_val,
-//                 )
-//             }
-//         }
-//     }
-// }
-
 impl FixedShapeTensorArray {
     pub fn html_value(&self, idx: usize) -> String {
         let str_value = self.str_value(idx).unwrap();
diff --git a/src/daft-core/src/datatypes/image_format.rs b/src/daft-core/src/datatypes/image_format.rs
index c16b4b1043..1a622d25a1 100644
--- a/src/daft-core/src/datatypes/image_format.rs
+++ b/src/daft-core/src/datatypes/image_format.rs
@@ -68,31 +68,6 @@ impl FromStr for ImageFormat {
     }
 }
 
-impl From<image::ImageFormat> for ImageFormat {
-    fn from(image_format: image::ImageFormat) -> Self {
-        match image_format {
-            image::ImageFormat::Png => ImageFormat::PNG,
-            image::ImageFormat::Jpeg => ImageFormat::JPEG,
-            image::ImageFormat::Tiff => ImageFormat::TIFF,
-            image::ImageFormat::Gif => ImageFormat::GIF,
-            image::ImageFormat::Bmp => ImageFormat::BMP,
-            _ => unimplemented!("Image format {:?} is not supported", image_format),
-        }
-    }
-}
-
-impl From<ImageFormat> for image::ImageFormat {
-    fn from(image_format: ImageFormat) -> Self {
-        match image_format {
-            ImageFormat::PNG => image::ImageFormat::Png,
-            ImageFormat::JPEG => image::ImageFormat::Jpeg,
-            ImageFormat::TIFF => image::ImageFormat::Tiff,
-            ImageFormat::GIF => image::ImageFormat::Gif,
-            ImageFormat::BMP => image::ImageFormat::Bmp,
-        }
-    }
-}
-
 impl Display for ImageFormat {
     fn fmt(&self, f: &mut Formatter) -> Result {
         // Leverage Debug trait implementation, which will already return the enum variant as a string.
diff --git a/src/daft-core/src/python/series.rs b/src/daft-core/src/python/series.rs
index 562ff97321..4b394c84f2 100644
--- a/src/daft-core/src/python/series.rs
+++ b/src/daft-core/src/python/series.rs
@@ -14,7 +14,7 @@ use crate::{
         DataArray,
     },
     count_mode::CountMode,
-    datatypes::{DataType, Field, ImageFormat, ImageMode, PythonType},
+    datatypes::{DataType, Field, ImageMode, PythonType},
    ffi,
     series::{self, IntoSeries, Series},
     utils::arrow::{cast_array_for_daft_if_needed, cast_array_from_daft_if_needed},
diff --git a/src/daft-core/src/series/array_impl/data_array.rs b/src/daft-core/src/series/array_impl/data_array.rs
index b998e4f98c..3fdaf527fc 100644
--- a/src/daft-core/src/series/array_impl/data_array.rs
+++ b/src/daft-core/src/series/array_impl/data_array.rs
@@ -124,9 +124,7 @@ macro_rules! impl_series_like_for_data_array {
         fn str_value(&self, idx: usize) -> DaftResult<String> {
             self.0.str_value(idx)
         }
-        // fn html_value(&self, idx: usize) -> String {
-        //     self.0.html_value(idx)
-        // }
+
         fn take(&self, idx: &Series) -> DaftResult<Series> {
             with_match_integer_daft_types!(idx.data_type(), |$S| {
                 Ok(self
diff --git a/src/daft-core/src/series/array_impl/logical_array.rs b/src/daft-core/src/series/array_impl/logical_array.rs
index ea72e9982f..9b5ca9e5f6 100644
--- a/src/daft-core/src/series/array_impl/logical_array.rs
+++ b/src/daft-core/src/series/array_impl/logical_array.rs
@@ -125,10 +125,6 @@ macro_rules! impl_series_like_for_logical_array {
             self.0.str_value(idx)
         }
 
-        // fn html_value(&self, idx: usize) -> String {
-        //     self.0.html_value(idx)
-        // }
-
         fn take(&self, idx: &Series) -> DaftResult<Series> {
             with_match_integer_daft_types!(idx.data_type(), |$S| {
                 Ok(self
diff --git a/src/daft-core/src/series/array_impl/nested_array.rs b/src/daft-core/src/series/array_impl/nested_array.rs
index 8092d4570d..9ec9939d11 100644
--- a/src/daft-core/src/series/array_impl/nested_array.rs
+++ b/src/daft-core/src/series/array_impl/nested_array.rs
@@ -148,10 +148,6 @@ macro_rules! impl_series_like_for_nested_arrays {
             self.0.str_value(idx)
         }
 
-        // fn html_value(&self, idx: usize) -> String {
-        //     self.0.html_value(idx)
-        // }
-
         fn add(&self, rhs: &Series) -> DaftResult<Series> {
             SeriesBinaryOps::add(self, rhs)
         }
diff --git a/src/daft-core/src/series/ops/downcast.rs b/src/daft-core/src/series/ops/downcast.rs
index 8c509113c5..8c85dbef39 100644
--- a/src/daft-core/src/series/ops/downcast.rs
+++ b/src/daft-core/src/series/ops/downcast.rs
@@ -6,6 +6,7 @@ use crate::datatypes::*;
 use crate::series::array_impl::ArrayWrapper;
 use crate::series::Series;
 use common_error::DaftResult;
+use logical::{EmbeddingArray, FixedShapeTensorArray, TensorArray};
 
 use self::logical::{DurationArray, ImageArray, MapArray};
 
@@ -139,4 +140,16 @@ impl Series {
     pub fn python(&self) -> DaftResult<&PythonArray> {
         self.downcast()
     }
+
+    pub fn embedding(&self) -> DaftResult<&EmbeddingArray> {
+        self.downcast()
+    }
+
+    pub fn tensor(&self) -> DaftResult<&TensorArray> {
+        self.downcast()
+    }
+
+    pub fn fixed_shape_tensor(&self) -> DaftResult<&FixedShapeTensorArray> {
+        self.downcast()
+    }
 }
diff --git a/src/daft-core/src/series/ops/take.rs b/src/daft-core/src/series/ops/take.rs
index 44f4e93303..7d61500f65 100644
--- a/src/daft-core/src/series/ops/take.rs
+++ b/src/daft-core/src/series/ops/take.rs
@@ -27,10 +27,6 @@ impl Series {
         self.inner.str_value(idx)
     }
 
-    pub fn html_value(&self, idx: usize) -> String {
-        self.inner.html_value(idx)
-    }
-
     pub fn to_str_values(&self) -> DaftResult<Series> {
         let iter =
             IndexRange::new(0i64, self.len() as i64).map(|i| self.str_value(i as usize).ok());
diff --git a/src/daft-core/src/series/series_like.rs b/src/daft-core/src/series/series_like.rs
index 56052c81b3..10d75b7540 100644
--- a/src/daft-core/src/series/series_like.rs
+++ b/src/daft-core/src/series/series_like.rs
@@ -34,7 +34,6 @@ pub trait SeriesLike: Send + Sync + Any + std::fmt::Debug {
     fn slice(&self, start: usize, end: usize) -> DaftResult<Series>;
     fn take(&self, idx: &Series) -> DaftResult<Series>;
     fn str_value(&self, idx: usize) -> DaftResult<String>;
-    // fn html_value(&self, idx: usize) -> String;
     fn add(&self, rhs: &Series) -> DaftResult<Series>;
     fn sub(&self, rhs: &Series) -> DaftResult<Series>;
     fn mul(&self, rhs: &Series) -> DaftResult<Series>;
diff --git a/src/daft-functions/src/image/decode.rs b/src/daft-functions/src/image/decode.rs
index 86b567a780..ccb295caf5 100644
--- a/src/daft-functions/src/image/decode.rs
+++ b/src/daft-functions/src/image/decode.rs
@@ -68,7 +68,7 @@ impl ScalarUDF for ImageDecode {
     fn evaluate(&self, inputs: &[Series]) -> DaftResult<Series> {
         let raise_error_on_failure = self.raise_on_error;
         match inputs {
-            [input] => daft_image::series::decode(input, raise_error_on_failure, self.mode),
+            [input] => daft_image::series::decode(input, raise_error_on_failure, self.mode),
             _ => Err(DaftError::ValueError(format!(
                 "Expected 1 input arg, got {}",
                 inputs.len()
diff --git a/src/daft-image/src/image_buffer.rs b/src/daft-image/src/image_buffer.rs
index d51ff32e58..0855fd4ec8 100644
--- a/src/daft-image/src/image_buffer.rs
+++ b/src/daft-image/src/image_buffer.rs
@@ -126,7 +126,7 @@ impl<'a> DaftImageBuffer<'a> {
             self.width(),
             self.height(),
             self.color(),
-            image::ImageFormat::from(image_format),
+            convert_img_fmt(image_format),
         )
         .map_err(|e| {
             DaftError::ValueError(format!(
@@ -305,3 +305,13 @@ impl<'a> From<DaftImageBuffer<'a>> for DynamicImage {
         }
     }
 }
+
+fn convert_img_fmt(fmt: ImageFormat) -> image::ImageFormat {
+    match fmt {
+        ImageFormat::PNG => image::ImageFormat::Png,
+        ImageFormat::JPEG => image::ImageFormat::Jpeg,
+        ImageFormat::TIFF => image::ImageFormat::Tiff,
+        ImageFormat::GIF => image::ImageFormat::Gif,
+        ImageFormat::BMP => image::ImageFormat::Bmp,
+    }
+}
diff --git a/src/daft-image/src/kernel.rs b/src/daft-image/src/kernel.rs
index bfaa8f226a..c949df9463 100644
--- a/src/daft-image/src/kernel.rs
+++ b/src/daft-image/src/kernel.rs
@@ -9,6 +9,7 @@ use num_traits::FromPrimitive;
 use std::borrow::Cow;
 use std::sync::Arc;
 
+#[allow(clippy::len_without_is_empty)]
 pub trait AsImageObj {
     fn name(&self) -> &str;
     fn len(&self) -> usize;
@@ -41,7 +42,7 @@ pub(crate) fn image_array_from_img_buffers(
 ) -> DaftResult<ImageArray> {
     use DaftImageBuffer::*;
     let is_all_u8 = inputs
-        .into_iter()
+        .iter()
         .filter_map(|b| b.as_ref())
         .all(|b| matches!(b, L(..) | LA(..) | RGB(..) | RGBA(..)));
     assert!(is_all_u8);
@@ -201,7 +202,7 @@ impl ImageOps for FixedShapeImageArray {
     {
         let result = resize_images(self, w, h);
         let mode = self.image_mode();
-        fixed_image_array_from_img_buffers(self.name(), result.as_slice(), &mode, h, w)
+        fixed_image_array_from_img_buffers(self.name(), result.as_slice(), mode, h, w)
     }
 
     fn crop(&self, bboxes: &FixedSizeListArray) -> DaftResult<ImageArray>
@@ -221,7 +222,7 @@ impl ImageOps for FixedShapeImageArray {
         };
         let result = crop_images(self, &mut bboxes_iterator);
-        image_array_from_img_buffers(self.name(), result.as_slice(), &Some(self.image_mode().clone()))
+        image_array_from_img_buffers(self.name(), result.as_slice(), &Some(*self.image_mode()))
     }
 
     fn resize_to_fixed_shape_image_array(
@@ -323,7 +324,7 @@ impl AsImageObj for FixedShapeImageArray {
         let end = (idx + 1) * size as usize;
         let slice_data = Cow::Borrowed(&arrow_array.values().as_slice()[start..end] as &'a [u8]);
         let result = DaftImageBuffer::from_raw(mode, *width, *height, slice_data);
-
+
         assert_eq!(result.height(), *height);
         assert_eq!(result.width(), *width);
         Some(result)
@@ -333,10 +334,10 @@ impl AsImageObj for FixedShapeImageArray {
     }
 }
 
-fn encode_images<'a, Arr>(images: &'a Arr, image_format: ImageFormat) -> DaftResult<BinaryArray>
-where
-    Arr: AsImageObj,
-{
+fn encode_images<Arr: AsImageObj>(
+    images: &Arr,
+    image_format: ImageFormat,
+) -> DaftResult<BinaryArray> {
     let arrow_array = match image_format {
         ImageFormat::TIFF => {
             // NOTE: A single writer/buffer can't be used for TIFF files because the encoder will overwrite the
@@ -421,10 +422,7 @@ where
     )
 }
 
-fn resize_images<'a, Arr>(images: &'a Arr, w: u32, h: u32) -> Vec<Option<DaftImageBuffer<'a>>>
-where
-    Arr: AsImageObj,
-{
+fn resize_images<Arr: AsImageObj>(images: &Arr, w: u32, h: u32) -> Vec<Option<DaftImageBuffer>> {
     ImageBufferIter::new(images)
         .map(|img| img.map(|img| img.resize(w, h)))
         .collect::<Vec<_>>()
 }
@@ -446,7 +444,28 @@ where
         .collect::<Vec<_>>()
 }
 
-pub fn html_value(arr: &ImageArray, idx: usize) -> String {
+pub fn image_html_value(arr: &ImageArray, idx: usize) -> String {
+    let maybe_image = arr.as_image_obj(idx);
+    let str_val = arr.str_value(idx).unwrap();
+
+    match maybe_image {
+        None => "None".to_string(),
+        Some(image) => {
+            let thumb = image.fit_to(128, 128);
+            let mut bytes: Vec<u8> = vec![];
+            let mut writer = std::io::BufWriter::new(std::io::Cursor::new(&mut bytes));
+            thumb.encode(ImageFormat::JPEG, &mut writer).unwrap();
+            drop(writer);
+            format!(
+                "<img style=\"max-height:128px;width:auto\" src=\"data:image/png;base64, {}\" alt=\"{}\" />",
+                base64::engine::general_purpose::STANDARD.encode(&mut bytes),
+                str_val,
+            )
+        }
+    }
+}
+
+pub fn fixed_image_html_value(arr: &FixedShapeImageArray, idx: usize) -> String {
     let maybe_image = arr.as_image_obj(idx);
     let str_val = arr.str_value(idx).unwrap();
 
diff --git a/src/daft-image/src/lib.rs b/src/daft-image/src/lib.rs
index 2113ebd987..6f75eb2c13 100644
--- a/src/daft-image/src/lib.rs
+++ b/src/daft-image/src/lib.rs
@@ -10,4 +10,4 @@ pub mod series;
 mod python;
 #[cfg(feature = "python")]
-pub use python::*;
\ No newline at end of file
+pub use python::*;
diff --git a/src/daft-table/Cargo.toml b/src/daft-table/Cargo.toml
index 17429f260a..2c0eb70bfb 100644
--- a/src/daft-table/Cargo.toml
+++ b/src/daft-table/Cargo.toml
@@ -4,6 +4,7 @@ comfy-table = {workspace = true}
 common-error = {path = "../common/error", default-features = false}
 daft-core = {path = "../daft-core", default-features = false}
 daft-dsl = {path = "../daft-dsl", default-features = false}
+daft-image = {path = "../daft-image", default-features = false}
 html-escape = {workspace = true}
 num-traits = {workspace = true}
 pyo3 = {workspace = true, optional = true}
@@ -11,7 +12,13 @@ rand = {workspace = true}
 serde = {workspace = true}
 
 [features]
-python = ["dep:pyo3", "common-error/python", "daft-core/python", "daft-dsl/python"]
+python = [
+  "dep:pyo3",
+  "common-error/python",
+  "daft-core/python",
+  "daft-dsl/python",
+  "daft-image/python"
+]
 
 [package]
 edition = {workspace = true}
diff --git a/src/daft-table/src/lib.rs b/src/daft-table/src/lib.rs
index c190171d4a..033c349982 100644
--- a/src/daft-table/src/lib.rs
+++ b/src/daft-table/src/lib.rs
@@ -24,6 +24,7 @@ pub mod ffi;
 mod growable;
 mod ops;
 mod probe_table;
+mod repr_html;
 
 pub use growable::GrowableTable;
 
@@ -33,6 +34,7 @@ pub use probe_table::{ProbeTable, ProbeTableBuilder};
 pub mod python;
 #[cfg(feature = "python")]
 pub use python::register_modules;
+use repr_html::html_value;
 
 #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
 pub struct Table {
@@ -714,7 +716,7 @@ impl Table {
 
         for col in self.columns.iter() {
             res.push_str(styled_td);
-            res.push_str(&col.html_value(i));
+            res.push_str(&html_value(col, i));
             res.push_str("</div></td>");
         }
 
@@ -736,7 +738,7 @@ impl Table {
 
         for col in self.columns.iter() {
             res.push_str(styled_td);
-            res.push_str(&col.html_value(i));
+            res.push_str(&html_value(col, i));
             res.push_str("</div></td>");
         }
diff --git a/src/daft-table/src/repr_html.rs b/src/daft-table/src/repr_html.rs
new file mode 100644
index 0000000000..bf558561c2
--- /dev/null
+++ b/src/daft-table/src/repr_html.rs
@@ -0,0 +1,139 @@
+use daft_core::datatypes::ExtensionArray;
+use daft_core::{prelude::DataType, series::Series};
+
+pub fn html_value(s: &Series, idx: usize) -> String {
+    match s.data_type() {
+        DataType::Image(_) => {
+            let arr = s.image().unwrap();
+            daft_image::kernel::image_html_value(arr, idx)
+        }
+        DataType::Null => {
+            let arr = s.null().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Boolean => {
+            let arr = s.bool().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Int8 => {
+            let arr = s.i8().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Int16 => {
+            let arr = s.i16().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Int32 => {
+            let arr = s.i32().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Int64 => {
+            let arr = s.i64().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Int128 => {
+            let arr = s.i128().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::UInt8 => {
+            let arr = s.u8().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::UInt16 => {
+            let arr = s.u16().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::UInt32 => {
+            let arr = s.u32().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::UInt64 => {
+            let arr = s.u64().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Float32 => {
+            let arr = s.f32().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Float64 => {
+            let arr = s.f64().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Decimal128(_, _) => {
+            let arr = s.decimal128().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Timestamp(_, _) => {
+            let arr = s.timestamp().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Date => {
+            let arr = s.date().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Time(_) => {
+            let arr = s.time().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Duration(_) => {
+            let arr = s.duration().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Binary => {
+            let arr = s.binary().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::FixedSizeBinary(_) => {
+            let arr = s.fixed_size_binary().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Utf8 => {
+            let arr = s.utf8().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::FixedSizeList(_, _) => {
+            let arr = s.fixed_size_list().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::List(_) => {
+            let arr = s.list().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Struct(_) => {
+            let arr = s.struct_().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Map(_) => {
+            let arr = s.map().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Extension(_, _, _) => {
+            let arr = s.downcast::<ExtensionArray>().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Embedding(_, _) => {
+            let arr = s.embedding().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::FixedShapeImage(_, _, _) => {
+            let arr = s.fixed_size_image().unwrap();
+            daft_image::kernel::fixed_image_html_value(arr, idx)
+        }
+        DataType::Tensor(_) => {
+            let arr = s.tensor().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::FixedShapeTensor(_, _) => {
+            let arr = s.fixed_shape_tensor().unwrap();
+            arr.html_value(idx)
+        }
+        #[cfg(feature = "python")]
+        DataType::Python => {
+            let arr = s.python().unwrap();
+            arr.html_value(idx)
+        }
+        DataType::Unknown => {
+            panic!("Unknown data type")
+        }
+    }
+}

From aa3f4299919091d4f81d3fb8a9dbdc93a619dd8a Mon Sep 17 00:00:00 2001
From: universalmind303
Date: Fri, 6 Sep 2024 15:04:18 -0700
Subject: [PATCH 4/8] machete

---
 Cargo.lock                |  2 --
 src/daft-core/Cargo.toml  |  1 -
 src/daft-image/Cargo.toml | 13 ++++++-------
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0fc3913d57..2bac63b686 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1691,7 +1691,6 @@ version = "0.3.0-dev0"
 dependencies = [
  "aho-corasick",
  "arrow2",
- "base64 0.22.1",
  "bincode",
  "chrono",
  "chrono-tz",
@@ -1824,7 +1823,6 @@ dependencies = [
  "log",
  "num-traits",
  "pyo3",
- "serde",
 ]
diff --git a/src/daft-core/Cargo.toml b/src/daft-core/Cargo.toml
index af331c7187..5323d86427 100644
--- a/src/daft-core/Cargo.toml
+++ b/src/daft-core/Cargo.toml
@@ -15,7 +15,6 @@ arrow2 = {workspace = true, features = [
   "compute_substring",
   "io_ipc"
 ]}
-base64 = "0.22.0"
 bincode = {workspace = true}
 chrono = {workspace = true}
 chrono-tz = {workspace = true}
diff --git a/src/daft-image/Cargo.toml b/src/daft-image/Cargo.toml
index 91ae0c5854..6dab7b3b8f 100644
--- a/src/daft-image/Cargo.toml
+++ b/src/daft-image/Cargo.toml
@@ -1,17 +1,11 @@
-[package]
-name = "daft-image"
-edition.workspace = true
-version.workspace = true
-
 [dependencies]
 arrow2 = {workspace = true}
-base64.workspace = true
 common-error = {path = "../common/error", default-features = false}
 daft-core = {path = "../daft-core", default-features = false}
 log = {workspace = true}
 num-traits = "0.2.19"
 pyo3 = {workspace = true, optional = true}
-serde = {workspace = true}
+base64.workspace = true
 
 [dependencies.image]
 default-features = false
@@ -23,3 +17,8 @@ python = [
   "dep:pyo3",
   "common-error/python"
 ]
+
+[package]
+name = "daft-image"
+edition.workspace = true
+version.workspace = true
From 30085dddde3a9ed8023008ac8bbe88be79705647 Mon Sep 17 00:00:00 2001
17 00:00:00 2001 From: universalmind303 Date: Fri, 6 Sep 2024 15:07:38 -0700 Subject: [PATCH 5/8] cleanup --- src/daft-core/src/datatypes/image_mode.rs | 46 ----------------------- 1 file changed, 46 deletions(-) diff --git a/src/daft-core/src/datatypes/image_mode.rs b/src/daft-core/src/datatypes/image_mode.rs index 7fbaf1283a..d54c847efb 100644 --- a/src/daft-core/src/datatypes/image_mode.rs +++ b/src/daft-core/src/datatypes/image_mode.rs @@ -123,52 +123,6 @@ impl ImageMode { } } -// impl From for image::ColorType { -// fn from(image_mode: ImageMode) -> image::ColorType { -// use image::ColorType; -// use ImageMode::*; - -// match image_mode { -// L => ColorType::L8, -// LA => ColorType::La8, -// RGB => ColorType::Rgb8, -// RGBA => ColorType::Rgba8, -// L16 => ColorType::L16, -// LA16 => ColorType::La16, -// RGB16 => ColorType::Rgb16, -// RGBA16 => ColorType::Rgba16, -// RGB32F => ColorType::Rgb32F, -// RGBA32F => ColorType::Rgba32F, -// } -// } -// } - -// impl TryFrom for ImageMode { -// type Error = DaftError; - -// fn try_from(color: image::ColorType) -> DaftResult { -// use image::ColorType; -// use ImageMode::*; - -// match color { -// ColorType::L8 => Ok(L), -// ColorType::La8 => Ok(LA), -// ColorType::Rgb8 => Ok(RGB), -// ColorType::Rgba8 => Ok(RGBA), -// ColorType::L16 => Ok(L16), -// ColorType::La16 => Ok(LA16), -// ColorType::Rgb16 => Ok(RGB16), -// ColorType::Rgba16 => Ok(RGBA16), -// ColorType::Rgb32F => Ok(RGB32F), -// ColorType::Rgba32F => Ok(RGBA32F), -// _ => Err(DaftError::ValueError(format!( -// "Color type {:?} is not supported.", -// color -// ))), -// } -// } -// } - impl FromStr for ImageMode { type Err = DaftError; From 802f97d0e37387f99cbb274e9e66ccca5774af33 Mon Sep 17 00:00:00 2001 From: universalmind303 Date: Fri, 6 Sep 2024 15:13:55 -0700 Subject: [PATCH 6/8] cleanup --- Cargo.toml | 2 +- daft/daft/image.pyi | 4 ++-- daft/series.py | 2 +- src/daft-functions/Cargo.toml | 2 +- tests/series/test_image.py | 1 + 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4c8c2e572b..2364ed7330 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,8 +10,8 @@ daft-core = {path = "src/daft-core", default-features = false} daft-csv = {path = "src/daft-csv", default-features = false} daft-dsl = {path = "src/daft-dsl", default-features = false} daft-functions = {path = "src/daft-functions", default-features = false} -daft-io = {path = "src/daft-io", default-features = false} daft-image = {path = "src/daft-image", default-features = false} +daft-io = {path = "src/daft-io", default-features = false} daft-json = {path = "src/daft-json", default-features = false} daft-local-execution = {path = "src/daft-local-execution", default-features = false} daft-micropartition = {path = "src/daft-micropartition", default-features = false} diff --git a/daft/daft/image.pyi b/daft/daft/image.pyi index df4615c202..db92958aa8 100644 --- a/daft/daft/image.pyi +++ b/daft/daft/image.pyi @@ -1,6 +1,6 @@ -from daft.daft import ImageMode, ImageFormat, PySeries +from daft.daft import ImageFormat, ImageMode, PySeries def decode(s: PySeries, raise_error_on_failure: bool, mode: ImageMode | None = None) -> PySeries: ... def encode(s: PySeries, image_format: ImageFormat) -> PySeries: ... def resize(s: PySeries, w: int, h: int) -> PySeries: ... -def to_mode(s: PySeries, mode: ImageMode) -> PySeries: ... \ No newline at end of file +def to_mode(s: PySeries, mode: ImageMode) -> PySeries: ... 
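The stubs above reflect the new call shape: the image kernels now live in the daft-image crate and are exposed as free functions over a series, rather than as methods on PySeries. A minimal sketch of the equivalent Rust-side pipeline, assuming a Series `s` holding encoded image bytes; the `thumbnail` helper and its constants are illustrative, not part of this patch, though `decode`, `resize`, and `encode` use exactly the signatures shown in src/daft-image/src/series.rs:

    use common_error::DaftResult;
    use daft_core::prelude::*;

    // Decode binary data into images, resize them, and re-encode as PNG,
    // chaining the free functions that replaced the old PySeries methods.
    fn thumbnail(s: &Series) -> DaftResult<Series> {
        let decoded = daft_image::series::decode(s, /*raise_error_on_failure=*/ true, Some(ImageMode::RGB))?;
        let resized = daft_image::series::resize(&decoded, 64, 64)?;
        daft_image::series::encode(&resized, ImageFormat::PNG)
    }

Keeping the kernels behind plain functions is apparently what lets daft-table and daft-functions depend on daft-image directly, instead of routing image logic through daft-core.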
diff --git a/daft/series.py b/daft/series.py index 97242615a4..6db017a4f4 100644 --- a/daft/series.py +++ b/daft/series.py @@ -1029,7 +1029,7 @@ def resize(self, w: int, h: int) -> Series: if not isinstance(h, int): raise TypeError(f"expected int for h but got {type(h)}") - return Series._from_pyseries(image.resize(self._series,w, h)) + return Series._from_pyseries(image.resize(self._series, w, h)) def to_mode(self, mode: str | ImageMode) -> Series: if isinstance(mode, str): diff --git a/src/daft-functions/Cargo.toml b/src/daft-functions/Cargo.toml index bd2e1f9d05..bd9281b4ea 100644 --- a/src/daft-functions/Cargo.toml +++ b/src/daft-functions/Cargo.toml @@ -5,8 +5,8 @@ common-error = {path = "../common/error", default-features = false} common-io-config = {path = "../common/io-config", default-features = false} daft-core = {path = "../daft-core", default-features = false} daft-dsl = {path = "../daft-dsl", default-features = false} -daft-io = {path = "../daft-io", default-features = false} daft-image = {path = "../daft-image", default-features = false} +daft-io = {path = "../daft-io", default-features = false} futures = {workspace = true} pyo3 = {workspace = true, optional = true} tiktoken-rs = {workspace = true} diff --git a/tests/series/test_image.py b/tests/series/test_image.py index 5b2bfe5fd4..a86dc49818 100644 --- a/tests/series/test_image.py +++ b/tests/series/test_image.py @@ -11,6 +11,7 @@ from daft.datatype import DaftExtension, DataType from daft.series import Series + MODE_TO_NP_DTYPE = { "L": np.uint8, "LA": np.uint8, From 0a2fb42d0b466359888347dd0b888a80b15d6a30 Mon Sep 17 00:00:00 2001 From: universalmind303 Date: Fri, 6 Sep 2024 15:16:44 -0700 Subject: [PATCH 7/8] cleanup --- src/daft-image/src/iters.rs | 2 +- src/daft-image/src/lib.rs | 2 +- src/daft-image/src/{kernel.rs => ops.rs} | 0 src/daft-image/src/series.rs | 2 +- src/daft-table/src/repr_html.rs | 4 ++-- 5 files changed, 5 insertions(+), 5 deletions(-) rename src/daft-image/src/{kernel.rs => ops.rs} (100%) diff --git a/src/daft-image/src/iters.rs b/src/daft-image/src/iters.rs index 09ef2ce4ce..7a8183e655 100644 --- a/src/daft-image/src/iters.rs +++ b/src/daft-image/src/iters.rs @@ -1,4 +1,4 @@ -use crate::{kernel::AsImageObj, DaftImageBuffer}; +use crate::{ops::AsImageObj, DaftImageBuffer}; pub struct ImageBufferIter<'a, Arr> where diff --git a/src/daft-image/src/lib.rs b/src/daft-image/src/lib.rs index 6f75eb2c13..c89ee04be1 100644 --- a/src/daft-image/src/lib.rs +++ b/src/daft-image/src/lib.rs @@ -1,7 +1,7 @@ mod counting_writer; mod image_buffer; mod iters; -pub mod kernel; +pub mod ops; pub use counting_writer::CountingWriter; pub use image_buffer::DaftImageBuffer; pub mod series; diff --git a/src/daft-image/src/kernel.rs b/src/daft-image/src/ops.rs similarity index 100% rename from src/daft-image/src/kernel.rs rename to src/daft-image/src/ops.rs diff --git a/src/daft-image/src/series.rs b/src/daft-image/src/series.rs index 20c775bb8a..070c6c517f 100644 --- a/src/daft-image/src/series.rs +++ b/src/daft-image/src/series.rs @@ -3,7 +3,7 @@ use daft_core::prelude::*; use common_error::{DaftError, DaftResult}; use crate::{ - kernel::{image_array_from_img_buffers, ImageOps}, + ops::{image_array_from_img_buffers, ImageOps}, DaftImageBuffer, }; fn image_decode_impl( diff --git a/src/daft-table/src/repr_html.rs b/src/daft-table/src/repr_html.rs index bf558561c2..aaabc6efa8 100644 --- a/src/daft-table/src/repr_html.rs +++ b/src/daft-table/src/repr_html.rs @@ -5,7 +5,7 @@ pub fn html_value(s: &Series, idx: usize) -> String 
{
     match s.data_type() {
         DataType::Image(_) => {
             let arr = s.image().unwrap();
-            daft_image::kernel::image_html_value(arr, idx)
+            daft_image::ops::image_html_value(arr, idx)
         }
         DataType::Null => {
             let arr = s.null().unwrap();
@@ -117,7 +117,7 @@ pub fn html_value(s: &Series, idx: usize) -> String {
         }
         DataType::FixedShapeImage(_, _, _) => {
             let arr = s.fixed_size_image().unwrap();
-            daft_image::kernel::fixed_image_html_value(arr, idx)
+            daft_image::ops::fixed_image_html_value(arr, idx)
         }
         DataType::Tensor(_) => {
             let arr = s.tensor().unwrap();

From 44841fb157f244253f9e6dd93a0d36a34de4f215 Mon Sep 17 00:00:00 2001
From: universalmind303
Date: Tue, 10 Sep 2024 11:17:15 -0500
Subject: [PATCH 8/8] pr feedback

---
 src/common/display/src/table_display.rs |   12 +-
 src/daft-core/src/array/image_array.rs  |   66 +-
 src/daft-core/src/array/ops/image.rs    | 1100 -----------------------
 src/daft-image/src/lib.rs               |    9 +-
 src/daft-image/src/python.rs            |    1 -
 src/daft-image/src/series.rs            |   49 +-
 src/lib.rs                              |    2 +-
 7 files changed, 70 insertions(+), 1169 deletions(-)
 delete mode 100644 src/daft-core/src/array/ops/image.rs

diff --git a/src/common/display/src/table_display.rs b/src/common/display/src/table_display.rs
index 6713346863..844dc95292 100644
--- a/src/common/display/src/table_display.rs
+++ b/src/common/display/src/table_display.rs
@@ -2,6 +2,14 @@ pub use comfy_table;

 const BOLD_TABLE_HEADERS_IN_DISPLAY: &str = "DAFT_BOLD_TABLE_HEADERS";

+pub trait StrValue {
+    fn str_value(&self, idx: usize) -> String;
+}
+
+pub trait HTMLValue {
+    fn html_value(&self, idx: usize) -> String;
+}
+
 // this should be factored out to a common crate
 fn create_table_cell(value: &str) -> comfy_table::Cell {
     let mut attributes = vec![];
@@ -45,10 +53,6 @@ pub fn make_schema_vertical_table(
     table
 }

-pub trait StrValue {
-    fn str_value(&self, idx: usize) -> String;
-}
-
 pub fn make_comfy_table<S: AsRef<str>>(
     fields: &[S],
     columns: Option<&[&dyn StrValue]>,
diff --git a/src/daft-core/src/array/image_array.rs b/src/daft-core/src/array/image_array.rs
index 424c9f759a..7422760380 100644
--- a/src/daft-core/src/array/image_array.rs
+++ b/src/daft-core/src/array/image_array.rs
@@ -1,4 +1,3 @@
-use std::io::{Seek, SeekFrom, Write};
 use std::vec;

 use common_error::DaftResult;
@@ -28,50 +27,6 @@ impl BBox {
     }
 }

-type IOResult<T = (), E = std::io::Error> = std::result::Result<T, E>;
-
-/// A wrapper of a writer that tracks the number of bytes successfully written.
-pub struct CountingWriter<W> {
-    inner: W,
-    count: u64,
-}
-
-impl<W> CountingWriter<W> {
-    /// The number of bytes successfully written so far.
-    pub fn count(&self) -> u64 {
-        self.count
-    }
-
-    /// Extracts the inner writer, discarding this wrapper.
-    pub fn into_inner(self) -> W {
-        self.inner
-    }
-}
-
-impl<W> From<W> for CountingWriter<W> {
-    fn from(inner: W) -> Self {
-        Self { inner, count: 0 }
-    }
-}
-
-impl<W: Write> Write for CountingWriter<W> {
-    fn write(&mut self, buf: &[u8]) -> IOResult<usize> {
-        let written = self.inner.write(buf)?;
-        self.count += written as u64;
-        Ok(written)
-    }
-
-    fn flush(&mut self) -> IOResult {
-        self.inner.flush()
-    }
-}
-
-impl<W: Write + Seek> Seek for CountingWriter<W> {
-    fn seek(&mut self, pos: SeekFrom) -> IOResult<u64> {
-        self.inner.seek(pos)
-    }
-}
-
 pub struct ImageArraySidecarData {
     pub channels: Vec<u16>,
     pub heights: Vec<u32>,
@@ -81,6 +36,12 @@ pub struct ImageArraySidecarData {
 }

 impl ImageArray {
+    pub const IMAGE_DATA_IDX: usize = 0;
+    pub const IMAGE_CHANNEL_IDX: usize = 1;
+    pub const IMAGE_HEIGHT_IDX: usize = 2;
+    pub const IMAGE_WIDTH_IDX: usize = 3;
+    pub const IMAGE_MODE_IDX: usize = 4;
+
     pub fn image_mode(&self) -> &Option<ImageMode> {
         match self.data_type() {
             DataType::Image(mode) => mode,
@@ -89,32 +50,27 @@ impl ImageArray {
         }
     }

     pub fn data_array(&self) -> &ListArray {
-        const IMAGE_DATA_IDX: usize = 0;
-        let array = self.physical.children.get(IMAGE_DATA_IDX).unwrap();
+        let array = self.physical.children.get(Self::IMAGE_DATA_IDX).unwrap();
         array.list().unwrap()
     }

     pub fn channel_array(&self) -> &arrow2::array::UInt16Array {
-        const IMAGE_CHANNEL_IDX: usize = 1;
-        let array = self.physical.children.get(IMAGE_CHANNEL_IDX).unwrap();
+        let array = self.physical.children.get(Self::IMAGE_CHANNEL_IDX).unwrap();
         array.u16().unwrap().as_arrow()
     }

     pub fn height_array(&self) -> &arrow2::array::UInt32Array {
-        const IMAGE_HEIGHT_IDX: usize = 2;
-        let array = self.physical.children.get(IMAGE_HEIGHT_IDX).unwrap();
+        let array = self.physical.children.get(Self::IMAGE_HEIGHT_IDX).unwrap();
         array.u32().unwrap().as_arrow()
     }

     pub fn width_array(&self) -> &arrow2::array::UInt32Array {
-        const IMAGE_WIDTH_IDX: usize = 3;
-        let array = self.physical.children.get(IMAGE_WIDTH_IDX).unwrap();
+        let array = self.physical.children.get(Self::IMAGE_WIDTH_IDX).unwrap();
         array.u32().unwrap().as_arrow()
     }

     pub fn mode_array(&self) -> &arrow2::array::UInt8Array {
-        const IMAGE_MODE_IDX: usize = 4;
-        let array = self.physical.children.get(IMAGE_MODE_IDX).unwrap();
+        let array = self.physical.children.get(Self::IMAGE_MODE_IDX).unwrap();
         array.u8().unwrap().as_arrow()
     }

diff --git a/src/daft-core/src/array/ops/image.rs b/src/daft-core/src/array/ops/image.rs
deleted file mode 100644
index 0e6316cfb1..0000000000
--- a/src/daft-core/src/array/ops/image.rs
+++ /dev/null
@@ -1,1100 +0,0 @@
-use std::borrow::Cow;
-use std::io::{Seek, SeekFrom, Write};
-use std::sync::Arc;
-use std::vec;
-
-use image::{ColorType, DynamicImage, ImageBuffer};
-
-use crate::array::prelude::*;
-use crate::datatypes::prelude::*;
-
-use crate::series::{IntoSeries, Series};
-use common_error::{DaftError, DaftResult};
-use image::{Luma, LumaA, Rgb, Rgba};
-
-use super::full::FullNull;
-use super::{as_arrow::AsArrow, from_arrow::FromArrow};
-use num_traits::FromPrimitive;
-
-use std::ops::Deref;
-
-#[derive(Clone)]
-pub struct BBox(u32, u32, u32, u32);
-
-impl BBox {
-    pub fn from_u32_arrow_array(arr: &dyn arrow2::array::Array) -> Self {
-        assert!(arr.len() == 4);
-        let mut iter = arr
-            .as_any()
-            .downcast_ref::<arrow2::array::UInt32Array>()
-            .unwrap()
-            .iter();
-        BBox(
-            *iter.next().unwrap().unwrap(),
-            *iter.next().unwrap().unwrap(),
-            *iter.next().unwrap().unwrap(),
-            *iter.next().unwrap().unwrap(),
-        )
-    }
-}
-
-#[allow(clippy::upper_case_acronyms, dead_code)]
-#[derive(Debug)]
-pub enum DaftImageBuffer<'a> {
-    L(ImageBuffer<Luma<u8>, Cow<'a, [u8]>>),
-    LA(ImageBuffer<LumaA<u8>, Cow<'a, [u8]>>),
-    RGB(ImageBuffer<Rgb<u8>, Cow<'a, [u8]>>),
-    RGBA(ImageBuffer<Rgba<u8>, Cow<'a, [u8]>>),
-    L16(ImageBuffer<Luma<u16>, Cow<'a, [u16]>>),
-    LA16(ImageBuffer<LumaA<u16>, Cow<'a, [u16]>>),
-    RGB16(ImageBuffer<Rgb<u16>, Cow<'a, [u16]>>),
-    RGBA16(ImageBuffer<Rgba<u16>, Cow<'a, [u16]>>),
-    RGB32F(ImageBuffer<Rgb<f32>, Cow<'a, [f32]>>),
-    RGBA32F(ImageBuffer<Rgba<f32>, Cow<'a, [f32]>>),
-}
-
-macro_rules! with_method_on_image_buffer {
-    (
-    $key_type:expr, $method: ident
-) => {{
-        match $key_type {
-            DaftImageBuffer::L(img) => img.$method(),
-            DaftImageBuffer::LA(img) => img.$method(),
-            DaftImageBuffer::RGB(img) => img.$method(),
-            DaftImageBuffer::RGBA(img) => img.$method(),
-            DaftImageBuffer::L16(img) => img.$method(),
-            DaftImageBuffer::LA16(img) => img.$method(),
-            DaftImageBuffer::RGB16(img) => img.$method(),
-            DaftImageBuffer::RGBA16(img) => img.$method(),
-            DaftImageBuffer::RGB32F(img) => img.$method(),
-            DaftImageBuffer::RGBA32F(img) => img.$method(),
-        }
-    }};
-}
-
-type IOResult<T = (), E = std::io::Error> = std::result::Result<T, E>;
-
-/// A wrapper of a writer that tracks the number of bytes successfully written.
-pub struct CountingWriter<W> {
-    inner: W,
-    count: u64,
-}
-
-impl<W> CountingWriter<W> {
-    /// The number of bytes successfully written so far.
-    pub fn count(&self) -> u64 {
-        self.count
-    }
-
-    /// Extracts the inner writer, discarding this wrapper.
-    pub fn into_inner(self) -> W {
-        self.inner
-    }
-}
-
-impl<W> From<W> for CountingWriter<W> {
-    fn from(inner: W) -> Self {
-        Self { inner, count: 0 }
-    }
-}
-
-impl<W: Write> Write for CountingWriter<W> {
-    fn write(&mut self, buf: &[u8]) -> IOResult<usize> {
-        let written = self.inner.write(buf)?;
-        self.count += written as u64;
-        Ok(written)
-    }
-
-    fn flush(&mut self) -> IOResult {
-        self.inner.flush()
-    }
-}
-
-impl<W: Write + Seek> Seek for CountingWriter<W> {
-    fn seek(&mut self, pos: SeekFrom) -> IOResult<u64> {
-        self.inner.seek(pos)
-    }
-}
-
-struct Wrap<T>(T);
-
-impl From<image::ImageFormat> for Wrap<ImageFormat> {
-    fn from(image_format: image::ImageFormat) -> Self {
-        Wrap(match image_format {
-            image::ImageFormat::Png => ImageFormat::PNG,
-            image::ImageFormat::Jpeg => ImageFormat::JPEG,
-            image::ImageFormat::Tiff => ImageFormat::TIFF,
-            image::ImageFormat::Gif => ImageFormat::GIF,
-            image::ImageFormat::Bmp => ImageFormat::BMP,
-            _ => unimplemented!("Image format {:?} is not supported", image_format),
-        })
-    }
-}
-
-impl From<Wrap<ImageFormat>> for image::ImageFormat {
-    fn from(image_format: Wrap<ImageFormat>) -> Self {
-        match image_format.0 {
-            ImageFormat::PNG => image::ImageFormat::Png,
-            ImageFormat::JPEG => image::ImageFormat::Jpeg,
-            ImageFormat::TIFF => image::ImageFormat::Tiff,
-            ImageFormat::GIF => image::ImageFormat::Gif,
-            ImageFormat::BMP => image::ImageFormat::Bmp,
-        }
-    }
-}
-
-impl From<Wrap<ImageMode>> for image::ColorType {
-    fn from(image_mode: Wrap<ImageMode>) -> image::ColorType {
-        use image::ColorType;
-        match image_mode.0 {
-            ImageMode::L => ColorType::L8,
-            ImageMode::LA => ColorType::La8,
-            ImageMode::RGB => ColorType::Rgb8,
-            ImageMode::RGBA => ColorType::Rgba8,
-            ImageMode::L16 => ColorType::L16,
-            ImageMode::LA16 => ColorType::La16,
-            ImageMode::RGB16 => ColorType::Rgb16,
-            ImageMode::RGBA16 => ColorType::Rgba16,
-            ImageMode::RGB32F => ColorType::Rgb32F,
-            ImageMode::RGBA32F => ColorType::Rgba32F,
-        }
-    }
-}
-
-impl TryFrom<image::ColorType> for Wrap<ImageMode> {
-    type Error = DaftError;
-
-    fn try_from(color: image::ColorType) -> DaftResult<Self> {
-        use image::ColorType;
-        Ok(Wrap(match color {
-            ColorType::L8 => Ok(ImageMode::L),
-            ColorType::La8 => Ok(ImageMode::LA),
-            ColorType::Rgb8 => Ok(ImageMode::RGB),
-            ColorType::Rgba8 => Ok(ImageMode::RGBA),
-            ColorType::L16 => Ok(ImageMode::L16),
-            ColorType::La16 =>
Ok(ImageMode::LA16), - ColorType::Rgb16 => Ok(ImageMode::RGB16), - ColorType::Rgba16 => Ok(ImageMode::RGBA16), - ColorType::Rgb32F => Ok(ImageMode::RGB32F), - ColorType::Rgba32F => Ok(ImageMode::RGBA32F), - _ => Err(DaftError::ValueError(format!( - "Color type {:?} is not supported.", - color - ))), - }?)) - } -} - -impl<'a> DaftImageBuffer<'a> { - pub fn height(&self) -> u32 { - with_method_on_image_buffer!(self, height) - } - - pub fn width(&self) -> u32 { - with_method_on_image_buffer!(self, width) - } - - pub fn as_u8_slice(&'a self) -> &'a [u8] { - match self { - DaftImageBuffer::L(img) => img.as_raw(), - DaftImageBuffer::LA(img) => img.as_raw(), - DaftImageBuffer::RGB(img) => img.as_raw(), - DaftImageBuffer::RGBA(img) => img.as_raw(), - _ => unimplemented!("unimplemented {self:?}"), - } - } - - pub fn color(&self) -> ColorType { - Wrap(self.mode()).into() - } - - pub fn mode(&self) -> ImageMode { - match self { - DaftImageBuffer::L(..) => ImageMode::L, - DaftImageBuffer::LA(..) => ImageMode::LA, - DaftImageBuffer::RGB(..) => ImageMode::RGB, - DaftImageBuffer::RGBA(..) => ImageMode::RGBA, - DaftImageBuffer::L16(..) => ImageMode::L16, - DaftImageBuffer::LA16(..) => ImageMode::LA16, - DaftImageBuffer::RGB16(..) => ImageMode::RGB16, - DaftImageBuffer::RGBA16(..) => ImageMode::RGBA16, - DaftImageBuffer::RGB32F(..) => ImageMode::RGB32F, - DaftImageBuffer::RGBA32F(..) => ImageMode::RGBA32F, - } - } - - pub fn decode(bytes: &[u8]) -> DaftResult { - image::load_from_memory(bytes) - .map(|v| v.into()) - .map_err(|e| DaftError::ValueError(format!("Decoding image from bytes failed: {}", e))) - } - - pub fn encode(&self, image_format: ImageFormat, writer: &mut W) -> DaftResult<()> - where - W: Write + Seek, - { - image::write_buffer_with_format( - writer, - self.as_u8_slice(), - self.width(), - self.height(), - self.color(), - image::ImageFormat::from(Wrap(image_format)), - ) - .map_err(|e| { - DaftError::ValueError(format!( - "Encoding image into file format {} failed: {}", - image_format, e - )) - }) - } - - pub fn fit_to(&self, w: u32, h: u32) -> Self { - // Preserving aspect ratio, resize an image to fit within the specified dimensions. 
- let scale_factor = { - let width_scale = w as f64 / self.width() as f64; - let height_scale = h as f64 / self.height() as f64; - width_scale.min(height_scale) - }; - let new_w = self.width() as f64 * scale_factor; - let new_h = self.height() as f64 * scale_factor; - - self.resize(new_w.floor() as u32, new_h.floor() as u32) - } - - pub fn resize(&self, w: u32, h: u32) -> Self { - match self { - DaftImageBuffer::L(imgbuf) => { - let result = - image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); - DaftImageBuffer::L(image_buffer_vec_to_cow(result)) - } - DaftImageBuffer::LA(imgbuf) => { - let result = - image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); - DaftImageBuffer::LA(image_buffer_vec_to_cow(result)) - } - DaftImageBuffer::RGB(imgbuf) => { - let result = - image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); - DaftImageBuffer::RGB(image_buffer_vec_to_cow(result)) - } - DaftImageBuffer::RGBA(imgbuf) => { - let result = - image::imageops::resize(imgbuf, w, h, image::imageops::FilterType::Triangle); - DaftImageBuffer::RGBA(image_buffer_vec_to_cow(result)) - } - _ => unimplemented!("Mode {self:?} not implemented"), - } - } - - pub fn crop(&self, bbox: &BBox) -> Self { - // HACK(jay): The `.to_image()` method on SubImage takes in `'static` references for some reason - // This hack will ensure that `&self` adheres to that overly prescriptive bound - let inner = - unsafe { std::mem::transmute::<&DaftImageBuffer<'a>, &DaftImageBuffer<'static>>(self) }; - match inner { - DaftImageBuffer::L(imgbuf) => { - let result = - image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); - DaftImageBuffer::L(image_buffer_vec_to_cow(result)) - } - DaftImageBuffer::LA(imgbuf) => { - let result = - image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); - DaftImageBuffer::LA(image_buffer_vec_to_cow(result)) - } - DaftImageBuffer::RGB(imgbuf) => { - let result = - image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); - DaftImageBuffer::RGB(image_buffer_vec_to_cow(result)) - } - DaftImageBuffer::RGBA(imgbuf) => { - let result = - image::imageops::crop_imm(imgbuf, bbox.0, bbox.1, bbox.2, bbox.3).to_image(); - DaftImageBuffer::RGBA(image_buffer_vec_to_cow(result)) - } - _ => unimplemented!("Mode {self:?} not implemented"), - } - } - - pub fn into_mode(self, mode: ImageMode) -> Self { - let img: DynamicImage = self.into(); - // I couldn't find a method from the image crate to do this - let img: DynamicImage = match mode { - ImageMode::L => img.into_luma8().into(), - ImageMode::LA => img.into_luma_alpha8().into(), - ImageMode::RGB => img.into_rgb8().into(), - ImageMode::RGBA => img.into_rgba8().into(), - ImageMode::L16 => img.into_luma16().into(), - ImageMode::LA16 => img.into_luma_alpha16().into(), - ImageMode::RGB16 => img.into_rgb16().into(), - ImageMode::RGBA16 => img.into_rgba16().into(), - ImageMode::RGB32F => img.into_rgb32f().into(), - ImageMode::RGBA32F => img.into_rgba32f().into(), - }; - img.into() - } -} - -fn image_buffer_vec_to_cow<'a, P, T>(input: ImageBuffer>) -> ImageBuffer> -where - P: image::Pixel, - Vec: Deref, - T: ToOwned + std::clone::Clone, - [T]: ToOwned, -{ - let h = input.height(); - let w = input.width(); - let owned: Cow<[T]> = input.into_raw().into(); - ImageBuffer::from_raw(w, h, owned).unwrap() -} - -fn image_buffer_cow_to_vec(input: ImageBuffer>) -> ImageBuffer> -where - P: image::Pixel, - Vec: Deref, - T: ToOwned + std::clone::Clone, - 
[T]: ToOwned, -{ - let h = input.height(); - let w = input.width(); - let owned: Vec = input.into_raw().to_vec(); - ImageBuffer::from_raw(w, h, owned).unwrap() -} - -impl<'a> From for DaftImageBuffer<'a> { - fn from(dyn_img: DynamicImage) -> Self { - match dyn_img { - DynamicImage::ImageLuma8(img_buf) => { - DaftImageBuffer::<'a>::L(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageLumaA8(img_buf) => { - DaftImageBuffer::<'a>::LA(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgb8(img_buf) => { - DaftImageBuffer::<'a>::RGB(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgba8(img_buf) => { - DaftImageBuffer::<'a>::RGBA(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageLuma16(img_buf) => { - DaftImageBuffer::<'a>::L16(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageLumaA16(img_buf) => { - DaftImageBuffer::<'a>::LA16(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgb16(img_buf) => { - DaftImageBuffer::<'a>::RGB16(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgba16(img_buf) => { - DaftImageBuffer::<'a>::RGBA16(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgb32F(img_buf) => { - DaftImageBuffer::<'a>::RGB32F(image_buffer_vec_to_cow(img_buf)) - } - DynamicImage::ImageRgba32F(img_buf) => { - DaftImageBuffer::<'a>::RGBA32F(image_buffer_vec_to_cow(img_buf)) - } - _ => unimplemented!("{dyn_img:?} not implemented"), - } - } -} - -impl<'a> From> for DynamicImage { - fn from(daft_buf: DaftImageBuffer<'a>) -> Self { - match daft_buf { - DaftImageBuffer::L(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::LA(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGB(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGBA(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::L16(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::LA16(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGB16(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGBA16(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGB32F(buf) => image_buffer_cow_to_vec(buf).into(), - DaftImageBuffer::RGBA32F(buf) => image_buffer_cow_to_vec(buf).into(), - } - } -} - -pub struct ImageArraySidecarData { - pub channels: Vec, - pub heights: Vec, - pub widths: Vec, - pub modes: Vec, - pub validity: Option, -} - -pub trait AsImageObj { - fn name(&self) -> &str; - fn len(&self) -> usize; - fn as_image_obj(&self, idx: usize) -> Option>; -} - -pub struct ImageBufferIter<'a, Arr> -where - Arr: AsImageObj, -{ - cursor: usize, - image_array: &'a Arr, -} - -impl<'a, Arr> ImageBufferIter<'a, Arr> -where - Arr: AsImageObj, -{ - pub fn new(image_array: &'a Arr) -> Self { - Self { - cursor: 0usize, - image_array, - } - } -} - -impl<'a, Arr> Iterator for ImageBufferIter<'a, Arr> -where - Arr: AsImageObj, -{ - type Item = Option>; - - fn next(&mut self) -> Option { - if self.cursor >= self.image_array.len() { - None - } else { - let image_obj = self.image_array.as_image_obj(self.cursor); - self.cursor += 1; - Some(image_obj) - } - } -} - -impl ImageArray { - pub fn image_mode(&self) -> &Option { - match self.data_type() { - DataType::Image(mode) => mode, - _ => panic!("Expected dtype to be Image"), - } - } - - pub fn data_array(&self) -> &ListArray { - const IMAGE_DATA_IDX: usize = 0; - let array = self.physical.children.get(IMAGE_DATA_IDX).unwrap(); - array.list().unwrap() - } - - pub fn channel_array(&self) -> &arrow2::array::UInt16Array { - const IMAGE_CHANNEL_IDX: 
usize = 1; - let array = self.physical.children.get(IMAGE_CHANNEL_IDX).unwrap(); - array.u16().unwrap().as_arrow() - } - - pub fn height_array(&self) -> &arrow2::array::UInt32Array { - const IMAGE_HEIGHT_IDX: usize = 2; - let array = self.physical.children.get(IMAGE_HEIGHT_IDX).unwrap(); - array.u32().unwrap().as_arrow() - } - - pub fn width_array(&self) -> &arrow2::array::UInt32Array { - const IMAGE_WIDTH_IDX: usize = 3; - let array = self.physical.children.get(IMAGE_WIDTH_IDX).unwrap(); - array.u32().unwrap().as_arrow() - } - - pub fn mode_array(&self) -> &arrow2::array::UInt8Array { - const IMAGE_MODE_IDX: usize = 4; - let array = self.physical.children.get(IMAGE_MODE_IDX).unwrap(); - array.u8().unwrap().as_arrow() - } - - pub fn from_vecs( - name: &str, - data_type: DataType, - data: Vec, - offsets: Vec, - sidecar_data: ImageArraySidecarData, - ) -> DaftResult { - if data.is_empty() { - return Ok(ImageArray::full_null(name, &data_type, offsets.len() - 1)); - } - let offsets = arrow2::offset::OffsetsBuffer::try_from(offsets)?; - let arrow_dtype: arrow2::datatypes::DataType = T::PRIMITIVE.into(); - if let DataType::Image(Some(mode)) = &data_type { - if mode.get_dtype().to_arrow()? != arrow_dtype { - panic!("Inner value dtype of provided dtype {data_type:?} is inconsistent with inferred value dtype {arrow_dtype:?}"); - } - } - let data_array = ListArray::new( - Field::new("data", DataType::List(Box::new((&arrow_dtype).into()))), - Series::try_from(( - "data", - Box::new(arrow2::array::PrimitiveArray::from_vec(data)) - as Box, - ))?, - offsets, - sidecar_data.validity.clone(), - ); - - Self::from_list_array(name, data_type, data_array, sidecar_data) - } - - pub fn from_list_array( - name: &str, - data_type: DataType, - data_array: ListArray, - sidecar_data: ImageArraySidecarData, - ) -> DaftResult { - let values: Vec = vec![ - data_array.into_series().rename("data"), - UInt16Array::from(( - "channel", - Box::new( - arrow2::array::UInt16Array::from_vec(sidecar_data.channels) - .with_validity(sidecar_data.validity.clone()), - ), - )) - .into_series(), - UInt32Array::from(( - "height", - Box::new( - arrow2::array::UInt32Array::from_vec(sidecar_data.heights) - .with_validity(sidecar_data.validity.clone()), - ), - )) - .into_series(), - UInt32Array::from(( - "width", - Box::new( - arrow2::array::UInt32Array::from_vec(sidecar_data.widths) - .with_validity(sidecar_data.validity.clone()), - ), - )) - .into_series(), - UInt8Array::from(( - "mode", - Box::new( - arrow2::array::UInt8Array::from_vec(sidecar_data.modes) - .with_validity(sidecar_data.validity.clone()), - ), - )) - .into_series(), - ]; - let physical_type = data_type.to_physical(); - let struct_array = StructArray::new( - Field::new(name, physical_type), - values, - sidecar_data.validity, - ); - Ok(ImageArray::new(Field::new(name, data_type), struct_array)) - } - - pub fn encode(&self, image_format: ImageFormat) -> DaftResult { - encode_images(self, image_format) - } - - pub fn resize(&self, w: u32, h: u32) -> DaftResult { - let result = resize_images(self, w, h); - Self::from_daft_image_buffers(self.name(), result.as_slice(), self.image_mode()) - } - - pub fn crop(&self, bboxes: &FixedSizeListArray) -> DaftResult { - let mut bboxes_iterator: Box>> = if bboxes.len() == 1 { - Box::new(std::iter::repeat(bboxes.get(0).map(|bbox| { - BBox::from_u32_arrow_array(bbox.u32().unwrap().data()) - }))) - } else { - Box::new((0..bboxes.len()).map(|i| { - bboxes - .get(i) - .map(|bbox| BBox::from_u32_arrow_array(bbox.u32().unwrap().data())) - })) - }; 
- let result = crop_images(self, &mut bboxes_iterator); - Self::from_daft_image_buffers(self.name(), result.as_slice(), self.image_mode()) - } - - pub fn resize_to_fixed_shape_image_array( - &self, - w: u32, - h: u32, - mode: &ImageMode, - ) -> DaftResult { - let result = resize_images(self, w, h); - FixedShapeImageArray::from_daft_image_buffers(self.name(), result.as_slice(), mode, h, w) - } - - pub fn from_daft_image_buffers( - name: &str, - inputs: &[Option>], - image_mode: &Option, - ) -> DaftResult { - let is_all_u8 = inputs.iter().filter_map(|b| b.as_ref()).all(|b| { - matches!( - b, - DaftImageBuffer::L(..) - | DaftImageBuffer::LA(..) - | DaftImageBuffer::RGB(..) - | DaftImageBuffer::RGBA(..) - ) - }); - assert!(is_all_u8); - - let mut data_ref = Vec::with_capacity(inputs.len()); - let mut heights = Vec::with_capacity(inputs.len()); - let mut channels = Vec::with_capacity(inputs.len()); - let mut modes = Vec::with_capacity(inputs.len()); - let mut widths = Vec::with_capacity(inputs.len()); - let mut offsets = Vec::with_capacity(inputs.len() + 1); - offsets.push(0i64); - let mut validity = arrow2::bitmap::MutableBitmap::with_capacity(inputs.len()); - - for ib in inputs { - validity.push(ib.is_some()); - let (height, width, mode, buffer) = match ib { - Some(ib) => (ib.height(), ib.width(), ib.mode(), ib.as_u8_slice()), - None => (0u32, 0u32, ImageMode::L, &[] as &[u8]), - }; - heights.push(height); - widths.push(width); - modes.push(mode as u8); - channels.push(mode.num_channels()); - data_ref.push(buffer); - offsets.push(offsets.last().unwrap() + buffer.len() as i64); - } - - let data = data_ref.concat(); - let validity: Option = match validity.unset_bits() { - 0 => None, - _ => Some(validity.into()), - }; - Self::from_vecs( - name, - DataType::Image(*image_mode), - data, - offsets, - ImageArraySidecarData { - channels, - heights, - widths, - modes, - validity, - }, - ) - } - - pub fn to_mode(&self, mode: ImageMode) -> DaftResult { - let buffers: Vec> = self - .into_iter() - .map(|img| img.map(|img| img.into_mode(mode))) - .collect(); - Self::from_daft_image_buffers(self.name(), &buffers, &Some(mode)) - } -} - -impl AsImageObj for ImageArray { - fn len(&self) -> usize { - ImageArray::len(self) - } - - fn name(&self) -> &str { - ImageArray::name(self) - } - - fn as_image_obj<'a>(&'a self, idx: usize) -> Option> { - assert!(idx < self.len()); - if !self.physical.is_valid(idx) { - return None; - } - - let da = self.data_array(); - let ca = self.channel_array(); - let ha = self.height_array(); - let wa = self.width_array(); - let ma = self.mode_array(); - - let offsets = da.offsets(); - - let start = *offsets.get(idx).unwrap() as usize; - let end = *offsets.get(idx + 1).unwrap() as usize; - - let values = da - .flat_child - .u8() - .unwrap() - .data() - .as_any() - .downcast_ref::() - .unwrap(); - let slice_data = Cow::Borrowed(&values.values().as_slice()[start..end] as &'a [u8]); - - let c = ca.value(idx); - let h = ha.value(idx); - let w = wa.value(idx); - let m: ImageMode = ImageMode::from_u8(ma.value(idx)).unwrap(); - assert_eq!(m.num_channels(), c); - let result = match m { - ImageMode::L => { - DaftImageBuffer::<'a>::L(ImageBuffer::from_raw(w, h, slice_data).unwrap()) - } - ImageMode::LA => { - DaftImageBuffer::<'a>::LA(ImageBuffer::from_raw(w, h, slice_data).unwrap()) - } - ImageMode::RGB => { - DaftImageBuffer::<'a>::RGB(ImageBuffer::from_raw(w, h, slice_data).unwrap()) - } - ImageMode::RGBA => { - DaftImageBuffer::<'a>::RGBA(ImageBuffer::from_raw(w, h, slice_data).unwrap()) - } 
- _ => unimplemented!("{m} is currently not implemented!"), - }; - - assert_eq!(result.height(), h); - assert_eq!(result.width(), w); - Some(result) - } -} - -impl FixedShapeImageArray { - fn mode(&self) -> ImageMode { - match &self.field.dtype { - DataType::FixedShapeImage(mode, _, _) => *mode, - _ => panic!("FixedShapeImageArray does not have the correct FixedShapeImage dtype"), - } - } - - pub fn from_daft_image_buffers( - name: &str, - inputs: &[Option>], - image_mode: &ImageMode, - height: u32, - width: u32, - ) -> DaftResult { - let is_all_u8 = inputs.iter().filter_map(|b| b.as_ref()).all(|b| { - matches!( - b, - DaftImageBuffer::L(..) - | DaftImageBuffer::LA(..) - | DaftImageBuffer::RGB(..) - | DaftImageBuffer::RGBA(..) - ) - }); - assert!(is_all_u8); - - let num_channels = image_mode.num_channels(); - let mut data_ref = Vec::with_capacity(inputs.len()); - let mut validity = arrow2::bitmap::MutableBitmap::with_capacity(inputs.len()); - let list_size = (height * width * num_channels as u32) as usize; - let null_list = vec![0u8; list_size]; - for ib in inputs.iter() { - validity.push(ib.is_some()); - let buffer = match ib { - Some(ib) => ib.as_u8_slice(), - None => null_list.as_slice(), - }; - data_ref.push(buffer) - } - let data = data_ref.concat(); - let validity: Option = match validity.unset_bits() { - 0 => None, - _ => Some(validity.into()), - }; - - let arrow_dtype = arrow2::datatypes::DataType::FixedSizeList( - Box::new(arrow2::datatypes::Field::new( - "data", - arrow2::datatypes::DataType::UInt8, - true, - )), - list_size, - ); - let arrow_array = Box::new(arrow2::array::FixedSizeListArray::new( - arrow_dtype.clone(), - Box::new(arrow2::array::PrimitiveArray::from_vec(data)), - validity, - )); - let physical_array = FixedSizeListArray::from_arrow( - Arc::new(Field::new(name, (&arrow_dtype).into())), - arrow_array, - )?; - let logical_dtype = DataType::FixedShapeImage(*image_mode, height, width); - Ok(Self::new(Field::new(name, logical_dtype), physical_array)) - } - - pub fn encode(&self, image_format: ImageFormat) -> DaftResult { - encode_images(self, image_format) - } - - pub fn resize(&self, w: u32, h: u32) -> DaftResult { - let result = resize_images(self, w, h); - match &self.data_type() { - DataType::FixedShapeImage(mode, _, _) => Self::from_daft_image_buffers(self.name(), result.as_slice(), mode, h, w), - dt => panic!("FixedShapeImageArray should always have DataType::FixedShapeImage() as it's dtype, but got {}", dt), - } - } - - pub fn crop(&self, bboxes: &FixedSizeListArray) -> DaftResult { - let mut bboxes_iterator: Box>> = if bboxes.len() == 1 { - Box::new(std::iter::repeat(bboxes.get(0).map(|bbox| { - BBox::from_u32_arrow_array(bbox.u32().unwrap().data()) - }))) - } else { - Box::new((0..bboxes.len()).map(|i| { - bboxes - .get(i) - .map(|bbox| BBox::from_u32_arrow_array(bbox.u32().unwrap().data())) - })) - }; - let result = crop_images(self, &mut bboxes_iterator); - ImageArray::from_daft_image_buffers(self.name(), result.as_slice(), &Some(self.mode())) - } - - pub fn to_mode(&self, mode: ImageMode) -> DaftResult { - let buffers: Vec> = self - .into_iter() - .map(|img| img.map(|img| img.into_mode(mode))) - .collect(); - - let (height, width) = match self.data_type() { - DataType::FixedShapeImage(_, h, w) => (h, w), - _ => unreachable!("self should always be a FixedShapeImage"), - }; - Self::from_daft_image_buffers(self.name(), &buffers, &mode, *height, *width) - } -} - -impl AsImageObj for FixedShapeImageArray { - fn len(&self) -> usize { - 
FixedShapeImageArray::len(self) - } - - fn name(&self) -> &str { - FixedShapeImageArray::name(self) - } - - fn as_image_obj<'a>(&'a self, idx: usize) -> Option> { - assert!(idx < self.len()); - if !self.physical.is_valid(idx) { - return None; - } - - match self.data_type() { - DataType::FixedShapeImage(mode, height, width) => { - let arrow_array = self.physical.flat_child.downcast::().unwrap().as_arrow(); - let num_channels = mode.num_channels(); - let size = height * width * num_channels as u32; - let start = idx * size as usize; - let end = (idx + 1) * size as usize; - let slice_data = Cow::Borrowed(&arrow_array.values().as_slice()[start..end] as &'a [u8]); - let result = match mode { - ImageMode::L => { - DaftImageBuffer::<'a>::L(ImageBuffer::from_raw(*width, *height, slice_data).unwrap()) - } - ImageMode::LA => { - DaftImageBuffer::<'a>::LA(ImageBuffer::from_raw(*width, *height, slice_data).unwrap()) - } - ImageMode::RGB => { - DaftImageBuffer::<'a>::RGB(ImageBuffer::from_raw(*width, *height, slice_data).unwrap()) - } - ImageMode::RGBA => { - DaftImageBuffer::<'a>::RGBA(ImageBuffer::from_raw(*width, *height, slice_data).unwrap()) - } - _ => unimplemented!("{mode} is currently not implemented!"), - }; - - assert_eq!(result.height(), *height); - assert_eq!(result.width(), *width); - Some(result) - } - dt => panic!("FixedShapeImageArray should always have DataType::FixedShapeImage() as it's dtype, but got {}", dt), - } - } -} - -impl<'a, T> IntoIterator for &'a LogicalArray -where - T: DaftImageryType, - LogicalArray: AsImageObj, -{ - type Item = Option>; - type IntoIter = ImageBufferIter<'a, LogicalArray>; - - fn into_iter(self) -> Self::IntoIter { - ImageBufferIter::new(self) - } -} - -impl BinaryArray { - pub fn image_decode( - &self, - raise_error_on_failure: bool, - mode: Option, - ) -> DaftResult { - let arrow_array = self - .data() - .as_any() - .downcast_ref::>() - .unwrap(); - let mut img_bufs = Vec::>::with_capacity(arrow_array.len()); - let mut cached_dtype: Option = None; - // Load images from binary buffers. - // Confirm that all images have the same value dtype. - for (index, row) in arrow_array.iter().enumerate() { - let mut img_buf = match row.map(DaftImageBuffer::decode).transpose() { - Ok(val) => val, - Err(err) => { - if raise_error_on_failure { - return Err(err); - } else { - log::warn!( - "Error occurred during image decoding at index: {index} {} (falling back to Null)", - err - ); - None - } - } - }; - if let Some(mode) = mode { - img_buf = img_buf.map(|buf| buf.into_mode(mode)); - } - let dtype = img_buf.as_ref().map(|im| im.mode().get_dtype()); - match (dtype.as_ref(), cached_dtype.as_ref()) { - (Some(t1), Some(t2)) => { - if t1 != t2 { - return Err(DaftError::ValueError(format!("All images in a column must have the same dtype, but got: {:?} and {:?}", t1, t2))); - } - } - (Some(t1), None) => { - cached_dtype = Some(t1.clone()); - } - (None, _) => {} - } - img_bufs.push(img_buf); - } - // Fall back to UInt8 dtype if series is all nulls. 
- let cached_dtype = cached_dtype.unwrap_or(DataType::UInt8); - match cached_dtype { - DataType::UInt8 => Ok(ImageArray::from_daft_image_buffers(self.name(), img_bufs.as_slice(), &mode)?), - _ => unimplemented!("Decoding images of dtype {cached_dtype:?} is not supported, only uint8 images are supported."), - } - } -} - -fn encode_images<'a, Arr>(images: &'a Arr, image_format: ImageFormat) -> DaftResult -where - Arr: AsImageObj, - &'a Arr: IntoIterator>, IntoIter = ImageBufferIter<'a, Arr>>, -{ - let arrow_array = match image_format { - ImageFormat::TIFF => { - // NOTE: A single writer/buffer can't be used for TIFF files because the encoder will overwrite the - // IFD offset for the first image instead of writing it for all subsequent images, producing corrupted - // TIFF files. We work around this by writing out a new buffer for each image. - // TODO(Clark): Fix this in the tiff crate. - let values = images - .into_iter() - .map(|img| { - img.map(|img| { - let buf = Vec::new(); - let mut writer: CountingWriter> = - std::io::BufWriter::new(std::io::Cursor::new(buf)).into(); - img.encode(image_format, &mut writer)?; - // NOTE: BufWriter::into_inner() will flush the buffer. - Ok(writer - .into_inner() - .into_inner() - .map_err(|e| { - DaftError::ValueError(format!( - "Encoding image into file format {} failed: {}", - image_format, e - )) - })? - .into_inner()) - }) - .transpose() - }) - .collect::>>()?; - arrow2::array::BinaryArray::::from_iter(values) - } - _ => { - let mut offsets = Vec::with_capacity(images.len() + 1); - offsets.push(0i64); - let mut validity = arrow2::bitmap::MutableBitmap::with_capacity(images.len()); - let buf = Vec::new(); - let mut writer: CountingWriter> = - std::io::BufWriter::new(std::io::Cursor::new(buf)).into(); - images - .into_iter() - .map(|img| { - match img { - Some(img) => { - img.encode(image_format, &mut writer)?; - offsets.push(writer.count() as i64); - validity.push(true); - } - None => { - offsets.push(*offsets.last().unwrap()); - validity.push(false); - } - } - Ok(()) - }) - .collect::>>()?; - // NOTE: BufWriter::into_inner() will flush the buffer. - let values = writer - .into_inner() - .into_inner() - .map_err(|e| { - DaftError::ValueError(format!( - "Encoding image into file format {} failed: {}", - image_format, e - )) - })? 
-                .into_inner();
-            let encoded_data: arrow2::buffer::Buffer<u8> = values.into();
-            let offsets_buffer = arrow2::offset::OffsetsBuffer::try_from(offsets)?;
-            let validity: Option<arrow2::bitmap::Bitmap> = match validity.unset_bits() {
-                0 => None,
-                _ => Some(validity.into()),
-            };
-            arrow2::array::BinaryArray::<i64>::new(
-                arrow2::datatypes::DataType::LargeBinary,
-                offsets_buffer,
-                encoded_data,
-                validity,
-            )
-        }
-    };
-    BinaryArray::new(
-        Field::new(images.name(), arrow_array.data_type().into()).into(),
-        arrow_array.boxed(),
-    )
-}
-
-fn resize_images<'a, Arr>(images: &'a Arr, w: u32, h: u32) -> Vec<Option<DaftImageBuffer<'a>>>
-where
-    Arr: AsImageObj,
-    &'a Arr: IntoIterator<Item = Option<DaftImageBuffer<'a>>, IntoIter = ImageBufferIter<'a, Arr>>,
-{
-    images
-        .into_iter()
-        .map(|img| img.map(|img| img.resize(w, h)))
-        .collect::<Vec<_>>()
-}
-
-fn crop_images<'a, Arr>(
-    images: &'a Arr,
-    bboxes: &mut dyn Iterator<Item = Option<BBox>>,
-) -> Vec<Option<DaftImageBuffer<'a>>>
-where
-    Arr: AsImageObj,
-    &'a Arr: IntoIterator<Item = Option<DaftImageBuffer<'a>>, IntoIter = ImageBufferIter<'a, Arr>>,
-{
-    images
-        .into_iter()
-        .zip(bboxes)
-        .map(|(img, bbox)| match (img, bbox) {
-            (None, _) | (_, None) => None,
-            (Some(img), Some(bbox)) => Some(img.crop(&bbox)),
-        })
-        .collect::<Vec<_>>()
-}
diff --git a/src/daft-image/src/lib.rs b/src/daft-image/src/lib.rs
index c89ee04be1..d5dbcfadd9 100644
--- a/src/daft-image/src/lib.rs
+++ b/src/daft-image/src/lib.rs
@@ -2,12 +2,9 @@ mod counting_writer;
 mod image_buffer;
 mod iters;
 pub mod ops;
-pub use counting_writer::CountingWriter;
-pub use image_buffer::DaftImageBuffer;
+use counting_writer::CountingWriter;
+use image_buffer::DaftImageBuffer;
 pub mod series;

 #[cfg(feature = "python")]
-mod python;
-
-#[cfg(feature = "python")]
-pub use python::*;
+pub mod python;
diff --git a/src/daft-image/src/python.rs b/src/daft-image/src/python.rs
index e9d24a92ef..99b93dd265 100644
--- a/src/daft-image/src/python.rs
+++ b/src/daft-image/src/python.rs
@@ -41,7 +41,6 @@ pub fn to_mode(s: &PySeries, mode: &ImageMode) -> PyResult<PySeries> {
     let s = crate::series::to_mode(&s.series, *mode)?;
     Ok(s.into())
 }
-// let module = PyModule::new(py, "my_module")?;

 pub fn register_modules(_py: Python, parent: &PyModule) -> PyResult<()> {
     let module = PyModule::new(_py, "image")?;
diff --git a/src/daft-image/src/series.rs b/src/daft-image/src/series.rs
index 070c6c517f..7fe2255b7e 100644
--- a/src/daft-image/src/series.rs
+++ b/src/daft-image/src/series.rs
@@ -62,6 +62,16 @@ fn image_decode_impl(
         _ => unimplemented!("Decoding images of dtype {cached_dtype:?} is not supported, only uint8 images are supported."),
     }
 }
+
+/// Decodes a series of binary data into image arrays.
+///
+/// # Arguments
+/// * `s` - Input Series containing binary image data
+/// * `raise_error_on_failure` - If true, raises errors on decode failures
+/// * `mode` - Optional target ImageMode for decoded images
+///
+/// # Returns
+/// A DaftResult containing a Series of decoded images
 pub fn decode(
     s: &Series,
     raise_error_on_failure: bool,
@@ -77,6 +87,18 @@ pub fn decode(
     }
 }

+/// Encode a series of images into a series of bytes.
+///
+/// This function takes a Series containing image data and an ImageFormat,
+/// then encodes each image into the specified format.
+///
+/// # Arguments
+/// * `s` - The input Series containing image data
+/// * `image_format` - The desired output format for the encoded images
+///
+/// # Returns
+/// A DaftResult containing a new Series of encoded binary data
 pub fn encode(s: &Series, image_format: ImageFormat) -> DaftResult<Series> {
     match s.data_type() {
         DataType::Image(..) => Ok(s
@@ -93,7 +115,15 @@ pub fn encode(s: &Series, image_format: ImageFormat) -> DaftResult<Series> {
         ))),
     }
 }
-
+/// Resizes images in a Series to the specified width and height.
+///
+/// # Arguments
+/// * `s` - Input Series containing image data
+/// * `w` - Target width for resized images
+/// * `h` - Target height for resized images
+///
+/// # Returns
+/// A DaftResult containing a new Series with resized images
 pub fn resize(s: &Series, w: u32, h: u32) -> DaftResult<Series> {
     match s.data_type() {
         DataType::Image(mode) => {
@@ -120,6 +150,14 @@ pub fn resize(s: &Series, w: u32, h: u32) -> DaftResult<Series> {
     }
 }

+/// Crops images in a Series based on provided bounding boxes.
+///
+/// # Arguments
+/// * `s` - Input Series containing image data
+/// * `bbox` - Series of bounding boxes for cropping
+///
+/// # Returns
+/// A DaftResult containing a new Series with cropped images
 pub fn crop(s: &Series, bbox: &Series) -> DaftResult<Series> {
     let bbox_type = DataType::FixedSizeList(Box::new(DataType::UInt32), 4);
     let bbox = bbox.cast(&bbox_type)?;
@@ -140,7 +178,14 @@ pub fn crop(s: &Series, bbox: &Series) -> DaftResult<Series> {
         ))),
     }
 }
-
+/// Converts images in a Series to the specified mode.
+///
+/// # Arguments
+/// * `s` - Input Series containing image data
+/// * `mode` - Target ImageMode for conversion
+///
+/// # Returns
+/// A DaftResult containing a new Series with converted images
 pub fn to_mode(s: &Series, mode: ImageMode) -> DaftResult<Series> {
     match &s.data_type() {
         DataType::Image(_) => s
diff --git a/src/lib.rs b/src/lib.rs
index 2bac6546fa..79816562f3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -121,7 +121,7 @@ pub mod pylib {
         m.add_wrapped(wrap_pyfunction!(build_type))?;
         m.add_wrapped(wrap_pyfunction!(refresh_logger))?;
         m.add_wrapped(wrap_pyfunction!(get_max_log_level))?;
-        daft_image::register_modules(py, m)?;
+        daft_image::python::register_modules(py, m)?;
         Ok(())
     }
 }
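For reference, the submodule registration that the last hunk switches to (`daft_image::python::register_modules`) follows the usual pyo3 pattern: build a named module, add the wrapped functions, and attach it to the parent module so the stubs in daft/daft/image.pyi line up with `daft.daft.image`. A condensed sketch using the same gil-ref pyo3 API seen in this series; the `decode` body below is a placeholder standing in for the real wrappers in src/daft-image/src/python.rs:

    use pyo3::prelude::*;
    use pyo3::wrap_pyfunction;

    // Placeholder wrapper; the real functions forward to crate::series::*.
    #[pyfunction]
    fn decode(v: i64) -> PyResult<i64> {
        Ok(v)
    }

    pub fn register_modules(py: Python, parent: &PyModule) -> PyResult<()> {
        // The submodule becomes importable from Python as `daft.daft.image`.
        let module = PyModule::new(py, "image")?;
        module.add_function(wrap_pyfunction!(decode, module)?)?;
        parent.add_submodule(module)?;
        Ok(())
    }

Registering one submodule per crate keeps the Python surface aligned with the Rust crate split: daft-image owns both the kernels and their bindings, and the top-level daft module only has to call `register_modules` once at init.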