diff --git a/daft/daft.pyi b/daft/daft.pyi index 8cc8f40e6e..2ddd04d22e 100644 --- a/daft/daft.pyi +++ b/daft/daft.pyi @@ -687,6 +687,7 @@ class PyExpr: def is_nan(self) -> PyExpr: ... def dt_date(self) -> PyExpr: ... def dt_day(self) -> PyExpr: ... + def dt_hour(self) -> PyExpr: ... def dt_month(self) -> PyExpr: ... def dt_year(self) -> PyExpr: ... def dt_day_of_week(self) -> PyExpr: ... @@ -761,6 +762,7 @@ class PySeries: def is_nan(self) -> PySeries: ... def dt_date(self) -> PySeries: ... def dt_day(self) -> PySeries: ... + def dt_hour(self) -> PySeries: ... def dt_month(self) -> PySeries: ... def dt_year(self) -> PySeries: ... def dt_day_of_week(self) -> PySeries: ... diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index e5af9fa1f7..39ffc46bf7 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -504,6 +504,17 @@ def day(self) -> Expression: """ return Expression._from_pyexpr(self._expr.dt_day()) + def hour(self) -> Expression: + """Retrieves the day for a datetime column + + Example: + >>> col("x").dt.day() + + Returns: + Expression: a UInt32 expression with just the day extracted from a datetime column + """ + return Expression._from_pyexpr(self._expr.dt_hour()) + def month(self) -> Expression: """Retrieves the month for a datetime column diff --git a/daft/series.py b/daft/series.py index c4c6b78de6..5c2c202873 100644 --- a/daft/series.py +++ b/daft/series.py @@ -565,6 +565,9 @@ def date(self) -> Series: def day(self) -> Series: return Series._from_pyseries(self._series.dt_day()) + def hour(self) -> Series: + return Series._from_pyseries(self._series.dt_hour()) + def month(self) -> Series: return Series._from_pyseries(self._series.dt_month()) diff --git a/src/daft-core/src/array/ops/date.rs b/src/daft-core/src/array/ops/date.rs index f19fbe2ebd..af1e896049 100644 --- a/src/daft-core/src/array/ops/date.rs +++ b/src/daft-core/src/array/ops/date.rs @@ -6,7 +6,7 @@ use crate::{ DataType, }; use arrow2::compute::arithmetics::ArraySub; -use chrono::NaiveDate; +use chrono::{NaiveDate, Timelike}; use common_error::{DaftError, DaftResult}; use super::as_arrow::AsArrow; @@ -107,4 +107,39 @@ impl TimestampArray { Int32Array::from((self.name(), Box::new(date_arrow))), )) } + + pub fn hour(&self) -> DaftResult { + let physical = self.physical.as_arrow(); + let DataType::Timestamp(timeunit, tz) = self.data_type() else { + unreachable!("Timestamp array must have Timestamp datatype") + }; + let tu = timeunit.to_arrow(); + let date_arrow = match tz { + Some(tz) => match arrow2::temporal_conversions::parse_offset(tz) { + Ok(tz) => Ok(arrow2::array::UInt32Array::from_iter(physical.iter().map( + |ts| { + ts.map(|ts| { + arrow2::temporal_conversions::timestamp_to_datetime(*ts, tu, &tz).hour() + }) + }, + ))), + Err(e) => Err(DaftError::TypeError(format!( + "Cannot parse timezone in Timestamp datatype: {}, error: {}", + tz, e + ))), + }, + None => Ok(arrow2::array::UInt32Array::from_iter(physical.iter().map( + |ts| { + ts.map(|ts| { + arrow2::temporal_conversions::timestamp_to_naive_datetime(*ts, tu).hour() + }) + }, + ))), + }?; + + UInt32Array::new( + std::sync::Arc::new(Field::new(self.name(), DataType::UInt32)), + Box::new(date_arrow), + ) + } } diff --git a/src/daft-core/src/python/series.rs b/src/daft-core/src/python/series.rs index 3c54a0ee04..a750a49ab0 100644 --- a/src/daft-core/src/python/series.rs +++ b/src/daft-core/src/python/series.rs @@ -272,6 +272,10 @@ impl PySeries { Ok(self.series.dt_day()?.into()) } + pub fn dt_hour(&self) -> PyResult { + Ok(self.series.dt_hour()?.into()) + } + pub fn dt_month(&self) -> PyResult { Ok(self.series.dt_month()?.into()) } diff --git a/src/daft-core/src/series/ops/date.rs b/src/daft-core/src/series/ops/date.rs index 68f33bb450..258a8fafb1 100644 --- a/src/daft-core/src/series/ops/date.rs +++ b/src/daft-core/src/series/ops/date.rs @@ -38,6 +38,19 @@ impl Series { } } + pub fn dt_hour(&self) -> DaftResult { + match self.data_type() { + DataType::Timestamp(..) => { + let ts_array = self.downcast::()?; + Ok(ts_array.hour()?.into_series()) + } + _ => Err(DaftError::ComputeError(format!( + "Can only run day() operation on temporal types, got {}", + self.data_type() + ))), + } + } + pub fn dt_month(&self) -> DaftResult { match self.data_type() { DataType::Date => { diff --git a/src/daft-dsl/src/functions/temporal/hour.rs b/src/daft-dsl/src/functions/temporal/hour.rs new file mode 100644 index 0000000000..ed88e11c92 --- /dev/null +++ b/src/daft-dsl/src/functions/temporal/hour.rs @@ -0,0 +1,47 @@ +use common_error::{DaftError, DaftResult}; +use daft_core::{ + datatypes::{DataType, Field}, + schema::Schema, + series::Series, +}; + +use crate::Expr; + +use super::super::FunctionEvaluator; + +pub(super) struct HourEvaluator {} + +impl FunctionEvaluator for HourEvaluator { + fn fn_name(&self) -> &'static str { + "hour" + } + + fn to_field(&self, inputs: &[Expr], schema: &Schema, _: &Expr) -> DaftResult { + match inputs { + [input] => match input.to_field(schema) { + Ok(field) if field.dtype.is_temporal() => { + Ok(Field::new(field.name, DataType::UInt32)) + } + Ok(field) => Err(DaftError::TypeError(format!( + "Expected input to hour to be temporal, got {}", + field.dtype + ))), + Err(e) => Err(e), + }, + _ => Err(DaftError::SchemaMismatch(format!( + "Expected 1 input arg, got {}", + inputs.len() + ))), + } + } + + fn evaluate(&self, inputs: &[Series], _: &Expr) -> DaftResult { + match inputs { + [input] => input.dt_hour(), + _ => Err(DaftError::ValueError(format!( + "Expected 1 input arg, got {}", + inputs.len() + ))), + } + } +} diff --git a/src/daft-dsl/src/functions/temporal/mod.rs b/src/daft-dsl/src/functions/temporal/mod.rs index 53a64d98e4..683a75261f 100644 --- a/src/daft-dsl/src/functions/temporal/mod.rs +++ b/src/daft-dsl/src/functions/temporal/mod.rs @@ -1,14 +1,15 @@ mod date; mod day; mod day_of_week; +mod hour; mod month; mod year; use serde::{Deserialize, Serialize}; use crate::functions::temporal::{ - date::DateEvaluator, day::DayEvaluator, day_of_week::DayOfWeekEvaluator, month::MonthEvaluator, - year::YearEvaluator, + date::DateEvaluator, day::DayEvaluator, day_of_week::DayOfWeekEvaluator, hour::HourEvaluator, + month::MonthEvaluator, year::YearEvaluator, }; use crate::Expr; @@ -17,6 +18,7 @@ use super::FunctionEvaluator; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] pub enum TemporalExpr { Day, + Hour, Month, Year, DayOfWeek, @@ -29,6 +31,7 @@ impl TemporalExpr { use TemporalExpr::*; match self { Day => &DayEvaluator {}, + Hour => &HourEvaluator {}, Month => &MonthEvaluator {}, Year => &YearEvaluator {}, DayOfWeek => &DayOfWeekEvaluator {}, @@ -51,6 +54,13 @@ pub fn day(input: &Expr) -> Expr { } } +pub fn hour(input: &Expr) -> Expr { + Expr::Function { + func: super::FunctionExpr::Temporal(TemporalExpr::Hour), + inputs: vec![input.clone()], + } +} + pub fn month(input: &Expr) -> Expr { Expr::Function { func: super::FunctionExpr::Temporal(TemporalExpr::Month), diff --git a/src/daft-dsl/src/python.rs b/src/daft-dsl/src/python.rs index cb61044339..eb88d0ee76 100644 --- a/src/daft-dsl/src/python.rs +++ b/src/daft-dsl/src/python.rs @@ -261,6 +261,11 @@ impl PyExpr { Ok(day(&self.expr).into()) } + pub fn dt_hour(&self) -> PyResult { + use functions::temporal::hour; + Ok(hour(&self.expr).into()) + } + pub fn dt_month(&self) -> PyResult { use functions::temporal::month; Ok(month(&self.expr).into()) diff --git a/tests/series/test_temporal_ops.py b/tests/series/test_temporal_ops.py index 05ccd75613..b4f10882d5 100644 --- a/tests/series/test_temporal_ops.py +++ b/tests/series/test_temporal_ops.py @@ -106,6 +106,25 @@ def ts_maker(d): assert expected == days.to_pylist() +def test_series_timestamp_hour() -> None: + from datetime import datetime + + def ts_maker(h): + if h is None: + return None + return datetime(2023, 1, 26, h, 1, 1) + + input = [1, 5, 14, None, 23, None, 21] + + input_ts = list(map(ts_maker, input)) + s = Series.from_pylist(input_ts).cast(DataType.timestamp(TimeUnit.ms())) + days = s.dt.hour() + + assert days.datatype() == DataType.uint32() + + assert input == days.to_pylist() + + @pytest.mark.parametrize("tz", [None, "UTC", "+08:00", "Asia/Singapore"]) def test_series_timestamp_month_operation(tz) -> None: from datetime import datetime