From 596a3d94dfed9422b362ae111054e24c289c8ab8 Mon Sep 17 00:00:00 2001 From: Jay Chia <17691182+jaychia@users.noreply.github.com> Date: Mon, 22 Jan 2024 12:40:15 -0800 Subject: [PATCH] [BUG] Fix type annotation on UDF (#1807) Closes #1801 --------- Co-authored-by: Jay Chia --- daft/udf.py | 7 +++++-- docs/source/10-min.ipynb | 2 +- docs/source/api_docs/expressions.rst | 2 +- docs/source/user_guide/poweruser/memory.rst | 6 +++--- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/daft/udf.py b/daft/udf.py index 2d3ad0eba9..b670c2ad03 100644 --- a/daft/udf.py +++ b/daft/udf.py @@ -4,7 +4,7 @@ import functools import inspect import types -from typing import Callable +from typing import TYPE_CHECKING, Callable, Union from daft.datatype import DataType from daft.expressions import Expression @@ -16,7 +16,10 @@ except ImportError: _NUMPY_AVAILABLE = False -UserProvidedPythonFunction = Callable[..., Series] +if TYPE_CHECKING: + import numpy as np + +UserProvidedPythonFunction = Callable[..., Union[Series, "np.ndarray", list]] @dataclasses.dataclass(frozen=True) diff --git a/docs/source/10-min.ipynb b/docs/source/10-min.ipynb index 714754e3fc..5ef31ba107 100644 --- a/docs/source/10-min.ipynb +++ b/docs/source/10-min.ipynb @@ -834,7 +834,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For a full list of all Expression methods and operators, see: [Expressions API Docs](../api_docs/expressions.rst)" + "For a full list of all Expression methods and operators, see: [Expressions API Docs](api_docs/expressions.rst)" ] }, { diff --git a/docs/source/api_docs/expressions.rst b/docs/source/api_docs/expressions.rst index 1bb7b616d7..4b33791998 100644 --- a/docs/source/api_docs/expressions.rst +++ b/docs/source/api_docs/expressions.rst @@ -180,7 +180,7 @@ Example: ``e1.list.join(e2)`` Structs -****** +******* Operations on structs, accessible through the :meth:`Expression.image ` method accessor: diff --git a/docs/source/user_guide/poweruser/memory.rst b/docs/source/user_guide/poweruser/memory.rst index 9e9ff3aada..5cdadf46f0 100644 --- a/docs/source/user_guide/poweruser/memory.rst +++ b/docs/source/user_guide/poweruser/memory.rst @@ -28,9 +28,9 @@ Spilling to disk is a mechanism that Daft uses to ensure workload completion in There are some things you can do that will help with this. -1. Use machines with more available memory per-CPU to increase each Ray worker's available memory (e.g. `AWS EC2 r5 instances `_) +1. Use machines with more available memory per-CPU to increase each Ray worker's available memory (e.g. `AWS EC2 r5 instances `_) 2. Use more machines in your cluster to increase overall cluster memory size -3. Use machines with attached local nvme SSD drives for higher throughput when spilling (e.g. `AWS EC2 r5d instances `_) +3. Use machines with attached local nvme SSD drives for higher throughput when spilling (e.g. AWS EC2 r5d instances) For more troubleshooting, you may also wish to consult the `Ray documentation's recommendations for object spilling `_. @@ -51,7 +51,7 @@ These OOMKills are often recoverable (Daft-on-Ray will take care of retrying wor There are some options available to you. -1. Use machines with more available memory per-CPU to increase each Ray worker's available memory (e.g. `AWS EC2 r5 instances `_) +1. Use machines with more available memory per-CPU to increase each Ray worker's available memory (e.g. AWS EC2 r5 instances) 2. Use more machines in your cluster to increase overall cluster memory size 3. Aggressively filter your data so that Daft can avoid reading data that it does not have to (e.g. ``df.where(...)``) 4. Request more memory for your UDFs (see: :ref:`resource-requests`) if your UDFs are memory intensive (e.g. decompression of data, running large matrix computations etc)