Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lazy load dask.dataframe in datashader.py #6309

Merged
merged 5 commits into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions holoviews/core/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import builtins
import datetime as dt
import functools
import hashlib
import importlib
import inspect
import itertools
import json
Expand Down Expand Up @@ -2329,3 +2331,30 @@ def flatten(line):
yield from flatten(element)
else:
yield element


def lazy_isinstance(obj, class_or_tuple):
    """Lazy isinstance check against classes named by string.

    Each entry of ``class_or_tuple`` has the form ``'module.path:Attr'``
    (``Attr`` may itself be dotted). The module of an entry is only
    imported when the top-level module name of ``obj`` starts with the
    top-level module name of that entry, so optional dependencies are
    not imported for objects that cannot belong to them.

    lazy_isinstance(obj, 'dask.dataframe:DataFrame')

    Will:
        1) check if the top-level module of obj starts with ``dask``
        2) If it does, import dask.dataframe
        3) Do an isinstance check for dask.dataframe.DataFrame

    Parameters
    ----------
    obj
        The object to test.
    class_or_tuple : str or iterable of str
        One ``'module:Attr'`` specification, or several of them.

    Returns
    -------
    bool
        True if ``obj`` is an instance of any of the named classes whose
        module passed the prefix check, otherwise False.
    """
    if isinstance(class_or_tuple, str):
        class_or_tuple = (class_or_tuple,)

    # Instances of C-implemented types (e.g. ``1``, ``[]``, ``None``) have no
    # ``__module__`` attribute of their own, and some objects carry
    # ``__module__ = None``. Fall back to the module of the object's type
    # instead of raising AttributeError, so such objects simply do not match.
    obj_module = getattr(obj, '__module__', None)
    if not isinstance(obj_module, str):
        obj_module = type(obj).__module__
    if not isinstance(obj_module, str):
        return False
    obj_mod_name = obj_module.split('.')[0]

    # Review note from the PR thread: an exact module comparison returned
    # False for a dask DataFrame even though
    # ``isinstance(ddf, dd.DataFrame)`` was True, because ``obj.__module__``
    # was ``dask_expr._collection``. An alternative signature
    # ``lazy_instance(obj, module, objname)`` (``objname`` a string or a
    # tuple of strings) was suggested; the adopted resolution was a
    # ``.startswith`` comparison of the top-level names, as done in hvplot,
    # so that ``dask_expr`` is accepted for a ``dask`` specification.
    for cls in class_or_tuple:
        mod_name, _, attr_name = cls.partition(':')
        if not obj_mod_name.startswith(mod_name.split(".")[0]):
            continue
        mod = importlib.import_module(mod_name)
        if isinstance(obj, functools.reduce(getattr, attr_name.split('.'), mod)):
            return True
    return False
11 changes: 6 additions & 5 deletions holoviews/operation/datashader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from collections.abc import Callable, Iterable
from functools import partial

import dask.dataframe as dd
import datashader as ds
import datashader.reductions as rd
import datashader.transfer_functions as tf
Expand Down Expand Up @@ -45,6 +44,7 @@
datetime_types,
dt_to_int,
get_param_values,
lazy_isinstance,
)
from ..element import (
RGB,
Expand Down Expand Up @@ -303,22 +303,23 @@ def get_agg_data(cls, obj, category=None):
if len(paths) > 1:
if glyph == 'line':
path = paths[0][:1]
if isinstance(path, dd.DataFrame):
if lazy_isinstance(path, "dask.dataframe:DataFrame"):
path = path.compute()
empty = path.copy()
empty.iloc[0, :] = (np.nan,) * empty.shape[1]
paths = [elem for p in paths for elem in (p, empty)][:-1]
if all(isinstance(path, dd.DataFrame) for path in paths):
if all(lazy_isinstance(path,"dask.dataframe:DataFrame") for path in paths):
import dask.dataframe as dd
df = dd.concat(paths)
else:
paths = [p.compute() if isinstance(p, dd.DataFrame) else p for p in paths]
paths = [p.compute() if lazy_isinstance(p, "dask.dataframe:DataFrame") else p for p in paths]
df = pd.concat(paths)
else:
df = paths[0] if paths else pd.DataFrame([], columns=[x.name, y.name])
if category and df[category].dtype.name != 'category':
df[category] = df[category].astype('category')

is_custom = isinstance(df, dd.DataFrame) or cuDFInterface.applies(df)
is_custom = lazy_isinstance(df, "dask.dataframe:DataFrame") or cuDFInterface.applies(df)
if any((not is_custom and len(df[d.name]) and isinstance(df[d.name].values[0], cftime_types)) or
df[d.name].dtype.kind in ["M", "u"] for d in (x, y)):
df = df.copy()
Expand Down