Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support datashade points hover #1430

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 96 additions & 4 deletions hvplot/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
from holoviews.plotting.bokeh import OverlayPlot, colormap_generator
from holoviews.plotting.util import process_cmap
from holoviews.operation import histogram, apply_when
from holoviews.streams import Buffer, Pipe
from holoviews.streams import Buffer, Pipe, Tap, PointerXY
from holoviews.util.transform import dim, lon_lat_to_easting_northing
from pandas import DatetimeIndex, MultiIndex

Expand Down Expand Up @@ -842,13 +842,22 @@ def __init__(
if kind == 'errorbars':
hover = False
elif hover is None:
hover = not self.datashade
hover = True
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
hover = True
hover = (self.datashade and self.kind == 'points') or not self.datashade

I'm not sure if hover should always be True.


if hover and not any(
t for t in tools if isinstance(t, HoverTool) or t in ['hover', 'vline', 'hline']
):
if hover in {'vline', 'hline'}:
plot_opts['hover_mode'] = hover
tools.append('hover')
self.hover_mode = hover
else:
self.hover_mode = 'mouse'
if not self.datashade:
tools.append('hover')

self.hover = bool(hover)
self.hover_tooltips = hover_tooltips
self.hover_formatters = hover_formatters
if 'hover' in tools:
if hover_tooltips:
plot_opts['hover_tooltips'] = hover_tooltips
Expand Down Expand Up @@ -1760,7 +1769,7 @@ def method_wrapper(ds, x, y):
return layers

import_datashader()
from holoviews.operation.datashader import datashade, rasterize, dynspread
from holoviews.operation.datashader import datashade, rasterize, dynspread, inspect_points

categorical, agg = self._process_categorical_datashader()
if agg:
Expand Down Expand Up @@ -1819,11 +1828,94 @@ def method_wrapper(ds, x, y):
threshold=self.kwds.get('threshold', 0.5),
)

# a workaround to show hover info for datashaded points
if self.hover and self.datashade and self.kind == 'points':
if self.hover_mode != 'mouse':
param.main.param.warning(
f'Got unsupported hover_mode={self.hover_mode!r} for '
f"datashaded points; reverting to 'mouse'."
)

stream = PointerXY
if len(self.data) > 10000:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not a fan of these magic numbers, with no way to change it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes allowing changing that value is a good improvement. I wouldn't add that to the signature though, maybe just as a very explicitly named kwarg that is caught there if defined, and defaults to 10000 if not.

stream = Tap
param.main.param.warning(
'Hovering over datashaded points is slow for large datasets; '
'tap on the plot to see a hover tooltip over desired point.'
)

inspector = inspect_points.instance(
streams=[stream], transform=self._datashade_hover_transform
)
processed *= inspector(processed).opts(
size=10,
alpha=0,
tools=['hover'],
hover_mode=self.hover_mode,
hover_tooltips=self.hover_tooltips,
hover_formatters=self.hover_formatters,
)

opts = filter_opts(eltype, dict(self._plot_opts, **style), backend='bokeh')
layers = self._apply_layers(processed).opts(eltype, **opts, backend='bokeh')
layers = _transfer_opts_cur_backend(layers)
return layers

def _datashade_hover_transform(self, df):
if not len(df):
return df

# show at least the x and y columns
cols = self.hover_cols.copy()
if self.x not in cols:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is self.x always defined or can it be None at this stage?

cols.append(self.x)
if self.y not in cols:
cols.append(self.y)

# handle aggregator, e.g. ds.sum('column') or ds.count_cat('column')
agg_col = None
agg_series_map = {}
if self.aggregator and not isinstance(self.aggregator, str) and self.aggregator.column:
agg_col = self.aggregator.column
agg_op = type(self.aggregator).__name__
if hasattr(df, agg_op): # df.sum/df.count
agg_value = df.agg({agg_col: agg_op})
elif agg_op == 'count_cat':
agg_value = df[agg_col].value_counts()

if agg_col in cols:
cols.remove(agg_col)
else:
key = 'Count'
for i in range(1, 10):
if key in df.columns:
key = f'Count_{i}'
else:
break
Comment on lines +1890 to +1894
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks weird to me

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's weird about it? It's deduplicating columns; do you have a suggestion for an alternative?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why 10?

I can't wrap my head around what the code does.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just about any scalar value other than 0 or 1 should be documented very clearly, usually by making a Parameter or at least attribute declaration with a meaningful name and associated docstring.

agg_value = pd.Series([len(df)], index=[key])

# take the mean of numeric columns
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the rest of the method generated with AI? If so, please clearly state so.

Otherwise, please explain why you take the mean. In general, comments should not explain what is done but why it is done.

Copy link
Collaborator Author

@ahuang11 ahuang11 Oct 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it's not AI. I decided to add comments since the code was getting chunky, and easier to navigate English, especially since I use the exclude keyword for object columns instead of include keyword (had to do a double take that when I was rereading my code), and I wanted to highlight that.

Comments should not explain what is done but why it is done.

Sometimes it's easier to read English than code and guidelines don't necessarily have to be followed all the time

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry about that. It just had the structure of an AI output.

... guidelines don't necessarily have to be followed all the time.

I agree, but the use of mean need to be explained, either in the code or in the original comment.

This can return an output that does not exist in the original DataFrame when more points are in the input DataFrame. For example, the two close points in your example will return 350 for the population when they are close to each other. Is this okay? Maybe, but you need to state that you have made this decision.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ahuang11 , @hoxbro thinks you are so intelligent that it must be artificial! :-)

num_series = df[cols].select_dtypes(include=['number']).mean()
if len(num_series):
agg_series_map['number_cols'] = num_series

# take the first value of object columns
obj_series = df[cols].select_dtypes(exclude=['number']).iloc[0]
if len(obj_series):
agg_series_map['object_cols'] = obj_series

# to preserve order of other columns, add this last
agg_series_map[agg_col] = agg_value

# concat all series into a single dataframe which has one row
df_hover = pd.concat(agg_series_map.values()).to_frame().transpose()
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This especially, without the comment, would be really confusing of what all the methods are doing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree. But why is this needed? Why is a pd.Series not good enough? We could add an extra line explaining this:

Suggested change
df_hover = pd.concat(agg_series_map.values()).to_frame().transpose()
# This is needed as the function used in `inspect_points.transform` must return a DataFrame.
df_hover = pd.concat(agg_series_map.values()).to_frame().transpose()

At some point, a person could look at this line and remove .to_frame().transpose(), and if no tests fails people could think it was not a needed conversion.


# remove index if it wasn't in the original dataset
if 'index' not in self.data.columns:
df_hover = df_hover.drop(columns=['index'], errors='ignore')

return df_hover

def _resample_obj(self, operation, obj, opts):
def exceeds_resample_when(plot):
return len(plot) > self.resample_when
Expand Down
24 changes: 18 additions & 6 deletions hvplot/tests/testoperations.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
import pytest

from holoviews import Store, render
from holoviews import Store, render, renderer
from holoviews.element import Image, QuadMesh, Points
from holoviews.core.spaces import DynamicMap
from holoviews.core.overlay import Overlay
Expand Down Expand Up @@ -156,11 +156,6 @@ def test_when_datashade_is_true_set_hover_to_false_by_default(self):
opts = Store.lookup_options('bokeh', plot[()], 'plot').kwargs
assert 'hover' not in opts.get('tools')

def test_when_datashade_is_true_hover_can_still_be_true(self):
plot = self.df.hvplot(x='x', y='y', datashade=True, hover=True)
opts = Store.lookup_options('bokeh', plot[()], 'plot').kwargs
assert 'hover' in opts.get('tools')

def test_xlim_affects_x_range(self):
data = pd.DataFrame(np.random.randn(100).cumsum())
img = data.hvplot(xlim=(0, 20000), datashade=True, dynamic=False)
Expand Down Expand Up @@ -324,6 +319,23 @@ def test_downsample_resample_when(self, kind, eltype):
assert isinstance(element, eltype)
assert len(element) == 0

@parameterized.expand([(None,), (True,), ('vline',), ('hline',)])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think there'd be a way to avoid calling get_plot? It's pretty expensive isn't it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Many more tests will be needed to cover all the branches of the code introduced in the converter, these first two are a good start but they're very basic and just check whether the code doesn't error and the HoloViews type is the expected one.

def test_include_inspect_point_hover(self, hover):
df = pd.DataFrame(
np.random.multivariate_normal((0, 0), [[0.1, 0.1], [0.1, 1.0]], (5000,))
).rename({0: 'x', 1: 'y'}, axis=1)

p = df.hvplot.points(datashade=True, hover=hover)
assert renderer('bokeh').get_plot(p).name.startswith('Overlay')

def test_include_inspect_point_no_hover(self):
df = pd.DataFrame(
np.random.multivariate_normal((0, 0), [[0.1, 0.1], [0.1, 1.0]], (5000,))
).rename({0: 'x', 1: 'y'}, axis=1)

p = df.hvplot.points(datashade=True, hover=False)
assert renderer('bokeh').get_plot(p).name.startswith('RGB')


class TestChart2D(ComparisonTestCase):
def setUp(self):
Expand Down
Loading