From 3cf3f703006d05ee267fe301eb3a645e5079680a Mon Sep 17 00:00:00 2001 From: Philipp Rudiger Date: Sat, 30 May 2020 16:02:52 +0200 Subject: [PATCH 1/5] Add support for cupy histograms --- holoviews/core/util.py | 7 +++ holoviews/operation/element.py | 92 +++++++++++++++++++++++----------- 2 files changed, 70 insertions(+), 29 deletions(-) diff --git a/holoviews/core/util.py b/holoviews/core/util.py index 3f09fd2455..cc57b76d4d 100644 --- a/holoviews/core/util.py +++ b/holoviews/core/util.py @@ -1491,6 +1491,13 @@ def is_dask_array(data): return (da is not None and isinstance(data, da.Array)) +def is_cupy_array(data): + if 'cupy' in sys.modules: + import cupy + return isinstance(data, cupy.ndarray) + return False + + def get_param_values(data): params = dict(kdims=data.kdims, vdims=data.vdims, label=data.label) diff --git a/holoviews/operation/element.py b/holoviews/operation/element.py index 9c9f71d0a7..f4414346bf 100644 --- a/holoviews/operation/element.py +++ b/holoviews/operation/element.py @@ -4,9 +4,13 @@ """ from __future__ import division -import numpy as np +import sys + +from distutils.version import LooseVersion +import numpy as np import param + from param import _is_number from ..core import (Operation, NdOverlay, Overlay, GridMatrix, @@ -14,7 +18,7 @@ from ..core.data import ArrayInterface, DictInterface, default_datatype from ..core.util import (group_sanitizer, label_sanitizer, pd, basestring, datetime_types, isfinite, dt_to_int, - isdatetime, is_dask_array) + isdatetime, is_dask_array, is_cupy_array) from ..element.chart import Histogram, Scatter from ..element.raster import Image, RGB from ..element.path import Contours, Polygons @@ -641,7 +645,10 @@ class histogram(Operation): Specifies the range within which to compute the bins.""") bins = param.ClassSelector(default=None, class_=(np.ndarray, list, tuple, str), doc=""" - An explicit set of bin edges.""") + An explicit set of bin edges or a method to find the optimal + set of bin edges, e.g. 'auto', 'fd', 'scott' etc. For more + documentation on these approaches see the np.histogram_bin_edges + documentation.""") cumulative = param.Boolean(default=False, doc=""" Whether to compute the cumulative histogram""") @@ -690,6 +697,7 @@ def _process(self, element, key=None): self.p.groupby = None return grouped.map(self._process, Dataset) + normed = False if self.p.mean_weighted and self.p.weight_dimension else self.p.normed if self.p.dimension: selected_dim = self.p.dimension else: @@ -701,16 +709,36 @@ def _process(self, element, key=None): else: data = element.dimension_values(selected_dim) + is_datetime = isdatetime(data) + if is_datetime: + data = data.astype('datetime64[ns]').astype('int64') + + # Handle different datatypes + is_finite = isfinite + is_cupy = is_cupy_array(data) + if is_cupy: + import cupy + full_cupy_support = LooseVersion(cupy.__version__) > '8.0' + if not full_cupy_support and (normed or self.p.weight_dimension): + data = cupy.asnumpy(data) + is_cupy = False if is_dask_array(data): import dask.array as da histogram = da.histogram + elif is_cupy: + import cupy + histogram = cupy.histogram + is_finite = cupy.isfinite else: histogram = np.histogram - mask = isfinite(data) + # Mask data + mask = is_finite(data) if self.p.nonzero: mask = mask & (data > 0) data = data[mask] + + # Compute weights if self.p.weight_dimension: if hasattr(element, 'interface'): weights = element.interface.values(element, self.p.weight_dimension, compute=False) @@ -721,35 +749,36 @@ def _process(self, element, key=None): else: weights = None - hist_range = self.p.bin_range or element.range(selected_dim) - # Avoids range issues including zero bin range and empty bins - if hist_range == (0, 0) or any(not isfinite(r) for r in hist_range): - hist_range = (0, 1) - - datetimes = False - bins = None if self.p.bins is None else np.asarray(self.p.bins) - steps = self.p.num_bins + 1 - start, end = hist_range - if isdatetime(data): - start, end = dt_to_int(start, 'ns'), dt_to_int(end, 'ns') - datetimes = True - data = data.astype('datetime64[ns]').astype('int64') - if bins is not None: + # Compute bins + if isinstance(self.p.bins, str): + bin_data = cupy.asnumpy(data) if is_cupy else data + edges = np.histogram_bin_edges(bin_data, bins=self.p.bins) + elif isinstance(self.p.bins, (list, np.ndarray)): + edges = self.p.bins + if isdatetime(edges): bins = bins.astype('datetime64[ns]').astype('int64') + else: + hist_range = self.p.bin_range or element.range(selected_dim) + # Avoids range issues including zero bin range and empty bins + if hist_range == (0, 0) or any(not isfinite(r) for r in hist_range): + hist_range = (0, 1) + steps = self.p.num_bins + 1 + start, end = hist_range + if is_datetime: + start, end = dt_to_int(start, 'ns'), dt_to_int(end, 'ns') + if self.p.log: + bin_min = max([abs(start), data[data>0].min()]) + edges = np.logspace(np.log10(bin_min), np.log10(end), steps) else: - hist_range = start, end + edges = np.linspace(start, end, steps) - if self.p.bins: - edges = bins - elif self.p.log: - bin_min = max([abs(start), data[data>0].min()]) - edges = np.logspace(np.log10(bin_min), np.log10(end), steps) - else: - edges = np.linspace(start, end, steps) - normed = False if self.p.mean_weighted and self.p.weight_dimension else self.p.normed + if is_cupy: + edges = cupy.asarray(edges) if is_dask_array(data) or len(data): - if normed: + if is_cupy and not full_cupy_support: + hist, _ = histogram(data, bins=edges) + elif normed: # This covers True, 'height', 'integral' hist, edges = histogram(data, density=True, weights=weights, bins=edges) @@ -763,8 +792,13 @@ def _process(self, element, key=None): else: nbins = self.p.num_bins if self.p.bins is None else len(self.p.bins)-1 hist = np.zeros(nbins) + + if is_cupy_array(hist): + edges = cupy.asnumpy(edges) + hist = cupy.asnumpy(hist) + hist[np.isnan(hist)] = 0 - if datetimes: + if is_datetime: edges = (edges/1e3).astype('datetime64[us]') params = {} From 391ade76afc64ad6135ed3152c2435a97e8b6b25 Mon Sep 17 00:00:00 2001 From: Philipp Rudiger Date: Sat, 30 May 2020 16:26:18 +0200 Subject: [PATCH 2/5] Implement dask and cupy BoxWhisker stats --- holoviews/plotting/bokeh/stats.py | 33 +++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/holoviews/plotting/bokeh/stats.py b/holoviews/plotting/bokeh/stats.py index d15aa919fc..15f1eb6e2c 100644 --- a/holoviews/plotting/bokeh/stats.py +++ b/holoviews/plotting/bokeh/stats.py @@ -13,7 +13,8 @@ from ...core.dimension import Dimension, Dimensioned from ...core.ndmapping import sorted_context from ...core.util import (basestring, dimension_sanitizer, wrap_tuple, - unique_iterator, unique_array, isfinite) + unique_iterator, unique_array, isfinite, + is_dask_array, is_cupy_array) from ...operation.stats import univariate_kde from ...util.transform import dim from .chart import AreaPlot @@ -139,11 +140,23 @@ def _postprocess_hover(self, renderer, source): super(BoxWhiskerPlot, self)._postprocess_hover(renderer, source) def _box_stats(self, vals): - vals = vals[isfinite(vals)] + is_finite = isfinite + is_dask = is_dask_array(vals) + is_cupy = is_cupy_array(vals) + if is_cupy: + import cupy + percentile = cupy.percentile + is_finite = cupy.isfinite + elif is_dask: + import dask.array as da + percentile = da.percentile + else: + percentile = np.percentile + + vals = vals[is_finite(vals)] if len(vals): - q1, q2, q3 = (np.percentile(vals, q=q) - for q in range(25, 100, 25)) + q1, q2, q3 = (percentile(vals, q=q) for q in range(25, 100, 25)) iqr = q3 - q1 upper = vals[vals <= q3 + 1.5*iqr].max() lower = vals[vals >= q1 - 1.5*iqr].min() @@ -151,7 +164,14 @@ def _box_stats(self, vals): q1, q2, q3 = 0, 0, 0 upper, lower = 0, 0 outliers = vals[(vals > upper) | (vals < lower)] - return q1, q2, q3, upper, lower, outliers + + if is_cupy: + return (q1.item(), q2.item(), q3.item(), upper.item(), + lower.item(), cupy.asnumpy(outliers)) + elif is_dask: + return da.compute(q1, q2, q3, upper, lower, outliers) + else: + return q1, q2, q3, upper, lower, outliers def get_data(self, element, ranges, style): if element.kdims: @@ -191,6 +211,7 @@ def get_data(self, element, ranges, style): cdim, cidx = None, None factors = [] + vdim = g.vdims[0].name for key, g in groups.items(): # Compute group label if element.kdims: @@ -208,7 +229,7 @@ def get_data(self, element, ranges, style): factors.append(label) # Compute statistics - vals = g.dimension_values(g.vdims[0]) + vals = data = element.interface.values(element, vdim, compute=False) q1, q2, q3, upper, lower, outliers = self._box_stats(vals) # Add to CDS data From 59cffc2d52a3bb38590126baa8a12b8b67c750fe Mon Sep 17 00:00:00 2001 From: Philipp Rudiger Date: Sat, 30 May 2020 17:12:54 +0200 Subject: [PATCH 3/5] Fixed flakes --- holoviews/operation/element.py | 4 +--- holoviews/plotting/bokeh/stats.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/holoviews/operation/element.py b/holoviews/operation/element.py index f4414346bf..33339f26a0 100644 --- a/holoviews/operation/element.py +++ b/holoviews/operation/element.py @@ -4,8 +4,6 @@ """ from __future__ import division -import sys - from distutils.version import LooseVersion import numpy as np @@ -756,7 +754,7 @@ def _process(self, element, key=None): elif isinstance(self.p.bins, (list, np.ndarray)): edges = self.p.bins if isdatetime(edges): - bins = bins.astype('datetime64[ns]').astype('int64') + edges = edges.astype('datetime64[ns]').astype('int64') else: hist_range = self.p.bin_range or element.range(selected_dim) # Avoids range issues including zero bin range and empty bins diff --git a/holoviews/plotting/bokeh/stats.py b/holoviews/plotting/bokeh/stats.py index 15f1eb6e2c..0306279c70 100644 --- a/holoviews/plotting/bokeh/stats.py +++ b/holoviews/plotting/bokeh/stats.py @@ -211,7 +211,7 @@ def get_data(self, element, ranges, style): cdim, cidx = None, None factors = [] - vdim = g.vdims[0].name + vdim = element.vdims[0].name for key, g in groups.items(): # Compute group label if element.kdims: @@ -229,7 +229,7 @@ def get_data(self, element, ranges, style): factors.append(label) # Compute statistics - vals = data = element.interface.values(element, vdim, compute=False) + vals = element.interface.values(element, vdim, compute=False) q1, q2, q3, upper, lower, outliers = self._box_stats(vals) # Add to CDS data From 865728db20b59e97c0501b7b7d2d419397dbc041 Mon Sep 17 00:00:00 2001 From: Philipp Rudiger Date: Sun, 31 May 2020 16:33:12 +0200 Subject: [PATCH 4/5] Update holoviews/plotting/bokeh/stats.py --- holoviews/plotting/bokeh/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/holoviews/plotting/bokeh/stats.py b/holoviews/plotting/bokeh/stats.py index 0306279c70..1383e51ceb 100644 --- a/holoviews/plotting/bokeh/stats.py +++ b/holoviews/plotting/bokeh/stats.py @@ -229,7 +229,7 @@ def get_data(self, element, ranges, style): factors.append(label) # Compute statistics - vals = element.interface.values(element, vdim, compute=False) + vals = element.interface.values(g, vdim, compute=False) q1, q2, q3, upper, lower, outliers = self._box_stats(vals) # Add to CDS data From 97a946a0ac3c15fb9913e1012a59080d6add9197 Mon Sep 17 00:00:00 2001 From: Philipp Rudiger Date: Sun, 31 May 2020 16:33:45 +0200 Subject: [PATCH 5/5] Update holoviews/plotting/bokeh/stats.py --- holoviews/plotting/bokeh/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/holoviews/plotting/bokeh/stats.py b/holoviews/plotting/bokeh/stats.py index 1383e51ceb..49781e0726 100644 --- a/holoviews/plotting/bokeh/stats.py +++ b/holoviews/plotting/bokeh/stats.py @@ -229,7 +229,7 @@ def get_data(self, element, ranges, style): factors.append(label) # Compute statistics - vals = element.interface.values(g, vdim, compute=False) + vals = g.interface.values(g, vdim, compute=False) q1, q2, q3, upper, lower, outliers = self._box_stats(vals) # Add to CDS data