mindboggle/guts/compute.py

#!/usr/bin/env python
"""
Compute functions.


Authors:
    - Arno Klein, 2012-2016  (arno@mindboggle.info)  http://binarybottle.com

Copyright 2016,  Mindboggle team (http://mindboggle.info), Apache v2.0 License

"""


def distcorr(X, Y):
    """
    Compute the distance correlation function.

    Parameters
    ----------
    X : list or array of numbers
    Y : list or array of numbers

    Returns
    -------
    dcor : float
        distance correlation

    Examples
    --------
    >>> from mindboggle.guts.compute import distcorr
    >>> import numpy as np
    >>> a = [1,2,3,4,5]
    >>> b = [1,2,9,4,4]
    >>> dcor = distcorr(a, b)
    >>> np.float("{0:.{1}f}".format(dcor, 5))
    0.76268

    Copyright (2014-2015) MIT
    Written by Satrajit S. Ghosh (Apache License v2.0) as part of the
    mapalign GitHub repository (https://github.com/satra/mapalign)
    """
    import numpy as np
    from scipy.spatial.distance import pdist, squareform

    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != X.shape[0]:
        raise ValueError('Number of samples must match')
    a = squareform(pdist(X))
    b = squareform(pdist(Y))
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()

    dcov2_xy = (A * B).sum()/float(n * n)
    dcov2_xx = (A * A).sum()/float(n * n)
    dcov2_yy = (B * B).sum()/float(n * n)
    dcor = np.sqrt(dcov2_xy)/np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))

    return dcor


def point_distance(point, points):
    """
    Compute the Euclidean distance from one point to a second (set) of points.

    Parameters
    ----------
    point : list of three floats
        coordinates for a single point
    points : list with one or more lists of three floats
        coordinates for a second point (or multiple points)

    Returns
    -------
    min_distance : float
        Euclidean distance between two points,
        or the minimum distance between a point and a set of points
    min_index : int
        index of closest of the points (zero if only one)

    Examples
    --------
    >>> from mindboggle.guts.compute import point_distance
    >>> point = [1,2,3]
    >>> points = [[10,2.0,3], [0,1.5,2]]
    >>> point_distance(point, points)
    (1.5, 1)

    Notes
    -----
    Future plan is to use scipy.spatial.distance.cdist to compute distances
    scipy.spatial.distance.cdist is available in scipy v0.12 or later

    """
    import numpy as np

    # If points is a single point
    if np.ndim(points) == 1:
       #return np.linalg.norm(np.array(point) - np.array(points))
       return np.sqrt((point[0] - points[0]) ** 2 + \
                       (point[1] - points[1]) ** 2 + \
                       (point[2] - points[2]) ** 2), 0

    # If points is a set of multiple points
    elif np.ndim(points) == 2:
        min_distance = np.Inf
        min_index = 0
        point = np.array(point)
        for index, point2 in enumerate(points):
            distance = np.sqrt((point[0] - point2[0]) ** 2 + \
                               (point[1] - point2[1]) ** 2 + \
                               (point[2] - point2[2]) ** 2)

            #distance = np.linalg.norm(point - np.array(point2))

            if distance < min_distance:
                min_distance = distance
                min_index = index
        return min_distance, min_index

    # Else return None
    else:
        return None, None


def vector_distance(vector1, vector2, normalize=False):
    """
    Compute the Euclidean distance between two equal-sized vectors.

    Parameters
    ----------
    vector1 : numpy array of floats
        vector of values
    vector2 : numpy array of floats
        vector of values
    normalize : bool
        normalize each element of the vectors?

    Returns
    -------
    distance : float
        Euclidean distance between two vectors

    Examples
    --------
    >>> import numpy as np
    >>> from mindboggle.guts.compute import vector_distance
    >>> vector1 = np.array([1.,2.,3.])
    >>> vector2 = np.array([0,1,5])
    >>> distance = vector_distance(vector1, vector2)
    >>> print('{0:0.5f}'.format(distance))
    0.81650

    """
    import numpy as np

    if np.size(vector1) == np.size(vector2):
        # Make sure arguments are numpy arrays
        if not isinstance(vector1, np.ndarray):
            vector1 = np.asarray(vector1)
        if not isinstance(vector2, np.ndarray):
            vector2 = np.asarray(vector2)
        if normalize:
            vector_diff = np.zeros(len(vector1))
            for i in range(len(vector1)):
                max_v1v2 = max([vector1[i], vector2[i]])
                if max_v1v2 > 0:
                    vector_diff[i] = (vector1[i] - vector2[i]) / max_v1v2
        else:
            vector_diff = vector1 - vector2
        return np.sqrt(sum((vector_diff)**2)) / np.size(vector1)
    else:
        print("Vectors have to be of equal size to compute distance.")
        return None


def pairwise_vector_distances(vectors, save_file=False, normalize=False):
    """
    Compare every pair of equal-sized vectors.

    Parameters
    ----------
    vectors : array of 1-D lists or arrays of integers or floats
    save_file : bool
        save file?
    normalize : bool
        normalize each element of the vectors?

    Returns
    -------
    vector_distances : numpy array of integers or floats
        distances between each pair of vectors
    outfile : string [optional]
        output filename for pairwise_vector_distances

    Examples
    --------
    >>> import numpy as np
    >>> from mindboggle.guts.compute import pairwise_vector_distances
    >>> vectors = [[1,2,3],[0,3,5],[0,3.5,5],[1,1,1]]
    >>> save_file = False
    >>> normalize = False
    >>> vector_distances, outfile = pairwise_vector_distances(vectors,
    ...     save_file, normalize)
    >>> print(np.array_str(np.array(vector_distances),
    ...       precision=5, suppress_small=True))
    [[0.      0.8165  0.89753 0.74536]
     [0.      0.      0.16667 1.52753]
     [0.      0.      0.      1.60728]
     [0.      0.      0.      0.     ]]

    """
    import os
    import numpy as np
    from mindboggle.guts.compute import vector_distance

    # Make sure argument is a numpy array
    if not isinstance(vectors, np.ndarray):
        vectors = np.array(vectors)

    # Initialize output
    vector_distances = np.zeros((len(vectors), len(vectors)))

    # --------------------------------------------------------------------------
    # Compute distance between each pair of vectors
    # --------------------------------------------------------------------------
    # Loop through every pair of vectors
    for ihist1 in range(len(vectors)):
        for ihist2 in range(len(vectors)):
            if ihist2 >= ihist1:

                # Store pairwise distances between histogram values
                d = vector_distance(1.0*vectors[ihist1],
                                            1.0*vectors[ihist2],
                                            normalize=normalize)
                vector_distances[ihist1, ihist2] = d

    if save_file:
        outfile = os.path.join(os.getcwd(), 'vector_distances.txt')
        np.savetxt(outfile, vector_distances,
                   fmt=len(vectors) * '%.4f ', delimiter='\t', newline='\n')
        if not os.path.exists(outfile):
            raise IOError(outfile + " not found")
    else:
        outfile = ''

    return vector_distances, outfile


def source_to_target_distances(sourceIDs, targetIDs, points,
                               segmentIDs=[], excludeIDs=[-1]):
    """
    Create a Euclidean distance matrix between source and target points.

    Compute the Euclidean distance from each source point to
    its nearest target point, optionally within each segment.

    Example::

        Compute fundus-to-feature distances, the minimum distance
        from each label boundary vertex (corresponding to a fundus
        in the DKT cortical labeling protocol) to all of the
        feature vertices in the same fold.

    Parameters
    ----------
    sourceIDs : list of N integers (N is the number of vertices)
        source IDs, where any ID not in excludeIDs is a source point
    targetIDs : list of N integers (N is the number of vertices)
        target IDs, where any ID not in excludeIDs is a target point
    points : list of N lists of three floats (N is the number of vertices)
        coordinates of all vertices
    segmentIDs : list of N integers (N is the number of vertices)
        segment IDs, where each ID not in excludeIDs is considered a
        different segment (unlike above, where value in sourceIDs or
        targetIDs doesn't matter, so long as its not in excludeIDs);
        source/target distances are computed within each segment
    excludeIDs : list of integers
        IDs to exclude

    Returns
    -------
    distances : numpy array
        distance value for each vertex (default -1)
    distance_matrix : numpy array [#points by maximum segment ID + 1]
        distances organized by segments (columns)

    """
    import numpy as np
    from mindboggle.guts.compute import point_distance

    if isinstance(points, list):
        points = np.asarray(points)
    npoints = len(points)

    # Extract unique segment IDs (or use all points as a single segment):
    if np.size(segmentIDs):
        segments = [x for x in np.unique(segmentIDs) if x not in excludeIDs]
    else:
        segmentIDs = np.zeros(npoints)
        segments = [0]
    nsegments = max(segments) + 1

    # Initialize outputs:
    distances = -1 * np.ones(npoints)
    distance_matrix = -1 * np.ones((npoints, nsegments))

    # For each segment:
    for segment in segments:
        segment_indices = [i for i,x in enumerate(segmentIDs)
                           if x == segment]

        # Find all source points in the segment:
        source_indices = [i for i,x in enumerate(sourceIDs)
                          if x not in excludeIDs
                          if i in segment_indices]
        # Find all target points in the segment:
        target_indices = [i for i,x in enumerate(targetIDs)
                          if x not in excludeIDs
                          if i in segment_indices]

        if source_indices and target_indices:

            # For each source point in the segment:
            for isource in source_indices:

                # Find the closest target point:
                d, i = point_distance(points[isource],
                                      points[target_indices])
                distances[isource] = d
                distance_matrix[isource, segment] = d

    return distances, distance_matrix


def weighted_to_repeated_values(X, W=[], precision=1):
    """
    Create a list of repeated values from weighted values.

    This is useful for computing weighted statistics (ex: weighted median).

    Adapted to allow for fractional weights from
        http://stackoverflow.com/questions/966896/
               code-golf-shortest-code-to-find-a-weighted-median

    Parameters
    ----------
    X : numpy array of floats or integers
        values
    W : numpy array of floats or integers
        weights
    precision : integer
        number of decimal places to consider weights

    Returns
    -------
    repeat_values : numpy array of floats or integers
        repeated values according to their weight

    Examples
    --------
    >>> import numpy as np
    >>> from mindboggle.guts.compute import weighted_to_repeated_values
    >>> X = np.array([1,2,4,7,8])
    >>> W = np.array([.1,.1,.3,.2,.3])
    >>> precision = 1
    >>> weighted_to_repeated_values(X, W, precision)
    [1, 2, 4, 4, 4, 7, 7, 8, 8, 8]

    """
    import numpy as np

    # Make sure arguments have the correct type:
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    if not isinstance(W, np.ndarray):
        W = np.array(W)
    if not isinstance(precision, int):
        precision = int(precision)

    if np.size(W):
        # If weights are decimals, multiply by 10 until they are whole numbers.
        # If after multiplying precision times they are not whole, round them:
        whole = True
        if any(np.mod(W,1)):
            whole = False
            for i in range(precision):
                if any(np.mod(W,1)):
                    W *= 10
                else:
                    whole = True
                    break

        if not whole:
             W = [int(np.round(x)) for x in W]

        repeat_values = sum([[x]*w for x,w in zip(X,W)],[])

    else:
        repeat_values = X

    return repeat_values


def weighted_median(X, W=[], precision=1):
    """
    Compute a weighted median.

    Parameters
    ----------
    X : numpy array of floats or integers
        values
    W : numpy array of floats or integers
        weights
    precision : integer
        number of decimal places to consider weights

    Returns
    -------
    wmedian : float
        weighted median

    Examples
    --------
    >>> import numpy as np
    >>> from mindboggle.guts.compute import weighted_median
    >>> X = np.array([1,2,4,7,8])
    >>> W = np.array([.1,.1,.3,.2,.3])
    >>> precision = 1
    >>> # [1, 2, 4, 4, 4, 7, 7, 8, 8, 8]
    >>> weighted_median(X, W, precision)
    5.5

    """
    import numpy as np
    from mindboggle.guts.compute import weighted_to_repeated_values

    # Make sure arguments have the correct type:
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    if not isinstance(W, np.ndarray):
        W = np.array(W)
    if not isinstance(precision, int):
        precision = int(precision)

    wmedian = np.median(weighted_to_repeated_values(X, W, precision))

    return wmedian


def median_abs_dev(X, W=[], precision=1, c=1.0):
    """
    Compute the (weighted) median absolute deviation.

    mad = median(abs(x - median(x))) / c

    Parameters
    ----------
    X : numpy array of floats or integers
        values
    W : numpy array of floats or integers
        weights
    precision : integer
        number of decimal places to consider weights
    c : float
        constant used as divisor for mad computation;
        c = 0.6745 is used to convert from mad to standard deviation

    Returns
    -------
    mad : float
        (weighted) median absolute deviation

    Examples
    --------
    >>> import numpy as np
    >>> from mindboggle.guts.compute import median_abs_dev
    >>> X = np.array([1,2,4,7,8])
    >>> W = np.array([.1,.1,.3,.2,.3])
    >>> precision = 1
    >>> # [1, 2, 4, 4, 4, 7, 7, 8, 8, 8]
    >>> median_abs_dev(X, W, precision)
    2.0

    """
    import numpy as np
    from mindboggle.guts.compute import weighted_to_repeated_values

    # Make sure arguments have the correct type:
    if not isinstance(X, np.ndarray):
        X = np.array(X)
    if not isinstance(W, np.ndarray):
        W = np.array(W)
    if not isinstance(precision, int):
        precision = int(precision)

    if np.size(W):
        X = weighted_to_repeated_values(X, W, precision)

    mad = np.median(np.abs(X - np.median(X))) / c

    return mad


def means_per_label(values, labels, include_labels=[], exclude_labels=[], areas=[]):
    """
    Compute the mean value across vertices per label,
    optionally taking into account surface area per vertex (UNTESTED).

    Formula:
    average value = sum(a_i * v_i) / total_surface_area,
    where *a_i* and *v_i* are the area and value for each vertex *i*.

    Note ::
        This function is different than stats_per_label() in two ways:
            1. It only computes the (weighted) mean and sdev.
            2. It can accept 2-D arrays (such as [x,y,z] coordinates).

    Parameters
    ----------
    values : numpy array of one or more lists of integers or floats
        values to average per label
    labels : list or array of integers
        label for each value
    include_labels : list of integers
        labels to include
    exclude_labels : list of integers
        labels to be excluded
    areas : numpy array of floats
        surface areas (if provided, used to normalize means and sdevs)

    Returns
    -------
    means : list of floats
        mean(s) for each label
    sdevs : list of floats
        standard deviation(s) for each label
    label_list : list of integers
        unique label numbers
    label_areas : list of floats (if normalize_by_area)
        surface area for each labeled set of vertices

    Examples
    --------
    >>> import numpy as np
    >>> from mindboggle.mio.vtks import read_scalars, read_vtk
    >>> from mindboggle.guts.compute import means_per_label
    >>> from mindboggle.mio.fetch_data import prep_tests
    >>> urls, fetch_data = prep_tests()
    >>> values_file = fetch_data(urls['left_mean_curvature'], '', '.vtk')
    >>> labels_file = fetch_data(urls['left_freesurfer_labels'], '', '.vtk')
    >>> area_file = fetch_data(urls['left_area'], '', '.vtk')
    >>> values, name = read_scalars(values_file, True, True)
    >>> labels, name = read_scalars(labels_file)
    >>> areas, name = read_scalars(area_file, True, True)
    >>> include_labels = []
    >>> exclude_labels = [-1]
    >>> # Compute mean curvature per label normalized by area:
    >>> means, sdevs, label_list, label_areas = means_per_label(values, labels,
    ...     include_labels, exclude_labels, areas)
    >>> [np.float("{0:.{1}f}".format(x, 5)) for x in means[0:5]]
    [-1.1793, -1.21405, -2.49318, -3.58116, -3.34987]
    >>> [np.float("{0:.{1}f}".format(x, 5)) for x in sdevs[0:5]]
    [2.43827, 2.33857, 2.0185, 3.25964, 2.8274]
    >>> # Compute mean curvature per label:
    >>> areas = []
    >>> means, sdevs, label_list, label_areas = means_per_label(values, labels,
    ...     include_labels, exclude_labels, areas)
    >>> [np.float("{0:.{1}f}".format(x, 5)) for x in means[0:5]]
    [-0.99077, -0.3005, -1.59342, -2.03939, -2.31815]
    >>> [np.float("{0:.{1}f}".format(x, 5)) for x in sdevs[0:5]]
    [2.3486, 2.4023, 2.3253, 3.31023, 2.91794]

    >>> # FIX: compute mean coordinates per label:
    >>> #points, indices, lines, faces, labels, scalar_names, npoints, input_vtk = read_vtk(values_file)
    >>> #means, sdevs, label_list, label_areas = means_per_label(points, labels,
    >>> #    include_labels, exclude_labels, areas)
    >>> #means[0:3]
    >>> #sdevs[0:3]

    """
    import numpy as np

    # Make sure arguments are numpy arrays
    if not isinstance(values, np.ndarray):
        values = np.asarray(values)
    if not isinstance(areas, np.ndarray):
        areas = np.asarray(areas)

    if include_labels:
        label_list = include_labels
    else:
        label_list = np.unique(labels)
    label_list = [int(x) for x in label_list if int(x) not in exclude_labels]
    means = []
    sdevs = []
    label_areas = []
    if values.ndim > 1:
        dim = np.shape(values)[1]
    else:
        dim = 1

    for label in label_list:
        I = [i for i,x in enumerate(labels) if x == label]
        if I:
            X = values[I]
            if np.size(areas):
                W = areas[I]
                sumW = sum(W)
                label_areas.append(sumW)
                if sumW > 0:
                    if dim > 1:
                        W = np.transpose(np.tile(W, (dim,1)))
                    means.append(np.sum(W * X, axis=0) / sumW)
                    Xdiff = X - np.mean(X, axis=0)
                    sdevs.append(np.sqrt(np.sum(W * Xdiff**2, axis=0) / sumW))
                else:
                    if dim > 1:
                        means.append(np.mean(X, axis=0))
                        sdevs.append(np.std(X, axis=0))
                    else:
                        means.append(np.mean(X))
                        sdevs.append(np.std(X))
            else:
                if dim > 1:
                    means.append(np.mean(X, axis=0))
                    sdevs.append(np.std(X, axis=0))
                else:
                    means.append(np.mean(X))
                    sdevs.append(np.std(X))
        else:
            means.append(np.zeros(dim))
            sdevs.append(np.zeros(dim))
            label_areas.append(np.zeros(dim))

    if dim > 1:
        means = [x.tolist() for x in means]
        sdevs = [x.tolist() for x in sdevs]
        label_areas = [x.tolist() for x in label_areas]

    return means, sdevs, label_list, label_areas


def sum_per_label(values, labels, include_labels=[], exclude_labels=[]):
    """
    Compute the sum value across vertices per label.

    Parameters
    ----------
    values : numpy array of one or more lists of integers or floats
        values to average per label
    labels : list or array of integers
        label for each value
    include_labels : list of integers
        labels to include
    exclude_labels : list of integers
        labels to be excluded

    Returns
    -------
    sums : list of floats
        sum for each label
    label_list : list of integers
        unique label numbers

    Examples
    --------
    >>> import numpy as np
    >>> from mindboggle.mio.vtks import read_scalars
    >>> from mindboggle.guts.compute import sum_per_label
    >>> from mindboggle.mio.fetch_data import prep_tests
    >>> urls, fetch_data = prep_tests()
    >>> values_file = fetch_data(urls['left_mean_curvature'], '', '.vtk')
    >>> labels_file = fetch_data(urls['left_freesurfer_labels'], '', '.vtk')
    >>> values, name = read_scalars(values_file, True, True)
    >>> labels, name = read_scalars(labels_file)
    >>> include_labels = []
    >>> exclude_labels = [-1]
    >>> # Compute sum area per label:
    >>> sums, label_list = sum_per_label(values, labels, include_labels,
    ...                                  exclude_labels)
    >>> [np.float("{0:.{1}f}".format(x, 5)) for x in sums[0:5]]
    [-8228.32913, -424.90109, -1865.8959, -8353.33769, -5130.06613]

    """
    import numpy as np

    # Make sure arguments are numpy arrays
    if not isinstance(values, np.ndarray):
        values = np.asarray(values)

    if include_labels:
        label_list = include_labels
    else:
        label_list = np.unique(labels)
    label_list = [int(x) for x in label_list if int(x) not in exclude_labels]
    sums = []
    for label in label_list:
        I = [i for i,x in enumerate(labels) if x == label]
        if I:
            X = values[I]
            sums.append(np.sum(X))
        else:
            sums.append(0)

    return sums, label_list


def stats_per_label(values, labels, include_labels=[], exclude_labels=[],
                    weights=[], precision=1):
    """
    Compute various statistical measures across vertices per label,
    optionally using weights (such as surface area per vertex).

    Example (area-weighted mean):
    average value = sum(a_i * v_i) / total_surface_area,
    where *a_i* and *v_i* are the area and value for each vertex *i*.

    Reference:
        Weighted skewness and kurtosis unbiased by sample size
        Lorenzo Rimoldini, arXiv:1304.6564 (2013)
        http://arxiv.org/abs/1304.6564

    Note ::
        This function is different than means_per_label() in two ways:
            1. It computes more than simply the (weighted) mean and sdev.
            2. It only accepts 1-D arrays of values.

    Parameters
    ----------
    values : numpy array of individual or lists of integers or floats
        values for all vertices
    labels : list or array of integers
        label for each value
    include_labels : list of integers
        labels to include
    exclude_labels : list of integers
        labels to be excluded
    weights : numpy array of floats
        weights to compute weighted statistical measures
    precision : integer
        number of decimal places to consider weights

    Returns
    -------
    medians : list of floats
        median for each label
    mads : list of floats
        median absolute deviation for each label
    means : list of floats
        mean for each label
    sdevs : list of floats
        standard deviation for each label
    skews : list of floats
        skew for each label
    kurts : list of floats
        kurtosis value for each label
    lower_quarts : list of floats
        lower quartile for each label
    upper_quarts : list of floats
        upper quartile for each label
    label_list : list of integers
        list of unique labels

    Examples
    --------
    >>> import numpy as np
    >>> from mindboggle.mio.vtks import read_scalars
    >>> from mindboggle.guts.compute import stats_per_label
    >>> from mindboggle.mio.fetch_data import prep_tests
    >>> urls, fetch_data = prep_tests()
    >>> values_file = fetch_data(urls['left_mean_curvature'], '', '.vtk')
    >>> labels_file = fetch_data(urls['left_freesurfer_labels'], '', '.vtk')
    >>> area_file = fetch_data(urls['left_area'], '', '.vtk')
    >>> values, name = read_scalars(values_file, True, True)
    >>> areas, name = read_scalars(area_file, True, True)
    >>> labels, name = read_scalars(labels_file)
    >>> include_labels = []
    >>> exclude_labels = [-1]
    >>> weights = areas
    >>> precision = 1
    >>> medians, mads, means, sdevs, skews, kurts, lower_quarts, upper_quarts, label_list = stats_per_label(values,
    ...     labels, include_labels, exclude_labels, weights, precision)
    >>> [np.float("{0:.{1}f}".format(x, 5)) for x in medians[0:5]]
    [-1.13602, -1.22961, -2.49665, -3.80782, -3.37309]
    >>> [np.float("{0:.{1}f}".format(x, 5)) for x in mads[0:5]]
    [1.17026, 1.5045, 1.28234, 2.11515, 1.69333]
    >>> [np.float("{0:.{1}f}".format(x, 5)) for x in means[0:5]]
    [-1.1793, -1.21405, -2.49318, -3.58116, -3.34987]
    >>> [np.float("{0:.{1}f}".format(x, 5)) for x in kurts[0:5]]
    [2.34118, -0.3969, -0.55787, -0.73993, 0.3807]

    """
    import numpy as np
    from scipy.stats import skew, kurtosis, scoreatpercentile
    from mindboggle.guts.compute import weighted_to_repeated_values, median_abs_dev

    # Make sure arguments are numpy arrays:
    if not isinstance(values, np.ndarray):
        values = np.asarray(values)
    if not isinstance(weights, np.ndarray):
        weights = np.asarray(weights)

    # Initialize all statistical lists:
    if include_labels:
        label_list = include_labels
    else:
        label_list = np.unique(labels)
    label_list = [int(x) for x in label_list if int(x) not in exclude_labels]
    medians = []
    mads = []
    means = []
    sdevs = []
    skews = []
    kurts = []
    lower_quarts = []
    upper_quarts = []

    # Extract all vertex indices for each label:
    for label in label_list:
        I = [i for i,x in enumerate(labels) if x == label]
        if I:
            # Get the vertex values:
            X = values[I]
            if len([x for x in X if x != 0]):
                # If there are as many weights as values, apply the weights to the values:
                if np.size(weights) == np.size(values):
                    W = weights[I]
                    sumW = np.sum(W)
                    # If the sum of the weights and the standard deviation is non-zero,
                    # compute all statistics of the weighted values:
                    if sumW > 0:
                        Xdiff = X - np.mean(X)
                        Xstd = np.sqrt(np.sum(W * Xdiff**2) / sumW)
                        means.append(np.sum(W * X) / sumW)
                        sdevs.append(Xstd)
                        if Xstd > 0:
                            skews.append((np.sum(W * Xdiff**3) / sumW) / Xstd**3)
                            kurts.append((np.sum(W * Xdiff**4) / sumW) / Xstd**4 - 3)
                        else:
                            skews.append(skew(X))
                            kurts.append(kurtosis(X))
                        X = weighted_to_repeated_values(X, W, precision)
                    # If the sum of the weights equals zero, simply compute the statistics:
                    else:
                        means.append(np.mean(X))
                        sdevs.append(np.std(X))
                        skews.append(skew(X))
                        kurts.append(kurtosis(X))
                # If there are no (or not enough) weights, simply compute the statistics:
                else:
                    means.append(np.mean(X))
                    sdevs.append(np.std(X))
                    skews.append(skew(X))
                    kurts.append(kurtosis(X))
                # Compute median, median absolute deviation, and lower and upper quartiles:
                if np.size(X):
                    medians.append(np.median(X))
                    mads.append(median_abs_dev(X))
                    lower_quarts.append(scoreatpercentile(X, 25))
                    upper_quarts.append(scoreatpercentile(X, 75))
                # If the weights are all smaller than the precision, then X will disappear,
                # so set the above statistics (in the 'if' block) to zero:
                else:
                    medians.append(0)
                    mads.append(0)
                    lower_quarts.append(0)
                    upper_quarts.append(0)
            # If all values are equal to zero, set all statistics to zero:
            else:
                medians.append(0)
                mads.append(0)
                means.append(0)
                sdevs.append(0)
                skews.append(0)
                kurts.append(0)
                lower_quarts.append(0)
                upper_quarts.append(0)
        # If there are no vertices for the label, set all statistics to zero:
        else:
            medians.append(0)
            mads.append(0)
            means.append(0)
            sdevs.append(0)
            skews.append(0)
            kurts.append(0)
            lower_quarts.append(0)
            upper_quarts.append(0)

    return medians, mads, means, sdevs, skews, kurts, \
           lower_quarts, upper_quarts, label_list


def count_per_label(labels, include_labels=[], exclude_labels=[]):
    """
    Compute the number of times each label occurs.

    Parameters
    ----------
    labels : numpy 1-D array of integers
        labels (e.g., one label per vertex of a mesh)
    include_labels : list of integers
        labels to include
        (if empty, use unique numbers in image volume file)
    exclude_labels : list of integers
        labels to be excluded

    Returns
    -------
    unique_labels : list of integers
        unique label numbers
    counts : list of floats
        number of times each label occurs

    Examples
    --------
    >>> from mindboggle.guts.compute import count_per_label
    >>> labels = [8,8,8,8,8,10,11,12,10,10,11,11,11,12,12,12,12,13]
    >>> include_labels = [9,10,11,12]
    >>> exclude_labels = [13]
    >>> unique_labels, counts = count_per_label(labels, include_labels,
    ...                                         exclude_labels)
    >>> unique_labels
    [9, 10, 11, 12]
    >>> counts
    [0, 3, 4, 5]

    >>> import nibabel as nb
    >>> from mindboggle.mio.vtks import read_scalars
    >>> from mindboggle.mio.labels import DKTprotocol
    >>> from mindboggle.guts.compute import count_per_label
    >>> from mindboggle.mio.fetch_data import prep_tests
    >>> urls, fetch_data = prep_tests()
    >>> labels_file = fetch_data(urls['freesurfer_labels'], '', '.nii.gz')
    >>> img = nb.load(labels_file)
    >>> hdr = img.get_header()
    >>> labels = img.get_data().ravel()
    >>> dkt = DKTprotocol()
    >>> include_labels = dkt.label_numbers
    >>> exclude_labels = []
    >>> unique_labels, counts = count_per_label(labels,
    ...     include_labels, exclude_labels)
    >>> counts[0:10]
    [0, 0, 0, 972, 2414, 2193, 8329, 2941, 1998, 10906]

    """
    import numpy as np

    # Make sure labels is a numpy array:
    if isinstance(labels, list):
        labels = np.array(labels)
    elif isinstance(labels, np.ndarray):
        pass
    else:
        raise IOError("labels should be a numpy array.")

    # Unique list of labels:
    if include_labels:
        label_list = include_labels
    else:
        label_list = np.unique(labels).tolist()
    label_list = [int(x) for x in label_list if int(x) not in exclude_labels]

    # Loop through labels:
    unique_labels = []
    counts = []
    for ilabel, label in enumerate(label_list):

        # Find which voxels contain the label in each volume:
        indices = np.where(labels == label)[0]
        count = len(indices)
        unique_labels.append(label)
        counts.append(count)

    return unique_labels, counts


def compute_overlaps(targets, list1, list2, output_file='', save_output=True,
                     verbose=False):
    """
    Compute overlap for each target between two lists of numbers.

    Parameters
    ----------
    targets : list of integers
        targets to find in lists
    list1 : 1-D numpy array (or list) of numbers
    list2 : 1-D numpy array (or list) of numbers
    output_file : string
        (optional) output file name
    save_output : bool
        save output file?
    verbose : bool
        print statements?

    Returns
    -------
    dice_overlaps : numpy array
        Dice overlap values
    jacc_overlaps : numpy array
        Jaccard overlap values
    output_file : string
        output text file name with overlap values

    Examples
    --------
    >>> import numpy as np
    >>> from mindboggle.guts.compute import compute_overlaps
    >>> from mindboggle.mio.labels import DKTprotocol
    >>> list1 = [1021, 1021, 1021, 1021, 1021, 1010, 1010, 1010, 1010, 1010]
    >>> list2 = [1003, 1021, 1021, 1021, 1021, 1021, 1003, 1003, 1003, 1003]
    >>> dkt = DKTprotocol()
    >>> targets = dkt.cerebrum_cortex_DKT31_numbers
    >>> output_file = ''
    >>> save_output = True
    >>> verbose = False
    >>> dice_overlaps, jacc_overlaps, output_file = compute_overlaps(targets,
    ...     list1, list2, output_file, save_output, verbose)
    >>> print('{0:.5f}'.format(dice_overlaps[18]))
    0.80000
    >>> print('{0:.5f}'.format(jacc_overlaps[18]))
    0.66667

    """
    import os
    import numpy as np
    import pandas as pd

    if isinstance(list1, list):
        list1 = np.array(list1)
    if isinstance(list2, list):
        list2 = np.array(list2)

    if np.size(list1) != np.size(list2):
        raise IOError("Files are different sizes")

    # Initialize output:
    dice_overlaps = np.zeros(len(targets))
    jacc_overlaps = np.zeros(len(targets))
    if save_output and not output_file:
        output_file = os.path.join(os.getcwd(), 'ID_dice_jaccard.csv')

    # Loop through targets:
    for itarget, target in enumerate(targets):

        # Find which indices contain the target:
        list1_indices = np.where(list1 == target)[0]
        list2_indices = np.where(list2 == target)[0]
        len1 = len(list1_indices)
        len2 = len(list2_indices)

        # Compute their intersection and union:
        len_intersection = len(np.intersect1d(list2_indices, list1_indices))
        len_union = len(np.union1d(list2_indices, list1_indices))

        # If there is at least one target in each list:
        if len2 * len1 > 0:

            # Compute Dice and Jaccard coefficients:
            dice = np.float(2.0 * len_intersection) / (len2 + len1)
            jacc = np.float(len_intersection) / len_union
            dice_overlaps[itarget] = dice
            jacc_overlaps[itarget] = jacc
            if verbose:
                print('target: {0}, dice: {1:.2f}, jacc: {2:.2f}'.format(
                      target, dice, jacc))

    # Save output:
    if save_output:
        #np.savetxt(output_file, overlaps, fmt='%d %.4f %.4f',
        #           delimiter='\t', newline='\n')
        df1 = pd.DataFrame({'ID': targets})
        df2 = pd.DataFrame({'Dice overlap': dice_overlaps})
        df3 = pd.DataFrame({'Jaccard overlap': jacc_overlaps})
        df = pd.concat([df1, df2, df3], axis=1)
        df.to_csv(output_file, index=False, encoding='utf-8')

    return dice_overlaps, jacc_overlaps, output_file


def compute_image_histogram(infile, nbins=100, threshold=0.0):
    """
    Compute histogram values from nibabel-readable image.

    Parameters
    ----------
    infile : string
        input nibabel-readable image file name
    nbins : integer
        number of bins
    threshold : float
        remove values lower than threshold

    Returns
    -------
    histogram_values : numpy array
        histogram bin values

    Examples
    --------
    >>> from mindboggle.guts.compute import compute_image_histogram
    >>> from mindboggle.mio.fetch_data import prep_tests
    >>> urls, fetch_data = prep_tests()
    >>> labels_file = fetch_data(urls['freesurfer_labels'], '', '.nii.gz')
    >>> nbins = 100
    >>> threshold = 0.5
    >>> histogram_values = compute_image_histogram(labels_file, nbins,
    ...                                            threshold)
    >>> histogram_values[0:5]
    array([102865, 119610,      0,      0,      0])

    """
    import numpy as np
    import nibabel as nb
    #from pylab import plot #, hist

    # ------------------------------------------------------------------------
    # Compute histogram
    # ------------------------------------------------------------------------
    # Load image
    data = nb.load(infile).get_data().ravel()

    # Threshold image
    if threshold > 0:
        data = data / max(data)
        data = data[data >= threshold]

    # Compute histogram
    histogram_values, bin_edges = np.histogram(data, bins=nbins)

    # plot(list(range(len(histogram_values))), histogram_values, '-')
    ##a,b,c = hist(data, bins=nbins)

    return histogram_values


# ============================================================================
# Doctests
# ============================================================================
if __name__ == "__main__":
    import doctest
    doctest.testmod(verbose=True)  # py.test --doctest-modules