Skip to content

Commit

Permalink
Still more docstrings (per issue #15).
Browse files Browse the repository at this point in the history
  • Loading branch information
lmcinnes committed Nov 19, 2017
1 parent ec31c4c commit 47ee8ce
Showing 1 changed file with 257 additions and 0 deletions.
257 changes: 257 additions & 0 deletions umap/umap_.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,39 @@ def heap_push(heap, row, weight, index, flag):


def rptree_leaf_array(data, n_neighbors, rng_state, n_trees=10, angular=False):
"""Generate an array of sets of candidate nearest neighbors by
constructing a random projection forest and taking the leaves of all the
trees. Any given tree has leaves that are a set of potential nearest
neighbors. Given enough trees the set of all such leaves gives a good
likelihood of getting a good set of nearest neighbors in composite. Since
such a random projection forest is inexpensive to compute, this can be a
useful means of seeding other nearest neighbor algorithms.
Parameters
----------
data: array of shape (n_samples, n_features)
The data for which to generate nearest neighbor approximations.
n_neighbors: int
The number of nearest neighbors to attempt to approximate.
rng_state: array of int64, shape (3,)
The internal state of the rng
n_trees: int (optional, default 10)
The number of trees to build in the forest construction.
angular: bool (optional, default False)
Whether to use angular/cosine distance for random projection tree
construction.
Returns
-------
leaf_array: array of shape (n_leaves, max(10, n_neighbors))
Each row of leaf array is a list of indices found in a given leaf.
Since not all leaves are the same size the arrays are padded out with -1
to ensure we can return a single ndarray.
"""
leaves = []
try:
leaf_size = max(10, n_neighbors)
Expand All @@ -532,6 +565,32 @@ def rptree_leaf_array(data, n_neighbors, rng_state, n_trees=10, angular=False):
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates,
rng_state):
"""Build a heap of candidate neighbors for nearest neighbor descent. For
each vertex the candidate neighbors are any current neighbors, and any
vertices that have the vertex as one of their nearest neighbors.
Parameters
----------
current_graph: heap
The current state of the graph for nearest neighbor descent.
n_vertices: int
The total number of vertices in the graph.
n_neighbors: int
The number of neighbor edges per node in the current graph.
max_candidates: int
The maximum number of new candidate neighbors.
rng_state: array of int64, shape (3,)
The internal state of the rng
Returns
-------
candidate_neighbors: A heap with an array of (randomly sorted) candidate
neighbors for each vertex in the graph.
"""
candidate_neighbors = make_heap(n_vertices, max_candidates)
for i in range(n_vertices):
for j in range(n_neighbors):
Expand All @@ -548,6 +607,26 @@ def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates,


def make_nn_descent(dist, dist_args):
"""Create a numba accelerated version of nearest neighbor descent
specialised for the given distance metric and metric arguments. Numba
doesn't support higher order functions directly, but we can instead JIT
compile the version of NN-descent for any given metric.
Parameters
----------
dist: function
A numba JITd distance function which, given two arrays, computes a
dissimilarity between them.
dist_args: tuple
Any extra arguments that need to be passed to the distance function
beyond the two arrays to be compared.
Returns
-------
A numba JITd function for nearest neighbor descent computation that is
specialised to the given metric.
"""
@numba.njit(parallel=True)
def nn_descent(data, n_neighbors, rng_state, max_candidates=50,
n_iters=10, delta=0.001, rho=0.5,
Expand Down Expand Up @@ -613,6 +692,33 @@ def nn_descent(data, n_neighbors, rng_state, max_candidates=50,

@numba.njit(parallel=True)
def smooth_knn_dist(distances, k, n_iter=128):
"""Compute a continuous version of the distance to the kth nearest
neighbor. That is, this is similar to knn-distance but allows continuous
k values rather than requiring an integral k. In essence we are simply
computing the distance such that the cardinality of fuzzy set we generate
is k.
Parameters
----------
distances: array of shape (n_samples, n_neighbors)
Distances to nearest neighbors for each sample. Each row should be a
sorted list of distances to a given sample's nearest neighbors.
k: float
The number of nearest neighbors to approximate for.
n_iter: int (optional, default 128)
We need to binary search for the correct distance value. This is the
max number of iterations to use in such a search.
Returns
-------
knn_dist: array of shape (n_samples,)
The distance to kth nearest neighbor, as suitably approximated.
nn_dist: array of shape (n_samples,)
The distance to the 1st nearest neighbor for each point.
"""
target = np.log2(k)
rho = np.zeros(distances.shape[0])
result = np.zeros(distances.shape[0])
Expand Down Expand Up @@ -667,6 +773,79 @@ def smooth_knn_dist(distances, k, n_iter=128):
def fuzzy_simplicial_set(X, n_neighbors, random_state,
metric, metric_kwds={}, angular=False,
verbose=False):
"""Given a set of data X, a neighborhood size, and a measure of distance
compute the fuzzy simplicial set (here represented as a fuzzy graph in
the form of a sparse matrix) associated to the data. This is done by
locally approximating geodesic distance at each point, creating a fuzzy
simplicial set for each such point, and then combining all the local
fuzzy simplicial sets into a global one via a fuzzy union.
Parameters
----------
X: array of shape (n_samples, n_features)
The data to be modelled as a fuzzy simplicial set.
n_neighbors: int
The number of neighbors to use to approximate geodesic distance.
Larger numbers induce more global estimates of the manifold that can
miss finer detail, while smaller values will focus on fine manifold
structure to the detriment of the larger picture.
random_state: numpy RandomState or equivalent
A state capable of being used as a numpy random state.
metric: string or function (optional, default 'euclidean')
The metric to use to compute distances in high dimensional space.
If a string is passed it must match a valid predefined metric. If
a general metric is required a function that takes two 1d arrays and
returns a float can be provided. For performance purposes it is
required that this be a numba jit'd function. Valid string metrics
include:
* euclidean
* manhattan
* chebyshev
* minkowski
* canberra
* braycurtis
* mahalanobis
* wminkowski
* seuclidean
* cosine
* correlation
* haversine
* hamming
* jaccard
* dice
* russellrao
* kulsinski
* rogerstanimoto
* sokalmichener
* sokalsneath
* yule
Metrics that take arguments (such as minkowski, mahalanobis etc.)
can have arguments passed via the metric_kwds dictionary. At this
time care must be taken and dictionary elements must be ordered
appropriately; this will hopefully be fixed in the future.
metric_kwds: dict (optional, default {})
Arguments to pass on to the metric, such as the ``p`` value for
Minkowski distance.
angular: bool (optional, default False)
Whether to use angular/cosine distance for the random projection
forest for seeding NN-descent to determine approximate nearest
neighbors.
verbose: bool (optional, default False)
Whether to report information on the current progress of the algorithm.
Returns
-------
fuzzy_simplicial_set: coo_matrix
A fuzzy simplicial set represented as a sparse matrix. The (i,
j) entry of the matrix represents the membership strength of the
1-simplex between the ith and jth sample points.
"""

rows = np.zeros((X.shape[0] * n_neighbors), dtype=np.int64)
cols = np.zeros((X.shape[0] * n_neighbors), dtype=np.int64)
Expand Down Expand Up @@ -741,6 +920,24 @@ def fuzzy_simplicial_set(X, n_neighbors, random_state,

@numba.jit()
def create_sampler(probabilities):
"""Create the data necessary for a Walker alias sampler. This allows for
efficient sampling from a dataset according to an array of weights as to
how relatively likely each element of the dataset is to be sampled.
Parameters
----------
probabilities: array of shape (n_items_for_sampling,)
An array of weights (which can be viewed as the desired probabilities
of a multinomial distribution when l1 normalised).
Returns
-------
prob: array of shape (n_items_for_sampling,)
The probabilities of selecting an element or its alias
alias: array of shape (n_items_for_sampling,)
The alternate choice if the element is not to be selected.
"""
prob = np.zeros(probabilities.shape[0], dtype=np.float64)
alias = np.zeros(probabilities.shape[0], dtype=np.int64)

Expand Down Expand Up @@ -781,6 +978,23 @@ def create_sampler(probabilities):

@numba.njit()
def sample(prob, alias, rng_state):
"""Given data for a Walker alias sampler, perform sampling.
Parameters
----------
prob: array of shape (n_items_for_sampling,)
The probabilities of selecting an element or its alias
alias: array of shape (n_items_for_sampling,)
The alternate choice if the element is not to be selected.
rng_state: array of int64, shape (3,)
The internal state of the rng
Returns
-------
The index of the sampled item.
"""
k = tau_rand_int(rng_state) % prob.shape[0]
u = tau_rand(rng_state)

Expand All @@ -791,6 +1005,26 @@ def sample(prob, alias, rng_state):


def spectral_layout(graph, dim, random_state):
"""Given a graph compute the spectral embedding of the graph. This is
simply the eigenvectors of the laplacian of the graph. Here we use the
normalized laplacian.
Parameters
----------
graph: sparse matrix
The (weighted) adjacency matrix of the graph as a sparse matrix.
dim: int
The dimension of the space into which to embed.
random_state: numpy RandomState or equivalent
A state capable of being used as a numpy random state.
Returns
-------
embedding: array of shape (n_vertices, dim)
The spectral embedding of the graph.
"""
diag_data = np.asarray(graph.sum(axis=0))
# standard Laplacian
# D = scipy.sparse.spdiags(diag_data, 0, graph.shape[0], graph.shape[0])
Expand Down Expand Up @@ -824,6 +1058,18 @@ def spectral_layout(graph, dim, random_state):

@numba.njit()
def clip(val):
"""Standard clamping of a value into a fixed range (in this case -4.0 to
4.0)
Parameters
----------
val: float
The value to be clamped.
Returns
-------
The clamped value, now fixed to be in the range -4.0 to 4.0.
"""
if val > 4.0:
return 4.0
elif val < -4.0:
Expand All @@ -834,6 +1080,17 @@ def clip(val):

@numba.njit('f8(f8[:],f8[:])')
def rdist(x, y):
"""Reduced Euclidean distance.
Parameters
----------
x: array of shape (embedding_dim,)
y: array of shape (embedding_dim,)
Returns
-------
The squared euclidean distance between x and y
"""
result = 0.0
for i in range(x.shape[0]):
result += (x[i] - y[i]) ** 2
Expand Down

0 comments on commit 47ee8ce

Please sign in to comment.