This repository has been archived by the owner on Aug 31, 2021. It is now read-only.

refactor cluster selection so it can be replaced, add dbscan based clustering #28

Open · wants to merge 1 commit into master
5 changes: 4 additions & 1 deletion .gitignore
@@ -58,5 +58,8 @@ docs/_build/
# PyBuilder
target/

#Ipython Notebook
# Ipython Notebook
.ipynb_checkpoints

# IDEs
.idea
47 changes: 8 additions & 39 deletions pysparnn/cluster_index.py
@@ -14,19 +14,7 @@
import numpy as _np

import pysparnn.matrix_distance


def _k_best(tuple_list, k):
"""For a list of tuples [(distance, value), ...] - Get the k-best tuples by
distance.
Args:
tuple_list: List of tuples. (distance, value)
k: Number of tuples to return.
"""
tuple_lst = sorted(tuple_list, key=lambda x: x[0],
reverse=False)[:k]

return tuple_lst
from pysparnn.cluster_selection import _k_best, DefaultClusterSelector


def _filter_unique(tuple_list):
@@ -81,6 +69,7 @@ class ClusterIndex(object):

def __init__(self, features, records_data,
distance_type=pysparnn.matrix_distance.CosineDistance,
cluster_selector_type=DefaultClusterSelector,
matrix_size=None,
parent=None):
"""Create a search index composed of recursively defined
@@ -96,12 +85,13 @@ def __init__(self, features, records_data,
distance_type: Class that defines the distance measure to use.
matrix_size: Ideal size for matrix multiplication. This controls
the depth of the tree. Defaults to 2 levels (approx). Highly
reccomended that the default value is used.
recommended that the default value is used.
"""

self.is_terminal = False
self.parent = parent
self.distance_type = distance_type
self.cluster_selector = cluster_selector_type(distance_type)
self.desired_matrix_size = matrix_size
features = distance_type.features_to_matrix(features)
num_records = features.shape[0]
@@ -122,28 +112,7 @@ def __init__(self, features, records_data,
self.is_terminal = False
records_data = _np.array(records_data)

records_index = list(_np.arange(features.shape[0]))
clusters_size = min(self.matrix_size, num_records)
clusters_selection = _random.sample(records_index, clusters_size)
clusters_selection = features[clusters_selection]

item_to_clusters = _collections.defaultdict(list)

root = distance_type(clusters_selection,
list(_np.arange(clusters_selection.shape[0])))

root.remove_near_duplicates()
root = distance_type(root.matrix,
list(_np.arange(root.matrix.shape[0])))

rng_step = self.matrix_size
for rng in range(0, features.shape[0], rng_step):
max_rng = min(rng + rng_step, features.shape[0])
records_rng = features[rng:max_rng]
for i, clstrs in enumerate(root.nearest_search(records_rng)):
_random.shuffle(clstrs)
for _, cluster in _k_best(clstrs, k=1):
item_to_clusters[cluster].append(i + rng)
clusters_selection, item_to_clusters = self.cluster_selector.select_clusters(features)

clusters = []
cluster_keeps = []
@@ -298,7 +267,7 @@ def search(self, features, k=1, k_clusters=1,
"""

# search no more than 1k records at once
# helps keap the matrix multiplies small
# helps keep the matrix multiplies small
batch_size = 1000
results = []
rng_step = batch_size
@@ -361,7 +330,7 @@ class MultiClusterIndex(object):
Creating more Indexes (random cluster allocations) increases the chances
of finding a good match.

There are three perameters that impact recall. Will discuss them all
There are three parameters that impact recall. Will discuss them all
here:
1) MultiClusterIndex(matrix_size)
This impacts the tree structure (see cluster index documentation).
@@ -424,7 +393,7 @@ class docstring for a description of the method.
self.indexes = []
for _ in range(num_indexes):
self.indexes.append((ClusterIndex(features, records_data,
distance_type, matrix_size)))
distance_type=distance_type, matrix_size=matrix_size)))

def insert(self, feature, record):
"""Insert a single record into the index.
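
For reference, a minimal usage sketch of the pluggable selector introduced above (class and parameter names come from this diff; the toy data, shapes, and exact call pattern are illustrative assumptions):

import numpy as np
from scipy.sparse import csr_matrix

from pysparnn.cluster_index import ClusterIndex
from pysparnn.cluster_selection import DbscanClusterSelector

# hypothetical toy data: 5000 sparse feature rows and one record id per row
features = csr_matrix(np.random.rand(5000, 50))
records_data = list(range(5000))

# build the index with the DBSCAN-based selector instead of the default;
# using more records than the default matrix_size (1000) so the selector
# should actually be exercised when the tree is built
index = ClusterIndex(features, records_data,
                     cluster_selector_type=DbscanClusterSelector)

# querying is unchanged: k nearest records for the first five feature rows
print(index.search(features[:5], k=3))
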
156 changes: 156 additions & 0 deletions pysparnn/cluster_selection.py
@@ -0,0 +1,156 @@
import random as _random
import numpy as _np

import collections as _collections

from abc import ABC, abstractmethod

from sklearn.cluster import DBSCAN


def _k_best(tuple_list, k):
"""For a list of tuples [(distance, value), ...] - Get the k-best tuples by
distance.
Args:
tuple_list: List of tuples. (distance, value)
k: Number of tuples to return.
"""
tuple_lst = sorted(tuple_list, key=lambda x: x[0],
reverse=False)[:k]

return tuple_lst


class ClusterSelector(ABC):

@abstractmethod
def select_clusters(self, features):
pass


class DefaultClusterSelector(ClusterSelector):
"""
Default cluster selector: picks max(sqrt(num_records), 1000) random points (capped at
num_records) as cluster roots and allocates each point to its nearest root. This can
often end up splitting similar points into multiple paths of the tree.
"""

def __init__(self, distance_type):
self._distance_type = distance_type

def select_clusters(self, features):
# number of points to cluster
num_records = features.shape[0]

matrix_size = max(int(_np.sqrt(num_records)), 1000)

# set num_clusters = min(max(sqrt(num_records), 1000), num_records)
clusters_size = min(matrix_size, num_records)

# make list [0, 1, ..., num_records-1]
records_index = list(_np.arange(features.shape[0]))
# randomly choose num_clusters records as the cluster roots
# this randomizes both selection and order of features in the selection
clusters_selection = _random.sample(records_index, clusters_size)
clusters_selection = features[clusters_selection]

# create structure to store clusters
item_to_clusters = _collections.defaultdict(list)

# create a distance_type object containing the cluster roots
# labeling them as 0 to N-1 in their current (random) order
root = self._distance_type(clusters_selection,
list(_np.arange(clusters_selection.shape[0])))

# remove near-duplicate cluster roots
root.remove_near_duplicates()
# initialize distance type object with the remaining cluster roots
root = self._distance_type(root.matrix,
list(_np.arange(root.matrix.shape[0])))

rng_step = matrix_size
# walk features in steps of matrix_size = max(sqrt(num_records), 1000)
for rng in range(0, features.shape[0], rng_step):
# don't exceed the array length on the last step
max_rng = min(rng + rng_step, features.shape[0])
records_rng = features[rng:max_rng]
# find the nearest cluster root for each feature in the step
for i, clstrs in enumerate(root.nearest_search(records_rng)):
_random.shuffle(clstrs)
for _, cluster in _k_best(clstrs, k=1):
# add each feature to its nearest cluster; the cluster label here
# is the index assigned to the root feature after it was selected at random
item_to_clusters[cluster].append(i + rng)

# row index in clusters_selection maps to key in item_to_clusters
# but the values in item_to_clusters are row indices of the original features matrix
return clusters_selection, item_to_clusters


class DbscanClusterSelector(ClusterSelector):
"""
DBSCAN-based cluster selector: picks max(sqrt(num_records), 1000) random points (capped
at num_records), merges near neighbours within that random selection into groups, and
then allocates every feature to its nearest group
"""

def __init__(self, distance_type):
self._distance_type = distance_type
self._eps = 0.4

def select_clusters(self, features):
# number of points to cluster
num_records = features.shape[0]

matrix_size = max(int(_np.sqrt(num_records)), 1000)

# set num_clusters = min(max(sqrt(num_records), 1000), num_records)
clusters_size = min(matrix_size, num_records)

# make list [0, 1, ..., num_records-1]
records_index = list(_np.arange(features.shape[0]))
# randomly choose num_clusters records as the cluster roots
# this randomizes both selection and order of features in the selection
random_clusters_selection = _random.sample(records_index, clusters_size)
random_clusters_selection = features[random_clusters_selection]

# now cluster the cluster roots themselves to avoid
# randomly separating neighbours; this probably means fewer clusters per level
# TODO might want to propagate the distance type to the clustering
db_scan_clustering = DBSCAN(eps=self._eps, min_samples=2).fit(random_clusters_selection)

# get all the individual points from the cluster
unique_indices = _np.where(db_scan_clustering.labels_ == -1)[0]
# and the first item from each cluster
_, cluster_start_indices = _np.unique(db_scan_clustering.labels_, return_index=True)
# merge and de-duplicate; np.unique returns a sorted result
all_indices = _np.concatenate((unique_indices, cluster_start_indices))
all_indices_unique = _np.unique(all_indices)

# create a matrix where rows are the first item in each dbscan cluster
# set that as cluster selection and then allocate features to cluster
clusters_selection = random_clusters_selection[all_indices_unique]

# create structure to store clusters
item_to_clusters = _collections.defaultdict(list)

# create a distance_type object containing the cluster roots
root = self._distance_type(clusters_selection,
list(_np.arange(clusters_selection.shape[0])))

rng_step = matrix_size
# walk features in steps of matrix_size = max(sqrt(num_records), 1000)
for rng in range(0, features.shape[0], rng_step):
max_rng = min(rng + rng_step, features.shape[0])
records_rng = features[rng:max_rng]
# find the nearest cluster root for each feature in the step
for i, clstrs in enumerate(root.nearest_search(records_rng)):
# shuffling is slow; disabled until proven useful
# _random.shuffle(clstrs)
for _, cluster in _k_best(clstrs, k=1):
# add each feature to its nearest cluster
item_to_clusters[cluster].append(i + rng)

# row index in clusters_selection maps to key in item_to_clusters
# but the values in item_to_clusters are row indices of the original features matrix
return clusters_selection, item_to_clusters
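
To make the DBSCAN representative-picking step concrete, here is a small standalone sketch (toy 2-d points, not taken from the repo): DBSCAN labels isolated candidates -1, and the selector keeps every such noise point plus the first member of each cluster.

import numpy as np
from sklearn.cluster import DBSCAN

# five toy candidate roots: two tight pairs plus one isolated point
candidates = np.array([[0.0, 0.0], [0.1, 0.0],
                       [5.0, 5.0], [5.1, 5.0],
                       [9.0, 9.0]])

labels = DBSCAN(eps=0.4, min_samples=2).fit(candidates).labels_
# labels -> [0, 0, 1, 1, -1]: two clusters and one noise point

noise = np.where(labels == -1)[0]                  # isolated candidates kept as-is
_, firsts = np.unique(labels, return_index=True)   # first member of each label (incl. -1)
keep = np.unique(np.concatenate((noise, firsts)))  # sorted, de-duplicated root indices
print(keep)  # [0 2 4]: one representative per cluster plus the noise point
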
52 changes: 52 additions & 0 deletions tests/test_cluster_selection.py
@@ -0,0 +1,52 @@
import numpy as np
from scipy.sparse import csr_matrix

from pysparnn.cluster_selection import DefaultClusterSelector, DbscanClusterSelector
from pysparnn.matrix_distance import CosineDistance, SlowEuclideanDistance


class TestDefaultClusterSelector(object):
def test_single_item_groups(self):
sel = DefaultClusterSelector(CosineDistance)

features_dense = np.identity(1000)
features = csr_matrix(features_dense)
cs, i2c = sel.select_clusters(features)

assert cs.shape[0] == 1000
assert all(len(cl) == 1 for cl in i2c.values())

def test_non_single_item_groups(self):
sel = DefaultClusterSelector(SlowEuclideanDistance)

features_dense = np.identity(1001)
# features = csr_matrix(features_dense)
cs, i2c = sel.select_clusters(features_dense)

assert cs.shape[0] == 1000
assert all(len(cl) >= 1 for cl in i2c.values())
assert sum(len(cl) for cl in i2c.values()) == 1001

non_single_groups = list(cl for cl in i2c.values() if len(cl) == 2)
assert len(non_single_groups) == 1


class TestDbscanClusterSelector(object):

def test_one(self):
sel = DbscanClusterSelector(CosineDistance)

features_dense = np.identity(100)
similar = [[0.9] + [0] * 99]
features_dense = np.append(features_dense, similar, axis=0)
assert features_dense.shape[0] == 101

features = csr_matrix(features_dense)
cs, i2c = sel.select_clusters(features)

assert cs.shape[0] == 100
assert all(len(cl) >= 1 for cl in i2c.values())
assert sum(len(cl) for cl in i2c.values()) == 101

non_single_groups = list(cl for cl in i2c.values() if len(cl) == 2)
assert len(non_single_groups) == 1