Merge pull request #72 from Aquila-Network/develop
bug fixes
admin-adb authored Dec 23, 2020
2 parents (406d2d9 + 175159e) · commit 25031e6
Showing 4 changed files with 131 additions and 36 deletions.
src/manager.py (23 changes: 18 additions & 5 deletions)
@@ -123,9 +123,12 @@ def delete_vectors (self, cids):
         # commit db write
         wb_.write()
         # delete vectors by ID from index
-        self.index.delete_vectors(ids_)
+        status, ids = self.index.delete_vectors(ids_)
 
-        return cids
+        if status and len(ids) == len(cids):
+            return cids
+        else:
+            return []
 
     def get_nearest (self, qmatrix, k, rad):
         ids = []
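
Note: delete_vectors in manager.py now propagates the index's status instead of assuming success, and the contract is all-or-nothing: the full CID list on confirmed deletion, an empty list otherwise. A minimal sketch of a caller under that contract (the remove_documents helper is hypothetical, not part of this repo):

    # Hypothetical caller: an empty return value now means the batch
    # delete was not confirmed, so nothing should be treated as removed.
    def remove_documents(manager, cids):
        deleted = manager.delete_vectors(cids)
        if not deleted:
            raise RuntimeError("vector deletion failed for %d docs" % len(cids))
        return deleted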
@@ -146,8 +149,11 @@ def get_nearest (self, qmatrix, k, rad):
         for idx_, idb in enumerate(ids):
             for idx__, id_ in enumerate(idb):
                 value = self.KV_store.get(byt(id_))
-                cid_len_ = int(value[:2]) + 2
-                ids[idx_][idx__] = CID.bson2doc(value[cid_len_:])
+                if value:
+                    cid_len_ = int(value[:2]) + 2
+                    ids[idx_][idx__] = CID.bson2doc(value[cid_len_:])
+                else:
+                    ids[idx_][idx__] = None
 
         return ids, dists

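Note: with the key-value guard above, a result slot whose payload was deleted from the KV store comes back as None rather than raising on the missing value. A sketch of the filtering a caller might do (assumes a manager instance and a query matrix qmatrix; not code from this repo):

    # Drop None placeholders left by deleted documents before using results.
    docs, dists = manager.get_nearest(qmatrix, k=5, rad=None)
    docs = [[d for d in row if d is not None] for row in docs]
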
@@ -185,8 +191,15 @@ def resize_vector (self, vector, dim):
     def resize_matrix (self, matrix_, dim):
         # numpize
         matrix = np.array(matrix_)
+        # check for valid dimensions
+        if matrix.ndim < 2:
+            matrix = np.array([matrix_])
+        elif matrix.ndim > 2:
+            logging.error("Invalid query dimensions")
+            return [[]]
+
         # resize vectors
-        vector_l = len(matrix[0])
+        vector_l = len(matrix_[0])
         # check if the vector length is below dimention limit
         # then pad vector with 0 by dimension
         if vector_l < dim:
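
Note: the new guard in resize_matrix promotes a single flat vector to a one-row matrix and rejects anything deeper than 2-D. The essential behavior, reduced to a standalone sketch:

    import numpy as np

    matrix_ = [1, 2, 3]                # a bare query vector, ndim == 1
    matrix = np.array(matrix_)
    if matrix.ndim < 2:
        matrix = np.array([matrix_])   # promoted to shape (1, 3)
    print(matrix.shape)                # (1, 3)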
src/test/apis/doc_fns.py (122 changes: 100 additions & 22 deletions)
@@ -3,6 +3,9 @@
 import index # includes preloading databases
 import router
 from utils import CID
+import numpy as np
+
+import time
 
 class TestDocs (unittest.TestCase):
 
@@ -131,7 +134,7 @@ def test_4_doc_fresh_delete (self):
         # create database
         schema_def = {
             "description": "this is my database",
-            "unique": "r8and0mseEd901",
+            "unique": "r8and0mseEd901fr",
             "encoder": "example.com/autoencoder/API",
             "codelen": 3,
             "metadata": {
@@ -146,40 +149,115 @@ def test_4_doc_fresh_delete (self):
 
         self.assertEqual(len(cids), 0, "Doc deletion failed")
 
-    # An existing doc is deleted
-    def test_5_doc_exist_delete (self):
+    # An existing doc is deleted for small dataset
+    def test_5a_doc_exist_delete_small (self):
         # create database
         schema_def = {
             "description": "this is my database",
             "unique": "r8and0mseEd902",
             "encoder": "example.com/autoencoder/API",
-            "codelen": 3,
+            "codelen": 100,
             "metadata": {
                 "name": "string",
                 "age": "number"
             }
         }
         database_name = router.create_database(schema_def)
 
-        # add existing document
-        docs = [{
-            "metadata": {
-                "name":"name1",
-                "age": 20
-            },
-            "code": [1,2,3]
-        }, {
-            "metadata": {
-                "name":"name2",
-                "age": 30
-            },
-            "code": [1,2,3]
-        }]
-        cids = router.insert_docs(docs, database_name)
-        # delete existing documents
-        cids = router.delete_docs(cids, database_name)
-
-        self.assertEqual(len(cids), len(docs), "Doc deletion failed")
+        # add small epoch document
+        docs = []
+        # create special doc
+        matrix_r_spec = np.random.rand(1, 100)
+        docs.append({
+            "metadata": {"name":"special", "age":11},
+            "code": matrix_r_spec[0].tolist()
+        })
+        cids_spec = router.insert_docs(docs, database_name)
+
+        # create other docs
+        for _ in range(90):
+            docs = []
+            # create random matrix
+            matrix_r = np.random.rand(100, 100)
+            # create documents
+            for item in matrix_r:
+                docs.append({
+                    "metadata": {"name":"generic", "age":10},
+                    "code": item.tolist()
+                })
+
+            cids = router.insert_docs(docs, database_name)
+
+        # check for doc existance
+        k = 2
+        docs, dists = router.search([matrix_r_spec[0].tolist()], k, None, database_name)
+        self.assertEqual(docs[0][0]["metadata"]["name"], "special", "Doc doesn't exist")
+
+        # delete special doc
+        cids = router.delete_docs(cids_spec, database_name)
+        time.sleep(10)
+
+        # check for doc existance
+        k = 2
+        docs, dists = router.search([matrix_r_spec[0].tolist()], k, None, database_name)
+        self.assertEqual(len(docs[0]), k, "Doc deletion failed")
+        self.assertEqual(docs[0][0]["metadata"]["name"], "generic", "Doc deletion failed")
+
+    # An existing doc is deleted for large dataset
+    def test_5b_doc_exist_delete_large (self):
+        # create database
+        schema_def = {
+            "description": "this is my database",
+            "unique": "r8and0mseEd902",
+            "encoder": "example.com/autoencoder/API",
+            "codelen": 100,
+            "metadata": {
+                "name": "string",
+                "age": "number"
+            }
+        }
+        database_name = router.create_database(schema_def)
+
+        # add small epoch document
+        docs = []
+        # create special doc
+        matrix_r_spec = np.random.rand(1, 100)
+        docs.append({
+            "metadata": {"name":"special", "age":11},
+            "code": matrix_r_spec[0].tolist()
+        })
+        cids_spec = router.insert_docs(docs, database_name)
+
+        # create other docs
+        for _ in range(120):
+            docs = []
+            # create random matrix
+            matrix_r = np.random.rand(100, 100)
+            # create documents
+            for item in matrix_r:
+                docs.append({
+                    "metadata": {"name":"generic", "age":10},
+                    "code": item.tolist()
+                })
+
+            cids = router.insert_docs(docs, database_name)
+
+        time.sleep(60)
+        # check for doc existance
+        k = 2
+        docs, dists = router.search([matrix_r_spec[0].tolist()], k, None, database_name)
+        self.assertEqual(docs[0][0]["metadata"]["name"], "special", "Doc doesn't exist")
+
+        # delete special doc
+        cids = router.delete_docs(cids_spec, database_name)
+        time.sleep(60)
+
+        # check for doc existance
+        k = 2
+        docs, dists = router.search([matrix_r_spec[0].tolist()], k, None, database_name)
+
+        self.assertEqual(len(docs[0]), k, "Doc deletion failed")
+        self.assertEqual(docs[0][0]["metadata"]["name"], "generic", "Doc deletion failed")
 
     # A non existing DB is used to delete doc
     def test_6_db_fresh_doc_delete (self):
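
Note: both new tests follow the same pattern: insert one distinctive "special" document among thousands of "generic" ones, search for it, delete it, sleep while the background index rebuild runs, then search again and assert the special document no longer ranks first. The fixed sleeps make the tests timing-sensitive; a polling helper along these lines (hypothetical, keyed to the tests' "special"/"generic" naming, not code from this repo) would be less brittle:

    import time

    # Hypothetical: poll until the deleted doc stops ranking first, or time out.
    def wait_until_deleted(router, code, k, database_name, timeout=60):
        deadline = time.time() + timeout
        while time.time() < deadline:
            docs, _ = router.search([code], k, None, database_name)
            if docs[0] and docs[0][0]["metadata"]["name"] != "special":
                return True
            time.sleep(1)
        return False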
src/vec_index/hannoy.py (9 changes: 6 additions & 3 deletions)
@@ -90,13 +90,15 @@ def process(self):
 
             # unbuild index first
             self.a_index.unbuild()
+            len_documents = 0
 
             # fetch all currently available documents from queue
             while not self.pipeline.empty():
                 # extract document & contents
                 qitem = self.pipeline.get_nowait()
                 if qitem["action"] == "add":
                     documents = qitem["docs"]
+                    len_documents += len(documents)
                     for document in documents:
                         _id = document["_id"]
                         vector_e = document["code"]
@@ -110,6 +112,7 @@ def process(self):
                         self.index_disk = np.append(self.index_disk, [vector_e + [int(_id)]], axis=0)
                 elif qitem["action"] == "delete":
                     ids = qitem["ids"]
+                    len_documents += len(ids)
                     # reset
                     zero_ = np.zeros(self.dim + 1)
                     for id_ in ids:
@@ -120,7 +123,7 @@ def process(self):
                     self.index_disk[ids] = zero_
 
                 # take a rest if doc length is > batch_size
-                if len(documents) > self.build_batch_size:
+                if len_documents > self.build_batch_size:
                     break
 
             # build vector
Expand All @@ -132,9 +135,9 @@ def process(self):

def delete_vectors(self, ids):
# add documents to queue
self.pipeline.put({"action":"delete", "docs": ids})
self.pipeline.put({"action":"delete", "ids": ids})

return True
return True, ids

def get_nearest_k(self, matrix, k):
ids = []
Expand Down
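
Note: previously the batch-size check read len(documents), which was stale (or unbound) whenever a queue batch began with a delete item; the new len_documents counter spans both actions, and delete items are now queued under the "ids" key that process() actually reads. A reduced sketch of the counting pattern (drain_batch is illustrative, not this repo's code):

    from queue import Queue

    # Count queued work across "add" and "delete" items so a delete-only
    # batch also triggers the rebuild break.
    def drain_batch(pipeline: Queue, build_batch_size: int):
        batch, len_documents = [], 0
        while not pipeline.empty():
            qitem = pipeline.get_nowait()
            key = "docs" if qitem["action"] == "add" else "ids"
            len_documents += len(qitem[key])
            batch.append(qitem)
            if len_documents > build_batch_size:
                break  # rebuild now; remaining items wait for the next pass
        return batch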
src/vec_index/hfaiss.py (13 changes: 7 additions & 6 deletions)
@@ -124,10 +124,9 @@ def process(self):
                 elif qitem["action"] == "delete":
                     # f_delete = True
                     ids = qitem["ids"]
-                    zero_ = np.zeros(self.dim)
-                    for _id in ids:
-                        ids.append(_id)
-                        vecs.append(zero_)
+                    # remove vectors and add zero reset
+                    self.f_index.remove_ids(np.array(ids).astype('int'))
+                    vecs = np.zeros((len(ids), self.dim))
 
             # if f_add:
             # convert to np matrix
@@ -136,7 +135,7 @@ def process(self):
 
             # Lock index read / wtite until it is built
             with self._lock:
-                # add vector
+                # add vectors
                 self.f_index.add_with_ids(vec_data, id_data)
 
             # if f_delete:
@@ -146,7 +145,9 @@ def process(self):
             self.save_model_to_disk(self.model_location, self.f_index)
 
     def delete_vectors(self, ids):
-
+        # remove vectors
+        self.pipeline.put({"action":"delete", "ids": ids})
+
        return True, ids
 
     def get_nearest_rad(self, matrix, rad):
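
Note: the old delete path appended to ids while iterating over it (a loop that never terminates) and queued zero vectors one at a time; the fix removes the ids from the Faiss index directly and builds the zero-matrix placeholder in one call. A runnable sketch of the same calls against the public faiss API, standalone and outside this repo's wrapper:

    import numpy as np
    import faiss

    dim = 8
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dim))
    index.add_with_ids(np.random.rand(10, dim).astype('float32'),
                       np.arange(10).astype('int64'))

    # Remove stale vectors by id, then re-add zero placeholders so the
    # ids keep resolving, mirroring the change above.
    ids = np.array([3, 7]).astype('int64')
    index.remove_ids(ids)
    index.add_with_ids(np.zeros((len(ids), dim), dtype='float32'), ids)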
