Skip to content

Commit

Permalink
Add ann search
Browse files Browse the repository at this point in the history
Signed-off-by: junjie.jiang <[email protected]>
  • Loading branch information
junjiejiangjjj committed Jul 10, 2023
1 parent bf02e4c commit b828db7
Show file tree
Hide file tree
Showing 4 changed files with 152 additions and 1 deletion.
5 changes: 5 additions & 0 deletions towhee/doc/source/operator/hub_ops.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ HubOps
:show-inheritance:
:member-order: bysource

.. autoclass:: towhee.runtime.hub_ops.ann_search.AnnSearch
:members:
:show-inheritance:
:member-order: bysource

2 changes: 1 addition & 1 deletion towhee/runtime/hub_ops/ann_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ class AnnInsert:
p = (
pipe.input('collection_name', 'vec')
.map(('collection_name', 'vec'), (), ops.ann_insert.osschat_milvus(host='127.0.0.1', port='19530'))
.map(('collection_name', 'vec'), (), ops.ann_insert.milvus_multi_collections(host='127.0.0.1', port='19530'))
.output()
)
Expand Down
139 changes: 139 additions & 0 deletions towhee/runtime/hub_ops/ann_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Copyright 2023 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any
from towhee.runtime.factory import HubOp


class AnnSearch:
    """
    The ANN search operator is used to find the closest (or most similar)
    point to a given point in a given set, i.e. find similar embeddings.
    """

    # NOTE: the operator name follows the same '<namespace>.<operator>' form as the
    # other entries of this class (no leading 'ops.') and matches the attribute name.
    faiss_index: HubOp = HubOp('ann_search.faiss_index')
    """
    Only for local test. If you want to use a vector database in a production environment,
    you can use Milvus(https://github.com/milvus-io/milvus).

    __init__(self, data_dir: str, top_k: int = 5)
        data_dir(`str`):
            Path to store data.
        top_k(`int`):
            Number of most similar results to return.

    __call__(self, query: 'ndarray') -> List[Tuple[id: int, score: float, meta: dict]]
        query(`ndarray`):
            query embedding

    Example:

    .. code-block:: python

        from towhee import pipe, ops

        p = (
            pipe.input('vec')
            .flat_map('vec', 'rows', ops.ann_search.faiss_index('./data_dir', 5))
            .map('rows', ('id', 'score'), lambda x: (x[0], x[1]))
            .output('id', 'score')
        )

        p(<your-vector>)
    """

    milvus_client: HubOp = HubOp('ann_search.milvus_client')
    """
    Search embedding in Milvus, please make sure you have inserted data to Milvus Collection.

    __init__(self, host: str = 'localhost', port: int = 19530, collection_name: str = None,
             user: str = None, password: str = None, **kwargs)
        host(`str`):
            The host for Milvus.
        port(`int`):
            The port for Milvus.
        collection_name(`str`):
            The collection name for Milvus.
        user(`str`):
            The user for Zilliz Cloud, defaults to None.
        password(`str`):
            The password for Zilliz Cloud, defaults to None.
        kwargs(`dict`):
            The same with pymilvus search: https://milvus.io/docs/search.md

    __call__(self, query: 'ndarray') -> List[Tuple]
        query(`ndarray`):
            query embedding

    Example:

    .. code-block:: python

        from towhee import pipe, ops, DataCollection

        p = (
            pipe.input('text')
            .map('text', 'vec', ops.sentence_embedding.transformers(model_name='all-MiniLM-L12-v2'))
            .flat_map('vec', 'rows', ops.ann_search.milvus_client(host='127.0.0.1', port='19530',
                                                                  collection_name='text_db2',
                                                                  **{'output_fields': ['text']}))
            .map('rows', ('id', 'score', 'text'), lambda x: (x[0], x[1], x[2]))
            .output('id', 'score', 'text')
        )

        DataCollection(p('cat')).show()
    """

    milvus_multi_collections: HubOp = HubOp('ann_search.osschat_milvus')
    """
    `milvus_multi_collections <https://towhee.io/ann-search/osschat-milvus>`_ A client that can access multiple collections.

    __init__(self, host: str = 'localhost', port: int = 19530,
             user: str = None, password: str = None, **kwargs):
        host(`str`):
            The host for Milvus.
        port(`int`):
            The port for Milvus.
        user(`str`):
            The user for Zilliz Cloud, defaults to None.
        password(`str`):
            The password for Zilliz Cloud, defaults to None.
        kwargs(`dict`):
            The same with pymilvus search: https://milvus.io/docs/search.md

    __call__(self, collection_name: str, query: 'ndarray') -> List[Tuple]
        collection_name(`str`):
            The collection name for Milvus.
        query(`ndarray`):
            query embedding

    Example:

    .. code-block:: python

        from towhee import pipe, ops, DataCollection

        p = (
            pipe.input('collection_name', 'text')
            .map('text', 'vec', ops.sentence_embedding.transformers(model_name='all-MiniLM-L12-v2'))
            .flat_map(('collection_name', 'vec'), 'rows',
                      ops.ann_search.milvus_multi_collections(host='127.0.0.1', port='19530',
                                                              **{'output_fields': ['text']}))
            .map('rows', ('id', 'score', 'text'), lambda x: (x[0], x[1], x[2]))
            .output('id', 'score', 'text')
        )

        DataCollection(p('text_db2', 'cat')).show()
    """

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """
        Make the namespace itself callable, i.e. ``ops.ann_search(...)``.

        A fresh ``HubOp('towhee.ann_search')`` is created and invoked with
        every positional and keyword argument forwarded unchanged; whatever
        that call returns is returned to the caller.
        """
        return HubOp('towhee.ann_search')(*args, **kwds)
7 changes: 7 additions & 0 deletions towhee/runtime/hub_ops/operator_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .sentence_embedding import SentenceEmbedding
from .data_source import DataSource
from .ann_insert import AnnInsert
from .ann_search import AnnSearch


class Ops:
Expand Down Expand Up @@ -73,6 +74,12 @@ class Ops:
The ANN Insert Operator is used to insert embeddings and create ANN indexes for fast similarity searches.
"""

ann_search: AnnSearch = AnnSearch()
"""
The ANN search operator is used to find the closest (or most similar)
point to a given point in a given set, i.e. find similar embeddings.
"""

@classmethod
def __getattr__(cls, name):
@ops_parse
Expand Down

0 comments on commit b828db7

Please sign in to comment.