From f1cf2d6258c368876e044329ddc8a414141eef71 Mon Sep 17 00:00:00 2001 From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com> Date: Wed, 24 Apr 2024 21:17:08 +0530 Subject: [PATCH 1/7] feat(embedding_function): added capability to use spacy models --- chromadb/utils/embedding_functions.py | 35 +++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index 3f0a1ce043b..c911bedac38 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -372,6 +372,41 @@ def __call__(self, input: Documents) -> Embeddings: return cast(Embeddings, self._model.encode(texts_with_instructions).tolist()) +class SpacyEmbeddingFunction(): + def __init__(self, model_name: str = "lg"): + try: + import spacy + except ImportError: + raise ValueError( + "The spacy python package is not installed. Please install it with `pip install spacy`" + ) + self._model_name = model_name + + try: + self._nlp = spacy.load("en_core_web_{model}".format(model=self._model_name)) + except OSError: + raise ValueError( + "spacy models are not downloaded, please download them using `spacy download en-core-web-lg or en-core-web-md`" + ) + + def __call__(self, input: Documents) -> Embeddings: + """ + Get the embeddings for a list of texts. + + Args: + input (Documents): A list of texts to get embeddings for. + + Returns: + Embeddings: The embeddings for the texts. + + Example: + >>> spacy_fn = SpacyEmbeddingFunction(model_name="md") + >>> input = ["Hello, world!", "How are you?"] + >>> embeddings = spacy_fn(input) + """ + + return [self._nlp(doc).vector for doc in input] + # In order to remove dependencies on sentence-transformers, which in turn depends on # pytorch and sentence-piece we have created a default ONNX embedding function that # implements the same functionality as "all-MiniLM-L6-v2" from sentence-transformers. 
From 294f478307348b2ef0548f37343a4137e5577cc1 Mon Sep 17 00:00:00 2001 From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com> Date: Wed, 24 Apr 2024 21:21:03 +0530 Subject: [PATCH 2/7] fix: fixed bug regarding the EmbeddingFunction --- chromadb/utils/embedding_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index c911bedac38..19369c8b868 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -372,7 +372,7 @@ def __call__(self, input: Documents) -> Embeddings: return cast(Embeddings, self._model.encode(texts_with_instructions).tolist()) -class SpacyEmbeddingFunction(): +class SpacyEmbeddingFunction(EmbeddingFunction[Documents]): def __init__(self, model_name: str = "lg"): try: import spacy From 76f2df048862122e86c43fc8cc37f8842b0dbf40 Mon Sep 17 00:00:00 2001 From: Vishnunkumar Date: Sun, 28 Apr 2024 14:04:21 +0530 Subject: [PATCH 3/7] refactor: added unittests and modified embedding_functions.py based on test cases --- chromadb/test/ef/test_spacy_ef.py | 24 ++++++++++++++++++++++++ chromadb/utils/embedding_functions.py | 5 +++-- 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 chromadb/test/ef/test_spacy_ef.py diff --git a/chromadb/test/ef/test_spacy_ef.py b/chromadb/test/ef/test_spacy_ef.py new file mode 100644 index 00000000000..0892fcf3b24 --- /dev/null +++ b/chromadb/test/ef/test_spacy_ef.py @@ -0,0 +1,24 @@ +import pytest +from chromadb.utils.embedding_functions import SpacyEmbeddingFunction + +input_list = ["great work by the guy", "Super man is that guy"] +model_name = "md" +unknown_model = "unknown_model" + + +def test_spacyembeddingfunction_isnotnone_wheninputisnotnone(): + spacy_emb_fn = SpacyEmbeddingFunction(model_name) + assert spacy_emb_fn(input_list) is not None + + +def test_spacyembddingfunction_throwserror_whenmodel_notfound(): + with pytest.raises(ValueError, 
+ match='spacy models are not downloaded, ' + 'please download them using ' + '`spacy download en-core-web-lg or en-core-web-md`'): + SpacyEmbeddingFunction(unknown_model) + + +def test_spacyembddingfunction_isembedding_wheninput_islist(): + spacy_emb_fn = SpacyEmbeddingFunction(model_name) + assert type(spacy_emb_fn(input_list)) is list diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index 83c592d8ea1..54ae84eed9c 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -386,7 +386,8 @@ def __init__(self, model_name: str = "lg"): self._nlp = spacy.load("en_core_web_{model}".format(model=self._model_name)) except OSError: raise ValueError( - "spacy models are not downloaded, please download them using `spacy download en-core-web-lg or en-core-web-md`" + "spacy models are not downloaded, please download them using `spacy download en-core-web-lg or " + "en-core-web-md`" ) def __call__(self, input: Documents) -> Embeddings: @@ -405,7 +406,7 @@ def __call__(self, input: Documents) -> Embeddings: >>> embeddings = spacy_fn(input) """ - return [self._nlp(doc).vector for doc in input] + return cast(Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input]) # In order to remove dependencies on sentence-transformers, which in turn depends on # pytorch and sentence-piece we have created a default ONNX embedding function that From e89d0a5cfa809795102b7c5583b49898643842a6 Mon Sep 17 00:00:00 2001 From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com> Date: Wed, 1 May 2024 17:04:14 +0530 Subject: [PATCH 4/7] fix: update spacy models and its relevant unit tests. 
--- chromadb/test/ef/test_spacy_ef.py | 10 ++++++---- chromadb/utils/embedding_functions.py | 12 +++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/chromadb/test/ef/test_spacy_ef.py b/chromadb/test/ef/test_spacy_ef.py index 0892fcf3b24..34ddd36e0ae 100644 --- a/chromadb/test/ef/test_spacy_ef.py +++ b/chromadb/test/ef/test_spacy_ef.py @@ -2,7 +2,7 @@ from chromadb.utils.embedding_functions import SpacyEmbeddingFunction input_list = ["great work by the guy", "Super man is that guy"] -model_name = "md" +model_name = "en_core_web_md" unknown_model = "unknown_model" @@ -13,9 +13,11 @@ def test_spacyembeddingfunction_isnotnone_wheninputisnotnone(): def test_spacyembddingfunction_throwserror_whenmodel_notfound(): with pytest.raises(ValueError, - match='spacy models are not downloaded, ' - 'please download them using ' - '`spacy download en-core-web-lg or en-core-web-md`'): + match="""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout + for the list of models from: https://spacy.io/usage/models. By default the module will load en_core_web_lg + model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md` + if you want to priortize efficiency over accuracy, the same logic applies for models from other languages also. 
+ language_web_core_sm and language_web_core_trf doesn't have pre-trained embeddings."""): SpacyEmbeddingFunction(unknown_model) diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index 54ae84eed9c..c9a27dc4a46 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -373,7 +373,7 @@ def __call__(self, input: Documents) -> Embeddings: class SpacyEmbeddingFunction(EmbeddingFunction[Documents]): - def __init__(self, model_name: str = "lg"): + def __init__(self, model_name: str = "en_core_web_lg"): try: import spacy except ImportError: @@ -383,11 +383,14 @@ def __init__(self, model_name: str = "lg"): self._model_name = model_name try: - self._nlp = spacy.load("en_core_web_{model}".format(model=self._model_name)) + self._nlp = spacy.load("{model}".format(model=self._model_name)) except OSError: raise ValueError( - "spacy models are not downloaded, please download them using `spacy download en-core-web-lg or " - "en-core-web-md`" + """spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout + for the list of models from: https://spacy.io/usage/models. By default the module will load en_core_web_lg + model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md` + if you want to priortize efficiency over accuracy, the same logic applies for models from other languages also. 
+ language_web_core_sm and language_web_core_trf doesn't have pre-trained embeddings.""" ) def __call__(self, input: Documents) -> Embeddings: @@ -406,7 +409,6 @@ def __call__(self, input: Documents) -> Embeddings: >>> embeddings = spacy_fn(input) """ - return cast(Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input]) # In order to remove dependencies on sentence-transformers, which in turn depends on # pytorch and sentence-piece we have created a default ONNX embedding function that From 79d761779a0aaf4009fc54e1fce0d400be6ebc20 Mon Sep 17 00:00:00 2001 From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com> Date: Wed, 1 May 2024 17:47:59 +0530 Subject: [PATCH 5/7] fix: bug on the return statement with SpacyEmbeddings. --- chromadb/utils/embedding_functions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index c9a27dc4a46..52c64ea3d93 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -409,6 +409,7 @@ def __call__(self, input: Documents) -> Embeddings: >>> embeddings = spacy_fn(input) """ + return cast(Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input]) # In order to remove dependencies on sentence-transformers, which in turn depends on # pytorch and sentence-piece we have created a default ONNX embedding function that From 1a17390ccdb7fef812c74346354e0bd17fd50f2f Mon Sep 17 00:00:00 2001 From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com> Date: Mon, 6 May 2024 05:58:44 +0000 Subject: [PATCH 6/7] tests: refactored test cases as per comments. 
--- chromadb/test/ef/test_spacy_ef.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/chromadb/test/ef/test_spacy_ef.py b/chromadb/test/ef/test_spacy_ef.py index 34ddd36e0ae..a87897c2728 100644 --- a/chromadb/test/ef/test_spacy_ef.py +++ b/chromadb/test/ef/test_spacy_ef.py @@ -1,9 +1,11 @@ import pytest +import numpy from chromadb.utils.embedding_functions import SpacyEmbeddingFunction input_list = ["great work by the guy", "Super man is that guy"] model_name = "en_core_web_md" unknown_model = "unknown_model" +spacy = pytest.importorskip("spacy", reason="spacy not installed") def test_spacyembeddingfunction_isnotnone_wheninputisnotnone(): @@ -13,14 +15,18 @@ def test_spacyembeddingfunction_isnotnone_wheninputisnotnone(): def test_spacyembddingfunction_throwserror_whenmodel_notfound(): with pytest.raises(ValueError, - match="""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout - for the list of models from: https://spacy.io/usage/models. By default the module will load en_core_web_lg - model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md` - if you want to priortize efficiency over accuracy, the same logic applies for models from other languages also. 
- language_web_core_sm and language_web_core_trf doesn't have pre-trained embeddings."""): + match=r"""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout + for the list of models from: https://spacy.io/usage/models."""): SpacyEmbeddingFunction(unknown_model) def test_spacyembddingfunction_isembedding_wheninput_islist(): spacy_emb_fn = SpacyEmbeddingFunction(model_name) assert type(spacy_emb_fn(input_list)) is list + + +def test_spacyembeddingfunction_returnslistoflistsofloats(): + spacy_emb_fn = SpacyEmbeddingFunction(model_name) + expected_output = spacy_emb_fn(input_list) + assert type(expected_output[0]) is list + assert type(expected_output[0][0]) is numpy.float64 \ No newline at end of file From 0eea17f777dff1d182d8281d7323630e55e27d51 Mon Sep 17 00:00:00 2001 From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com> Date: Thu, 9 May 2024 10:26:26 +0000 Subject: [PATCH 7/7] style: fixed code style with regards to pre-commit checks --- chromadb/test/ef/test_spacy_ef.py | 10 ++++++---- chromadb/utils/embedding_functions.py | 11 +++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/chromadb/test/ef/test_spacy_ef.py b/chromadb/test/ef/test_spacy_ef.py index a87897c2728..155fde78b68 100644 --- a/chromadb/test/ef/test_spacy_ef.py +++ b/chromadb/test/ef/test_spacy_ef.py @@ -14,9 +14,11 @@ def test_spacyembeddingfunction_isnotnone_wheninputisnotnone(): def test_spacyembddingfunction_throwserror_whenmodel_notfound(): - with pytest.raises(ValueError, - match=r"""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout - for the list of models from: https://spacy.io/usage/models."""): + with pytest.raises( + ValueError, + match=r"""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout + for the list of models from: https://spacy.io/usage/models.""", + ): 
SpacyEmbeddingFunction(unknown_model) @@ -29,4 +31,4 @@ def test_spacyembeddingfunction_returnslistoflistsofloats(): spacy_emb_fn = SpacyEmbeddingFunction(model_name) expected_output = spacy_emb_fn(input_list) assert type(expected_output[0]) is list - assert type(expected_output[0][0]) is numpy.float64 \ No newline at end of file + assert type(expected_output[0][0]) is numpy.float64 diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index 52c64ea3d93..65b1ff6a6a0 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -381,14 +381,14 @@ def __init__(self, model_name: str = "en_core_web_lg"): "The spacy python package is not installed. Please install it with `pip install spacy`" ) self._model_name = model_name - - try: + + try: self._nlp = spacy.load("{model}".format(model=self._model_name)) except OSError: raise ValueError( """spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout for the list of models from: https://spacy.io/usage/models. By default the module will load en_core_web_lg - model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md` + model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md` if you want to priortize efficiency over accuracy, the same logic applies for models from other languages also. 
language_web_core_sm and language_web_core_trf doesn't have pre-trained embeddings.""" ) @@ -409,7 +409,10 @@ def __call__(self, input: Documents) -> Embeddings: >>> embeddings = spacy_fn(input) """ - return cast(Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input]) + return cast( + Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input] + ) + # In order to remove dependencies on sentence-transformers, which in turn depends on # pytorch and sentence-piece we have created a default ONNX embedding function that