From f1cf2d6258c368876e044329ddc8a414141eef71 Mon Sep 17 00:00:00 2001
From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com>
Date: Wed, 24 Apr 2024 21:17:08 +0530
Subject: [PATCH 1/7] feat(embedding_function): added capability to use spacy
 models

---
 chromadb/utils/embedding_functions.py | 35 +++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py
index 3f0a1ce043b..c911bedac38 100644
--- a/chromadb/utils/embedding_functions.py
+++ b/chromadb/utils/embedding_functions.py
@@ -372,6 +372,41 @@ def __call__(self, input: Documents) -> Embeddings:
         return cast(Embeddings, self._model.encode(texts_with_instructions).tolist())
 
 
+class SpacyEmbeddingFunction():
+    def __init__(self, model_name: str = "lg"):
+        try:
+            import spacy
+        except ImportError:
+            raise ValueError(
+                "The spacy python package is not installed. Please install it with `pip install spacy`"
+            )
+        self._model_name = model_name
+        
+        try: 
+            self._nlp = spacy.load("en_core_web_{model}".format(model=self._model_name))
+        except OSError:
+            raise ValueError(
+                "spacy models are not downloaded, please download them using `spacy download en-core-web-lg or en-core-web-md`"
+            )
+
+    def __call__(self, input: Documents) -> Embeddings:
+        """
+        Get the embeddings for a list of texts.
+
+        Args:
+            input (Documents): A list of texts to get embeddings for.
+
+        Returns:
+            Embeddings: The embeddings for the texts.
+
+        Example:
+            >>> spacy_fn = SpacyEmbeddingFunction(model_name="md")
+            >>> input = ["Hello, world!", "How are you?"]
+            >>> embeddings = spacy_fn(input)
+        """
+
+        return [self._nlp(doc).vector for doc in input]
+
 # In order to remove dependencies on sentence-transformers, which in turn depends on
 # pytorch and sentence-piece we have created a default ONNX embedding function that
 # implements the same functionality as "all-MiniLM-L6-v2" from sentence-transformers.

From 294f478307348b2ef0548f37343a4137e5577cc1 Mon Sep 17 00:00:00 2001
From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com>
Date: Wed, 24 Apr 2024 21:21:03 +0530
Subject: [PATCH 2/7] fix: make SpacyEmbeddingFunction inherit from EmbeddingFunction[Documents]

---
 chromadb/utils/embedding_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py
index c911bedac38..19369c8b868 100644
--- a/chromadb/utils/embedding_functions.py
+++ b/chromadb/utils/embedding_functions.py
@@ -372,7 +372,7 @@ def __call__(self, input: Documents) -> Embeddings:
         return cast(Embeddings, self._model.encode(texts_with_instructions).tolist())
 
 
-class SpacyEmbeddingFunction():
+class SpacyEmbeddingFunction(EmbeddingFunction[Documents]):
     def __init__(self, model_name: str = "lg"):
         try:
             import spacy

From 76f2df048862122e86c43fc8cc37f8842b0dbf40 Mon Sep 17 00:00:00 2001
From: Vishnunkumar <vishnunkumar25@gmail.com>
Date: Sun, 28 Apr 2024 14:04:21 +0530
Subject: [PATCH 3/7] refactor: added unit tests and modified
 embedding_functions.py based on test cases

---
 chromadb/test/ef/test_spacy_ef.py     | 24 ++++++++++++++++++++++++
 chromadb/utils/embedding_functions.py |  5 +++--
 2 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 chromadb/test/ef/test_spacy_ef.py

diff --git a/chromadb/test/ef/test_spacy_ef.py b/chromadb/test/ef/test_spacy_ef.py
new file mode 100644
index 00000000000..0892fcf3b24
--- /dev/null
+++ b/chromadb/test/ef/test_spacy_ef.py
@@ -0,0 +1,24 @@
+import pytest
+from chromadb.utils.embedding_functions import SpacyEmbeddingFunction
+
+input_list = ["great work by the guy", "Super man is that guy"]
+model_name = "md"
+unknown_model = "unknown_model"
+
+
+def test_spacyembeddingfunction_isnotnone_wheninputisnotnone():
+    spacy_emb_fn = SpacyEmbeddingFunction(model_name)
+    assert spacy_emb_fn(input_list) is not None
+
+
+def test_spacyembddingfunction_throwserror_whenmodel_notfound():
+    with pytest.raises(ValueError,
+                       match='spacy models are not downloaded, '
+                             'please download them using '
+                             '`spacy download en-core-web-lg or en-core-web-md`'):
+        SpacyEmbeddingFunction(unknown_model)
+
+
+def test_spacyembddingfunction_isembedding_wheninput_islist():
+    spacy_emb_fn = SpacyEmbeddingFunction(model_name)
+    assert type(spacy_emb_fn(input_list)) is list
diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py
index 83c592d8ea1..54ae84eed9c 100644
--- a/chromadb/utils/embedding_functions.py
+++ b/chromadb/utils/embedding_functions.py
@@ -386,7 +386,8 @@ def __init__(self, model_name: str = "lg"):
             self._nlp = spacy.load("en_core_web_{model}".format(model=self._model_name))
         except OSError:
             raise ValueError(
-                "spacy models are not downloaded, please download them using `spacy download en-core-web-lg or en-core-web-md`"
+                "spacy models are not downloaded, please download them using `spacy download en-core-web-lg or "
+                "en-core-web-md`"
             )
 
     def __call__(self, input: Documents) -> Embeddings:
@@ -405,7 +406,7 @@ def __call__(self, input: Documents) -> Embeddings:
             >>> embeddings = spacy_fn(input)
         """
 
-        return [self._nlp(doc).vector for doc in input]
+        return cast(Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input])
 
 # In order to remove dependencies on sentence-transformers, which in turn depends on
 # pytorch and sentence-piece we have created a default ONNX embedding function that

From e89d0a5cfa809795102b7c5583b49898643842a6 Mon Sep 17 00:00:00 2001
From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com>
Date: Wed, 1 May 2024 17:04:14 +0530
Subject: [PATCH 4/7] fix: accept full spacy model names and update the related unit tests.

---
 chromadb/test/ef/test_spacy_ef.py     | 10 ++++++----
 chromadb/utils/embedding_functions.py | 12 +++++++-----
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/chromadb/test/ef/test_spacy_ef.py b/chromadb/test/ef/test_spacy_ef.py
index 0892fcf3b24..34ddd36e0ae 100644
--- a/chromadb/test/ef/test_spacy_ef.py
+++ b/chromadb/test/ef/test_spacy_ef.py
@@ -2,7 +2,7 @@
 from chromadb.utils.embedding_functions import SpacyEmbeddingFunction
 
 input_list = ["great work by the guy", "Super man is that guy"]
-model_name = "md"
+model_name = "en_core_web_md"
 unknown_model = "unknown_model"
 
 
@@ -13,9 +13,11 @@ def test_spacyembeddingfunction_isnotnone_wheninputisnotnone():
 
 def test_spacyembddingfunction_throwserror_whenmodel_notfound():
     with pytest.raises(ValueError,
-                       match='spacy models are not downloaded, '
-                             'please download them using '
-                             '`spacy download en-core-web-lg or en-core-web-md`'):
+                       match="""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout
+                for the list of models from: https://spacy.io/usage/models. By default the module will load en_core_web_lg
+                model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md` 
+                if you want to priortize efficiency over accuracy, the same logic applies for models from other languages also.
+                language_web_core_sm and language_web_core_trf doesn't have pre-trained embeddings."""):
         SpacyEmbeddingFunction(unknown_model)
 
 
diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py
index 54ae84eed9c..c9a27dc4a46 100644
--- a/chromadb/utils/embedding_functions.py
+++ b/chromadb/utils/embedding_functions.py
@@ -373,7 +373,7 @@ def __call__(self, input: Documents) -> Embeddings:
 
 
 class SpacyEmbeddingFunction(EmbeddingFunction[Documents]):
-    def __init__(self, model_name: str = "lg"):
+    def __init__(self, model_name: str = "en_core_web_lg"):
         try:
             import spacy
         except ImportError:
@@ -383,11 +383,14 @@ def __init__(self, model_name: str = "lg"):
         self._model_name = model_name
         
         try: 
-            self._nlp = spacy.load("en_core_web_{model}".format(model=self._model_name))
+            self._nlp = spacy.load("{model}".format(model=self._model_name))
         except OSError:
             raise ValueError(
-                "spacy models are not downloaded, please download them using `spacy download en-core-web-lg or "
-                "en-core-web-md`"
+                """spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout
+                for the list of models from: https://spacy.io/usage/models. By default the module will load en_core_web_lg
+                model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md` 
+                if you want to priortize efficiency over accuracy, the same logic applies for models from other languages also.
+                language_web_core_sm and language_web_core_trf doesn't have pre-trained embeddings."""
             )
 
     def __call__(self, input: Documents) -> Embeddings:
@@ -406,7 +409,6 @@ def __call__(self, input: Documents) -> Embeddings:
             >>> embeddings = spacy_fn(input)
         """
 
-        return cast(Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input])
 
 # In order to remove dependencies on sentence-transformers, which in turn depends on
 # pytorch and sentence-piece we have created a default ONNX embedding function that

From 79d761779a0aaf4009fc54e1fce0d400be6ebc20 Mon Sep 17 00:00:00 2001
From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com>
Date: Wed, 1 May 2024 17:47:59 +0530
Subject: [PATCH 5/7] fix: restore the return statement of SpacyEmbeddingFunction.__call__ removed in the previous commit.

---
 chromadb/utils/embedding_functions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py
index c9a27dc4a46..52c64ea3d93 100644
--- a/chromadb/utils/embedding_functions.py
+++ b/chromadb/utils/embedding_functions.py
@@ -409,6 +409,7 @@ def __call__(self, input: Documents) -> Embeddings:
             >>> embeddings = spacy_fn(input)
         """
 
+        return cast(Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input])
 
 # In order to remove dependencies on sentence-transformers, which in turn depends on
 # pytorch and sentence-piece we have created a default ONNX embedding function that

From 1a17390ccdb7fef812c74346354e0bd17fd50f2f Mon Sep 17 00:00:00 2001
From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com>
Date: Mon, 6 May 2024 05:58:44 +0000
Subject: [PATCH 6/7] tests: refactored test cases as per comments.

---
 chromadb/test/ef/test_spacy_ef.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/chromadb/test/ef/test_spacy_ef.py b/chromadb/test/ef/test_spacy_ef.py
index 34ddd36e0ae..a87897c2728 100644
--- a/chromadb/test/ef/test_spacy_ef.py
+++ b/chromadb/test/ef/test_spacy_ef.py
@@ -1,9 +1,11 @@
 import pytest
+import numpy
 from chromadb.utils.embedding_functions import SpacyEmbeddingFunction
 
 input_list = ["great work by the guy", "Super man is that guy"]
 model_name = "en_core_web_md"
 unknown_model = "unknown_model"
+spacy = pytest.importorskip("spacy", reason="spacy not installed")
 
 
 def test_spacyembeddingfunction_isnotnone_wheninputisnotnone():
@@ -13,14 +15,18 @@ def test_spacyembeddingfunction_isnotnone_wheninputisnotnone():
 
 def test_spacyembddingfunction_throwserror_whenmodel_notfound():
     with pytest.raises(ValueError,
-                       match="""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout
-                for the list of models from: https://spacy.io/usage/models. By default the module will load en_core_web_lg
-                model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md` 
-                if you want to priortize efficiency over accuracy, the same logic applies for models from other languages also.
-                language_web_core_sm and language_web_core_trf doesn't have pre-trained embeddings."""):
+                       match=r"""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout
+                for the list of models from: https://spacy.io/usage/models."""):
         SpacyEmbeddingFunction(unknown_model)
 
 
 def test_spacyembddingfunction_isembedding_wheninput_islist():
     spacy_emb_fn = SpacyEmbeddingFunction(model_name)
     assert type(spacy_emb_fn(input_list)) is list
+
+
+def test_spacyembeddingfunction_returnslistoflistsofloats():
+    spacy_emb_fn = SpacyEmbeddingFunction(model_name)
+    expected_output = spacy_emb_fn(input_list)
+    assert type(expected_output[0]) is list
+    assert type(expected_output[0][0]) is numpy.float64
\ No newline at end of file

From 0eea17f777dff1d182d8281d7323630e55e27d51 Mon Sep 17 00:00:00 2001
From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com>
Date: Thu, 9 May 2024 10:26:26 +0000
Subject: [PATCH 7/7] style: fixed code style with regards to pre-commit checks

---
 chromadb/test/ef/test_spacy_ef.py     | 10 ++++++----
 chromadb/utils/embedding_functions.py | 11 +++++++----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/chromadb/test/ef/test_spacy_ef.py b/chromadb/test/ef/test_spacy_ef.py
index a87897c2728..155fde78b68 100644
--- a/chromadb/test/ef/test_spacy_ef.py
+++ b/chromadb/test/ef/test_spacy_ef.py
@@ -14,9 +14,11 @@ def test_spacyembeddingfunction_isnotnone_wheninputisnotnone():
 
 
 def test_spacyembddingfunction_throwserror_whenmodel_notfound():
-    with pytest.raises(ValueError,
-                       match=r"""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout
-                for the list of models from: https://spacy.io/usage/models."""):
+    with pytest.raises(
+        ValueError,
+        match=r"""spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout
+                for the list of models from: https://spacy.io/usage/models.""",
+    ):
         SpacyEmbeddingFunction(unknown_model)
 
 
@@ -29,4 +31,4 @@ def test_spacyembeddingfunction_returnslistoflistsofloats():
     spacy_emb_fn = SpacyEmbeddingFunction(model_name)
     expected_output = spacy_emb_fn(input_list)
     assert type(expected_output[0]) is list
-    assert type(expected_output[0][0]) is numpy.float64
\ No newline at end of file
+    assert type(expected_output[0][0]) is numpy.float64
diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py
index 52c64ea3d93..65b1ff6a6a0 100644
--- a/chromadb/utils/embedding_functions.py
+++ b/chromadb/utils/embedding_functions.py
@@ -381,14 +381,14 @@ def __init__(self, model_name: str = "en_core_web_lg"):
                 "The spacy python package is not installed. Please install it with `pip install spacy`"
             )
         self._model_name = model_name
-        
-        try: 
+
+        try:
             self._nlp = spacy.load("{model}".format(model=self._model_name))
         except OSError:
             raise ValueError(
                 """spacy models are not downloaded yet, please download them using `spacy download model_name`, Please checkout
                 for the list of models from: https://spacy.io/usage/models. By default the module will load en_core_web_lg
-                model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md` 
+                model as it optimizes accuracy and has embeddings in-built, please download and load with `en_core_web_md`
                 if you want to priortize efficiency over accuracy, the same logic applies for models from other languages also.
                 language_web_core_sm and language_web_core_trf doesn't have pre-trained embeddings."""
             )
@@ -409,7 +409,10 @@ def __call__(self, input: Documents) -> Embeddings:
             >>> embeddings = spacy_fn(input)
         """
 
-        return cast(Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input])
+        return cast(
+            Embeddings, [list(self._nlp(doc).vector.astype("float")) for doc in input]
+        )
+
 
 # In order to remove dependencies on sentence-transformers, which in turn depends on
 # pytorch and sentence-piece we have created a default ONNX embedding function that