feat: Using cosine similarity (#4124)

# Description This PR changes the `l2_norm` distance to the cosine similarity for vector search. This change can improve results for similarity searches and also for least-similarity searches. This [PR](#4023) must be reviewed first. Closes #4123 **Type of change** (Please delete options that are not relevant. Remember to title the PR according to the type of change) - [ ] New feature (non-breaking change which adds functionality) - [ ] Refactor (change restructuring the codebase without changing functionality) - [X] Improvement (change adding some improvement to an existing functionality) **How Has This Been Tested** (Please describe the tests that you ran to verify your changes. And ideally, reference `tests`) The base dataset has been used with both Elasticsearch and OpenSearch to verify this change. **Checklist** - [ ] I added relevant documentation - [ ] I followed the style guidelines of this project - [ ] I did a self-review of my code - [ ] I made corresponding changes to the documentation - [ ] My changes generate no new warnings - [ ] I have added tests that prove my fix is effective or that my feature works - [ ] I filled out [the contributor form](https://tally.so/r/n9XrxK) (see text above) - [ ] I have added relevant notes to the `CHANGELOG.md` file (See https://keepachangelog.com/) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
argilla-io · Nov 3, 2023 · bdb2871 · bdb2871
1 parent bb6104e
commit bdb2871
Show file tree

Hide file tree

Showing 6 changed files with 18 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -53,6 +53,7 @@ These are the section headers that we use:
 - Update `POST /api/v1/me/datasets/:dataset_id/records/search` endpoint to allow to search records with vectors. ([#4019](https://github.com/argilla-io/argilla/pull/4019))
 - [breaking] Users working with OpenSearch engines must use version >=2.5 and set `ARGILLA_SEARCH_ENGINE=opensearch`. ([#4019](https://github.com/argilla-io/argilla/pull/4019))
 - Update `PATCH /api/v1/datasets/:dataset_id` endpoint to allow updating `allow_extra_metadata` attribute. ([#4112](https://github.com/argilla-io/argilla/pull/4112))
+- Using cosine similarity to compute similarity between vectors. ([#4124](https://github.com/argilla-io/argilla/pull/4124))
 
 ## [1.18.0](https://github.com/argilla-io/argilla/compare/v1.17.0...v1.18.0)
 

diff --git a/src/argilla/server/search_engine/elasticsearch.py b/src/argilla/server/search_engine/elasticsearch.py
@@ -73,7 +73,8 @@ def _mapping_for_vector_settings(self, vector_settings: VectorSettings) -> dict:
                 "index": True,
                 # can similarity property also be part of config @frascuchon ?
                 # relates vector search similarity metric
-                "similarity": "l2_norm",  ## default value regarding the knn best practices es documentation
+                # "similarity": "l2_norm",  ## default value regarding the knn best practices es documentation
+                "similarity": "cosine",
             }
         }
 

diff --git a/src/argilla/server/search_engine/opensearch.py b/src/argilla/server/search_engine/opensearch.py
@@ -66,7 +66,7 @@ def _mapping_for_vector_settings(self, vector_settings: VectorSettings) -> dict:
                 "method": {
                     "name": "hnsw",
                     "engine": "lucene",  # See https://opensearch.org/blog/Expanding-k-NN-with-Lucene-aNN/
-                    "space_type": "l2",
+                    "space_type": "cosinesimil",
                     "parameters": {"m": 2, "ef_construction": 4},
                 },
             }

diff --git a/tests/integration/client/feedback/dataset/remote/test_find_similar.py b/tests/integration/client/feedback/dataset/remote/test_find_similar.py
@@ -39,15 +39,15 @@ def tests_find_similar_records_using_value(self, owner: User, feedback_dataset:
 
         feedback_dataset.add_records(
             [
-                FeedbackRecord(external_id="0", fields={"text": "hello"}, vectors={"vector": [1, 1]}),
-                FeedbackRecord(external_id="1", fields={"text": "hello"}, vectors={"vector": [2, 2]}),
-                FeedbackRecord(external_id="2", fields={"text": "hello"}, vectors={"vector": [4, 4]}),
+                FeedbackRecord(external_id="0", fields={"text": "hello"}, vectors={"vector": [1, 2]}),
+                FeedbackRecord(external_id="1", fields={"text": "hello"}, vectors={"vector": [3, 4]}),
+                FeedbackRecord(external_id="2", fields={"text": "hello"}, vectors={"vector": [5, 6]}),
             ]
         )
 
         remote = feedback_dataset.push_to_argilla("test_find_similar_records", workspace=workspace)
 
-        records_with_scores = remote.find_similar_records(vector_name="vector", value=[1, 1], max_results=2)
+        records_with_scores = remote.find_similar_records(vector_name="vector", value=[1, 2], max_results=2)
         assert len(records_with_scores) == 2
 
         record, score = records_with_scores[0]
@@ -66,9 +66,9 @@ def test_find_similar_records_using_record(self, owner: User, feedback_dataset:
 
         feedback_dataset.add_records(
             [
-                FeedbackRecord(external_id="0", fields={"text": "hello"}, vectors={"vector": [1, 1]}),
-                FeedbackRecord(external_id="1", fields={"text": "hello"}, vectors={"vector": [2, 2]}),
-                FeedbackRecord(external_id="2", fields={"text": "hello"}, vectors={"vector": [4, 4]}),
+                FeedbackRecord(external_id="0", fields={"text": "hello"}, vectors={"vector": [1, 2]}),
+                FeedbackRecord(external_id="1", fields={"text": "hello"}, vectors={"vector": [3, 4]}),
+                FeedbackRecord(external_id="2", fields={"text": "hello"}, vectors={"vector": [5, 6]}),
             ]
         )
 
@@ -95,17 +95,17 @@ def test_find_similar_combining_filters(self, owner: User, feedback_dataset: Fee
 
         feedback_dataset.add_records(
             [
-                FeedbackRecord(external_id="0", fields={"text": "hello"}, vectors={"vector": [1, 1]}),
+                FeedbackRecord(external_id="0", fields={"text": "hello"}, vectors={"vector": [1, 2]}),
                 FeedbackRecord(
                     external_id="1",
                     fields={"text": "hello"},
-                    vectors={"vector": [2, 2]},
+                    vectors={"vector": [3, 4]},
                     responses=[ResponseSchema(status="discarded")],
                 ),
                 FeedbackRecord(
                     external_id="2",
                     fields={"text": "hello"},
-                    vectors={"vector": [4, 4]},
+                    vectors={"vector": [5, 6]},
                     responses=[ResponseSchema(status="submitted", values={"question": {"value": "answer"}})],
                 ),
             ]
@@ -116,9 +116,10 @@ def test_find_similar_combining_filters(self, owner: User, feedback_dataset: Fee
 
         records_with_scores = only_submitted_and_discarded_records.find_similar_records(
             vector_name="vector",
-            value=[1, 1],
+            value=[1, 2],
             max_results=3,
         )
+
         assert len(records_with_scores) == 2
 
         record, score = records_with_scores[0]

diff --git a/tests/unit/server/search_engine/test_elasticsearch_engine.py b/tests/unit/server/search_engine/test_elasticsearch_engine.py
@@ -696,7 +696,7 @@ async def test_create_dataset_index_with_vectors(
                 "type": "dense_vector",
                 "dims": settings.dimensions,
                 "index": True,
-                "similarity": "l2_norm",
+                "similarity": "cosine",
             }
             for settings in vectors_settings
         }

diff --git a/tests/unit/server/search_engine/test_opensearch_engine.py b/tests/unit/server/search_engine/test_opensearch_engine.py
@@ -1072,7 +1072,7 @@ async def test_create_dataset_index_with_vectors(self, opensearch_engine: OpenSe
                 "dimension": settings.dimensions,
                 "method": {
                     "engine": "lucene",
-                    "space_type": "l2",
+                    "space_type": "cosinesimil",
                     "name": "hnsw",
                     "parameters": {"ef_construction": 4, "m": 2},
                 },