This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit
feat: add jina embedding models (#757)
* feat: add embedding models

* chore: bump stubs and commons

* chore: bump to 0.7.9

* chore: add changelog

* feat: add cos sim to finetuner

* docs: add new backbones
bwanglzu committed Jul 11, 2023
1 parent 25ac807 commit fa9d73a
Showing 6 changed files with 35 additions and 18 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -10,10 +10,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

+- Add Jina embeddings suite. ([#757](https://github.com/jina-ai/finetuner/pull/757))

+- Add `cos_sim` helper to finetuner. ([#757](https://github.com/jina-ai/finetuner/pull/757))

### Removed

### Changed

+- Finetuner now always installs Torch and other dependencies. ([#757](https://github.com/jina-ai/finetuner/pull/757))

### Fixed

### Docs
4 changes: 1 addition & 3 deletions README.md
@@ -149,9 +149,7 @@ Make sure you have Python 3.8+ installed. Finetuner can be installed via `pip` b
pip install -U finetuner
```

-If you want to encode local data with the `finetuner.encode` function, you need to install
-`"finetuner[full]"`. This includes a number of additional dependencies, which are necessary for encoding: Torch,
-Torchvision and OpenCLIP:
+If you want to submit a fine-tuning job on the cloud, please use

```bash
pip install "finetuner[full]"
```
21 changes: 12 additions & 9 deletions docs/walkthrough/choose-backbone.md
@@ -45,15 +45,18 @@ to get a list of supported models:

````{tab} text-to-text
```bash
-Finetuner backbones: text-to-text
-┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
-┃ name ┃ task ┃ output_dim ┃ architecture ┃ description ┃
-┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
-│ bert-base-en │ text-to-text │ 768 │ transformer │ BERT model pre-trained on BookCorpus and English Wikipedia │
-│ bert-base-multi │ text-to-text │ 768 │ transformer │ BERT model pre-trained on multilingual Wikipedia │
-│ distiluse-base-multi │ text-to-text │ 512 │ transformer │ Knowledge distilled version of the multilingual Universal Sentence Encoder │
-│ sbert-base-en │ text-to-text │ 768 │ transformer │ Pretrained BERT, fine-tuned on MS Marco │
-└──────────────────────┴──────────────┴────────────┴──────────────┴────────────────────────────────────────────────────────────────────────────┘
+Finetuner backbones: text-to-text
+┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ name ┃ task ┃ output_dim ┃ architecture ┃ description ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ jina-embedding-s-en-v1 │ text-to-text │ 512 │ transformer │ Text embedding model trained using Linnaeus-Clean dataset by Jina AI │
+│ jina-embedding-b-en-v1 │ text-to-text │ 768 │ transformer │ Text embedding model trained using Linnaeus-Clean dataset by Jina AI │
+│ jina-embedding-l-en-v1 │ text-to-text │ 1024 │ transformer │ Text embedding model trained using Linnaeus-Clean dataset by Jina AI │
+│ bert-base-en │ text-to-text │ 768 │ transformer │ BERT model pre-trained on BookCorpus and English Wikipedia │
+│ bert-base-multi │ text-to-text │ 768 │ transformer │ BERT model pre-trained on multilingual Wikipedia │
+│ distiluse-base-multi │ text-to-text │ 512 │ transformer │ Knowledge distilled version of the multilingual Sentence Encoder │
+│ sbert-base-en │ text-to-text │ 768 │ transformer │ Pretrained BERT, fine-tuned on MS Marco │
+└────────────────────────┴──────────────┴────────────┴──────────────┴─────────────────────────────────────────────────────────────────────────┘
```
````
````{tab} image-to-image
````
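The tables above are the output of Finetuner's model-listing helper. As a minimal sketch of how to reproduce them — assuming `finetuner.describe_models` and its `task` filter behave as the surrounding docs page describes:

```python
import finetuner

# Print the table of supported text-to-text backbones,
# which now includes the jina-embedding-* models added in this PR.
finetuner.describe_models(task='text-to-text')
```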
12 changes: 11 additions & 1 deletion finetuner/__init__.py
@@ -4,6 +4,7 @@
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TextIO, Union
from urllib.parse import urlparse

+import numpy as np
from _finetuner.runner.stubs import model as model_stub
from docarray import Document, DocumentArray # noqa F401

@@ -33,7 +34,6 @@
from finetuner.model import list_model_classes

if TYPE_CHECKING:
-    import numpy as np
    from _finetuner.models.inference import InferenceEngine

ft = Finetuner()
@@ -669,3 +669,13 @@ def encode(
        batch.embeddings = output.detach().cpu().numpy()

    return data if return_da else data.embeddings


+def cos_sim(a: np.ndarray, b: np.ndarray) -> float:
+    """Cosine similarity between two vectors.
+    :param a: The first vector.
+    :param b: The second vector.
+    :return: Cosine similarity between two vectors.
+    """
+    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
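Together with the new backbones, the `cos_sim` helper enables a simple end-to-end similarity check. A minimal sketch, assuming `build_model` and `encode` work as in the existing Finetuner API and using a backbone name from the table above:

```python
import finetuner

# Build an inference model from one of the newly added Jina embedding
# backbones and encode two sentences locally.
model = finetuner.build_model('jina-embedding-s-en-v1')
embeddings = finetuner.encode(
    model=model,
    data=['How is the weather today?', 'What is the current weather like today?'],
)

# Compare the two embeddings with the new cos_sim helper.
print(finetuner.cos_sim(embeddings[0], embeddings[1]))
```

Note that `cos_sim` divides by the product of the vector norms, so both inputs must be non-zero vectors.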
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,5 +1,5 @@
[metadata]
-version = 0.7.8
+version = 0.7.9

[flake8]
# E501 is too long lines - ignore as black takes care of that
8 changes: 4 additions & 4 deletions setup.py
@@ -28,13 +28,13 @@
    setup_requires=['setuptools>=18.0', 'wheel'],
    install_requires=[
        'docarray[common]<0.30.0',
-        'trimesh==3.16.4',
-        'finetuner-stubs==0.13.7',
-        'jina-hubble-sdk==0.33.1',
+        'finetuner-stubs==0.13.9',
+        'finetuner-commons==0.13.9',
    ],
    extras_require={
        'full': [
-            'finetuner-commons==0.13.7',
+            'jina-hubble-sdk==0.33.1',
+            'trimesh==3.16.4',
        ],
        'test': [
            'black==23.3.0',
