Merge pull request #964 from ManishMadan2882/main
Feature: Token count for vectors
dartpain authored May 27, 2024
2 parents 425803a + c794ea6; commit f6c66f6
Showing 6 changed files with 39 additions and 34 deletions.
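
In short: ingestion now computes a token count for each vector store with tiktoken's cl100k_base encoding, the worker sends that count to the API together with the index files, the API stores it on the index record, and the frontend shows it in a new "Token usage" column. A minimal sketch of the counting step, mirroring the num_tokens_from_string helper added to application/worker.py below (the sample text is illustrative):

import tiktoken

# Encode with the cl100k_base encoding and count the resulting tokens,
# the same way the worker computes the value stored as "tokens".
encoding = tiktoken.get_encoding("cl100k_base")
sample_text = "Example page content joined from every ingested document."
num_tokens = len(encoding.encode(sample_text))
print(num_tokens)
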
application/api/internal/routes.py (2 changes: 2 additions & 0 deletions; file mode 100644 → 100755)
@@ -34,6 +34,7 @@ def upload_index_files():
     if "name" not in request.form:
         return {"status": "no name"}
     job_name = secure_filename(request.form["name"])
+    tokens = secure_filename(request.form["tokens"])
     save_dir = os.path.join(current_dir, "indexes", user, job_name)
     if settings.VECTOR_STORE == "faiss":
         if "file_faiss" not in request.files:
@@ -64,6 +65,7 @@ def upload_index_files():
             "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
             "model": settings.EMBEDDINGS_NAME,
             "type": "local",
+            "tokens": tokens
         }
     )
     return {"status": "ok"}
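
For context, the worker (application/worker.py, further down) supplies the new field as ordinary multipart form data when it uploads the index files. A rough sketch of that call, assuming a FAISS vector store; the URL and file paths are illustrative, not taken from the PR:

from urllib.parse import urljoin

import requests

api_url = "http://localhost:7091"  # assumed local DocsGPT API; the worker uses settings.API_URL
file_data = {"name": "my-docs", "user": "local", "tokens": "12345"}
files = {
    "file_faiss": open("outputs/local/my-docs/index.faiss", "rb"),
    "file_pkl": open("outputs/local/my-docs/index.pkl", "rb"),
}
response = requests.post(urljoin(api_url, "/api/upload_index"), files=files, data=file_data)
print(response.json())  # the endpoint above responds with {"status": "ok"}
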
application/api/user/routes.py (4 changes: 4 additions & 0 deletions)
@@ -253,6 +253,7 @@ def combined_json():
             "docLink": "default",
             "model": settings.EMBEDDINGS_NAME,
             "location": "remote",
+            "tokens":""
         }
     ]
     # structure: name, language, version, description, fullName, date, docLink
@@ -269,6 +270,7 @@ def combined_json():
                 "docLink": index["location"],
                 "model": settings.EMBEDDINGS_NAME,
                 "location": "local",
+                "tokens" : index["tokens"] if ("tokens" in index.keys()) else ""
             }
         )
     if settings.VECTOR_STORE == "faiss":
@@ -290,6 +292,7 @@ def combined_json():
                 "docLink": "duckduck_search",
                 "model": settings.EMBEDDINGS_NAME,
                 "location": "custom",
+                "tokens":""
             }
         )
     if "brave_search" in settings.RETRIEVERS_ENABLED:
@@ -304,6 +307,7 @@ def combined_json():
                 "docLink": "brave_search",
                 "model": settings.EMBEDDINGS_NAME,
                 "location": "custom",
+                "tokens":""
             }
         )
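
A note on the guard used for the local entries: index records created before this change have no tokens key, so the conditional falls back to an empty string rather than raising a KeyError. A dict lookup with a default would behave the same way (shown only as an equivalent, not what the PR uses):

tokens = index.get("tokens", "")  # same fallback for records without a stored count
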

application/parser/open_ai_func.py (31 changes: 0 additions & 31 deletions; file mode 100644 → 100755)
@@ -1,6 +1,5 @@
 import os
 
-import tiktoken
 from application.vectorstore.vector_creator import VectorCreator
 from application.core.settings import settings
 from retry import retry
@@ -11,14 +10,6 @@
 # from langchain_community.embeddings import CohereEmbeddings
 
 
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price
-
-
 @retry(tries=10, delay=60)
 def store_add_texts_with_retry(store, i):
     store.add_texts([i.page_content], metadatas=[i.metadata])
@@ -79,25 +70,3 @@ def call_openai_api(docs, folder_name, task_status):
         store.save_local(f"{folder_name}")
 
 
-def get_user_permission(docs, folder_name):
-    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
-    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    # docs_content = (" ".join(docs))
-    docs_content = ""
-    for doc in docs:
-        docs_content += doc.page_content
-
-    tokens, total_price = num_tokens_from_string(
-        string=docs_content, encoding_name="cl100k_base"
-    )
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    print(f"Number of Tokens = {format(tokens, ',d')}")
-    print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    # Here we check for user permission before calling the API.
-    user_input = input("Price Okay? (Y/N) \n").lower()
-    if user_input == "y":
-        call_openai_api(docs, folder_name)
-    elif user_input == "":
-        call_openai_api(docs, folder_name)
-    else:
-        print("The API was not called. No money was spent.")
application/worker.py (31 changes: 28 additions & 3 deletions; file mode 100644 → 100755)
@@ -2,6 +2,7 @@
 import shutil
 import string
 import zipfile
+import tiktoken
 from urllib.parse import urljoin
 
 import requests
@@ -131,6 +132,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
 
     call_openai_api(docs, full_path, self)
+    tokens = count_tokens_docs(docs)
     self.update_state(state="PROGRESS", meta={"current": 100})
 
     if sample:
@@ -139,7 +141,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
 
     # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
     # and send them to the server (provide user and name in form)
-    file_data = {"name": name_job, "user": user}
+    file_data = {"name": name_job, "user": user, "tokens":tokens}
     if settings.VECTOR_STORE == "faiss":
         files = {
             "file_faiss": open(full_path + "/index.faiss", "rb"),
@@ -188,18 +190,19 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
         max_tokens=max_tokens,
         token_check=token_check,
     )
 
     # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
     call_openai_api(docs, full_path, self)
+    tokens = count_tokens_docs(docs)
     self.update_state(state="PROGRESS", meta={"current": 100})
 
     # Proceed with uploading and cleaning as in the original function
-    file_data = {"name": name_job, "user": user}
+    file_data = {"name": name_job, "user": user, "tokens":tokens}
     if settings.VECTOR_STORE == "faiss":
         files = {
             "file_faiss": open(full_path + "/index.faiss", "rb"),
             "file_pkl": open(full_path + "/index.pkl", "rb"),
         }
 
     requests.post(
         urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
     )
@@ -210,3 +213,25 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
         shutil.rmtree(full_path)
 
     return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
+
+
+def count_tokens_docs(docs):
+    # Here we convert the docs list to a string and calculate the number of tokens the string represents.
+    # docs_content = (" ".join(docs))
+    docs_content = ""
+    for doc in docs:
+        docs_content += doc.page_content
+
+    tokens, total_price = num_tokens_from_string(
+        string=docs_content, encoding_name="cl100k_base"
+    )
+    return tokens
+
+
+def num_tokens_from_string(string: str, encoding_name: str) -> int:
+    # Function to convert string to tokens and estimate user cost.
+    encoding = tiktoken.get_encoding(encoding_name)
+    num_tokens = len(encoding.encode(string))
+    total_price = (num_tokens / 1000) * 0.0004
+    return num_tokens, total_price
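
A small usage sketch of the relocated helpers, assuming the application package and tiktoken are importable; the stand-in class below exists only because count_tokens_docs reads nothing but page_content:

from application.worker import count_tokens_docs


class FakeDoc:
    # Minimal stand-in for a langchain Document.
    def __init__(self, page_content):
        self.page_content = page_content


docs = [FakeDoc("First chunk of text."), FakeDoc("Second chunk of text.")]
print(count_tokens_docs(docs))  # total cl100k_base tokens across both chunks
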
frontend/src/models/misc.ts (1 change: 1 addition & 0 deletions)
@@ -13,6 +13,7 @@ export type Doc = {
   date: string;
   docLink: string;
   model: string;
+  tokens?: string;
 };
 
 export type PromptProps = {
frontend/src/settings/Documents.tsx (4 changes: 4 additions & 0 deletions)
@@ -14,6 +14,7 @@ const Documents: React.FC<DocumentsProps> = ({
           <tr>
             <th className="border-r p-4 md:w-[244px]">Document Name</th>
             <th className="w-[244px] border-r px-4 py-2">Vector Date</th>
+            <th className="w-[244px] border-r px-4 py-2">Token usage</th>
             <th className="w-[244px] border-r px-4 py-2">Type</th>
             <th className="px-4 py-2"></th>
           </tr>
@@ -28,6 +29,9 @@ const Documents: React.FC<DocumentsProps> = ({
                 <td className="border-r border-t px-4 py-2">
                   {document.date}
                 </td>
+                <td className="border-r border-t px-4 py-2">
+                  {document.tokens ? document.tokens : ''}
+                </td>
                 <td className="border-r border-t px-4 py-2">
                   {document.location === 'remote'
                     ? 'Pre-loaded'
