Merge pull request #964 from ManishMadan2882/main
Feature: Token count for vectors
dartpain authored May 27, 2024
2 parents 425803a + c794ea6; commit f6c66f6
Showing 6 changed files with 39 additions and 34 deletions.
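
In short: ingestion now computes a token count for each vector store with tiktoken's cl100k_base encoding, the worker sends that count to the API together with the index files, the API stores it on the index record, and the frontend shows it in a new "Token usage" column. A minimal sketch of the counting step, mirroring the num_tokens_from_string helper added to application/worker.py below (the sample text is illustrative):

import tiktoken

# Encode with the cl100k_base encoding and count the resulting tokens,
# the same way the worker computes the value stored as "tokens".
encoding = tiktoken.get_encoding("cl100k_base")
sample_text = "Example page content joined from every ingested document."
num_tokens = len(encoding.encode(sample_text))
print(num_tokens)
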
application/api/internal/routes.py (2 changes: 2 additions & 0 deletions; file mode 100644 → 100755)
@@ -34,6 +34,7 @@ def upload_index_files():
     if "name" not in request.form:
         return {"status": "no name"}
     job_name = secure_filename(request.form["name"])
+    tokens = secure_filename(request.form["tokens"])
     save_dir = os.path.join(current_dir, "indexes", user, job_name)
     if settings.VECTOR_STORE == "faiss":
         if "file_faiss" not in request.files:
@@ -64,6 +65,7 @@ def upload_index_files():
             "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
             "model": settings.EMBEDDINGS_NAME,
             "type": "local",
+            "tokens": tokens
         }
     )
     return {"status": "ok"}
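
For context, the worker (application/worker.py, further down) supplies the new field as ordinary multipart form data when it uploads the index files. A rough sketch of that call, assuming a FAISS vector store; the URL and file paths are illustrative, not taken from the PR:

from urllib.parse import urljoin

import requests

api_url = "http://localhost:7091"  # assumed local DocsGPT API; the worker uses settings.API_URL
file_data = {"name": "my-docs", "user": "local", "tokens": "12345"}
files = {
    "file_faiss": open("outputs/local/my-docs/index.faiss", "rb"),
    "file_pkl": open("outputs/local/my-docs/index.pkl", "rb"),
}
response = requests.post(urljoin(api_url, "/api/upload_index"), files=files, data=file_data)
print(response.json())  # the endpoint above responds with {"status": "ok"}
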
application/api/user/routes.py (4 changes: 4 additions & 0 deletions)
@@ -253,6 +253,7 @@ def combined_json():
             "docLink": "default",
             "model": settings.EMBEDDINGS_NAME,
             "location": "remote",
+            "tokens":""
         }
     ]
     # structure: name, language, version, description, fullName, date, docLink
@@ -269,6 +270,7 @@ def combined_json():
                 "docLink": index["location"],
                 "model": settings.EMBEDDINGS_NAME,
                 "location": "local",
+                "tokens" : index["tokens"] if ("tokens" in index.keys()) else ""
             }
         )
     if settings.VECTOR_STORE == "faiss":
@@ -290,6 +292,7 @@ def combined_json():
                 "docLink": "duckduck_search",
                 "model": settings.EMBEDDINGS_NAME,
                 "location": "custom",
+                "tokens":""
             }
         )
     if "brave_search" in settings.RETRIEVERS_ENABLED:
@@ -304,6 +307,7 @@ def combined_json():
                 "docLink": "brave_search",
                 "model": settings.EMBEDDINGS_NAME,
                 "location": "custom",
+                "tokens":""
             }
         )
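
A note on the guard used for the local entries: index records created before this change have no tokens key, so the conditional falls back to an empty string rather than raising a KeyError. A dict lookup with a default would behave the same way (shown only as an equivalent, not what the PR uses):

tokens = index.get("tokens", "")  # same fallback for records without a stored count
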

application/parser/open_ai_func.py (31 changes: 0 additions & 31 deletions; file mode 100644 → 100755)
@@ -1,6 +1,5 @@
 import os
 
-import tiktoken
 from application.vectorstore.vector_creator import VectorCreator
 from application.core.settings import settings
 from retry import retry
@@ -11,14 +10,6 @@
 # from langchain_community.embeddings import CohereEmbeddings
 
 
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
-    # Function to convert string to tokens and estimate user cost.
-    encoding = tiktoken.get_encoding(encoding_name)
-    num_tokens = len(encoding.encode(string))
-    total_price = (num_tokens / 1000) * 0.0004
-    return num_tokens, total_price
-
-
 @retry(tries=10, delay=60)
 def store_add_texts_with_retry(store, i):
     store.add_texts([i.page_content], metadatas=[i.metadata])
@@ -79,25 +70,3 @@ def call_openai_api(docs, folder_name, task_status):
         store.save_local(f"{folder_name}")
 
 
-def get_user_permission(docs, folder_name):
-    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
-    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    # docs_content = (" ".join(docs))
-    docs_content = ""
-    for doc in docs:
-        docs_content += doc.page_content
-
-    tokens, total_price = num_tokens_from_string(
-        string=docs_content, encoding_name="cl100k_base"
-    )
-    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
-    print(f"Number of Tokens = {format(tokens, ',d')}")
-    print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    # Here we check for user permission before calling the API.
-    user_input = input("Price Okay? (Y/N) \n").lower()
-    if user_input == "y":
-        call_openai_api(docs, folder_name)
-    elif user_input == "":
-        call_openai_api(docs, folder_name)
-    else:
-        print("The API was not called. No money was spent.")
application/worker.py (31 changes: 28 additions & 3 deletions; file mode 100644 → 100755)
@@ -2,6 +2,7 @@
 import shutil
 import string
 import zipfile
+import tiktoken
 from urllib.parse import urljoin
 
 import requests
@@ -131,6 +132,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
 
     call_openai_api(docs, full_path, self)
+    tokens = count_tokens_docs(docs)
     self.update_state(state="PROGRESS", meta={"current": 100})
 
     if sample:
@@ -139,7 +141,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
 
     # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
     # and send them to the server (provide user and name in form)
-    file_data = {"name": name_job, "user": user}
+    file_data = {"name": name_job, "user": user, "tokens":tokens}
     if settings.VECTOR_STORE == "faiss":
         files = {
             "file_faiss": open(full_path + "/index.faiss", "rb"),
@@ -188,18 +190,19 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
         max_tokens=max_tokens,
         token_check=token_check,
     )
 
     # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
     call_openai_api(docs, full_path, self)
+    tokens = count_tokens_docs(docs)
     self.update_state(state="PROGRESS", meta={"current": 100})
 
     # Proceed with uploading and cleaning as in the original function
-    file_data = {"name": name_job, "user": user}
+    file_data = {"name": name_job, "user": user, "tokens":tokens}
     if settings.VECTOR_STORE == "faiss":
         files = {
             "file_faiss": open(full_path + "/index.faiss", "rb"),
             "file_pkl": open(full_path + "/index.pkl", "rb"),
         }
 
     requests.post(
         urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
     )
@@ -210,3 +213,25 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
         shutil.rmtree(full_path)
 
     return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
+
+
+def count_tokens_docs(docs):
+    # Here we convert the docs list to a string and calculate the number of tokens the string represents.
+    # docs_content = (" ".join(docs))
+    docs_content = ""
+    for doc in docs:
+        docs_content += doc.page_content
+
+    tokens, total_price = num_tokens_from_string(
+        string=docs_content, encoding_name="cl100k_base"
+    )
+    return tokens
+
+
+def num_tokens_from_string(string: str, encoding_name: str) -> int:
+    # Function to convert string to tokens and estimate user cost.
+    encoding = tiktoken.get_encoding(encoding_name)
+    num_tokens = len(encoding.encode(string))
+    total_price = (num_tokens / 1000) * 0.0004
+    return num_tokens, total_price
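
A small usage sketch of the relocated helpers, assuming the application package and tiktoken are importable; the stand-in class below exists only because count_tokens_docs reads nothing but page_content:

from application.worker import count_tokens_docs


class FakeDoc:
    # Minimal stand-in for a langchain Document.
    def __init__(self, page_content):
        self.page_content = page_content


docs = [FakeDoc("First chunk of text."), FakeDoc("Second chunk of text.")]
print(count_tokens_docs(docs))  # total cl100k_base tokens across both chunks
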
frontend/src/models/misc.ts (1 change: 1 addition & 0 deletions)
@@ -13,6 +13,7 @@ export type Doc = {
   date: string;
   docLink: string;
   model: string;
+  tokens?: string;
 };
 
 export type PromptProps = {
frontend/src/settings/Documents.tsx (4 changes: 4 additions & 0 deletions)
@@ -14,6 +14,7 @@ const Documents: React.FC<DocumentsProps> = ({
           <tr>
             <th className="border-r p-4 md:w-[244px]">Document Name</th>
             <th className="w-[244px] border-r px-4 py-2">Vector Date</th>
+            <th className="w-[244px] border-r px-4 py-2">Token usage</th>
             <th className="w-[244px] border-r px-4 py-2">Type</th>
             <th className="px-4 py-2"></th>
           </tr>
@@ -28,6 +29,9 @@ const Documents: React.FC<DocumentsProps> = ({
                 <td className="border-r border-t px-4 py-2">
                   {document.date}
                 </td>
+                <td className="border-r border-t px-4 py-2">
+                  {document.tokens ? document.tokens : ''}
+                </td>
                 <td className="border-r border-t px-4 py-2">
                   {document.location === 'remote'
                     ? 'Pre-loaded'
