Skip to content

Commit

Permalink
Feat/add unstructured (#32)
Browse files Browse the repository at this point in the history
* Add support for unstructured

* Small tweaks

* Merge simjak changes

* feat: Embeddings pipeline improvements (#33)

* feat: Embeddings pipeline improvements

* fix: Reranking

* fix: Pinecone delete

* feat: Added strategy option

* chore: Merging

* chore: Merging

* walkthrough

* chore: Merging

---------

Co-authored-by: Ismail Pelaseyed <[email protected]>

* Fix formatting

* fix: Weaviate fix (#36)

* fix: Weaviate fix

* chore: Change method name

* Fix formatting

* Fix issue with summarization

* Fix qdrant ingestion

* Fix support for qdrant

* Fix AstraDB querying

---------

Co-authored-by: Simonas Jakubonis <[email protected]>
  • Loading branch information
homanp and simjak committed Feb 14, 2024
1 parent 8945d5d commit 064dc32
Show file tree
Hide file tree
Showing 18 changed files with 3,521 additions and 617 deletions.
10 changes: 7 additions & 3 deletions api/delete.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from fastapi import APIRouter

from models.delete import RequestPayload, ResponsePayload
from service.embedding import get_encoder
from service.vector_database import VectorService, get_vector_service

router = APIRouter()


@router.delete("/delete", response_model=ResponsePayload)
async def delete(payload: RequestPayload):
    """Delete the stored data for one file from a vector index.

    Args:
        payload: Request body. Its ``encoder``, ``index_name``,
            ``vector_database`` and ``file_url`` fields are read here.

    Returns:
        A ``ResponsePayload`` with ``success=True`` and, as ``data``,
        whatever ``vector_service.delete`` returned.
    """
    encoder = get_encoder(encoder_type=payload.encoder)
    # Each keyword is passed exactly once; the encoder is forwarded to the
    # vector service together with the index name and credentials.
    vector_service: VectorService = get_vector_service(
        index_name=payload.index_name,
        credentials=payload.vector_database,
        encoder=encoder,
    )
    # Single delete call and single return, so the response always carries
    # the ``data`` field declared on the response model.
    data = await vector_service.delete(file_url=payload.file_url)
    return ResponsePayload(success=True, data=data)
21 changes: 7 additions & 14 deletions api/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,20 @@ async def ingest(payload: RequestPayload) -> Dict:
index_name=payload.index_name,
vector_credentials=payload.vector_database,
)
documents = await embedding_service.generate_documents()
chunks = await embedding_service.generate_chunks(documents=documents)

chunks = await embedding_service.generate_chunks()
encoder = get_encoder(encoder_type=payload.encoder)

summary_documents = await embedding_service.generate_summary_documents(
documents=documents
)
chunks, summary_chunks = await asyncio.gather(
embedding_service.generate_chunks(documents=documents),
embedding_service.generate_chunks(documents=summary_documents),
documents=chunks
)

await asyncio.gather(
embedding_service.generate_embeddings(
nodes=chunks, encoder=encoder, index_name=payload.index_name
embedding_service.generate_and_upsert_embeddings(
documents=chunks, encoder=encoder, index_name=payload.index_name
),
embedding_service.generate_embeddings(
nodes=summary_chunks,
embedding_service.generate_and_upsert_embeddings(
documents=summary_documents,
encoder=encoder,
index_name=f"{payload.index_name}-{SUMMARY_SUFFIX}",
index_name=f"{payload.index_name}{SUMMARY_SUFFIX}",
),
)

Expand Down
12 changes: 9 additions & 3 deletions api/query.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from fastapi import APIRouter

from models.query import RequestPayload, ResponsePayload
from models.query import RequestPayload, ResponseData, ResponsePayload
from service.router import query as _query

router = APIRouter()


@router.post("/query", response_model=ResponsePayload)
async def query(payload: RequestPayload):
    """Run a query and return the matching chunks.

    Args:
        payload: Request body, forwarded unchanged to ``service.router.query``.

    Returns:
        A dict with ``success`` set to ``True`` and ``data`` holding one
        ``ResponseData`` per chunk returned by the query service.
    """
    chunks = await _query(payload=payload)
    # Copy only the three fields exposed by the API from each chunk.
    response_data = [
        ResponseData(
            content=chunk.content, doc_url=chunk.doc_url, page_number=chunk.page_number
        )
        for chunk in chunks
    ]
    return {"success": True, "data": response_data}
34 changes: 27 additions & 7 deletions dev/embedding.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"\n",
"file = File(\n",
" type=FileType.pdf,\n",
" url=\"https://arxiv.org/pdf/2402.05131.pdf\"\n",
" url=\"https://arxiv.org/pdf/2210.03629.pdf\"\n",
")\n",
"vector_credentials = {\n",
" \"type\": \"pinecone\",\n",
Expand All @@ -40,7 +40,7 @@
"metadata": {},
"outputs": [],
"source": [
"docs = await embedding_service.generate_documents()"
"elements = await embedding_service._download_and_extract_elements(file, strategy=\"auto\")\n"
]
},
{
Expand All @@ -49,7 +49,27 @@
"metadata": {},
"outputs": [],
"source": [
"chunks = await embedding_service.generate_chunks(docs)"
"for element in elements:\n",
" print(type(element))\n",
" # print(f\"Text: {element.text}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"docs = await embedding_service.generate_chunks(strategy=\"auto\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"texts = [doc.content for doc in docs]"
]
},
{
Expand All @@ -62,13 +82,13 @@
"\n",
"concatenated_document = \"\"\n",
"\n",
"for i, chunk in enumerate(chunks):\n",
"for i, chunk in enumerate(texts):\n",
" color = colors[i % len(colors)]\n",
" colored_text = colored(chunk.text, color)\n",
" colored_text = colored(chunk, color)\n",
" print(colored_text)\n",
" concatenated_document += chunk.text + \" \"\n",
" concatenated_document += chunk + \" \"\n",
"\n",
"print(\"\\nConcatenated Document:\\n\", concatenated_document)"
"# print(\"\\nConcatenated Document:\\n\", concatenated_document)"
]
},
{
Expand Down
118 changes: 113 additions & 5 deletions dev/walkthrough.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,40 @@
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"openai\",\n",
" \"encoder\": \"cohere\",\n",
"}\n",
"\n",
"response = requests.post(url, json=payload)\n",
"\n",
"print(response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ingest a file\n",
"url = f\"{API_URL}/api/v1/ingest\"\n",
"\n",
"payload = {\n",
" \"files\": [\n",
" {\n",
" \"type\": \"PDF\",\n",
" \"url\": \"https://arxiv.org/pdf/2402.05131.pdf\"\n",
" }\n",
" ],\n",
" \"vector_database\": {\n",
" \"type\": \"weaviate\",\n",
" \"config\": {\n",
" \"api_key\": \"9eXH8oNR0uqN3GvvzAgaUD11ltPnGqZG2RFQ\",\n",
" \"host\": \"https://superagent-ragas-1575sjfq.weaviate.network\"\n",
" }\n",
" },\n",
" \"index_name\": \"homanp11\",\n",
" \"encoder\": \"cohere\",\n",
" \"webhook_url\": \"https://webhook.site/0e217d1c-49f1-424a-9992-497db09f7793\"\n",
"}\n",
"\n",
"response = requests.post(url, json=payload)\n",
Expand All @@ -64,7 +97,7 @@
"query_url = f\"{API_URL}/api/v1/query\"\n",
"\n",
"query_payload = {\n",
" \"input\": \"What is the best chunk strategy?\",\n",
" \"input\": \"What are the chunking strategies?\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
Expand All @@ -73,12 +106,59 @@
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"openai\",\n",
" \"encoder\": \"cohere\",\n",
"}\n",
"\n",
"query_response = requests.post(query_url, json=query_payload)\n",
"\n",
"print(query_response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Query the index\n",
"query_url = f\"{API_URL}/api/v1/query\"\n",
"\n",
"query_payload = {\n",
" \"input\": \"What are the chunking strategies?\",\n",
" \"vector_database\": {\n",
" \"type\": \"weaviate\",\n",
" \"config\": {\n",
" \"api_key\": \"9eXH8oNR0uqN3GvvzAgaUD11ltPnGqZG2RFQ\",\n",
" \"host\": \"https://superagent-ragas-1575sjfq.weaviate.network\"\n",
" }\n",
" },\n",
" \"index_name\": \"homanp11\",\n",
" \"encoder\": \"cohere\",\n",
"}\n",
"\n",
"query_response = requests.post(query_url, json=query_payload)\n",
"\n",
"print(query_response.json())\n"
"print(query_response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = query_response.json().get('data', [])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = query_response.json().get('data', [])\n",
"data"
]
},
{
Expand All @@ -91,7 +171,7 @@
"query_url = f\"{API_URL}/api/v1/delete\"\n",
"\n",
"delete_payload = {\n",
" \"file_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"file_url\": \"https://arxiv.org/pdf/2210.03629.pdf\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
Expand All @@ -100,6 +180,34 @@
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"cohere\",\n",
"}\n",
"\n",
"delete_response = requests.delete(query_url, json=delete_payload)\n",
"\n",
"print(delete_response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Delete the index\n",
"query_url = f\"{API_URL}/api/v1/delete\"\n",
"\n",
"delete_payload = {\n",
" \"file_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"vector_database\": {\n",
" \"type\": \"weaviate\",\n",
" \"config\": {\n",
" \"api_key\": \"9eXH8oNR0uqN3GvvzAgaUD11ltPnGqZG2RFQ\",\n",
" \"host\": \"https://superagent-ragas-1575sjfq.weaviate.network\"\n",
" }\n",
" },\n",
" \"index_name\": \"homanp11\",\n",
" \"encoder\": \"cohere\"\n",
"}\n",
"\n",
"delete_response = requests.delete(query_url, json=delete_payload)\n",
Expand Down
2 changes: 0 additions & 2 deletions encoders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from encoders.base import BaseEncoder
from encoders.bm25 import BM25Encoder
from encoders.cohere import CohereEncoder
from encoders.fastembed import FastEmbedEncoder
from encoders.huggingface import HuggingFaceEncoder
from encoders.openai import OpenAIEncoder

Expand All @@ -10,6 +9,5 @@
"CohereEncoder",
"OpenAIEncoder",
"BM25Encoder",
"FastEmbedEncoder",
"HuggingFaceEncoder",
]
1 change: 1 addition & 0 deletions encoders/cohere.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
class CohereEncoder(BaseEncoder):
client: Optional[cohere.Client] = None
type: str = "cohere"
dimension: int = 1024 # https://docs.cohere.com/reference/embed

def __init__(
self,
Expand Down
51 changes: 0 additions & 51 deletions encoders/fastembed.py

This file was deleted.

7 changes: 7 additions & 0 deletions models/delete.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
from pydantic import BaseModel

from models.ingest import EncoderEnum
from models.vector_database import VectorDatabase


class RequestPayload(BaseModel):
    """Request body for the delete endpoint."""

    # Name of the vector index to operate on.
    index_name: str
    # URL of the file whose stored data should be deleted.
    file_url: str
    # Vector database settings (see models.vector_database.VectorDatabase).
    vector_database: VectorDatabase
    # Encoder selection (see models.ingest.EncoderEnum).
    encoder: EncoderEnum


class DeleteResponse(BaseModel):
    """Result details reported for a delete request."""

    # Presumably the number of chunks removed by the delete call;
    # TODO confirm against the vector service implementations.
    num_of_deleted_chunks: int


class ResponsePayload(BaseModel):
    """Response body for the delete endpoint."""

    # Success flag for the request.
    success: bool
    # Details of the delete result.
    data: DeleteResponse
Loading

0 comments on commit 064dc32

Please sign in to comment.