Skip to content

Commit

Permalink
Added GoogleDrive-Pinecone Sample
Browse files Browse the repository at this point in the history
  • Loading branch information
Raj725 committed Nov 15, 2024
1 parent efcc3fb commit d17484f
Show file tree
Hide file tree
Showing 15 changed files with 666 additions and 0 deletions.
5 changes: 5 additions & 0 deletions pebblo_safeloader/langchain/identity-rag-pinecone/.env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
PEBBLO_CLASSIFIER_URL="<PEBBLO-SERVER-HOST:PEBBLO-SERVER-PORT>"
OPENAI_API_KEY="<OPENAI_API_KEY>"
GOOGLE_APPLICATION_CREDENTIALS="<PATH_TO_GOOGLE_APPLICATION_CREDENTIALS>"
# Vector DB Config
PINECONE_API_KEY="<PINECONE_API_KEY>"
2 changes: 2 additions & 0 deletions pebblo_safeloader/langchain/identity-rag-pinecone/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
credentials
google_token.json
42 changes: 42 additions & 0 deletions pebblo_safeloader/langchain/identity-rag-pinecone/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
## PebbloSafeLoader sample app with Identity and Semantic metadata

### Prerequisites

1. Sign up and set up your account on Pinecone (https://www.pinecone.io/).

### Instructions

1. Create Python virtual-env

```console
$ python3 -m venv .venv
$ source .venv/bin/activate
```

2. Install dependencies

```console
$ pip3 install -r requirements.txt
```

3. Copy the `.env.sample` file to `.env` and populate the necessary environment variables.

> Note: You need to set `PEBBLO_CLASSIFIER_URL` only if your `Pebblo Server` is running somewhere other than the default URL
> of `http://localhost:8000`.

4. Update the `pebblo_safeload.py` file with the following details:

- _folder_id_: Google Drive folder ID where the documents are stored


5. Run PebbloSafeLoader sample app

```console
$ python3 pebblo_safeload.py
```

6. Retrieve the Pebblo PDF report from `$HOME/.pebblo/pebblo-identity-sematic-loader-pinecone/pebblo_report.pdf` on the system
where `Pebblo Server` is running.

7. To access the Pebblo UI, point the browser to `http://localhost:8000/pebblo` or `host:port/pebblo` if you are running the server on a different
host.
103 changes: 103 additions & 0 deletions pebblo_safeloader/langchain/identity-rag-pinecone/pebblo_safeload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
Identity enabled SafeLoader app using Pebblo and Pinecone VectorDB.
This app loads documents from Google Drive and adds them to Pinecone VectorDB.
"""

import os
from pathlib import Path

from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredFileIOLoader
from langchain_community.document_loaders.pebblo import PebbloSafeLoader
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from langchain_google_community import GoogleDriveLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from pinecone_index import create_pinecone_index
from utils import describe_pebblo_semantic_stats

# Merge variables from a local .env file (if present) into the process
# environment before any setting is read.
load_dotenv()

# Pinecone API key taken from the environment; None when the variable is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")


class IdentityBasedDataLoader:
    """
    Identity enabled SafeLoader app using Pebblo and Pinecone VectorDB.

    Loads documents from a Google Drive folder through PebbloSafeLoader and
    adds them to a Pinecone index.

    Args:
        folder_id (str): Google Drive folder id
        index_name (str): Index name for Pinecone
    """

    def __init__(self, folder_id: str, index_name: str):
        # Passed to PebbloSafeLoader as the (mandatory) app name.
        self.app_name = "pebblo-identity-sematic-loader-pinecone"
        self.folder_id = folder_id
        self.pinecone_index_name = index_name
        self.embeddings = OpenAIEmbeddings()

    def load_documents(self) -> list:
        """
        Load documents from Google Drive.

        The Google Drive loader is wrapped in PebbloSafeLoader; a summary of
        the identity and semantic metadata of the result is printed.

        Returns:
            The list of documents returned by the loader.
        """
        print("\nLoading RAG documents ...")
        loader = PebbloSafeLoader(
            GoogleDriveLoader(
                folder_id=self.folder_id,
                credentials_path=Path("credentials/credentials.json"),
                token_path=Path("./google_token.json"),
                recursive=True,
                file_loader_cls=UnstructuredFileIOLoader,
                file_loader_kwargs={"mode": "elements"},
                # presumably what populates "authorized_identities" in the
                # document metadata -- confirm against GoogleDriveLoader
                load_auth=True,
            ),
            name=self.app_name,  # App name (Mandatory)
            owner="Joe Smith",  # Owner (Optional)
            description="Identity enabled SafeLoader app using Pebblo and Pinecone VectorDB",  # Description (Optional)
            load_semantic=True,
        )
        documents = loader.load()
        print(f"Loaded {len(documents)} documents ...\n")
        # The stats helper collects the unique identities/topics/entities
        # itself, so nothing needs to be pre-computed here.
        describe_pebblo_semantic_stats(documents)
        return documents

    def add_docs_to_pinecone(self, documents: list) -> PineconeVectorStore:
        """
        Create a Pinecone index and load documents into it.

        The index named at construction time is created with ``recreate=True``
        before the documents are inserted.

        Args:
            documents: Documents exposing ``page_content`` and ``metadata``.

        Returns:
            The vector store built from the documents.
        """
        # Create index
        create_pinecone_index(self.pinecone_index_name, recreate=True)

        print("Loading docs into index...")
        texts = [doc.page_content for doc in documents]

        # Drop "coordinates" from the metadata (nested JSONs are not supported
        # in Pinecone). Filtered copies are built so that the caller's
        # documents keep their original metadata.
        metadatas = [
            {key: value for key, value in doc.metadata.items() if key != "coordinates"}
            for doc in documents
        ]

        vector_store = PineconeVectorStore.from_texts(
            texts,
            self.embeddings,
            metadatas=metadatas,
            index_name=self.pinecone_index_name,
        )

        print("Done!")
        return vector_store


if __name__ == "__main__":
    # Sample entry point: load the Google Drive folder and push it to Pinecone.
    print("Loading documents to Pinecone ...")
    folder_id = "<google-drive-folder-id>"
    input_index_name = "identity-semantic-enforcement-rag"
    pinecone_data_loader = IdentityBasedDataLoader(folder_id, input_index_name)

    result_documents = pinecone_data_loader.load_documents()

    vectordb_obj = pinecone_data_loader.add_docs_to_pinecone(result_documents)
    # An empty folder yields no documents; avoid an IndexError in that case.
    if result_documents:
        print(f"First document: {result_documents[0]}")
    else:
        print("No documents were loaded.")
    print("Finished hydrating Vector DB ...\n")
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import time

from dotenv import load_dotenv
from pinecone import Pinecone, PodSpec

# Merge variables from a local .env file (if present) into the process
# environment before the key below is read.
load_dotenv()

# API key handed to the Pinecone client; None when the variable is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")


def create_pinecone_index(
    pinecone_index_name: str,
    recreate: bool = False,
    *,
    dimension: int = 1536,
    metric: str = "dotproduct",
    environment: str = "gcp-starter",
    ready_timeout: float = 300.0,
) -> None:
    """
    Create a new Pinecone index.

    If an index with the same name already exists it is either left untouched
    (``recreate=False``) or deleted and created again (``recreate=True``).

    Args:
        pinecone_index_name: Name of the index to create.
        recreate: Delete and re-create the index when it already exists.
        dimension: Vector dimensionality of the index. The default of 1536 is
            the dimensionality of text-embedding-ada-002.
        metric: Similarity metric of the index.
        environment: Pinecone environment used for the PodSpec. Update it to
            match the one you have access to.
        ready_timeout: Maximum number of seconds to wait for the new index to
            report ready. Pass ``float("inf")`` to wait indefinitely.

    Raises:
        TimeoutError: If the index is not ready within ``ready_timeout``.
    """

    # configure client
    pc = Pinecone(api_key=PINECONE_API_KEY)
    spec = PodSpec(environment=environment)

    # check for and delete index if already exists
    if pinecone_index_name in pc.list_indexes().names():
        if not recreate:
            print(f"Index {pinecone_index_name} already exists. skipping...")
            return
        # Delete and create a new index
        print(f"Deleting and recreating index: {pinecone_index_name} ...")
        pc.delete_index(pinecone_index_name)
        print(f"Deleted index: {pinecone_index_name}.")

    print(f"Creating index: {pinecone_index_name}...")
    # create a new index
    pc.create_index(
        pinecone_index_name,
        dimension=dimension,
        metric=metric,
        spec=spec,
    )

    # wait for index to be initialized, but never hang forever on an index
    # that fails to come up
    deadline = time.monotonic() + ready_timeout
    while not pc.describe_index(pinecone_index_name).status["ready"]:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Index {pinecone_index_name} was not ready after "
                f"{ready_timeout} seconds"
            )
        time.sleep(1)

    # The stats are not used; presumably this call only confirms that the
    # freshly created index is reachable.
    index = pc.Index(pinecone_index_name)
    index.describe_index_stats()
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
python-dotenv==1.0.0
requests==2.31.0
unstructured
google-api-python-client # For Google Auth
langchain
langchain-openai
langchain-community # For PebbloSafeLoader
langchain_google_community # For GoogleDriveLoader
pinecone-client # for Pinecone VectorStore
18 changes: 18 additions & 0 deletions pebblo_safeloader/langchain/identity-rag-pinecone/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
def describe_pebblo_semantic_stats(documents: list) -> None:
    """
    Describe the semantic stats of the documents.

    Collects the unique values found under the ``authorized_identities``,
    ``pebblo_semantic_topics`` and ``pebblo_semantic_entities`` metadata keys
    of all documents and prints them, each list in sorted order.

    Args:
        documents: Objects exposing a ``metadata`` dict. A missing key, or a
            key whose value is ``None``, contributes nothing.
    """
    unique_identities = set()
    unique_topics = set()
    unique_entities = set()

    for doc in documents:
        metadata = doc.metadata
        # `or []` also covers keys that are present but set to None, which a
        # plain `.get(key, [])` default would pass straight to set.update().
        unique_identities.update(metadata.get("authorized_identities") or [])
        unique_topics.update(metadata.get("pebblo_semantic_topics") or [])
        unique_entities.update(metadata.get("pebblo_semantic_entities") or [])

    # Sorted so the report is stable between runs (set order is arbitrary).
    print("\nIdentity and Semantic Stats:")
    print(f"Authorized Identities: {sorted(unique_identities, key=str)}")
    print(f"Semantic Topics: {sorted(unique_topics, key=str)}")
    print(f"Semantic Entities: {sorted(unique_entities, key=str)}")
    print("\n")
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# OpenAI credentials
OPENAI_API_KEY="<OPENAI_API_KEY>"

# Pebblo configuration
PEBBLO_CLASSIFIER_URL="<PEBBLO-SERVER-HOST:PEBBLO-SERVER-PORT>" # e.g "http://localhost:8000/"
PEBBLO_API_KEY=<PEBBLO API KEY>
PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>

# Google Drive Config
GOOGLE_APPLICATION_CREDENTIALS="<PATH_TO_GOOGLE_APPLICATION_CREDENTIALS>"

# Vector DB Config
PINECONE_API_KEY="<PINECONE_API_KEY>"
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
credentials
google_token.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
## Identity and Semantic Enforcement using Pebblo

This solution uses the following daxa-ai/langchain and daxa-ai/langchain-google branches:

- daxa-ai/langchain: https://github.com/daxa-ai/langchain/tree/pebblo-0.1.21
- daxa-ai/langchain-google: https://github.com/daxa-ai/langchain-google/tree/pebblo-0.1.21

### Prerequisites

1. Sign up and set up your account on Pinecone (https://www.pinecone.io/).

### Instructions

1. Create Python virtual-env

```console
$ python3 -m venv .venv
$ source .venv/bin/activate
```

2. Install dependencies

```console
$ pip3 install -r requirements.txt
```

3. Install langchain-core and langchain-community from the branch `pebblo-0.1.21`

```console
$ git clone https://github.com/daxa-ai/langchain.git
$ cd langchain
$ git fetch && git checkout pebblo-0.1.21
$ cd libs/community
$ pip3 install langchain-community .
$ cd ../core
$ pip3 install langchain-core .
```

4. Install langchain-google from the branch `pebblo-0.1.21`

```console
$ git clone https://github.com/daxa-ai/langchain-google.git
$ cd langchain-google
$ git fetch && git checkout pebblo-0.1.21
$ cd libs/community
$ pip3 install langchain-google-community .
```

5. Copy the `.env.sample` file to `.env` and populate the necessary environment variables.

6. Update the `pebblo_saferag.py` file with the following details:

- _folder_id_: Google Drive folder ID where the documents are stored
- _service_acc_def_: Google service account credentials file path
- _ing_user_email_def_: Google Drive Admin/Ingestion user email ID


7. Run the LangChain sample app with PebbloSafeLoader and PebbloRetrievalQA

```console
$ python3 pebblo_saferag.py
```

8. Retrieve the Pebblo PDF report from `$HOME/.pebblo/pebblo-identity-n-semantic-loader-pinecone/pebblo_report.pdf` on the system
where `Pebblo Server` is running.

9. To access the Pebblo UI, point the browser to `http://localhost:8000/pebblo` or `host:port/pebblo` if you are running the server on a different
host.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import List

from google.oauth2 import service_account
from googleapiclient.discovery import build


def get_authorized_identities(
    admin_user_email_address: str, credentials_file_path: str, user_email: str
) -> List[str]:
    """
    Get authorized identities from Google Directory API.

    The result always starts with ``user_email`` itself, followed by the email
    address of every group the Directory API lists for that user. All result
    pages are read, not only the first one.

    Args:
        admin_user_email_address: Account used as the ``subject`` of the
            service-account credentials.
        credentials_file_path: Path to the service-account credentials file.
        user_email: User whose groups are looked up.

    Returns:
        ``[user_email, <group email>, ...]``. If the group lookup fails, the
        error is printed and the identities collected so far are returned
        (best effort).
    """
    _authorized_identities = [user_email]
    credentials = service_account.Credentials.from_service_account_file(
        credentials_file_path,
        # NOTE(review): the read-only scope is probably enough for listing
        # groups; confirm the delegation setup before dropping the other one.
        scopes=[
            "https://www.googleapis.com/auth/admin.directory.group.readonly",
            "https://www.googleapis.com/auth/admin.directory.group",
        ],
        subject=admin_user_email_address,
    )
    directory_service = build("admin", "directory_v1", credentials=credentials)

    try:
        groups_resource = directory_service.groups()
        request = groups_resource.list(userKey=user_email)
        # list_next() returns None once there are no more pages.
        while request is not None:
            response = request.execute()
            for group in response.get("groups", []):
                _authorized_identities.append(group["email"])
            request = groups_resource.list_next(request, response)
    except Exception as e:
        # Deliberately best effort: report the failure and fall through with
        # whatever identities were gathered before it.
        print(f"Error in get_authorized_identities: {e}")
    print(f"User: {user_email}, \nAuthorized Identities: {_authorized_identities}\n")
    return _authorized_identities
Loading

0 comments on commit d17484f

Please sign in to comment.