-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
666 additions
and
0 deletions.
There are no files selected for viewing
5 changes: 5 additions & 0 deletions
5
pebblo_safeloader/langchain/identity-rag-pinecone/.env.sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
PEBBLO_CLASSIFIER_URL="<PEBBLO-SERVER-HOST:PEBBLO-SERVER-PORT>" | ||
OPENAI_API_KEY="<OPENAI_API_KEY>" | ||
GOOGLE_APPLICATION_CREDENTIALS="<PATH_TO_GOOGLE_APPLICATION_CREDENTIALS>" | ||
# Vector DB Config | ||
PINECONE_API_KEY="<PINECONE_API_KEY>" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
credentials | ||
google_token.json |
42 changes: 42 additions & 0 deletions
42
pebblo_safeloader/langchain/identity-rag-pinecone/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
## PebbloSafeLoader sample app with Identity and Semantic metadata | ||
|
||
### Prerequisites | ||
|
||
1. Sign up and set up your account on Pinecone (https://www.pinecone.io/). | ||
|
||
### Instructions | ||
|
||
1. Create Python virtual-env | ||
|
||
```console | ||
$ python3 -m venv .venv | ||
$ source .venv/bin/activate | ||
``` | ||
|
||
2. Install dependencies | ||
|
||
```console | ||
$ pip3 install -r requirements.txt | ||
``` | ||
|
||
3. Copy the `.env.sample` file to `.env` and populate the necessary environment variables. | ||
|
||
> Note: You need to set `PEBBLO_CLASSIFIER_URL` only if your `Pebblo Server` is running somewhere other than the default URL | ||
> of `http://localhost:8000`. | ||
4. Update the `pebblo_safeload.py` file with the following details: | ||
|
||
- _folder_id_: Google Drive folder ID where the documents are stored | ||
|
||
|
||
5. Run PebbloSafeLoader sample app | ||
|
||
```console | ||
$ python3 pebblo_safeload.py | ||
``` | ||
|
||
6. Retrieve the Pebblo PDF report in `$HOME/.pebblo/pebblo-identity-sematic-loader-pinecone/pebblo_report.pdf` file path on the system | ||
where `Pebblo Server` is running. | ||
|
||
7. To access the Pebblo UI, point the browser to `http://localhost:8000/pebblo` or `host:port/pebblo` if you are running the server on a different | ||
host. |
103 changes: 103 additions & 0 deletions
103
pebblo_safeloader/langchain/identity-rag-pinecone/pebblo_safeload.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
""" | ||
Identity enabled SafeLoader app using Pebblo and Pinecone VectorDB. | ||
This app loads documents from Google Drive and adds them to Pinecone VectorDB. | ||
""" | ||
|
||
import os | ||
from pathlib import Path | ||
|
||
from dotenv import load_dotenv | ||
from langchain_community.document_loaders import UnstructuredFileIOLoader | ||
from langchain_community.document_loaders.pebblo import PebbloSafeLoader | ||
from langchain_community.vectorstores import Pinecone as PineconeVectorStore | ||
from langchain_google_community import GoogleDriveLoader | ||
from langchain_openai.embeddings import OpenAIEmbeddings | ||
from pinecone_index import create_pinecone_index | ||
from utils import describe_pebblo_semantic_stats | ||
|
||
load_dotenv() | ||
|
||
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") | ||
|
||
|
||
class IdentityBasedDataLoader:
    """
    Identity enabled SafeLoader app using Pebblo and Pinecone VectorDB.

    This app loads documents from Google Drive and adds them to Pinecone
    VectorDB.

    Args:
        folder_id (str): Google Drive folder id
        index_name (str): Index name for Pinecone
    """

    def __init__(self, folder_id: str, index_name: str):
        # NOTE: the app name is also the directory name under which the
        # Pebblo report is written (see README), so keep it unchanged.
        self.app_name = "pebblo-identity-sematic-loader-pinecone"
        self.folder_id = folder_id
        self.pinecone_index_name = index_name
        self.embeddings = OpenAIEmbeddings()

    def load_documents(self):
        """
        Load documents from Google Drive through PebbloSafeLoader.

        Prints the number of loaded documents and a summary of the identity
        and semantic metadata found on them.

        Returns:
            The list of documents returned by the loader.
        """
        print("\nLoading RAG documents ...")
        loader = PebbloSafeLoader(
            GoogleDriveLoader(
                folder_id=self.folder_id,
                credentials_path=Path("credentials/credentials.json"),
                token_path=Path("./google_token.json"),
                recursive=True,
                file_loader_cls=UnstructuredFileIOLoader,
                file_loader_kwargs={"mode": "elements"},
                load_auth=True,
            ),
            name=self.app_name,  # App name (Mandatory)
            owner="Joe Smith",  # Owner (Optional)
            description="Identity enabled SafeLoader app using Pebblo and Pinecone VectorDB",  # Description (Optional)
            load_semantic=True,
        )
        documents = loader.load()
        print(f"Loaded {len(documents)} documents ...\n")
        # Identity/topic/entity aggregation is done (and printed) by the helper.
        describe_pebblo_semantic_stats(documents)
        return documents

    def add_docs_to_pinecone(self, documents) -> PineconeVectorStore:
        """
        Create a Pinecone index and load documents into it.

        The index named at construction time is always recreated
        (``recreate=True``), so any existing content in it is dropped.

        Args:
            documents: Documents exposing ``page_content`` and ``metadata``.

        Returns:
            The vector store built from the documents.
        """
        # Create index
        create_pinecone_index(self.pinecone_index_name, recreate=True)

        print("Loading docs into index...")
        texts = [doc.page_content for doc in documents]

        # Drop "coordinates" from metadata (nested JSONs are not supported in
        # Pinecone). Filtered copies are built so that the caller's documents
        # are not mutated as a side effect.
        metadatas = [
            {key: value for key, value in doc.metadata.items() if key != "coordinates"}
            for doc in documents
        ]

        vector_store = PineconeVectorStore.from_texts(
            texts,
            self.embeddings,
            metadatas=metadatas,
            index_name=self.pinecone_index_name,
        )

        print("Done!")
        return vector_store
|
||
|
||
if __name__ == "__main__":
    # Script entry point: load documents from Google Drive and push them
    # into a freshly (re)created Pinecone index.
    print("Loading documents to Pinecone ...")
    folder_id = "<google-drive-folder-id>"
    input_index_name = "identity-semantic-enforcement-rag"
    pinecone_data_loader = IdentityBasedDataLoader(folder_id, input_index_name)

    result_documents = pinecone_data_loader.load_documents()
    if not result_documents:
        # Stop before the index is deleted/recreated for nothing; without
        # this guard the script would later fail on result_documents[0].
        raise SystemExit(
            "No documents were loaded; check the folder id and credentials."
        )

    vectordb_obj = pinecone_data_loader.add_docs_to_pinecone(result_documents)
    print(f"First document: {result_documents[0]}")
    print("Finished hydrating Vector DB ...\n")
48 changes: 48 additions & 0 deletions
48
pebblo_safeloader/langchain/identity-rag-pinecone/pinecone_index.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import os | ||
import time | ||
|
||
from dotenv import load_dotenv | ||
from pinecone import Pinecone, PodSpec | ||
|
||
load_dotenv() | ||
|
||
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") | ||
|
||
|
||
def create_pinecone_index(
    pinecone_index_name: str,
    recreate: bool = False,
    *,
    dimension: int = 1536,
    metric: str = "dotproduct",
    environment: str = "gcp-starter",
) -> None:
    """
    Create a new Pinecone index and wait until it reports ready.

    Args:
        pinecone_index_name: Name of the index to create.
        recreate: When the index already exists, delete and recreate it if
            True; otherwise leave it untouched and return immediately.
        dimension: Vector dimensionality of the index. The default (1536)
            is the dimensionality of text-embedding-ada-002.
        metric: Similarity metric used by the index.
        environment: Pinecone pod environment. Update it to match the one
            you have access to.
    """
    # configure client
    pc = Pinecone(api_key=PINECONE_API_KEY)
    spec = PodSpec(environment=environment)

    # check for and delete index if already exists
    if pinecone_index_name in pc.list_indexes().names():
        if not recreate:
            print(f"Index {pinecone_index_name} already exists. skipping...")
            return
        # Delete and create a new index
        print(f"Deleting and recreating index: {pinecone_index_name} ...")
        pc.delete_index(pinecone_index_name)
        print(f"Deleted index: {pinecone_index_name}.")

    print(f"Creating index: {pinecone_index_name}...")
    # create a new index
    pc.create_index(
        pinecone_index_name,
        dimension=dimension,
        metric=metric,
        spec=spec,
    )

    # wait for index to be initialized (polls once per second, no timeout)
    while not pc.describe_index(pinecone_index_name).status["ready"]:
        time.sleep(1)

    # NOTE(review): the stats are fetched but not used; presumably this is a
    # connectivity check on the new index -- confirm before removing.
    index = pc.Index(pinecone_index_name)
    index.describe_index_stats()
9 changes: 9 additions & 0 deletions
9
pebblo_safeloader/langchain/identity-rag-pinecone/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
python-dotenv==1.0.0 | ||
requests==2.31.0 | ||
unstructured | ||
google-api-python-client # For Google Auth | ||
langchain | ||
langchain-openai | ||
langchain-community # For PebbloSafeLoader | ||
langchain_google_community # For GoogleDriveLoader | ||
pinecone-client # for Pinecone VectorStore |
18 changes: 18 additions & 0 deletions
18
pebblo_safeloader/langchain/identity-rag-pinecone/utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
def describe_pebblo_semantic_stats(documents: list) -> None:
    """
    Describe the identity and semantic stats of the documents.

    Collects the distinct values found under the ``authorized_identities``,
    ``pebblo_semantic_topics`` and ``pebblo_semantic_entities`` metadata keys
    across all documents and prints them, sorted for stable output.

    Args:
        documents: Objects exposing a ``metadata`` mapping. A key that is
            missing or whose value is ``None``/empty contributes nothing.
    """
    unique_identities = set()
    unique_topics = set()
    unique_entities = set()

    for doc in documents:
        metadata = doc.metadata
        # ``or []`` also covers keys that are present but set to None.
        unique_identities.update(metadata.get("authorized_identities") or [])
        unique_topics.update(metadata.get("pebblo_semantic_topics") or [])
        unique_entities.update(metadata.get("pebblo_semantic_entities") or [])

    # Sets have no stable order; sort so repeated runs print the same thing.
    print("\nIdentity and Semantic Stats:")
    print(f"Authorized Identities: {sorted(unique_identities, key=str)}")
    print(f"Semantic Topics: {sorted(unique_topics, key=str)}")
    print(f"Semantic Entities: {sorted(unique_entities, key=str)}")
    print("\n")
13 changes: 13 additions & 0 deletions
13
pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.env.sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# OpenAI credentials | ||
OPENAI_API_KEY="<OPENAI_API_KEY>" | ||
|
||
# Pebblo configuration | ||
PEBBLO_CLASSIFIER_URL="<PEBBLO-SERVER-HOST:PEBBLO-SERVER-PORT>" # e.g "http://localhost:8000/" | ||
PEBBLO_API_KEY="<PEBBLO API KEY>" | ||
PEBBLO_CLOUD_URL="<PEBBLO CLOUD URL>" | ||
|
||
# Google Drive Config | ||
GOOGLE_APPLICATION_CREDENTIALS="<PATH_TO_GOOGLE_APPLICATION_CREDENTIALS>" | ||
|
||
# Vector DB Config | ||
PINECONE_API_KEY="<PINECONE_API_KEY>" |
2 changes: 2 additions & 0 deletions
2
pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.gitignore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
credentials | ||
google_token.json |
69 changes: 69 additions & 0 deletions
69
pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
## Identity and Semantic Enforcement using Pebblo | ||
|
||
This solution uses the following daxa-ai/langchain and daxa-ai/langchain-google branches: | ||
|
||
- daxa-ai/langchain: https://github.com/daxa-ai/langchain/tree/pebblo-0.1.21 | ||
- daxa-ai/langchain-google: https://github.com/daxa-ai/langchain-google/tree/pebblo-0.1.21 | ||
|
||
### Prerequisites | ||
|
||
1. Sign up and set up your account on Pinecone (https://www.pinecone.io/). | ||
|
||
### Instructions | ||
|
||
1. Create Python virtual-env | ||
|
||
```console | ||
$ python3 -m venv .venv | ||
$ source .venv/bin/activate | ||
``` | ||
|
||
2. Install dependencies | ||
|
||
```console | ||
$ pip3 install -r requirements.txt | ||
``` | ||
|
||
3. Install langchain-core and langchain-community from the branch `pebblo-0.1.21` | ||
|
||
```console | ||
$ git clone https://github.com/daxa-ai/langchain.git | ||
$ cd langchain | ||
$ git fetch && git checkout pebblo-0.1.21 | ||
$ cd libs/community | ||
$ pip3 install langchain-community . | ||
$ cd ../core | ||
$ pip3 install langchain-core . | ||
``` | ||
|
||
4. Install langchain-google from the branch `pebblo-0.1.21` | ||
|
||
```console | ||
$ git clone https://github.com/daxa-ai/langchain-google.git | ||
$ cd langchain-google | ||
$ git fetch && git checkout pebblo-0.1.21 | ||
$ cd libs/community | ||
$ pip3 install langchain-google-community . | ||
``` | ||
|
||
5. Copy the `.env.sample` file to `.env` and populate the necessary environment variables. | ||
|
||
5. Update the `pebblo_saferag.py` file with the following details: | ||
|
||
- _folder_id_: Google Drive folder ID where the documents are stored | ||
- _service_acc_def_: Google service account credentials file path | ||
- _ing_user_email_def_: Google Drive Admin/Ingestion user email ID | ||
|
||
|
||
5. Run langchain sample app PebbloSafeLoader and PebbloRetrievalQA | ||
|
||
```console | ||
$ python3 pebblo_saferag.py | ||
``` | ||
|
||
6. Retrieve the Pebblo PDF report in `$HOME/.pebblo/pebblo-identity-n-semantic-loader-pinecone/pebblo_report.pdf` file path on the system | ||
where `Pebblo Server` is | ||
running. | ||
|
||
7. To access the Pebblo UI, point the browser to `http://localhost:8000/pebblo` or `host:port/pebblo` if you are running the server on a different | ||
host. |
32 changes: 32 additions & 0 deletions
32
pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/google_auth.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from typing import List | ||
|
||
from google.oauth2 import service_account | ||
from googleapiclient.discovery import build | ||
|
||
|
||
def get_authorized_identities(
    admin_user_email_address: str, credentials_file_path: str, user_email: str
) -> List[str]:
    """
    Get authorized identities from Google Directory API.

    The result always starts with ``user_email`` itself, followed by the
    email address of every group returned by the Directory API for that
    user. All result pages are read, not just the first one.

    Args:
        admin_user_email_address: Admin user the service account
            impersonates (passed as ``subject``).
        credentials_file_path: Path to the service account credentials file.
        user_email: User whose group memberships are looked up.

    Returns:
        List of identities: the user's email plus the group emails. If the
        group lookup fails, the error is printed and the identities gathered
        so far are returned (best effort).
    """
    _authorized_identities = [user_email]
    credentials = service_account.Credentials.from_service_account_file(
        credentials_file_path,
        scopes=[
            "https://www.googleapis.com/auth/admin.directory.group.readonly",
            "https://www.googleapis.com/auth/admin.directory.group",
        ],
        subject=admin_user_email_address,
    )
    directory_service = build("admin", "directory_v1", credentials=credentials)

    try:
        groups_resource = directory_service.groups()
        request = groups_resource.list(userKey=user_email)
        # Follow nextPageToken until exhausted; list_next returns None when
        # there are no further pages.
        while request is not None:
            response = request.execute()
            for group in response.get("groups", []):
                _authorized_identities.append(group["email"])
            request = groups_resource.list_next(request, response)
    except Exception as e:
        # Deliberate best effort: fall back to what was collected so far.
        print(f"Error fetching groups for {user_email}: {e}")
    print(f"User: {user_email}, \nAuthorized Identities: {_authorized_identities}\n")
    return _authorized_identities
Oops, something went wrong.