diff --git a/docs/gh_pages/docusaurus.config.ts b/docs/gh_pages/docusaurus.config.ts index a948aa68..f1c6962b 100644 --- a/docs/gh_pages/docusaurus.config.ts +++ b/docs/gh_pages/docusaurus.config.ts @@ -37,6 +37,9 @@ const config: Config = { label: "latest", path: "", }, + "0.1.18": { + banner: "none", + }, "0.1.17": { banner: "none", }, diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/config.md b/docs/gh_pages/versioned_docs/version-0.1.18/config.md new file mode 100644 index 00000000..50ecae96 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/config.md @@ -0,0 +1,78 @@ +# Pebblo Configuration File + +### Introduction + +This configuration file specifies settings for various components of the Pebblo. + +### Configuration Details + +#### Server + +- `port`: Specifies the port number on which the Pebblo server listens for incoming connections. +- `host`: Specifies the host address on which the Pebblo server to run. + +Notes: + +1. By default `Pebblo Server` runs at `localhost:8000`. When we change values of `port` and/or `host` , the `Pebblo Safe DataLoader` env variable `PEBBLO_CLASSIFIER_URL` needs to set to the correct URL. +2. By default `Pebblo UI` runs at `localhost:8000/pebblo`. When we change values of `port` and/or `host`, the Pebblo UI would be running on the respective `host:port/pebblo`. + +### Logging + +- `level`: Sets the logging level. Possible values are 'info', 'debug', 'error', 'warning', and 'critical'. Default value is `info`. +- `file`: Sets the log file path. Default value is `/tmp/logs/pebblo.log`. +- `maxFileSize`: Sets the maximum size of the log file. Default value is `8306688` bytes (8 MB). +- `backupCount`: Sets the number of backup files to keep. Default value is `3`. + +### Reports + +- `format`: Specifies the format of generated reports. Available options include 'pdf'. +- `renderer`: Specifies the rendering engine for generating reports. Options include 'weasyprint', 'xhtml2pdf'. 
+ + > **Note** + > Using xhtml2pdf gives a report with basic UI elements, while the WeasyPrint renderer creates a sleeker, better-aligned interface for your PDFs. See image below. If you set `renderer` to `weasyprint`, you need to install Pango. Follow [these instructions](./installation.md#install-weasyprint-library) to do so. + + ![Pebblo Reports](../../static/img/report-comparision.png) + +- `cacheDir`: Sets the directory where pebblo stores metadata, generated reports, and other temporary files. Default value is `~/.pebblo`. +- `outputDir`: Deprecated. Use `cacheDir` instead. + +### Classifier + +- `anonymizeSnippets`: Flag to anonymize snippets in the report. Possible values are 'True' and 'False'. When its value is 'True', snippets in reports will be shown as anonymized and vice versa. + +### Storage + +This is a beta feature introduced in 0.1.18. + +- `type`: Specifies the storage type used to store the state of the GenAI applications. Possible values are `file` and `db`. Default value is `file`. By default, a SQLite database is used when it is set to `db`. +- Setting `type` to `file` is deprecated; use `db` instead. `file` will not be supported from the 0.1.19 release onward. + +### Default Configuration + +```yaml +daemon: + port: 8000 + host: localhost +logging: + level: info +reports: + format: pdf + renderer: xhtml2pdf + outputDir: ~/.pebblo +classifier: + anonymizeSnippets: False +storage: + type: file +``` + +`Note`: +Users have the option to maintain any section or even a single field within a section. For instance, the `config` file might appear as follows: + +```yaml +logging: + level: info +``` + +This flexibility empowers users to tailor configurations to their specific needs while retaining default values for other sections or fields.
+ + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/daemon.md b/docs/gh_pages/versioned_docs/version-0.1.18/daemon.md new file mode 100644 index 00000000..2e7c3684 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/daemon.md @@ -0,0 +1,27 @@ +# Pebblo Server + +`Pebblo Server` is a REST API application that exposes API endpoints for Pebblo Safe DataLoader to connect. This component provides deep data visibility on the types of Topics and Entities ingested into the Gen-AI application. It uses the snippets received from the `Pebblo Safe DataLoader` to run through both a Topic Classifier and Entity Classifier to produce the insights and reporting. For more details on how to Pebblo enable your Langchain application see this [Pebblo Safe DataLoader for Langchain](rag.md) document. + +By default `Pebblo Server` runs at `localhost:8000`. The `Pebblo Safe DataLoader` by default connects to this hostname and port. If the server is running on a different port or a different hostname, the `Pebblo Safe DataLoader` env variable `PEBBLO_CLASSIFIER_URL` needs to be set to the correct URL. + +## Report Generation + +A separate `Data Report` will be generated for every complete document load operation. A subsequent document load, either done periodically (say every day, every week, etc.) or on-demand, will not overwrite a previous load's `Data Report`. + +## Report Location + +By default all the reports will be stored in a `.pebblo` directory in the home directory of the system running `Pebblo Server`. Separate subdirectories named after each RAG application are used when multiple RAG applications use the same `Pebblo Server`.
+ +```bash + +$ cd $HOME/.pebblo +$ tree +├── acme-corp-rag-1 +│   ├── pebblo_report.pdf +│   ├── bfd46d34-42c7-4819-846c-f54b3620f540 +│   │   ├── metadata +│   │   │   └── metadata.json +│   │   └── report.json +``` + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/development.md b/docs/gh_pages/versioned_docs/version-0.1.18/development.md new file mode 100644 index 00000000..652932f9 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/development.md @@ -0,0 +1,83 @@ +# Setting up development environment + +> **Note** +> Please note that Pebblo requires Python version 3.9 or above to function optimally. + +Pebblo is currently supported in MacOS and Linux. + +The following instructions are **tested on Mac OSX and Linux (Debian).** + +### Prerequisites + +Install the following prerequisites. This is needed for PDF report generation, + +if you have put `weasyprint` as renderer in the config.yaml + +#### Mac OSX + +```sh +brew install pango +``` + +#### Linux (debian/ubuntu) + +```sh +sudo apt-get install libpango-1.0-0 libpangoft2-1.0-0 +``` + +### Install weasyprint library +```sh +pip install weasyprint +``` + +## Build, Install and Run + +Fork and clone the pebblo repo. From within the pebblo directory, create a python virtual-env, build pebblo package (in `wheel` format), install and run. 
+ +### Build + +```bash + +# Fork and clone the pebblo repo +git clone https://github.com/<your-github-username>/pebblo.git +cd pebblo + +# Create and activate a virtual environment +python3 -m venv .venv +source .venv/bin/activate + +# Build pebblo python package +pip3 install build +python3 -m build --wheel +``` + +The build artifact, a wheel package, will be available at `dist/pebblo-<version>-py3-none-any.whl` + +### Install + +```bash +pip3 install dist/pebblo-<version>-py3-none-any.whl +``` + +Pebblo script will then be installed as `.venv/bin/pebblo` + +### Run Pebblo Server + +```bash +pebblo +``` + +Pebblo server now listens on `localhost:8000` to accept Gen-AI application document snippets for inspection and reporting. + +## Creating a pull request + +See [these instructions](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) +to open a pull request against the main Pebblo repo. + +## Communication + +Please join the Discord server [https://discord.gg/wyAfaYXwwv](https://discord.gg/wyAfaYXwwv) to reach out to the Pebblo maintainers, contributors and users. + +![Discord](https://img.shields.io/discord/1199861582776246403?logo=discord) + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/entityclassifier.md b/docs/gh_pages/versioned_docs/version-0.1.18/entityclassifier.md new file mode 100644 index 00000000..68110027 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/entityclassifier.md @@ -0,0 +1,33 @@ +# Pebblo Entity Classifier + +`Pebblo entity classifier` is designed to automatically scan your loader source files and pinpoint sensitive entities within the files. By highlighting these entities, it assists in ensuring compliance, data security, and privacy protection within your data processing pipeline. +Integrating it enhances risk mitigation and regulatory adherence while streamlining sensitive data handling.
+ +Pebblo Entity Classifier harnesses the power of the `Presidio Analyzer` Python library for accurate entity classification. +Leveraging Presidio's robust features and capabilities, we ensure precise identification of entities within textual data. +Additionally, our solution welcomes contributions from the open-source community, encouraging collaborative efforts to improve its functionality and reliability. + +# Entities Supported By Pebblo Entity Classifier + +Below is the list of `entities` supported by Pebblo - + +1. US Social Security Number +1. US Passport Number +1. US Driver's License +1. US Credit Card Number +1. US Bank Account Number +1. IBAN Code +1. US ITIN +1. IP Address +1. GitHub Access Token +1. Slack Access Token +1. AWS Access Key +1. AWS Secret Key + + +Users can get details of the classified entities for their loader source files in the Pebblo report. +Different sections of the Pebblo report, such as `Top Files with Most Findings`, `Data Source Findings Table` and `Snippets`, give an overview of the Pebblo entity classifier output for the user's RAG application. + +For more details, refer to [Reports](reports.md) + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/installation.md b/docs/gh_pages/versioned_docs/version-0.1.18/installation.md new file mode 100644 index 00000000..2f0b6cc5 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/installation.md @@ -0,0 +1,103 @@ +# Installation + +> **Note** +> Please note that Pebblo requires Python version 3.9 or above to function optimally. + +## Using `pip` + +```bash +pip install pebblo --extra-index-url https://packages.daxa.ai/simple/ +``` + +### Run Pebblo server + +``` +$ pebblo +``` + +Pebblo server now listens on `localhost:8000` to accept Gen-AI application data snippets for inspection and reporting. +The Pebblo UI is available at `http://localhost:8000/pebblo` + +See [troubleshooting](troubleshooting.md) for any issues.
+ +#### Configuration flags (Optional) + +- `--config <file>`: Specifies a custom configuration file in YAML format. + +```bash +pebblo [--config /path/to/config.yaml] +``` + + +## Using Docker + +```bash +docker run \ + -v /path/to/pebblo_reports:/opt/.pebblo \ + -p 8000:8000 docker.daxa.ai/daxaai/pebblo:latest +``` + +The local UI can be accessed by pointing the browser to `http://localhost:8000/pebblo`. + +To access PDF reports on the host machine outside the docker container, use the above command with mounted volumes for the report folder. By default, reports are stored in the cache directory, i.e. `/opt/.pebblo`. If a custom configuration file is passed, this path should match the `cacheDir` value from `config.yaml`. + +## Using Docker with custom configuration + +To pass a specific configuration file and to access PDF reports on the host machine outside the docker container, use the following command with mounted volumes for config.yaml and the report folder. + +```bash +docker run \ + -v /path/to/pebblo_reports:/opt/.pebblo \ + -v /path/to/pebblo/config.yaml:/opt/pebblo/config/config.yaml \ + -p 8000:8000 docker.daxa.ai/daxaai/pebblo:latest \ + --config /opt/pebblo/config/config.yaml +``` + + +## Using Kubernetes +Apply the k8s manifest files below in sequence to run the pebblo server on a k8s cluster. +```bash +kubectl apply -f deploy/k8s-deploy/config.yaml + +kubectl apply -f deploy/k8s-deploy/pvc.yaml + +kubectl apply -f deploy/k8s-deploy/deploy.yaml + +kubectl apply -f deploy/k8s-deploy/service.yaml +``` +Use `kubectl logs <pod-name>` to get the logs from the pebblo server. + +**Note:** Set up the nginx ingress controller to expose the pebblo server. + +# Enhanced PDF reporting + +Pebblo supports two PDF rendering options: + +1. `xhtml2pdf` (default) +1. `weasyprint` + +This is selected using the `renderer` setting in the config.yaml. + +`weasyprint` produces an enhanced visual look and feel. This renderer option requires the following additional prerequisites.
This is needed for PDF report generation, + +### Install weasyprint library + +```sh +pip install weasyprint +``` + +### Install Pango library + +#### Mac OSX + +``` +brew install pango +``` + +#### Linux (debian/ubuntu) + +``` +sudo apt-get install libpango-1.0-0 libpangoft2-1.0-0 +``` + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/introduction.md b/docs/gh_pages/versioned_docs/version-0.1.18/introduction.md new file mode 100644 index 00000000..51dcdf1e --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/introduction.md @@ -0,0 +1,38 @@ +--- +slug: / +--- + +# Overview + +Pebblo enables developers to safely load data and promote their Gen AI app to deployment without worrying about the organization’s compliance and security requirements. The project identifies semantic topics and entities found in the loaded data and summarizes them on the UI or a PDF report. + +![Pebblo Overview](../../static/img/pebblo-overview.webp) + +# Benefits + +1. Identify semantic topics and entities in your data loaded in RAG applications +1. Accelerate time-to-production by effortlessly meeting your organization’s data compliance requirements +1. Mitigate security risks arising from data poisoning and emerging threats. +1. Comply with regulations such as the EU AI Act with custom reports and data records +1. Support for a wide range of Gen AI development frameworks and data loaders + +# Components + +Pebblo has two components. + +1. Pebblo Server - a REST api application with topic-classifier, entity-classifier and reporting +1. Pebblo Safe DataLoader - a thin wrapper to Gen-AI framework's data loaders + +`Pebblo Safe DataLoader` currently support Langchain framework. Support for other frameworks like LlamaIndex, Haystack will be added in the upcoming releases. 
+ +# Documentation + +- [Installation](installation.md) +- [Development Environment](development.md) +- [Pebblo Server](daemon.md) +- [Safe DataLoader for Langchain](rag.md) +- [Configuration](config.md) +- [Reports](reports.md) +- [Troubleshooting](troubleshooting.md) + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/llama_index_safe_reader.md b/docs/gh_pages/versioned_docs/version-0.1.18/llama_index_safe_reader.md new file mode 100644 index 00000000..8715ecb7 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/llama_index_safe_reader.md @@ -0,0 +1,63 @@ +# Pebblo Safe DataReader for LlamaIndex + +This document describes how to augment your existing LlamaIndex DocumentReader with Pebblo Safe DocumentReader to get deep data visibility on the types of Topics and Entities ingested into the Gen-AI LlamaIndex application. For details on `Pebblo Daemon` see this [pebblo server](daemon.md) document. + +Pebblo Safe DocumentReader enables safe data ingestion for LlamaIndex `DocumentReader`. This is done by wrapping the document reader call with `Pebblo Safe DocumentReader` + +## How to Pebblo enable Document Reading? + +Assume a LlamaIndex RAG application snippet using `CSVReader` to read a CSV document for inference. + +Here is the snippet of Document loading using `CSVReader` + +``` +from pathlib import Path +from llama_index.readers.file import CSVReader +reader = CSVReader() +documents = reader.load_data(file=Path('data/corp_sens_data.csv')) +print(documents) +``` + +The Pebblo SafeReader can be installed and enabled with few lines of code change to the above snippet. 
+ +### Install PebbloSafeReader + +``` +pip install llama-index-readers-pebblo +``` + +### Use PebbloSafeReader + +``` +from pathlib import Path +from llama_index.readers.pebblo import PebbloSafeReader +from llama_index.readers.file import CSVReader +reader = CSVReader() +pebblo_reader = PebbloSafeReader(reader, + name="acme-corp-rag-1", # App name (Mandatory) + owner="Joe Smith", # Owner (Optional) + description="Support productivity RAG application", +) +documents = pebblo_reader.load_data(file=Path('data/corp_sens_data.csv')) +``` + +A data report with all the findings, both Topics and Entities, will be generated and available for inspection in the `Pebblo Server`. See this [pebblo server](daemon.md) for further details. + +Note: By default Pebblo Server runs at localhost:8000. If your Pebblo Server is running at some other location, e.g. a docker container, put the correct URL in the `PEBBLO_CLASSIFIER_URL` env variable. ref: [server-configurations](config.md#server) + +```bash +export PEBBLO_CLASSIFIER_URL="<pebblo-server-host:pebblo-server-port>" +``` + +## Supported Document Readers + +The following LlamaIndex DocumentReaders are currently supported. + +1. PDFReader +1. DocxReader +1. CSVReader + + +> Note : _Most other LlamaIndex document readers that implement the `load_data()` method should work. The above list indicates the ones that are explicitly tested. If you have successfully tested a particular DocumentReader other than the ones listed above, please consider raising a PR._ + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/pebblo_ui.md b/docs/gh_pages/versioned_docs/version-0.1.18/pebblo_ui.md new file mode 100644 index 00000000..3f6693b5 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/pebblo_ui.md @@ -0,0 +1,64 @@ +# Pebblo UI + +Pebblo UI provides in-depth visibility into the documents ingested into Gen-AI RAG applications during every load. + +Pebblo server now listens on `localhost:8000` to accept Gen-AI application data snippets for inspection and reporting.
+The Pebblo UI is available at `http://localhost:8000/pebblo` + +This document describes the information displayed on the interface. + +![Pebblo UI](../../static/img/pebblo-ui.jpeg) + +# Overview Page + +This page consists of 4 primary tabs that provide the following details: + +1. **Applications With Findings**: + The number signifies the proportion of applications with findings out of the total active applications. Additionally, it will present you with a detailed list of these applications, including the count of findings (Topics + Entities), the name of the owner, and the option to download the PDF report for each application. + +2. **Findings**: + The figure denotes the cumulative count of Topics and Entities identified across all applications. It will also furnish you with a comprehensive list of these Topics and Entities, along with supplementary information including the count of source files they originate from, the Datasource, and the name of the Application. + +3. **Files with Findings**: + The number of files that have one or more Findings over the total number of files used in document load across all the applications. This field indicates the number of files that need to be inspected to remediate any text that potentially needs to be removed and/or cleaned for Gen-AI inference. + + It will also provide you with a list of these files, accompanied by additional details such as the file size, the owner's name, the count of topics & entities within each file, and the name of the Datasource. + +4. **Datasource**: + The number of data sources used to load documents into the Gen-AI RAG applications. For example, this field will be two if a RAG application loads data from two different directories or two different AWS S3 buckets. + + It will also provide you with a list of these data sources, accompanied by additional details such as the size, source path, the count of topics & entities across the datasource, and the Application they are associated with.
+ +# Application Details Page + +You will be directed to the application details page by clicking on any application from the list available in the `Applications With Findings` tab on the overview page. + +**Instance Details**: +This section provides a quick glance of where the RAG application is physically running, such as on a Laptop (Mac OSX) or Linux VM, and related properties like IP address, local filesystem path and Python version. + +**Download Report**: +Downloads the data report of the application in PDF format. + +**Load History**: +The table provides the history of findings and path to the reports for the previous loads of the same RAG application. + +Load History provides details about the latest 5 loads of this app. It provides the following details: + +1. **Report Name** - The path to the report file. +2. **Findings** - The number of findings identified in the report. +3. **Files With Findings** - The number of files containing findings. +4. **Generated On** - The timestamp when the report was generated, in the local time zone. + +**Report Summary**: Report Summary has 4 primary tabs: + +1. **Findings**: The figure denotes the cumulative count of Topics and Entities identified in the application. It will also furnish you with a comprehensive list of these Topics and Entities, along with supplementary information including the count of source files they originate from, and the Datasource name. + +2. **Files with Findings**: The number of files that have one or more Findings over the total number of files used in document load across the application. This field indicates the number of files that need to be inspected to remediate any text that potentially needs to be removed and/or cleaned for Gen-AI inference. + + It will also provide you with a list of these files, accompanied by additional details such as the file size, the owner's name, the count of topics & entities within each file, and the name of the Datasource. + +3. 
**Datasource**: The number of data sources used to load documents into the Gen-AI RAG applications. For example, this field will be two if a RAG application loads data from two different directories or two different AWS S3 buckets. + + It will also provide you with a list of these data sources, accompanied by additional details such as the size, source path, the count of topics & entities across the datasource. + +4. **Snippets**: This section details the text analyzed by the Pebblo Server using the Pebblo Topic Classifier and Pebblo Entity Classifier. It is designed to help quickly inspect and remediate text that should not be ingested into the Gen-AI RAG application. Each snippet shows the exact file for easy reference, with sensitive information labeled with confidence scores: HIGH, MEDIUM, or LOW. diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/rag.md b/docs/gh_pages/versioned_docs/version-0.1.18/rag.md new file mode 100644 index 00000000..778bd239 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/rag.md @@ -0,0 +1,69 @@ +# Pebblo Safe DataLoader for Langchain + +This document describes how to augment your existing Langchain DocumentLoader with Pebblo Safe DataLoader to get deep data visibility on the types of Topics and Entities ingested into the Gen-AI Langchain application. For details on `Pebblo Server` see this [pebblo server](daemon.md) document. + +Pebblo SafeLoader enables safe data ingestion for Langchain document loaders (see Note 1 below). This is done by wrapping the document loader call with `Pebblo Safe DataLoader`. + +## How to Pebblo enable Document Loading? + +Assume a Langchain RAG application snippet using `CSVLoader` to read a CSV document for inference. + +Here is the snippet of a Langchain RAG application using `CSVLoader`.
+ +```python + from langchain_community.document_loaders import CSVLoader + + loader = CSVLoader(file_path) + documents = loader.load() + vectordb = Chroma.from_documents(documents, OpenAIEmbeddings()) +``` + +The Pebblo SafeLoader can be enabled with few lines of code change to the above snippet. + +```python + from langchain_community.document_loaders import CSVLoader + from langchain_community.document_loaders.pebblo import PebbloSafeLoader + + loader = PebbloSafeLoader( + CSVLoader(file_path), + name="RAG app 1", # App name (Mandatory) + owner="Joe Smith", # Owner (Optional) + description="Support productivity RAG application", # Description (Optional) + ) + documents = loader.load() + vectordb = Chroma.from_documents(documents, OpenAIEmbeddings()) +``` + +A data report with all the findings, both Topics and Entities, will be generated and available for inspection in the `Pebblo Server`. See this [pebblo server](daemon.md) for further details. + +Note: By default Pebblo Server runs at localhost:8000. If your Pebblo Server is running at some other location for eg. a docker container etc, put the correct URL in `PEBBLO_CLASSIFIER_URL` env variable. ref: [server-configurations](config.md#server) + +```bash +export PEBBLO_CLASSIFIER_URL="" +``` + +## Supported Document Loaders + +The following Langchain DocumentLoaders are currently supported. + +1. DirectoryLoader +1. JSONLoader +1. CSVLoader +1. DataFrameLoader +1. S3FileLoader +1. S3DirLoader +1. UnstructuredMarkdownLoader +1. UnstructuredPDFLoader +1. UnstructuredFileLoader +1. UnstructuredAPIFileLoader +1. UnstructuredExcelLoader +1. AmazonTextractPDFLoader +1. GCSFileLoader +1. GoogleDriveLoader +1. PyPDFDirectoryLoader +1. PyPDFLoader +1. SharePointLoader + +> Note 1: _Most other Langchain document loaders that implement load() and lazy_load() methods should work. The above list indicates the ones that are explicitly tested. 
If you have successfully tested a particular DocumentLoader other than the ones listed above, please consider raising a PR._ + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/reports.md b/docs/gh_pages/versioned_docs/version-0.1.18/reports.md new file mode 100644 index 00000000..624a3b6b --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/reports.md @@ -0,0 +1,43 @@ +# Pebblo Data Reports + +Pebblo Data Reports provide in-depth visibility into the documents ingested into a Gen-AI RAG application during every load. + +This document describes the information produced in the Data Report. + +# Report Summary + +Report Summary provides the following details: + +1. **Findings**: Total number of Topics and Entities found across all the snippets loaded in this specific load run. +1. **Files with Findings**: The number of files that have one or more `Findings` over the total number of files used in this document load. This field indicates the number of files that need to be inspected to remediate any text that potentially needs to be removed and/or cleaned for Gen-AI inference. +1. **Number of Data Source**: The number of data sources used to load documents into the Gen-AI RAG application. For example, this field will be two if a RAG application loads data from two different directories or two different AWS S3 buckets. + +# Top Files with Most Findings + +This table indicates the top files that had the most findings. Typically, these files are the most _offending_ ones that need immediate attention and offer the best ROI for data cleansing and remediation. + +# Load History + +This table provides the history of findings and path to the reports for the previous loads of the same RAG application. + +Load History provides details about the latest 5 loads of this app. It provides the following details: +1. **Report Name**: The path to the report file. +2. **Findings**: The number of findings identified in the report. +3. 
**Files With Findings**: The number of files containing findings. +4. **Generated On**: The timestamp when the report was generated, in the local time zone. +5. **Find more reports on**: Path to the folder where you can find reports for all the loads. This field will be visible when there are more than 5 loads of an app. + + +# Instance Details + +This section provides a quick glance of where the RAG application is physically running, such as on a Laptop (Mac OSX) or Linux VM, and related properties like IP address, local filesystem path and Python version. + +# Data Source Findings Table + +This table provides a summary of all the different Topics and Entities found across all the files that got ingested using `Pebblo SafeLoader` enabled Document Loaders. + +# Snippets + +This section provides the actual text inspected by the `Pebblo Server` using the `Pebblo Topic Classifier` and `Pebblo Entity Classifier`. This will be useful to quickly inspect and remediate text that should not be ingested into the Gen-AI RAG application. Each snippet shows the exact file the snippet is loaded from, for easy remediation. + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/retrieval_chain.md b/docs/gh_pages/versioned_docs/version-0.1.18/retrieval_chain.md new file mode 100644 index 00000000..775e042d --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/retrieval_chain.md @@ -0,0 +1,212 @@ +# Safe Retriever for LangChain +***Identity-enabled RAG using PebbloRetrievalQA*** + +`PebbloRetrievalQA` is a Retrieval chain with Identity & Semantic Enforcement for question-answering against a vector database. + +This document covers how to retrieve documents with Identity & Semantic Enforcement. + +**Steps:** + +- **Loading Documents with Authorization metadata:** The process starts by loading documents with the option to pull additional authorization metadata turned on. 
See supported loader specific documentation for exact input field (typically `load_auth=True`), +- **Using supported Vector database** `PebbloRetrievalQA` chain requires a Vector database that supports rich metadata filtering capability. Pick one from the supported Vector database vendor list shown below in this document. +- **Initializing PebbloRetrievalQA Chain:** After loading the documents, the PebbloRetrievalQA chain is initialized. This chain uses the retriever ( + created from the vector database) and an LLM. +- **The 'ask' Function:** The 'ask' function is used to pose questions to the system. This function accepts a question and an auth_context as input + and returns the answer using the PebbloRetrievalQA chain. The auth_context contains the identity and authorization groups of the user accessing the + application. +- **Posing Questions:** Finally, questions are posed to the system. The system retrieves answers based on the authorization metadata in the documents + and the auth_context provided in the 'ask' function. + +## Setup + +### Dependencies + +The walkthrough requires Langchain, langchain-community, langchain-openai, and a Qdrant client. + +```bash +%pip install --upgrade --quiet langchain langchain-community langchain-openai qdrant_client +``` + +### Identity-aware Data Ingestion + +In this scenario, Qdrant is being utilized as a vector database. However, the flexibility of the system allows for the use of any supported vector +databases. + +**PebbloRetrievalQA chain supports the following vector databases:** + +1. Qdrant +1. Pinecone +1. Postgres(utilizing the pgvector extension) + +**Load vector database with authorization information in metadata:** + +In this phase, the authorization details of the original document are captured and stored in the `authorized_identities` field within the metadata of +each chunk in the VectorDB entry. 
+ +_It's important to note that to use the PebbloRetrievalQA chain, authorization metadata must always be placed in the `authorized_identities` +field._ + +```python +from langchain_community.vectorstores.qdrant import Qdrant +from langchain_core.documents import Document +from langchain_openai.embeddings import OpenAIEmbeddings +from langchain_openai.llms import OpenAI + +llm = OpenAI() +embeddings = OpenAIEmbeddings() +collection_name = "pebblo-identity-rag" + +page_content = """ +Performance Report: John Smith +Employee Information: + •Name: John Smith + •Employee ID: JS12345 + •Department: Sales + •Position: Sales Representative + •Review Period: January 1, 2023 - December 31, 2023 + +Performance Summary: +John Smith has demonstrated commendable performance as a Sales Representative during the review period. +He consistently met and often exceeded sales targets, contributing significantly to the department's success. +His dedication, professionalism, and collaborative approach have been instrumental in fostering positive +relationships with clients and colleagues alike. + +Key Achievements: +•Exceeded sales targets by 20% for the fiscal year, demonstrating exceptional sales acumen and strategic planning skills. +•Successfully negotiated several high-value contracts, resulting in increased revenue and client satisfaction. +•Proactively identified opportunities for process improvement within the sales team, + leading to streamlined workflows and enhanced efficiency. +•Received positive feedback from clients and colleagues for excellent communication skills, responsiveness, and customer service. + Areas for Development: While John's performance has been exemplary overall, +there are opportunities for further development in certain areas: +•Continued focus on expanding product knowledge to better address client needs and provide tailored solutions. +•Enhancing time management skills to prioritize tasks effectively and maximize productivity during busy periods. 
+•Further development of leadership abilities to support and mentor junior team members within the sales department. + +Conclusion: In conclusion, John Smith has delivered outstanding results as a Sales Representative at ACME Corp. +His dedication, performance, and commitment to excellence reflect positively on the organization." +""" + +documents = [ + Document( + **{ + "page_content": page_content, + "metadata": { + "authorized_identities": ["hr-support", "hr-leadership"], + "page": 0, + "source": "https://drive.google.com/file/d/xxxxxxxxxxxxx/view", + "title": "Performance Report- John Smith.pdf", + }, + } + ) +] + +print("Loading vectordb...") + +vectordb = Qdrant.from_documents( + documents, + embeddings, + location=":memory:", + collection_name=collection_name, +) + +print("Vectordb loaded.") +``` + +## Retrieval with Identity & Semantic Enforcement + +PebbloRetrievalQA chain uses a SafeRetrieval to enforce that the snippets used for in-context are retrieved +only from the documents authorized for the user. +To achieve this, the Gen-AI application needs to provide an authorization context for this retrieval chain. +This `auth_context` should be filled with the identity and authorization groups of the user accessing the Gen-AI app. + +Here is the sample code for the PebbloRetrievalQA with `authorized_identities` from the user accessing the RAG +application, passed in `auth_context`. 
+ +```python +from langchain_community.chains import PebbloRetrievalQA +from langchain_community.chains.pebblo_retrieval.models import AuthContext, ChainInput + +# Initialize PebbloRetrievalQA chain +qa_chain = PebbloRetrievalQA.from_chain_type( + llm=llm, + app_name="pebblo-identity-and-semantic-retriever", + owner="Joe Smith", + description="Identity and Semantic filtering using PebbloSafeLoader, and PebbloRetrievalQA", + chain_type="stuff", + retriever=vectordb.as_retriever(), + verbose=True, +) + +def ask(question: str, auth_context: dict): + """ + Ask a question to the PebbloRetrievalQA chain + """ + auth_context_obj = AuthContext(**auth_context) if auth_context else None + chain_input_obj = ChainInput(query=question, auth_context=auth_context_obj) + return qa_chain.invoke(chain_input_obj.dict()) +``` + +### Questions by Authorized User + +Data has been ingested for the authorized identities ["hr-support", "hr-leadership"]. +Therefore, a user who belongs to the "hr-support" authorized identity or group should be able to receive the correct answer. + +```python +auth = { + "user_id": "hr-user@acme.org", + "authorized_identities": [ + "hr-support", + ] +} + +question = "Please share the performance report for John Smith?" +resp = ask(question, auth) +print(f"Question: {question}\n\nAnswer: {resp['result']}\n") +``` + +Output: + +```bash +Question: Please share the performance report for John Smith? + +Answer: +John Smith has demonstrated commendable performance as a Sales Representative during the review period. +He consistently met and often exceeded sales targets, contributing significantly to the department's success. +His dedication, professionalism, and collaborative approach have been instrumental in fostering positive +relationships with clients and colleagues alike.
+``` + +### Questions by Unauthorized User + +Since the user's authorized identity/group "eng-support" is not included in the authorized identities ["hr-support", "hr-leadership"], they should not +expect to receive an answer. + +```python +auth = { + "user_id": "eng-user@acme.org", + "authorized_identities": [ + "eng-support", + ] +} + +question = "Please share the performance report for John Smith?" +resp = ask(question, auth) +print(f"Question: {question}\n\nAnswer: {resp['result']}\n") +``` + +Output: + +```bash +Question: Please share the performance report for John Smith? + +Answer: +I don't know, I'm sorry. +``` + +## Prompt Governance +When a user sends any prompt to the LLM using PebbloRetrievalQA, Pebblo captures the findings from that prompt. This behavior is enabled by default. + +***Coming Soon***: Ability to block or anonymize prompts based on policy. + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/retrieval_chain_semantic_enf.md b/docs/gh_pages/versioned_docs/version-0.1.18/retrieval_chain_semantic_enf.md new file mode 100644 index 00000000..a2078f6b --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/retrieval_chain_semantic_enf.md @@ -0,0 +1,241 @@ +# Safe Retriever for LangChain + +***Semantic Enforcement RAG using PebbloRetrievalQA*** + +`PebbloRetrievalQA` is a Retrieval chain with Identity & Semantic Enforcement for question-answering against a vector database. + +This document covers how to retrieve documents with Semantic Enforcement. + +**Steps:** + +- **Loading Documents with Semantic metadata:** The process starts by loading documents with semantic metadata. +- **Using supported Vector database** `PebbloRetrievalQA` chain requires a Vector database that supports rich metadata filtering capability. Pick one + from the supported Vector database vendor list shown below in this document. +- **Initializing PebbloRetrievalQA Chain:** After loading the documents, the PebbloRetrievalQA chain is initialized.
This chain uses the retriever ( + created from the vector database) and an LLM. +- **The 'ask' Function:** The 'ask' function is used to pose questions to the system. This function accepts a question and a semantic_context as + input and returns the answer using the PebbloRetrievalQA chain. The semantic context contains the topics and entities that should be denied within + the context used to generate a response. +- **Posing Questions:** Finally, questions are posed to the system. The system retrieves answers based on the semantic metadata in the documents + and the semantic_context provided in the 'ask' function. + +## Setup + +### Dependencies + +The walkthrough requires Langchain, langchain-community, langchain-openai, and a Qdrant client. + +```bash +%pip install --upgrade --quiet langchain langchain-community langchain-openai qdrant_client +``` + +### Identity-aware Data Ingestion + +In this scenario, Qdrant is being utilized as a vector database. However, the flexibility of the system allows for the use of any supported vector +database. + +**PebbloRetrievalQA chain supports the following vector databases:** + +1. Qdrant +1. Pinecone + +**Load vector database with semantic information in metadata:** + +In this phase, the semantic topics and entities of the original document are captured and stored in the `pebblo_semantic_topics` +and `pebblo_semantic_entities` fields respectively within the metadata of +each chunk in the VectorDB entry.
+ +_It's important to note that to use the PebbloRetrievalQA chain, semantic metadata must always be placed in the `pebblo_semantic_topics` +and `pebblo_semantic_entities` fields._ + +```python +from langchain_community.vectorstores.qdrant import Qdrant +from langchain_core.documents import Document +from langchain_openai.embeddings import OpenAIEmbeddings +from langchain_openai.llms import OpenAI + +llm = OpenAI() +embeddings = OpenAIEmbeddings() +collection_name = "pebblo-semantic-rag" + +page_content = """ +**ACME Corp Financial Report** + +**Overview:** +ACME Corp, a leading player in the merger and acquisition industry, presents its financial report for the fiscal year ending December 31, 2020. +Despite a challenging economic landscape, ACME Corp demonstrated robust performance and strategic growth. + +**Financial Highlights:** +Revenue soared to $50 million, marking a 15% increase from the previous year, driven by successful deal closures and expansion into new markets. +Net profit reached $12 million, showcasing a healthy margin of 24%. + +**Key Metrics:** +Total assets surged to $80 million, reflecting a 20% growth, highlighting ACME Corp's strong financial position and asset base. +Additionally, the company maintained a conservative debt-to-equity ratio of 0.5, ensuring sustainable financial stability. + +**Future Outlook:** +ACME Corp remains optimistic about the future, with plans to capitalize on emerging opportunities in the global M&A landscape. +The company is committed to delivering value to shareholders while maintaining ethical business practices. 
+ +**Bank Account Details:** +For inquiries or transactions, please refer to ACME Corp's US bank account: +Account Number: 123456789012 +Bank Name: Fictitious Bank of America +""" + +documents = [ + Document( + **{ + "page_content": page_content, + "metadata": { + "pebblo_semantic_topics": ["financial-report"], + "pebblo_semantic_entities": ["us-bank-account-number"], + "page": 0, + "source": "https://drive.google.com/file/d/xxxxxxxxxxxxx/view", + "title": "ACME Corp Financial Report.pdf", + }, + } + ) +] + +print("Loading vectordb...") + +vectordb = Qdrant.from_documents( + documents, + embeddings, + location=":memory:", + collection_name=collection_name, +) + +print("Vectordb loaded.") +``` + +## Retrieval with Semantic Enforcement + +The PebbloRetrievalQA chain uses SafeRetrieval to ensure that the snippets used in context are retrieved only from documents that comply with the +provided semantic context. +To achieve this, the Gen-AI application must provide a semantic context for this retrieval chain. +This `semantic_context` should include the topics and entities that should be denied for the user accessing the Gen-AI app. + +Below is a sample code for PebbloRetrievalQA with `topics_to_deny` and `entities_to_deny`. These are passed in `semantic_context` to the chain input. 
+ +```python +from typing import Optional, List +from langchain_community.chains import PebbloRetrievalQA +from langchain_community.chains.pebblo_retrieval.models import ( + ChainInput, + SemanticContext, +) + +# Initialize PebbloRetrievalQA chain +qa_chain = PebbloRetrievalQA.from_chain_type( + llm=llm, + app_name="pebblo-semantic-retriever-rag", + owner="Joe Smith", + description="Semantic filtering using PebbloSafeLoader, and PebbloRetrievalQA", + chain_type="stuff", + retriever=vectordb.as_retriever(), + verbose=True, +) + + +def ask( + question: str, + topics_to_deny: Optional[List[str]] = None, + entities_to_deny: Optional[List[str]] = None, +): + """ + Ask a question to the PebbloRetrievalQA chain + """ + semantic_context = dict() + if topics_to_deny: + semantic_context["pebblo_semantic_topics"] = {"deny": topics_to_deny} + if entities_to_deny: + semantic_context["pebblo_semantic_entities"] = {"deny": entities_to_deny} + + semantic_context_obj = ( + SemanticContext(**semantic_context) if semantic_context else None + ) + chain_input_obj = ChainInput(query=question, semantic_context=semantic_context_obj) + return qa_chain.invoke(chain_input_obj.dict()) +``` + +## Ask questions + +### Without semantic enforcement + +Since no semantic enforcement is applied, the system should return the answer. + +```python +topic_to_deny = [] +entities_to_deny = [] +question = "Please share the financial performance of ACME Corp for 2020" +resp = ask(question, topics_to_deny=topic_to_deny, entities_to_deny=entities_to_deny) +print( + f"Topics to deny: {topic_to_deny}\nEntities to deny: {entities_to_deny}\n" + f"Question: {question}\nAnswer: {resp['result']}\n" +) +``` + +Output: + +```bash +Topics to deny: [] +Entities to deny: [] +Question: Please share the financial performance of ACME Corp for 2020 +Answer: +ACME Corp had a strong financial performance in 2020, with a 15% increase in revenue to $50 million and a net profit of $12 million, +indicating a healthy margin of 24%. 
The company also saw a 20% growth in total assets, reaching $80 million. +ACME Corp maintained a conservative debt-to-equity ratio of 0.5, ensuring financial stability. +The company has plans to capitalize on emerging opportunities in the global M&A landscape and is committed to delivering value +to shareholders while maintaining ethical business practices. +``` + +### Deny financial-report topic + +Data has been ingested with the topics: ["financial-report"]. +Therefore, an app that denies the "financial-report" topic should not receive an answer. + +```python +topic_to_deny = ["financial-report"] +entities_to_deny = [] +question = "Please share the financial performance of ACME Corp for 2020" +resp = ask(question, topics_to_deny=topic_to_deny, entities_to_deny=entities_to_deny) +print( + f"Topics to deny: {topic_to_deny}\nEntities to deny: {entities_to_deny}\n" + f"Question: {question}\nAnswer: {resp['result']}\n" +) +``` + +Output: + +```bash +Topics to deny: ['financial-report'] +Entities to deny: [] +Question: Please share the financial performance of ACME Corp for 2020 +Answer: Unfortunately, I do not have access to that information. +``` + +### Deny us-bank-account-number entity + +Since the entity "us-bank-account-number" is denied, the system should not return the answer. + +```python +topic_to_deny = [] +entities_to_deny = ["us-bank-account-number"] +question = "Please share the financial performance of ACME Corp for 2020" +resp = ask(question, topics_to_deny=topic_to_deny, entities_to_deny=entities_to_deny) +print( + f"Topics to deny: {topic_to_deny}\nEntities to deny: {entities_to_deny}\n" + f"Question: {question}\nAnswer: {resp['result']}\n" +) +``` + +Output: + +```bash +Topics to deny: [] +Entities to deny: ['us-bank-account-number'] +Question: Please share the financial performance of ACME Corp for 2020 +Answer: Unfortunately, I do not have access to that information.
+``` diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/safe_loader.md b/docs/gh_pages/versioned_docs/version-0.1.18/safe_loader.md new file mode 100644 index 00000000..fb95b37a --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/safe_loader.md @@ -0,0 +1,68 @@ +# Pebblo UI + +Pebblo UI provides in-depth visibility into Gen-AI RAG applications, covering the documents ingested during every load and the retrievals performed through them. + +The Pebblo server listens on `localhost:8000` to accept Gen-AI application data snippets for inspection and reporting. +The Pebblo UI is available at `http://localhost:8000/pebblo`. + +This document describes the information displayed on the interface. + +![Pebblo UI](../../static/img/pebblo-ui.jpeg) + +# Safe Loader Tab + +This section provides details about the documents ingested into all Gen-AI RAG applications during every load. + +### Overview Page + +This page consists of 4 primary tabs that provide the following details: + +1. **Applications With Findings**: + The number signifies the proportion of applications with findings out of the total active applications. Additionally, it will present you with a detailed list of these applications, including the count of findings (Topics + Entities), the name of the owner, and the option to download the PDF report for each application. + +2. **Findings**: + The figure denotes the cumulative count of Topics and Entities identified across all applications. It will also furnish you with a comprehensive list of these Topics and Entities, along with supplementary information including the count of source documents they originate from, the Datasource, and the name of the Application. + +3. **Documents with Findings**: + The number of documents that have one or more Findings over the total number of documents used in document load across all the applications.
This field indicates the number of documents that need to be inspected to remediate any text that potentially needs to be removed and/or cleaned for Gen-AI inference. + + It will also provide you with a list of these documents, accompanied by additional details such as the file size, the owner's name, the count of topics & entities within each file, and the name of the Datasource. + +4. **Datasource**: + The number of data sources used to load documents into the Gen-AI RAG applications. For example, this field will be two if a RAG application loads data from two different directories or two different AWS S3 buckets. + + It will also provide you with a list of these data sources, accompanied by additional details such as the size, source path, the count of topics & entities across the datasource, and the Application they are associated with. + +### Application Details Page + +You will be directed to the application details page by clicking on any application from the list available in the `Applications With Findings` tab on the overview page. + +**Instance Details**: +This section provides a quick glance at where the RAG application is physically running, such as on a laptop (Mac OSX) or a Linux VM, and related properties such as IP address, local filesystem path, and Python version. + +**Download Report**: +Downloads the data report of the application in PDF format. + +**Load History**: +The table provides the history of findings and path to the reports for the previous loads of the same RAG application. + +Load History provides details about the latest 5 loads of this app. It provides the following details: + +1. **Report Name** - The path to the report file. +2. **Findings** - The number of findings identified in the report. +3. **Documents With Findings** - The number of documents containing findings. +4. **Generated On** - The timestamp when the report was generated, shown in the local time zone. + +**Report Summary**: Report Summary has 4 primary tabs: + +1.
**Findings**: The figure denotes the cumulative count of Topics and Entities identified in the application. It will also furnish you with a comprehensive list of these Topics and Entities, along with supplementary information including the count of source documents they originate from, and the Datasource name. + +2. **Documents with Findings**: The number of documents that have one or more Findings over the total number of documents used in document load across the application. This field indicates the number of documents that need to be inspected to remediate any text that potentially needs to be removed and/or cleaned for Gen-AI inference. + + It will also provide you with a list of these documents, accompanied by additional details such as the file size, the owner's name, the count of topics & entities within each file, and the name of the Datasource. + +3. **Datasource**: The number of data sources used to load documents into the Gen-AI RAG applications. For example, this field will be two if a RAG application loads data from two different directories or two different AWS S3 buckets. + + It will also provide you with a list of these data sources, accompanied by additional details such as the size, source path, and the count of topics & entities across the datasource. + +4. **Snippets**: This section details the text analyzed by the Pebblo Server using the Pebblo Topic Classifier and Pebblo Entity Classifier. It is designed to help quickly inspect and remediate text that should not be ingested into the Gen-AI RAG application. Each snippet shows the exact file for easy reference, with sensitive information labeled with confidence scores: HIGH, MEDIUM, or LOW.
diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/safe_loader_samples.md b/docs/gh_pages/versioned_docs/version-0.1.18/safe_loader_samples.md new file mode 100644 index 00000000..0e11c2b2 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/safe_loader_samples.md @@ -0,0 +1,7 @@ +# Pebblo Safe Loader Samples + +This section provides details about the safe loader samples for all loader-based applications. + +## Index + +1. [Google Drive-Qdrant Safe Rag Sample App](#google-drive-qdrant-safe-rag-sample-app) \ No newline at end of file diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/safe_retriever.md b/docs/gh_pages/versioned_docs/version-0.1.18/safe_retriever.md new file mode 100644 index 00000000..12288e79 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/safe_retriever.md @@ -0,0 +1,68 @@ +# Safe Retriever Tab + +This section provides details about the retrievals for all retrieval-based applications. + +![Pebblo UI](../../static/img/pebblo-saferetriever-ui.png) + +## Overview Page + +This page consists of 5 primary tabs that provide the following details: + +1. **Applications**: + The number signifies the total number of retrieval type applications. + + The below section lists all retrieval type applications along with details like Application Name, Owner, Retrievals i.e. cumulative count of retrievals, Active Users, Documents, VectorDBs. + +2. **Retrievals**: + The figure denotes the cumulative count of retrievals that happened across all the apps. + +3. **Prompts with Findings**: + The number of prompts that have one or more findings over the total number of prompts used during the retrieval process. This field indicates the number of prompts that need to be inspected for Gen-AI inference. + +4. **Active Users**: + The number of active users for all the retrieval type applications. + +5. **Violations**: + Coming Soon!
+ +## Application Details Page + +You will be directed to the application details page by clicking on any application from the list available in the `Applications` tab on the overview page. + +**Instance Details**: +This section provides a quick glance at where the RAG application is physically running, such as on a laptop (Mac OSX) or a Linux VM, and related properties such as IP address, local filesystem path, and Python version. + +This page consists of 5 primary tabs that provide the following details: + +1. **Retrievals**: The figure denotes the cumulative count of retrievals that happened for this app. + This section provides details about all retrievals with information such as: + + **Prompt** : The prompt sent by the end user, who sent it, and when it was sent. + + **Findings**: One or more key findings extracted from the prompt. + + **Context** : What the context was and which vector DB was used. + + **Response**: This is the final response generated by the LLM for the given prompt. + + **Retrieved From** : Source file of this context. + +2. **Prompts with Findings**: The number of prompts that have one or more findings over the total number of prompts for the given application. + +3. **Active Users**: The number of active users for this application will be provided, accompanied by a list of these users. This list will include additional details such as the retrieval count for each user and the last time they accessed the application. + +4. **Documents**: The number of documents accessed for retrievals will be provided, accompanied by a list of these documents. This list will include supplementary details such as the owner's name, retrieval count for each document, and the most recent access time for information from each document. + +5. **Vector Databases**: The number of vector databases used in retrievals for this application. + Details within this tab are coming soon.
+ +## Prompt With Findings Details Page + +This page shows all the prompts with findings for all the applications. + +1. **Entity Name**: The name of the entity which is present in the prompts. +2. **Prompts**: Number of prompts in which the entity was detected. +3. **Users**: Users who have used this entity in the prompt. +4. **Apps** : Applications in which the entity is present. + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/safe_retriever_samples.md b/docs/gh_pages/versioned_docs/version-0.1.18/safe_retriever_samples.md new file mode 100644 index 00000000..8af2f53c --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/safe_retriever_samples.md @@ -0,0 +1,62 @@ +# Pebblo Safe Retriever Samples + +This section provides samples of Pebblo Safe Retriever enabled RAG applications. + +## Index + +- **Pebblo Safe RAG** + 1. [Google Drive-Qdrant Safe RAG Sample App](#google-drive-qdrant-safe-rag-sample-app) + 2. [Sharepoint-Qdrant Safe RAG Sample App](#sharepoint-qdrant-safe-rag-sample-app) + 3. [Sharepoint-Postgres Safe RAG Sample App](#sharepoint-postgres-safe-rag-sample-app) +- **Identity RAG** + 1. [Google Drive-Qdrant Identity RAG Sample App](#google-drive-qdrant-identity-rag-sample-app) +- **Semantic RAG** + 1. [Google Drive-Qdrant Semantic RAG Sample App](#google-drive-qdrant-semantic-rag-sample-app) + +## Pebblo Safe RAG + +### Google Drive-Qdrant Safe RAG Sample App + +**Description:** +This section provides a sample of Pebblo Safe Retriever enabled RAG applications loading documents from `GoogleDrive` folder and storing in `Qdrant` VectorDB collection.
+ +**Link:** +[Google Drive-Qdrant Pebblo Safe RAG Sample App](https://github.com/daxa-ai/pebblo/tree/main/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-qdrant) + +### Sharepoint-Qdrant Safe RAG Sample App + +**Description:** +This section provides a sample of Pebblo Safe Retriever enabled RAG applications loading documents from `Sharepoint` site and storing in `Qdrant` VectorDB collection. + +**Link:** +[Sharepoint-Qdrant Pebblo Safe RAG Sample App](https://github.com/daxa-ai/pebblo/tree/main/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant) + +### Sharepoint-Postgres Safe RAG Sample App + +**Description:** +This section provides a sample of Pebblo Safe Retriever enabled RAG applications loading documents from `Sharepoint` site and storing in `Postgres (pgvector)` VectorDB collection. + +**Link:** +[Sharepoint-Postgres Pebblo Safe RAG Sample App](https://github.com/daxa-ai/pebblo/tree/main/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres) + +## Identity RAG + +### Google Drive-Qdrant Identity RAG Sample App + +**Description:** +This section provides an identity-filtering focused sample of Pebblo Safe Retriever enabled RAG applications loading documents from `GoogleDrive` folder and storing in `Qdrant` VectorDB collection. + +**Link:** +[Google Drive-Qdrant Identity RAG Sample App](https://github.com/daxa-ai/pebblo/tree/main/pebblo_saferetriever/langchain/identity-rag/googledrive-qdrant) + +## Semantic RAG + +### Google Drive-Qdrant Semantic RAG Sample App + +**Description:** +This section provides a semantic-filtering focused sample of Pebblo Safe Retriever enabled RAG applications loading documents from `GoogleDrive` folder and storing in `Qdrant` VectorDB collection.
+ +**Link:** +[Google Drive-Qdrant Semantic RAG Sample App](https://github.com/daxa-ai/pebblo/tree/main/pebblo_saferetriever/langchain/semantic-rag/googledrive-qdrant) + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/topicclassifier.md b/docs/gh_pages/versioned_docs/version-0.1.18/topicclassifier.md new file mode 100644 index 00000000..4d807c2e --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/topicclassifier.md @@ -0,0 +1,33 @@ +# Pebblo Topic Classifier + +`Pebblo topic classifier` is designed to analyze loader source files and accurately identify the underlying +topics they contain. It uses a machine learning model meticulously trained to identify and categorize topics within textual data. This model is open for contribution from the open-source community, allowing for collaborative enhancements +and improvements to enrich its accuracy and effectiveness. + +# Topics Supported By Pebblo Topic Classifier + +Below is the list of `topics` supported by Pebblo - + +1. Medical Advice +1. Harmful Advice +1. Board Meeting +1. Consulting Agreement +1. Customer List +1. Enterprise Agreement +1. Executive Severance Agreement +1. Financial Report +1. Loan And Security Agreement +1. Merger Agreement +1. Patent Application Fillings +1. Price List +1. Employee Agreement +1. Sexual Content +1. Sexual Incident Report +1. Internal Product Roadmap Agreement + +Users can get details of classified topics for their loader source files in the Pebblo report. +Different sections of the Pebblo report, such as `Top Files With Most Findings`, `Data Source Findings Table` and `Snippets`, help users get an overview of the Pebblo topic classifier output for their RAG application.
+ +For more details refer - [Reports](reports.md) + + diff --git a/docs/gh_pages/versioned_docs/version-0.1.18/troubleshooting.md b/docs/gh_pages/versioned_docs/version-0.1.18/troubleshooting.md new file mode 100644 index 00000000..ed20b665 --- /dev/null +++ b/docs/gh_pages/versioned_docs/version-0.1.18/troubleshooting.md @@ -0,0 +1,17 @@ +# Troubleshooting Guide + +## Issues found while running pebblo server, in conda virtual env. +### 1. OSError: cannot load library 'pango-1.0-0' + Install pango package in conda env + + `conda install -c anaconda pango` + +### 2. OSError: cannot load library 'gobject-2.0-0': gobject-2.0-0: cannot open shared object file + Install libpango binaries + + ```bash + sudo apt-get update + sudo apt-get install libpango1.0-0 + ``` + + diff --git a/docs/gh_pages/versioned_sidebars/version-0.1.18-sidebars.json b/docs/gh_pages/versioned_sidebars/version-0.1.18-sidebars.json new file mode 100644 index 00000000..a684398a --- /dev/null +++ b/docs/gh_pages/versioned_sidebars/version-0.1.18-sidebars.json @@ -0,0 +1,125 @@ +{ + "sidebar": [ + { + "type": "doc", + "id": "introduction", + "label": "Overview" + }, + { + "type": "doc", + "id": "installation", + "label": "Installation" + }, + { + "type": "doc", + "id": "development", + "label": "Development Environment" + }, + { + "type": "doc", + "id": "config", + "label": "Configuration" + }, + { + "type": "category", + "label": "Pebblo", + "items": [ + { + "type": "doc", + "label": "Server", + "id": "daemon" + }, + { + "type": "doc", + "label": "Entity Classifier", + "id": "entityclassifier" + }, + { + "type": "doc", + "label": "Topic Classifier", + "id": "topicclassifier" + } + ] + }, + { + "type": "category", + "label": "Pebblo UI", + "items": [ + { + "type": "doc", + "label": "Safe Loader", + "id": "safe_loader" + }, + { + "type": "doc", + "label": "Safe Retriever", + "id": "safe_retriever" + } + ] + }, + { + "type": "category", + "label": "LangChain", + "items": [ + { + "type": "doc", + "id": 
"rag", + "label": "Safe DataLoader" + }, + { + "type": "doc", + "id": "retrieval_chain", + "label": "Safe Retriever" + } + ] + }, + { + "type": "category", + "label": "LlamaIndex", + "items": [ + { + "type": "doc", + "label": "Safe DataReader", + "id": "llama_index_safe_reader" + } + ] + }, + { + "type": "doc", + "id": "reports", + "label": "Reports" + }, + { + "type": "category", + "label": "Samples", + "items": [ + { + "type": "category", + "label": "Safe Loader Samples", + "items": [ + { + "type": "link", + "label": "1. Google Drive-Qdrant Safe Loader Sample", + "href": "https://github.com/daxa-ai/pebblo/tree/main/pebblo_safeloader/langchain/identity-rag" + }, + { + "type": "link", + "label": "2. CSV Loader-Chroma Safe Loader Sample", + "href": "https://github.com/daxa-ai/pebblo/tree/main/pebblo_safeloader/langchain/acme-corp-rag" + } + ] + }, + { + "type": "doc", + "label": "Safe Retriever Samples", + "id": "safe_retriever_samples" + } + ] + }, + { + "type": "doc", + "id": "troubleshooting", + "label": "Troubleshooting Guide" + } + ] +} diff --git a/docs/gh_pages/versions.json b/docs/gh_pages/versions.json index 4fb65465..cc7a2eed 100644 --- a/docs/gh_pages/versions.json +++ b/docs/gh_pages/versions.json @@ -1 +1 @@ -["current", "0.1.17"] +["current", "0.1.18", "0.1.17"]