From 5ba91fe0652d7026a5fe9a62a30c527acf83659b Mon Sep 17 00:00:00 2001 From: tlane25 Date: Tue, 6 Aug 2024 07:40:20 -0400 Subject: [PATCH 01/10] Moved random tests into their own repo --- random_tests/docdbtest/README.md | 28 ++++++ random_tests/docdbtest/env.template | 2 + random_tests/docdbtest/files/file1.txt | 6 ++ random_tests/docdbtest/files/file2.txt | 2 + random_tests/docdbtest/list_vectors.py | 23 +++++ random_tests/docdbtest/load_vectors.py | 35 +++++++ random_tests/docdbtest/store_vectors.py | 32 ++++++ random_tests/docdbtest/test_nodb.py | 12 +++ random_tests/hybridTest.py | 21 ++++ random_tests/pipelineTest.py | 114 +++++++++++++++++++++ random_tests/promptTest.py | 41 ++++++++ random_tests/setup_test.py | 127 ++++++++++++++++++++++++ random_tests/test_kb.py | 31 ++++++ 13 files changed, 474 insertions(+) create mode 100644 random_tests/docdbtest/README.md create mode 100644 random_tests/docdbtest/env.template create mode 100644 random_tests/docdbtest/files/file1.txt create mode 100644 random_tests/docdbtest/files/file2.txt create mode 100644 random_tests/docdbtest/list_vectors.py create mode 100644 random_tests/docdbtest/load_vectors.py create mode 100644 random_tests/docdbtest/store_vectors.py create mode 100644 random_tests/docdbtest/test_nodb.py create mode 100644 random_tests/hybridTest.py create mode 100644 random_tests/pipelineTest.py create mode 100644 random_tests/promptTest.py create mode 100644 random_tests/setup_test.py create mode 100644 random_tests/test_kb.py diff --git a/random_tests/docdbtest/README.md b/random_tests/docdbtest/README.md new file mode 100644 index 0000000..3b2e417 --- /dev/null +++ b/random_tests/docdbtest/README.md @@ -0,0 +1,28 @@ +# test files for connecting with AWS DocumentDB instance +- a set of test files to confirm a connection with AWS DocumentDB and vector embedding persistence +- Note: the LlamaIndex DocDB integration uses `pymongo` + - when running the various files, there may be errors from the pymongo 
package, but these do not seem to impact vector storage / retrieval + + + +## to use +- create a `.env` file (can use env.template as a starter) + - MONGO_URI will come from the AWS Console for DocumentDB +- ensure you download the global-bundle.pem from the AWS Console +- ensure you've run `pipenv shell` in the root project folder + + + +## overview of files +- `test_nodb.py` : will create a VectorStoreIndex from the same files with no persistence + - note that running this file should give you a 'baseline' of how llamaIndex will perform + - note also that since being first created, it appears an underlying LangChain method has changed and is now deprecated (as of Jul 21, 2024) + +- `store_vectors.py` : this will vectorize the same files and store the index in DocDB + +- `list_vectors.py` : this program queries the contents of the 'testdb', 'testcollection' directly and prints the values to screen + - note: you can pipe the output to a file to examine output more closely + - e.g., `python list_vectors.py > output.txt` + +- `load_vectors.py` : this will load the vector embeddings from DocDB and then run a query against them + diff --git a/random_tests/docdbtest/env.template b/random_tests/docdbtest/env.template new file mode 100644 index 0000000..7f14155 --- /dev/null +++ b/random_tests/docdbtest/env.template @@ -0,0 +1,2 @@ +OPENAI_API_KEY= +MONGO_URI= diff --git a/random_tests/docdbtest/files/file1.txt b/random_tests/docdbtest/files/file1.txt new file mode 100644 index 0000000..13f6cc4 --- /dev/null +++ b/random_tests/docdbtest/files/file1.txt @@ -0,0 +1,6 @@ +In the mystical land of Rudenza: +Apples are purple. +Clouds are silver, tinged with gold and bronze. +Pianos will bite your fingers if you don't wear stripes when you practice. 
+ + diff --git a/random_tests/docdbtest/files/file2.txt b/random_tests/docdbtest/files/file2.txt new file mode 100644 index 0000000..db01e83 --- /dev/null +++ b/random_tests/docdbtest/files/file2.txt @@ -0,0 +1,2 @@ +PotatoPecanPie is the secret word. +James' favourite food is pizza. diff --git a/random_tests/docdbtest/list_vectors.py b/random_tests/docdbtest/list_vectors.py new file mode 100644 index 0000000..ba5f050 --- /dev/null +++ b/random_tests/docdbtest/list_vectors.py @@ -0,0 +1,23 @@ +# this mini program is to list the vectors within the documentDB instance that were written by llamaIndex +# Note: the db and collection as named below - these can be changed when the vector_store is instantiated + + +import pymongo +import pprint +from dotenv import load_dotenv +import os + +load_dotenv(override=True) + +mongo_uri = os.environ["MONGO_URI"] +client = pymongo.MongoClient(mongo_uri) + + +db = client[os.environ["DOCDB_NAME"]] +collection = db[os.environ["DOCDB_COLLECTION"]] + +for post in collection.find(): + pprint.pprint(post) + + +print('==========') diff --git a/random_tests/docdbtest/load_vectors.py b/random_tests/docdbtest/load_vectors.py new file mode 100644 index 0000000..2ac3755 --- /dev/null +++ b/random_tests/docdbtest/load_vectors.py @@ -0,0 +1,35 @@ +# based loosely upon LlamaIndex demo +# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ +# +# key goal here was to retrieve the stored vectors from DocumentDB rather than re-create them + + + +import pymongo +from dotenv import load_dotenv + +from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore +from llama_index.core import VectorStoreIndex +from llama_index.core import StorageContext +# from llama_index.core import SimpleDirectoryReader +import os + +load_dotenv(override=True) + +mongo_uri = os.environ["MONGO_URI"] +mongodb_client = pymongo.MongoClient(mongo_uri) +store = AWSDocDbVectorStore(mongodb_client, db_name='testdb', collection_name='testcollection') 
+storage_context = StorageContext.from_defaults(vector_store=store) + + + +index = VectorStoreIndex.from_vector_store( + vector_store=store, + storage_context=storage_context +) + + +response = index.as_query_engine().query('Tell me about Rudenza') +print(f"{response}") + + diff --git a/random_tests/docdbtest/store_vectors.py b/random_tests/docdbtest/store_vectors.py new file mode 100644 index 0000000..fd16d01 --- /dev/null +++ b/random_tests/docdbtest/store_vectors.py @@ -0,0 +1,32 @@ +# based upon LlamaIndex demo +# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ + + +import pymongo +from dotenv import load_dotenv + +from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore +from llama_index.core import VectorStoreIndex +from llama_index.core import StorageContext +from llama_index.core import SimpleDirectoryReader +import os + +load_dotenv(override=True) + + +mongo_uri = os.environ["MONGO_URI"] +mongodb_client = pymongo.MongoClient(mongo_uri) +store = AWSDocDbVectorStore(mongodb_client, db_name='testdb', collection_name='testcollection') +storage_context = StorageContext.from_defaults(vector_store=store) + +documents = SimpleDirectoryReader("files").load_data() + +index = VectorStoreIndex.from_documents( + documents, storage_context=storage_context +) + + +response = index.as_query_engine().query('Tell me about Rudenza') +print(f"{response}") + + diff --git a/random_tests/docdbtest/test_nodb.py b/random_tests/docdbtest/test_nodb.py new file mode 100644 index 0000000..047ae53 --- /dev/null +++ b/random_tests/docdbtest/test_nodb.py @@ -0,0 +1,12 @@ +# Testing vector store - no persistence + +from dotenv import load_dotenv +from llama_index.core import VectorStoreIndex, SimpleDirectoryReader + +load_dotenv() + +documents = SimpleDirectoryReader("files").load_data() + +index = VectorStoreIndex.from_documents(documents) + +print("Index created successfully!") diff --git a/random_tests/hybridTest.py b/random_tests/hybridTest.py new 
file mode 100644 index 0000000..e845cb0 --- /dev/null +++ b/random_tests/hybridTest.py @@ -0,0 +1,21 @@ +import hybridSearch.search as search + +def print_nodes(nodes): + for node in nodes: + print(node) + + +kb_file_path = './tmpfiles/giraffes.pdf' +search.hybrid_write('giraffes', kb_file_path) # only need to do this the first time + +query = 'how long are giraffe necks?' + +# get nodes +nodes = search.hybrid_get_nodes('giraffes', query, top_k=5) + +all_nodes = nodes['keyword'] + nodes['vector'] + +print_nodes(all_nodes) + + + diff --git a/random_tests/pipelineTest.py b/random_tests/pipelineTest.py new file mode 100644 index 0000000..5b3c70e --- /dev/null +++ b/random_tests/pipelineTest.py @@ -0,0 +1,114 @@ +from llama_index.core import QueryBundle +from llama_index.core.postprocessor import SimilarityPostprocessor +from llama_index.core.postprocessor import LongContextReorder +from llama_index.postprocessor.colbert_rerank import ColbertRerank +from llama_index.core import get_response_synthesizer, PromptTemplate +from llama_index.core.response_synthesizers import ResponseMode + +import hybridSearch.search as search + +def print_nodes(nodes): + for node in nodes: + print(node) + + +query = 'tell me about promises' + +# get all nodes +nodes = search.hybrid_get_nodes(query, top_k=3) +all_nodes = nodes['keyword'] + nodes['vector'] + + +# similarity +similarity_pp = SimilarityPostprocessor( + nodes=all_nodes, + similarity_cutoff=0.5 +) + +nodes_similar = similarity_pp.postprocess_nodes(all_nodes) + + + + + +# Colbert rerank +reranker = ColbertRerank(top_n=4) +query_bundle = QueryBundle(query) + +nodes_rerank = reranker.postprocess_nodes(all_nodes, query_bundle) + +print('='*20) +print_nodes(nodes_rerank) + + + +# LongContextReorder +reorder = LongContextReorder() + +nodes_reorder = reorder.postprocess_nodes(nodes_rerank) + +print('='*20) +print_nodes(nodes_reorder) + + + +# Response synthesizer +synth = get_response_synthesizer( + 
response_mode=ResponseMode.SIMPLE_SUMMARIZE +) + +response = synth.synthesize(query, nodes=nodes_reorder) +print(response) + +print('*'*20) + + +# Custom Prompt +new_prompt = ( + "Context information is below.\n" + "-----------------------------\n" + "{context_str}\n" + "-----------------------------\n" + "Given the context information and not prior knowledge, " + "answer the query in French.\n" + "Query: {query_str}\n" + "Answer: " +) +new_template = PromptTemplate(new_prompt) + +synth.update_prompts( + {"text_qa_template": new_template} +) + +response = synth.synthesize(query, nodes=nodes_reorder) +print(response) + + + + + +''' +Notes: + +incorporate post-processing modules: +- created `pipelineTest.py` based upon ‘hybridTest.py’ +- added similarity +- adding ColbertRerank +- found Colbert import statement from https://docs.llamaindex.ai/en/stable/examples/pipeline/query_pipeline_memory/?h=colbertr +- found reranker syntax from https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/LLMReranker-Lyft-10k/?h=reranker +- adding LongContextReorder +- https://docs.llamaindex.ai/en/stable/module_guides/querying/node_postprocessors/node_postprocessors/?h= + +- post-processing modules all seem to work +- need to go from nodes to query response now +- llamaindex uses a “response synthesizer” +- https://docs.llamaindex.ai/en/stable/api_reference/response_synthesizers/ +- “simple_summarize” merges all text chunks from nodes into 1 and makes an LLM call +- it will fail if the merged text chunk exceeds the context window size + +- Accessing and customizing prompts +- https://docs.llamaindex.ai/en/stable/examples/prompts/prompt_mixin/ +- `synthesizer.get_prompts()` returns a dictionary of prompts +- key is a template (e.g., “text_qa_template”) +- see promptTest.py to access returned dict and display prompt content +''' \ No newline at end of file diff --git a/random_tests/promptTest.py b/random_tests/promptTest.py new file mode 100644 index 0000000..0b2820a --- 
/dev/null +++ b/random_tests/promptTest.py @@ -0,0 +1,41 @@ + +from llama_index.core import get_response_synthesizer, PromptTemplate +from llama_index.core.response_synthesizers import ResponseMode + + +def display_prompts(prompts_dict): + for k, p in prompts_dict.items(): + print(f"Prompt Key: {k}") + print("Text: ") + print(p.get_template()) + print("-"*30) + + +synth = get_response_synthesizer( + response_mode=ResponseMode.SIMPLE_SUMMARIZE +) + +prompt = synth.get_prompts() + +display_prompts(prompt) + + +new_prompt = ( + "Context information is below.\n" + "-----------------------------\n" + "{context_str}\n" + "-----------------------------\n" + "Given the context information and not prior knowledge, " + "answer the query in French.\n" + "Query: {query_str}\n" + "Answer: " +) +new_template = PromptTemplate(new_prompt) + +synth.update_prompts( + {"text_qa_template": new_template} +) + +prompt = synth.get_prompts() + +display_prompts(prompt) diff --git a/random_tests/setup_test.py b/random_tests/setup_test.py new file mode 100644 index 0000000..d11caa1 --- /dev/null +++ b/random_tests/setup_test.py @@ -0,0 +1,127 @@ +# helper scripts to populate test kbs +import os + +import pymongo +from dotenv import load_dotenv + +import hybridSearch.search as search + +load_dotenv(override=True) + +mongo_uri = os.environ["MONGO_URI"] +mongo = pymongo.MongoClient(mongo_uri) + + +# kb1 +kb1_file_path = './tmpfiles/AsyncJS.md' +search.keyword_write('kb1', kb1_file_path) +search.vector_write('kb1', kb1_file_path) + +#kb2 +kb2_file_path = './tmpfiles/cpumemory.pdf' +search.hybrid_write('kb2', kb2_file_path) + +#kb3 +kb3_file_path = './tmpfiles/newfile.txt' +search.hybrid_write('kb3', kb3_file_path) + + +# config db setup + +kb_config1 = { + 'id': 'kb1', + 'name': 'AsyncJS', + 'files': [{ 'filename': './tmpfiles/AsyncJS.md'}], + 'ingest': { + 'method': 'simple_ingest', + 'splitter': { + 'type': 'sentence', + 'chunk_size': '', + 'chunk_overlap': '', + 'separator': '', + }, + }, + 
'embedding_model': 'gpt-3.5-turbo', + 'vector_store': { + 'name': 'idstring', + 'collection': 'vector_index', + }, + 'keyword_store': { + 'name': 'idstring', + 'collections': ['docstore/ref_doc_info', 'docstore/data', 'docstore/metadata'] + } +} + +kb_config2 = kb_config1.copy() +kb_config2['id'] = 'kb2' +kb_config2['name'] = 'cpumemory' +kb_config2['files'] = [{ 'filename': './tmpfiles/cpumemory.pdf'}] + + +kb_config3 = kb_config1.copy() +kb_config3['id'] = 'kb3' +kb_config3['name'] = 'newfile' +kb_config3['files'] = [{ 'filename': './tmpfiles/newfile.txt'}] + + +config_db = mongo[ os.environ["CONFIG_DB"] ] +config_kb_col = config_db[ os.environ["CONFIG_KB_COL"] ] +config_kb_col.insert_one(kb_config1) +config_kb_col.insert_one(kb_config2) +config_kb_col.insert_one(kb_config3) + + + + +# Pipeline config + +pipeline_config1 = { + 'id': 'pipeline1', + 'name': 'pipelineConfigName', + 'knowledgebases': ['kb1', 'kb2', 'kb3'], + 'retrieval': { + 'vector': 'llm_model_name', + }, + 'postprocessing': { + 'similarity': { + 'on': False, + 'similarity_cutoff': 0.7 + }, + 'colbertRerank': { + 'on': False, + 'top_n': 5 + }, + 'longContextReorder': { + 'on': True, + } + }, + 'generative_model': 'gpt-3.5-turbo', + 'prompt': { + 'on': True, + 'template_str': 'answer the question - {query_str} - in French' + } +} + +pipeline_config2 = pipeline_config1.copy() +pipeline_config2['id'] = 'pipeline2' +pipeline_config2['name'] = 'kb1 only (async)' +pipeline_config2['knowledgebases'] = ['kb1'] + + +pipeline_config3 = pipeline_config1.copy() +pipeline_config3['id'] = 'pipeline3' +pipeline_config3['name'] = 'kb2 only (cpumemory)' +pipeline_config3['knowledgebases'] = ['kb2'] + +pipeline_config4 = pipeline_config1.copy() +pipeline_config4['id'] = 'pipeline4' +pipeline_config4['name'] = 'kb1 (async) and kb2 (cpumemory)' +pipeline_config4['knowledgebases'] = ['kb2', 'kb1'] + +config_pipeline_col = config_db[ os.environ["CONFIG_PIPELINE_COL"] ] +config_pipeline_col.insert_one(pipeline_config1) 
+config_pipeline_col.insert_one(pipeline_config2) +config_pipeline_col.insert_one(pipeline_config3) +config_pipeline_col.insert_one(pipeline_config4) + + diff --git a/random_tests/test_kb.py b/random_tests/test_kb.py new file mode 100644 index 0000000..063a32e --- /dev/null +++ b/random_tests/test_kb.py @@ -0,0 +1,31 @@ +import json + +import refactor1.db.knowledge_base.kb_config as kbClass + +kb_name = 'giraffe2' + +config_template = { + "id": kb_name, + "kb_name": kb_name, + "ingest_method": "Simple", + "splitter": "Sentence", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "chunk_size": 1024, + "chunk_overlap": 200 + } +} + +json_config = json.dumps(config_template) +print(json_config) + +kbClass.KnowledgeBase.create(json_config) + +kb = kbClass.KnowledgeBase('giraffe2') +kb.ingest_file_path('./tmpfiles/giraffes.pdf') + + + From b9458dac3101ccf571e5e3e2b4a30d8d6ad9f8f2 Mon Sep 17 00:00:00 2001 From: tlane25 Date: Tue, 6 Aug 2024 07:43:57 -0400 Subject: [PATCH 02/10] update: began migration of knowledge base functionality into module for integration with lambda --- knowledge_base/__init__.py | 0 knowledge_base/kb_config.py | 303 ++++++++++++++++++++++++++ knowledge_base/kb_constants.py | 141 ++++++++++++ knowledge_base/kb_json.py | 46 ++++ knowledge_base/kb_pipfile.txt | 15 ++ knowledge_base/kb_type_definitions.py | 49 +++++ knowledge_base/lp_ingest.py | 73 +++++++ knowledge_base/mongo_helper.py | 54 +++++ knowledge_base/simple_ingest.py | 34 +++ 9 files changed, 715 insertions(+) create mode 100644 knowledge_base/__init__.py create mode 100644 knowledge_base/kb_config.py create mode 100644 knowledge_base/kb_constants.py create mode 100644 knowledge_base/kb_json.py create mode 100644 knowledge_base/kb_pipfile.txt create mode 100644 knowledge_base/kb_type_definitions.py create mode 100644 knowledge_base/lp_ingest.py create mode 100644 knowledge_base/mongo_helper.py create mode 100644 
knowledge_base/simple_ingest.py diff --git a/knowledge_base/__init__.py b/knowledge_base/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/knowledge_base/kb_config.py b/knowledge_base/kb_config.py new file mode 100644 index 0000000..94e94f6 --- /dev/null +++ b/knowledge_base/kb_config.py @@ -0,0 +1,303 @@ +import os +import copy +import shutil +from datetime import datetime, timezone + +import pymongo +import use_s3 +import nest_asyncio +from dotenv import load_dotenv +from llama_index.core import StorageContext +from llama_index.core import VectorStoreIndex +from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch +from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore +from llama_index.storage.docstore.mongodb import MongoDocumentStore + + +import app_logger as log +from . import mongo_helper as mongo + +from .kb_constants import ( + EMBEDDINGS, + INGEST_METHODS, + SPLITTERS, + LLMS, + API_KEYS, +) + +from .kb_type_definitions import ( + EmbedConfig, + LLMConfig, + MarkdownConfig, + SemanticConfig, + SentenceConfig, + FileMetadata, + ClientKBConfig, + KBConfig +) + + +# env = os.getenv("ENV") +# print("env: ", env) +# if env == 'testing': +# load_dotenv(override=True, dotenv_path='../.env.testing') +# else: +load_dotenv(override=True) + +MONGO_URI = os.environ["MONGO_URI"] +CONFIG_DB = os.environ["CONFIG_DB"] +CONFIG_KB_COL = os.environ["CONFIG_KB_COL"] +PYMONGO_CLIENT = pymongo.MongoClient(MONGO_URI) +CONFIG_COLLECTION = PYMONGO_CLIENT[CONFIG_DB][CONFIG_KB_COL] + +# CONFIG_COLLECTION = mongo.connect_to_kb_config() + + + +def is_int(s): + try: + int(s) + return True + except ValueError: + return False + +def is_float(value): + try: + float(value) + return True + except ValueError: + return False + + +class KnowledgeBase: + + # props in `self._config` are str names of the knowledge base configuration + # self._embed_model, self._llm, and self._splitter are instances of the classes + # defined by properties in 
`self._config` + # self._ingest_method is the class of the ingestion method defined by the + # ingest_method property in `self._config` + def __init__(self, kb_name): + self._config = self._get_kb_config(kb_name) + self._embed_model = self._configure_embed_model() + self._llm = self._configure_llm() + self._ingest_method = INGEST_METHODS[ + self._config['ingest_method'] + ] + self._splitter = self._configure_splitter() + + @classmethod + def create(cls, client_config): + # add properties to client_config + kb_config = cls._create_kb_config(client_config) + log.info("kb_config.py create (classmethod): ", kb_config) + # insert knowledge base configuration into database + result = mongo.insert_knowledge_base(kb_config) + log.info("kb_config.py create (classmethod): ", result) + name = kb_config["kb_name"] + # message for client + return f"{name} created" + + @classmethod + def _create_kb_config(cls, client_config): + kb_config = copy.copy(client_config) + log.info('kb_config.py _create_kb_config: ', client_config, kb_config) + kb_config['id'] = kb_config['kb_name'] + kb_config['splitter_config'] = cls._str_to_nums(kb_config['splitter_config']) + kb_config['files'] = [] + + return kb_config + + # converts ints and floats in a dictionary to their respective types + @classmethod + def _str_to_nums(cls, config_dict): + result = {} + for key in config_dict: + if is_int(config_dict[key]): + result[key] = int(config_dict[key]) + elif is_float(config_dict[key]): + result[key] = float(config_dict[key]) + else: + result[key] = config_dict[key] + + return result + + # returns None if not found, otherwise returns the document + @classmethod + def exists(cls, kb_name): + print(kb_name) + doc = mongo.get_knowledge_base(kb_name) + print(doc) + log.info('kb_config.py exists: ', doc) + return doc + + @classmethod + def get_knowledge_bases(cls): + kbs_cursor = mongo.get_knowledge_bases() + kbs_list = list(kbs_cursor) + print('kb_config.py get_knowledge_bases: ', kbs_list) + # 
log.info('kb_config.py get_knowledge_bases: ', kbs) + return kbs_list + + # returns the configuration object for a knowledge base + def _get_kb_config(self, id): + kb_config = mongo.get_knowledge_base(id) + log.info('kb_config.py _get_kb_config: ', kb_config) + return kb_config + + def _configure_embed_model(self): + embed_provider = self._config['embed_config']['embed_provider'] + embed_model_class = EMBEDDINGS[embed_provider] + api_key = os.environ[API_KEYS[embed_provider]] + model = self._config['embed_config']['embed_model'] + embed_model = embed_model_class(api_key=api_key, model=model) + + return embed_model + + + def _configure_llm(self): + if self._config.get('llm_config') is None: + return None + + llm_provider = LLMS[self._config['llm_config']['llm_provider']] + key_name = API_KEYS[self._config['llm_config']['llm_provider']] + llm = llm_provider( + api_key=os.environ[key_name], + model= self._config['llm_config']['llm_model'] + ) + + return llm + + def _configure_splitter(self): + splitter_config = self._config['splitter_config'] + splitter_name = self._config['splitter'] + + if splitter_name == 'Semantic': + splitter_config['embed_model'] = self._embed_model + elif splitter_name == 'Markdown': + splitter_config['llm'] = self._llm + + splitter_class = SPLITTERS[self._config['splitter']] + + return splitter_class(**self._config['splitter_config']) + + + + # saves file locally, returns file path + def _save_file_locally(self, file): + FILE_DIR = 'tmpfiles' + log.info('kb_config.py _save_file_locally: ', file.filename) + # write file to disk + if not os.path.exists(f"./{FILE_DIR}"): + os.makedirs(f"./{FILE_DIR}") + + + file_path= f"./{FILE_DIR}/{file.filename}" + + with open(file_path, "wb+") as file_object: + shutil.copyfileobj(file.file, file_object) + + # use_s3.ul_file(file.filename, dir=FILE_DIR) + + return file_path + + async def _create_nodes(self, file_path): + if self._config['ingest_method'] == 'LlamaParse': + llama_parse = self._ingest_method( + 
api_key=os.environ["LLAMA_CLOUD_API_KEY"], + result_type="markdown" + ) + try: + documents = llama_parse.load_data(file_path) + log.info('kb_config.py _create_nodes: ', documents) + except Exception as e: + log.error('kb_config.py _create_nodes: ', e) + return e + else: + documents = self._ingest_method(input_files=[file_path]).load_data() + + + if self._config['splitter'] == 'sentence': + log.info('sentence splitter used') + nodes = self._splitter.split(documents) + else: + nodes = self._splitter.get_nodes_from_documents(documents) + + return nodes + + def _store_indexes(self, nodes): + + # mongodb_client = mongo.client() + # database name defines a knowledge base + log.info('kb_config.py _store_indexes: ********* ', self._config) + + kb_id = self._config['kb_name'] + log.info('kb_config.py _store_indexes: ', kb_id) + vector_index = "vector_index" + + + environment = os.environ["ENVIRONMENT"] + + if environment == 'local' or environment == 'mongoatlas': + vector_store = MongoDBAtlasVectorSearch( + PYMONGO_CLIENT, + db_name=kb_id, + collection_name=vector_index + ) + else: + vector_store = AWSDocDbVectorStore( + PYMONGO_CLIENT, + db_name=kb_id, + collection_name=vector_index + ) + + + storage_context = StorageContext.from_defaults( + vector_store=vector_store, + # docstore=docstore + ) + + VectorStoreIndex( + nodes, + storage_context=storage_context, + embed_model=self._embed_model + ) + + docstore = MongoDocumentStore.from_uri( + uri=MONGO_URI, + db_name=kb_id + ) + + docstore.add_documents(nodes) + + def _add_file_to_kb_config(self, file): + now = datetime.now(timezone.utc) + date = now.strftime("%m-%d-%y") + time = now.strftime("%H:%M") + + file_metadata = { + "file_name": file.filename, + "content_type": file.headers['content-type'], + "date_uploaded": date, + "time_uploaded": time + } + + mongo.add_file_metadata_to_kb( + self._config['kb_name'], + file_metadata + ) + + + async def ingest_file(self, file): + file_path = self._save_file_locally(file) + nodes = 
await self._create_nodes(file_path) + self._store_indexes(nodes) + self._add_file_to_kb_config(file) + + # def ingest_file_path(self, file_path): + # nodes = self._create_nodes(file_path) + # self._store_indexes(nodes) + # self._add_file_to_kb_config(file_path) + + # def print_config(self): + # print(self.chunk_overlap) + diff --git a/knowledge_base/kb_constants.py b/knowledge_base/kb_constants.py new file mode 100644 index 0000000..742880c --- /dev/null +++ b/knowledge_base/kb_constants.py @@ -0,0 +1,141 @@ +import os + +from llama_index.llms.openai import OpenAI +from llama_index.llms.anthropic import Anthropic +from llama_index.llms.cohere import Cohere +# from llama_index.llms.bedrock import Bedrock + +from llama_index.embeddings.openai import OpenAIEmbedding +from llama_index.embeddings.cohere import CohereEmbedding +# from llama_index.embeddings.bedrock import BedrockEmbedding +# from llama_index.embeddings.huggingface import HuggingFaceEmbedding + +# imports for reading files +from llama_parse import LlamaParse +from llama_index.core import SimpleDirectoryReader + +# imports for parsing files +from llama_index.core.node_parser import ( + SentenceSplitter, + SemanticSplitterNodeParser, + MarkdownElementNodeParser +) + +LLMS = { + "OpenAI": OpenAI, + "Anthropic": Anthropic, + "Cohere": Cohere, + # "Bedrock": Bedrock +} + +EMBEDDINGS = { + "OpenAI": OpenAIEmbedding, + "Cohere": CohereEmbedding, +} + +INGEST_METHODS = { + "LlamaParse": LlamaParse, + "Simple": SimpleDirectoryReader +} + +SPLITTERS = { + "Sentence": SentenceSplitter, + "Semantic": SemanticSplitterNodeParser, + "Markdown": MarkdownElementNodeParser +} + +API_KEYS = { + "OpenAI": "OPENAI_API_KEY", + "Cohere": "COHERE_API_KEY", + "Anthropic": "ANTHROPIC_API_KEY", + "LlamaParse": "LLAMA_CLOUD_API_KEY", +} + + + + +LLM_MODELS = { + "OpenAI": [ + { + "name": "gpt-3.5-turbo", + "description": "good balance of cost and precision", + }, + { + "name": "gpt-4-turbo", + "description": "more advanced than 
'gpt-3.5-turbo'", + }, + { + "name": "gpt-4o-mini", + "description": "affordable small model for lightweight tasks", + }, + { + "name": "gpt-4o", + "description": "OpenAI's flagship model", + } + ], + + "Anthropic": [ + { + "name": "claude-4-haiku-20240307", + "description": "fastest and cheapest Anthropic model", + }, + { + "name": "claude-3-sonnet-20240229", + "description": "balanced intellegence and speed", + }, + { + "name": "claude-3-5-sonnet-20240620", + "description": "highest performing Anthropic model", + } + ], + "Cohere": [] +} + + +EMBEDDING_MODEL_DETAILS = { + "OpenAI": [ + { + "name": "text-embedding-3-small", + "description": "good balance of cost and precision", + "language": "multilingual", + }, + { + "name": "text-embedding-3-large", + "description": "slightly more precise at ~6 times the cost of 'text-embedding-3-small'", + "language": "multilingual" + } + ], + "Cohere": [ + { + "name": "embed-english-light-v3.0", + "description": "slightly less precise, but faster than 'embed-english-v3.0'", + "language": "english" + }, + { + "name": "embed-english-v3.0", + "description": "more precise, but slower than 'embed-english-light-v3.0'", + "language": "english" + }, + { + "name": "embed-multilingual-light-v3.0", + "description": "slightly less precise, but faster than 'embed-multilingual-v3.0'", + "language": "multilingual" + }, + { + "name": "embed-multilingual-v3.0", + "description": "more precise, but slower than 'embed-multilingual-light-v3.0'", + "language": "multilingual" + }, + ], +} + + + + + +''' +Need to test hugging face embedding + + + +''' \ No newline at end of file diff --git a/knowledge_base/kb_json.py b/knowledge_base/kb_json.py new file mode 100644 index 0000000..7acbd50 --- /dev/null +++ b/knowledge_base/kb_json.py @@ -0,0 +1,46 @@ +{ + "kb_name": "Sentence", + "ingest_method": "Simple", + "splitter": "Sentence", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + 
"chunk_size": "1024", + "chunk_overlap": "200" + } +} + +{ + "name": "Semantic", + "ingest_method": "Simple", + "splitter": "Semantic", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "buffer_size": "100", + "breakpoint_percentile_threshold": "95" + } +} + + +{ + "name": "Markdown", + "ingest_method": "LlamaParse", + "splitter": "Markdown", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "num_workers": "8" + }, + "llm_config": { + "llm_provider": "OpenAI", + "llm_model": "gpt-3.5-turbo" + } +} + diff --git a/knowledge_base/kb_pipfile.txt b/knowledge_base/kb_pipfile.txt new file mode 100644 index 0000000..37537ec --- /dev/null +++ b/knowledge_base/kb_pipfile.txt @@ -0,0 +1,15 @@ +boto3 = "*" +nest-asyncio = "*" +pymongo = "*" +motor = "*" +python-dotenv = "*" +llama-index = "*" +llama-index-storage-docstore-mongodb = "*" +llama-index-vector-stores-awsdocdb = "*" +llama-parse = "*" +llama-index-llms-openai = "*" +llama-index-llms-anthropic = "*" +llama-index-llms-bedrock = "*" +llama-index-embeddings-cohere = "*" +llama-index-embeddings-openai = "*" +llama-index-embeddings-bedrock = "*" \ No newline at end of file diff --git a/knowledge_base/kb_type_definitions.py b/knowledge_base/kb_type_definitions.py new file mode 100644 index 0000000..4d37f0d --- /dev/null +++ b/knowledge_base/kb_type_definitions.py @@ -0,0 +1,49 @@ +from typing import TypedDict, Optional + +# Notes: +# if ingest_method is LlamaParse, splitter_config will be a MarkdownConfig and +# llm_config is required +# llm_config is only required for LlamaParse + +class EmbedConfig(TypedDict): + embed_provider: str + embed_model: str + +class LLMConfig(TypedDict): + llm_provider: str + llm_model: str + +class MarkdownConfig(TypedDict): + num_workers: int | str # default: 8 + +class SemanticConfig(TypedDict): + buffer_size: int | str # default: 100 + 
breakpoint_percentile_threshold: int | str # default 95 + +class SentenceConfig(TypedDict): + chunk_size: int | str # default 1024 + chunk_overlap: int | str # default: 200 + +class FileMetadata(TypedDict): + file_name: str + content_type: str + date_uploaded: str + time_uploaded: str + +class ClientKBConfig(TypedDict): + kb_name: str + ingest_method: str + splitter: str + embed_config: EmbedConfig + splitter_config: MarkdownConfig | SemanticConfig | SentenceConfig + llm_config: Optional[LLMConfig] # Only required for "LlamaParse" + +class KBConfig(TypedDict): + _id: int + kb_name: str + embed_config: EmbedConfig + ingestion_method: str + splitter: str + splitter_config: MarkdownConfig | SemanticConfig | SentenceConfig + llm_config: Optional[LLMConfig] # Only required for "LlamaParse" + files: list[FileMetadata] \ No newline at end of file diff --git a/knowledge_base/lp_ingest.py b/knowledge_base/lp_ingest.py new file mode 100644 index 0000000..e512c58 --- /dev/null +++ b/knowledge_base/lp_ingest.py @@ -0,0 +1,73 @@ +import os +import pymongo +import nest_asyncio +from dotenv import load_dotenv + +from llama_parse import LlamaParse +from llama_index.core.node_parser import MarkdownElementNodeParser +from llama_index.llms.openai import OpenAI +from llama_index.embeddings.openai import OpenAIEmbedding +from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore +from llama_index.core import VectorStoreIndex +from llama_index.core import StorageContext + +load_dotenv() +nest_asyncio.apply() + +llama_cloud_api_key = os.environ["LLAMA_CLOUD_API_KEY"] +openai_api_key = os.environ["OPENAI_API_KEY"] + +mongo_uri = os.environ["MONGO_URI"] +mongodb_client = pymongo.MongoClient(mongo_uri) +docdb_name = os.environ["DOCDB_NAME"] +docdb_collection = os.environ["DOCDB_COLLECTION"] +store = AWSDocDbVectorStore(mongodb_client, db_name=docdb_name, collection_name=docdb_collection) + +def send_file_to_llama_parse(file_path): + print("send_file_to_llama_parse") + parser = 
LlamaParse( + api_key=llama_cloud_api_key, + result_type="markdown" + ) + + markdown_documents = parser.load_data(file_path) + + print("response received from llama_parse") + print(markdown_documents[0]) + + return markdown_documents + + +# convert markdown documents +# return nodes +def markdown_to_node(documents): + + markdown_parser = MarkdownElementNodeParser( + llm=OpenAI(api_key=openai_api_key, model="gpt-3.5-turbo"), + num_workers=8, + ) + + nodes = markdown_parser.get_nodes_from_documents(documents) + print('response from markdown_parser') + print(nodes[0]) + + return nodes + +# convert nodes to vector store +# side effect: save index to docdb +def nodes_to_vector_store(nodes): + embed_model = OpenAIEmbedding(api_key=openai_api_key, model="text-embedding-ada-002") + storage_context = StorageContext.from_defaults(vector_store=store) + index = VectorStoreIndex(nodes, embed_model=embed_model, storage_context=storage_context) + + return index + +def ingest_file_to_docdb(file_path): + + try: + markdown_docs = send_file_to_llama_parse(file_path) + nodes = markdown_to_node(markdown_docs) + nodes_to_vector_store(nodes) + except Exception as e: + raise e + diff --git a/knowledge_base/mongo_helper.py b/knowledge_base/mongo_helper.py new file mode 100644 index 0000000..027cb3e --- /dev/null +++ b/knowledge_base/mongo_helper.py @@ -0,0 +1,54 @@ +import os + +import pymongo +from dotenv import load_dotenv +import app_logger as log + +env = os.getenv("ENV") +if env == 'testing': + log.info("Testing environment") + load_dotenv(override=True, dotenv_path='.env.testing') +else: + load_dotenv(override=True) + +MONGO_URI = os.environ["MONGO_URI"] +CONFIG_DB = os.environ["CONFIG_DB"] +CONFIG_KB_COL = os.environ["CONFIG_KB_COL"] + +def get(db_name, db_collection, query=None, projection=None): + mongo = pymongo.MongoClient(MONGO_URI) + return mongo[db_name][db_collection].find_one(query, projection) + +def get_all(db_name, db_collection, query=None, projection=None): + mongo = 
pymongo.MongoClient(MONGO_URI) + results = mongo[db_name][db_collection].find(query, projection) + ar = [] + for result in results: + ar.append(result) + return ar + +def insert_one(db_name, db_collection, doc): + mongo = pymongo.MongoClient(MONGO_URI) + result = mongo[db_name][db_collection].insert_one(doc) + return result + +def get_knowledge_bases(): + return get_all(CONFIG_DB, CONFIG_KB_COL,{}, {"_id": 0}) + +def get_knowledge_base(kb_name): + return get( + CONFIG_DB, + CONFIG_KB_COL, + {"kb_name": kb_name}, + {"_id": 0} + ) + +def insert_knowledge_base(kb_config): + return insert_one(CONFIG_DB, CONFIG_KB_COL, kb_config) + +def add_file_metadata_to_kb(kb_name, file_metadata): + mongo = pymongo.MongoClient(MONGO_URI) + mongo[CONFIG_DB][CONFIG_KB_COL].update_one( + { "kb_name": kb_name }, + { "$push": { "files": file_metadata } } + ) diff --git a/knowledge_base/simple_ingest.py b/knowledge_base/simple_ingest.py new file mode 100644 index 0000000..154aa96 --- /dev/null +++ b/knowledge_base/simple_ingest.py @@ -0,0 +1,34 @@ +# based upon LlamaIndex demo +# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ + + +import pymongo +from dotenv import load_dotenv +import app_logger as log + +from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore +from llama_index.core import VectorStoreIndex +from llama_index.core import StorageContext +from llama_index.core import SimpleDirectoryReader +import os + +load_dotenv(override=True) + + +mongo_uri = os.environ["MONGO_URI"] +mongodb_client = pymongo.MongoClient(mongo_uri) +docdb_name = os.environ["DOCDB_NAME"] +docdb_collection = os.environ["DOCDB_COLLECTION"] +store = AWSDocDbVectorStore(mongodb_client, db_name=docdb_name, collection_name=docdb_collection) +storage_context = StorageContext.from_defaults(vector_store=store) + + +def ingest_file_to_docdb(file_path): + try: + log.debug('starting ingestion', file_path) + document = SimpleDirectoryReader(input_files=[file_path]).load_data() + 
index = VectorStoreIndex.from_documents(document, storage_context=storage_context) + log.debug('index created') + except Exception as e: + raise e + From da56c7a75069c9d3191b001506eb777c02a2817e Mon Sep 17 00:00:00 2001 From: tlane25 Date: Tue, 6 Aug 2024 07:46:34 -0400 Subject: [PATCH 03/10] update: moved knowledge base routes to server_kb.py --- server_kb.py | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 server_kb.py diff --git a/server_kb.py b/server_kb.py new file mode 100644 index 0000000..aba43fb --- /dev/null +++ b/server_kb.py @@ -0,0 +1,91 @@ +import shutil +import os +import json + +from fastapi import FastAPI, File, UploadFile, Request +from fastapi.middleware.cors import CORSMiddleware +import nest_asyncio +import pymongo +from dotenv import load_dotenv + +import app_logger as log +import use_s3 +from knowledge_base.kb_config import KnowledgeBase + +nest_asyncio.apply() + +# env = os.getenv("ENV") +# print(env) +# if env == 'testing': +# load_dotenv(override=True, dotenv_path='.env.testing') +# else: +# load_dotenv(override=True) + +# MONGO_URI = os.environ["MONGO_URI"] +# CONFIG_DB = os.environ["CONFIG_DB"] +# # print(CONFIG_DB) +# CONFIG_PIPELINE_COL = os.environ["CONFIG_PIPELINE_COL"] + +print("Starting server") +app = FastAPI() + +app.add_middleware( + CORSMiddleware, + allow_origins=['*'], + allow_credentials=True, + allow_methods=['*'], + allow_headers=['*'] +) + +@app.get('/api') +async def root(): + log.info("server running") + return {"message": "Server running"} + + +# knowledge base routes +@app.get("/api/knowledge-bases") +async def get_knowledge_bases(): + knowledge_bases = KnowledgeBase.get_knowledge_bases() + return knowledge_bases + +# consider adding id to the body of the request sent from the client +# to create a new knowledge base +# otherwise, we will use the kb_name prop to see if knowledge base exists +@app.post('/api/knowledge-bases') +async def create_kb(request: 
Request): + body = await request.json() + # ensure that the kb_name is unique + if KnowledgeBase.exists(body["kb_name"]): + message = f"{body['kb_name']} already exists" + else: + message = KnowledgeBase.create(body) + + return {"message": message} + +@app.get("/api/knowledge-base/{id}") +async def get_knowledge_base(id: str): + kb_config = KnowledgeBase.exists(id) + if kb_config: + return kb_config + else: + return { "message": f"{id} does not exist" } + +# this route adds a file to a knowledge base +@app.post('/api/knowledge-bases/{id}/upload') +async def upload_file(id: str, file: UploadFile=File(...)): + log.info("in route", file.filename) + if KnowledgeBase.exists(id): + kb = KnowledgeBase(id) + log.info("in logic", file.filename) + try: + await kb.ingest_file(file) + return {"message": f"{file.filename} uploaded"} + except Exception as e: + return {"message": f"Error: {e}"} + else: + return {"message": f"Knowledge base {id} doesn't exist"} + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000, loop='asyncio') \ No newline at end of file From 81beae3a159d196d6a4edf8227ab72d51375a5da Mon Sep 17 00:00:00 2001 From: tlane25 Date: Tue, 6 Aug 2024 07:49:03 -0400 Subject: [PATCH 04/10] feat: began implementing basic tests for knowledge base routes --- cleanup.py | 25 +++++++++++ kb_test_constants.py | 96 +++++++++++++++++++++++++++++++++++++++++ test_server_kb.py | 100 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+) create mode 100644 cleanup.py create mode 100644 kb_test_constants.py create mode 100644 test_server_kb.py diff --git a/cleanup.py b/cleanup.py new file mode 100644 index 0000000..c9de97d --- /dev/null +++ b/cleanup.py @@ -0,0 +1,25 @@ +import os + +import pymongo +from dotenv import load_dotenv + +load_dotenv(override=True, dotenv_path=".env.testing") + +MONGO_URI = os.environ["MONGO_URI"] +CONFIG_DB = os.environ["CONFIG_DB"] +CONFIG_KB_COL = os.environ["CONFIG_KB_COL"] + +kb_names = 
[] +def drop_all_knowledge_bases(): + print(f"Dropping all knowledge bases in {CONFIG_DB}") + pymongo.MongoClient(MONGO_URI).drop_database(CONFIG_DB) + +def remove_kb_files(kb_names: list): + mongo_client = pymongo.MongoClient(MONGO_URI) + + for kb_name in kb_names: + mongo_client.drop_database(kb_name) + +drop_all_knowledge_bases() + + diff --git a/kb_test_constants.py b/kb_test_constants.py new file mode 100644 index 0000000..6e5ed0c --- /dev/null +++ b/kb_test_constants.py @@ -0,0 +1,96 @@ +client_sentence_config = { + "kb_name": "Sentence", + "ingest_method": "Simple", + "splitter": "Sentence", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "chunk_size": "1024", + "chunk_overlap": "200" + }, +} + +client_semantic_config = { + "kb_name": "Semantic", + "ingest_method": "Simple", + "splitter": "Semantic", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "buffer_size": "100", + "breakpoint_percentile_threshold": "95" + }, +} + +client_llama_parse_config = { + "kb_name": "Markdown", + "ingest_method": "LlamaParse", + "splitter": "Markdown", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "num_workers": "8" + }, + "llm_config": { + "llm_provider": "OpenAI", + "llm_model": "gpt-3.5-turbo" + }, +} + +server_sentence_config = { + "kb_name": "Sentence", + "ingest_method": "Simple", + "splitter": "Sentence", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "chunk_size": 1024, + "chunk_overlap": 200 + }, + "id": "Sentence", + "files": [] +} + +server_semantic_config ={ + "kb_name": "Semantic", + "ingest_method": "Simple", + "splitter": "Semantic", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "buffer_size": 
100, + "breakpoint_percentile_threshold": 95 + }, + "id": "Semantic", + "files": [] +} + +server_llama_parse_config = { + "kb_name": "Markdown", + "ingest_method": "LlamaParse", + "splitter": "Markdown", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "num_workers": 8 + }, + "llm_config": { + "llm_provider": "OpenAI", + "llm_model": "gpt-3.5-turbo" + }, + "id": "Markdown", + "files": [] +} + diff --git a/test_server_kb.py b/test_server_kb.py new file mode 100644 index 0000000..cbed97d --- /dev/null +++ b/test_server_kb.py @@ -0,0 +1,100 @@ +import os + +import pymongo +from dotenv import load_dotenv +from fastapi.testclient import TestClient + +from server_kb import app +from kb_test_constants import ( + client_sentence_config, + client_semantic_config, + client_llama_parse_config, + server_sentence_config, + server_semantic_config, + server_llama_parse_config, +) + +load_dotenv(override=True, dotenv_path=".env.testing") + +MONGO_URI = os.environ["MONGO_URI"] +CONFIG_DB = os.environ["CONFIG_DB"] + +client = TestClient(app) + + + +def test_read_knowledge_bases(): + + response = client.get("/api/knowledge-bases") + assert response.status_code == 200 + assert response.json() == [] + +def test_create_simple_splitter_kb(): + response = client.post( + "/api/knowledge-bases", + json=client_sentence_config + ) + assert response.status_code == 200 + assert response.json() == {"message": "Sentence created"} + + response = client.post( + "/api/knowledge-bases", + json=client_sentence_config + ) + assert response.status_code == 200 + assert response.json() == {"message": "Sentence already exists"} + +def test_create_semantic_splitter_kb(): + response = client.post( + "/api/knowledge-bases", + json=client_semantic_config + ) + assert response.status_code == 200 + assert response.json() == {"message": "Semantic created"} + + response = client.post( + "/api/knowledge-bases", + json=client_semantic_config + 
) + assert response.status_code == 200 + assert response.json() == {"message": "Semantic already exists"} + +def test_create_markdown_splitter_kb(): + response = client.post( + "/api/knowledge-bases", + json=client_llama_parse_config + ) + assert response.status_code == 200 + assert response.json() == {"message": "Markdown created"} + + response = client.post( + "/api/knowledge-bases", + json=client_llama_parse_config + ) + assert response.status_code == 200 + assert response.json() == {"message": "Markdown already exists"} + +def test_get_kb_config(): + response = client.get("/api/knowledge-base/Sentence") + assert response.status_code == 200 + assert response.json() == server_sentence_config + + response = client.get("/api/knowledge-base/unknown_kb") + assert response.status_code == 200 + assert response.json() == {"message": "unknown_kb does not exist"} + +# def test_upload_file(): +# response = client.post("/api/knowledge-bases/Sentence%20Splitter/upload", files={"file": ("test.txt", b"test")}) +# assert response.status_code == 200 +# assert response.json() == {"message": "test.txt uploaded"} + + # response = client.get("/api/knowledge-base/test_kb") + # assert response.status_code == 200 + # assert response.json()["files"].length == 1 + + # response = client.post("/api/knowledge-bases/unknown_kb/upload", files={"file": ("test.txt", b"test")}) + # assert response.status_code == 200 + # assert response.json() == {"message": "Knowledge base unknown_kb doesn't exist"} + + + From ace5b4bce6e84e4d3c62051960503fddf1c6c2a5 Mon Sep 17 00:00:00 2001 From: tlane25 Date: Tue, 6 Aug 2024 07:53:46 -0400 Subject: [PATCH 05/10] update: removed outdated ingestion related files from knowledge_base module --- knowledge_base/kb_json.py | 46 --------------------- knowledge_base/lp_ingest.py | 73 --------------------------------- knowledge_base/simple_ingest.py | 34 --------------- 3 files changed, 153 deletions(-) delete mode 100644 knowledge_base/kb_json.py delete mode 100644 
knowledge_base/lp_ingest.py delete mode 100644 knowledge_base/simple_ingest.py diff --git a/knowledge_base/kb_json.py b/knowledge_base/kb_json.py deleted file mode 100644 index 7acbd50..0000000 --- a/knowledge_base/kb_json.py +++ /dev/null @@ -1,46 +0,0 @@ -{ - "kb_name": "Sentence", - "ingest_method": "Simple", - "splitter": "Sentence", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "chunk_size": "1024", - "chunk_overlap": "200" - } -} - -{ - "name": "Semantic", - "ingest_method": "Simple", - "splitter": "Semantic", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "buffer_size": "100", - "breakpoint_percentile_threshold": "95" - } -} - - -{ - "name": "Markdown", - "ingest_method": "LlamaParse", - "splitter": "Markdown", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "num_workers": "8" - }, - "llm_config": { - "llm_provider": "OpenAI", - "llm_model": "gpt-3.5-turbo" - } -} - diff --git a/knowledge_base/lp_ingest.py b/knowledge_base/lp_ingest.py deleted file mode 100644 index e512c58..0000000 --- a/knowledge_base/lp_ingest.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import pymongo -import nest_asyncio -from dotenv import load_dotenv - -from llama_parse import LlamaParse -from llama_index.core.node_parser import MarkdownElementNodeParser -from llama_index.llms.openai import OpenAI -from llama_index.embeddings.openai import OpenAIEmbedding -from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore -from llama_index.core import VectorStoreIndex -from llama_index.core import StorageContext - -load_dotenv() -nest_asyncio.apply() - -llama_cloud_api_key = os.environ["LLAMA_CLOUD_API_KEY"] -openai_api_key = os.environ["OPENAI_API_KEY"] - -mongo_uri = os.environ["MONGO_URI"] -mongodb_client = pymongo.MongoClient(mongo_uri) -docdb_name = 
os.environ["DOCDB_NAME"] -docdb_collection = os.environ["DOCDB_COLLECTION"] -store = AWSDocDbVectorStore(mongodb_client, db_name=docdb_name, collection_name=docdb_collection) - -def send_file_to_llama_parse(file_path): - print("send_file_to_llama_parse") - parser = LlamaParse( - api_key=llama_cloud_api_key, - result_type="markdown" - ) - - markdown_documents = parser.load_data(file_path) - - print("response received from llama_parse") - print(markdown_documents[0]) - - return markdown_documents - - -# convert markdown documents -# return nodes -def markdown_to_node(documents): - - markdown_parser = MarkdownElementNodeParser( - llm=OpenAI(api_key=openai_api_key, model="gpt-3.5-turbo"), - num_workers=8, - ) - - nodes = markdown_parser.get_nodes_from_documents(documents) - print('response from markdown_parser') - print(nodes[0]) - - return nodes - -# convert nodes to vector store -# side effect: save index to docdb -def nodes_to_vector_store(nodes): - embed_model = OpenAIEmbedding(api_key=openai_api_key, model="text-embedding-ada-002") - storage_context = StorageContext.from_defaults(vector_store=store) - index = VectorStoreIndex(nodes, embed_model=embed_model, storage_context=storage_context) - - return index - -def ingest_file_to_docdb(file_path): - - try: - markdown_docs = send_file_to_llama_parse(file_path) - nodes = markdown_to_node(markdown_docs) - nodes_to_vector_store(nodes) - except Exception as e: - raise e - diff --git a/knowledge_base/simple_ingest.py b/knowledge_base/simple_ingest.py deleted file mode 100644 index 154aa96..0000000 --- a/knowledge_base/simple_ingest.py +++ /dev/null @@ -1,34 +0,0 @@ -# based upon LlamaIndex demo -# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ - - -import pymongo -from dotenv import load_dotenv -import app_logger as log - -from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore -from llama_index.core import VectorStoreIndex -from llama_index.core import StorageContext -from 
llama_index.core import SimpleDirectoryReader -import os - -load_dotenv(override=True) - - -mongo_uri = os.environ["MONGO_URI"] -mongodb_client = pymongo.MongoClient(mongo_uri) -docdb_name = os.environ["DOCDB_NAME"] -docdb_collection = os.environ["DOCDB_COLLECTION"] -store = AWSDocDbVectorStore(mongodb_client, db_name=docdb_name, collection_name=docdb_collection) -storage_context = StorageContext.from_defaults(vector_store=store) - - -def ingest_file_to_docdb(file_path): - try: - log.debug('starting ingestion', file_path) - document = SimpleDirectoryReader(input_files=[file_path]).load_data() - index = VectorStoreIndex.from_documents(document, storage_context=storage_context) - log.debug('index created') - except Exception as e: - raise e - From e84fd5f3a451ff6b3f0b9b6699a1da1a261c0fe0 Mon Sep 17 00:00:00 2001 From: tlane25 Date: Tue, 6 Aug 2024 14:30:53 -0400 Subject: [PATCH 06/10] update: functional knowledge_base package --- knowledge_base/kb_config.py | 169 ++++++++++++---------------- knowledge_base/kb_test_constants.py | 96 ++++++++++++++++ knowledge_base/mongo_helper.py | 31 +++-- knowledge_base/routes.py | 94 ++++++++++++++++ 4 files changed, 289 insertions(+), 101 deletions(-) create mode 100644 knowledge_base/kb_test_constants.py create mode 100644 knowledge_base/routes.py diff --git a/knowledge_base/kb_config.py b/knowledge_base/kb_config.py index 94e94f6..b000616 100644 --- a/knowledge_base/kb_config.py +++ b/knowledge_base/kb_config.py @@ -1,10 +1,9 @@ import os -import copy import shutil from datetime import datetime, timezone import pymongo -import use_s3 +# import use_s3 import nest_asyncio from dotenv import load_dotenv from llama_index.core import StorageContext @@ -14,10 +13,10 @@ from llama_index.storage.docstore.mongodb import MongoDocumentStore -import app_logger as log -from . 
import mongo_helper as mongo +import db.app_logger as log +from db.knowledge_base import mongo_helper as mongo -from .kb_constants import ( +from db.knowledge_base.kb_constants import ( EMBEDDINGS, INGEST_METHODS, SPLITTERS, @@ -25,7 +24,7 @@ API_KEYS, ) -from .kb_type_definitions import ( +from db.knowledge_base.kb_type_definitions import ( EmbedConfig, LLMConfig, MarkdownConfig, @@ -37,40 +36,18 @@ ) -# env = os.getenv("ENV") -# print("env: ", env) -# if env == 'testing': -# load_dotenv(override=True, dotenv_path='../.env.testing') -# else: -load_dotenv(override=True) +env = os.getenv("ENV") +print("env: ", env) +if env == 'testing': + load_dotenv(override=True, dotenv_path='../.env.testing') +else: + load_dotenv(override=True) MONGO_URI = os.environ["MONGO_URI"] -CONFIG_DB = os.environ["CONFIG_DB"] -CONFIG_KB_COL = os.environ["CONFIG_KB_COL"] -PYMONGO_CLIENT = pymongo.MongoClient(MONGO_URI) -CONFIG_COLLECTION = PYMONGO_CLIENT[CONFIG_DB][CONFIG_KB_COL] - -# CONFIG_COLLECTION = mongo.connect_to_kb_config() - - - -def is_int(s): - try: - int(s) - return True - except ValueError: - return False - -def is_float(value): - try: - float(value) - return True - except ValueError: - return False +nest_asyncio.apply() class KnowledgeBase: - # props in `self._config` are str names of the knowledge base configuration # self._embed_model, self._llm, and self._splitter are instances of the classes # defined by properties in `self._config` @@ -85,59 +62,6 @@ def __init__(self, kb_name): ] self._splitter = self._configure_splitter() - @classmethod - def create(cls, client_config): - # add properties to client_config - kb_config = cls._create_kb_config(client_config) - log.info("kb_config.py create (classmethod): ", kb_config) - # insert knowledge base configuration into database - result = mongo.insert_knowledge_base(kb_config) - log.info("kb_config.py create (classmethod): ", result) - name = kb_config["kb_name"] - # message for client - return f"{name} created" - - @classmethod 
- def _create_kb_config(cls, client_config): - kb_config = copy.copy(client_config) - log.info('kb_config.py _create_kb_config: ', client_config, kb_config) - kb_config['id'] = kb_config['kb_name'] - kb_config['splitter_config'] = cls._str_to_nums(kb_config['splitter_config']) - kb_config['files'] = [] - - return kb_config - - # converts ints and floats in a dictionary to their respective types - @classmethod - def _str_to_nums(cls, config_dict): - result = {} - for key in config_dict: - if is_int(config_dict[key]): - result[key] = int(config_dict[key]) - elif is_float(config_dict[key]): - result[key] = float(config_dict[key]) - else: - result[key] = config_dict[key] - - return result - - # returns None if not found, otherwise returns the document - @classmethod - def exists(cls, kb_name): - print(kb_name) - doc = mongo.get_knowledge_base(kb_name) - print(doc) - log.info('kb_config.py exists: ', doc) - return doc - - @classmethod - def get_knowledge_bases(cls): - kbs_cursor = mongo.get_knowledge_bases() - kbs_list = list(kbs_cursor) - print('kb_config.py get_knowledge_bases: ', kbs_list) - # log.info('kb_config.py get_knowledge_bases: ', kbs) - return kbs_list - # returns the configuration object for a knowledge base def _get_kb_config(self, id): kb_config = mongo.get_knowledge_base(id) @@ -226,8 +150,8 @@ async def _create_nodes(self, file_path): def _store_indexes(self, nodes): - # mongodb_client = mongo.client() - # database name defines a knowledge base + client = pymongo.MongoClient(MONGO_URI) + log.info('kb_config.py _store_indexes: ********* ', self._config) kb_id = self._config['kb_name'] @@ -239,13 +163,13 @@ def _store_indexes(self, nodes): if environment == 'local' or environment == 'mongoatlas': vector_store = MongoDBAtlasVectorSearch( - PYMONGO_CLIENT, + client, db_name=kb_id, collection_name=vector_index ) else: vector_store = AWSDocDbVectorStore( - PYMONGO_CLIENT, + client, db_name=kb_id, collection_name=vector_index ) @@ -268,17 +192,21 @@ def 
_store_indexes(self, nodes): ) docstore.add_documents(nodes) + client.close() def _add_file_to_kb_config(self, file): now = datetime.now(timezone.utc) date = now.strftime("%m-%d-%y") time = now.strftime("%H:%M") + size = file.size + file_metadata = { "file_name": file.filename, - "content_type": file.headers['content-type'], + "content_type": file.headers["content-type"], "date_uploaded": date, - "time_uploaded": time + "time_uploaded": time, + "size": size } mongo.add_file_metadata_to_kb( @@ -301,3 +229,56 @@ async def ingest_file(self, file): # def print_config(self): # print(self.chunk_overlap) + + # # @classmethod + # # def create(cls, client_config): + # # # add properties to client_config + # # kb_config = cls._create_kb_config(client_config) + # # log.info("kb_config.py create (classmethod): ", kb_config) + # # # insert knowledge base configuration into database + # # result = mongo.insert_knowledge_base(kb_config) + # # log.info("kb_config.py create (classmethod): ", result) + # # name = kb_config["kb_name"] + # # # message for client + # # return f"{name} created" + + # # @classmethod + # # def _create_kb_config(cls, client_config): + # # kb_config = copy.copy(client_config) + # # log.info('kb_config.py _create_kb_config: ', client_config, kb_config) + # # kb_config['id'] = kb_config['kb_name'] + # # kb_config['splitter_config'] = cls._str_to_nums(kb_config['splitter_config']) + # # kb_config['files'] = [] + + # # return kb_config + + # # # converts ints and floats in a dictionary to their respective types + # # @classmethod + # # def _str_to_nums(cls, config_dict): + # # result = {} + # # for key in config_dict: + # # if is_int(config_dict[key]): + # # result[key] = int(config_dict[key]) + # # elif is_float(config_dict[key]): + # # result[key] = float(config_dict[key]) + # # else: + # # result[key] = config_dict[key] + + # # return result + + # # returns None if not found, otherwise returns the document + # @classmethod + # def exists(cls, kb_name): + # 
print(kb_name) + # doc = mongo.get_knowledge_base(kb_name) + # print(doc) + # log.info('kb_config.py exists: ', doc) + # return doc + + # @classmethod + # def get_knowledge_bases(cls): + # kbs_cursor = mongo.get_knowledge_bases() + # kbs_list = list(kbs_cursor) + # print('kb_config.py get_knowledge_bases: ', kbs_list) + # # log.info('kb_config.py get_knowledge_bases: ', kbs) + # return kbs_list \ No newline at end of file diff --git a/knowledge_base/kb_test_constants.py b/knowledge_base/kb_test_constants.py new file mode 100644 index 0000000..6e5ed0c --- /dev/null +++ b/knowledge_base/kb_test_constants.py @@ -0,0 +1,96 @@ +client_sentence_config = { + "kb_name": "Sentence", + "ingest_method": "Simple", + "splitter": "Sentence", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "chunk_size": "1024", + "chunk_overlap": "200" + }, +} + +client_semantic_config = { + "kb_name": "Semantic", + "ingest_method": "Simple", + "splitter": "Semantic", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "buffer_size": "100", + "breakpoint_percentile_threshold": "95" + }, +} + +client_llama_parse_config = { + "kb_name": "Markdown", + "ingest_method": "LlamaParse", + "splitter": "Markdown", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "num_workers": "8" + }, + "llm_config": { + "llm_provider": "OpenAI", + "llm_model": "gpt-3.5-turbo" + }, +} + +server_sentence_config = { + "kb_name": "Sentence", + "ingest_method": "Simple", + "splitter": "Sentence", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "chunk_size": 1024, + "chunk_overlap": 200 + }, + "id": "Sentence", + "files": [] +} + +server_semantic_config ={ + "kb_name": "Semantic", + "ingest_method": "Simple", + "splitter": "Semantic", + 
"embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "buffer_size": 100, + "breakpoint_percentile_threshold": 95 + }, + "id": "Semantic", + "files": [] +} + +server_llama_parse_config = { + "kb_name": "Markdown", + "ingest_method": "LlamaParse", + "splitter": "Markdown", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "num_workers": 8 + }, + "llm_config": { + "llm_provider": "OpenAI", + "llm_model": "gpt-3.5-turbo" + }, + "id": "Markdown", + "files": [] +} + diff --git a/knowledge_base/mongo_helper.py b/knowledge_base/mongo_helper.py index 027cb3e..1524684 100644 --- a/knowledge_base/mongo_helper.py +++ b/knowledge_base/mongo_helper.py @@ -2,7 +2,7 @@ import pymongo from dotenv import load_dotenv -import app_logger as log +import db.app_logger as log env = os.getenv("ENV") if env == 'testing': @@ -16,20 +16,23 @@ CONFIG_KB_COL = os.environ["CONFIG_KB_COL"] def get(db_name, db_collection, query=None, projection=None): + print("mongo uri:", MONGO_URI) mongo = pymongo.MongoClient(MONGO_URI) - return mongo[db_name][db_collection].find_one(query, projection) + result = mongo[db_name][db_collection].find_one(query, projection) + mongo.close() + return result def get_all(db_name, db_collection, query=None, projection=None): mongo = pymongo.MongoClient(MONGO_URI) results = mongo[db_name][db_collection].find(query, projection) - ar = [] - for result in results: - ar.append(result) - return ar + results = list(results) + mongo.close() + return results def insert_one(db_name, db_collection, doc): mongo = pymongo.MongoClient(MONGO_URI) result = mongo[db_name][db_collection].insert_one(doc) + mongo.close() return result def get_knowledge_bases(): @@ -48,7 +51,21 @@ def insert_knowledge_base(kb_config): def add_file_metadata_to_kb(kb_name, file_metadata): mongo = pymongo.MongoClient(MONGO_URI) - mongo[CONFIG_DB][CONFIG_KB_COL].update_one( + 
result = mongo[CONFIG_DB][CONFIG_KB_COL].update_one( { "kb_name": kb_name }, { "$push": { "files": file_metadata } } ) + mongo.close() + log.info(f"add_file_metadata_to_kb: {result}") + +def file_exists(kb_name, file): + kb = get_knowledge_base(kb_name) + if kb: + for f in kb["files"]: + if ( + f["file_name"] == file.filename and + f["size"] == file.size and + f["content_type"] == file.headers["content-type"] + ): + return True + return False diff --git a/knowledge_base/routes.py b/knowledge_base/routes.py new file mode 100644 index 0000000..0d45ef5 --- /dev/null +++ b/knowledge_base/routes.py @@ -0,0 +1,94 @@ +import copy + +import nest_asyncio + +import db.knowledge_base.mongo_helper as mongo +from db.knowledge_base.kb_config import KnowledgeBase +import db.app_logger as log + +nest_asyncio.apply() + +def get_all(): + return mongo.get_knowledge_bases() + +def get_one(kb_name): + kb_config = mongo.get_knowledge_base(kb_name) + if kb_config: + return kb_config + else: + return { "message": f"{kb_name} does not exist" } + +def create(client_config): + if mongo.get_knowledge_base(client_config['kb_name']): + message = f"{client_config['kb_name']} already exists" + else: + kb_config = create_kb_config(client_config) + result = mongo.insert_knowledge_base(kb_config) + log.info("knowledge base created: ", result) + message = f"{kb_config['kb_name']} created" + + return {"message": message} + +def create_kb_config(client_config): + kb_config = copy.copy(client_config) + kb_config["id"] = kb_config["kb_name"] + kb_config["splitter_config"] = str_to_nums(kb_config["splitter_config"]) + kb_config["files"] = [] + log.info("kb_config.py _create_kb_config: ", client_config, kb_config) + return kb_config + +def str_to_nums(config_dict): + result = {} + for key in config_dict: + if is_int(config_dict[key]): + result[key] = int(config_dict[key]) + elif is_float(config_dict[key]): + result[key] = float(config_dict[key]) + else: + result[key] = config_dict[key] + + return result + 
+def is_int(s): + try: + int(s) + return True + except ValueError: + return False + +def is_float(value): + try: + float(value) + return True + except ValueError: + return False + + +async def upload_file(id, file): + log.info("in route", file.filename) + if not mongo.get_knowledge_base(id): + return {"message": f"Knowledge base {id} doesn't exist"} + elif mongo.file_exists(id, file): + return {"message": f"{file.filename} already exists in {id}"} + + else: + kb = KnowledgeBase(id) + log.info("in logic", file.filename) + try: + await kb.ingest_file(file) + return {"message": f"{file.filename} uploaded"} + except Exception as e: + return {"message": f"Error: {e}"} + + +# def create_kb(client_config): + # add properties to client_config + # kb_config = create_kb_config(client_config) + # log.info("kb_config.py create (classmethod): ", kb_config) + + # insert knowledge base configuration into database + # result = mongo.insert_knowledge_base(kb_config) + # log.info("create_kb: ", result) + # # name = kb_config["kb_name"] + + # return {"message": f"{kb_config["kb_name"]} created"} \ No newline at end of file From 139f11d788e37ede80a958bc61bdaf20283198e3 Mon Sep 17 00:00:00 2001 From: tlane25 Date: Tue, 6 Aug 2024 14:37:01 -0400 Subject: [PATCH 07/10] feat: db folder is now a package, allows for smooth imports from parent and child directories update: refactored knowledge base routes in server_kb.py, so fresh and so clean clean updated: test_server_kb.py updated to test for adding duplicate files --- __init__.py | 0 cleanup.py | 15 ++++-- server_kb.py | 52 ++++++------------- test_server_kb.py | 129 +++++++++++++++++++++++----------------------- 4 files changed, 93 insertions(+), 103 deletions(-) create mode 100644 __init__.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cleanup.py b/cleanup.py index c9de97d..c32d507 100644 --- a/cleanup.py +++ b/cleanup.py @@ -9,17 +9,24 @@ CONFIG_DB = os.environ["CONFIG_DB"] 
CONFIG_KB_COL = os.environ["CONFIG_KB_COL"] -kb_names = [] +kb_names = ["Sentence", "Semantic", "Markdown"] def drop_all_knowledge_bases(): print(f"Dropping all knowledge bases in {CONFIG_DB}") - pymongo.MongoClient(MONGO_URI).drop_database(CONFIG_DB) + client = pymongo.MongoClient(MONGO_URI) + client.drop_database(CONFIG_DB) + client.close() + def remove_kb_files(kb_names: list): - mongo_client = pymongo.MongoClient(MONGO_URI) + client = pymongo.MongoClient(MONGO_URI) for kb_name in kb_names: - mongo_client.drop_database(kb_name) + client.drop_database(kb_name) + + client.close() drop_all_knowledge_bases() +remove_kb_files(kb_names) + diff --git a/server_kb.py b/server_kb.py index aba43fb..bba5354 100644 --- a/server_kb.py +++ b/server_kb.py @@ -1,16 +1,12 @@ -import shutil -import os -import json from fastapi import FastAPI, File, UploadFile, Request from fastapi.middleware.cors import CORSMiddleware import nest_asyncio -import pymongo from dotenv import load_dotenv -import app_logger as log -import use_s3 -from knowledge_base.kb_config import KnowledgeBase +# import app_logger as log +# import use_s3 +import db.knowledge_base.routes as kb nest_asyncio.apply() @@ -39,52 +35,38 @@ @app.get('/api') async def root(): - log.info("server running") + # log.info("server running") return {"message": "Server running"} # knowledge base routes @app.get("/api/knowledge-bases") async def get_knowledge_bases(): - knowledge_bases = KnowledgeBase.get_knowledge_bases() - return knowledge_bases + return kb.get_all() # consider adding id to the body of the request sent from the client # to create a new knowledge base # otherwise, we will use the kb_name prop to see if knowledge base exists @app.post('/api/knowledge-bases') -async def create_kb(request: Request): - body = await request.json() - # ensure that the kb_name is unique - if KnowledgeBase.exists(body["kb_name"]): - message = f"{body['kb_name']} already exists" - else: - message = KnowledgeBase.create(body) - - return 
{"message": message} +async def create_knowledge_base(request: Request): + client_config = await request.json() + return kb.create(client_config) @app.get("/api/knowledge-base/{id}") async def get_knowledge_base(id: str): - kb_config = KnowledgeBase.exists(id) - if kb_config: - return kb_config - else: - return { "message": f"{id} does not exist" } + return kb.get_one(id) # this route adds a file to a knowledge base @app.post('/api/knowledge-bases/{id}/upload') async def upload_file(id: str, file: UploadFile=File(...)): - log.info("in route", file.filename) - if KnowledgeBase.exists(id): - kb = KnowledgeBase(id) - log.info("in logic", file.filename) - try: - await kb.ingest_file(file) - return {"message": f"{file.filename} uploaded"} - except Exception as e: - return {"message": f"Error: {e}"} - else: - return {"message": f"Knowledge base {id} doesn't exist"} + try: + return await kb.upload_file(id, file) + + except Exception as e: + return {"message": f"Error: {e}"} + + + if __name__ == "__main__": import uvicorn diff --git a/test_server_kb.py b/test_server_kb.py index cbed97d..acbe2b8 100644 --- a/test_server_kb.py +++ b/test_server_kb.py @@ -1,11 +1,7 @@ -import os - -import pymongo -from dotenv import load_dotenv from fastapi.testclient import TestClient -from server_kb import app -from kb_test_constants import ( +from .server_kb import app +from db.knowledge_base.kb_test_constants import ( client_sentence_config, client_semantic_config, client_llama_parse_config, @@ -14,77 +10,73 @@ server_llama_parse_config, ) -load_dotenv(override=True, dotenv_path=".env.testing") - -MONGO_URI = os.environ["MONGO_URI"] -CONFIG_DB = os.environ["CONFIG_DB"] - client = TestClient(app) +# def test_read_knowledge_bases(): +# response = client.get("/api/knowledge-bases") +# assert response.status_code == 200 +# assert response.json() == [] -def test_read_knowledge_bases(): - - response = client.get("/api/knowledge-bases") - assert response.status_code == 200 - assert 
response.json() == [] - -def test_create_simple_splitter_kb(): - response = client.post( - "/api/knowledge-bases", - json=client_sentence_config - ) - assert response.status_code == 200 - assert response.json() == {"message": "Sentence created"} +# def test_create_simple_splitter_kb(): +# response = client.post( +# "/api/knowledge-bases", +# json=client_sentence_config +# ) +# assert response.status_code == 200 +# assert response.json() == {"message": "Sentence created"} - response = client.post( - "/api/knowledge-bases", - json=client_sentence_config - ) - assert response.status_code == 200 - assert response.json() == {"message": "Sentence already exists"} +# response = client.post( +# "/api/knowledge-bases", +# json=client_sentence_config +# ) +# assert response.status_code == 200 +# assert response.json() == {"message": "Sentence already exists"} -def test_create_semantic_splitter_kb(): - response = client.post( - "/api/knowledge-bases", - json=client_semantic_config - ) - assert response.status_code == 200 - assert response.json() == {"message": "Semantic created"} +# def test_create_semantic_splitter_kb(): +# response = client.post( +# "/api/knowledge-bases", +# json=client_semantic_config +# ) +# assert response.status_code == 200 +# assert response.json() == {"message": "Semantic created"} - response = client.post( - "/api/knowledge-bases", - json=client_semantic_config - ) - assert response.status_code == 200 - assert response.json() == {"message": "Semantic already exists"} +# response = client.post( +# "/api/knowledge-bases", +# json=client_semantic_config +# ) +# assert response.status_code == 200 +# assert response.json() == {"message": "Semantic already exists"} -def test_create_markdown_splitter_kb(): - response = client.post( - "/api/knowledge-bases", - json=client_llama_parse_config - ) - assert response.status_code == 200 - assert response.json() == {"message": "Markdown created"} +# def test_create_markdown_splitter_kb(): +# response = 
client.post( +# "/api/knowledge-bases", +# json=client_llama_parse_config +# ) +# assert response.status_code == 200 +# assert response.json() == {"message": "Markdown created"} - response = client.post( - "/api/knowledge-bases", - json=client_llama_parse_config - ) - assert response.status_code == 200 - assert response.json() == {"message": "Markdown already exists"} +# response = client.post( +# "/api/knowledge-bases", +# json=client_llama_parse_config +# ) +# assert response.status_code == 200 +# assert response.json() == {"message": "Markdown already exists"} -def test_get_kb_config(): - response = client.get("/api/knowledge-base/Sentence") - assert response.status_code == 200 - assert response.json() == server_sentence_config +# def test_get_kb_config(): +# response = client.get("/api/knowledge-base/Sentence") +# assert response.status_code == 200 +# assert response.json() == server_sentence_config - response = client.get("/api/knowledge-base/unknown_kb") - assert response.status_code == 200 - assert response.json() == {"message": "unknown_kb does not exist"} +# response = client.get("/api/knowledge-base/unknown_kb") +# assert response.status_code == 200 +# assert response.json() == {"message": "unknown_kb does not exist"} # def test_upload_file(): -# response = client.post("/api/knowledge-bases/Sentence%20Splitter/upload", files={"file": ("test.txt", b"test")}) +# response = client.post( +# "/api/knowledge-bases/Sentence/upload", +# files={"file": ("test.txt", b"test")} +# ) # assert response.status_code == 200 # assert response.json() == {"message": "test.txt uploaded"} @@ -92,6 +84,15 @@ def test_get_kb_config(): # assert response.status_code == 200 # assert response.json()["files"].length == 1 + +def test_reupload_same_file(): + response = client.post( + "/api/knowledge-bases/Sentence/upload", + files={"file": ("test.txt", b"test")} + ) + assert response.status_code == 200 + assert response.json() == {"message": "test.txt already exists in Sentence"} + # 
response = client.post("/api/knowledge-bases/unknown_kb/upload", files={"file": ("test.txt", b"test")}) # assert response.status_code == 200 # assert response.json() == {"message": "Knowledge base unknown_kb doesn't exist"} From 2c187347687f7a13ab53c8c1b64edd8453021e94 Mon Sep 17 00:00:00 2001 From: tlane25 Date: Tue, 6 Aug 2024 14:42:49 -0400 Subject: [PATCH 08/10] update: moved random tests to random directory, changed names so they no longer trigger pytest --- random/docdbtest/README.md | 28 +++++++ random/docdbtest/env.template | 2 + random/docdbtest/files/file1.txt | 6 ++ random/docdbtest/files/file2.txt | 2 + random/docdbtest/list_vectors.py | 23 ++++++ random/docdbtest/load_vectors.py | 35 ++++++++ random/docdbtest/not_test_nodb.py | 12 +++ random/docdbtest/store_vectors.py | 32 ++++++++ random/hybridTest.py | 21 +++++ random/not_test_kb.py | 31 ++++++++ random/pipelineTest.py | 114 +++++++++++++++++++++++++++ random/promptTest.py | 41 ++++++++++ random/setup_test_not.py | 127 ++++++++++++++++++++++++++++++ 13 files changed, 474 insertions(+) create mode 100644 random/docdbtest/README.md create mode 100644 random/docdbtest/env.template create mode 100644 random/docdbtest/files/file1.txt create mode 100644 random/docdbtest/files/file2.txt create mode 100644 random/docdbtest/list_vectors.py create mode 100644 random/docdbtest/load_vectors.py create mode 100644 random/docdbtest/not_test_nodb.py create mode 100644 random/docdbtest/store_vectors.py create mode 100644 random/hybridTest.py create mode 100644 random/not_test_kb.py create mode 100644 random/pipelineTest.py create mode 100644 random/promptTest.py create mode 100644 random/setup_test_not.py diff --git a/random/docdbtest/README.md b/random/docdbtest/README.md new file mode 100644 index 0000000..3b2e417 --- /dev/null +++ b/random/docdbtest/README.md @@ -0,0 +1,28 @@ +# test files for connecting with AWS DocumentDB instance +- a set of test files to confirm a connection with AWS DocumentDB and vector 
embedding persistence +- Note: the LlamaIndex DocDB integration uses `pymongo` + - when running the various files, there may be errors from the pymongo package, but these do not seem to impact vector storage / retrieval + + + +## to use +- create a `.env` file (can use env.template as a starter) + - MONGO_URI will come from the AWS Console for DocumentDB +- ensure you download the global-bundle.pem from the AWS Console +- ensure you've run `pipenv shell` in the root project folder + + + +## overview of files +- `test_nodb.py` : will create a VectorStoreIndex from the same files with no persistence + - note that running this file should give you a 'baseline' of how llamaIndex will perform + - note also that since being first created, it appears an underlying LangChain method has changed and is now deprecated (as of Jul 21, 2024) + +- `store_vectors.py` : this will vectorize the same files and store the index in DocDB + +- `list_vectors.py` : this program queries the contents of the 'testdb', 'testcollection' directly and prints the values to screen + - note: you can pipe the output to a file to examine output more closely + - e.g., `python list_vectors.py > output.txt` + +- `load_vectors.py` : this will load the vector embeddings from DocDB and then run a query against them + diff --git a/random/docdbtest/env.template b/random/docdbtest/env.template new file mode 100644 index 0000000..7f14155 --- /dev/null +++ b/random/docdbtest/env.template @@ -0,0 +1,2 @@ +OPENAI_API_KEY= +MONGO_URI= diff --git a/random/docdbtest/files/file1.txt b/random/docdbtest/files/file1.txt new file mode 100644 index 0000000..13f6cc4 --- /dev/null +++ b/random/docdbtest/files/file1.txt @@ -0,0 +1,6 @@ +In the mystical land of Rudenza: +Apples are purple. +Clouds are silver, tinged with gold and bronze. +Pianos will bite your fingers if you don't wear stripes when you practice. 
+ + diff --git a/random/docdbtest/files/file2.txt b/random/docdbtest/files/file2.txt new file mode 100644 index 0000000..db01e83 --- /dev/null +++ b/random/docdbtest/files/file2.txt @@ -0,0 +1,2 @@ +PotatoPecanPie is the secret word. +James' favourite food is pizza. diff --git a/random/docdbtest/list_vectors.py b/random/docdbtest/list_vectors.py new file mode 100644 index 0000000..ba5f050 --- /dev/null +++ b/random/docdbtest/list_vectors.py @@ -0,0 +1,23 @@ +# this mini program is to list the vectors within the documentDB instance that were written by llamaIndex +# Note: the db and collection as named below - these can be changed when the vector_store is instantiated + + +import pymongo +import pprint +from dotenv import load_dotenv +import os + +load_dotenv(override=True) + +mongo_uri = os.environ["MONGO_URI"] +client = pymongo.MongoClient(mongo_uri) + + +db = client[os.environ["DOCDB_NAME"]] +collection = db[os.environ["DOCDB_COLLECTION"]] + +for post in collection.find(): + pprint.pprint(post) + + +print('==========') diff --git a/random/docdbtest/load_vectors.py b/random/docdbtest/load_vectors.py new file mode 100644 index 0000000..2ac3755 --- /dev/null +++ b/random/docdbtest/load_vectors.py @@ -0,0 +1,35 @@ +# based loosely upon LlamaIndex demo +# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ +# +# key goal here was to retrieve the stored vectors from DocumentDB rather than re-create them + + + +import pymongo +from dotenv import load_dotenv + +from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore +from llama_index.core import VectorStoreIndex +from llama_index.core import StorageContext +# from llama_index.core import SimpleDirectoryReader +import os + +load_dotenv(override=True) + +mongo_uri = os.environ["MONGO_URI"] +mongodb_client = pymongo.MongoClient(mongo_uri) +store = AWSDocDbVectorStore(mongodb_client, db_name='testdb', collection_name='testcollection') +storage_context = 
StorageContext.from_defaults(vector_store=store) + + + +index = VectorStoreIndex.from_vector_store( + vector_store=store, + storage_context=storage_context +) + + +response = index.as_query_engine().query('Tell me about Rudenza') +print(f"{response}") + + diff --git a/random/docdbtest/not_test_nodb.py b/random/docdbtest/not_test_nodb.py new file mode 100644 index 0000000..047ae53 --- /dev/null +++ b/random/docdbtest/not_test_nodb.py @@ -0,0 +1,12 @@ +# Testing vector store - no persistence + +from dotenv import load_dotenv +from llama_index.core import VectorStoreIndex, SimpleDirectoryReader + +load_dotenv() + +documents = SimpleDirectoryReader("files").load_data() + +index = VectorStoreIndex.from_documents(documents) + +print("Index created successfully!") diff --git a/random/docdbtest/store_vectors.py b/random/docdbtest/store_vectors.py new file mode 100644 index 0000000..fd16d01 --- /dev/null +++ b/random/docdbtest/store_vectors.py @@ -0,0 +1,32 @@ +# based upon LlamaIndex demo +# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ + + +import pymongo +from dotenv import load_dotenv + +from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore +from llama_index.core import VectorStoreIndex +from llama_index.core import StorageContext +from llama_index.core import SimpleDirectoryReader +import os + +load_dotenv(override=True) + + +mongo_uri = os.environ["MONGO_URI"] +mongodb_client = pymongo.MongoClient(mongo_uri) +store = AWSDocDbVectorStore(mongodb_client, db_name='testdb', collection_name='testcollection') +storage_context = StorageContext.from_defaults(vector_store=store) + +documents = SimpleDirectoryReader("files").load_data() + +index = VectorStoreIndex.from_documents( + documents, storage_context=storage_context +) + + +response = index.as_query_engine().query('Tell me about Rudenza') +print(f"{response}") + + diff --git a/random/hybridTest.py b/random/hybridTest.py new file mode 100644 index 0000000..e845cb0 --- /dev/null 
+++ b/random/hybridTest.py @@ -0,0 +1,21 @@ +import hybridSearch.search as search + +def print_nodes(nodes): + for node in nodes: + print(node) + + +kb_file_path = './tmpfiles/giraffes.pdf' +search.hybrid_write('giraffes', kb_file_path) # only need to do this the first time + +query = 'how long are giraffe necks?' + +# get nodes +nodes = search.hybrid_get_nodes('giraffes', query, top_k=5) + +all_nodes = nodes['keyword'] + nodes['vector'] + +print_nodes(all_nodes) + + + diff --git a/random/not_test_kb.py b/random/not_test_kb.py new file mode 100644 index 0000000..063a32e --- /dev/null +++ b/random/not_test_kb.py @@ -0,0 +1,31 @@ +import json + +import refactor1.db.knowledge_base.kb_config as kbClass + +kb_name = 'giraffe2' + +config_template = { + "id": kb_name, + "kb_name": kb_name, + "ingest_method": "Simple", + "splitter": "Sentence", + "embed_config": { + "embed_provider": "OpenAI", + "embed_model": "text-embedding-3-small" + }, + "splitter_config": { + "chunk_size": 1024, + "chunk_overlap": 200 + } +} + +json_config = json.dumps(config_template) +print(json_config) + +kbClass.KnowledgeBase.create(json_config) + +kb = kbClass.KnowledgeBase('giraffe2') +kb.ingest_file_path('./tmpfiles/giraffes.pdf') + + + diff --git a/random/pipelineTest.py b/random/pipelineTest.py new file mode 100644 index 0000000..5b3c70e --- /dev/null +++ b/random/pipelineTest.py @@ -0,0 +1,114 @@ +from llama_index.core import QueryBundle +from llama_index.core.postprocessor import SimilarityPostprocessor +from llama_index.core.postprocessor import LongContextReorder +from llama_index.postprocessor.colbert_rerank import ColbertRerank +from llama_index.core import get_response_synthesizer, PromptTemplate +from llama_index.core.response_synthesizers import ResponseMode + +import hybridSearch.search as search + +def print_nodes(nodes): + for node in nodes: + print(node) + + +query = 'tell me about promises' + +# get all nodes +nodes = search.hybrid_get_nodes(query, top_k=3) +all_nodes = 
nodes['keyword'] + nodes['vector'] + + +# similarity +similarity_pp = SimilarityPostprocessor( + nodes=all_nodes, + similarity_cutoff=0.5 +) + +nodes_similar = similarity_pp.postprocess_nodes(all_nodes) + + + + + +# Colbert rerank +reranker = ColbertRerank(top_n=4) +query_bundle = QueryBundle(query) + +nodes_rerank = reranker.postprocess_nodes(all_nodes, query_bundle) + +print('='*20) +print_nodes(nodes_rerank) + + + +# LongContextReorder +reorder = LongContextReorder() + +nodes_reorder = reorder.postprocess_nodes(nodes_rerank) + +print('='*20) +print_nodes(nodes_reorder) + + + +# Response synthesizer +synth = get_response_synthesizer( + response_mode=ResponseMode.SIMPLE_SUMMARIZE +) + +response = synth.synthesize(query, nodes=nodes_reorder) +print(response) + +print('*'*20) + + +# Custom Prompt +new_prompt = ( + "Context information is below.\n" + "-----------------------------\n" + "{context_str}\n" + "-----------------------------\n" + "Given the context information and not prior knowledge, " + "answer the query in French.\n" + "Query: {query_str}\n" + "Answer: " +) +new_template = PromptTemplate(new_prompt) + +synth.update_prompts( + {"text_qa_template": new_template} +) + +response = synth.synthesize(query, nodes=nodes_reorder) +print(response) + + + + + +''' +Notes: + +incorporate post-processing modules: +- created `pipelineTest.py` based upon ‘hybridTest.py’ +- added similarity +- adding ColbertRerank +- found Colbert import statement from https://docs.llamaindex.ai/en/stable/examples/pipeline/query_pipeline_memory/?h=colbertr +- found reranker syntax from https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/LLMReranker-Lyft-10k/?h=reranker +- adding LongContextReorder +- https://docs.llamaindex.ai/en/stable/module_guides/querying/node_postprocessors/node_postprocessors/?h= + +- post-processing modules all seem to work +- need to go from nodes to query response now +- llamaindex uses a “response synthesizer” +- 
https://docs.llamaindex.ai/en/stable/api_reference/response_synthesizers/ +- “simple_summarize” merges all text chunks from nodes into 1 and makes an LLM call +- it will fail if the merged text chunk exceeds the context window size + +- Accessing and customizing prompts +- https://docs.llamaindex.ai/en/stable/examples/prompts/prompt_mixin/ +- `synthesizer.get_prompts()` returns a dictionary of prompts +- key is a template (e.g., “text_qa_template”) +- see promptTest.py to access returned dict and display prompt content +''' \ No newline at end of file diff --git a/random/promptTest.py b/random/promptTest.py new file mode 100644 index 0000000..0b2820a --- /dev/null +++ b/random/promptTest.py @@ -0,0 +1,41 @@ + +from llama_index.core import get_response_synthesizer, PromptTemplate +from llama_index.core.response_synthesizers import ResponseMode + + +def display_prompts(prompts_dict): + for k, p in prompts_dict.items(): + print(f"Prompt Key: {k}") + print("Text: ") + print(p.get_template()) + print("-"*30) + + +synth = get_response_synthesizer( + response_mode=ResponseMode.SIMPLE_SUMMARIZE +) + +prompt = synth.get_prompts() + +display_prompts(prompt) + + +new_prompt = ( + "Context information is below.\n" + "-----------------------------\n" + "{context_str}\n" + "-----------------------------\n" + "Given the context information and not prior knowledge, " + "answer the query in French.\n" + "Query: {query_str}\n" + "Answer: " +) +new_template = PromptTemplate(new_prompt) + +synth.update_prompts( + {"text_qa_template": new_template} +) + +prompt = synth.get_prompts() + +display_prompts(prompt) diff --git a/random/setup_test_not.py b/random/setup_test_not.py new file mode 100644 index 0000000..d11caa1 --- /dev/null +++ b/random/setup_test_not.py @@ -0,0 +1,127 @@ +# helper scripts to populate test kbs +import os + +import pymongo +from dotenv import load_dotenv + +import hybridSearch.search as search + +load_dotenv(override=True) + +mongo_uri = os.environ["MONGO_URI"] 
+mongo = pymongo.MongoClient(mongo_uri) + + +# kb1 +kb1_file_path = './tmpfiles/AsyncJS.md' +search.keyword_write('kb1', kb1_file_path) +search.vector_write('kb1', kb1_file_path) + +#kb2 +kb2_file_path = './tmpfiles/cpumemory.pdf' +search.hybrid_write('kb2', kb2_file_path) + +#kb3 +kb3_file_path = './tmpfiles/newfile.txt' +search.hybrid_write('kb3', kb3_file_path) + + +# config db setup + +kb_config1 = { + 'id': 'kb1', + 'name': 'AsyncJS', + 'files': [{ 'filename': './tmpfiles/AsyncJS.md'}], + 'ingest': { + 'method': 'simple_ingest', + 'splitter': { + 'type': 'sentence', + 'chunk_size': '', + 'chunk_overlap': '', + 'separator': '', + }, + }, + 'embedding_model': 'gpt-3.5-turbo', + 'vector_store': { + 'name': 'idstring', + 'collection': 'vector_index', + }, + 'keyword_store': { + 'name': 'idstring', + 'collections': ['docstore/ref_doc_info', 'docstore/data', 'docstore/metadata'] + } +} + +kb_config2 = kb_config1.copy() +kb_config2['id'] = 'kb2' +kb_config2['name'] = 'cpumemory' +kb_config2['files'] = [{ 'filename': './tmpfiles/cpumemory.pdf'}], + + +kb_config3 = kb_config1.copy() +kb_config3['id'] = 'kb3' +kb_config3['name'] = 'newfile' +kb_config3['files'] = [{ 'filename': './tmpfiles/newfile.txt'}], + + +config_db = mongo[ os.environ["CONFIG_DB"] ] +config_kb_col = config_db[ os.environ["CONFIG_KB_COL"] ] +config_kb_col.insert_one(kb_config1) +config_kb_col.insert_one(kb_config2) +config_kb_col.insert_one(kb_config3) + + + + +# Pipeline config + +pipeline_config1 = { + 'id': 'pipeline1', + 'name': 'pipelineConfigName', + 'knowledgebases': ['kb1', 'kb2', 'kb3'], + 'retrieval': { + 'vector': 'llm_model_name', + }, + 'postprocessing': { + 'similarity': { + 'on': False, + 'similarity_cutoff': 0.7 + }, + 'colbertRerank': { + 'on': False, + 'top_n': 5 + }, + 'longContextReorder': { + 'on': True, + } + }, + 'generative_model': 'gpt-3.5-turbo', + 'prompt': { + 'on': True, + 'template_str': 'answer the question - {query_str} - in French' + } +} + +pipeline_config2 = 
pipeline_config1.copy() +pipeline_config2['id'] = 'pipeline2' +pipeline_config2['name'] = 'kb1 only (async)' +pipeline_config2['knowledgebases'] = ['kb1'] + + +pipeline_config3 = pipeline_config1.copy() +pipeline_config3['id'] = 'pipeline3' +pipeline_config3['name'] = 'kb2 only (cpumemory)' +pipeline_config3['knowledgebases'] = ['kb2'] + +pipeline_config4 = pipeline_config1.copy() +pipeline_config4['id'] = 'pipeline4' +pipeline_config4['name'] = 'kb1 (async) and kb2 (cpumemory)' +pipeline_config4['knowledgebases'] = ['kb2', 'kb1'] + +config_pipeline_col = config_db[ os.environ["CONFIG_PIPELINE_COL"] ] +config_pipeline_col.insert_one(pipeline_config1) +config_pipeline_col.insert_one(pipeline_config2) +config_pipeline_col.insert_one(pipeline_config3) +config_pipeline_col.insert_one(pipeline_config4) + + From c7aa67f1d437e17353096529f86a73633e0046f8 Mon Sep 17 00:00:00 2001 From: tlane25 Date: Tue, 6 Aug 2024 14:48:54 -0400 Subject: [PATCH 09/10] clean: removed duplicate and outdated files --- .gitignore | 1 + Pipfile | 3 + Pipfile.lock | 926 +++++++++--------------- docdbtest/README.md | 28 - docdbtest/env.template | 2 - docdbtest/files/file1.txt | 6 - docdbtest/files/file2.txt | 2 - docdbtest/list_vectors.py | 23 - docdbtest/load_vectors.py | 35 - docdbtest/store_vectors.py | 32 - docdbtest/test_nodb.py | 12 - hybridTest.py | 21 - kb_config.py | 313 -------- kb_constants.py | 138 ---- kb_test_constants.py | 96 --- kb_type_definitions.py | 49 -- lp_ingest.py | 73 -- mongo_helper.py | 28 - pipelineTest.py | 114 --- promptTest.py | 41 -- random_tests/docdbtest/README.md | 28 - random_tests/docdbtest/env.template | 2 - random_tests/docdbtest/files/file1.txt | 6 - random_tests/docdbtest/files/file2.txt | 2 - random_tests/docdbtest/list_vectors.py | 23 - random_tests/docdbtest/load_vectors.py | 35 - random_tests/docdbtest/store_vectors.py | 32 - random_tests/docdbtest/test_nodb.py | 12 - random_tests/hybridTest.py | 21 - random_tests/pipelineTest.py | 114 --- 
random_tests/promptTest.py | 41 -- random_tests/setup_test.py | 127 ---- random_tests/test_kb.py | 31 - setup_test.py | 127 ---- simple_ingest.py | 34 - test_kb.py | 31 - 36 files changed, 339 insertions(+), 2270 deletions(-) delete mode 100644 docdbtest/README.md delete mode 100644 docdbtest/env.template delete mode 100644 docdbtest/files/file1.txt delete mode 100644 docdbtest/files/file2.txt delete mode 100644 docdbtest/list_vectors.py delete mode 100644 docdbtest/load_vectors.py delete mode 100644 docdbtest/store_vectors.py delete mode 100644 docdbtest/test_nodb.py delete mode 100644 hybridTest.py delete mode 100644 kb_config.py delete mode 100644 kb_constants.py delete mode 100644 kb_test_constants.py delete mode 100644 kb_type_definitions.py delete mode 100644 lp_ingest.py delete mode 100644 mongo_helper.py delete mode 100644 pipelineTest.py delete mode 100644 promptTest.py delete mode 100644 random_tests/docdbtest/README.md delete mode 100644 random_tests/docdbtest/env.template delete mode 100644 random_tests/docdbtest/files/file1.txt delete mode 100644 random_tests/docdbtest/files/file2.txt delete mode 100644 random_tests/docdbtest/list_vectors.py delete mode 100644 random_tests/docdbtest/load_vectors.py delete mode 100644 random_tests/docdbtest/store_vectors.py delete mode 100644 random_tests/docdbtest/test_nodb.py delete mode 100644 random_tests/hybridTest.py delete mode 100644 random_tests/pipelineTest.py delete mode 100644 random_tests/promptTest.py delete mode 100644 random_tests/setup_test.py delete mode 100644 random_tests/test_kb.py delete mode 100644 setup_test.py delete mode 100644 simple_ingest.py delete mode 100644 test_kb.py diff --git a/.gitignore b/.gitignore index 2107347..1c1868b 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.env.testing # Spyder project settings .spyderproject diff --git a/Pipfile b/Pipfile index c8d45c6..91e65a7 100644 --- a/Pipfile +++ b/Pipfile @@ -29,6 +29,9 @@ rank-bm25 = 
"*" llama-index-postprocessor-colbert-rerank = "*" llama-index-retrievers-bm25 = "*" llama-index-storage-index-store-mongodb = "*" +llama-index-embeddings-bedrock = "*" +pytest = "*" +llama-index-llms-bedrock = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 29a882e..b77110a 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "7600d68f2958d15d1c1e02fe270e449dbce6ac7cbd87a0ed9d3341c7f092034d" + "sha256": "2daebc3854438cc1051b92024ae54d6ad97647ec460dd113dcaaec9974837fa4" }, "pipfile-spec": 6, "requires": { @@ -27,85 +27,85 @@ }, "aiohttp": { "hashes": [ - "sha256:03799a95402a7ed62671c4465e1eae51d749d5439dbc49edb6eee52ea165c50b", - "sha256:0433795c4a8bafc03deb3e662192250ba5db347c41231b0273380d2f53c9ea0b", - "sha256:06ef0135d7ab7fb0284342fbbf8e8ddf73b7fee8ecc55f5c3a3d0a6b765e6d8b", - "sha256:0b0c0148d2a69b82ffe650c2ce235b431d49a90bde7dd2629bcb40314957acf6", - "sha256:0d85a173b4dbbaaad1900e197181ea0fafa617ca6656663f629a8a372fdc7d06", - "sha256:10f0d7894ddc6ff8f369e3fdc082ef1f940dc1f5b9003cd40945d24845477220", - "sha256:12c43dace645023583f3dd2337dfc3aa92c99fb943b64dcf2bc15c7aa0fb4a95", - "sha256:13679e11937d3f37600860de1f848e2e062e2b396d3aa79b38c89f9c8ab7e791", - "sha256:1c83977f7b6f4f4a96fab500f5a76d355f19f42675224a3002d375b3fb309174", - "sha256:1dc95c5e2a5e60095f1bb51822e3b504e6a7430c9b44bff2120c29bb876c5202", - "sha256:1ebd8ed91428ffbe8b33a5bd6f50174e11882d5b8e2fe28670406ab5ee045ede", - "sha256:21dab4a704c68dc7bc2a1219a4027158e8968e2079f1444eda2ba88bc9f2895f", - "sha256:25a9924343bf91b0c5082cae32cfc5a1f8787ac0433966319ec07b0ed4570722", - "sha256:2a03a4407bdb9ae815f0d5a19df482b17df530cf7bf9c78771aa1c713c37ff1f", - "sha256:2dc75da06c35a7b47a88ceadbf993a53d77d66423c2a78de8c6f9fb41ec35687", - "sha256:31616121369bc823791056c632f544c6c8f8d1ceecffd8bf3f72ef621eaabf49", - "sha256:33acb0d9bf12cdc80ceec6f5fda83ea7990ce0321c54234d629529ca2c54e33d", - 
"sha256:33e915971eee6d2056d15470a1214e4e0f72b6aad10225548a7ab4c4f54e2db7", - "sha256:34adb8412e736a5d0df6d1fccdf71599dfb07a63add241a94a189b6364e997f1", - "sha256:34eaf5cfcc979846d73571b1a4be22cad5e029d55cdbe77cdc7545caa4dcb925", - "sha256:39e7ec718e7a1971a5d98357e3e8c0529477d45c711d32cd91999dc8d8404e1e", - "sha256:3f49edf7c5cd2987634116e1b6a0ee2438fca17f7c4ee480ff41decb76cf6158", - "sha256:43c60d9b332a01ee985f080f639f3e56abcfb95ec1320013c94083c3b6a2e143", - "sha256:4b099fbb823efed3c1d736f343ac60d66531b13680ee9b2669e368280f41c2b8", - "sha256:4f1de31a585344a106db43a9c3af2e15bb82e053618ff759f1fdd31d82da38eb", - "sha256:5268b35fee7eb754fb5b3d0f16a84a2e9ed21306f5377f3818596214ad2d7714", - "sha256:54076a25f32305e585a3abae1f0ad10646bec539e0e5ebcc62b54ee4982ec29f", - "sha256:5549c71c35b5f057a4eebcc538c41299826f7813f28880722b60e41c861a57ec", - "sha256:563705a94ea3af43467167f3a21c665f3b847b2a0ae5544fa9e18df686a660da", - "sha256:594b4b4f1dfe8378b4a0342576dc87a930c960641159f5ae83843834016dbd59", - "sha256:64a117c16273ca9f18670f33fc7fd9604b9f46ddb453ce948262889a6be72868", - "sha256:68ab608118e212f56feef44d4785aa90b713042da301f26338f36497b481cd79", - "sha256:6c99eef30a7e98144bcf44d615bc0f445b3a3730495fcc16124cb61117e1f81e", - "sha256:6dbfac556219d884d50edc6e1952a93545c2786193f00f5521ec0d9d464040ab", - "sha256:71c76685773444d90ae83874433505ed800e1706c391fdf9e57cc7857611e2f4", - "sha256:72de8ffba4a27e3c6e83e58a379fc4fe5548f69f9b541fde895afb9be8c31658", - "sha256:73c01201219eb039a828bb58dcc13112eec2fed6eea718356316cd552df26e04", - "sha256:77bbf0a2f6fefac6c0db1792c234f577d80299a33ce7125467439097cf869198", - "sha256:872c0dcaccebd5733d535868fe2356aa6939f5827dcea7a8b9355bb2eff6f56e", - "sha256:8c66a1aadafbc0bd7d648cb7fcb3860ec9beb1b436ce3357036a4d9284fcef9a", - "sha256:8cedc48d36652dd3ac40e5c7c139d528202393e341a5e3475acedb5e8d5c4c75", - "sha256:8d6dcd1d21da5ae1416f69aa03e883a51e84b6c803b8618cbab341ac89a85b9e", - "sha256:91e0b76502205484a4d1d6f25f461fa60fe81a7987b90e57f7b941b0753c3ec8", 
- "sha256:927b4aca6340301e7d8bb05278d0b6585b8633ea852b7022d604a5df920486bf", - "sha256:941366a554e566efdd3f042e17a9e461a36202469e5fd2aee66fe3efe6412aef", - "sha256:947da3aee057010bc750b7b4bb65cbd01b0bdb7c4e1cf278489a1d4a1e9596b3", - "sha256:9784246431eaf9d651b3cc06f9c64f9a9f57299f4971c5ea778fa0b81074ef13", - "sha256:9ca48e9f092a417c6669ee8d3a19d40b3c66dde1a2ae0d57e66c34812819b671", - "sha256:a04f2c8d41821a2507b49b2694c40495a295b013afb0cc7355b337980b47c546", - "sha256:a65472256c5232681968deeea3cd5453aa091c44e8db09f22f1a1491d422c2d9", - "sha256:aad87626f31a85fd4af02ba7fd6cc424b39d4bff5c8677e612882649da572e47", - "sha256:ab1d870403817c9a0486ca56ccbc0ebaf85d992277d48777faa5a95e40e5bcca", - "sha256:b6612c6ed3147a4a2d6463454b94b877566b38215665be4c729cd8b7bdce15b4", - "sha256:b7e3545b06aae925f90f06402e05cfb9c62c6409ce57041932163b09c48daad6", - "sha256:bbe2f6d0466f5c59c7258e0745c20d74806a1385fbb7963e5bbe2309a11cc69b", - "sha256:bdda86ab376f9b3095a1079a16fbe44acb9ddde349634f1c9909d13631ff3bcf", - "sha256:bec91402df78b897a47b66b9c071f48051cea68d853d8bc1d4404896c6de41ae", - "sha256:c8820dad615cd2f296ed3fdea8402b12663ac9e5ea2aafc90ef5141eb10b50b8", - "sha256:cc4376ff537f7d2c1e98f97f6d548e99e5d96078b0333c1d3177c11467b972de", - "sha256:ccab9381f38c669bb9254d848f3b41a3284193b3e274a34687822f98412097e9", - "sha256:cd36d0f0afc2bd84f007cedd2d9a449c3cf04af471853a25eb71f28bc2e1a119", - "sha256:d583755ddb9c97a2da1322f17fc7d26792f4e035f472d675e2761c766f94c2ff", - "sha256:d9b8b31c057a0b7bb822a159c490af05cb11b8069097f3236746a78315998afa", - "sha256:dcb6e65f6ea7caa0188e36bebe9e72b259d3d525634758c91209afb5a6cbcba7", - "sha256:e8dd7da2609303e3574c95b0ec9f1fd49647ef29b94701a2862cceae76382e1d", - "sha256:ebedc51ee6d39f9ea5e26e255fd56a7f4e79a56e77d960f9bae75ef4f95ed57f", - "sha256:effafe5144aa32f0388e8f99b1b2692cf094ea2f6b7ceca384b54338b77b1f50", - "sha256:f1bc4d68b83966012813598fe39b35b4e6019b69d29385cf7ec1cb08e1ff829b", - 
"sha256:f1cef548ee4e84264b78879de0c754bbe223193c6313beb242ce862f82eab184", - "sha256:f381424dbce313bb5a666a215e7a9dcebbc533e9a2c467a1f0c95279d24d1fa7", - "sha256:f3a1ea61d96146e9b9e5597069466e2e4d9e01e09381c5dd51659f890d5e29e7", - "sha256:f64d503c661864866c09806ac360b95457f872d639ca61719115a9f389b2ec90", - "sha256:f6fa7a42b78d8698491dc4ad388169de54cca551aa9900f750547372de396277", - "sha256:f76c12abb88b7ee64b3f9ae72f0644af49ff139067b5add142836dab405d60d4", - "sha256:f98f036eab11d2f90cdd01b9d1410de9d7eb520d070debeb2edadf158b758431", - "sha256:ff25d988fd6ce433b5c393094a5ca50df568bdccf90a8b340900e24e0d5fb45c" + "sha256:01c3f1eb280008e51965a8d160a108c333136f4a39d46f516c64d2aa2e6a53f2", + "sha256:028faf71b338f069077af6315ad54281612705d68889f5d914318cbc2aab0d50", + "sha256:03c0c380c83f8a8d4416224aafb88d378376d6f4cadebb56b060688251055cd4", + "sha256:0df51a3d70a2bfbb9c921619f68d6d02591f24f10e9c76de6f3388c89ed01de6", + "sha256:120548d89f14b76a041088b582454d89389370632ee12bf39d919cc5c561d1ca", + "sha256:1988b370536eb14f0ce7f3a4a5b422ab64c4e255b3f5d7752c5f583dc8c967fc", + "sha256:1a07c76a82390506ca0eabf57c0540cf5a60c993c442928fe4928472c4c6e5e6", + "sha256:1c2b104e81b3c3deba7e6f5bc1a9a0e9161c380530479970766a6655b8b77c7c", + "sha256:1c577cdcf8f92862363b3d598d971c6a84ed8f0bf824d4cc1ce70c2fb02acb4a", + "sha256:1f8605e573ed6c44ec689d94544b2c4bb1390aaa723a8b5a2cc0a5a485987a68", + "sha256:21778552ef3d44aac3278cc6f6d13a6423504fa5f09f2df34bfe489ed9ded7f5", + "sha256:2212296cdb63b092e295c3e4b4b442e7b7eb41e8a30d0f53c16d5962efed395d", + "sha256:222821c60b8f6a64c5908cb43d69c0ee978a1188f6a8433d4757d39231b42cdb", + "sha256:256ee6044214ee9d66d531bb374f065ee94e60667d6bbeaa25ca111fc3997158", + "sha256:2a384dfbe8bfebd203b778a30a712886d147c61943675f4719b56725a8bbe803", + "sha256:2fa643ca990323db68911b92f3f7a0ca9ae300ae340d0235de87c523601e58d9", + "sha256:41d8dab8c64ded1edf117d2a64f353efa096c52b853ef461aebd49abae979f16", + "sha256:440954ddc6b77257e67170d57b1026aa9545275c33312357472504eef7b4cc0b", 
+ "sha256:47b4c2412960e64d97258f40616efddaebcb34ff664c8a972119ed38fac2a62c", + "sha256:4a9ce70f5e00380377aac0e568abd075266ff992be2e271765f7b35d228a990c", + "sha256:4dcb127ca3eb0a61205818a606393cbb60d93b7afb9accd2fd1e9081cc533144", + "sha256:4e9e9171d2fe6bfd9d3838a6fe63b1e91b55e0bf726c16edf265536e4eafed19", + "sha256:51d03e948e53b3639ce4d438f3d1d8202898ec6655cadcc09ec99229d4adc2a9", + "sha256:54b7f4a20d7cc6bfa4438abbde069d417bb7a119f870975f78a2b99890226d55", + "sha256:587237571a85716d6f71f60d103416c9df7d5acb55d96d3d3ced65f39bff9c0c", + "sha256:5951c328f9ac42d7bce7a6ded535879bc9ae13032818d036749631fa27777905", + "sha256:5a95151a5567b3b00368e99e9c5334a919514f60888a6b6d2054fea5e66e527e", + "sha256:5c12310d153b27aa630750be44e79313acc4e864c421eb7d2bc6fa3429c41bf8", + "sha256:5cd57ad998e3038aa87c38fe85c99ed728001bf5dde8eca121cadee06ee3f637", + "sha256:615348fab1a9ef7d0960a905e83ad39051ae9cb0d2837da739b5d3a7671e497a", + "sha256:67f7639424c313125213954e93a6229d3a1d386855d70c292a12628f600c7150", + "sha256:68164d43c580c2e8bf8e0eb4960142919d304052ccab92be10250a3a33b53268", + "sha256:68cc24f707ed9cb961f6ee04020ca01de2c89b2811f3cf3361dc7c96a14bfbcc", + "sha256:6b14c19172eb53b63931d3e62a9749d6519f7c121149493e6eefca055fcdb352", + "sha256:777e23609899cb230ad2642b4bdf1008890f84968be78de29099a8a86f10b261", + "sha256:786299d719eb5d868f161aeec56d589396b053925b7e0ce36e983d30d0a3e55c", + "sha256:7ccf1f0a304352c891d124ac1a9dea59b14b2abed1704aaa7689fc90ef9c5be1", + "sha256:88596384c3bec644a96ae46287bb646d6a23fa6014afe3799156aef42669c6bd", + "sha256:89b47c125ab07f0831803b88aeb12b04c564d5f07a1c1a225d4eb4d2f26e8b5e", + "sha256:8b0d058e4e425d3b45e8ec70d49b402f4d6b21041e674798b1f91ba027c73f28", + "sha256:8c81ff4afffef9b1186639506d70ea90888218f5ddfff03870e74ec80bb59970", + "sha256:8db9b749f589b5af8e4993623dbda6716b2b7a5fcb0fa2277bf3ce4b278c7059", + "sha256:8e5a26d7aac4c0d8414a347da162696eea0629fdce939ada6aedf951abb1d745", + 
"sha256:8fbf8c0ded367c5c8eaf585f85ca8dd85ff4d5b73fb8fe1e6ac9e1b5e62e11f7", + "sha256:93094eba50bc2ad4c40ff4997ead1fdcd41536116f2e7d6cfec9596a8ecb3615", + "sha256:9c186b270979fb1dee3ababe2d12fb243ed7da08b30abc83ebac3a928a4ddb15", + "sha256:9cb54f5725b4b37af12edf6c9e834df59258c82c15a244daa521a065fbb11717", + "sha256:9fbff00646cf8211b330690eb2fd64b23e1ce5b63a342436c1d1d6951d53d8dd", + "sha256:a57e73f9523e980f6101dc9a83adcd7ac0006ea8bf7937ca3870391c7bb4f8ff", + "sha256:a702bd3663b5cbf3916e84bf332400d24cdb18399f0877ca6b313ce6c08bfb43", + "sha256:a77c79bac8d908d839d32c212aef2354d2246eb9deb3e2cb01ffa83fb7a6ea5d", + "sha256:abda4009a30d51d3f06f36bc7411a62b3e647fa6cc935ef667e3e3d3a7dd09b1", + "sha256:b023b68c61ab0cd48bd38416b421464a62c381e32b9dc7b4bdfa2905807452a4", + "sha256:b07286a1090483799599a2f72f76ac396993da31f6e08efedb59f40876c144fa", + "sha256:b0de63ff0307eac3961b4af74382d30220d4813f36b7aaaf57f063a1243b4214", + "sha256:b7d5bb926805022508b7ddeaad957f1fce7a8d77532068d7bdb431056dc630cd", + "sha256:b9db600a86414a9a653e3c1c7f6a2f6a1894ab8f83d11505247bd1b90ad57157", + "sha256:b9fb6508893dc31cfcbb8191ef35abd79751db1d6871b3e2caee83959b4d91eb", + "sha256:bc3ea6ef2a83edad84bbdb5d96e22f587b67c68922cd7b6f9d8f24865e655bcf", + "sha256:bde0693073fd5e542e46ea100aa6c1a5d36282dbdbad85b1c3365d5421490a92", + "sha256:bf66149bb348d8e713f3a8e0b4f5b952094c2948c408e1cfef03b49e86745d60", + "sha256:bfe33cba6e127d0b5b417623c9aa621f0a69f304742acdca929a9fdab4593693", + "sha256:c8fb76214b5b739ce59e2236a6489d9dc3483649cfd6f563dbf5d8e40dbdd57d", + "sha256:cb8b79a65332e1a426ccb6290ce0409e1dc16b4daac1cc5761e059127fa3d134", + "sha256:d6bbe2c90c10382ca96df33b56e2060404a4f0f88673e1e84b44c8952517e5f3", + "sha256:d8311d0d690487359fe2247ec5d2cac9946e70d50dced8c01ce9e72341c21151", + "sha256:d8a8221a63602008550022aa3a4152ca357e1dde7ab3dd1da7e1925050b56863", + "sha256:de1a91d5faded9054957ed0a9e01b9d632109341942fc123947ced358c5d9009", + "sha256:df31641e3f02b77eb3c5fb63c0508bee0fc067cf153da0e002ebbb0db0b6d91a", 
+ "sha256:e7168782621be4448d90169a60c8b37e9b0926b3b79b6097bc180c0a8a119e73", + "sha256:e7b55d9ede66af7feb6de87ff277e0ccf6d51c7db74cc39337fe3a0e31b5872d", + "sha256:e7dbf637f87dd315fa1f36aaed8afa929ee2c607454fb7791e74c88a0d94da59", + "sha256:f5293726943bdcea24715b121d8c4ae12581441d22623b0e6ab12d07ce85f9c4", + "sha256:f5dd109a925fee4c9ac3f6a094900461a2712df41745f5d04782ebcbe6479ccb", + "sha256:f6979b4f20d3e557a867da9d9227de4c156fcdcb348a5848e3e6190fd7feb972", + "sha256:f9f8beed277488a52ee2b459b23c4135e54d6a819eaba2e120e57311015b58e9" ], "markers": "python_version >= '3.8'", - "version": "==3.10.0" + "version": "==3.10.1" }, "aiosignal": { "hashes": [ @@ -157,11 +157,11 @@ }, "attrs": { "hashes": [ - "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30", - "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1" + "sha256:377b47448cb61fea38533f671fba0d0f8a96fd58facd4dc518e3dac9dbea0905", + "sha256:adbdec84af72d38be7628e353a09b6a6790d15cd71819f6e9d7b0faa8a125745" ], "markers": "python_version >= '3.7'", - "version": "==23.2.0" + "version": "==24.1.0" }, "beautifulsoup4": { "hashes": [ @@ -181,20 +181,20 @@ }, "boto3": { "hashes": [ - "sha256:92726a5be7083fd62585f8de251251ec7e53f4c7ee69c9c3168873fe979ec511", - "sha256:d34d7efe608b98cc10cfb43983bd2c511eb32efd5780ef72b171a3e3325462ff" + "sha256:7ca22adef4c77ee128e1e1dc7d48bc9512a87cc6fe3d771b3f913d5ecd41c057", + "sha256:864f06528c583dc7b02adf12db395ecfadbf9cb0da90e907e848ffb27128ce19" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==1.34.152" + "version": "==1.34.154" }, "botocore": { "hashes": [ - "sha256:8531eb0f8d3b7913df8b32ca96d415d3187de8681e4ac908657803eacc87ac54", - "sha256:e291e425e34e9fdcdf32d7c37fc099be057335b58cccabf5ee7c945322dbcd87" + "sha256:4eef4b1bb809b382ba9dc9c88f5fcc4a133f221a1acb693ee6bee4de9f325979", + "sha256:64d9b4c85a504d77cb56dabb2ad717cd8e1717424a88edb458b01d1e5797262a" ], "markers": "python_version >= '3.8'", - "version": 
"==1.34.152" + "version": "==1.34.154" }, "certifi": { "hashes": [ @@ -372,14 +372,6 @@ "markers": "python_version >= '3.8'", "version": "==2.6.1" }, - "email-validator": { - "hashes": [ - "sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631", - "sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7" - ], - "markers": "python_version >= '3.8'", - "version": "==2.2.0" - }, "exceptiongroup": { "hashes": [ "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", @@ -390,20 +382,12 @@ }, "fastapi": { "hashes": [ - "sha256:4f51cfa25d72f9fbc3280832e84b32494cf186f50158d364a8765aabf22587bf", - "sha256:ddd1ac34cb1f76c2e2d7f8545a4bcb5463bce4834e81abf0b189e0c359ab2413" + "sha256:3487ded9778006a45834b8c816ec4a48d522e2631ca9e75ec5a774f1b052f821", + "sha256:d262bc56b7d101d1f4e8fc0ad2ac75bb9935fec504d2b7117686cec50710cf05" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==0.111.1" - }, - "fastapi-cli": { - "hashes": [ - "sha256:a2552f3a7ae64058cdbb530be6fa6dbfc975dc165e4fa66d224c3d396e25e809", - "sha256:e2e9ffaffc1f7767f488d6da34b6f5a377751c996f397902eb6abb99a67bde32" - ], - "markers": "python_version >= '3.8'", - "version": "==0.0.4" + "version": "==0.112.0" }, "fastavro": { "hashes": [ @@ -623,47 +607,6 @@ "markers": "python_version >= '3.8'", "version": "==1.0.5" }, - "httptools": { - "hashes": [ - "sha256:00d5d4b68a717765b1fabfd9ca755bd12bf44105eeb806c03d1962acd9b8e563", - "sha256:0ac5a0ae3d9f4fe004318d64b8a854edd85ab76cffbf7ef5e32920faef62f142", - "sha256:0cf2372e98406efb42e93bfe10f2948e467edfd792b015f1b4ecd897903d3e8d", - "sha256:1ed99a373e327f0107cb513b61820102ee4f3675656a37a50083eda05dc9541b", - "sha256:3c3b214ce057c54675b00108ac42bacf2ab8f85c58e3f324a4e963bbc46424f4", - "sha256:3e802e0b2378ade99cd666b5bffb8b2a7cc8f3d28988685dc300469ea8dd86cb", - "sha256:3f30d3ce413088a98b9db71c60a6ada2001a08945cb42dd65a9a9fe228627658", - 
"sha256:405784577ba6540fa7d6ff49e37daf104e04f4b4ff2d1ac0469eaa6a20fde084", - "sha256:48ed8129cd9a0d62cf4d1575fcf90fb37e3ff7d5654d3a5814eb3d55f36478c2", - "sha256:4bd3e488b447046e386a30f07af05f9b38d3d368d1f7b4d8f7e10af85393db97", - "sha256:4f0f8271c0a4db459f9dc807acd0eadd4839934a4b9b892f6f160e94da309837", - "sha256:5cceac09f164bcba55c0500a18fe3c47df29b62353198e4f37bbcc5d591172c3", - "sha256:639dc4f381a870c9ec860ce5c45921db50205a37cc3334e756269736ff0aac58", - "sha256:678fcbae74477a17d103b7cae78b74800d795d702083867ce160fc202104d0da", - "sha256:6a4f5ccead6d18ec072ac0b84420e95d27c1cdf5c9f1bc8fbd8daf86bd94f43d", - "sha256:6f58e335a1402fb5a650e271e8c2d03cfa7cea46ae124649346d17bd30d59c90", - "sha256:75c8022dca7935cba14741a42744eee13ba05db00b27a4b940f0d646bd4d56d0", - "sha256:7a7ea483c1a4485c71cb5f38be9db078f8b0e8b4c4dc0210f531cdd2ddac1ef1", - "sha256:7d9ceb2c957320def533671fc9c715a80c47025139c8d1f3797477decbc6edd2", - "sha256:7ebaec1bf683e4bf5e9fbb49b8cc36da482033596a415b3e4ebab5a4c0d7ec5e", - "sha256:85ed077c995e942b6f1b07583e4eb0a8d324d418954fc6af913d36db7c05a5a0", - "sha256:8ae5b97f690badd2ca27cbf668494ee1b6d34cf1c464271ef7bfa9ca6b83ffaf", - "sha256:8b0bb634338334385351a1600a73e558ce619af390c2b38386206ac6a27fecfc", - "sha256:8e216a038d2d52ea13fdd9b9c9c7459fb80d78302b257828285eca1c773b99b3", - "sha256:93ad80d7176aa5788902f207a4e79885f0576134695dfb0fefc15b7a4648d503", - "sha256:95658c342529bba4e1d3d2b1a874db16c7cca435e8827422154c9da76ac4e13a", - "sha256:95fb92dd3649f9cb139e9c56604cc2d7c7bf0fc2e7c8d7fbd58f96e35eddd2a3", - "sha256:97662ce7fb196c785344d00d638fc9ad69e18ee4bfb4000b35a52efe5adcc949", - "sha256:9bb68d3a085c2174c2477eb3ffe84ae9fb4fde8792edb7bcd09a1d8467e30a84", - "sha256:b512aa728bc02354e5ac086ce76c3ce635b62f5fbc32ab7082b5e582d27867bb", - "sha256:c6e26c30455600b95d94b1b836085138e82f177351454ee841c148f93a9bad5a", - "sha256:d2f6c3c4cb1948d912538217838f6e9960bc4a521d7f9b323b3da579cd14532f", - "sha256:dcbab042cc3ef272adc11220517278519adf8f53fd3056d0e68f0a6f891ba94e", 
- "sha256:e0b281cf5a125c35f7f6722b65d8542d2e57331be573e9e88bc8b0115c4a7a81", - "sha256:e57997ac7fb7ee43140cc03664de5f268813a481dff6245e0075925adc6aa185", - "sha256:fe467eb086d80217b7584e61313ebadc8d187a4d95bb62031b7bab4b205c3ba3" - ], - "version": "==0.6.1" - }, "httpx": { "hashes": [ "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5", @@ -699,6 +642,14 @@ "markers": "python_version >= '3.5'", "version": "==3.7" }, + "iniconfig": { + "hashes": [ + "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", + "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374" + ], + "markers": "python_version >= '3.7'", + "version": "==2.0.0" + }, "jinja2": { "hashes": [ "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369", @@ -808,27 +759,27 @@ }, "langchain": { "hashes": [ - "sha256:5a7a8b4918f3d3bebce9b4f23b92d050699e6f7fb97591e8941177cf07a260a2", - "sha256:d7a9e4165f02dca0bd78addbc2319d5b9286b5d37c51d784124102b57e9fd297" + "sha256:565d2f5df1c06815d1c684400218ec4ae5e1027887aad343226fad846c54e726", + "sha256:fe7bd409c133017446fec54c38a5e7cb14f74e020090d7b5065374badf71e6d1" ], "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", - "version": "==0.2.11" + "version": "==0.2.12" }, "langchain-community": { "hashes": [ - "sha256:3a0404bad4bd07d6f86affdb62fb3d080a456c66191754d586a409d9d6024d62", - "sha256:9f4d1b5ab7f0b0a704f538e26e50fce45a461da6d2bf6b7b636d24f22fbc088a" + "sha256:465c03ba1603975d141533424185e09546ecf09e379c93aee2671bdc9b325cda", + "sha256:ede261ff8202f1433f004ee90baf89f371cee37cb1abfc16dd0f8392db10b23e" ], "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", - "version": "==0.2.10" + "version": "==0.2.11" }, "langchain-core": { "hashes": [ - "sha256:20c7792eb6c256dc50892d41f07f7fd8e12e5868dbb059fa316a278977bdf4f6", - "sha256:ab7bc58d8037349d06ad8c3eee2ea776e26af7f57cce330eecbed5b98e1c4d56" + 
"sha256:0728761d02ce696a1c6a57cfad18b874cf6c9566ba86120e2f542e442cb77a06", + "sha256:589f907fcb1f15acea55ea3f451a37faaa61c2e68b3d39d436cf73ca3dd23ef5" ], "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", - "version": "==0.2.26" + "version": "==0.2.28" }, "langchain-openai": { "hashes": [ @@ -848,11 +799,11 @@ }, "langsmith": { "hashes": [ - "sha256:2dc987a640939c0eccb41b7acc31216a7c9b38e1deb62079d1ecaed0897eea67", - "sha256:7f490af411597f87af7893b0a42519c442d6ca2517483344c4f55ccdc3136dde" + "sha256:5543d5fc6fe239ab4c92155002a9529ca5caab4ddabf5743c1dfab719471bc05", + "sha256:5831f2ea69faa43c168440fb74631da5981b27a051480def7b310b09b6a06980" ], "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", - "version": "==0.1.95" + "version": "==0.1.97" }, "llama-cloud": { "hashes": [ @@ -864,12 +815,12 @@ }, "llama-index": { "hashes": [ - "sha256:759326e6d36ce088b814b0d96bb0a5724bcbdbdbb3f91e268109cbab79fc5634", - "sha256:9da8c5d79051057b8cfcd5ffcfe0b55b15f84f296845dcd7b4a5ebd8d49f327e" + "sha256:a2b22cf858ff2b75f4a37f172f16779b7c478622984103122eff48703ab3be3c", + "sha256:a988a183d45c8e8e3a9488b8139f12972ab944a1c5c399a8f8a60996b0ff9239" ], "index": "pypi", "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", - "version": "==0.10.59" + "version": "==0.10.61" }, "llama-index-agent-openai": { "hashes": [ @@ -889,11 +840,20 @@ }, "llama-index-core": { "hashes": [ - "sha256:57074e91add520e1e4a85f0af1e54162dca40fb2e279a650c689f54067501de8", - "sha256:668c1f3e7811f4f853333a57c87f4a7ac9285eead79871db9f84a87fbe4a4a1e" + "sha256:46b1923df5d90fe860f894c58cc1c9c621a743d75c2077ddb3aaeecba7ffdd41", + "sha256:62008d2c3bea9398388941b49fda711735a252c193ac08a1867ee2710f032114" ], "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", - "version": "==0.10.59" + "version": "==0.10.61" + }, + "llama-index-embeddings-bedrock": { + "hashes": [ + "sha256:51430522637cfb06034b99a104752a20e660632b7792356b5a6126a239f78505", + 
"sha256:94f5ea8135a754ecb61dcab8b5f5d01ea0c4bfe0d923d35c230a3ae7cf442272" + ], + "index": "pypi", + "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", + "version": "==0.2.1" }, "llama-index-embeddings-cohere": { "hashes": [ @@ -947,6 +907,15 @@ "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", "version": "==0.1.16" }, + "llama-index-llms-bedrock": { + "hashes": [ + "sha256:3217108e28a43700c0b9444eb0843a580d9e91627b5550d874a2d83a4a9b94ab", + "sha256:9c575565623f6bd2b4445a2b7614d9dfac069b6cedc6ceeb740b5f4cac28368a" + ], + "index": "pypi", + "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", + "version": "==0.1.12" + }, "llama-index-llms-cohere": { "hashes": [ "sha256:255db5427df38ea5a43198d0c54fb2237074cd55d82c200aa195fc129b6e66a8", @@ -1034,12 +1003,12 @@ }, "llama-index-storage-index-store-mongodb": { "hashes": [ - "sha256:08d05f7710f9e8537868f74c5d912351c974ddb4abcb62930e6348b5a7ca82b6", - "sha256:783588d80a8c4ca5c490aaf226c7bdd01be3fe7f497f7390bd900803fc56476a" + "sha256:779f6b978d05481acd3d0d04e9ae13a6ad65a2ade9535e30635fa54272362fb6", + "sha256:b6789e8dd23d9e7d72ed15e44e3079de23e42ad9ff51a1e72c4c5124f4a7d20d" ], "index": "pypi", "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", - "version": "==0.1.2" + "version": "==0.2.0" }, "llama-index-storage-kvstore-mongodb": { "hashes": [ @@ -1076,14 +1045,6 @@ "markers": "python_full_version >= '3.8.1' and python_version < '4.0'", "version": "==0.4.9" }, - "markdown-it-py": { - "hashes": [ - "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", - "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb" - ], - "markers": "python_version >= '3.8'", - "version": "==3.0.0" - }, "markupsafe": { "hashes": [ "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf", @@ -1158,14 +1119,6 @@ "markers": "python_version >= '3.8'", "version": "==3.21.3" }, - "mdurl": { - "hashes": [ - 
"sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", - "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba" - ], - "markers": "python_version >= '3.7'", - "version": "==0.1.2" - }, "minijinja": { "hashes": [ "sha256:039f4d1a1a73f90917cff1ed7c617eb56e2b2f91bbbdc551adaa448e1673e5c2", @@ -1386,11 +1339,11 @@ }, "openai": { "hashes": [ - "sha256:9a6adda0d6ae8fce02d235c5671c399cfa40d6a281b3628914c7ebf244888ee3", - "sha256:faf87206785a6b5d9e34555d6a3242482a6852bc802e453e2a891f68ee04ce55" + "sha256:0cea446082f50985f26809d704a97749cb366a1ba230ef432c684a9745b3f2d9", + "sha256:a712553a131c59a249c474d0bb6a0414f41df36dc186d3a018fa7e600e57fb7f" ], "markers": "python_full_version >= '3.7.1'", - "version": "==1.37.1" + "version": "==1.39.0" }, "orjson": { "hashes": [ @@ -1586,6 +1539,14 @@ "markers": "python_version >= '3.8'", "version": "==10.4.0" }, + "pluggy": { + "hashes": [ + "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", + "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669" + ], + "markers": "python_version >= '3.8'", + "version": "==1.5.0" + }, "psycopg2-binary": { "hashes": [ "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9", @@ -1818,14 +1779,6 @@ "markers": "python_version >= '3.8'", "version": "==2.20.1" }, - "pygments": { - "hashes": [ - "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199", - "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a" - ], - "markers": "python_version >= '3.8'", - "version": "==2.18.0" - }, "pymongo": { "hashes": [ "sha256:0fc18b3a093f3db008c5fea0e980dbd3b743449eee29b5718bc2dc15ab5088bb", @@ -1904,6 +1857,15 @@ ], "version": "==2.2.0.1" }, + "pytest": { + "hashes": [ + "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5", + "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + 
"version": "==8.3.2" + }, "python-dateutil": { "hashes": [ "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", @@ -1921,14 +1883,6 @@ "markers": "python_version >= '3.8'", "version": "==1.0.1" }, - "python-multipart": { - "hashes": [ - "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026", - "sha256:97ca7b8ea7b05f977dc3849c3ba99d51689822fab725c3703af7c866a0c2b215" - ], - "markers": "python_version >= '3.8'", - "version": "==0.0.9" - }, "pytz": { "hashes": [ "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812", @@ -1995,11 +1949,11 @@ }, "ragas": { "hashes": [ - "sha256:3c343f3caba164e9ee8dfcaec7b53d8801d8233414c9b558ed57925c97e1dd2b", - "sha256:640dca0a466a5aeb88c5f4e962a371e05c9f8621dff7b7bf2882434b52325587" + "sha256:18b5b8aa6e6033ea4b8cd6388afae43ca762579d5cf5b533576ef952c1a5e438", + "sha256:919b3f91c57d6605d37b805a7f365c4a04be34577a3ec54d000c66056b3f005f" ], "index": "pypi", - "version": "==0.1.12" + "version": "==0.1.13" }, "rank-bm25": { "hashes": [ @@ -2102,14 +2056,6 @@ "markers": "python_version >= '3.8'", "version": "==2.32.3" }, - "rich": { - "hashes": [ - "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222", - "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432" - ], - "markers": "python_full_version >= '3.7.0'", - "version": "==13.7.1" - }, "s3transfer": { "hashes": [ "sha256:0711534e9356d3cc692fdde846b4a1e4b0cb6519971860796e6bc4c7aea00ef6", @@ -2120,109 +2066,119 @@ }, "safetensors": { "hashes": [ - "sha256:018b691383026a2436a22b648873ed11444a364324e7088b99cd2503dd828400", - "sha256:01e4b22e3284cd866edeabe4f4d896229495da457229408d2e1e4810c5187121", - "sha256:01feb3089e5932d7e662eda77c3ecc389f97c0883c4a12b5cfdc32b589a811c3", - "sha256:02318f01e332cc23ffb4f6716e05a492c5f18b1d13e343c49265149396284a44", - "sha256:02ef3a24face643456020536591fbd3c717c5abaa2737ec428ccbbc86dffa7a4", - 
"sha256:03a4447c784917c9bf01d8f2ac5080bc15c41692202cd5f406afba16629e84d6", - "sha256:084fc436e317f83f7071fc6a62ca1c513b2103db325cd09952914b50f51cf78f", - "sha256:0bf4f9d6323d9f86eef5567eabd88f070691cf031d4c0df27a40d3b4aaee755b", - "sha256:0d52c958dc210265157573f81d34adf54e255bc2b59ded6218500c9b15a750eb", - "sha256:0d5ffc6a80f715c30af253e0e288ad1cd97a3d0086c9c87995e5093ebc075e50", - "sha256:0d9cd8e1560dfc514b6d7859247dc6a86ad2f83151a62c577428d5102d872721", - "sha256:0dd37306546b58d3043eb044c8103a02792cc024b51d1dd16bd3dd1f334cb3ed", - "sha256:1139eb436fd201c133d03c81209d39ac57e129f5e74e34bb9ab60f8d9b726270", - "sha256:19bbdf95de2cf64f25cd614c5236c8b06eb2cfa47cbf64311f4b5d80224623a3", - "sha256:1ab6527a20586d94291c96e00a668fa03f86189b8a9defa2cdd34a1a01acc7d5", - "sha256:1b89381517891a7bb7d1405d828b2bf5d75528299f8231e9346b8eba092227f9", - "sha256:1f598b713cc1a4eb31d3b3203557ac308acf21c8f41104cdd74bf640c6e538e3", - "sha256:22d21760dc6ebae42e9c058d75aa9907d9f35e38f896e3c69ba0e7b213033856", - "sha256:22f3b5d65e440cec0de8edaa672efa888030802e11c09b3d6203bff60ebff05a", - "sha256:2a0deb16a1d3ea90c244ceb42d2c6c276059616be21a19ac7101aa97da448faf", - "sha256:2a1f4430cc0c9d6afa01214a4b3919d0a029637df8e09675ceef1ca3f0dfa0df", - "sha256:2d603846a8585b9432a0fd415db1d4c57c0f860eb4aea21f92559ff9902bae4d", - "sha256:2f85fc50c4e07a21e95c24e07460fe6f7e2859d0ce88092838352b798ce711c2", - "sha256:309b10dbcab63269ecbf0e2ca10ce59223bb756ca5d431ce9c9eeabd446569da", - "sha256:3615a96dd2dcc30eb66d82bc76cda2565f4f7bfa89fcb0e31ba3cea8a1a9ecbb", - "sha256:38e2a8666178224a51cca61d3cb4c88704f696eac8f72a49a598a93bbd8a4af9", - "sha256:393e6e391467d1b2b829c77e47d726f3b9b93630e6a045b1d1fca67dc78bf632", - "sha256:3f9cdca09052f585e62328c1c2923c70f46814715c795be65f0b93f57ec98a02", - "sha256:41a727a7f5e6ad9f1db6951adee21bbdadc632363d79dc434876369a17de6ad6", - "sha256:420a98f593ff9930f5822560d14c395ccbc57342ddff3b463bc0b3d6b1951550", - "sha256:446e9fe52c051aeab12aac63d1017e0f68a02a92a027b901c4f8e931b24e5397", 
- "sha256:455d538aa1aae4a8b279344a08136d3f16334247907b18a5c3c7fa88ef0d3c46", - "sha256:4f9bac020faba7f5dc481e881b14b6425265feabb5bfc552551d21189c0eddc3", - "sha256:53c4879b9c6bd7cd25d114ee0ef95420e2812e676314300624594940a8d6a91f", - "sha256:5757e4688f20df083e233b47de43845d1adb7e17b6cf7da5f8444416fc53828d", - "sha256:585c9ae13a205807b63bef8a37994f30c917ff800ab8a1ca9c9b5d73024f97ee", - "sha256:5d07cbca5b99babb692d76d8151bec46f461f8ad8daafbfd96b2fca40cadae65", - "sha256:5fc6775529fb9f0ce2266edd3e5d3f10aab068e49f765e11f6f2a63b5367021d", - "sha256:622afd28968ef3e9786562d352659a37de4481a4070f4ebac883f98c5836563e", - "sha256:6f9568f380f513a60139971169c4a358b8731509cc19112369902eddb33faa4d", - "sha256:70a5319ef409e7f88686a46607cbc3c428271069d8b770076feaf913664a07ac", - "sha256:74707624b81f1b7f2b93f5619d4a9f00934d5948005a03f2c1845ffbfff42212", - "sha256:7c4fa560ebd4522adddb71dcd25d09bf211b5634003f015a4b815b7647d62ebe", - "sha256:7de32d0d34b6623bb56ca278f90db081f85fb9c5d327e3c18fd23ac64f465768", - "sha256:840b7ac0eff5633e1d053cc9db12fdf56b566e9403b4950b2dc85393d9b88d67", - "sha256:840caf38d86aa7014fe37ade5d0d84e23dcfbc798b8078015831996ecbc206a3", - "sha256:8651c7299cbd8b4161a36cd6a322fa07d39cd23535b144d02f1c1972d0c62f3c", - "sha256:868ad1b6fc41209ab6bd12f63923e8baeb1a086814cb2e81a65ed3d497e0cf8f", - "sha256:88887f69f7a00cf02b954cdc3034ffb383b2303bc0ab481d4716e2da51ddc10e", - "sha256:89f9f17b0dacb913ed87d57afbc8aad85ea42c1085bd5de2f20d83d13e9fc4b2", - "sha256:8c496c5401c1b9c46d41a7688e8ff5b0310a3b9bae31ce0f0ae870e1ea2b8caf", - "sha256:8cf18888606dad030455d18f6c381720e57fc6a4170ee1966adb7ebc98d4d6a3", - "sha256:8d22c1a10dff3f64d0d68abb8298a3fd88ccff79f408a3e15b3e7f637ef5c980", - "sha256:90964917f5b0fa0fa07e9a051fbef100250c04d150b7026ccbf87a34a54012e0", - "sha256:9bfb92f82574d9e58401d79c70c716985dc049b635fef6eecbb024c79b2c46ad", - "sha256:9c6ad011c1b4e3acff058d6b090f1da8e55a332fbf84695cf3100c649cc452d1", - 
"sha256:a11c374eb63a9c16c5ed146457241182f310902bd2a9c18255781bb832b6748b", - "sha256:a7cef55929dcbef24af3eb40bedec35d82c3c2fa46338bb13ecf3c5720af8a61", - "sha256:a844cdb5d7cbc22f5f16c7e2a0271170750763c4db08381b7f696dbd2c78a361", - "sha256:ae7613a119a71a497d012ccc83775c308b9c1dab454806291427f84397d852fd", - "sha256:b1648568667f820b8c48317c7006221dc40aced1869908c187f493838a1362bc", - "sha256:b1e31be7945f66be23f4ec1682bb47faa3df34cb89fc68527de6554d3c4258a4", - "sha256:b277482120df46e27a58082df06a15aebda4481e30a1c21eefd0921ae7e03f65", - "sha256:b7ffba80aa49bd09195145a7fd233a7781173b422eeb995096f2b30591639517", - "sha256:b852e47eb08475c2c1bd8131207b405793bfc20d6f45aff893d3baaad449ed14", - "sha256:bb4f8c5d0358a31e9a08daeebb68f5e161cdd4018855426d3f0c23bb51087055", - "sha256:bbae3b4b9d997971431c346edbfe6e41e98424a097860ee872721e176040a893", - "sha256:befdf0167ad626f22f6aac6163477fcefa342224a22f11fdd05abb3995c1783c", - "sha256:c0acbe31340ab150423347e5b9cc595867d814244ac14218932a5cf1dd38eb39", - "sha256:c41e1893d1206aa7054029681778d9a58b3529d4c807002c156d58426c225173", - "sha256:c59d51f182c729f47e841510b70b967b0752039f79f1de23bcdd86462a9b09ee", - "sha256:cd6fff9e56df398abc5866b19a32124815b656613c1c5ec0f9350906fd798aac", - "sha256:cdd0a3b5da66e7f377474599814dbf5cbf135ff059cc73694de129b58a5e8a2c", - "sha256:cf476bca34e1340ee3294ef13e2c625833f83d096cfdf69a5342475602004f95", - "sha256:d0dd4a1db09db2dba0f94d15addc7e7cd3a7b0d393aa4c7518c39ae7374623c3", - "sha256:d1456f814655b224d4bf6e7915c51ce74e389b413be791203092b7ff78c936dd", - "sha256:d14d30c25897b2bf19b6fb5ff7e26cc40006ad53fd4a88244fdf26517d852dd7", - "sha256:d244bcafeb1bc06d47cfee71727e775bca88a8efda77a13e7306aae3813fa7e4", - "sha256:d8815b5e1dac85fc534a97fd339e12404db557878c090f90442247e87c8aeaea", - "sha256:d88b33980222085dd6001ae2cad87c6068e0991d4f5ccf44975d216db3b57376", - "sha256:d8c5093206ef4b198600ae484230402af6713dab1bd5b8e231905d754022bec7", - "sha256:d9c289f140a9ae4853fc2236a2ffc9a9f2d5eae0cb673167e0f1b8c18c0961ac", 
- "sha256:dcf5705cab159ce0130cd56057f5f3425023c407e170bca60b4868048bae64fd", - "sha256:e011cc162503c19f4b1fd63dfcddf73739c7a243a17dac09b78e57a00983ab35", - "sha256:e066e8861eef6387b7c772344d1fe1f9a72800e04ee9a54239d460c400c72aab", - "sha256:e0b2104df1579d6ba9052c0ae0e3137c9698b2d85b0645507e6fd1813b70931a", - "sha256:e375d975159ac534c7161269de24ddcd490df2157b55c1a6eeace6cbb56903f0", - "sha256:e4119532cd10dba04b423e0f86aecb96cfa5a602238c0aa012f70c3a40c44b50", - "sha256:e7dbbde64b6c534548696808a0e01276d28ea5773bc9a2dfb97a88cd3dffe3df", - "sha256:e9afd5358719f1b2cf425fad638fc3c887997d6782da317096877e5b15b2ce93", - "sha256:ec4b52ce9a396260eb9731eb6aea41a7320de22ed73a1042c2230af0212758ce", - "sha256:edb5698a7bc282089f64c96c477846950358a46ede85a1c040e0230344fdde10", - "sha256:ee463219d9ec6c2be1d331ab13a8e0cd50d2f32240a81d498266d77d07b7e71e", - "sha256:efcc860be094b8d19ac61b452ec635c7acb9afa77beb218b1d7784c6d41fe8ad", - "sha256:f5e6883af9a68c0028f70a4c19d5a6ab6238a379be36ad300a22318316c00cb0", - "sha256:f9650713b2cfa9537a2baf7dd9fee458b24a0aaaa6cafcea8bdd5fb2b8efdc34", - "sha256:faefeb3b81bdfb4e5a55b9bbdf3d8d8753f65506e1d67d03f5c851a6c87150e9", - "sha256:fb9c65bd82f9ef3ce4970dc19ee86be5f6f93d032159acf35e663c6bea02b237", - "sha256:fe746d03ed8d193674a26105e4f0fe6c726f5bb602ffc695b409eaf02f04763d", - "sha256:fef5d70683643618244a4f5221053567ca3e77c2531e42ad48ae05fae909f542" + "sha256:005ef9fc0f47cb9821c40793eb029f712e97278dae84de91cb2b4809b856685d", + "sha256:00eea99ae422fbfa0b46065acbc58b46bfafadfcec179d4b4a32d5c45006af6c", + "sha256:03f2bb92e61b055ef6cc22883ad1ae898010a95730fa988c60a23800eb742c2c", + "sha256:051d5ecd490af7245258000304b812825974d5e56f14a3ff7e1b8b2ba6dc2ed4", + "sha256:063421ef08ca1021feea8b46951251b90ae91f899234dd78297cbe7c1db73b99", + "sha256:0677c109d949cf53756859160b955b2e75b0eefe952189c184d7be30ecf7e858", + "sha256:068d3a33711fc4d93659c825a04480ff5a3854e1d78632cdc8f37fee917e8a60", + 
"sha256:08332c22e03b651c8eb7bf5fc2de90044f3672f43403b3d9ac7e7e0f4f76495e", + "sha256:08d464aa72a9a13826946b4fb9094bb4b16554bbea2e069e20bd903289b6ced9", + "sha256:0e5fe345b2bc7d88587149ac11def1f629d2671c4c34f5df38aed0ba59dc37f8", + "sha256:166c0c52f6488b8538b2a9f3fbc6aad61a7261e170698779b371e81b45f0440d", + "sha256:177f2b60a058f92a3cec7a1786c9106c29eca8987ecdfb79ee88126e5f47fa31", + "sha256:181fb5f3dee78dae7fd7ec57d02e58f7936498d587c6b7c1c8049ef448c8d285", + "sha256:1a32c662e7df9226fd850f054a3ead0e4213a96a70b5ce37b2d26ba27004e013", + "sha256:1a8043a33d58bc9b30dfac90f75712134ca34733ec3d8267b1bd682afe7194f5", + "sha256:1c6bf35e9a8998d8339fd9a05ac4ce465a4d2a2956cc0d837b67c4642ed9e947", + "sha256:1d1f34c71371f0e034004a0b583284b45d233dd0b5f64a9125e16b8a01d15067", + "sha256:20d218ec2b6899d29d6895419a58b6e44cc5ff8f0cc29fac8d236a8978ab702e", + "sha256:210160816d5a36cf41f48f38473b6f70d7bcb4b0527bedf0889cc0b4c3bb07db", + "sha256:218bbb9b883596715fc9997bb42470bf9f21bb832c3b34c2bf744d6fa8f2bbba", + "sha256:23654ad162c02a5636f0cd520a0310902c4421aab1d91a0b667722a4937cc445", + "sha256:239ee093b1db877c9f8fe2d71331a97f3b9c7c0d3ab9f09c4851004a11f44b65", + "sha256:26987dac3752688c696c77c3576f951dbbdb8c57f0957a41fb6f933cf84c0b62", + "sha256:275f500b4d26f67b6ec05629a4600645231bd75e4ed42087a7c1801bff04f4b3", + "sha256:2a69c71b1ae98a8021a09a0b43363b0143b0ce74e7c0e83cacba691b62655fb8", + "sha256:2a9e9d1a27e51a0f69e761a3d581c3af46729ec1c988fa1f839e04743026ae35", + "sha256:2ab4c96d922e53670ce25fbb9b63d5ea972e244de4fa1dd97b590d9fd66aacef", + "sha256:2adb497ada13097f30e386e88c959c0fda855a5f6f98845710f5bb2c57e14f12", + "sha256:2c37e6b714200824c73ca6eaf007382de76f39466a46e97558b8dc4cf643cfbf", + "sha256:2c42e9b277513b81cf507e6121c7b432b3235f980cac04f39f435b7902857f91", + "sha256:2cb4ac1d8f6b65ec84ddfacd275079e89d9df7c92f95675ba96c4f790a64df6e", + "sha256:2d065059e75a798bc1933c293b68d04d79b586bb7f8c921e0ca1e82759d0dbb1", + "sha256:2f8c2eb0615e2e64ee27d478c7c13f51e5329d7972d9e15528d3e4cfc4a08f0d", 
+ "sha256:30f23e6253c5f43a809dea02dc28a9f5fa747735dc819f10c073fe1b605e97d4", + "sha256:32f0d1f6243e90ee43bc6ee3e8c30ac5b09ca63f5dd35dbc985a1fc5208c451a", + "sha256:3467ab511bfe3360967d7dc53b49f272d59309e57a067dd2405b4d35e7dcf9dc", + "sha256:3daacc9a4e3f428a84dd56bf31f20b768eb0b204af891ed68e1f06db9edf546f", + "sha256:419010156b914a3e5da4e4adf992bee050924d0fe423c4b329e523e2c14c3547", + "sha256:43251d7f29a59120a26f5a0d9583b9e112999e500afabcfdcb91606d3c5c89e3", + "sha256:44d464bdc384874601a177375028012a5f177f1505279f9456fea84bbc575c7f", + "sha256:4b3e8aa8226d6560de8c2b9d5ff8555ea482599c670610758afdc97f3e021e9c", + "sha256:51bc8429d9376224cd3cf7e8ce4f208b4c930cd10e515b6ac6a72cbc3370f0d9", + "sha256:5512078d00263de6cb04e9d26c9ae17611098f52357fea856213e38dc462f81f", + "sha256:55c14c20be247b8a1aeaf3ab4476265e3ca83096bb8e09bb1a7aa806088def4f", + "sha256:56ad9776b65d8743f86698a1973292c966cf3abff627efc44ed60e66cc538ddd", + "sha256:57d216fab0b5c432aabf7170883d7c11671622bde8bd1436c46d633163a703f6", + "sha256:5c2308de665b7130cd0e40a2329278226e4cf083f7400c51ca7e19ccfb3886f3", + "sha256:5cf6c6f6193797372adf50c91d0171743d16299491c75acad8650107dffa9269", + "sha256:5fe3e9b705250d0172ed4e100a811543108653fb2b66b9e702a088ad03772a07", + "sha256:63144e36209ad8e4e65384dbf2d52dd5b1866986079c00a72335402a38aacdc5", + "sha256:65a4a6072436bf0a4825b1c295d248cc17e5f4651e60ee62427a5bcaa8622a7a", + "sha256:6a13a9caea485df164c51be4eb0c87f97f790b7c3213d635eba2314d959fe929", + "sha256:6b54bc4ca5f9b9bba8cd4fb91c24b2446a86b5ae7f8975cf3b7a277353c3127c", + "sha256:6ceed6247fc2d33b2a7b7d25d8a0fe645b68798856e0bc7a9800c5fd945eb80f", + "sha256:73fc9a0a4343188bdb421783e600bfaf81d0793cd4cce6bafb3c2ed567a74cd5", + "sha256:76897944cd9239e8a70955679b531b9a0619f76e25476e57ed373322d9c2075d", + "sha256:7915f0c60e4e6e65d90f136d85dd3b429ae9191c36b380e626064694563dbd9f", + "sha256:7bd5efc26b39f7fc82d4ab1d86a7f0644c8e34f3699c33f85bfa9a717a030e1b", + 
"sha256:7cabcf39c81e5b988d0adefdaea2eb9b4fd9bd62d5ed6559988c62f36bfa9a89", + "sha256:7d9b76322e49c056bcc819f8bdca37a2daa5a6d42c07f30927b501088db03309", + "sha256:7db7fdc2d71fd1444d85ca3f3d682ba2df7d61a637dfc6d80793f439eae264ab", + "sha256:8079486118919f600c603536e2490ca37b3dbd3280e3ad6eaacfe6264605ac8a", + "sha256:8359bef65f49d51476e9811d59c015f0ddae618ee0e44144f5595278c9f8268c", + "sha256:83c155b4a33368d9b9c2543e78f2452090fb030c52401ca608ef16fa58c98353", + "sha256:83d054818a8d1198d8bd8bc3ea2aac112a2c19def2bf73758321976788706398", + "sha256:87bf3f91a9328a941acc44eceffd4e1f5f89b030985b2966637e582157173b98", + "sha256:87df18fce4440477c3ef1fd7ae17c704a69a74a77e705a12be135ee0651a0c2d", + "sha256:87e9903b8668a16ef02c08ba4ebc91e57a49c481e9b5866e31d798632805014b", + "sha256:88a5ac3280232d4ed8e994cbc03b46a1807ce0aa123867b40c4a41f226c61f94", + "sha256:88f6fd5a5c1302ce79993cc5feeadcc795a70f953c762544d01fb02b2db4ea33", + "sha256:8d4f0eed76b430f009fbefca1a0028ddb112891b03cb556d7440d5cd68eb89a9", + "sha256:8db8f0c59c84792c12661f8efa85de160f80efe16b87a9d5de91b93f9e0bce3c", + "sha256:8e5b927acc5f2f59547270b0309a46d983edc44be64e1ca27a7fcb0474d6cd67", + "sha256:9353c2af2dd467333d4850a16edb66855e795561cd170685178f706c80d2c71e", + "sha256:949aaa1118660f992dbf0968487b3e3cfdad67f948658ab08c6b5762e90cc8b6", + "sha256:9850754c434e636ce3dc586f534bb23bcbd78940c304775bee9005bf610e98f1", + "sha256:9d625692578dd40a112df30c02a1adf068027566abd8e6a74893bb13d441c150", + "sha256:9f1a3e01dce3cd54060791e7e24588417c98b941baa5974700eeb0b8eb65b0a0", + "sha256:9fdcb80f4e9fbb33b58e9bf95e7dbbedff505d1bcd1c05f7c7ce883632710006", + "sha256:a2c28c6487f17d8db0089e8b2cdc13de859366b94cc6cdc50e1b0a4147b56551", + "sha256:a4b8617499b2371c7353302c5116a7e0a3a12da66389ce53140e607d3bf7b3d3", + "sha256:a51d0ddd4deb8871c6de15a772ef40b3dbd26a3c0451bb9e66bc76fc5a784e5b", + "sha256:a9c421153aa23c323bd8483d4155b4eee82c9a50ac11cccd83539104a8279c64", + "sha256:a9d752c97f6bbe327352f76e5b86442d776abc789249fc5e72eacb49e6916482", 
+ "sha256:b96c3d9266439d17f35fc2173111d93afc1162f168e95aed122c1ca517b1f8f1", + "sha256:baec5675944b4a47749c93c01c73d826ef7d42d36ba8d0dba36336fa80c76426", + "sha256:bb1ed4fcb0b3c2f3ea2c5767434622fe5d660e5752f21ac2e8d737b1e5e480bb", + "sha256:bb62841e839ee992c37bb75e75891c7f4904e772db3691c59daaca5b4ab960e1", + "sha256:bbaa31f2cb49013818bde319232ccd72da62ee40f7d2aa532083eda5664e85ff", + "sha256:bd574145d930cf9405a64f9923600879a5ce51d9f315443a5f706374841327b6", + "sha256:c05270b290acd8d249739f40d272a64dd597d5a4b90f27d830e538bc2549303c", + "sha256:c0cea44bba5c5601b297bc8307e4075535b95163402e4906b2e9b82788a2a6df", + "sha256:c11a4ab7debc456326a2bac67f35ee0ac792bcf812c7562a4a28559a5c795e27", + "sha256:c329a4dcc395364a1c0d2d1574d725fe81a840783dda64c31c5a60fc7d41472c", + "sha256:c6280f5aeafa1731f0a3709463ab33d8e0624321593951aefada5472f0b313fd", + "sha256:ca1a209157f242eb183e209040097118472e169f2e069bfbd40c303e24866543", + "sha256:cfc1fc38e37630dd12d519bdec9dcd4b345aec9930bb9ce0ed04461f49e58b52", + "sha256:d468cffb82d90789696d5b4d8b6ab8843052cba58a15296691a7a3df55143cd2", + "sha256:d52f5d0615ea83fd853d4e1d8acf93cc2e0223ad4568ba1e1f6ca72e94ea7b9d", + "sha256:df81e3407630de060ae8313da49509c3caa33b1a9415562284eaf3d0c7705f9f", + "sha256:e06a9ebc8656e030ccfe44634f2a541b4b1801cd52e390a53ad8bacbd65f8518", + "sha256:e4a0f374200e8443d9746e947ebb346c40f83a3970e75a685ade0adbba5c48d9", + "sha256:e5c9d86d9b13b18aafa88303e2cd21e677f5da2a14c828d2c460fe513af2e9a5", + "sha256:eb276a53717f2bcfb6df0bcf284d8a12069002508d4c1ca715799226024ccd45", + "sha256:ee9622e84fe6e4cd4f020e5fda70d6206feff3157731df7151d457fdae18e541", + "sha256:ef73df487b7c14b477016947c92708c2d929e1dee2bacdd6fff5a82ed4539537", + "sha256:f15117b96866401825f3e94543145028a2947d19974429246ce59403f49e77c6", + "sha256:f6784eed29f9e036acb0b7769d9e78a0dc2c72c2d8ba7903005350d817e287a4", + "sha256:f75698c5c5c542417ac4956acfc420f7d4a2396adca63a015fd66641ea751759", + 
"sha256:fb7b54830cee8cf9923d969e2df87ce20e625b1af2fd194222ab902d3adcc29c" ], "markers": "python_version >= '3.7'", - "version": "==0.4.3" + "version": "==0.4.4" }, "scikit-learn": { "hashes": [ @@ -2290,14 +2246,6 @@ "markers": "python_full_version >= '3.8.0'", "version": "==3.0.1" }, - "shellingham": { - "hashes": [ - "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", - "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de" - ], - "markers": "python_version >= '3.7'", - "version": "==1.5.4" - }, "six": { "hashes": [ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", @@ -2327,58 +2275,58 @@ "asyncio" ], "hashes": [ - "sha256:0b0f658414ee4e4b8cbcd4a9bb0fd743c5eeb81fc858ca517217a8013d282c96", - "sha256:2196208432deebdfe3b22185d46b08f00ac9d7b01284e168c212919891289396", - "sha256:23b9fbb2f5dd9e630db70fbe47d963c7779e9c81830869bd7d137c2dc1ad05fb", - "sha256:26a6a9837589c42b16693cf7bf836f5d42218f44d198f9343dd71d3164ceeeac", - "sha256:2a21c97efcbb9f255d5c12a96ae14da873233597dfd00a3a0c4ce5b3e5e79704", - "sha256:2e2c38c2a4c5c634fe6c3c58a789712719fa1bf9b9d6ff5ebfce9a9e5b89c1ca", - "sha256:2fc47dc6185a83c8100b37acda27658fe4dbd33b7d5e7324111f6521008ab4fe", - "sha256:2fd17e3bb8058359fa61248c52c7b09a97cf3c820e54207a50af529876451808", - "sha256:352b2770097f41bff6029b280c0e03b217c2dcaddc40726f8f53ed58d8a85da4", - "sha256:3b74570d99126992d4b0f91fb87c586a574a5872651185de8297c6f90055ae42", - "sha256:3cb8a66b167b033ec72c3812ffc8441d4e9f5f78f5e31e54dcd4c90a4ca5bebc", - "sha256:3f9faef422cfbb8fd53716cd14ba95e2ef655400235c3dfad1b5f467ba179c8c", - "sha256:4b600e9a212ed59355813becbcf282cfda5c93678e15c25a0ef896b354423238", - "sha256:501ff052229cb79dd4c49c402f6cb03b5a40ae4771efc8bb2bfac9f6c3d3508f", - "sha256:56d51ae825d20d604583f82c9527d285e9e6d14f9a5516463d9705dab20c3740", - "sha256:597fec37c382a5442ffd471f66ce12d07d91b281fd474289356b1a0041bdf31d", - "sha256:5a48ac4d359f058474fadc2115f78a5cdac9988d4f99eae44917f36aa1476327", 
- "sha256:5b6cf796d9fcc9b37011d3f9936189b3c8074a02a4ed0c0fbbc126772c31a6d4", - "sha256:66f63278db425838b3c2b1c596654b31939427016ba030e951b292e32b99553e", - "sha256:69f3e3c08867a8e4856e92d7afb618b95cdee18e0bc1647b77599722c9a28911", - "sha256:6e2622844551945db81c26a02f27d94145b561f9d4b0c39ce7bfd2fda5776dac", - "sha256:6f77c4f042ad493cb8595e2f503c7a4fe44cd7bd59c7582fd6d78d7e7b8ec52c", - "sha256:74afabeeff415e35525bf7a4ecdab015f00e06456166a2eba7590e49f8db940e", - "sha256:750900a471d39a7eeba57580b11983030517a1f512c2cb287d5ad0fcf3aebd58", - "sha256:78fe11dbe37d92667c2c6e74379f75746dc947ee505555a0197cfba9a6d4f1a4", - "sha256:79a40771363c5e9f3a77f0e28b3302801db08040928146e6808b5b7a40749c88", - "sha256:7bd112be780928c7f493c1a192cd8c5fc2a2a7b52b790bc5a84203fb4381c6be", - "sha256:8a41514c1a779e2aa9a19f67aaadeb5cbddf0b2b508843fcd7bafdf4c6864005", - "sha256:9f2bee229715b6366f86a95d497c347c22ddffa2c7c96143b59a2aa5cc9eebbc", - "sha256:9fea3d0884e82d1e33226935dac990b967bef21315cbcc894605db3441347443", - "sha256:afb6dde6c11ea4525318e279cd93c8734b795ac8bb5dda0eedd9ebaca7fa23f1", - "sha256:b607489dd4a54de56984a0c7656247504bd5523d9d0ba799aef59d4add009484", - "sha256:b6e22630e89f0e8c12332b2b4c282cb01cf4da0d26795b7eae16702a608e7ca1", - "sha256:b9c01990d9015df2c6f818aa8f4297d42ee71c9502026bb074e713d496e26b67", - "sha256:bd15026f77420eb2b324dcb93551ad9c5f22fab2c150c286ef1dc1160f110203", - "sha256:c06fb43a51ccdff3b4006aafee9fcf15f63f23c580675f7734245ceb6b6a9e05", - "sha256:c76c81c52e1e08f12f4b6a07af2b96b9b15ea67ccdd40ae17019f1c373faa227", - "sha256:ccaf1b0c90435b6e430f5dd30a5aede4764942a695552eb3a4ab74ed63c5b8d3", - "sha256:cd1591329333daf94467e699e11015d9c944f44c94d2091f4ac493ced0119449", - "sha256:cd5b94d4819c0c89280b7c6109c7b788a576084bf0a480ae17c227b0bc41e109", - "sha256:d337bf94052856d1b330d5fcad44582a30c532a2463776e1651bd3294ee7e58b", - "sha256:dc251477eae03c20fae8db9c1c23ea2ebc47331bcd73927cdcaecd02af98d3c3", - 
"sha256:dc6d69f8829712a4fd799d2ac8d79bdeff651c2301b081fd5d3fe697bd5b4ab9", - "sha256:f2a213c1b699d3f5768a7272de720387ae0122f1becf0901ed6eaa1abd1baf6c", - "sha256:f3ad7f221d8a69d32d197e5968d798217a4feebe30144986af71ada8c548e9fa", - "sha256:f43e93057cf52a227eda401251c72b6fbe4756f35fa6bfebb5d73b86881e59b0", - "sha256:f68470edd70c3ac3b6cd5c2a22a8daf18415203ca1b036aaeb9b0fb6f54e8298", - "sha256:fa4b1af3e619b5b0b435e333f3967612db06351217c58bfb50cee5f003db2a5a", - "sha256:fc6b14e8602f59c6ba893980bea96571dd0ed83d8ebb9c4479d9ed5425d562e9" + "sha256:01438ebcdc566d58c93af0171c74ec28efe6a29184b773e378a385e6215389da", + "sha256:0c1c9b673d21477cec17ab10bc4decb1322843ba35b481585facd88203754fc5", + "sha256:0c9045ecc2e4db59bfc97b20516dfdf8e41d910ac6fb667ebd3a79ea54084619", + "sha256:0d322cc9c9b2154ba7e82f7bf25ecc7c36fbe2d82e2933b3642fc095a52cfc78", + "sha256:0ef18a84e5116340e38eca3e7f9eeaaef62738891422e7c2a0b80feab165905f", + "sha256:1467940318e4a860afd546ef61fefb98a14d935cd6817ed07a228c7f7c62f389", + "sha256:14e09e083a5796d513918a66f3d6aedbc131e39e80875afe81d98a03312889e6", + "sha256:167e7497035c303ae50651b351c28dc22a40bb98fbdb8468cdc971821b1ae533", + "sha256:19d98f4f58b13900d8dec4ed09dd09ef292208ee44cc9c2fe01c1f0a2fe440e9", + "sha256:21b053be28a8a414f2ddd401f1be8361e41032d2ef5884b2f31d31cb723e559f", + "sha256:251f0d1108aab8ea7b9aadbd07fb47fb8e3a5838dde34aa95a3349876b5a1f1d", + "sha256:295ff8689544f7ee7e819529633d058bd458c1fd7f7e3eebd0f9268ebc56c2a0", + "sha256:2b6be53e4fde0065524f1a0a7929b10e9280987b320716c1509478b712a7688c", + "sha256:306fe44e754a91cd9d600a6b070c1f2fadbb4a1a257b8781ccf33c7067fd3e4d", + "sha256:31983018b74908ebc6c996a16ad3690301a23befb643093fcfe85efd292e384d", + "sha256:328429aecaba2aee3d71e11f2477c14eec5990fb6d0e884107935f7fb6001632", + "sha256:3bd1cae7519283ff525e64645ebd7a3e0283f3c038f461ecc1c7b040a0c932a1", + "sha256:3cd33c61513cb1b7371fd40cf221256456d26a56284e7d19d1f0b9f1eb7dd7e8", + "sha256:3eb6a97a1d39976f360b10ff208c73afb6a4de86dd2a6212ddf65c4a6a2347d5", 
+ "sha256:4363ed245a6231f2e2957cccdda3c776265a75851f4753c60f3004b90e69bfeb", + "sha256:4488120becf9b71b3ac718f4138269a6be99a42fe023ec457896ba4f80749525", + "sha256:49496b68cd190a147118af585173ee624114dfb2e0297558c460ad7495f9dfe2", + "sha256:4979dc80fbbc9d2ef569e71e0896990bc94df2b9fdbd878290bd129b65ab579c", + "sha256:52fec964fba2ef46476312a03ec8c425956b05c20220a1a03703537824b5e8e1", + "sha256:5954463675cb15db8d4b521f3566a017c8789222b8316b1e6934c811018ee08b", + "sha256:62e23d0ac103bcf1c5555b6c88c114089587bc64d048fef5bbdb58dfd26f96da", + "sha256:6bab3db192a0c35e3c9d1560eb8332463e29e5507dbd822e29a0a3c48c0a8d92", + "sha256:6c742be912f57586ac43af38b3848f7688863a403dfb220193a882ea60e1ec3a", + "sha256:723a40ee2cc7ea653645bd4cf024326dea2076673fc9d3d33f20f6c81db83e1d", + "sha256:78c03d0f8a5ab4f3034c0e8482cfcc415a3ec6193491cfa1c643ed707d476f16", + "sha256:7d6ba0497c1d066dd004e0f02a92426ca2df20fac08728d03f67f6960271feec", + "sha256:7dd8583df2f98dea28b5cd53a1beac963f4f9d087888d75f22fcc93a07cf8d84", + "sha256:85a01b5599e790e76ac3fe3aa2f26e1feba56270023d6afd5550ed63c68552b3", + "sha256:8a37e4d265033c897892279e8adf505c8b6b4075f2b40d77afb31f7185cd6ecd", + "sha256:8bd63d051f4f313b102a2af1cbc8b80f061bf78f3d5bd0843ff70b5859e27924", + "sha256:916a798f62f410c0b80b63683c8061f5ebe237b0f4ad778739304253353bc1cb", + "sha256:9365a3da32dabd3e69e06b972b1ffb0c89668994c7e8e75ce21d3e5e69ddef28", + "sha256:99db65e6f3ab42e06c318f15c98f59a436f1c78179e6a6f40f529c8cc7100b22", + "sha256:aaf04784797dcdf4c0aa952c8d234fa01974c4729db55c45732520ce12dd95b4", + "sha256:acd9b73c5c15f0ec5ce18128b1fe9157ddd0044abc373e6ecd5ba376a7e5d961", + "sha256:ada0102afff4890f651ed91120c1120065663506b760da4e7823913ebd3258be", + "sha256:b178e875a7a25b5938b53b006598ee7645172fccafe1c291a706e93f48499ff5", + "sha256:b27dfb676ac02529fb6e343b3a482303f16e6bc3a4d868b73935b8792edb52d0", + "sha256:b8afd5b26570bf41c35c0121801479958b4446751a3971fb9a480c1afd85558e", + 
"sha256:bf2360a5e0f7bd75fa80431bf8ebcfb920c9f885e7956c7efde89031695cafb8", + "sha256:c1b88cc8b02b6a5f0efb0345a03672d4c897dc7d92585176f88c67346f565ea8", + "sha256:c41a2b9ca80ee555decc605bd3c4520cc6fef9abde8fd66b1cf65126a6922d65", + "sha256:c750987fc876813f27b60d619b987b057eb4896b81117f73bb8d9918c14f1cad", + "sha256:e567a8793a692451f706b363ccf3c45e056b67d90ead58c3bc9471af5d212202" ], "markers": "python_version >= '3.7'", - "version": "==2.0.31" + "version": "==2.0.32" }, "starlette": { "hashes": [ @@ -2567,6 +2515,14 @@ "markers": "python_version >= '3.7'", "version": "==0.19.1" }, + "tomli": { + "hashes": [ + "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", + "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" + ], + "markers": "python_version < '3.11'", + "version": "==2.0.1" + }, "torch": { "hashes": [ "sha256:2497cbc7b3c951d69b276ca51fe01c2865db67040ac67f5fc20b03e41d16ea4a", @@ -2595,27 +2551,19 @@ }, "tqdm": { "hashes": [ - "sha256:b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644", - "sha256:e4d936c9de8727928f3be6079590e97d9abfe8d39a590be678eb5919ffc186bb" + "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd", + "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad" ], "markers": "python_version >= '3.7'", - "version": "==4.66.4" + "version": "==4.66.5" }, "transformers": { "hashes": [ - "sha256:6552beada5d826c25ff9b79139d237ab9050c6ea96b73d7fd2f8a8ba23ee76a4", - "sha256:820c5b192bb1bf47250802901a8f0bf581e06b8fded89179d4ef08a1e903ee1c" + "sha256:b62288990a65ed9bfb79191e04dbb76c9376834ae6e0dd911320a2ced63324fe", + "sha256:d2202ed201e0c44f80de8d753a19f6164187754630bc1f915661b9511d61c773" ], "markers": "python_full_version >= '3.8.0'", - "version": "==4.43.3" - }, - "typer": { - "hashes": [ - "sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914", - "sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482" - ], - "markers": 
"python_version >= '3.7'", - "version": "==0.12.3" + "version": "==4.43.4" }, "types-requests": { "hashes": [ @@ -2656,210 +2604,6 @@ "markers": "python_version >= '3.10'", "version": "==2.2.2" }, - "uvicorn": { - "extras": [ - "standard" - ], - "hashes": [ - "sha256:00db9a9e3711a5fa59866e2b02fac69d8dc70ce0814aaec9a66d1d9e5c832a30", - "sha256:06b00e3087e58c6865c284143c0c42f810b32ff4f265ab19d08c566f74a08728" - ], - "markers": "python_version >= '3.8'", - "version": "==0.30.4" - }, - "uvloop": { - "hashes": [ - "sha256:0246f4fd1bf2bf702e06b0d45ee91677ee5c31242f39aab4ea6fe0c51aedd0fd", - "sha256:02506dc23a5d90e04d4f65c7791e65cf44bd91b37f24cfc3ef6cf2aff05dc7ec", - "sha256:13dfdf492af0aa0a0edf66807d2b465607d11c4fa48f4a1fd41cbea5b18e8e8b", - "sha256:2693049be9d36fef81741fddb3f441673ba12a34a704e7b4361efb75cf30befc", - "sha256:271718e26b3e17906b28b67314c45d19106112067205119dddbd834c2b7ce797", - "sha256:2df95fca285a9f5bfe730e51945ffe2fa71ccbfdde3b0da5772b4ee4f2e770d5", - "sha256:31e672bb38b45abc4f26e273be83b72a0d28d074d5b370fc4dcf4c4eb15417d2", - "sha256:34175c9fd2a4bc3adc1380e1261f60306344e3407c20a4d684fd5f3be010fa3d", - "sha256:45bf4c24c19fb8a50902ae37c5de50da81de4922af65baf760f7c0c42e1088be", - "sha256:472d61143059c84947aa8bb74eabbace30d577a03a1805b77933d6bd13ddebbd", - "sha256:47bf3e9312f63684efe283f7342afb414eea4d3011542155c7e625cd799c3b12", - "sha256:492e2c32c2af3f971473bc22f086513cedfc66a130756145a931a90c3958cb17", - "sha256:4ce6b0af8f2729a02a5d1575feacb2a94fc7b2e983868b009d51c9a9d2149bef", - "sha256:5138821e40b0c3e6c9478643b4660bd44372ae1e16a322b8fc07478f92684e24", - "sha256:5588bd21cf1fcf06bded085f37e43ce0e00424197e7c10e77afd4bbefffef428", - "sha256:570fc0ed613883d8d30ee40397b79207eedd2624891692471808a95069a007c1", - "sha256:5a05128d315e2912791de6088c34136bfcdd0c7cbc1cf85fd6fd1bb321b7c849", - "sha256:5daa304d2161d2918fa9a17d5635099a2f78ae5b5960e742b2fcfbb7aefaa593", - "sha256:5f17766fb6da94135526273080f3455a112f82570b2ee5daa64d682387fe0dcd", - 
"sha256:6e3d4e85ac060e2342ff85e90d0c04157acb210b9ce508e784a944f852a40e67", - "sha256:7010271303961c6f0fe37731004335401eb9075a12680738731e9c92ddd96ad6", - "sha256:7207272c9520203fea9b93843bb775d03e1cf88a80a936ce760f60bb5add92f3", - "sha256:78ab247f0b5671cc887c31d33f9b3abfb88d2614b84e4303f1a63b46c046c8bd", - "sha256:7b1fd71c3843327f3bbc3237bedcdb6504fd50368ab3e04d0410e52ec293f5b8", - "sha256:8ca4956c9ab567d87d59d49fa3704cf29e37109ad348f2d5223c9bf761a332e7", - "sha256:91ab01c6cd00e39cde50173ba4ec68a1e578fee9279ba64f5221810a9e786533", - "sha256:cd81bdc2b8219cb4b2556eea39d2e36bfa375a2dd021404f90a62e44efaaf957", - "sha256:da8435a3bd498419ee8c13c34b89b5005130a476bda1d6ca8cfdde3de35cd650", - "sha256:de4313d7f575474c8f5a12e163f6d89c0a878bc49219641d49e6f1444369a90e", - "sha256:e27f100e1ff17f6feeb1f33968bc185bf8ce41ca557deee9d9bbbffeb72030b7", - "sha256:f467a5fd23b4fc43ed86342641f3936a68ded707f4627622fa3f82a120e18256" - ], - "version": "==0.19.0" - }, - "watchfiles": { - "hashes": [ - "sha256:00095dd368f73f8f1c3a7982a9801190cc88a2f3582dd395b289294f8975172b", - "sha256:00ad0bcd399503a84cc688590cdffbe7a991691314dde5b57b3ed50a41319a31", - "sha256:00f39592cdd124b4ec5ed0b1edfae091567c72c7da1487ae645426d1b0ffcad1", - "sha256:030bc4e68d14bcad2294ff68c1ed87215fbd9a10d9dea74e7cfe8a17869785ab", - "sha256:052d668a167e9fc345c24203b104c313c86654dd6c0feb4b8a6dfc2462239249", - "sha256:067dea90c43bf837d41e72e546196e674f68c23702d3ef80e4e816937b0a3ffd", - "sha256:0b04a2cbc30e110303baa6d3ddce8ca3664bc3403be0f0ad513d1843a41c97d1", - "sha256:0bc3b2f93a140df6806c8467c7f51ed5e55a931b031b5c2d7ff6132292e803d6", - "sha256:0c8e0aa0e8cc2a43561e0184c0513e291ca891db13a269d8d47cb9841ced7c71", - "sha256:103622865599f8082f03af4214eaff90e2426edff5e8522c8f9e93dc17caee13", - "sha256:1235c11510ea557fe21be5d0e354bae2c655a8ee6519c94617fe63e05bca4171", - "sha256:1cc0cba54f47c660d9fa3218158b8963c517ed23bd9f45fe463f08262a4adae1", - "sha256:1d9188979a58a096b6f8090e816ccc3f255f137a009dd4bbec628e27696d67c1", - 
"sha256:213792c2cd3150b903e6e7884d40660e0bcec4465e00563a5fc03f30ea9c166c", - "sha256:25c817ff2a86bc3de3ed2df1703e3d24ce03479b27bb4527c57e722f8554d971", - "sha256:2627a91e8110b8de2406d8b2474427c86f5a62bf7d9ab3654f541f319ef22bcb", - "sha256:280a4afbc607cdfc9571b9904b03a478fc9f08bbeec382d648181c695648202f", - "sha256:28324d6b28bcb8d7c1041648d7b63be07a16db5510bea923fc80b91a2a6cbed6", - "sha256:28585744c931576e535860eaf3f2c0ec7deb68e3b9c5a85ca566d69d36d8dd27", - "sha256:28f393c1194b6eaadcdd8f941307fc9bbd7eb567995232c830f6aef38e8a6e88", - "sha256:2abeb79209630da981f8ebca30a2c84b4c3516a214451bfc5f106723c5f45843", - "sha256:2bdadf6b90c099ca079d468f976fd50062905d61fae183f769637cb0f68ba59a", - "sha256:2f350cbaa4bb812314af5dab0eb8d538481e2e2279472890864547f3fe2281ed", - "sha256:3218a6f908f6a276941422b035b511b6d0d8328edd89a53ae8c65be139073f84", - "sha256:3973145235a38f73c61474d56ad6199124e7488822f3a4fc97c72009751ae3b0", - "sha256:3a0d883351a34c01bd53cfa75cd0292e3f7e268bacf2f9e33af4ecede7e21d1d", - "sha256:425440e55cd735386ec7925f64d5dde392e69979d4c8459f6bb4e920210407f2", - "sha256:4b9f2a128a32a2c273d63eb1fdbf49ad64852fc38d15b34eaa3f7ca2f0d2b797", - "sha256:4cc382083afba7918e32d5ef12321421ef43d685b9a67cc452a6e6e18920890e", - "sha256:52fc9b0dbf54d43301a19b236b4a4614e610605f95e8c3f0f65c3a456ffd7d35", - "sha256:55b7cc10261c2786c41d9207193a85c1db1b725cf87936df40972aab466179b6", - "sha256:581f0a051ba7bafd03e17127735d92f4d286af941dacf94bcf823b101366249e", - "sha256:5834e1f8b71476a26df97d121c0c0ed3549d869124ed2433e02491553cb468c2", - "sha256:5e45fb0d70dda1623a7045bd00c9e036e6f1f6a85e4ef2c8ae602b1dfadf7550", - "sha256:61af9efa0733dc4ca462347becb82e8ef4945aba5135b1638bfc20fad64d4f0e", - "sha256:68fe0c4d22332d7ce53ad094622b27e67440dacefbaedd29e0794d26e247280c", - "sha256:72a44e9481afc7a5ee3291b09c419abab93b7e9c306c9ef9108cb76728ca58d2", - "sha256:7a74436c415843af2a769b36bf043b6ccbc0f8d784814ba3d42fc961cdb0a9dc", - "sha256:8597b6f9dc410bdafc8bb362dac1cbc9b4684a8310e16b1ff5eee8725d13dcd6", 
- "sha256:8c39987a1397a877217be1ac0fb1d8b9f662c6077b90ff3de2c05f235e6a8f96", - "sha256:8c3e3675e6e39dc59b8fe5c914a19d30029e36e9f99468dddffd432d8a7b1c93", - "sha256:8dc1fc25a1dedf2dd952909c8e5cb210791e5f2d9bc5e0e8ebc28dd42fed7562", - "sha256:8fdebb655bb1ba0122402352b0a4254812717a017d2dc49372a1d47e24073795", - "sha256:9165bcab15f2b6d90eedc5c20a7f8a03156b3773e5fb06a790b54ccecdb73385", - "sha256:94ebe84a035993bb7668f58a0ebf998174fb723a39e4ef9fce95baabb42b787f", - "sha256:9624a68b96c878c10437199d9a8b7d7e542feddda8d5ecff58fdc8e67b460848", - "sha256:96eec15e5ea7c0b6eb5bfffe990fc7c6bd833acf7e26704eb18387fb2f5fd087", - "sha256:97b94e14b88409c58cdf4a8eaf0e67dfd3ece7e9ce7140ea6ff48b0407a593ec", - "sha256:988e981aaab4f3955209e7e28c7794acdb690be1efa7f16f8ea5aba7ffdadacb", - "sha256:a8a31bfd98f846c3c284ba694c6365620b637debdd36e46e1859c897123aa232", - "sha256:a927b3034d0672f62fb2ef7ea3c9fc76d063c4b15ea852d1db2dc75fe2c09696", - "sha256:ace7d060432acde5532e26863e897ee684780337afb775107c0a90ae8dbccfd2", - "sha256:aec83c3ba24c723eac14225194b862af176d52292d271c98820199110e31141e", - "sha256:b44b70850f0073b5fcc0b31ede8b4e736860d70e2dbf55701e05d3227a154a67", - "sha256:b610fb5e27825b570554d01cec427b6620ce9bd21ff8ab775fc3a32f28bba63e", - "sha256:b810a2c7878cbdecca12feae2c2ae8af59bea016a78bc353c184fa1e09f76b68", - "sha256:bbf8a20266136507abf88b0df2328e6a9a7c7309e8daff124dda3803306a9fdb", - "sha256:bd4c06100bce70a20c4b81e599e5886cf504c9532951df65ad1133e508bf20be", - "sha256:c2444dc7cb9d8cc5ab88ebe792a8d75709d96eeef47f4c8fccb6df7c7bc5be71", - "sha256:c49b76a78c156979759d759339fb62eb0549515acfe4fd18bb151cc07366629c", - "sha256:c4a65474fd2b4c63e2c18ac67a0c6c66b82f4e73e2e4d940f837ed3d2fd9d4da", - "sha256:c5af2347d17ab0bd59366db8752d9e037982e259cacb2ba06f2c41c08af02c39", - "sha256:c668228833c5619f6618699a2c12be057711b0ea6396aeaece4ded94184304ea", - "sha256:c7b978c384e29d6c7372209cbf421d82286a807bbcdeb315427687f8371c340a", - 
"sha256:d048ad5d25b363ba1d19f92dcf29023988524bee6f9d952130b316c5802069cb", - "sha256:d3e1f3cf81f1f823e7874ae563457828e940d75573c8fbf0ee66818c8b6a9099", - "sha256:d47e9ef1a94cc7a536039e46738e17cce058ac1593b2eccdede8bf72e45f372a", - "sha256:da1e0a8caebf17976e2ffd00fa15f258e14749db5e014660f53114b676e68538", - "sha256:dc1b9b56f051209be458b87edb6856a449ad3f803315d87b2da4c93b43a6fe72", - "sha256:dc2e8fe41f3cac0660197d95216c42910c2b7e9c70d48e6d84e22f577d106fc1", - "sha256:dc92d2d2706d2b862ce0568b24987eba51e17e14b79a1abcd2edc39e48e743c8", - "sha256:dd64f3a4db121bc161644c9e10a9acdb836853155a108c2446db2f5ae1778c3d", - "sha256:e0f0a874231e2839abbf473256efffe577d6ee2e3bfa5b540479e892e47c172d", - "sha256:f7e1f9c5d1160d03b93fc4b68a0aeb82fe25563e12fbcdc8507f8434ab6f823c", - "sha256:fe82d13461418ca5e5a808a9e40f79c1879351fcaeddbede094028e74d836e86" - ], - "version": "==0.22.0" - }, - "websockets": { - "hashes": [ - "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b", - "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6", - "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df", - "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b", - "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205", - "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892", - "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53", - "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2", - "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed", - "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c", - "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd", - "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b", - "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931", - "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30", - 
"sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370", - "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be", - "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec", - "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf", - "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62", - "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b", - "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402", - "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f", - "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123", - "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9", - "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603", - "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45", - "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558", - "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4", - "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438", - "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137", - "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480", - "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447", - "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8", - "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04", - "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c", - "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb", - "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967", - "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b", - "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d", - "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def", 
- "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c", - "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92", - "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2", - "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113", - "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b", - "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28", - "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7", - "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d", - "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f", - "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468", - "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8", - "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae", - "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611", - "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d", - "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9", - "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca", - "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f", - "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2", - "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077", - "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2", - "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6", - "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374", - "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc", - "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e", - "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53", - 
"sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399", - "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547", - "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3", - "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870", - "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5", - "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8", - "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7" - ], - "version": "==12.0" - }, "wrapt": { "hashes": [ "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc", diff --git a/docdbtest/README.md b/docdbtest/README.md deleted file mode 100644 index 3b2e417..0000000 --- a/docdbtest/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# test files for connecting with AWS DocumentDB instance -- a set of test files to confirm a connection with AWS DocumentDB and vector embedding persistence -- Note: the LlamaIndex DocDB integration uses `pymongo` - - when running the various files, there may be errors from the pymongo package, but these do not seem to impact vector storage / retrieval - - - -## to use -- create a `.env` file (can use env.template as a starter) - - MONGO_URI will come from the AWS Console for DocumentDB -- ensure you download the global-bundle.pem from the AWS Console -- ensure you've run `pipenv shell` in the root project folder - - - -## overview of files -- `test_nodb.py` : will create a VectorStoreIndex from the same files with no persistence - - note that running this file should give you a 'baseline' of how llamaIndex will perform - - note also that since being first created, it appears an underlying LangChain method has changed and is now deprecated (as of Jul 21, 2024) - -- `store_vectors.py` : this will vectorize the same files and store the index in DocDB - -- `list_vectors.py` : this program queries the contents of the 'testdb', 'testcollection' directly and 
prints the values to screen - - note: you can pipe the output to a file to examine output more closely - - e.g., `python list_vectors.py > output.txt` - -- `load_vectors.py` : this will load the vector embeddings from DocDB and then run a query against them - diff --git a/docdbtest/env.template b/docdbtest/env.template deleted file mode 100644 index 7f14155..0000000 --- a/docdbtest/env.template +++ /dev/null @@ -1,2 +0,0 @@ -OPENAI_API_KEY= -MONGO_URI= diff --git a/docdbtest/files/file1.txt b/docdbtest/files/file1.txt deleted file mode 100644 index 13f6cc4..0000000 --- a/docdbtest/files/file1.txt +++ /dev/null @@ -1,6 +0,0 @@ -In the mystical land of Rudenza: -Apples are purple. -Clouds are silver, tinged with gold and bronze. -Pianos will bite your fingers if you don't wear stripes when you practice. - - diff --git a/docdbtest/files/file2.txt b/docdbtest/files/file2.txt deleted file mode 100644 index db01e83..0000000 --- a/docdbtest/files/file2.txt +++ /dev/null @@ -1,2 +0,0 @@ -PotatoPecanPie is the secret word. -James' favourite food is pizza. 
diff --git a/docdbtest/list_vectors.py b/docdbtest/list_vectors.py deleted file mode 100644 index ba5f050..0000000 --- a/docdbtest/list_vectors.py +++ /dev/null @@ -1,23 +0,0 @@ -# this mini program is to list the vectors within the documentDB instance that were written by llamaIndex -# Note: the db and collection as named below - these can be changed when the vector_store is instantiated - - -import pymongo -import pprint -from dotenv import load_dotenv -import os - -load_dotenv(override=True) - -mongo_uri = os.environ["MONGO_URI"] -client = pymongo.MongoClient(mongo_uri) - - -db = client[os.environ["DOCDB_NAME"]] -collection = db[os.environ["DOCDB_COLLECTION"]] - -for post in collection.find(): - pprint.pprint(post) - - -print('==========') diff --git a/docdbtest/load_vectors.py b/docdbtest/load_vectors.py deleted file mode 100644 index 2ac3755..0000000 --- a/docdbtest/load_vectors.py +++ /dev/null @@ -1,35 +0,0 @@ -# based loosely upon LlamaIndex demo -# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ -# -# key goal here was to retrieve the stored vectors from DocumentDB rather than re-create them - - - -import pymongo -from dotenv import load_dotenv - -from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore -from llama_index.core import VectorStoreIndex -from llama_index.core import StorageContext -# from llama_index.core import SimpleDirectoryReader -import os - -load_dotenv(override=True) - -mongo_uri = os.environ["MONGO_URI"] -mongodb_client = pymongo.MongoClient(mongo_uri) -store = AWSDocDbVectorStore(mongodb_client, db_name='testdb', collection_name='testcollection') -storage_context = StorageContext.from_defaults(vector_store=store) - - - -index = VectorStoreIndex.from_vector_store( - vector_store=store, - storage_context=storage_context -) - - -response = index.as_query_engine().query('Tell me about Rudenza') -print(f"{response}") - - diff --git a/docdbtest/store_vectors.py b/docdbtest/store_vectors.py deleted file 
mode 100644 index fd16d01..0000000 --- a/docdbtest/store_vectors.py +++ /dev/null @@ -1,32 +0,0 @@ -# based upon LlamaIndex demo -# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ - - -import pymongo -from dotenv import load_dotenv - -from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore -from llama_index.core import VectorStoreIndex -from llama_index.core import StorageContext -from llama_index.core import SimpleDirectoryReader -import os - -load_dotenv(override=True) - - -mongo_uri = os.environ["MONGO_URI"] -mongodb_client = pymongo.MongoClient(mongo_uri) -store = AWSDocDbVectorStore(mongodb_client, db_name='testdb', collection_name='testcollection') -storage_context = StorageContext.from_defaults(vector_store=store) - -documents = SimpleDirectoryReader("files").load_data() - -index = VectorStoreIndex.from_documents( - documents, storage_context=storage_context -) - - -response = index.as_query_engine().query('Tell me about Rudenza') -print(f"{response}") - - diff --git a/docdbtest/test_nodb.py b/docdbtest/test_nodb.py deleted file mode 100644 index 047ae53..0000000 --- a/docdbtest/test_nodb.py +++ /dev/null @@ -1,12 +0,0 @@ -# Testing vector store - no persistence - -from dotenv import load_dotenv -from llama_index.core import VectorStoreIndex, SimpleDirectoryReader - -load_dotenv() - -documents = SimpleDirectoryReader("files").load_data() - -index = VectorStoreIndex.from_documents(documents) - -print("Index created successfully!") diff --git a/hybridTest.py b/hybridTest.py deleted file mode 100644 index e845cb0..0000000 --- a/hybridTest.py +++ /dev/null @@ -1,21 +0,0 @@ -import hybridSearch.search as search - -def print_nodes(nodes): - for node in nodes: - print(node) - - -kb_file_path = './tmpfiles/giraffes.pdf' -search.hybrid_write('giraffes', kb_file_path) # only need to do this the first time - -query = 'how long are giraffe necks?' 
- -# get nodes -nodes = search.hybrid_get_nodes('giraffes', query, top_k=5) - -all_nodes = nodes['keyword'] + nodes['vector'] - -print_nodes(all_nodes) - - - diff --git a/kb_config.py b/kb_config.py deleted file mode 100644 index 4703ad8..0000000 --- a/kb_config.py +++ /dev/null @@ -1,313 +0,0 @@ -import os -import copy -import shutil -from datetime import datetime, timezone - -from dotenv import load_dotenv -import pymongo -import nest_asyncio - -import use_s3 -import app_logger as log -# import mongo_helper as mongo - -# storage imports -from llama_index.core import StorageContext -from llama_index.core import VectorStoreIndex -from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch -from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore -from llama_index.storage.docstore.mongodb import MongoDocumentStore - -from kb_constants import ( - EMBEDDINGS, - INGEST_METHODS, - SPLITTERS, - LLMS, - API_KEYS, -) - -from kb_type_definitions import ( - EmbedConfig, - LLMConfig, - MarkdownConfig, - SemanticConfig, - SentenceConfig, - FileMetadata, - ClientKBConfig, - KBConfig -) - -load_dotenv(override=True) - -MONGO_URI = os.environ["MONGO_URI"] -CONFIG_DB = os.environ["CONFIG_DB"] -CONFIG_KB_COL = os.environ["CONFIG_KB_COL"] -PYMONGO_CLIENT = pymongo.MongoClient(MONGO_URI) -CONFIG_COLLECTION = PYMONGO_CLIENT[CONFIG_DB][CONFIG_KB_COL] - -# CONFIG_COLLECTION = mongo.connect_to_kb_config() - - - -def is_int(s): - try: - int(s) - return True - except ValueError: - return False - -def is_float(value): - try: - float(value) - return True - except ValueError: - return False - - -class KnowledgeBase: - - # props in `self._config` are str names of the knowledge base configuration - # self._embed_model, self._llm, and self._splitter are instances of the classes - # defined by properties in `self._config` - # self._ingest_method is the class of the ingestion method defined by the - # ingest_method property in `self._config` - def __init__(self, kb_name): - 
self._config = self._get_kb_config(kb_name) - self._embed_model = self._configure_embed_model() - self._llm = self._configure_llm() - self._ingest_method = INGEST_METHODS[ - self._config['ingest_method'] - ] - self._splitter = self._configure_splitter() - - @classmethod - def create(cls, client_config): - # add properties to client_config - kb_config = cls._create_kb_config(client_config) - log.info("kb_config.py create (classmethod): ", kb_config) - # insert knowledge base configuration into database - result = CONFIG_COLLECTION.insert_one(kb_config) - log.info("kb_config.py create (classmethod): ", result) - - # message for client - return "Knowledge base created" - - @classmethod - def _create_kb_config(cls, client_config): - kb_config = copy.copy(client_config) - log.info('kb_config.py _create_kb_config: ', client_config, kb_config) - kb_config['id'] = kb_config['kb_name'] - kb_config['splitter_config'] = cls._str_to_nums(kb_config['splitter_config']) - kb_config['files'] = [] - - return kb_config - - # converts ints and floats in a dictionary to their respective types - @classmethod - def _str_to_nums(cls, config_dict): - result = {} - for key in config_dict: - if is_int(config_dict[key]): - result[key] = int(config_dict[key]) - elif is_float(config_dict[key]): - result[key] = float(config_dict[key]) - else: - result[key] = config_dict[key] - - return result - - # returns None if not found, otherwise returns the document - @classmethod - def exists(cls, kb_name): - doc = CONFIG_COLLECTION.find_one({"kb_name": kb_name}, {"_id": 0}) - log.info('kb_config.py exists: ', doc) - return doc - - @classmethod - def get_knowledge_bases(cls): - kbs_cursor = CONFIG_COLLECTION.find({}, {"_id": 0}) - kbs_list = list(kbs_cursor) - print('kb_config.py get_knowledge_bases: ', kbs_list) - # log.info('kb_config.py get_knowledge_bases: ', kbs) - return kbs_list - # returns list of file metadata objects for a knowledge base - # def get_files(self, kb_name): - # return 
CONFIG_COLLECTION.find_one({"kb_name": kb_name})['files'] - - # returns the configuration object for a knowledge base - def _get_kb_config(self, id): - kb_config = CONFIG_COLLECTION.find_one({"id": id}) - log.info('kb_config.py _get_kb_config: ', kb_config) - return kb_config - - def _configure_embed_model(self): - embed_provider = self._config['embed_config']['embed_provider'] - embed_model_class = EMBEDDINGS[embed_provider] - api_key = os.environ[API_KEYS[embed_provider]] - model = self._config['embed_config']['embed_model'] - embed_model = embed_model_class(api_key=api_key, model=model) - - return embed_model - - - def _configure_llm(self): - if self._config.get('llm_config') is None: - return None - - llm_provider = LLMS[self._config['llm_config']['llm_provider']] - key_name = API_KEYS[self._config['llm_config']['llm_provider']] - llm = llm_provider( - api_key=os.environ[key_name], - model= self._config['llm_config']['llm_model'] - ) - - return llm - - def _configure_splitter(self): - splitter_config = self._config['splitter_config'] - splitter_name = self._config['splitter'] - - if splitter_name == 'Semantic': - splitter_config['embed_model'] = self._embed_model - elif splitter_name == 'Markdown': - splitter_config['llm'] = self._llm - - splitter_class = SPLITTERS[self._config['splitter']] - - return splitter_class(**self._config['splitter_config']) - - - - # saves file locally, returns file path - def _save_file_locally(self, file): - FILE_DIR = 'tmpfiles' - - # write file to disk - if not os.path.exists(f"./{FILE_DIR}"): - os.makedirs(f"./{FILE_DIR}") - - - file_path= f"./{FILE_DIR}/{file.filename}" - - with open(file_path, "wb+") as file_object: - shutil.copyfileobj(file.file, file_object) - - # use_s3.ul_file(file.filename, dir=FILE_DIR) - - return file_path - - def _create_nodes(self, file_path): - if self._config['ingest_method'] == 'LlamaParse': - llama_parse = self._ingest_method( - api_key=os.environ["LLAMA_CLOUD_API_KEY"], - result_type="markdown" - 
) - documents = llama_parse.load_data(file_path) - else: - documents = self._ingest_method(input_files=[file_path]).load_data() - - - if self._config['splitter'] == 'sentence': - nodes = self._splitter.split(documents) - else: - nodes = self._splitter.get_nodes_from_documents(documents) - - return nodes - - def _store_indexes(self, nodes): - - # mongodb_client = mongo.client() - # database name defines a knowledge base - log.info('kb_config.py _store_indexes: ********* ', self._config) - - kb_id = self._config['kb_name'] - log.info('kb_config.py _store_indexes: ', kb_id) - vector_index = "vector_index" - - - environment = os.environ["ENVIRONMENT"] - - if environment == 'local' or environment == 'mongoatlas': - vector_store = MongoDBAtlasVectorSearch( - PYMONGO_CLIENT, - db_name=kb_id, - collection_name=vector_index - ) - else: - vector_store = AWSDocDbVectorStore( - PYMONGO_CLIENT, - db_name=kb_id, - collection_name=vector_index - ) - - - storage_context = StorageContext.from_defaults( - vector_store=vector_store, - # docstore=docstore - ) - - VectorStoreIndex( - nodes, - storage_context=storage_context, - embed_model=self._embed_model - ) - - docstore = MongoDocumentStore.from_uri( - uri=MONGO_URI, - db_name=kb_id - ) - - docstore.add_documents(nodes) - - def _add_file_to_kb_config(self, file): - now = datetime.now(timezone.utc) - date = now.strftime("%m-%d-%y") - time = now.strftime("%H:%M") - - # for testing - # PYMONGO_CLIENT[CONFIG_DB][CONFIG_KB_COL].update_one( - pymongo.MongoClient(MONGO_URI)[CONFIG_DB][CONFIG_KB_COL].update_one( - {"kb_name": self._config['kb_name']}, - {"$push": { - "files": { - "file_name": file.filename, - "content_type": file.headers['content-type'], - "date_uploaded": date, - "time_uploaded": time - } - } - } - ) - # CONFIG_COLLECTION.update_one( - # {"kb_name": self._config['kb_name']}, - # { - # "$set": {"id": self._config['kb_name']}, - # "$push": { - # "files": { - # "file_name": file, - # "content_type": file, - # "date_uploaded": 
date, - # "time_uploaded": time - # } - # } - # } - # ) - - - def ingest_file(self, file): - file_path = self._save_file_locally(file) - nodes = self._create_nodes(file_path) - self._store_indexes(nodes) - self._add_file_to_kb_config(file) - - def ingest_file_path(self, file_path): - nodes = self._create_nodes(file_path) - self._store_indexes(nodes) - self._add_file_to_kb_config(file_path) - - - - - def print_config(self): - print(self.chunk_overlap) - diff --git a/kb_constants.py b/kb_constants.py deleted file mode 100644 index d322fc5..0000000 --- a/kb_constants.py +++ /dev/null @@ -1,138 +0,0 @@ -import os - -from llama_index.llms.openai import OpenAI -from llama_index.llms.anthropic import Anthropic -from llama_index.llms.cohere import Cohere - -from llama_index.embeddings.openai import OpenAIEmbedding -from llama_index.embeddings.cohere import CohereEmbedding -# from llama_index.embeddings.huggingface import HuggingFaceEmbedding - -# imports for reading files -from llama_parse import LlamaParse -from llama_index.core import SimpleDirectoryReader - -# imports for parsing files -from llama_index.core.node_parser import ( - SentenceSplitter, - SemanticSplitterNodeParser, - MarkdownElementNodeParser -) - -LLMS = { - "OpenAI": OpenAI, - "Anthropic": Anthropic, - "Cohere": Cohere -} - -EMBEDDINGS = { - "OpenAI": OpenAIEmbedding, - "Cohere": CohereEmbedding, -} - -INGEST_METHODS = { - "LlamaParse": LlamaParse, - "Simple": SimpleDirectoryReader -} - -SPLITTERS = { - "Sentence": SentenceSplitter, - "Semantic": SemanticSplitterNodeParser, - "Markdown": MarkdownElementNodeParser -} - -API_KEYS = { - "OpenAI": "OPENAI_API_KEY", - "Cohere": "COHERE_API_KEY", - "Anthropic": "ANTHROPIC_API_KEY", - "LlamaParse": "LLAMA_CLOUD_API_KEY", -} - - - - -LLM_MODELS = { - "OpenAI": [ - { - "name": "gpt-3.5-turbo", - "description": "good balance of cost and precision", - }, - { - "name": "gpt-4-turbo", - "description": "more advanced than 'gpt-3.5-turbo'", - }, - { - "name": 
"gpt-4o-mini", - "description": "affordable small model for lightweight tasks", - }, - { - "name": "gpt-4o", - "description": "OpenAI's flagship model", - } - ], - - "Anthropic": [ - { - "name": "claude-4-haiku-20240307", - "description": "fastest and cheapest Anthropic model", - }, - { - "name": "claude-3-sonnet-20240229", - "description": "balanced intellegence and speed", - }, - { - "name": "claude-3-5-sonnet-20240620", - "description": "highest performing Anthropic model", - } - ], - "Cohere": [] -} - - -EMBEDDING_MODEL_DETAILS = { - "OpenAI": [ - { - "name": "text-embedding-3-small", - "description": "good balance of cost and precision", - "language": "multilingual", - }, - { - "name": "text-embedding-3-large", - "description": "slightly more precise at ~6 times the cost of 'text-embedding-3-small'", - "language": "multilingual" - } - ], - "Cohere": [ - { - "name": "embed-english-light-v3.0", - "description": "slightly less precise, but faster than 'embed-english-v3.0'", - "language": "english" - }, - { - "name": "embed-english-v3.0", - "description": "more precise, but slower than 'embed-english-light-v3.0'", - "language": "english" - }, - { - "name": "embed-multilingual-light-v3.0", - "description": "slightly less precise, but faster than 'embed-multilingual-v3.0'", - "language": "multilingual" - }, - { - "name": "embed-multilingual-v3.0", - "description": "more precise, but slower than 'embed-multilingual-light-v3.0'", - "language": "multilingual" - }, - ], -} - - - - - -''' -Need to test hugging face embedding - - - -''' \ No newline at end of file diff --git a/kb_test_constants.py b/kb_test_constants.py deleted file mode 100644 index 6e5ed0c..0000000 --- a/kb_test_constants.py +++ /dev/null @@ -1,96 +0,0 @@ -client_sentence_config = { - "kb_name": "Sentence", - "ingest_method": "Simple", - "splitter": "Sentence", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "chunk_size": "1024", 
- "chunk_overlap": "200" - }, -} - -client_semantic_config = { - "kb_name": "Semantic", - "ingest_method": "Simple", - "splitter": "Semantic", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "buffer_size": "100", - "breakpoint_percentile_threshold": "95" - }, -} - -client_llama_parse_config = { - "kb_name": "Markdown", - "ingest_method": "LlamaParse", - "splitter": "Markdown", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "num_workers": "8" - }, - "llm_config": { - "llm_provider": "OpenAI", - "llm_model": "gpt-3.5-turbo" - }, -} - -server_sentence_config = { - "kb_name": "Sentence", - "ingest_method": "Simple", - "splitter": "Sentence", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "chunk_size": 1024, - "chunk_overlap": 200 - }, - "id": "Sentence", - "files": [] -} - -server_semantic_config ={ - "kb_name": "Semantic", - "ingest_method": "Simple", - "splitter": "Semantic", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "buffer_size": 100, - "breakpoint_percentile_threshold": 95 - }, - "id": "Semantic", - "files": [] -} - -server_llama_parse_config = { - "kb_name": "Markdown", - "ingest_method": "LlamaParse", - "splitter": "Markdown", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "num_workers": 8 - }, - "llm_config": { - "llm_provider": "OpenAI", - "llm_model": "gpt-3.5-turbo" - }, - "id": "Markdown", - "files": [] -} - diff --git a/kb_type_definitions.py b/kb_type_definitions.py deleted file mode 100644 index 4d37f0d..0000000 --- a/kb_type_definitions.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import TypedDict, Optional - -# Notes: -# if ingest_method is LlamaParse, splitter_config will be a 
MarkdownConfig and -# llm_config is required -# llm_config is only required for LlamaParse - -class EmbedConfig(TypedDict): - embed_provider: str - embed_model: str - -class LLMConfig(TypedDict): - llm_provider: str - llm_model: str - -class MarkdownConfig(TypedDict): - num_workers: int | str # default: 8 - -class SemanticConfig(TypedDict): - buffer_size: int | str # default: 100 - breakpoint_percentile_threshold: int | str # default 95 - -class SentenceConfig(TypedDict): - chunk_size: int | str # default 1024 - chunk_overlap: int | str # default: 200 - -class FileMetadata(TypedDict): - file_name: str - content_type: str - date_uploaded: str - time_uploaded: str - -class ClientKBConfig(TypedDict): - kb_name: str - ingest_method: str - splitter: str - embed_config: EmbedConfig - splitter_config: MarkdownConfig | SemanticConfig | SentenceConfig - llm_config: Optional[LLMConfig] # Only required for "LlamaParse" - -class KBConfig(TypedDict): - _id: int - kb_name: str - embed_config: EmbedConfig - ingestion_method: str - splitter: str - splitter_config: MarkdownConfig | SemanticConfig | SentenceConfig - llm_config: Optional[LLMConfig] # Only required for "LlamaParse" - files: list[FileMetadata] \ No newline at end of file diff --git a/lp_ingest.py b/lp_ingest.py deleted file mode 100644 index e512c58..0000000 --- a/lp_ingest.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import pymongo -import nest_asyncio -from dotenv import load_dotenv - -from llama_parse import LlamaParse -from llama_index.core.node_parser import MarkdownElementNodeParser -from llama_index.llms.openai import OpenAI -from llama_index.embeddings.openai import OpenAIEmbedding -from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore -from llama_index.core import VectorStoreIndex -from llama_index.core import StorageContext - -load_dotenv() -nest_asyncio.apply() - -llama_cloud_api_key = os.environ["LLAMA_CLOUD_API_KEY"] -openai_api_key = os.environ["OPENAI_API_KEY"] - -mongo_uri = 
os.environ["MONGO_URI"] -mongodb_client = pymongo.MongoClient(mongo_uri) -docdb_name = os.environ["DOCDB_NAME"] -docdb_collection = os.environ["DOCDB_COLLECTION"] -store = AWSDocDbVectorStore(mongodb_client, db_name=docdb_name, collection_name=docdb_collection) - -def send_file_to_llama_parse(file_path): - print("send_file_to_llama_parse") - parser = LlamaParse( - api_key=llama_cloud_api_key, - result_type="markdown" - ) - - markdown_documents = parser.load_data(file_path) - - print("response received from llama_parse") - print(markdown_documents[0]) - - return markdown_documents - - -# convert markdown documents -# return nodes -def markdown_to_node(documents): - - markdown_parser = MarkdownElementNodeParser( - llm=OpenAI(api_key=openai_api_key, model="gpt-3.5-turbo"), - num_workers=8, - ) - - nodes = markdown_parser.get_nodes_from_documents(documents) - print('response from markdown_parser') - print(nodes[0]) - - return nodes - -# convert nodes to vector store -# side effect: save index to docdb -def nodes_to_vector_store(nodes): - embed_model = OpenAIEmbedding(api_key=openai_api_key, model="text-embedding-ada-002") - storage_context = StorageContext.from_defaults(vector_store=store) - index = VectorStoreIndex(nodes, embed_model=embed_model, storage_context=storage_context) - - return index - -def ingest_file_to_docdb(file_path): - - try: - markdown_docs = send_file_to_llama_parse(file_path) - nodes = markdown_to_node(markdown_docs) - nodes_to_vector_store(nodes) - except Exception as e: - raise e - diff --git a/mongo_helper.py b/mongo_helper.py deleted file mode 100644 index 08f537b..0000000 --- a/mongo_helper.py +++ /dev/null @@ -1,28 +0,0 @@ -import os -import pymongo - -MONGO_URI = os.environ["MONGO_URI"] -CONFIG_DB = os.environ["CONFIG_DB"] -CONFIG_KB_COL = os.environ["CONFIG_KB_COL"] -PYMONGO_CLIENT = pymongo.MongoClient(MONGO_URI) - -def connect(db, collection): - try: - db = PYMONGO_CLIENT[db] - collection = db[collection] - return collection - except 
Exception as e: - print("An error occurred while connecting to the database:", e) - return None - -def connect_to_kb_config(): - return connect(CONFIG_DB, CONFIG_KB_COL) - -def client(): - return PYMONGO_CLIENT - -# write helper to retrieve knowledge base id from name -def get_kb_id(kb_name): - kb_col = connect_to_kb_config() - kb = kb_col.find_one({"kb_name": kb_name}) - return kb["_id"] \ No newline at end of file diff --git a/pipelineTest.py b/pipelineTest.py deleted file mode 100644 index 5b3c70e..0000000 --- a/pipelineTest.py +++ /dev/null @@ -1,114 +0,0 @@ -from llama_index.core import QueryBundle -from llama_index.core.postprocessor import SimilarityPostprocessor -from llama_index.core.postprocessor import LongContextReorder -from llama_index.postprocessor.colbert_rerank import ColbertRerank -from llama_index.core import get_response_synthesizer, PromptTemplate -from llama_index.core.response_synthesizers import ResponseMode - -import hybridSearch.search as search - -def print_nodes(nodes): - for node in nodes: - print(node) - - -query = 'tell me about promises' - -# get all nodes -nodes = search.hybrid_get_nodes(query, top_k=3) -all_nodes = nodes['keyword'] + nodes['vector'] - - -# similarity -similarity_pp = SimilarityPostprocessor( - nodes=all_nodes, - similarity_cutoff=0.5 -) - -nodes_similar = similarity_pp.postprocess_nodes(all_nodes) - - - - - -# Colbert rerank -reranker = ColbertRerank(top_n=4) -query_bundle = QueryBundle(query) - -nodes_rerank = reranker.postprocess_nodes(all_nodes, query_bundle) - -print('='*20) -print_nodes(nodes_rerank) - - - -# LongContextReorder -reorder = LongContextReorder() - -nodes_reorder = reorder.postprocess_nodes(nodes_rerank) - -print('='*20) -print_nodes(nodes_reorder) - - - -# Response synthesizer -synth = get_response_synthesizer( - response_mode=ResponseMode.SIMPLE_SUMMARIZE -) - -response = synth.synthesize(query, nodes=nodes_reorder) -print(response) - -print('*'*20) - - -# Custom Prompt -new_prompt = ( - 
"Context information is below.\n" - "-----------------------------\n" - "{context_str}\n" - "-----------------------------\n" - "Given the context information and not prior knowledge, " - "answer the query in French.\n" - "Query: {query_str}\n" - "Answer: " -) -new_template = PromptTemplate(new_prompt) - -synth.update_prompts( - {"text_qa_template": new_template} -) - -response = synth.synthesize(query, nodes=nodes_reorder) -print(response) - - - - - -''' -Notes: - -incorporate post-processing modules: -- created `pipelineTest.py` based upon ‘hybridTest.py’ -- added similarity -- adding ColbertRerank -- found Colbert import statement from https://docs.llamaindex.ai/en/stable/examples/pipeline/query_pipeline_memory/?h=colbertr -- found reranker syntax from https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/LLMReranker-Lyft-10k/?h=reranker -- adding LongContextReorder -- https://docs.llamaindex.ai/en/stable/module_guides/querying/node_postprocessors/node_postprocessors/?h= - -- post-processing modules all seem to work -- need to go from nodes to query response now -- llamaindex uses a “response synthesizer” -- https://docs.llamaindex.ai/en/stable/api_reference/response_synthesizers/ -- “simple_summarize” merges all text chunks from nodes into 1 and makes an LLM call -- it will fail if the merged text chunk exceeds the context window size - -- Accessing and customizing prompts -- https://docs.llamaindex.ai/en/stable/examples/prompts/prompt_mixin/ -- `synthesizer.get_prompts()` returns a dictionary of prompts -- key is a template (e.g., “text_qa_template”) -- see promptTest.py to access returned dict and display prompt content -''' \ No newline at end of file diff --git a/promptTest.py b/promptTest.py deleted file mode 100644 index 0b2820a..0000000 --- a/promptTest.py +++ /dev/null @@ -1,41 +0,0 @@ - -from llama_index.core import get_response_synthesizer, PromptTemplate -from llama_index.core.response_synthesizers import ResponseMode - - -def 
display_prompts(prompts_dict): - for k, p in prompts_dict.items(): - print(f"Prompt Key: {k}") - print("Text: ") - print(p.get_template()) - print("-"*30) - - -synth = get_response_synthesizer( - response_mode=ResponseMode.SIMPLE_SUMMARIZE -) - -prompt = synth.get_prompts() - -display_prompts(prompt) - - -new_prompt = ( - "Context information is below.\n" - "-----------------------------\n" - "{context_str}\n" - "-----------------------------\n" - "Given the context information and not prior knowledge, " - "answer the query in French.\n" - "Query: {query_str}\n" - "Answer: " -) -new_template = PromptTemplate(new_prompt) - -synth.update_prompts( - {"text_qa_template": new_template} -) - -prompt = synth.get_prompts() - -display_prompts(prompt) diff --git a/random_tests/docdbtest/README.md b/random_tests/docdbtest/README.md deleted file mode 100644 index 3b2e417..0000000 --- a/random_tests/docdbtest/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# test files for connecting with AWS DocumentDB instance -- a set of test files to confirm a connection with AWS DocumentDB and vector embedding persistence -- Note: the LlamaIndex DocDB integration uses `pymongo` - - when running the various files, there may be errors from the pymongo package, but these do not seem to impact vector storage / retrieval - - - -## to use -- create a `.env` file (can use env.template as a starter) - - MONGO_URI will come from the AWS Console for DocumentDB -- ensure you download the global-bundle.pem from the AWS Console -- ensure you've run `pipenv shell` in the root project folder - - - -## overview of files -- `test_nodb.py` : will create a VectorStoreIndex from the same files with no persistence - - note that running this file should give you a 'baseline' of how llamaIndex will perform - - note also that since being first created, it appears an underlying LangChain method has changed and is now deprecated (as of Jul 21, 2024) - -- `store_vectors.py` : this will vectorize the same files and store 
the index in DocDB - -- `list_vectors.py` : this program queries the contents of the 'testdb', 'testcollection' directly and prints the values to screen - - note: you can pipe the output to a file to examine output more closely - - e.g., `python list_vectors.py > output.txt` - -- `load_vectors.py` : this will load the vector embeddings from DocDB and then run a query against them - diff --git a/random_tests/docdbtest/env.template b/random_tests/docdbtest/env.template deleted file mode 100644 index 7f14155..0000000 --- a/random_tests/docdbtest/env.template +++ /dev/null @@ -1,2 +0,0 @@ -OPENAI_API_KEY= -MONGO_URI= diff --git a/random_tests/docdbtest/files/file1.txt b/random_tests/docdbtest/files/file1.txt deleted file mode 100644 index 13f6cc4..0000000 --- a/random_tests/docdbtest/files/file1.txt +++ /dev/null @@ -1,6 +0,0 @@ -In the mystical land of Rudenza: -Apples are purple. -Clouds are silver, tinged with gold and bronze. -Pianos will bite your fingers if you don't wear stripes when you practice. - - diff --git a/random_tests/docdbtest/files/file2.txt b/random_tests/docdbtest/files/file2.txt deleted file mode 100644 index db01e83..0000000 --- a/random_tests/docdbtest/files/file2.txt +++ /dev/null @@ -1,2 +0,0 @@ -PotatoPecanPie is the secret word. -James' favourite food is pizza. 
diff --git a/random_tests/docdbtest/list_vectors.py b/random_tests/docdbtest/list_vectors.py deleted file mode 100644 index ba5f050..0000000 --- a/random_tests/docdbtest/list_vectors.py +++ /dev/null @@ -1,23 +0,0 @@ -# this mini program is to list the vectors within the documentDB instance that were written by llamaIndex -# Note: the db and collection as named below - these can be changed when the vector_store is instantiated - - -import pymongo -import pprint -from dotenv import load_dotenv -import os - -load_dotenv(override=True) - -mongo_uri = os.environ["MONGO_URI"] -client = pymongo.MongoClient(mongo_uri) - - -db = client[os.environ["DOCDB_NAME"]] -collection = db[os.environ["DOCDB_COLLECTION"]] - -for post in collection.find(): - pprint.pprint(post) - - -print('==========') diff --git a/random_tests/docdbtest/load_vectors.py b/random_tests/docdbtest/load_vectors.py deleted file mode 100644 index 2ac3755..0000000 --- a/random_tests/docdbtest/load_vectors.py +++ /dev/null @@ -1,35 +0,0 @@ -# based loosely upon LlamaIndex demo -# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ -# -# key goal here was to retrieve the stored vectors from DocumentDB rather than re-create them - - - -import pymongo -from dotenv import load_dotenv - -from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore -from llama_index.core import VectorStoreIndex -from llama_index.core import StorageContext -# from llama_index.core import SimpleDirectoryReader -import os - -load_dotenv(override=True) - -mongo_uri = os.environ["MONGO_URI"] -mongodb_client = pymongo.MongoClient(mongo_uri) -store = AWSDocDbVectorStore(mongodb_client, db_name='testdb', collection_name='testcollection') -storage_context = StorageContext.from_defaults(vector_store=store) - - - -index = VectorStoreIndex.from_vector_store( - vector_store=store, - storage_context=storage_context -) - - -response = index.as_query_engine().query('Tell me about Rudenza') -print(f"{response}") - - diff 
--git a/random_tests/docdbtest/store_vectors.py b/random_tests/docdbtest/store_vectors.py deleted file mode 100644 index fd16d01..0000000 --- a/random_tests/docdbtest/store_vectors.py +++ /dev/null @@ -1,32 +0,0 @@ -# based upon LlamaIndex demo -# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ - - -import pymongo -from dotenv import load_dotenv - -from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore -from llama_index.core import VectorStoreIndex -from llama_index.core import StorageContext -from llama_index.core import SimpleDirectoryReader -import os - -load_dotenv(override=True) - - -mongo_uri = os.environ["MONGO_URI"] -mongodb_client = pymongo.MongoClient(mongo_uri) -store = AWSDocDbVectorStore(mongodb_client, db_name='testdb', collection_name='testcollection') -storage_context = StorageContext.from_defaults(vector_store=store) - -documents = SimpleDirectoryReader("files").load_data() - -index = VectorStoreIndex.from_documents( - documents, storage_context=storage_context -) - - -response = index.as_query_engine().query('Tell me about Rudenza') -print(f"{response}") - - diff --git a/random_tests/docdbtest/test_nodb.py b/random_tests/docdbtest/test_nodb.py deleted file mode 100644 index 047ae53..0000000 --- a/random_tests/docdbtest/test_nodb.py +++ /dev/null @@ -1,12 +0,0 @@ -# Testing vector store - no persistence - -from dotenv import load_dotenv -from llama_index.core import VectorStoreIndex, SimpleDirectoryReader - -load_dotenv() - -documents = SimpleDirectoryReader("files").load_data() - -index = VectorStoreIndex.from_documents(documents) - -print("Index created successfully!") diff --git a/random_tests/hybridTest.py b/random_tests/hybridTest.py deleted file mode 100644 index e845cb0..0000000 --- a/random_tests/hybridTest.py +++ /dev/null @@ -1,21 +0,0 @@ -import hybridSearch.search as search - -def print_nodes(nodes): - for node in nodes: - print(node) - - -kb_file_path = './tmpfiles/giraffes.pdf' 
-search.hybrid_write('giraffes', kb_file_path) # only need to do this the first time - -query = 'how long are giraffe necks?' - -# get nodes -nodes = search.hybrid_get_nodes('giraffes', query, top_k=5) - -all_nodes = nodes['keyword'] + nodes['vector'] - -print_nodes(all_nodes) - - - diff --git a/random_tests/pipelineTest.py b/random_tests/pipelineTest.py deleted file mode 100644 index 5b3c70e..0000000 --- a/random_tests/pipelineTest.py +++ /dev/null @@ -1,114 +0,0 @@ -from llama_index.core import QueryBundle -from llama_index.core.postprocessor import SimilarityPostprocessor -from llama_index.core.postprocessor import LongContextReorder -from llama_index.postprocessor.colbert_rerank import ColbertRerank -from llama_index.core import get_response_synthesizer, PromptTemplate -from llama_index.core.response_synthesizers import ResponseMode - -import hybridSearch.search as search - -def print_nodes(nodes): - for node in nodes: - print(node) - - -query = 'tell me about promises' - -# get all nodes -nodes = search.hybrid_get_nodes(query, top_k=3) -all_nodes = nodes['keyword'] + nodes['vector'] - - -# similarity -similarity_pp = SimilarityPostprocessor( - nodes=all_nodes, - similarity_cutoff=0.5 -) - -nodes_similar = similarity_pp.postprocess_nodes(all_nodes) - - - - - -# Colbert rerank -reranker = ColbertRerank(top_n=4) -query_bundle = QueryBundle(query) - -nodes_rerank = reranker.postprocess_nodes(all_nodes, query_bundle) - -print('='*20) -print_nodes(nodes_rerank) - - - -# LongContextReorder -reorder = LongContextReorder() - -nodes_reorder = reorder.postprocess_nodes(nodes_rerank) - -print('='*20) -print_nodes(nodes_reorder) - - - -# Response synthesizer -synth = get_response_synthesizer( - response_mode=ResponseMode.SIMPLE_SUMMARIZE -) - -response = synth.synthesize(query, nodes=nodes_reorder) -print(response) - -print('*'*20) - - -# Custom Prompt -new_prompt = ( - "Context information is below.\n" - "-----------------------------\n" - "{context_str}\n" - 
"-----------------------------\n" - "Given the context information and not prior knowledge, " - "answer the query in French.\n" - "Query: {query_str}\n" - "Answer: " -) -new_template = PromptTemplate(new_prompt) - -synth.update_prompts( - {"text_qa_template": new_template} -) - -response = synth.synthesize(query, nodes=nodes_reorder) -print(response) - - - - - -''' -Notes: - -incorporate post-processing modules: -- created `pipelineTest.py` based upon ‘hybridTest.py’ -- added similarity -- adding ColbertRerank -- found Colbert import statement from https://docs.llamaindex.ai/en/stable/examples/pipeline/query_pipeline_memory/?h=colbertr -- found reranker syntax from https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/LLMReranker-Lyft-10k/?h=reranker -- adding LongContextReorder -- https://docs.llamaindex.ai/en/stable/module_guides/querying/node_postprocessors/node_postprocessors/?h= - -- post-processing modules all seem to work -- need to go from nodes to query response now -- llamaindex uses a “response synthesizer” -- https://docs.llamaindex.ai/en/stable/api_reference/response_synthesizers/ -- “simple_summarize” merges all text chunks from nodes into 1 and makes an LLM call -- it will fail if the merged text chunk exceeds the context window size - -- Accessing and customizing prompts -- https://docs.llamaindex.ai/en/stable/examples/prompts/prompt_mixin/ -- `synthesizer.get_prompts()` returns a dictionary of prompts -- key is a template (e.g., “text_qa_template”) -- see promptTest.py to access returned dict and display prompt content -''' \ No newline at end of file diff --git a/random_tests/promptTest.py b/random_tests/promptTest.py deleted file mode 100644 index 0b2820a..0000000 --- a/random_tests/promptTest.py +++ /dev/null @@ -1,41 +0,0 @@ - -from llama_index.core import get_response_synthesizer, PromptTemplate -from llama_index.core.response_synthesizers import ResponseMode - - -def display_prompts(prompts_dict): - for k, p in 
prompts_dict.items(): - print(f"Prompt Key: {k}") - print("Text: ") - print(p.get_template()) - print("-"*30) - - -synth = get_response_synthesizer( - response_mode=ResponseMode.SIMPLE_SUMMARIZE -) - -prompt = synth.get_prompts() - -display_prompts(prompt) - - -new_prompt = ( - "Context information is below.\n" - "-----------------------------\n" - "{context_str}\n" - "-----------------------------\n" - "Given the context information and not prior knowledge, " - "answer the query in French.\n" - "Query: {query_str}\n" - "Answer: " -) -new_template = PromptTemplate(new_prompt) - -synth.update_prompts( - {"text_qa_template": new_template} -) - -prompt = synth.get_prompts() - -display_prompts(prompt) diff --git a/random_tests/setup_test.py b/random_tests/setup_test.py deleted file mode 100644 index d11caa1..0000000 --- a/random_tests/setup_test.py +++ /dev/null @@ -1,127 +0,0 @@ -# helper scripts to populate test kbs -import os - -import pymongo -from dotenv import load_dotenv - -import hybridSearch.search as search - -load_dotenv(override=True) - -mongo_uri = os.environ["MONGO_URI"] -mongo = pymongo.MongoClient(mongo_uri) - - -# kb1 -kb1_file_path = './tmpfiles/AsyncJS.md' -search.keyword_write('kb1', kb1_file_path) -search.vector_write('kb1', kb1_file_path) - -#kb2 -kb2_file_path = './tmpfiles/cpumemory.pdf' -search.hybrid_write('kb2', kb2_file_path) - -#kb3 -kb3_file_path = './tmpfiles/newfile.txt' -search.hybrid_write('kb3', kb3_file_path) - - -# config db setup - -kb_config1 = { - 'id': 'kb1', - 'name': 'AsyncJS', - 'files': [{ 'filename': './tmpfiles/AsyncJS.md'}], - 'ingest': { - 'method': 'simple_ingest', - 'splitter': { - 'type': 'sentence', - 'chunk_size': '', - 'chunk_overlap': '', - 'separator': '', - }, - }, - 'embedding_model': 'gpt-3.5-turbo', - 'vector_store': { - 'name': 'idstring', - 'collection': 'vector_index', - }, - 'keyword_store': { - 'name': 'idstring', - 'collections': ['docstore/ref_doc_info', 'docstore/data', 'docstore/metadata'] - } -} - 
-kb_config2 = kb_config1.copy() -kb_config2['id'] = 'kb2' -kb_config2['name'] = 'cpumemory' -kb_config2['files'] = [{ 'filename': './tmpfiles/cpumemory.pdf'}], - - -kb_config3 = kb_config1.copy() -kb_config3['id'] = 'kb3' -kb_config3['name'] = 'newfile' -kb_config3['files'] = [{ 'filename': './tmpfiles/newfile.txt'}], - - -config_db = mongo[ os.environ["CONFIG_DB"] ] -config_kb_col = config_db[ os.environ["CONFIG_KB_COL"] ] -config_kb_col.insert_one(kb_config1) -config_kb_col.insert_one(kb_config2) -config_kb_col.insert_one(kb_config3) - - - - -# Pipeline config - -pipeline_config1 = { - 'id': 'pipeline1', - 'name': 'pipelineConfigName', - 'knowledgebases': ['kb1', 'kb2', 'kb3'], - 'retrieval': { - 'vector': 'llm_model_name', - }, - 'postprocessing': { - 'similarity': { - 'on': False, - 'similarity_cutoff': 0.7 - }, - 'colbertRerank': { - 'on': False, - 'top_n': 5 - }, - 'longContextReorder': { - 'on': True, - } - }, - 'generative_model': 'gpt-3.5-turbo', - 'prompt': { - 'on': True, - 'template_str': 'answer the question - {query_str} - in French' - } -} - -pipeline_config2 = pipeline_config1.copy() -pipeline_config2['id'] = 'pipeline2' -pipeline_config2['name'] = 'kb1 only (async)' -pipeline_config2['knowledgebases'] = ['kb1'] - - -pipeline_config3 = pipeline_config1.copy() -pipeline_config3['id'] = 'pipeline3' -pipeline_config3['name'] = 'kb2 only (cpumemory)' -pipeline_config3['knowledgebases'] = ['kb2'] - -pipeline_config4 = pipeline_config1.copy() -pipeline_config4['id'] = 'pipeline4' -pipeline_config4['name'] = 'kb1 (async) and kb2 (cpumemory)' -pipeline_config4['knowledgebases'] = ['kb2', 'kb1'] - -config_pipeline_col = config_db[ os.environ["CONFIG_PIPELINE_COL"] ] -config_pipeline_col.insert_one(pipeline_config1) -config_pipeline_col.insert_one(pipeline_config2) -config_pipeline_col.insert_one(pipeline_config3) -config_pipeline_col.insert_one(pipeline_config4) - - diff --git a/random_tests/test_kb.py b/random_tests/test_kb.py deleted file mode 100644 index 
063a32e..0000000 --- a/random_tests/test_kb.py +++ /dev/null @@ -1,31 +0,0 @@ -import json - -import refactor1.db.knowledge_base.kb_config as kbClass - -kb_name = 'giraffe2' - -config_template = { - "id": kb_name, - "kb_name": kb_name, - "ingest_method": "Simple", - "splitter": "Sentence", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "chunk_size": 1024, - "chunk_overlap": 200 - } -} - -json_config = json.dumps(config_template) -print(json_config) - -kbClass.KnowledgeBase.create(json_config) - -kb = kbClass.KnowledgeBase('giraffe2') -kb.ingest_file_path('./tmpfiles/giraffes.pdf') - - - diff --git a/setup_test.py b/setup_test.py deleted file mode 100644 index d11caa1..0000000 --- a/setup_test.py +++ /dev/null @@ -1,127 +0,0 @@ -# helper scripts to populate test kbs -import os - -import pymongo -from dotenv import load_dotenv - -import hybridSearch.search as search - -load_dotenv(override=True) - -mongo_uri = os.environ["MONGO_URI"] -mongo = pymongo.MongoClient(mongo_uri) - - -# kb1 -kb1_file_path = './tmpfiles/AsyncJS.md' -search.keyword_write('kb1', kb1_file_path) -search.vector_write('kb1', kb1_file_path) - -#kb2 -kb2_file_path = './tmpfiles/cpumemory.pdf' -search.hybrid_write('kb2', kb2_file_path) - -#kb3 -kb3_file_path = './tmpfiles/newfile.txt' -search.hybrid_write('kb3', kb3_file_path) - - -# config db setup - -kb_config1 = { - 'id': 'kb1', - 'name': 'AsyncJS', - 'files': [{ 'filename': './tmpfiles/AsyncJS.md'}], - 'ingest': { - 'method': 'simple_ingest', - 'splitter': { - 'type': 'sentence', - 'chunk_size': '', - 'chunk_overlap': '', - 'separator': '', - }, - }, - 'embedding_model': 'gpt-3.5-turbo', - 'vector_store': { - 'name': 'idstring', - 'collection': 'vector_index', - }, - 'keyword_store': { - 'name': 'idstring', - 'collections': ['docstore/ref_doc_info', 'docstore/data', 'docstore/metadata'] - } -} - -kb_config2 = kb_config1.copy() -kb_config2['id'] = 'kb2' -kb_config2['name'] 
= 'cpumemory' -kb_config2['files'] = [{ 'filename': './tmpfiles/cpumemory.pdf'}], - - -kb_config3 = kb_config1.copy() -kb_config3['id'] = 'kb3' -kb_config3['name'] = 'newfile' -kb_config3['files'] = [{ 'filename': './tmpfiles/newfile.txt'}], - - -config_db = mongo[ os.environ["CONFIG_DB"] ] -config_kb_col = config_db[ os.environ["CONFIG_KB_COL"] ] -config_kb_col.insert_one(kb_config1) -config_kb_col.insert_one(kb_config2) -config_kb_col.insert_one(kb_config3) - - - - -# Pipeline config - -pipeline_config1 = { - 'id': 'pipeline1', - 'name': 'pipelineConfigName', - 'knowledgebases': ['kb1', 'kb2', 'kb3'], - 'retrieval': { - 'vector': 'llm_model_name', - }, - 'postprocessing': { - 'similarity': { - 'on': False, - 'similarity_cutoff': 0.7 - }, - 'colbertRerank': { - 'on': False, - 'top_n': 5 - }, - 'longContextReorder': { - 'on': True, - } - }, - 'generative_model': 'gpt-3.5-turbo', - 'prompt': { - 'on': True, - 'template_str': 'answer the question - {query_str} - in French' - } -} - -pipeline_config2 = pipeline_config1.copy() -pipeline_config2['id'] = 'pipeline2' -pipeline_config2['name'] = 'kb1 only (async)' -pipeline_config2['knowledgebases'] = ['kb1'] - - -pipeline_config3 = pipeline_config1.copy() -pipeline_config3['id'] = 'pipeline3' -pipeline_config3['name'] = 'kb2 only (cpumemory)' -pipeline_config3['knowledgebases'] = ['kb2'] - -pipeline_config4 = pipeline_config1.copy() -pipeline_config4['id'] = 'pipeline4' -pipeline_config4['name'] = 'kb1 (async) and kb2 (cpumemory)' -pipeline_config4['knowledgebases'] = ['kb2', 'kb1'] - -config_pipeline_col = config_db[ os.environ["CONFIG_PIPELINE_COL"] ] -config_pipeline_col.insert_one(pipeline_config1) -config_pipeline_col.insert_one(pipeline_config2) -config_pipeline_col.insert_one(pipeline_config3) -config_pipeline_col.insert_one(pipeline_config4) - - diff --git a/simple_ingest.py b/simple_ingest.py deleted file mode 100644 index 154aa96..0000000 --- a/simple_ingest.py +++ /dev/null @@ -1,34 +0,0 @@ -# based upon 
LlamaIndex demo -# https://docs.llamaindex.ai/en/stable/examples/vector_stores/AWSDocDBDemo/ - - -import pymongo -from dotenv import load_dotenv -import app_logger as log - -from llama_index.vector_stores.awsdocdb import AWSDocDbVectorStore -from llama_index.core import VectorStoreIndex -from llama_index.core import StorageContext -from llama_index.core import SimpleDirectoryReader -import os - -load_dotenv(override=True) - - -mongo_uri = os.environ["MONGO_URI"] -mongodb_client = pymongo.MongoClient(mongo_uri) -docdb_name = os.environ["DOCDB_NAME"] -docdb_collection = os.environ["DOCDB_COLLECTION"] -store = AWSDocDbVectorStore(mongodb_client, db_name=docdb_name, collection_name=docdb_collection) -storage_context = StorageContext.from_defaults(vector_store=store) - - -def ingest_file_to_docdb(file_path): - try: - log.debug('starting ingestion', file_path) - document = SimpleDirectoryReader(input_files=[file_path]).load_data() - index = VectorStoreIndex.from_documents(document, storage_context=storage_context) - log.debug('index created') - except Exception as e: - raise e - diff --git a/test_kb.py b/test_kb.py deleted file mode 100644 index 2a6ca09..0000000 --- a/test_kb.py +++ /dev/null @@ -1,31 +0,0 @@ -import json - -import kb_config as kbClass - -kb_name = 'giraffe2' - -config_template = { - "id": kb_name, - "kb_name": kb_name, - "ingest_method": "Simple", - "splitter": "Sentence", - "embed_config": { - "embed_provider": "OpenAI", - "embed_model": "text-embedding-3-small" - }, - "splitter_config": { - "chunk_size": 1024, - "chunk_overlap": 200 - } -} - -json_config = json.dumps(config_template) -print(json_config) - -kbClass.KnowledgeBase.create(json_config) - -kb = kbClass.KnowledgeBase('giraffe2') -kb.ingest_file_path('./tmpfiles/giraffes.pdf') - - - From 6cc73dc46cbd800632dc6c65cd086ceb78682cc4 Mon Sep 17 00:00:00 2001 From: tlane25 <163203257+tlane25@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:07:37 -0400 Subject: [PATCH 10/10] Update Pipfile --- 
Pipfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Pipfile b/Pipfile index 91e65a7..02c785d 100644 --- a/Pipfile +++ b/Pipfile @@ -29,9 +29,7 @@ rank-bm25 = "*" llama-index-postprocessor-colbert-rerank = "*" llama-index-retrievers-bm25 = "*" llama-index-storage-index-store-mongodb = "*" -llama-index-embeddings-bedrock = "*" pytest = "*" -llama-index-llms-bedrock = "*" [dev-packages]