Skip to content

Commit

Permalink
added energy wizard class structure for postgres wizard
Browse files Browse the repository at this point in the history
  • Loading branch information
grantbuster committed May 7, 2024
1 parent 77bb5cc commit 8dc4083
Showing 1 changed file with 222 additions and 87 deletions.
309 changes: 222 additions & 87 deletions elm/wizard.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
"""
ELM energy wizard
"""
from abc import ABC, abstractmethod
import copy
import numpy as np

from elm.base import ApiBase


class EnergyWizard(ApiBase):
"""Interface to ask OpenAI LLMs about energy research."""
class EnergyWizardBase(ApiBase, ABC):
"""Base interface to ask OpenAI LLMs about energy research."""

MODEL_ROLE = "You parse through articles to answer questions."
"""High level model role, somewhat redundant to MODEL_INSTRUCTION"""
Expand All @@ -19,96 +20,31 @@ class EnergyWizard(ApiBase):
'text, write "I could not find an answer."')
"""Prefix to the engineered prompt"""

def __init__(self, corpus, model=None, token_budget=3500, ref_col=None):
def __init__(self, model=None, token_budget=3500):
"""
Parameters
----------
corpus : pd.DataFrame
Corpus of text in dataframe format. Must have columns "text" and
"embedding".
model : str
GPT model name, default is the DEFAULT_MODEL global var
token_budget : int
Number of tokens that can be embedded in the prompt. Note that the
default budget for GPT-3.5-Turbo is 4096, but you want to subtract
some tokens to account for the response budget.
ref_col : None | str
Optional column label in the corpus that provides a reference text
string for each chunk of text.
"""

super().__init__(model)

self.corpus = self.preflight_corpus(corpus)
self.token_budget = token_budget
self.embedding_arr = np.vstack(self.corpus['embedding'].values)
self.text_arr = self.corpus['text'].values
self.ref_col = ref_col

@staticmethod
def preflight_corpus(corpus, required=('text', 'embedding')):
"""Run preflight checks on the text corpus.
Parameters
----------
corpus : pd.DataFrame
Corpus of text in dataframe format. Must have columns "text" and
"embedding".
required : list | tuple
Column names required to be in the corpus df
Returns
-------
corpus : pd.DataFrame
Corpus of text in dataframe format. Must have columns "text" and
"embedding".
"""
missing = [col for col in required if col not in corpus]
if any(missing):
msg = ('Text corpus must have {} columns but received '
'corpus with columns: {}'
.format(missing, list(corpus.columns)))
raise KeyError(msg)

if not isinstance(corpus.index.values[0], int):
corpus['index'] = np.arange(len(corpus))
corpus = corpus.set_index('index', drop=False)

return corpus

def cosine_dist(self, query_embedding):
"""Compute the cosine distance of the query embedding array vs. all of
the embedding arrays of the full text corpus
Parameters
----------
query_embedding : np.ndarray
1D array of the numerical embedding of the request query.
Returns
-------
out : np.ndarray
1D array with length equal to the number of entries in the text
corpus. Each value is a distance score where smaller is closer
"""

dot = np.dot(self.embedding_arr, query_embedding)
norm1 = np.linalg.norm(query_embedding)
norm2 = np.linalg.norm(self.embedding_arr, axis=1)

out = 1 - (dot / (norm1 * norm2))

return out

def rank_strings(self, query, top_n=100):
@abstractmethod
def query_vector_db(self, query, limit=100):
"""Returns a list of strings and relatednesses, sorted from most
related to least.
Parameters
----------
query : str
Question being asked of GPT
top_n : int
limit : int
Number of top results to return.
Returns
Expand All @@ -122,15 +58,6 @@ def rank_strings(self, query, top_n=100):
ranked strings/scores outputs.
"""

embedding = self.get_embedding(query)
scores = 1 - self.cosine_dist(embedding)
best = np.argsort(scores)[::-1][:top_n]

strings = self.text_arr[best]
scores = scores[best]

return strings, scores, best

def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,
convo=False):
"""Engineer a query for GPT using the corpus of information
Expand Down Expand Up @@ -169,7 +96,7 @@ def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,

token_budget = token_budget or self.token_budget

strings, _, idx = self.rank_strings(query)
strings, _, idx = self.query_vector_db(query)

message = copy.deepcopy(self.MODEL_INSTRUCTION)
question = f"\n\nQuestion: {query}"
Expand Down Expand Up @@ -197,6 +124,7 @@ def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,

return message, references

@abstractmethod
def make_ref_list(self, idx):
"""Make a reference list
Expand All @@ -208,13 +136,9 @@ def make_ref_list(self, idx):
Returns
-------
ref_list : list
A list of references (strs) used.
A list of references (strs) used. Ideally, this is something like:
["{ref_title} ({ref_url})"]
"""
ref_list = ''
if self.ref_col is not None and self.ref_col in self.corpus:
ref_list = list(self.corpus[self.ref_col].iloc[idx].unique())

return ref_list

def chat(self, query,
debug=True,
Expand Down Expand Up @@ -311,3 +235,214 @@ def chat(self, query,
return response_message, query, references
else:
return response_message


class EnergyWizard(EnergyWizardBase):
    """Interface to ask OpenAI LLMs about energy research.

    This class is for execution on a local machine with the vector database
    held in memory as a pandas DataFrame.
    """

    def __init__(self, corpus, model=None, token_budget=3500, ref_col=None):
        """
        Parameters
        ----------
        corpus : pd.DataFrame
            Corpus of text in dataframe format. Must have columns "text" and
            "embedding".
        model : str
            GPT model name, default is the DEFAULT_MODEL global var
        token_budget : int
            Number of tokens that can be embedded in the prompt. Note that the
            default budget for GPT-3.5-Turbo is 4096, but you want to subtract
            some tokens to account for the response budget.
        ref_col : None | str
            Optional column label in the corpus that provides a reference text
            string for each chunk of text.
        """

        super().__init__(model, token_budget=token_budget)

        self.corpus = self.preflight_corpus(corpus)
        # 2D array (n_chunks, embedding_dim) so cosine distance against the
        # whole corpus can be computed in one vectorized operation
        self.embedding_arr = np.vstack(self.corpus['embedding'].values)
        self.text_arr = self.corpus['text'].values
        self.ref_col = ref_col

    @staticmethod
    def preflight_corpus(corpus, required=('text', 'embedding')):
        """Run preflight checks on the text corpus.

        Parameters
        ----------
        corpus : pd.DataFrame
            Corpus of text in dataframe format. Must have columns "text" and
            "embedding".
        required : list | tuple
            Column names required to be in the corpus df

        Returns
        -------
        corpus : pd.DataFrame
            Corpus of text in dataframe format with a simple integer index so
            that positional ranking indices line up with ``.iloc`` lookups.

        Raises
        ------
        KeyError
            If any of the required columns are missing from the corpus.
        """
        missing = [col for col in required if col not in corpus]
        if any(missing):
            msg = ('Text corpus must have {} columns but received '
                   'corpus with columns: {}'
                   .format(missing, list(corpus.columns)))
            raise KeyError(msg)

        # NOTE(review): a default pandas RangeIndex holds np.int64 values,
        # which are not Python `int` instances, so this branch is typically
        # taken and a fresh 0..N-1 index (plus an "index" column) is set.
        if not isinstance(corpus.index.values[0], int):
            corpus['index'] = np.arange(len(corpus))
            corpus = corpus.set_index('index', drop=False)

        return corpus

    def cosine_dist(self, query_embedding):
        """Compute the cosine distance of the query embedding array vs. all of
        the embedding arrays of the full text corpus

        Parameters
        ----------
        query_embedding : np.ndarray
            1D array of the numerical embedding of the request query.

        Returns
        -------
        out : np.ndarray
            1D array with length equal to the number of entries in the text
            corpus. Each value is a distance score where smaller is closer
        """

        dot = np.dot(self.embedding_arr, query_embedding)
        norm1 = np.linalg.norm(query_embedding)
        norm2 = np.linalg.norm(self.embedding_arr, axis=1)

        # cosine distance = 1 - cosine similarity
        out = 1 - (dot / (norm1 * norm2))

        return out

    def query_vector_db(self, query, limit=100):
        """Returns a list of strings and relatednesses, sorted from most
        related to least.

        Parameters
        ----------
        query : str
            Question being asked of GPT
        limit : int
            Number of top results to return.

        Returns
        -------
        strings : np.ndarray
            1D array of related strings
        score : np.ndarray
            1D array of float scores of strings
        idx : np.ndarray
            1D array of indices in the text corpus corresponding to the
            ranked strings/scores outputs.
        """

        embedding = self.get_embedding(query)
        # convert distance back to similarity so bigger = more related
        scores = 1 - self.cosine_dist(embedding)
        # descending sort by similarity, truncated to the top `limit` results
        best = np.argsort(scores)[::-1][:limit]

        strings = self.text_arr[best]
        scores = scores[best]

        return strings, scores, best

    def make_ref_list(self, idx):
        """Make a reference list

        Parameters
        ----------
        idx : np.ndarray
            Indices of the used text from the text corpus

        Returns
        -------
        ref_list : list
            A list of references (strs) used. This takes information straight
            from ``ref_col``. Ideally, this is something like:
            ["{ref_title} ({ref_url})"]
        """
        # Fix: return an empty list (not an empty string) when no reference
        # column is configured, matching the documented `list` return type
        ref_list = []
        if self.ref_col is not None and self.ref_col in self.corpus:
            ref_list = list(self.corpus[self.ref_col].iloc[idx].unique())

        return ref_list


class EnergyWizardPostgres(EnergyWizardBase):
    """Interface to ask OpenAI LLMs about energy research.

    This class is for execution with a postgres vector database
    TODO: slater describe the vector DB here
    """

    def __init__(self, model=None, token_budget=3500,
                 vector_db_args=None):
        """
        Parameters
        ----------
        model : str
            GPT model name, default is the DEFAULT_MODEL global var
        token_budget : int
            Number of tokens that can be embedded in the prompt. Note that the
            default budget for GPT-3.5-Turbo is 4096, but you want to subtract
            some tokens to account for the response budget.
        vector_db_args : None | dict
            TODO: slater implement required vector database stuff here and set
            self.cursor and whatnot
        """

        # Stored for the upcoming DB connection implementation; previously
        # this argument was silently discarded.
        self.vector_db_args = vector_db_args

        super().__init__(model, token_budget=token_budget)

    def query_vector_db(self, query, limit=100):
        """Returns a list of strings and relatednesses, sorted from most
        related to least.

        Parameters
        ----------
        query : str
            Question being asked of GPT
        limit : int
            Number of top results to return.

        Returns
        -------
        strings : np.ndarray
            1D array of related strings
        score : np.ndarray
            1D array of float scores of strings
        idx : np.ndarray
            1D array of indices in the text corpus corresponding to the
            ranked strings/scores outputs.

        Raises
        ------
        NotImplementedError
            Always, until the postgres vector database query is implemented.
        """

        # TODO: Slater implement vector db query here
        # Fix: the stub previously returned the undefined names (strings,
        # scores, best), which raised a confusing NameError when called.
        raise NotImplementedError('Postgres vector database query has not '
                                  'been implemented yet!')

    def make_ref_list(self, idx):
        """Make a reference list

        Parameters
        ----------
        idx : np.ndarray
            Indices of the used text from the text corpus

        Returns
        -------
        ref_list : list
            A list of references (strs) used. Ideally, this is something like:
            ["{ref_title} ({ref_url})"]

        Raises
        ------
        NotImplementedError
            Always, until the postgres metadata query is implemented.
        """
        # TODO: Slater implement vector db-to-meta-data query here to get
        # information about the results (e.g., links and titles and whatnot)
        # Fix: the stub previously returned the undefined name `ref_list`,
        # which raised a confusing NameError when called.
        raise NotImplementedError('Postgres reference metadata query has not '
                                  'been implemented yet!')

0 comments on commit 8dc4083

Please sign in to comment.