Skip to content

Commit

Permalink
added energy wizard class structure for postgres wizard
Browse files Browse the repository at this point in the history
  • Loading branch information
grantbuster committed May 7, 2024
1 parent 77bb5cc commit 8dc4083
Showing 1 changed file with 222 additions and 87 deletions.
309 changes: 222 additions & 87 deletions elm/wizard.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
"""
ELM energy wizard
"""
from abc import ABC, abstractmethod
import copy
import numpy as np

from elm.base import ApiBase


class EnergyWizard(ApiBase):
"""Interface to ask OpenAI LLMs about energy research."""
class EnergyWizardBase(ApiBase, ABC):
"""Base interface to ask OpenAI LLMs about energy research."""

MODEL_ROLE = "You parse through articles to answer questions."
"""High level model role, somewhat redundant to MODEL_INSTRUCTION"""
Expand All @@ -19,96 +20,31 @@ class EnergyWizard(ApiBase):
'text, write "I could not find an answer."')
"""Prefix to the engineered prompt"""

def __init__(self, corpus, model=None, token_budget=3500, ref_col=None):
def __init__(self, model=None, token_budget=3500):
"""
Parameters
----------
corpus : pd.DataFrame
Corpus of text in dataframe format. Must have columns "text" and
"embedding".
model : str
GPT model name, default is the DEFAULT_MODEL global var
token_budget : int
Number of tokens that can be embedded in the prompt. Note that the
default budget for GPT-3.5-Turbo is 4096, but you want to subtract
some tokens to account for the response budget.
ref_col : None | str
Optional column label in the corpus that provides a reference text
string for each chunk of text.
"""

super().__init__(model)

self.corpus = self.preflight_corpus(corpus)
self.token_budget = token_budget
self.embedding_arr = np.vstack(self.corpus['embedding'].values)
self.text_arr = self.corpus['text'].values
self.ref_col = ref_col

@staticmethod
def preflight_corpus(corpus, required=('text', 'embedding')):
"""Run preflight checks on the text corpus.
Parameters
----------
corpus : pd.DataFrame
Corpus of text in dataframe format. Must have columns "text" and
"embedding".
required : list | tuple
Column names required to be in the corpus df
Returns
-------
corpus : pd.DataFrame
Corpus of text in dataframe format. Must have columns "text" and
"embedding".
"""
missing = [col for col in required if col not in corpus]
if any(missing):
msg = ('Text corpus must have {} columns but received '
'corpus with columns: {}'
.format(missing, list(corpus.columns)))
raise KeyError(msg)

if not isinstance(corpus.index.values[0], int):
corpus['index'] = np.arange(len(corpus))
corpus = corpus.set_index('index', drop=False)

return corpus

def cosine_dist(self, query_embedding):
"""Compute the cosine distance of the query embedding array vs. all of
the embedding arrays of the full text corpus
Parameters
----------
query_embedding : np.ndarray
1D array of the numerical embedding of the request query.
Returns
-------
out : np.ndarray
1D array with length equal to the number of entries in the text
corpus. Each value is a distance score where smaller is closer
"""

dot = np.dot(self.embedding_arr, query_embedding)
norm1 = np.linalg.norm(query_embedding)
norm2 = np.linalg.norm(self.embedding_arr, axis=1)

out = 1 - (dot / (norm1 * norm2))

return out

def rank_strings(self, query, top_n=100):
@abstractmethod
def query_vector_db(self, query, limit=100):
"""Returns a list of strings and relatednesses, sorted from most
related to least.
Parameters
----------
query : str
Question being asked of GPT
top_n : int
limit : int
Number of top results to return.
Returns
Expand All @@ -122,15 +58,6 @@ def rank_strings(self, query, top_n=100):
ranked strings/scores outputs.
"""

embedding = self.get_embedding(query)
scores = 1 - self.cosine_dist(embedding)
best = np.argsort(scores)[::-1][:top_n]

strings = self.text_arr[best]
scores = scores[best]

return strings, scores, best

def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,
convo=False):
"""Engineer a query for GPT using the corpus of information
Expand Down Expand Up @@ -169,7 +96,7 @@ def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,

token_budget = token_budget or self.token_budget

strings, _, idx = self.rank_strings(query)
strings, _, idx = self.query_vector_db(query)

message = copy.deepcopy(self.MODEL_INSTRUCTION)
question = f"\n\nQuestion: {query}"
Expand Down Expand Up @@ -197,6 +124,7 @@ def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,

return message, references

@abstractmethod
def make_ref_list(self, idx):
"""Make a reference list
Expand All @@ -208,13 +136,9 @@ def make_ref_list(self, idx):
Returns
-------
ref_list : list
A list of references (strs) used.
A list of references (strs) used. Ideally, this is something like:
["{ref_title} ({ref_url})"]
"""
ref_list = ''
if self.ref_col is not None and self.ref_col in self.corpus:
ref_list = list(self.corpus[self.ref_col].iloc[idx].unique())

return ref_list

def chat(self, query,
debug=True,
Expand Down Expand Up @@ -311,3 +235,214 @@ def chat(self, query,
return response_message, query, references
else:
return response_message


class EnergyWizard(EnergyWizardBase):
    """Interface to ask OpenAI LLMs about energy research.

    This class is for execution on a local machine with the vector database
    held in memory as a pandas DataFrame.
    """

    def __init__(self, corpus, model=None, token_budget=3500, ref_col=None):
        """
        Parameters
        ----------
        corpus : pd.DataFrame
            Corpus of text in dataframe format. Must have columns "text" and
            "embedding".
        model : str
            GPT model name, default is the DEFAULT_MODEL global var
        token_budget : int
            Number of tokens that can be embedded in the prompt. Note that the
            default budget for GPT-3.5-Turbo is 4096, but you want to subtract
            some tokens to account for the response budget.
        ref_col : None | str
            Optional column label in the corpus that provides a reference text
            string for each chunk of text.
        """

        super().__init__(model, token_budget=token_budget)

        self.corpus = self.preflight_corpus(corpus)
        # 2D array (n_chunks, embedding_dim) so cosine distance against the
        # whole corpus can be computed in one vectorized operation
        self.embedding_arr = np.vstack(self.corpus['embedding'].values)
        self.text_arr = self.corpus['text'].values
        self.ref_col = ref_col

    @staticmethod
    def preflight_corpus(corpus, required=('text', 'embedding')):
        """Run preflight checks on the text corpus.

        Parameters
        ----------
        corpus : pd.DataFrame
            Corpus of text in dataframe format. Must have columns "text" and
            "embedding".
        required : list | tuple
            Column names required to be in the corpus df

        Returns
        -------
        corpus : pd.DataFrame
            Corpus of text in dataframe format with a simple integer index so
            that positional ranking indices line up with ``.iloc`` lookups.

        Raises
        ------
        KeyError
            If any of the required columns are missing from the corpus.
        """
        missing = [col for col in required if col not in corpus]
        if any(missing):
            msg = ('Text corpus must have {} columns but received '
                   'corpus with columns: {}'
                   .format(missing, list(corpus.columns)))
            raise KeyError(msg)

        # NOTE(review): a default pandas RangeIndex holds np.int64 values,
        # which are not Python `int` instances, so this branch is typically
        # taken and a fresh 0..N-1 index (plus an "index" column) is set.
        if not isinstance(corpus.index.values[0], int):
            corpus['index'] = np.arange(len(corpus))
            corpus = corpus.set_index('index', drop=False)

        return corpus

    def cosine_dist(self, query_embedding):
        """Compute the cosine distance of the query embedding array vs. all of
        the embedding arrays of the full text corpus

        Parameters
        ----------
        query_embedding : np.ndarray
            1D array of the numerical embedding of the request query.

        Returns
        -------
        out : np.ndarray
            1D array with length equal to the number of entries in the text
            corpus. Each value is a distance score where smaller is closer
        """

        dot = np.dot(self.embedding_arr, query_embedding)
        norm1 = np.linalg.norm(query_embedding)
        norm2 = np.linalg.norm(self.embedding_arr, axis=1)

        # cosine distance = 1 - cosine similarity
        out = 1 - (dot / (norm1 * norm2))

        return out

    def query_vector_db(self, query, limit=100):
        """Returns a list of strings and relatednesses, sorted from most
        related to least.

        Parameters
        ----------
        query : str
            Question being asked of GPT
        limit : int
            Number of top results to return.

        Returns
        -------
        strings : np.ndarray
            1D array of related strings
        score : np.ndarray
            1D array of float scores of strings
        idx : np.ndarray
            1D array of indices in the text corpus corresponding to the
            ranked strings/scores outputs.
        """

        embedding = self.get_embedding(query)
        # convert distance back to similarity so bigger = more related
        scores = 1 - self.cosine_dist(embedding)
        # descending sort by similarity, truncated to the top `limit` results
        best = np.argsort(scores)[::-1][:limit]

        strings = self.text_arr[best]
        scores = scores[best]

        return strings, scores, best

    def make_ref_list(self, idx):
        """Make a reference list

        Parameters
        ----------
        idx : np.ndarray
            Indices of the used text from the text corpus

        Returns
        -------
        ref_list : list
            A list of references (strs) used. This takes information straight
            from ``ref_col``. Ideally, this is something like:
            ["{ref_title} ({ref_url})"]
        """
        # Fix: return an empty list (not an empty string) when no reference
        # column is configured, matching the documented `list` return type
        ref_list = []
        if self.ref_col is not None and self.ref_col in self.corpus:
            ref_list = list(self.corpus[self.ref_col].iloc[idx].unique())

        return ref_list


class EnergyWizardPostgres(EnergyWizardBase):
    """Interface to ask OpenAI LLMs about energy research.

    This class is for execution with a postgres vector database
    TODO: slater describe the vector DB here
    """

    def __init__(self, model=None, token_budget=3500,
                 vector_db_args=None):
        """
        Parameters
        ----------
        model : str
            GPT model name, default is the DEFAULT_MODEL global var
        token_budget : int
            Number of tokens that can be embedded in the prompt. Note that the
            default budget for GPT-3.5-Turbo is 4096, but you want to subtract
            some tokens to account for the response budget.
        vector_db_args : None | dict
            TODO: slater implement required vector database stuff here and set
            self.cursor and whatnot
        """

        # Stored for the upcoming DB connection implementation; previously
        # this argument was silently discarded.
        self.vector_db_args = vector_db_args

        super().__init__(model, token_budget=token_budget)

    def query_vector_db(self, query, limit=100):
        """Returns a list of strings and relatednesses, sorted from most
        related to least.

        Parameters
        ----------
        query : str
            Question being asked of GPT
        limit : int
            Number of top results to return.

        Returns
        -------
        strings : np.ndarray
            1D array of related strings
        score : np.ndarray
            1D array of float scores of strings
        idx : np.ndarray
            1D array of indices in the text corpus corresponding to the
            ranked strings/scores outputs.

        Raises
        ------
        NotImplementedError
            Always, until the postgres vector database query is implemented.
        """

        # TODO: Slater implement vector db query here
        # Fix: the stub previously returned the undefined names (strings,
        # scores, best), which raised a confusing NameError when called.
        raise NotImplementedError('Postgres vector database query has not '
                                  'been implemented yet!')

    def make_ref_list(self, idx):
        """Make a reference list

        Parameters
        ----------
        idx : np.ndarray
            Indices of the used text from the text corpus

        Returns
        -------
        ref_list : list
            A list of references (strs) used. Ideally, this is something like:
            ["{ref_title} ({ref_url})"]

        Raises
        ------
        NotImplementedError
            Always, until the postgres metadata query is implemented.
        """
        # TODO: Slater implement vector db-to-meta-data query here to get
        # information about the results (e.g., links and titles and whatnot)
        # Fix: the stub previously returned the undefined name `ref_list`,
        # which raised a confusing NameError when called.
        raise NotImplementedError('Postgres reference metadata query has not '
                                  'been implemented yet!')

0 comments on commit 8dc4083

Please sign in to comment.