Skip to content

Commit

Permalink
Update to V0.2.0 (#30)
Browse files Browse the repository at this point in the history
* Added feature to create a TreePool from a pandas DataFrame

Now it's easier to create unsymmetrical trees

* Created pools with varying size for the new experiments

* reorganized data directory

* added ';' as separator in the dataset files because of the presence of ',' in the prompts

* fixed bug where the random selection wasn't working when not selecting at least one point in the subpool

* uploaded dataset and script to process and generate it

* Added exception to address issue #18

* fixed small bug breaking the code when running it in verbose mode

* Included processing for OCM dataset

This notebook also creates the pool files. So I removed them from the repo

* Added new experiments and new dataset to the experiments notebook

* Implemented new features used during new tests

- Supporting TreePool ask
- Allowing to add random points in the subpool when in ask
- Exposing the inv_prefix to change the prefix used in the inverse design prompt

Also, refactored the example selector instantiation

* Removed unneeded print statement
  • Loading branch information
maykcaldas authored Sep 25, 2023
1 parent 29f7232 commit 44bfe35
Show file tree
Hide file tree
Showing 41 changed files with 47,539 additions and 7,336 deletions.
86 changes: 40 additions & 46 deletions bolift/asktell.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
model: str = "text-curie-001",
temperature: Optional[float] = None,
prefix: Optional[str] = None,
inv_prefix: Optional[str] = None,
x_formatter: Callable[[str], str] = lambda x: x,
y_formatter: Callable[[float], str] = lambda y: f"{y:0.2f}",
y_name: str = "output",
Expand Down Expand Up @@ -93,6 +94,7 @@ def __init__(
self._prompt_template = prompt_template
self._suffix = suffix
self._prefix = prefix
self._inv_prefix = inv_prefix
self._model = model
self._example_count = 0
self._temperature = temperature
Expand Down Expand Up @@ -128,10 +130,14 @@ def _setup_inv_llm(self, model: str, temperature: Optional[float] = None):
temperature=0.05 if temperature is None else temperature,
)

def _setup_inverse_prompt(self, example: Dict):
def _setup_inverse_prompt(self,
example: Dict,
prefix: Optional[str] = None):
if prefix is None:
prefix = ""
prompt_template = PromptTemplate(
input_variables=["x", "y", "y_name", "x_name"],
template="If {y_name} is {y}, then {x_name} is {x}\n\n",
template="If {y_name} is {y}, then {x_name} is @@@\n{x}###",
)
if example is not None:
prompt_template.format(**example)
Expand All @@ -142,29 +148,22 @@ def _setup_inverse_prompt(self, example: Dict):
if self._selector_k is not None:
if len(examples) == 0:
raise ValueError("Cannot do zero-shot with selector")
if not self.cos_sim:
example_selector = (
example_selector
) = MaxMarginalRelevanceExampleSelector.from_examples(
[example],
OpenAIEmbeddings(),
FAISS,
k=self._selector_k,
)
else:
example_selector = (
example_selector
) = SemanticSimilarityExampleSelector.from_examples(
[example],
OpenAIEmbeddings(),
Chroma,
k=self._selector_k,
)

sim_selector = SemanticSimilarityExampleSelector if self.cos_sim else MaxMarginalRelevanceExampleSelector
example_selector = (
example_selector
) = sim_selector.from_examples(
[example],
OpenAIEmbeddings(),
FAISS,
k=self._selector_k,
)
return FewShotPromptTemplate(
examples=examples if example_selector is None else None,
example_prompt=prompt_template,
example_selector=example_selector,
suffix="If {y_name} is {y}, then {x_name} is ",
suffix="If {y_name} is {y}, then {x_name} is @@@",
prefix=prefix,
input_variables=["y", "y_name", "x_name"],
)

Expand Down Expand Up @@ -202,24 +201,16 @@ def _setup_prompt(
if self._selector_k is not None:
if len(examples) == 0:
raise ValueError("Cannot do zero-shot with selector")
if not self.cos_sim:
example_selector = (
example_selector
) = MaxMarginalRelevanceExampleSelector.from_examples(
[example],
OpenAIEmbeddings(),
FAISS,
k=self._selector_k,
)
else:
example_selector = (
example_selector
) = SemanticSimilarityExampleSelector.from_examples(
[example],
OpenAIEmbeddings(),
Chroma,
k=self._selector_k,
)

sim_selector = SemanticSimilarityExampleSelector if self.cos_sim else MaxMarginalRelevanceExampleSelector
example_selector = (
example_selector
) = sim_selector.from_examples(
[example],
OpenAIEmbeddings(),
FAISS,
k=self._selector_k,
)
return FewShotPromptTemplate(
examples=examples if example_selector is None else None,
example_prompt=prompt_template,
Expand Down Expand Up @@ -279,7 +270,7 @@ def tell(self, x: str, y: float, alt_ys: Optional[List[float]] = None) -> None:
self.prompt = self._setup_prompt(
example_dict, self._prompt_template, self._suffix, self._prefix
)
self.inv_prompt = self._setup_inverse_prompt(inv_example)
self.inv_prompt = self._setup_inverse_prompt(inv_example, self._inv_prefix)
self.llm = self._setup_llm(self._model, self._temperature)
self.inv_llm = self._setup_inv_llm(self._model, self._temperature)
self._ready = True
Expand Down Expand Up @@ -330,7 +321,7 @@ def predict(self, x: str) -> Union[Tuple[float, float], List[Tuple[float, float]
self.prompt = self._setup_prompt(
None, self._prompt_template, self._suffix, self._prefix
)
self.inv_prompt = self._setup_inverse_prompt(None)
self.inv_prompt = self._setup_inverse_prompt(None, self._inv_prefix)
self.llm = self._setup_llm(self._model)
self._ready = True

Expand Down Expand Up @@ -379,8 +370,8 @@ def ask(
possible_x: Union[Pool, List[str], TreePool, OrderedDict[str, Any]],
aq_fxn: str = "upper_confidence_bound",
k: int = 1,
inv_filter: int = 8,
aug_random_filter: int = 8,
inv_filter: int = 16,
aug_random_filter: int = 0,
_lambda: float = 0.5,
) -> Tuple[List[str], List[float], List[float]]:
"""Ask the optimizer for the next x to try.
Expand Down Expand Up @@ -426,9 +417,12 @@ def ask(
best = np.max(self._ys)

if isinstance(possible_x, Pool):
if inv_filter != 0 and inv_filter < len(possible_x):
approx_x = self.inv_predict(best * np.random.normal(1.0, 0.05))
possible_x_l = possible_x.approx_sample(approx_x, inv_filter)
if inv_filter+aug_random_filter < len(possible_x):
possible_x_l = []
print(inv_filter, aug_random_filter)
if inv_filter:
approx_x = self.inv_predict(best * np.random.normal(1.0, 0.05))
possible_x_l.extend(possible_x.approx_sample(approx_x, inv_filter))
if aug_random_filter:
possible_x_l.extend(possible_x.sample(aug_random_filter))
else:
Expand Down
45 changes: 22 additions & 23 deletions bolift/asktellGPR.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@


class AskTellGPR(AskTellFewShotTopk):
def __init__(self, n_components=2, pool=None, cache_path=None, **kwargs):
def __init__(self, n_components=2, pool=None, cache_path=None, n_neighbors=5, **kwargs):
super().__init__(**kwargs)
self._set_regressor()
self.examples = []
self._embedding = OpenAIEmbeddings()
self._embeddings_cache = self._get_cache(cache_path)
self.isomap = Isomap(n_components=n_components)
self.isomap = Isomap(n_components=n_components, n_neighbors=n_neighbors)
self.pool = pool
if self.pool is not None:
self._initialize_isomap()
Expand Down Expand Up @@ -105,17 +105,6 @@ def _train(self, X, y):
mll = ExactMarginalLogLikelihood(self.regressor.likelihood, self.regressor)
fit_gpytorch_torch(mll)

def ask(
self,
possible_x: Union[Pool, List[str]],
aq_fxn: str = "upper_confidence_bound",
k: int = 1,
inv_filter: int = 16,
_lambda: float = 0.5,
) -> Tuple[List[str], List[float], List[float]]:
# just have this here to override default
return super().ask(possible_x, aq_fxn, k, 0, _lambda)

def tell(
self, x: str, y: float, alt_ys: Optional[List[float]] = None, train=True
) -> None:
Expand All @@ -135,16 +124,26 @@ def tell(
self.examples.append(example_dict)

if train:
self._train(
[
self.prompt.format(
x=ex["x"],
y_name=self._y_name,
)
for ex in self.examples
],
[ex["y"] for ex in self.examples],
)
try:
self._train(
[
self.prompt.format(
x=ex["x"],
y_name=self._y_name,
)
for ex in self.examples
],
[ex["y"] for ex in self.examples],
)
except ValueError as e:
print(40*"-" + "ERROR" + 40*"-")
print(e)
print("Not enough data to train. " \
"We use an isomap considering 5 neighbors. Therefore, more than 6 points are needed to train the model. " \
"Use train=False to tell N-1 points to the model first. " \
"Then use train=True to tell the last point to train the model.\n" \
"Alternatively, use `pool` to pass a bolift.Pool to train the isomap during AskTellGPR construction.")
print(85*"-")

def predict(self, x: str) -> Union[Tuple[float, float], List[Tuple[float, float]]]:
"""Predict the probability distribution and values for a given x.
Expand Down
4 changes: 2 additions & 2 deletions bolift/llm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def openai_choice_predict(query_list, llm, verbose, *args, **kwargs):
print("-" * 80)
print(query_list[0])
print("-" * 80)
print(query_list[0] + completion_response.generations[0][0].text)
print(query_list[0], completion_response.generations[0][0].text)
print("-" * 80)
results = []
for gen, q in zip(completion_response.generations, query_list):
Expand All @@ -255,7 +255,7 @@ def openai_topk_predict(query_list, llm, verbose, *args, **kwargs):
print("-" * 80)
print(query_list[0])
print("-" * 80)
print(query_list[0] + completion_response.generations[0][0].text)
print(query_list[0], completion_response.generations[0][0].text)
print("-" * 80)
results = []
for gens in completion_response.generations:
Expand Down
57 changes: 41 additions & 16 deletions bolift/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from langchain.embeddings import OpenAIEmbeddings
from collections import OrderedDict
import re
import pandas as pd

class Pool:
"""Class for sampling from pool of possible data points
Expand Down Expand Up @@ -106,6 +107,8 @@ def get_children_with_key(self, key):
def get_branch(self):
branch = OrderedDict({self.name: [self.value]})
parent = self.get_parent()
if not parent:
return branch
while parent.name != "root":
branch[parent.name] = [parent.value]
parent = parent.get_parent()
Expand Down Expand Up @@ -135,20 +138,27 @@ class TreePool:
["A = 2, B = 4, C = 8, A + B + C?", "A = 3, B = 5, C = 8, A + B + C?", "A = 1, B = 5, C = 8, A + B + C?",]
'''

def __init__(self, pool: OrderedDict[str, List[Any]], prompt_template:str, formatter: Callable = lambda x: str(x)) -> None:
if type(pool) is not OrderedDict:
raise TypeError("Pool must be a OrderedDict with variable names as keys and the range of possible values as values. Keys must be in order accordingly to the prompt")
def __init__(self, pool: Union[pd.DataFrame, OrderedDict[str, List[Any]]], prompt_template:str, formatter: Callable = lambda x: str(x)) -> None:
if type(pool) not in [OrderedDict, pd.DataFrame]:
raise TypeError("Invalid pool. Two types of pool are accepted:\n\tPool must be a OrderedDict with variable names as keys and the range of possible values as values, or\n\tPool must be a pandas DataFrame. \nKeys must be in order accordingly to the prompt_template")

pattern = re.compile(r"\{(.*?)\}")
if len(pattern.findall(prompt_template)) != len(pool):
raise ValueError("Prompt template must have the same number of variables as the pool")
prompt_keys = pattern.findall(prompt_template)
pool_keys = pool.keys()
if len(pattern.findall(prompt_template)) != len(pool.keys()):
raise ValueError(f"Prompt template must have the same number of variables as the pool.\n {len(prompt_keys)} in prompt_keys\n {len(pool_keys)} in pool_keys")
if not all (prompt_k == pool_k for prompt_k, pool_k in zip(prompt_keys, pool_keys)):
raise ValueError(f"Prompt template must have the same variables as the pool. \n prompt_keys = {prompt_keys}\n pool_keys = {pool_keys}")

self.formatter = formatter
self.prompt_template = prompt_template
self._selected = []
self._pool = pool
self._root = TreeNode('root', None)
self._build_tree()
if isinstance(self._pool, OrderedDict):
self._build_tree()
elif isinstance(self._pool, pd.DataFrame):
self._build_tree_from_df()
self._available = [self._format_branch(leaf.get_branch()) for leaf in self.get_leafs()]
# Probably this self._available is not the best way to track all available paths.
# Refactor this. Looking over the tree may be more memory efficient (maybe slower?)
Expand All @@ -168,6 +178,23 @@ def _build_tree(self):
node.add_child(TreeNode(k, child_v))
parent_key = k

def _build_tree_from_df(self):
    """Build the pool tree from a pandas DataFrame.

    Each row of ``self._pool`` describes one root-to-leaf path; columns are
    consumed left to right, so the DataFrame's column order must match the
    variable order expected by the prompt template. Rows that share a prefix
    of (column, value) pairs reuse the same nodes, which is what allows
    unsymmetrical trees (unlike the full cross-product built by
    ``_build_tree``).
    """
    for _, row in self._pool.iterrows():
        current_node = self._root
        for column, value in row.items():
            # Reuse an existing child when a previous row already inserted
            # this (column, value) pair under the current node.
            found = False
            for child in current_node.get_children_list():
                if child.name == column and child.value == value:
                    current_node = child
                    found = True
                    break

            if not found:
                # NOTE(review): assumes TreeNode(name, value) wires up its
                # parent link via add_child — confirm against TreeNode.
                new_node = TreeNode(column, value)
                current_node.add_child(new_node)
                current_node = new_node

def get_node_with_key(self, key, root=None) -> List[TreeNode]:
if root is None:
root = self._root
Expand Down Expand Up @@ -250,19 +277,14 @@ def choose(self, x: Union[str, OrderedDict[str, Any]]) -> None:
"""Choose a specific item from the pool"""
if type(x) is str:
x = self.make_branch_from_string(x)
# for branch in self._available:
# if self.format_prompt(branch) == x:
# self._selected.append(branch)
# self._available.remove(branch)
# return
# raise ValueError("Item not in pool")
if x not in self._available:
raise ValueError("Item not in pool")

self._choose(self._format_branch(x))


def _choose(self, x: OrderedDict[str, Any]) -> None:
"""Choose a specific item from the pool"""
if x not in self._available:
raise ValueError("Item not in pool")

leaf = None
for l in self.get_leafs():
if self._format_branch(l.get_branch()) == x:
Expand All @@ -289,7 +311,10 @@ def approx_sample(self, x: str, k: int) -> None:
def reset(self) -> None:
"""Reset the pool"""
self._root = TreeNode('root', None)
self._build_tree()
if isinstance(self._pool, OrderedDict):
self._build_tree()
elif isinstance(self._pool, pd.DataFrame):
self._build_tree_from_df()
self._selected = []
self._available = [self._format_branch(leaf.get_branch()) for leaf in self.get_leafs()]

Expand Down
2 changes: 1 addition & 1 deletion bolift/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.2"
__version__ = "0.2.0"
Loading

0 comments on commit 44bfe35

Please sign in to comment.