# few_shot.py (forked from saprmarks/geometry-of-truth)
import torch as t
import pandas as pd
import os
from generate_acts import load_llama
from tqdm import tqdm
import argparse
import json

ROOT = os.path.dirname(os.path.abspath(__file__))
t.set_grad_enabled(False)


def get_few_shot_accuracy(
    datasets, model_size, n_shots=5, calibrated=True, device="cpu"
):
"""Compute the few-shot accuracy of the model on the given datasets.
Returns a list of dictionaries with experimental results, namely:
* The dataset used.
* The number of shots in the few shot prompt.
* The few shot prompt used.
* The accuracy of the model.
* The calibration constant, if calibrated=True.
"""
    tokenizer, model = load_llama(model_size, device)
    outs = []
    for dataset in datasets:
        out = {"dataset": dataset, "n_shots": n_shots}
        # prepare data and prompt
        data_directory = os.path.join(ROOT, "datasets", f"{dataset}.csv")
        df = pd.read_csv(data_directory)
        shots = df.sample(n_shots)
        queries = df.drop(shots.index)
        prompt = ""
        for _, shot in tqdm(shots.iterrows(), desc=f"Processing {dataset}"):
            prompt += f'{shot["statement"]} '
            if bool(shot["label"]):
                prompt += "TRUE\n"
            else:
                prompt += "FALSE\n"
        out["shots"] = shots["statement"].tolist()
        out["prompt"] = prompt

        # cache key/value activations over the few-shot prompt for reuse across queries
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        prompt_len = input_ids.shape[1]
        outputs = model(input_ids, output_hidden_states=True)
        past_key_values = outputs.past_key_values

        # get completions and evaluate accuracy
        true_idx, false_idx = tokenizer.encode("TRUE")[1], tokenizer.encode("FALSE")[1]
        p_trues, p_falses = [], []
        for _, query in tqdm(queries.iterrows()):
            input_ids = tokenizer.encode(
                prompt + query["statement"], return_tensors="pt"
            ).to(device)
            # feed only the new tokens so the cached prompt is not processed twice
            # (assumes the prompt's tokens are a prefix of the prompt + query tokens)
            outputs = model(input_ids[:, prompt_len:], past_key_values=past_key_values)
            probs = outputs.logits[0, -1, :].softmax(-1)
            p_trues.append(probs[true_idx].item())
            p_falses.append(probs[false_idx].item())

        # if calibrated, set the threshold gamma to the median of p_true - p_false,
        # so that roughly half of the queries are predicted TRUE
        if calibrated:
            diffs = [p_true - p_false for p_true, p_false in zip(p_trues, p_falses)]
            gamma = sorted(diffs)[len(diffs) // 2]
            out["gamma"] = gamma
        else:
            gamma = 0

        # get predicted labels
        predicted_labels = [
            p_true - p_false > gamma for p_true, p_false in zip(p_trues, p_falses)
        ]
        predicted_labels = t.Tensor(predicted_labels).to(device).float()
        true_labels = t.Tensor(queries["label"].values).to(device)
        acc = (predicted_labels == true_labels).float().mean().item()
        out["acc"] = acc
        outs.append(out)
    return outs
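

# Example (illustrative): calling the function directly, assuming datasets/cities.csv
# and datasets/neg_cities.csv exist as in the upstream repo:
#   results = get_few_shot_accuracy(["cities", "neg_cities"], "13B", n_shots=5, device="cuda:0")
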
if __name__ == "__main__":
"""
Compute the few-shot accuracy of the model on the given datasets and save results.
"""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--datasets", type=str, nargs="+", help="datasets to evaluate on"
    )
    parser.add_argument(
        "--model_size", type=str, default="13B", help="model size to evaluate"
    )
    parser.add_argument("--n_shots", type=int, default=5, help="number of shots to use")
    parser.add_argument(
        "--uncalibrated",
        action="store_true",
        default=False,
        help="set flag if using uncalibrated few shot",
    )
    parser.add_argument("--device", default="cuda:0", help="device to use")
    args = parser.parse_args()

    out = get_few_shot_accuracy(
        args.datasets, args.model_size, args.n_shots, not args.uncalibrated, args.device
    )
    # save results, appending to the existing results file if it exists
    results_path = os.path.join(ROOT, "experimental_outputs", "few_shot_results.json")
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    if os.path.exists(results_path):
        with open(results_path, "r") as f:
            data = json.load(f)
    else:
        data = []
    data.extend(out)
    with open(results_path, "w") as f:
        json.dump(data, f, indent=4)