main.py
import os
import time

import matplotlib.pyplot as plt
from langchain.llms import HuggingFacePipeline  # in newer LangChain releases this lives in langchain_community.llms
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Disable tokenizers parallelism to avoid the fork-related warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"
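# Optional sanity check (a minimal sketch, not part of the original run):
# transformers already depends on torch, so GPU availability can be checked
# up front; the load and generation times measured below depend on it.
# import torch
# print("CUDA available:", torch.cuda.is_available())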
# Define model IDs
model_ids = ["google/flan-t5-large", "google/flan-t5-xl"]

base_prompt = "For the following sentence, explain in your own words what this sentence is about: \n\n"

# Define prompts and their reasoning types. `types` carries one label per
# intended prompt; only the first prompt is defined here, so only the first
# len(prompts) labels are actually used.
prompts = ["Apple iPhone XS 64Gb"]
types = [
    "Knowledge Retrieval",
    "Knowledge Retrieval",
    "Knowledge Retrieval",
    "Logical Reasoning",
    "Cause and Effect",
    "Analogical Reasoning",
    "Inductive Reasoning",
    "Deductive Reasoning",
    "Counterfactual Reasoning",
    "In Context",
]
# Empty lists to store generation times, model/tokenizer/pipeline load times, and prompt types
xl_generation_times = []
large_generation_times = []
xl_model_load_times = []
large_model_load_times = []
xl_tokenizer_load_times = []
large_tokenizer_load_times = []
xl_pipeline_load_times = []
large_pipeline_load_times = []
prompt_types = []
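# A possible alternative (sketch only): collect timings in dicts keyed by
# model_id instead of parallel per-model lists, which would avoid the
# if/elif branches below. Kept commented out to preserve the original flow.
# from collections import defaultdict
# generation_times = defaultdict(list)  # model_id -> [seconds, ...]
# load_times = defaultdict(dict)        # model_id -> {"model": s, "tokenizer": s, "pipeline": s}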
for model_id in model_ids:
    # Load tokenizer
    tokenizer_start_time = time.time()
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer_end_time = time.time()

    # Load model
    model_start_time = time.time()
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    model_end_time = time.time()

    # Build the text2text-generation pipeline and wrap it for LangChain
    pipe_start_time = time.time()
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=512,
    )
    local_llm = HuggingFacePipeline(pipeline=pipe)
    pipe_end_time = time.time()
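    # If a GPU is available, the pipeline can be pinned to it (sketch;
    # device=0 selects the first CUDA device in transformers.pipeline).
    # Untested assumption here: enough GPU memory for flan-t5-xl.
    # pipe = pipeline(
    #     "text2text-generation",
    #     model=model,
    #     tokenizer=tokenizer,
    #     max_length=512,
    #     device=0,
    # )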
    # Store loading times
    if model_id == "google/flan-t5-large":
        large_model_load_times.append(model_end_time - model_start_time)
        large_tokenizer_load_times.append(tokenizer_end_time - tokenizer_start_time)
        large_pipeline_load_times.append(pipe_end_time - pipe_start_time)
    elif model_id == "google/flan-t5-xl":
        xl_model_load_times.append(model_end_time - model_start_time)
        xl_tokenizer_load_times.append(tokenizer_end_time - tokenizer_start_time)
        xl_pipeline_load_times.append(pipe_end_time - pipe_start_time)
    # Print model results
    print()
    print(f"Results for model: {model_id}")
    print("=" * 30)

    # Loop through the prompts, measure the time to generate each answer,
    # and print the prompt, answer, elapsed time, and prompt type
    for i, prompt in enumerate(prompts):
        full_prompt = f"{base_prompt}'{prompt}'"
        start_time = time.time()
        answer = local_llm(full_prompt)
        end_time = time.time()
        print(f"Prompt: {full_prompt}")
        print(f"Answer: {answer}")
        print(f"Generation Time: {end_time - start_time:.5f} seconds")
        print(f"Type: {types[i]}")
        print()

        # Store the prompt type and the generation time for this model
        prompt_types.append(types[i])
        if model_id == "google/flan-t5-large":
            large_generation_times.append(end_time - start_time)
        elif model_id == "google/flan-t5-xl":
            xl_generation_times.append(end_time - start_time)
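    # Note (sketch): time.perf_counter() is the standard-library clock meant
    # for interval timing and would be a drop-in replacement for time.time()
    # in the measurements above, e.g.:
    # start_time = time.perf_counter()
    # answer = local_llm(full_prompt)
    # end_time = time.perf_counter()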
    # Print loading times
    print(f"Loading times for model {model_id}")
    print(
        "Tokenizer Loading Time:",
        f"{tokenizer_end_time - tokenizer_start_time:.5f}",
        "seconds",
    )
    print(
        "Model Loading Time:",
        f"{model_end_time - model_start_time:.5f}",
        "seconds",
    )
    print(
        "Pipeline Loading Time:",
        f"{pipe_end_time - pipe_start_time:.5f}",
        "seconds\n\n",
    )
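    # Between models, memory can be reclaimed (sketch; only relevant when
    # both models do not fit in RAM/VRAM at the same time):
    # import torch
    # del model, pipe, local_llm
    # torch.cuda.empty_cache()  # safe no-op when CUDA is unavailable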
# Plot model load times
model_load_times = [sum(xl_model_load_times), sum(large_model_load_times)]
model_labels = ["XL Model", "Large Model"]

plt.figure(figsize=(18, 6))
plt.subplot(131)
plt.bar(model_labels, model_load_times, color=["blue", "orange"])
plt.ylabel("Load Time (seconds)")
plt.xlabel("Model")
plt.title("Model Load Time Comparison")

# Plot tokenizer load times
tokenizer_load_times = [
    sum(xl_tokenizer_load_times),
    sum(large_tokenizer_load_times),
]
plt.subplot(132)
plt.bar(model_labels, tokenizer_load_times, color=["blue", "orange"])
plt.ylabel("Load Time (seconds)")
plt.xlabel("Model")
plt.title("Tokenizer Load Time Comparison")

# Plot pipeline load times
pipeline_load_times = [
    sum(xl_pipeline_load_times),
    sum(large_pipeline_load_times),
]
plt.subplot(133)
plt.bar(model_labels, pipeline_load_times, color=["blue", "orange"])
plt.ylabel("Load Time (seconds)")
plt.xlabel("Model")
plt.title("Pipeline Load Time Comparison")
# Plot generation times; only len(prompts) measurements exist per model,
# so slice the type labels to match
n_prompts = len(prompts)
plt.figure(figsize=(9, 6))
plt.barh(
    range(n_prompts),
    xl_generation_times,
    height=0.4,
    align="center",
    color="blue",
    label="XL Model",
)
plt.barh(
    [y + 0.4 for y in range(n_prompts)],
    large_generation_times,
    height=0.4,
    align="center",
    color="orange",
    alpha=0.5,
    label="Large Model",
)
plt.yticks(range(n_prompts), types[:n_prompts])
plt.ylabel("Type")
plt.xlabel("Generation Time (seconds)")
plt.title("Generation Time Comparison")
plt.legend()
plt.tight_layout()
plt.show()
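# A minimal post-run summary (sketch, standard library only; assumes at
# least one prompt was timed per model):
# from statistics import mean
# print(f"Mean XL generation time: {mean(xl_generation_times):.5f} s")
# print(f"Mean Large generation time: {mean(large_generation_times):.5f} s")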