-
Notifications
You must be signed in to change notification settings - Fork 2
/
extract.ml
executable file
·455 lines (390 loc) · 16.6 KB
/
extract.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
open Yojson.Basic.Util
open Tokenizer
open Counter
open Similarity
(* Module for processing the data from the json file,
storing each document's data in a dictionary, and executing
the main computations for constructing the bot's response. *)
type topic_dict = {
topic : string;
counter : Counter.t;
}
type topic = {
topic : string;
content : string list;
}
(** [unpack_Yojson json] parses the given [json] file. *)
let unpack_Yojson (json: Yojson.Basic.json): topic = {
topic = member "topic" json|>to_string;
content = member "content" json
|>to_list |> List.map to_string;
}
(** [j] opens our json file. *)
let j = Yojson.Basic.from_file "corpus/improved_data.json"
(** [get_topic tp] returns the title of topic [tp]. *)
let get_topic (tp: topic) : string =
tp.topic
(** [get_all_topics] is all topic bindings of [topic_lst]. *)
let get_all_topics (topic_lst: topic list) : string list =
List.map (get_topic) topic_lst
(** [find_the_topics_content key_word top_lst] is the tokenized
content of the given topic [key_word]. *)
let find_the_topics_content (key_word:string) (top_lst: topic list) =
match (List.filter (fun x -> x.topic = key_word) top_lst) with
|topic::lst -> topic.content
|[]->failwith"This topc does not exist"
(** [get_content key_word json] creates the topic type that has
the topic title [key_word] and its content, all extracted from [json]. *)
let get_content (key_word:string) (json: Yojson.Basic.json) : topic =
(* Pervasives.print_string "track 2" ; *)
{
topic = key_word;
content = json
|>to_list
|>List.map unpack_Yojson
|>find_the_topics_content key_word
}
(** [string_topic_dict tp acc_tbl] returns a dictionary containing
bindings of topic name -> topic type. *)
let rec string_topic_dict (tp: string list)
(acc_tbl : ((string, topic) Hashtbl.t)): ((string, topic) Hashtbl.t) =
match tp with
| [] -> acc_tbl
| h::t -> Hashtbl.add acc_tbl h (get_content h j);
string_topic_dict t acc_tbl
(** [json_lst] is a formatted version of the json file in a list. *)
let json_lst = to_list j
(** [all_topics] is a list of all topic titles parsed from the json. *)
let all_topics = List.map unpack_Yojson json_lst
(** [topics] is a list of topic_dicts. *)
let topics = get_all_topics all_topics
(** [content_dict] is a topic dictionary, which is created and stored
for fast access during calculations. *)
let content_dict = string_topic_dict topics (Hashtbl.create 200)
(** [all_topics topics json acc], given a string list of all topics,
returns a list of topic data type. *)
let rec all_topics (topics : string list) (json : Yojson.Basic.json)
(acc : topic list) : (topic list) = match topics with
| word::t -> all_topics t (json) (get_content word json :: acc)
| [] -> acc
(** [tokenized_word_list key_word acc] returns, given a [key_word] topic,
a list of tokenized words from the content of the topic. *)
let tokenized_word_list (key_word:string) (acc:string list): string list =
List.concat (List.map Tokenizer.word_tokenize
(Hashtbl.find content_dict key_word).content)
(** [count_word key_word acc] gives you the counter data type of
the topic [key_word]. It is a helper function for [full_topic_dict] *)
let count_word (key_word:string) (acc:string list): Counter.t =
Counter.make_dict (tokenized_word_list key_word acc)
(** [full_topic_dict key_word] returns a topic_dict
data type that contains the topic and the counter dictionary
using a record. *)
let full_topic_dict (key_word:string) : topic_dict =
{
topic = key_word;
counter = (count_word key_word []);
}
(** [all_full_topics topics acc] returns a list of topic_dict
from a list of topics. *)
let rec all_full_topics (topics : string list)
(acc : topic_dict list) : topic_dict list =
match topics with
| word::t -> all_full_topics t
(full_topic_dict word :: acc)
| [] -> acc
let all_topic_dict_counter = all_full_topics topics []
(** [get_td_counter_ht td] this function creates a Hashtable of the
topic_dict instantiations from the topic list we have above
(all of the topics in the JSON file). *)
let rec get_td_counter_ht (td: topic_dict list)
(acc_tbl: (string, Counter.t) Hashtbl.t) :
((string, Counter.t) Hashtbl.t ) =
match td with
| [] -> acc_tbl
| h::t -> Hashtbl.add acc_tbl h.topic h.counter;
get_td_counter_ht t acc_tbl
(** [all_topic_dict_counter_ht] is a Hashtable of the
topic_dict instantiations from the topic list we have above
(all of the topics in the JSON file). *)
let all_topic_dict_counter_ht = get_td_counter_ht
all_topic_dict_counter (Hashtbl.create 100)
(** [get_counter topic_dict] returns the counter of [topic_dict]. *)
let get_counter topic_dict =
(* Pervasives.print_string "help 4" ; *)
topic_dict.counter
(** [print_topic_dict_list] is a print helper function to visualize
the topic_dict list from [all_topic_dict_counter]. *)
let rec print_topic_dict_list = function
| [] -> ()
| (e:topic_dict)::l -> print_string e.topic ;
print_string " | " ; print_topic_dict_list l
let rec which_dict_has_the_word (word:string)
(topic_dict_lst:topic_dict list)
(acc:topic_dict list): topic_dict list =
match topic_dict_lst with
|topic_dict::lst->
if Counter.mem word topic_dict.counter
then which_dict_has_the_word word
lst (topic_dict::acc)
else which_dict_has_the_word word lst acc
|[]->acc
let rec count_word_in_topic (word:string)
(topic:string) : int =
let counter = Hashtbl.find all_topic_dict_counter_ht topic in
let output = Counter.find_word word (counter) in
begin
output;
end
(** [tf word topic] is term frequency of [word] in [topic]
calculated as follows:
# of times [word] appears in [topic] / total number of topics *)
let rec tf (word:string) (topic:string) =
float_of_int (count_word_in_topic word topic) /.
float_of_int (Hashtbl.length all_topic_dict_counter_ht)
(** [idf word] is a statistical measure of
how important [word] is, based on the following
calculation: log (total # of documents / # of
documents with [word] in them) *)
let rec idf (word:string) =
Pervasives.log(
float_of_int (List.length topics) /.
float_of_int (List.length
((which_dict_has_the_word word all_topic_dict_counter) [])))
(** [tfidf input_word topics] is the TF-IDF
of an input word computed for the given [topic]. *)
let rec tfidf (input_word:string)
(topic:string): float =
(tf input_word topic) *. (idf input_word)
(** [construct_tfidf input_word] is a list of tuples,
where the first element of each tuple is a topic title
(string) and the second element is the TF-IDF value for
that topic with respect to [input_word]. *)
let rec construct_tfidf (input_word:string) :
(string, float) Hashtbl.t =
let ht = Hashtbl.create 200000 in
begin
List.map (fun (topic : string) : unit ->
Hashtbl.add ht topic (tfidf input_word topic)) topics;
ht
end
(** [get_topics_tfidf input_sent] takes in a
sentence, tokenizes it, and for each word,
computes a list of tfidf scores of each word
in each document. *)
let get_topics_tfidf (input_sent:string):
((string, float) Hashtbl.t list) =
let input_tokens = Similarity.remove_dups
(Tokenizer.word_tokenize input_sent) in
List.map (fun w -> construct_tfidf w) input_tokens
(** [questions] is a list of questions that are matched to likely
keywords. *)
let questions = [("who", [["is"; "are"];["a"; "an" ;"the"]]);
("where", [["at";"in"]]);
("what", [["is";"are"];["a";"an";"the"]]);
("when", [["on"; "in"]])]
(** [question_helper acc_tbl lst] is a helper function for
[question_ht] that creates a hashtable from [lst]. *)
let rec question_helper acc_tbl lst =
match lst with
| [] -> acc_tbl
| h::t -> Hashtbl.add acc_tbl (fst h) (snd h);
question_helper acc_tbl t
(** [question_ht] is a hash table version of questions. *)
let question_ht = question_helper (Hashtbl.create 4) questions
(** [check_by_category category input_tokens] returns a boolean
value of whether any of the elements of [lst] are present in
[acc_tbl]. *)
let rec check_by_category category (input_tokens:string list) =
match input_tokens with
| [] -> false
| h::t -> if List.mem h category then true else
check_by_category category t
(** [check_all_categories question_lst input_tokens] returns true if
[input_tokens] contains at least one element in each list of
[question_lst] and false otherwise. *)
let rec check_all_categories question_lst input_tokens =
match question_lst with
| [] -> true
| h::t -> (check_by_category h input_tokens &&
check_all_categories t input_tokens)
(** [filter_tokens input_tokens sentence] returns true if [sentence]
contains the keywords associated with the question word, if a question
word is the first element of [input_tokens]. A question word is defined
as "who", "what", "when", and "where". *)
let filter_tokens input_tokens sentence =
let tokenized_sentence = Tokenizer.word_tokenize sentence in
match input_tokens with
| [] -> true
| h::t ->
if (Hashtbl.mem question_ht h) then
let q = Hashtbl.find question_ht h in
(check_all_categories q tokenized_sentence)
else true
let max_jaccard_sentence (topic:string)
(input_sent:string) : string =
let topic_we_want = (Hashtbl.find content_dict topic) in
let doc_sentences = topic_we_want.content in
let input_tokens = Similarity.remove_dups
(Tokenizer.word_tokenize input_sent) in
let filter_tokens_fixed sentence = filter_tokens input_tokens sentence in
(* [doc_sent_tok_dict] creates [key: sentence, value: sentence's
word token list] dict *)
let doc_sent_tok_dict =
let filtered_sentences =
(List.filter (filter_tokens_fixed) doc_sentences) in
List.map (fun s ->
(s, Similarity.remove_dups
(Tokenizer.word_tokenize s))) filtered_sentences in
(* [doc_sent_jac_dict] creates [key: sentence, value: sentence's
jaccard score] dict *)
let doc_sent_jac_dict = List.map (fun e ->
(fst e, Similarity.jaccard_sim
(snd e) (input_tokens))) doc_sent_tok_dict in
(** [find_max_j dsj_dict acc_sent acc_int]
finds the maximum jaccard value in [dsj_dict] *)
let rec find_max_j dsj_dict acc_sent acc_int =
match dsj_dict with
| [] -> acc_sent
| h::t -> if (snd h > acc_int) then
begin
find_max_j t (fst h) (snd h)
end
else find_max_j t acc_sent acc_int
in find_max_j doc_sent_jac_dict "" 0.0
(** [add_elt_to_list doc lst] returns a list of
tuples where the first value corresponds
to the document and the value is the combined
tfidf scores of each word in that document. *)
let rec add_elt_to_list (doc : string*float)
(lst: (string, float) Hashtbl.t) : (string, float) Hashtbl.t =
let tfidf = Hashtbl.find_opt lst (fst doc) in
match tfidf with
| None -> Hashtbl.add lst (fst doc) (snd doc); lst
| Some i -> Hashtbl.remove lst (fst doc);
Hashtbl.add lst (fst doc) (i +. (snd doc)); lst
(** [add_list_to_list lst1 lst2] combines the elements
of lst1 and lst2, where there are no duplicate
string values. If the string values are equal,
their float values are added. *)
let rec add_list_to_list (lst1: (string, float) Hashtbl.t)
(lst2: (string, float) Hashtbl.t): (string, float) Hashtbl.t=
Hashtbl.iter (fun (a : string) (b : float) : unit ->
match (Hashtbl.find_opt lst1 a) with
| Some found_b -> Hashtbl.remove lst1 a;
Hashtbl.add lst1 a (b +. found_b)
| None -> Hashtbl.add lst1 a b
) lst2; lst1
(** [print_hashtable ht] is a helper function to print hashtable [ht]
for debugging. *)
let print_hashtable (ht :
(string, float) Hashtbl.t) : unit =
Hashtbl.iter (fun x y -> print_string x;
print_float y; print_newline ()) ht
(** [add_tfidf input_sent] computes the sum of
TFIDF scores for each word in each document and returns
the document with the highest sum. *)
let add_tfidf (input_sent : string) : string =
let doc_list = get_topics_tfidf input_sent in
let temp_list = List.fold_left add_list_to_list
(Hashtbl.create 20000) doc_list in
let good_tup = Hashtbl.fold (fun
(a : string)
(b : float)
(c : string * float) : (string * float)
-> if b > (snd c) then (a, b) else c)
temp_list ("David Gries", 0.0) in
begin
fst good_tup
end
let get_response (input_sent : string) : string =
let topic_doc = add_tfidf input_sent in
let response = begin (* Pervasives.print_string topic_doc; *)
max_jaccard_sentence topic_doc input_sent
end
in response
(** [get_topic_dict_topic topic_dict] returns the topic of [topic_dict]. *)
let get_topic_dict_topic (topic_dict:topic_dict) = topic_dict.topic
let get_topics (topic_dict_lst:topic_dict list) = List.map
get_topic_dict_topic topic_dict_lst
(** Useful helpers to be called in Autocorrect. *)
(** List of all counters for each document we have in the corpus *)
let all_counters_list =
List.map (fun a -> get_counter a) all_topic_dict_counter
(** [all_dict_list] is a list of all dictionaries for each document we have
in the corpus. *)
let all_dict_list =
List.map (fun (a:Counter.t) -> Counter.get_dictionary a) all_counters_list
(** [add_list_to_list ht1z ht2] combines the elements
of ht1 and ht2, where there are no duplicate
string values. If the string values are equal,
their float values are added. *)
let rec add_list_to_list (ht1: (string, int) Hashtbl.t)
(ht2: (string, int) Hashtbl.t): (string, int) Hashtbl.t=
Hashtbl.iter (fun (a : string) (b : int) : unit ->
match (Hashtbl.find_opt ht1 a) with
| Some found_b -> Hashtbl.remove ht1 a;
Hashtbl.add ht1 a (b + found_b)
| None -> Hashtbl.add ht1 a b
) ht2; ht1
let big_counter_ht =
List.fold_left add_list_to_list (Hashtbl.create 50000) all_dict_list
let all_words = Hashtbl.fold (fun k v acc -> k :: acc) big_counter_ht []
(** [count_all_unique_words] is the number of all unique words in all of
the documents. *)
let count_all_unique_words = Hashtbl.fold (fun k v acc -> acc+1) big_counter_ht 0
(* Embeddings functions start here *)
let vocab_size = count_all_unique_words
let word2vec_dict vocab_size =
let new_hashtbl = Hashtbl.create vocab_size in
List.fold_left (fun ht word -> (Hashtbl.add ht word
(Hashtbl.length ht); ht)) new_hashtbl all_words
let vectorize input_sent vocab_size word2vec_dict =
let vec = Array.init vocab_size (function i -> 0) in
let rec vectorize_sent input_sent vocab_size
word2vec_dict (acc:int array) =
match input_sent with
| [] -> acc
| h::t ->
match Hashtbl.find_opt word2vec_dict h with
(* | None -> vec *)
| None -> vectorize_sent t vocab_size word2vec_dict (acc)
(* | Some i -> Array.set acc i ((Array.get acc i) + 1);*)
| Some i -> Array.set acc i 1;
vectorize_sent t vocab_size word2vec_dict (acc)
in
vectorize_sent input_sent vocab_size word2vec_dict vec
(** [find_max_cosine topic acc_sent acc_score] finds the max cosine similarity
of a score and sentence. *)
let find_max_cosine (topic:string) (input_tokens: string list)
q_vector (acc_sent_o:string) (acc_score:float) =
let acc_sent = Tokenizer.word_tokenize acc_sent_o in
let topic_we_want = (Hashtbl.find content_dict topic) in
let sentences = topic_we_want.content in
let filter_tokens_fixed sentence = filter_tokens
input_tokens sentence in
let filtered_sentences =
(List.filter (filter_tokens_fixed) sentences) in
let sentences_of_topic = List.map (fun s ->
(s, Tokenizer.word_tokenize s)) filtered_sentences; in
let rec helper sentences q_vector acc_sent acc_sent_o acc_score =
match sentences with
| [] -> acc_sent_o
| h::t ->
let sent_vec = vectorize (snd h)
vocab_size (word2vec_dict vocab_size)
in
let new_score = Similarity.cosine_sim (Array.to_list sent_vec)
(Array.to_list q_vector)
in
if new_score > acc_score then helper t q_vector
(snd h) (fst h) new_score
else helper t q_vector acc_sent acc_sent_o acc_score
in
helper sentences_of_topic q_vector acc_sent acc_sent_o acc_score
let get_response_2 (input:string) : string =
let question_vec = vectorize (Tokenizer.word_tokenize input)
count_all_unique_words (word2vec_dict count_all_unique_words) in
let topic = add_tfidf input in
let input_tokens = Similarity.remove_dups
(Tokenizer.word_tokenize input) in
(find_max_cosine topic input_tokens question_vec "" 0.0)