From 44b540c1a42c182d385eb48c6cdb66a667b29a60 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 28 Apr 2020 06:17:26 +0530 Subject: [PATCH 01/51] adding StatsBase --- Manifest.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Manifest.toml b/Manifest.toml index 187c33a1..358163f2 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -470,3 +470,6 @@ deps = ["BinaryProvider", "Libdl", "Printf"] git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" version = "0.8.3" + +[[StatsBase]] +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" From 8becfce954c8e7909a63c37bce8c8d1b04b4406b Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 28 Apr 2020 06:17:58 +0530 Subject: [PATCH 02/51] adding StatsBase --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 02bcd701..e17f55b5 100644 --- a/Project.toml +++ b/Project.toml @@ -22,6 +22,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] Flux = "< 0.10" From f9d8b5f2d36e53521173a9aba591d3862acb62f3 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 28 Apr 2020 06:18:48 +0530 Subject: [PATCH 03/51] exporting function --- src/TextAnalysis.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 8978cf1b..4f59b72b 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -2,7 +2,7 @@ module TextAnalysis using SparseArrays using Printf using LinearAlgebra - + using StatsBase: countmap,addcounts! 
using Languages using DataFrames using WordTokenizers @@ -66,7 +66,8 @@ module TextAnalysis export CRF, viterbi_decode, crf_loss export NERTagger, PoSTagger, Tracker, Flux - + + export Vocabulary,lookup,update include("tokenizer.jl") include("ngramizer.jl") include("document.jl") @@ -111,7 +112,10 @@ module TextAnalysis include("sequence/pos_datadeps.jl") include("sequence/pos.jl") include("sequence/sequence_models.jl") - + + # Lang_model + include("LM/vocab.jl") + # ULMFiT module ULMFiT using ..TextAnalysis From 59b8fae8f179b6ac532d0fe4342ebb49283c4c08 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 28 Apr 2020 06:19:46 +0530 Subject: [PATCH 04/51] adding vocabulary strut in LM --- src/LM/vocab.jl | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 src/LM/vocab.jl diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl new file mode 100644 index 00000000..2b432294 --- /dev/null +++ b/src/LM/vocab.jl @@ -0,0 +1,114 @@ +""" General counter to used in vocabulary""" +mutable struct Counter + value::Int + #Counter(value) = (new(),value) +end + +function counter(init = 0) + Counter(init) +end +function (count::Counter)() + count.value = 1 +count.value +end + +"""Stores language model vocabulary. + Satisfies two common language modeling requirements for a vocabulary: + - When checking membership and calculating its size, filters items + by comparing their counts to a cutoff value. + - Adds a special "unknown" token which unseen words are mapped to. + + >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] + >>> import Vocabulary + >>> vocabulary = Vocabulary(words, 2) + Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") + >>> vocabulary.vocab + Dict{String,Int64} with 4 entries: + "" => 1 + "c" => 3 + "a" => 3 + "d" => 2 + Tokens with counts greater than or equal to the cutoff value will + be considered part of the vocabulary. 
+ >>> vocabulary.vocab["c"] + 3 + >>> "c" in keys(vocabulary.vocab) + true + >>> vocabulary.vocab["d"] + 2 + >>> "d" in keys(vocabulary.vocab) + true + Tokens with frequency counts less than the cutoff value will be considered not + part of the vocabulary even though their entries in the count dictionary are + preserved. + >>> "b" in keys(vocabulary.vocab) + false + >>> "" in keys(vocabulary.vocab) + true + We can look up words in a vocabulary using its `lookup` method. + "Unseen" words (with counts less than cutoff) are looked up as the unknown label. + If given one word (a string) as an input, this method will return a string. + >>> lookup("a") + 'a' + >>> word = ["a", "-", "d", "c", "a"] + >>> lookup(vocabulary ,word) + 5-element Array{Any,1}: + "a" + "" + "d" + "c" + "a" + + If given a sequence, it will return an Array{Any,1} of the looked up words as shown above. + + It's possible to update the counts after the vocabulary has been created. + >>> update(vocabulary,["b","c","c"]) + 1 + >>> vocabulary.vocab["b"] + 1 + """ +mutable struct Vocabulary +vocab::Dict{String,Int64} +unk_cutoff::Int +unk_label::String +allword::Array{String,1} +end +function Vocabulary(word,unk_cutoff =1 ,unk_label = "") + if unk_label in word + #error("unk_label is in vocab") + else + word= push!(word,unk_label) + end + vocab = countmap(word) + for value in vocab + if value[2] Date: Wed, 29 Apr 2020 02:30:08 +0530 Subject: [PATCH 05/51] exporting everygram and padding_ngram --- src/TextAnalysis.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 4f59b72b..1b231299 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -68,6 +68,8 @@ module TextAnalysis export NERTagger, PoSTagger, Tracker, Flux export Vocabulary,lookup,update + export everygram, padding_ngram + include("tokenizer.jl") include("ngramizer.jl") include("document.jl") From 36aca88f810ac4953296207de32f02e331904dee Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: 
Wed, 29 Apr 2020 02:30:53 +0530 Subject: [PATCH 06/51] adding padding and everygram --- src/LM/preprocessing.jl | 57 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 src/LM/preprocessing.jl diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl new file mode 100644 index 00000000..7949ceb7 --- /dev/null +++ b/src/LM/preprocessing.jl @@ -0,0 +1,57 @@ +""" +Return all possible ngrams generated from sequence of items, as an Array{String,1} +# Example + +>>>seq=["To","be","or","not"] +>>>a = everygram(seq,min_len = 1, max_len = -1) + 10-element Array{Any,1}: + "or" + "not" + "To" + "be" + "or not" + "be or" + "be or not" + "To be or" + "To be or not" + +""" +function everygram(seq; min_len::Int=1, max_len::Int=-1) + ngram = [] + if max_len == -1 + max_len = length(seq) + end + for n in range(min_len, stop =max_len) + temp = keys(TextAnalysis.ngramize(TextAnalysis.Languages.English(),seq,n)) + ngram = append!(ngram,temp) + end + return(ngram) +end + +""" + padding _ngram is used to pad both left and right of sentence and out putting ngrmas + + It also pad the original input Array of string +# Example Usage +>>>example = ["1","2","3","4","5"] + +>>> example = ["1","2","3","4","5"] +>>> padding_ngram(example ,2 , pad_left=true,pad_right =true) + 5-element Array{String,1}: + "1" + "2" + "3" + "4" + "5" +""" +function padding_ngram(word,n =1 ;pad_left=false,pad_right=false ,left_pad_symbol="", right_pad_symbol ="") + local seq + seq = word + if pad_left == true + prepend!(seq, [left_pad_symbol]) + end + if pad_right == true + push!(seq, right_pad_symbol) + end + return keys(TextAnalysis.ngramize(TextAnalysis.Languages.English(),seq,n)) +end From 29fee5564f81e2e88237af814ed21fb1fba38b63 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 19 May 2020 21:21:40 +0530 Subject: [PATCH 07/51] added general counter --- src/LM/counter.jl | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 
src/LM/counter.jl diff --git a/src/LM/counter.jl b/src/LM/counter.jl new file mode 100644 index 00000000..20e13ca6 --- /dev/null +++ b/src/LM/counter.jl @@ -0,0 +1,25 @@ +using DataStructures +import DataStructures.Accumulator +import DataStructures.DefaultDict +import DataStructures.counter + +function counter1(data, min::Integer, max::Integer) + data = (everygram(data,min_len = min, max_len =max )) + data = split.(data) + temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String})) + for i in 1:length(data) + history,word = data[i][1:end-1], data[i][end] + + temp_lm[join(history, " ")][word]+=1 + end + #return Dict from iterated temp_lm with normalized histories + Dict(word => normalize(histories) for (word,histories) in temp_lm) + #return temp_lm +end + +function normalize(accum) + #sum all counts + s = float(sum(accum)) + #tuple of string with each count divided by sum + [(history,float(sum(count))/s) for (history,count) in accum] +end From c41df193371846cf60e38ac8420615beb3f60079 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 19 May 2020 21:22:38 +0530 Subject: [PATCH 08/51] adding mle in language model with some peremeter --- src/LM/langmodel.jl | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 src/LM/langmodel.jl diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl new file mode 100644 index 00000000..0e2845dc --- /dev/null +++ b/src/LM/langmodel.jl @@ -0,0 +1,39 @@ +abstract type Langmodel end + +struct mle <: Langmodel + vocab ::Vocabulary + #counter::Dict{SubString{String},Array{Tuple{String,Float64},1}} +end + +function mle(word,unk_cutoff=1 ,unk_label="") + mle(Vocabulary(word,unk_cutoff ,unk_label)) +end + +function fit!(lm::Langmodel,text,min::Integer,max::Integer) + text = lookup(lm.vocab ,text) + text = convert(Array{String}, text) + return counter1(text,min,max) +end + +function 
unmaskscore(a::Dict{SubString{String},Array{Tuple{String,Float64},1}},word,context) + for i in a[context] + if word == i[1] + return i[2] + end + end +end + +function score(voc::Langmodel,model::Dict{SubString{String},Array{Tuple{String,Float64},1}} ,word ,context ) + """Masks out of vocab (OOV) words and computes their model score. + For model-specific logic of calculating scores, see the `unmasked_score` + method. + """ + return unmaskscore(model,word,context ) +end + +function logscore(word, context= None) + """Evaluate the log score of this word in this context. + The arguments are the same as for `score` and `unmasked_score`. + """ + return log2(score(word, context)) +end From a35ca92af0ebe52ce4df50e0578d4355e9e4afa5 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 24 May 2020 04:06:24 +0530 Subject: [PATCH 09/51] updating counter with ngramnew --- src/LM/counter.jl | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/LM/counter.jl b/src/LM/counter.jl index 20e13ca6..a4b4a0a9 100644 --- a/src/LM/counter.jl +++ b/src/LM/counter.jl @@ -3,23 +3,34 @@ import DataStructures.Accumulator import DataStructures.DefaultDict import DataStructures.counter -function counter1(data, min::Integer, max::Integer) +function normalize(accum) + #sum all counts + s = float(sum(accum)) + #tuple of string with each count divided by sum + [(history,float(sum(count))/s) for (history,count) in accum] +end + +function counter1(data, min::Integer, max::Integer,norm::Function;gamma = nothing) data = (everygram(data,min_len = min, max_len =max )) data = split.(data) temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String})) for i in 1:length(data) history,word = data[i][1:end-1], data[i][end] - - temp_lm[join(history, " ")][word]+=1 + temp_lm[join(history, " ")][word] += 1 end - #return Dict from iterated temp_lm with normalized histories - Dict(word => normalize(histories) for (word,histories) 
in temp_lm) + #return Dict from iterated temp_lm with normalized histories + Dict(histories => norm(word,gamma) for (histories,word) in temp_lm) #return temp_lm end -function normalize(accum) - #sum all counts - s = float(sum(accum)) - #tuple of string with each count divided by sum - [(history,float(sum(count))/s) for (history,count) in accum] +function normalize(accum,gamma) + #sum all counts + s = float(sum(accum)) + #tuple of string with each count divided by sum + [(word,float(sum(count))/s) for (word,count) in accum] +end +function lid_norm(accum,gamma) + s = float(sum(accum)+(gamma)*length(accum)) + [(word,float(count + gamma)/s) for (word,count) in accum] + end From 698dc0e4876774b2238364a7f0bc234a11f14d90 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 24 May 2020 04:06:48 +0530 Subject: [PATCH 10/51] adding other language model --- src/LM/langmodel.jl | 46 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index 0e2845dc..85f7720e 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -1,20 +1,52 @@ abstract type Langmodel end -struct mle <: Langmodel +struct MLE <: Langmodel vocab ::Vocabulary #counter::Dict{SubString{String},Array{Tuple{String,Float64},1}} end -function mle(word,unk_cutoff=1 ,unk_label="") - mle(Vocabulary(word,unk_cutoff ,unk_label)) +function MLE(word,unk_cutoff=1 ,unk_label="") + MLE(Vocabulary(word,unk_cutoff ,unk_label)) end -function fit!(lm::Langmodel,text,min::Integer,max::Integer) - text = lookup(lm.vocab ,text) - text = convert(Array{String}, text) - return counter1(text,min,max) +function (lm::MLE)(text,min::Integer,max::Integer) + text = lookup(lm.vocab ,text) + text=convert(Array{String}, text) + return counter1(text,min,max,normalize) end +struct Lidstone <: Langmodel + vocab ::Vocabulary + gamma ::Integer +end + +function Lidstone(word,gamma,unk_cutoff=1 ,unk_label="") + Lidstone(Vocabulary(word,unk_cutoff 
,unk_label),gamma) +end + +function (lm::Lidstone)(text,min::Integer,max::Integer) + text = lookup(lm.vocab ,text) + text=convert(Array{String}, text) + return counter1(text,min,max,lid_norm,gamma = lm.gamma) +end + + +struct Laplace <: Langmodel + vocab ::Vocabulary + gamma ::Integer +end + +function Laplace(word,unk_cutoff=1 ,unk_label="") + Lidstone(Vocabulary(word,unk_cutoff ,unk_label),1) +end + +function (lm::Laplace)(text,min::Integer,max::Integer) + text = lookup(lm.vocab ,text) + text=convert(Array{String}, text) + return counter1(text,min,max,lid_norm,gamma = lm.gamma) +end + + function unmaskscore(a::Dict{SubString{String},Array{Tuple{String,Float64},1}},word,context) for i in a[context] if word == i[1] From a9e7345ae76e801bc875cdbb108f5758e4c8fea5 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 24 May 2020 04:07:24 +0530 Subject: [PATCH 11/51] adding new ngram method --- src/LM/preprocessing.jl | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl index 7949ceb7..d8fd07d6 100644 --- a/src/LM/preprocessing.jl +++ b/src/LM/preprocessing.jl @@ -53,5 +53,38 @@ function padding_ngram(word,n =1 ;pad_left=false,pad_right=false ,left_pad_symbo if pad_right == true push!(seq, right_pad_symbol) end - return keys(TextAnalysis.ngramize(TextAnalysis.Languages.English(),seq,n)) + return ngramizenew(seq,n) end +""" + ngramizenew is used to out putting ngrmas in set + +# Example Usage +>>>seq=["To","be","or","not","To","not","To","not"] + +>>> ngramizenew(seq ,2) +7-element Array{Any,1}: + "To be" + "be or" + "or not" + "not To" + "To not" + "not To" + "To not" + +""" +function ngramizenew( words::Vector{T}, nlist::Integer...) 
where { T <: AbstractString} + #(length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words) + + n_words = length(words) + + tokens = [] + + for n in nlist + for index in 1:(n_words - n + 1) + token = join(words[index:(index + n - 1)], " ") + push!(tokens,token) + end + end + return tokens +end + From 9f9a0ba5b3e4624191d30a622b7179d4cfed7f35 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 19:05:06 +0530 Subject: [PATCH 12/51] bux fix and indentation --- src/LM/preprocessing.jl | 109 +++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 47 deletions(-) diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl index d8fd07d6..fd62de4f 100644 --- a/src/LM/preprocessing.jl +++ b/src/LM/preprocessing.jl @@ -1,50 +1,62 @@ """ + everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1)where { T <: AbstractString} + Return all possible ngrams generated from sequence of items, as an Array{String,1} + # Example ->>>seq=["To","be","or","not"] ->>>a = everygram(seq,min_len = 1, max_len = -1) - 10-element Array{Any,1}: - "or" - "not" - "To" - "be" - "or not" - "be or" - "be or not" - "To be or" - "To be or not" +```julia-repl +julia> seq = ["To","be","or","not"] +julia> a = everygram(seq,min_len=1, max_len=-1) + 10-element Array{Any,1}: + "or" + "not" + "To" + "be" + "or not" + "be or" + "be or not" + "To be or" + "To be or not" +``` """ -function everygram(seq; min_len::Int=1, max_len::Int=-1) - ngram = [] +function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString} + + ngram = [] if max_len == -1 max_len = length(seq) end - for n in range(min_len, stop =max_len) - temp = keys(TextAnalysis.ngramize(TextAnalysis.Languages.English(),seq,n)) - ngram = append!(ngram,temp) - end - return(ngram) + for n in range(min_len, stop=max_len) + temp = ngramizenew(seq, n) + ngram = append!(ngram, temp) + end + return(ngram) end """ - padding _ngram is used to pad both left and right of 
sentence and out putting ngrmas + padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="", right_pad_symbol ="") where { T <: AbstractString} + +padding _ngram is used to pad both left and right of sentence and out putting ngrmas of order n It also pad the original input Array of string -# Example Usage ->>>example = ["1","2","3","4","5"] + +# Example +```julia-repl +julia> example = ["1","2","3","4","5"] ->>> example = ["1","2","3","4","5"] ->>> padding_ngram(example ,2 , pad_left=true,pad_right =true) - 5-element Array{String,1}: - "1" - "2" - "3" - "4" - "5" +julia> example = ["1","2","3","4","5"] +julia> padding_ngrams(example,2,pad_left=true,pad_right=true) + 6-element Array{Any,1}: + " 1" + "1 2" + "2 3" + "3 4" + "4 5" + "5 " +``` """ -function padding_ngram(word,n =1 ;pad_left=false,pad_right=false ,left_pad_symbol="", right_pad_symbol ="") +function padding_ngram(word::Vector{T}, n=1; pad_left=false,pad_right=false ,left_pad_symbol="", right_pad_symbol ="") where { T <: AbstractString} local seq seq = word if pad_left == true @@ -53,26 +65,29 @@ function padding_ngram(word,n =1 ;pad_left=false,pad_right=false ,left_pad_symbo if pad_right == true push!(seq, right_pad_symbol) end - return ngramizenew(seq,n) + return ngramizenew(seq,n) end + """ - ngramizenew is used to out putting ngrmas in set - -# Example Usage ->>>seq=["To","be","or","not","To","not","To","not"] - ->>> ngramizenew(seq ,2) -7-element Array{Any,1}: - "To be" - "be or" - "or not" - "not To" - "To not" - "not To" - "To not" + ngramizenew( words::Vector{T}, nlist::Integer...) where { T <: AbstractString} +ngramizenew is used to out putting ngrmas in set + +# Example +```julia-repl +julia> seq=["To","be","or","not","To","not","To","not"] +julia> ngramizenew(seq ,2) + 7-element Array{Any,1}: + "To be" + "be or" + "or not" + "not To" + "To not" + "not To" + "To not" +``` """ -function ngramizenew( words::Vector{T}, nlist::Integer...) 
where { T <: AbstractString} +function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString} #(length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words) n_words = length(words) From eded08451388d1adba00a48a4ad8a60ab85cceaf Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 19:43:12 +0530 Subject: [PATCH 13/51] updating docstring in vocab --- src/LM/vocab.jl | 142 ++++++++++++++++++++++++------------------------ 1 file changed, 72 insertions(+), 70 deletions(-) diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl index 2b432294..c8bda2fb 100644 --- a/src/LM/vocab.jl +++ b/src/LM/vocab.jl @@ -1,80 +1,83 @@ -""" General counter to used in vocabulary""" -mutable struct Counter - value::Int - #Counter(value) = (new(),value) -end +""" + Vocabulary(word,unk_cutoff =1 ,unk_label = "") -function counter(init = 0) - Counter(init) -end -function (count::Counter)() - count.value = 1 +count.value -end +Stores language model vocabulary. +Satisfies two common language modeling requirements for a vocabulary: +- When checking membership and calculating its size, filters items +by comparing their counts to a cutoff value. +Adds a special "unknown" token which unseen words are mapped to. + +# Example +```julia-repl +julia> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] +julia> vocabulary = Vocabulary(words, 2) + Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") + +julia> vocabulary.vocab + Dict{String,Int64} with 4 entries: + "" => 1 + "c" => 3 + "a" => 3 + "d" => 2 + +Tokens with counts greater than or equal to the cutoff value will +be considered part of the vocabulary. +julia> vocabulary.vocab["c"] + 3 + +julia> "c" in keys(vocabulary.vocab) + true + +julia> vocabulary.vocab["d"] + 2 -"""Stores language model vocabulary. - Satisfies two common language modeling requirements for a vocabulary: - - When checking membership and calculating its size, filters items - by comparing their counts to a cutoff value. 
- - Adds a special "unknown" token which unseen words are mapped to. - - >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] - >>> import Vocabulary - >>> vocabulary = Vocabulary(words, 2) - Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") - >>> vocabulary.vocab - Dict{String,Int64} with 4 entries: - "" => 1 - "c" => 3 - "a" => 3 - "d" => 2 - Tokens with counts greater than or equal to the cutoff value will - be considered part of the vocabulary. - >>> vocabulary.vocab["c"] - 3 - >>> "c" in keys(vocabulary.vocab) - true - >>> vocabulary.vocab["d"] - 2 - >>> "d" in keys(vocabulary.vocab) - true - Tokens with frequency counts less than the cutoff value will be considered not - part of the vocabulary even though their entries in the count dictionary are - preserved. - >>> "b" in keys(vocabulary.vocab) - false - >>> "" in keys(vocabulary.vocab) - true - We can look up words in a vocabulary using its `lookup` method. - "Unseen" words (with counts less than cutoff) are looked up as the unknown label. - If given one word (a string) as an input, this method will return a string. - >>> lookup("a") - 'a' - >>> word = ["a", "-", "d", "c", "a"] - >>> lookup(vocabulary ,word) - 5-element Array{Any,1}: - "a" - "" - "d" - "c" - "a" - - If given a sequence, it will return an Array{Any,1} of the looked up words as shown above. +julia> "d" in keys(vocabulary.vocab) + true + +Tokens with frequency counts less than the cutoff value will be considered not +part of the vocabulary even though their entries in the count dictionary are +preserved. +julia> "b" in keys(vocabulary.vocab) + false + +julia> "" in keys(vocabulary.vocab) + true + +We can look up words in a vocabulary using its `lookup` method. +"Unseen" words (with counts less than cutoff) are looked up as the unknown label. +If given one word (a string) as an input, this method will return a string. 
+julia> lookup("a") + 'a' + +julia> word = ["a", "-", "d", "c", "a"] + +julia> lookup(vocabulary ,word) + 5-element Array{Any,1}: + "a" + "" + "d" + "c" + "a" + +If given a sequence, it will return an Array{Any,1} of the looked up words as shown above. - It's possible to update the counts after the vocabulary has been created. - >>> update(vocabulary,["b","c","c"]) - 1 - >>> vocabulary.vocab["b"] - 1 - """ +It's possible to update the counts after the vocabulary has been created. +julia> update(vocabulary,["b","c","c"]) + 1 + +julia> vocabulary.vocab["b"] + 1 +``` +""" mutable struct Vocabulary vocab::Dict{String,Int64} unk_cutoff::Int unk_label::String allword::Array{String,1} end -function Vocabulary(word,unk_cutoff =1 ,unk_label = "") +function Vocabulary(word, unk_cutoff=1, unk_label="") if unk_label in word - #error("unk_label is in vocab") + error("unk_label is in vocab") else word= push!(word,unk_label) end @@ -100,10 +103,9 @@ Return an Array of String function lookup(voc::Vocabulary,word) look = [] for w in word - if w in keys(voc.vocab) + if w in keys(voc.vocab) push!(look,w) - else - #return vocab.unk_label + else push!(look,voc.unk_label) end end From 1150e4ed7c133f05324c7430773977721821959e Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 21:06:45 +0530 Subject: [PATCH 14/51] updating counter --- src/LM/counter.jl | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/src/LM/counter.jl b/src/LM/counter.jl index a4b4a0a9..2abae334 100644 --- a/src/LM/counter.jl +++ b/src/LM/counter.jl @@ -3,14 +3,11 @@ import DataStructures.Accumulator import DataStructures.DefaultDict import DataStructures.counter -function normalize(accum) - #sum all counts - s = float(sum(accum)) - #tuple of string with each count divided by sum - [(history,float(sum(count))/s) for (history,count) in accum] -end - -function counter1(data, min::Integer, max::Integer,norm::Function;gamma = nothing) +""" + counter is used to 
make conditional distribution, which is used by score functions to + calculate conditonal frequency distribution +""" +function counter2(data, min::Integer, max::Integer) data = (everygram(data,min_len = min, max_len =max )) data = split.(data) temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String})) @@ -18,19 +15,6 @@ function counter1(data, min::Integer, max::Integer,norm::Function;gamma = nothin history,word = data[i][1:end-1], data[i][end] temp_lm[join(history, " ")][word] += 1 end - #return Dict from iterated temp_lm with normalized histories - Dict(histories => norm(word,gamma) for (histories,word) in temp_lm) - #return temp_lm + return temp_lm end -function normalize(accum,gamma) - #sum all counts - s = float(sum(accum)) - #tuple of string with each count divided by sum - [(word,float(sum(count))/s) for (word,count) in accum] -end -function lid_norm(accum,gamma) - s = float(sum(accum)+(gamma)*length(accum)) - [(word,float(count + gamma)/s) for (word,count) in accum] - -end From 15bef1082c0529f10fe28bed45ea5b63f33f9792 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 21:07:30 +0530 Subject: [PATCH 15/51] adding all ngram based model --- src/LM/langmodel.jl | 190 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 154 insertions(+), 36 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index 85f7720e..b95c980c 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -1,71 +1,189 @@ abstract type Langmodel end +abstract type gammamodel <: Langmodel end #BaseNgram with smoothing algo +abstract type InterpolatedLanguageModel <: Langmodel end #Interpolated language model with smoothing +""" + Type for providing MLE ngram model scores. + Implementation of Base Ngram Model. 
+ +""" struct MLE <: Langmodel vocab ::Vocabulary - #counter::Dict{SubString{String},Array{Tuple{String,Float64},1}} end -function MLE(word,unk_cutoff=1 ,unk_label="") - MLE(Vocabulary(word,unk_cutoff ,unk_label)) +function MLE(word, unk_cutoff=1, unk_label="") + MLE(Vocabulary(word, unk_cutoff, unk_label)) end -function (lm::MLE)(text,min::Integer,max::Integer) - text = lookup(lm.vocab ,text) - text=convert(Array{String}, text) - return counter1(text,min,max,normalize) +function (lm::MLE)(text, min::Integer, max::Integer) + text = lookup(lm.vocab, text) + text=convert(Array{String}, text) + return counter2(text, min, max) end -struct Lidstone <: Langmodel + """ + Type for providing Lidstone-smoothed scores. + + In addition to initialization arguments from BaseNgramModel also requires + a number by which to increase the counts, gamma. +""" +struct Lidstone <: gammamodel vocab ::Vocabulary gamma ::Integer end -function Lidstone(word,gamma,unk_cutoff=1 ,unk_label="") - Lidstone(Vocabulary(word,unk_cutoff ,unk_label),gamma) +function Lidstone(word, gamma,unk_cutoff=1, unk_label="") + Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma) end -function (lm::Lidstone)(text,min::Integer,max::Integer) - text = lookup(lm.vocab ,text) - text=convert(Array{String}, text) - return counter1(text,min,max,lid_norm,gamma = lm.gamma) +function (lm::Lidstone)(text, min::Integer, max::Integer) + text = lookup(lm.vocab, text) + text=convert(Array{String}, text) + return counter2(text, min, max) end +"""Type for providing Laplace-smoothed scores. + In addition to initialization arguments from BaseNgramModel also requires + a number by which to increase the counts, gamma = 1. 
# --- src/LM/langmodel.jl (draft revision) ------------------------------------

# Laplace: Lidstone smoothing with gamma fixed at 1.
struct Laplace <: gammamodel
    vocab::Vocabulary
    gamma::Integer
end

"""
    Laplace(word, unk_cutoff=1, unk_label="<unk>")

Initialize a Laplace (add-one) smoothed ngram model.
"""
function Laplace(word, unk_cutoff=1, unk_label="<unk>")
    # Bug fix: previously constructed a `Lidstone`, so `Laplace(...)` returned
    # the wrong concrete type and dispatch never saw a `Laplace`.
    Laplace(Vocabulary(word, unk_cutoff, unk_label), 1)
end

function (lm::Laplace)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

"""
    score(m::gammamodel, temp_lm, word, context)

Additively smoothed probability P(word | context) for Lidstone/Laplace models:
(count + gamma) / (total + gamma * |V|).
"""
function score(m::gammamodel, temp_lm, word, context)
    accum = temp_lm[context]
    # total mass after adding gamma to every vocabulary entry
    s = float(sum(accum) + m.gamma * length(m.vocab.vocab))
    for (text, count) in accum
        text == word && return float(count + m.gamma) / s
    end
    return float(m.gamma) / s
end

# Relative frequency of `word` given `context`; uniform over contexts when no
# context is supplied.
# NOTE(review): returns Inf when `word` is absent from a seen context — looks
# like a sentinel rather than a probability; confirm callers expect it.
function prob(templ_lm::DefaultDict, word, context=nothing)
    if context === nothing || context == ""
        return 1 / float(length(templ_lm))
    end
    accum = templ_lm[context]
    s = float(sum(accum))
    for (text, count) in accum
        text == word && return float(count) / s
    end
    return Inf
end

# Unsmoothed maximum-likelihood P(word | context).
score(m::MLE, temp_lm, word, context=nothing) = prob(temp_lm, word, context)

struct WittenBellInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
end

function WittenBellInterpolated(word, unk_cutoff=1, unk_label="<unk>")
    WittenBellInterpolated(Vocabulary(word, unk_cutoff, unk_label))
end

function (lm::WittenBellInterpolated)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

# Witten-Bell (alpha, gamma) pair. Bug fix: takes the model as first argument
# so interpolated `score` can dispatch between Witten-Bell and Kneser-Ney.
function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    s = float(sum(accum))
    alpha = 1 / s   # fallback; previously left undefined for an empty accumulator
    for (text, count) in accum
        if text == word
            alpha = float(count) / s
            break
        end
    end
    gam = gamma(accum)
    return alpha * (1 - gam), gam
end

# Number of distinct continuations seen after a context.
count_non_zero_vals(accum::Accumulator) = length(accum)

function gamma(accum)
    nplus = count_non_zero_vals(accum)
    return nplus / (nplus + float(sum(accum)))
end

"""
    score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)

Interpolated P(word | context); recursively backs off to shorter contexts.
"""
function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
    if context === nothing || context == ""
        return prob(temp_lm, word, context)
    end
    if context in keys(temp_lm)
        # Bug fix: pass the model so the right alpha_gammma method is selected.
        alpha, gam = alpha_gammma(m, temp_lm, word, context)
        return alpha + gam * score(m, temp_lm, word, context_reduce(context))
    else
        return score(m, temp_lm, word, context_reduce(context))
    end
end

# Drop the leftmost token of a whitespace-joined context string.
function context_reduce(context)
    parts = split(context)
    return join(parts[2:end], " ")
end

struct KneserNeyInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
    discount::Float64
end

# Bug fix: previously called the undefined name `KneserNeyInterpolate`.
function KneserNeyInterpolated(word, discount, unk_cutoff=1, unk_label="<unk>")
    KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), discount)
end

function (lm::KneserNeyInterpolated)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

# Kneser-Ney (alpha, gamma) pair with absolute discounting.
function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    s = float(sum(accum))
    alpha = 1 / length(m.vocab.vocab)   # fallback for unseen continuations
    for (text, count) in accum
        if text == word
            alpha = max(float(count) - m.discount, 0.0) / s
            break
        end
    end
    gam = m.discount * count_non_zero_vals(accum) / s
    # Bug fix: the draft fell off the end returning only the gamma expression;
    # `score` destructures a two-tuple.
    return alpha, gam
end

# --- src/LM/api.jl ------------------------------------------------------------

# P(word | context) with out-of-vocabulary tokens mapped to the unk label first.
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
    score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
end

# Base-2 log of the masked score.
function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
    log2(maskedscore(m, temp_lm, word, context))
end

# Average per-ngram log2 score of `text_ngram` (each entry "w1 ... wn").
# NOTE(review): cross-entropy is conventionally -mean(log2 p); this returns
# +mean, so perplexity = 2^entropy comes out below 1 — confirm intended sign.
function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
    log_set = Float64[]
    for ngram in text_ngram
        parts = split(ngram)
        push!(log_set, logscore(m, lm, parts[end], join(parts[1:end-1], " ")))
    end
    return sum(log_set) / length(log_set)
end

function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
    return 2^entropy(m, lm, text_ngram)
end
-7,6 +7,7 @@ end function logscore(m::Langmodel,temp_lm::DefaultDict,word,context) log2(maskedscore(m,temp_lm,word,context)) end + function entropy(m::Langmodel,lm::DefaultDict,text_ngram) local log_set=Float64[] for ngram in text_ngram @@ -16,6 +17,7 @@ function entropy(m::Langmodel,lm::DefaultDict,text_ngram) end return(sum(log_set)/length(log_set)) end + function perplexity(m::Langmodel,lm::DefaultDict,text_ngram) return(2^(entropy(m,lm,text_ngram))) end From 6bd06efce1b498ab6af6080d88788121eefffac9 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 21:18:08 +0530 Subject: [PATCH 18/51] syntax correction --- src/LM/counter.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/LM/counter.jl b/src/LM/counter.jl index 2abae334..dd595b71 100644 --- a/src/LM/counter.jl +++ b/src/LM/counter.jl @@ -8,7 +8,7 @@ import DataStructures.counter calculate conditonal frequency distribution """ function counter2(data, min::Integer, max::Integer) - data = (everygram(data,min_len = min, max_len =max )) + data = everygram(data, min_len=min, max_len=max) data = split.(data) temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String})) for i in 1:length(data) From e34e71e800eb762f40d1d74c01612c578f34c562 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 21:26:05 +0530 Subject: [PATCH 19/51] exporting imp function --- src/TextAnalysis.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 1b231299..a27c88df 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -69,6 +69,8 @@ module TextAnalysis export Vocabulary,lookup,update export everygram, padding_ngram + export maskedscore, logscore, entropy, perplexity + export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score include("tokenizer.jl") include("ngramizer.jl") @@ -117,6 +119,11 @@ module TextAnalysis # Lang_model include("LM/vocab.jl") + 
include("LM/api.jl") + include("LM/counter.jl") + include("LM/langmodel.jl") + include("LM/preprocess.jl") + include("LM/vocab.jl") # ULMFiT module ULMFiT From be686aff5d3fed2d697e67eed399cbe2557f0b2d Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 23:01:54 +0530 Subject: [PATCH 20/51] bug fix --- src/LM/langmodel.jl | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index b95c980c..26bebded 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -10,7 +10,28 @@ abstract type InterpolatedLanguageModel <: Langmodel end #Interpolated language struct MLE <: Langmodel vocab ::Vocabulary end - +""" + MLE(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} +Return Datatype MLE + +# Example + +```julia-repl +julia> seq = ["To","be","or","not"] +julia> a = everygram(seq,min_len=1, max_len=-1) + 10-element Array{Any,1}: + "or" + "not" + "To" + "be" + "or not" + "be or" + "be or not" + "To be or" + "To be or not" +``` + +""" function MLE(word, unk_cutoff=1, unk_label="") MLE(Vocabulary(word, unk_cutoff, unk_label)) end From 40bae7b011472c37aa8f620e5096ff40123db46d Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 31 May 2020 14:55:51 +0530 Subject: [PATCH 21/51] adding doc sting --- src/LM/langmodel.jl | 174 ++++++++++++++++++++++++++------------------ 1 file changed, 104 insertions(+), 70 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index 26bebded..b4973953 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -1,35 +1,21 @@ abstract type Langmodel end -abstract type gammamodel <: Langmodel end #BaseNgram with smoothing algo +abstract type gammamodel <: Langmodel end #BaseNgram with Add-one smoothing algo abstract type InterpolatedLanguageModel <: Langmodel end #Interpolated language model with smoothing -""" - Type for providing MLE ngram model scores. - Implementation of Base Ngram Model. 
abstract type Langmodel end
abstract type gammamodel <: Langmodel end                 # ngram models with additive (add-gamma) smoothing
abstract type InterpolatedLanguageModel <: Langmodel end  # interpolated smoothing (Witten-Bell, Kneser-Ney)

# MLE: maximum-likelihood (unsmoothed) ngram scores.
struct MLE <: Langmodel
    vocab::Vocabulary
end

"""
    MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Initialize an unsmoothed maximum-likelihood ngram language model over `word`.
"""
function MLE(word, unk_cutoff=1, unk_label="<unk>")
    MLE(Vocabulary(word, unk_cutoff, unk_label))
end

function (lm::MLE)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

struct Lidstone <: gammamodel
    vocab::Vocabulary
    gamma::Float64   # Float64, not Integer: fractional smoothing masses are valid
end

"""
    Lidstone(word::Vector{T}, gamma=1.0, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Lidstone-smoothed ngram model: every count is increased by `gamma`.
"""
function Lidstone(word, gamma=1.0, unk_cutoff=1, unk_label="<unk>")
    Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma)
end

function (lm::Lidstone)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

struct Laplace <: gammamodel
    vocab::Vocabulary
    gamma::Float64
end

"""
    Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Laplace (add-one) smoothing: Lidstone with `gamma` fixed at 1.
"""
function Laplace(word, unk_cutoff=1, unk_label="<unk>")
    # Bug fix: must construct a Laplace (not a Lidstone) so callers get the
    # type they asked for and dispatch stays on Laplace.
    Laplace(Vocabulary(word, unk_cutoff, unk_label), 1.0)
end

function (lm::Laplace)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

"""
    score(m::gammamodel, temp_lm::DefaultDict, word, context)

Additively smoothed probability P(word | context):
(count + gamma) / (total + gamma * |V|).
"""
function score(m::gammamodel, temp_lm::DefaultDict, word, context)
    accum = temp_lm[context]
    s = float(sum(accum) + m.gamma * length(m.vocab.vocab))
    for (text, count) in accum
        text == word && return float(count + m.gamma) / s
    end
    return float(m.gamma) / s
end

"""
    prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)

Relative frequency of `word` under `context`; uniform over contexts when no
context is supplied. Returns 0 for an unseen word under a known context and
`Inf` when the context itself is unknown.
"""
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)
    if context === nothing || context == ""
        return 1 / float(length(templ_lm))
    end
    accum = templ_lm[context]
    s = float(sum(accum))
    for (text, count) in accum
        text == word && return float(count) / s
    end
    context in keys(m.vocab.vocab) && return 0
    return Inf
end

"""
    score(m::MLE, temp_lm::DefaultDict, word, context=nothing)

Unsmoothed maximum-likelihood P(word | context).
"""
score(m::MLE, temp_lm::DefaultDict, word, context=nothing) = prob(m, temp_lm, word, context)

struct WittenBellInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
end

"""
    WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Interpolated Witten-Bell smoothing (Chen & Goodman, 1995).
"""
function WittenBellInterpolated(word, unk_cutoff=1, unk_label="<unk>")
    WittenBellInterpolated(Vocabulary(word, unk_cutoff, unk_label))
end

function (lm::WittenBellInterpolated)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

# (alpha, gamma) pair for Witten-Bell interpolation.
function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    s = float(sum(accum))
    alpha = 1 / s   # fallback when `word` was never seen after `context`
    for (text, count) in accum
        if text == word
            alpha = float(count) / s
            break
        end
    end
    gam = gamma(accum)
    return alpha * (1 - gam), gam
end

# Number of distinct continuations observed in `accum`.
count_non_zero_vals(accum::Accumulator) = length(accum)

function gamma(accum)
    nplus = count_non_zero_vals(accum)
    return nplus / (nplus + float(sum(accum)))
end

"""
    score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)

Interpolated P(word | context); recursively backs off to shorter contexts.
Applies Witten-Bell or Kneser-Ney weighting depending on the concrete subtype.
"""
function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
    if context === nothing || context == ""
        return prob(m, temp_lm, word, context)
    end
    if context in keys(temp_lm)
        alpha, gam = alpha_gammma(m, temp_lm, word, context)
        return alpha + gam * score(m, temp_lm, word, context_reduce(context))
    else
        return score(m, temp_lm, word, context_reduce(context))
    end
end

# Drop the leftmost token of a whitespace-joined context string.
function context_reduce(context)
    parts = split(context)
    return join(parts[2:end], " ")
end

struct KneserNeyInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
    discount::Float64
end

"""
    KneserNeyInterpolated(word::Vector{T}, discount=0.1, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Interpolated Kneser-Ney smoothing with absolute discounting (Chen & Goodman, 1995).
"""
function KneserNeyInterpolated(word, disc=0.1, unk_cutoff=1, unk_label="<unk>")
    KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), disc)
end

function (lm::KneserNeyInterpolated)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

# (alpha, gamma) pair for Kneser-Ney interpolation.
function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    s = float(sum(accum))
    alpha = 1 / length(m.vocab.vocab)   # fallback for unseen continuations
    for (text, count) in accum
        if text == word
            alpha = max(float(count) - m.discount, 0.0) / s
            break
        end
    end
    gam = m.discount * count_non_zero_vals(accum) / s
    # Bug fix: return the pair explicitly; callers destructure (alpha, gamma).
    return alpha, gam
end
# Tests for the src/LM language-model code.
using DataStructures

@testset "Vocabulary" begin
    words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"]
    vocab = Vocabulary(words, 2, "<unk>")

    @test vocab isa Vocabulary
    @test vocab.vocab isa Dict
    @test vocab.unk_cutoff isa Int
    @test vocab.unk_label isa String
    @test vocab.allword isa Array{String,1}

    # only 4 distinct tokens survive the cutoff-2 filter (incl. the unk label)
    @test length(vocab.vocab) == 4
    @test isequal(vocab.unk_cutoff, 2)
    @test vocab.unk_label == "<unk>"
    @test isequal(vocab.allword,
                  ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", "<unk>"])
    @test isequal(vocab.vocab,
                  Dict{String,Int}("<unk>" => 1, "c" => 3, "a" => 3, "d" => 2))

    # lookup maps out-of-vocabulary tokens to the unk label
    @test lookup(vocab, ["a", "b", "c", "alien"]) == ["a", "<unk>", "c", "<unk>"]

    # the unk label must not already be present in the corpus
    word_set = ["<unk>", "is", "already", "there"]
    @test_throws ErrorException Vocabulary(word_set, 1, "<unk>")
end

@testset "preprocessing" begin
    @testset "ngramizenew" begin
        sample_text = ["this", "is", "some", "sample", "text"]

        @test isequal(TextAnalysis.ngramizenew(sample_text, 1),
                      ["this", "is", "some", "sample", "text"])
        @test isequal(TextAnalysis.ngramizenew(sample_text, 2),
                      ["this is", "is some", "some sample", "sample text"])
        @test isequal(TextAnalysis.ngramizenew(sample_text, 1, 2),
                      ["this", "is", "some", "sample", "text",
                       "this is", "is some", "some sample", "sample text"])
    end

    @testset "Padding function" begin
        example = ["1", "2", "3", "4", "5"]
        padded = padding_ngram(example, 2, pad_left=true, pad_right=true)
        @test isequal(padded, ["<s> 1", "1 2", "2 3", "3 4", "4 5", "5 </s>"])
        # padding_ngram mutates its input vector in place
        @test isequal(example, ["<s>", "1", "2", "3", "4", "5", "</s>"])

        example = ["1", "2", "3", "4", "5"]
        padded = padding_ngram(example, 2, pad_right=true)
        @test isequal(padded, ["1 2", "2 3", "3 4", "4 5", "5 </s>"])
    end

    @testset "everygram function" begin
        example = ["1", "2", "3", "4", "5"]
        everyngms = everygram(example, min_len=1, max_len=2)
        @test isequal(everyngms, ["1", "2", "3", "4", "5", "1 2", "2 3", "3 4", "4 5"])
    end
end

@testset "counter" begin
    exam = ["To", "be", "or", "not", "to", "be", "To", "be", "or", "not", "to", "be"]
    fit = TextAnalysis.counter2(exam, 2, 2)
    @test fit isa DataStructures.DefaultDict
    @test length(fit) == 5  # number of distinct bigram contexts
    # (bug fix: removed a trailing bare `@test`, which errors at macro expansion
    # and prevented this file from loading)
end
+ +- **MLE** - Base Ngram model. +- **Lidstone **- Base Ngram model with Lidstone smoothing. +- **Laplace** - Base Ngram language model with Laplace smoothing. +- **WittenBellInterpolated** - Interpolated Version of witten-Bell algorithm. +- **KneserNeyInterpolated** - Interpolated version of Kneser -Ney smoothing. + +## APIs + +To use the API, we first *Instantiate* desired model and then load it with train set + +```julia +(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +Lidstone(word::Vector{T}, gamma:: Float64, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +Laplace(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +KneserNeyInterpolated(word::Vector{T}, discount:: Float64=0.1, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +(lm::)(text, min::Integer, max::Integer) +``` +Arguments: + + * `word` : Array of strings to store vocabulary. + + * `unk_cutoff`: Tokens with counts greater than or equal to the cutoff value will be considered part of the vocabulary. 
+ + * `unk_label`: token for unkown labels + + * `gamma`: smoothing arugment gamma + + * `discount`: discouting factor for `KneserNeyInterpolated` + + for more information see docstrings of vocabulary + +```julia +julia> voc = ["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + +julia>train = ["khan","is","my","good", "friend","and","He","is","my","brother"] +# voc and train are used to train vocabulary and model respectively + +julia> model = MLE(voc) +MLE(Vocabulary(Dict("khan"=>1,"name"=>1,""=>1,"salman"=>1,"is"=>2,"Khan"=>1,"my"=>1,"he"=>1,"shahrukh"=>1,"and"=>1…), 1, "", ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan", ""])) +julia> print(voc) +11-element Array{String,1}: + "my" + "name" + "is" + "salman" + "khan" + "and" + "he" + "is" + "shahrukh" + "Khan" + "" +# you can see "" token is added to voc +julia> fit = model(train,2,2) #considering only bigrams +julia> unmaskedscore = score(model, fit, "is" ,"") #score output P(word | context) without replacing context word with "" +0.3333333333333333 +julia> masked_score = maskedscore(model,fit,"is","alien") +0.3333333333333333 +#as expected maskedscore is equivalent to unmaskedscore with context replaced with "" + +``` +!!! note + + When you call `MLE(voc)` for the first time, It will update your vocabulary set as well. + +## Evaluation Method + +- `score` + + used to evaluate probablity of word given context (*P(word | context)*) + + ```julia + score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString) + ``` + +​ In case of Lidstone and Laplace it apply smoothing and, + +​ In Interpolated language model, provide Kneserney and WittenBell smoothing + +- `maskedscore` + + It is used to evaluate *score* with masks out of vocabulary words + + The arguments are the same as for score + +- `logscore` + + Evaluate the log score of this word in this context. 
+ + The arguments are the same as for score and maskedscore + +- `entropy` + ```julia + entropy(m::Langmodel,lm::DefaultDict,text_ngram::word::Vector{T}) where { T <: AbstractString} + ``` + + Calculate cross-entropy of model for given evaluation text. + + Input text must be Array of ngram of same lengths + +- `perplexity` + + Calculates the perplexity of the given text. + + This is simply 2 ** cross-entropy(`entropy`) for the text, so the arguments are the same as `entropy`. \ No newline at end of file From d1d7417b5e03d1b1ea1c7f58065d7846d0dcd21b Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 04:37:21 +0530 Subject: [PATCH 27/51] adding Preprocessing in docs --- docs/src/LM.md | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index a735a7be..7768830b 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -113,4 +113,46 @@ julia> masked_score = maskedscore(model,fit,"is","alien") Calculates the perplexity of the given text. - This is simply 2 ** cross-entropy(`entropy`) for the text, so the arguments are the same as `entropy`. \ No newline at end of file + This is simply 2 ** cross-entropy(`entropy`) for the text, so the arguments are the same as `entropy`. + +## Preprocessing + + For Preprocessing following functions: + +1. `everygram`: Return all possible ngrams generated from sequence of items, as an Array{String,1} + + ```julia + julia> seq = ["To","be","or","not"] + julia> a = everygram(seq,min_len=1, max_len=-1) + 10-element Array{Any,1}: + "or" + "not" + "To" + "be" + "or not" + "be or" + "be or not" + "To be or" + "To be or not" + ``` + +2. 
`padding_ngrams`: padding _ngram is used to pad both left and right of sentence and out putting ngrmas of order n + + It also pad the original input Array of string + + ```julia + julia> example = ["1","2","3","4","5"] + + julia> example = ["1","2","3","4","5"] + julia> padding_ngrams(example,2,pad_left=true,pad_right=true) + 6-element Array{Any,1}: + " 1" + "1 2" + "2 3" + "3 4" + "4 5" + "5 " + ``` + + ​ + From 36ffa46ac0cad6931512dad890b4a5ad9710dcd4 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 05:08:23 +0530 Subject: [PATCH 28/51] adding docs for vocab --- docs/src/LM.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index 7768830b..3a32c07e 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -153,6 +153,28 @@ julia> masked_score = maskedscore(model,fit,"is","alien") "4 5" "5 " ``` +## Vocabulary - ​ +Struct to store Language models vocabulary + +checking membership and filters items by comparing their counts to a cutoff value + +It also Adds a special "unkown" tokens which unseen words are mapped to + +```julia +julia> words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"] +julia> vocabulary = Vocabulary(words, 2) + Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") + +# lookup a sequence or words in the vocabulary +julia> word = ["a", "-", "d", "c", "a"] + +julia> lookup(vocabulary ,word) + 5-element Array{Any,1}: + "a" + "" + "d" + "c" + "a" +``` From 8bb35aa8d267b55472f06d2d7f60a5cd9588af5f Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 22:14:01 +0530 Subject: [PATCH 29/51] updating preprocessing --- src/LM/preprocessing.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl index fd62de4f..24622002 100644 --- a/src/LM/preprocessing.jl +++ b/src/LM/preprocessing.jl @@ -22,7 +22,6 @@ julia> a = everygram(seq,min_len=1, max_len=-1) """ function everygram(seq::Vector{T}; 
min_len::Int=1, max_len::Int=-1) where { T <: AbstractString} - ngram = [] if max_len == -1 max_len = length(seq) @@ -88,8 +87,6 @@ julia> ngramizenew(seq ,2) ``` """ function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString} - #(length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words) - n_words = length(words) tokens = [] From b3bcd941ed3813f058e4652dd2e4e25f1812d094 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 22:14:21 +0530 Subject: [PATCH 30/51] bug fix in gamma model --- src/LM/langmodel.jl | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index b4973953..53b87ddd 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -30,18 +30,18 @@ end struct Lidstone <: gammamodel vocab::Vocabulary - gamma::Integer + gamma::Float64 end """ - Lidstone(word::Vector{T}, gamma:: Integer,unk_cutoff=1, unk_label="") where { T <: AbstractString} + Lidstone(word::Vector{T}, gamma:: Float64, unk_cutoff=1, unk_label="") where { T <: AbstractString} Function to initiate Type(Lidstone) for providing Lidstone-smoothed scores. In addition to initialization arguments from BaseNgramModel also requires a number by which to increase the counts, gamma. """ -function Lidstone(word, gamma,unk_cutoff=1, unk_label="") +function Lidstone(word, gamma = 1.0, unk_cutoff=1, unk_label="") Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma) end @@ -60,11 +60,11 @@ a number by which to increase the counts, gamma = 1. 
""" struct Laplace <: gammamodel vocab::Vocabulary - gamma::Integer + gamma::Float64 end function Laplace(word, unk_cutoff=1, unk_label="") - Lidstone(Vocabulary(word, unk_cutoff, unk_label), 1) + Laplace(Vocabulary(word, unk_cutoff, unk_label), 1.0) end function (lm::Laplace)(text, min::Integer, max::Integer) @@ -81,7 +81,7 @@ score is used to output probablity of word given that context Add-one smoothing to Lidstone or Laplace(gammamodel) models """ -function score(m::gammamodel, temp_lm, word, context) #score for gammamodel output probabl +function score(m::gammamodel, temp_lm::DefaultDict, word, context) #score for gammamodel output probabl accum = temp_lm[context] #print(accum) s = float(sum(accum)+(m.gamma)*length(m.vocab.vocab)) @@ -99,7 +99,7 @@ To get probability of word given that context In otherwords, for given context calculate frequency distribution of word """ -function prob(templ_lm::DefaultDict, word, context=nothing) +function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing) if context == nothing || context == "" return(1/float(length(templ_lm))) #provide distribution else @@ -111,6 +111,9 @@ function prob(templ_lm::DefaultDict, word, context=nothing) return(float(count) / s) end end + if context in keys(m.vocab.vocab) + return(0) + end return(Inf) end @@ -120,8 +123,8 @@ end score is used to output probablity of word given that context in MLE """ -function score(m::MLE, temp_lm, word, context = nothing) - prob(temp_lm, word, context) +function score(m::MLE, temp_lm::DefaultDict, word, context = nothing) + prob(m, temp_lm, word, context) end struct WittenBellInterpolated <: InterpolatedLanguageModel @@ -129,7 +132,7 @@ struct WittenBellInterpolated <: InterpolatedLanguageModel end """ - WittenBellInterpolated(word::Vector{T}, gamma:: Integer,unk_cutoff=1, unk_label="") where { T <: AbstractString} + WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} Initiate Type for providing 
Interpolated version of Witten-Bell smoothing. @@ -151,7 +154,7 @@ function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, co local gam accum = templ_lm[context] s = float(sum(accum)) - for (text, count) in accum + for (text,count) in accum if text == word alpha=(float(count) / s) break @@ -170,7 +173,7 @@ end function gamma(accum) nplus=count_non_zero_vals(accum) - return(nplus/(nplus + float(sum(accum)))) + return(nplus/(nplus+float(sum(accum)))) end """ @@ -182,9 +185,9 @@ Apply Kneserney and WittenBell smoothing depending upon the sub-Type """ -function score(m::InterpolatedLanguageModel,temp_lm::DefaultDict,word,context=nothing) +function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing) if context == nothing || context == "" - return prob(temp_lm, word, context) + return prob(m, temp_lm, word, context) end if context in keys(temp_lm) alpha,gamma = alpha_gammma(m, temp_lm, word, context) @@ -213,32 +216,31 @@ Initiate Type for providing KneserNey Interpolated language model. The idea to abstract this comes from Chen & Goodman 1995. 
""" -function KneserNeyInterpolated(word, gamma, unk_cutoff=1 , unk_label="") - KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), gamma) +function KneserNeyInterpolated(word, disc = 0.1, unk_cutoff=1, unk_label="") + KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label) ,disc) end function (lm::KneserNeyInterpolated)(text, min::Integer, max::Integer) text = lookup(lm.vocab, text) - text = convert(Array{String}, text) + text=convert(Array{String}, text) return counter2(text, min, max) end # alpha_gamma function for KneserNeyInterpolated -function alpha_gammma(m::KneserNeyInterpolated,templ_lm::DefaultDict, word,context) +function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context) local alpha local gamma accum = templ_lm[context] s = float(sum(accum)) for (text, count) in accum if text == word - alpha=(max(float(count)-m.discount, 0.0) /s) + alpha=(max(float(count)-m.discount, 0.0) / s) break else alpha = 1/length(m.vocab.vocab) end end - gamma = (m.discount * count_non_zero_vals(accum)/s) + gamma = (m.discount * count_non_zero_vals(accum) /s) return alpha, gamma end - From b5408f157066348d956dd80dca63382f78d9cc4a Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 22:14:51 +0530 Subject: [PATCH 31/51] updating coding style --- src/LM/vocab.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl index 6f9d250f..828beaa1 100644 --- a/src/LM/vocab.jl +++ b/src/LM/vocab.jl @@ -9,7 +9,7 @@ Adds a special "unknown" token which unseen words are mapped to. 
# Example ```julia-repl -julia> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] +julia> words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"] julia> vocabulary = Vocabulary(words, 2) Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") @@ -75,7 +75,7 @@ unk_cutoff::Int unk_label::String allword::Array{String,1} end -function Vocabulary(word, unk_cutoff=1, unk_label="") +function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} if unk_label in word error("unk_label is in vocab") else @@ -100,7 +100,7 @@ lookup a sequence or words in the vocabulary Return an Array of String """ -function lookup(voc::Vocabulary,word) +function lookup(voc::Vocabulary,word::Vector{T}) where { T <: AbstractString} look = [] for w in word if w in keys(voc.vocab) From 9c5230d539b6a4816802c7f2bf20f96372dcd5ae Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 06:30:20 +0530 Subject: [PATCH 32/51] updating for Lm --- docs/make.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/make.jl b/docs/make.jl index 93a1b9c0..6beab2fe 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,6 +20,7 @@ makedocs( "Conditional Random Fields" => "crf.md", "Named Entity Recognition" => "ner.md", "ULMFiT" => "ULMFiT.md", + "statistical Language Model" => "LM.md" "API References" => "APIReference.md" ], ) From 37025af49221110a9bed2df4ec579a6fbc674dff Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 06:30:49 +0530 Subject: [PATCH 33/51] test for Langmodels --- test/LM.jl | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/test/LM.jl b/test/LM.jl index 71e16b73..710680cd 100644 --- a/test/LM.jl +++ b/test/LM.jl @@ -15,7 +15,9 @@ using DataStructures @test isequal(vocab.allword ,["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", ""]) @test isequal(vocab.vocab, Dict{String,Int}(""=>1,"c"=>3,"a"=>3,"d"=>2)) #to check lookup function - @test 
lookup(vocab,["a","b","c","alien"]) == ["a", "", "c", ""] + @test lookup(vocab,["a","b","c","alien"]) == ["a", "", "c", ""] + word_set = ["","is","already","there"] + @test_throws ErrorException Vocabulary(word_set, 1, "") end @testset "preprocessing" begin @@ -54,6 +56,52 @@ end fit = (TextAnalysis.counter2(exam,2,2)) @test fit isa DataStructures.DefaultDict @test length(fit) == 5 #length of unique words - @test end + +@testset "language model" begin + + @testset "MLE" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan","is","my","good", "friend","and","He","is","my","brother"] + model = MLE(voc) + fit = model(train, 2, 2) #considering only bigrams + unmaskedscore = score(model, fit, "is" ,"") + @test unmaskedscore == 0.3333333333333333 + @test score(model, fit, "is", "alien") == Inf #context not in vocabulary + @test score(model, fit, "alien", "is") == 0 # word not in vocabulary + end + @testset "Lidstone" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] + model2 = Lidstone(voc, 1.0) + fit = model2(train,2,2) + @test score(model2, fit,"is", "alien") == 0.1 + @test score(model2, fit, "alien", "is") >= 0 + end + @testset "Laplace" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] + model3 = Laplace(voc) + fit2 = model3(train,2,2) + @test score(model3, fit2,"is", "alien") == 0.1 + end + @testset "WittenBellInterpolated" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] + model = WittenBellInterpolated(voc) + fit = model(train,2,2) + @test score(model, fit,"is", "alien") == 0.2 + @test score(model,fit, "alien", "is") == 0.4 + @test score(model,fit,"alien") == 0.2 #should be 
non-zero + end + @testset "KneserNeyInterpolated" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] + model = KneserNeyInterpolated(voc,0.1) + fit = model(train,2,2) + @test score(model, fit,"is", "alie") == 0.2 + @test score(model,fit, "alien", "is") == 0.11000000000000001 + end +end + From 4d4b9eba6de1e448c23acc18b18ff3370505e93a Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Sun, 7 Jun 2020 13:08:32 +0530 Subject: [PATCH 34/51] Update docs/src/LM.md Co-authored-by: Ayush Kaushal --- docs/src/LM.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index 3a32c07e..f0b1e3c6 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -3,7 +3,7 @@ **TextAnalysis** provide following different Language Models - **MLE** - Base Ngram model. -- **Lidstone **- Base Ngram model with Lidstone smoothing. +- **Lidstone** - Base Ngram model with Lidstone smoothing. - **Laplace** - Base Ngram language model with Laplace smoothing. - **WittenBellInterpolated** - Interpolated Version of witten-Bell algorithm. - **KneserNeyInterpolated** - Interpolated version of Kneser -Ney smoothing. 
@@ -177,4 +177,3 @@ julia> lookup(vocabulary ,word) "c" "a" ``` - From c979bceb6ee8796b077a2862fa50a24732294323 Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Sun, 7 Jun 2020 13:08:45 +0530 Subject: [PATCH 35/51] Update docs/src/LM.md Co-authored-by: Ayush Kaushal --- docs/src/LM.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index f0b1e3c6..a810c5dd 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -35,7 +35,7 @@ Arguments: * `gamma`: smoothing arugment gamma - * `discount`: discouting factor for `KneserNeyInterpolated` + * `discount`: discounting factor for `KneserNeyInterpolated` for more information see docstrings of vocabulary From 6cdfea019d5d66bf2a27b54e158faef8b4345dc6 Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Sun, 7 Jun 2020 13:08:55 +0530 Subject: [PATCH 36/51] Update docs/src/LM.md Co-authored-by: Ayush Kaushal --- docs/src/LM.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index a810c5dd..d76b5ce3 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -42,7 +42,7 @@ Arguments: ```julia julia> voc = ["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] -julia>train = ["khan","is","my","good", "friend","and","He","is","my","brother"] +julia> train = ["khan","is","my","good", "friend","and","He","is","my","brother"] # voc and train are used to train vocabulary and model respectively julia> model = MLE(voc) From e566904c91a5315b118df50599e30a95256e7aa1 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:47:44 +0530 Subject: [PATCH 37/51] updating typos --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 6beab2fe..22fffb1e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,7 +20,7 @@ makedocs( "Conditional Random Fields" => "crf.md", "Named 
Entity Recognition" => "ner.md", "ULMFiT" => "ULMFiT.md", - "statistical Language Model" => "LM.md" + "Statistical Language Model" => "LM.md" "API References" => "APIReference.md" ], ) From 16a254651dd163e76f7c3a66099cced66959782a Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:48:23 +0530 Subject: [PATCH 38/51] updating docs typo and errors --- docs/src/LM.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index d76b5ce3..2b529678 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -76,31 +76,31 @@ julia> masked_score = maskedscore(model,fit,"is","alien") ## Evaluation Method -- `score` +### `score` - used to evaluate probablity of word given context (*P(word | context)*) + used to evaluate the probability of word given context (*P(word | context)*) - ```julia +```julia score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString) - ``` +``` ​ In case of Lidstone and Laplace it apply smoothing and, ​ In Interpolated language model, provide Kneserney and WittenBell smoothing -- `maskedscore` +### `maskedscore` It is used to evaluate *score* with masks out of vocabulary words The arguments are the same as for score -- `logscore` +### `logscore` Evaluate the log score of this word in this context. The arguments are the same as for score and maskedscore -- `entropy` +### `entropy` ```julia entropy(m::Langmodel,lm::DefaultDict,text_ngram::word::Vector{T}) where { T <: AbstractString} ``` @@ -109,7 +109,7 @@ julia> masked_score = maskedscore(model,fit,"is","alien") Input text must be Array of ngram of same lengths -- `perplexity` +### `perplexity` Calculates the perplexity of the given text. @@ -121,7 +121,7 @@ julia> masked_score = maskedscore(model,fit,"is","alien") 1. 
`everygram`: Return all possible ngrams generated from sequence of items, as an Array{String,1} - ```julia + ```julia julia> seq = ["To","be","or","not"] julia> a = everygram(seq,min_len=1, max_len=-1) 10-element Array{Any,1}: @@ -134,13 +134,13 @@ julia> masked_score = maskedscore(model,fit,"is","alien") "be or not" "To be or" "To be or not" - ``` + ``` 2. `padding_ngrams`: padding _ngram is used to pad both left and right of sentence and out putting ngrmas of order n It also pad the original input Array of string - ```julia + ```julia julia> example = ["1","2","3","4","5"] julia> example = ["1","2","3","4","5"] @@ -152,7 +152,7 @@ julia> masked_score = maskedscore(model,fit,"is","alien") "3 4" "4 5" "5 " - ``` + ``` ## Vocabulary Struct to store Language models vocabulary From 8481cfb002fb3a29853b0dccb08355e8789fb9ad Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:49:32 +0530 Subject: [PATCH 39/51] updating coding style --- src/LM/api.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/LM/api.jl b/src/LM/api.jl index 2936eb06..1fa9115b 100644 --- a/src/LM/api.jl +++ b/src/LM/api.jl @@ -1,23 +1,23 @@ #TO DO # Doc string function maskedscore(m::Langmodel,temp_lm::DefaultDict,word,context) - score(m,temp_lm,lookup(m.vocab ,[word])[1] ,lookup(m.vocab ,[context])[1]) + score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1]) end -function logscore(m::Langmodel,temp_lm::DefaultDict,word,context) - log2(maskedscore(m,temp_lm,word,context)) +function logscore(m::Langmodel, temp_lm::DefaultDict, word, context) + log2(maskedscore(m, temp_lm, word, context)) end -function entropy(m::Langmodel,lm::DefaultDict,text_ngram) +function entropy(m::Langmodel, lm::DefaultDict, text_ngram) local log_set=Float64[] for ngram in text_ngram ngram = split(ngram) - push!(log_set,logscore(m,lm,ngram[end],join(ngram[1:end-1]," "))) + push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " "))) 
#println(logscore(m,lm,ngram[end],ngram[1:end-1])) end return(sum(log_set)/length(log_set)) end -function perplexity(m::Langmodel,lm::DefaultDict,text_ngram) - return(2^(entropy(m,lm,text_ngram))) +function perplexity(m::Langmodel, lm::DefaultDict, text_ngram) + return(2^(entropy(m, lm, text_ngram))) end From 9dc882c3d51a627804a0538b86979a0de6719ea9 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:50:02 +0530 Subject: [PATCH 40/51] updating doc typo --- src/LM/langmodel.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index 53b87ddd..31bdafd0 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -123,7 +123,7 @@ end score is used to output probablity of word given that context in MLE """ -function score(m::MLE, temp_lm::DefaultDict, word, context = nothing) +function score(m::MLE, temp_lm::DefaultDict, word, context=nothing) prob(m, temp_lm, word, context) end @@ -231,13 +231,13 @@ function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, con local gamma accum = templ_lm[context] s = float(sum(accum)) - for (text, count) in accum - if text == word - alpha=(max(float(count)-m.discount, 0.0) / s) - break - else - alpha = 1/length(m.vocab.vocab) - end + for (text, count) in accum + if text == word + alpha=(max(float(count)-m.discount, 0.0) / s) + break + else + alpha = 1/length(m.vocab.vocab) + end end gamma = (m.discount * count_non_zero_vals(accum) /s) return alpha, gamma From 723103cf392746ada9fcaaaa25196037538ddf89 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:50:42 +0530 Subject: [PATCH 41/51] adding corrected docs --- src/LM/preprocessing.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl index 24622002..e240e53f 100644 --- a/src/LM/preprocessing.jl +++ b/src/LM/preprocessing.jl @@ -43,8 +43,7 @@ padding _ngram is used to pad both left 
and right of sentence and out putting ng # Example ```julia-repl julia> example = ["1","2","3","4","5"] - -julia> example = ["1","2","3","4","5"] + julia> padding_ngrams(example,2,pad_left=true,pad_right=true) 6-element Array{Any,1}: " 1" @@ -55,7 +54,7 @@ julia> padding_ngrams(example,2,pad_left=true,pad_right=true) "5 " ``` """ -function padding_ngram(word::Vector{T}, n=1; pad_left=false,pad_right=false ,left_pad_symbol="", right_pad_symbol ="") where { T <: AbstractString} +function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="", right_pad_symbol ="") where { T <: AbstractString} local seq seq = word if pad_left == true @@ -64,7 +63,7 @@ function padding_ngram(word::Vector{T}, n=1; pad_left=false,pad_right=false ,lef if pad_right == true push!(seq, right_pad_symbol) end - return ngramizenew(seq,n) + return ngramizenew(seq, n) end """ From b14cc743a35cf6f4f8cc741f852a44d3c4a87e40 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:51:31 +0530 Subject: [PATCH 42/51] updating coding style in vocab.jl --- src/LM/vocab.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl index 828beaa1..7ae1e1f0 100644 --- a/src/LM/vocab.jl +++ b/src/LM/vocab.jl @@ -70,29 +70,29 @@ julia> vocabulary.vocab["b"] ``` """ mutable struct Vocabulary -vocab::Dict{String,Int64} +vocab::Dict{String, Int64} unk_cutoff::Int unk_label::String -allword::Array{String,1} +allword::Array{String, 1} end function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} if unk_label in word error("unk_label is in vocab") else - word= push!(word,unk_label) + word= push!(word, unk_label) end vocab = countmap(word) for value in vocab if value[2] Date: Sun, 7 Jun 2020 21:53:37 +0530 Subject: [PATCH 43/51] Update src/LM/vocab.jl Co-authored-by: Ayush Kaushal --- src/LM/vocab.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/LM/vocab.jl b/src/LM/vocab.jl index 7ae1e1f0..949a469d 100644 --- a/src/LM/vocab.jl +++ b/src/LM/vocab.jl @@ -75,6 +75,7 @@ unk_cutoff::Int unk_label::String allword::Array{String, 1} end + function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} if unk_label in word error("unk_label is in vocab") @@ -112,4 +113,3 @@ function lookup(voc::Vocabulary, word::Vector{T}) where { T <: AbstractString} return look end - From feb2d46b307b245eb7de6484b99f056426805110 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 22:10:24 +0530 Subject: [PATCH 44/51] updating docs --- docs/src/LM.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index 2b529678..0e19e630 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -13,7 +13,7 @@ To use the API, we first *Instantiate* desired model and then load it with train set ```julia -(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} +MLE(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} Lidstone(word::Vector{T}, gamma:: Float64, unk_cutoff=1, unk_label="") where { T <: AbstractString} @@ -70,6 +70,7 @@ julia> masked_score = maskedscore(model,fit,"is","alien") #as expected maskedscore is equivalent to unmaskedscore with context replaced with "" ``` + !!! note When you call `MLE(voc)` for the first time, It will update your vocabulary set as well. @@ -84,6 +85,13 @@ julia> masked_score = maskedscore(model,fit,"is","alien") score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString) ``` +Arguments: + +1. `m` : Instance of `Langmodel` struct. +2. `temp_lm`: output of function call of instance of `Langmodel`. +3. `word`: string of word +4. 
`context`: context of given word + ​ In case of Lidstone and Laplace it apply smoothing and, ​ In Interpolated language model, provide Kneserney and WittenBell smoothing From 995581b1ff87b0501962a4796f49205eea9845d9 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 22:16:21 +0530 Subject: [PATCH 45/51] updating typo --- docs/src/LM.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index 0e19e630..6500ee89 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -149,8 +149,6 @@ Arguments: It also pad the original input Array of string ```julia - julia> example = ["1","2","3","4","5"] - julia> example = ["1","2","3","4","5"] julia> padding_ngrams(example,2,pad_left=true,pad_right=true) 6-element Array{Any,1}: From cac50d6d36f3066c96266a66e0eb8b84b5c8529d Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 22:34:54 +0530 Subject: [PATCH 46/51] updating dep for statsbase --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index e17f55b5..57c85376 100644 --- a/Project.toml +++ b/Project.toml @@ -20,6 +20,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" From e2345902fe95c5a60268afbe69256116743e1619 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 22:44:43 +0530 Subject: [PATCH 47/51] updating Manifest --- Manifest.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 358163f2..a900c5f0 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -471,5 +471,4 @@ git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" uuid = 
"a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" version = "0.8.3" -[[StatsBase]] -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" + From 72c4cb040607ee9787d81dd2958121396d48b80f Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Mon, 8 Jun 2020 03:34:58 +0530 Subject: [PATCH 48/51] updating .toml file --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index 57c85376..3f62e755 100644 --- a/Project.toml +++ b/Project.toml @@ -23,7 +23,6 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" -StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] Flux = "< 0.10" From 12fd3c6f6bff81161bf403807552ef70524ebce0 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Mon, 8 Jun 2020 03:50:47 +0530 Subject: [PATCH 49/51] bug fix --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 22fffb1e..7ab14a36 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,7 +20,7 @@ makedocs( "Conditional Random Fields" => "crf.md", "Named Entity Recognition" => "ner.md", "ULMFiT" => "ULMFiT.md", - "Statistical Language Model" => "LM.md" + "Statistical Language Model" => "LM.md", "API References" => "APIReference.md" ], ) From d089d519d0b4de2f785866c23ab9dce776a2d8ca Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Mon, 8 Jun 2020 04:30:15 +0530 Subject: [PATCH 50/51] indentation --- src/LM/api.jl | 2 +- src/TextAnalysis.jl | 2 +- test/LM.jl | 48 ++++++++++++++++++++++----------------------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/LM/api.jl b/src/LM/api.jl index 1fa9115b..55a542c7 100644 --- a/src/LM/api.jl +++ b/src/LM/api.jl @@ -1,6 +1,6 @@ #TO DO # Doc string -function maskedscore(m::Langmodel,temp_lm::DefaultDict,word,context) +function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context) score(m, 
temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1]) end diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 01550ce9..23ef05e6 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -67,7 +67,7 @@ module TextAnalysis export NERTagger, PoSTagger, Tracker, Flux - export Vocabulary,lookup,update + export Vocabulary, lookup, update export everygram, padding_ngram export maskedscore, logscore, entropy, perplexity export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score diff --git a/test/LM.jl b/test/LM.jl index 710680cd..486de598 100644 --- a/test/LM.jl +++ b/test/LM.jl @@ -15,15 +15,15 @@ using DataStructures @test isequal(vocab.allword ,["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", ""]) @test isequal(vocab.vocab, Dict{String,Int}(""=>1,"c"=>3,"a"=>3,"d"=>2)) #to check lookup function - @test lookup(vocab,["a","b","c","alien"]) == ["a", "", "c", ""] - word_set = ["","is","already","there"] + @test lookup(vocab,["a", "b", "c", "alien"]) == ["a", "", "c", ""] + word_set = ["", "is", "already", "there"] @test_throws ErrorException Vocabulary(word_set, 1, "") end @testset "preprocessing" begin @testset "ngramizenew" begin sample_text = ["this", "is", "some", "sample", "text"] - ngrams = TextAnalysis.ngramizenew(sample_text,1) + ngrams = TextAnalysis.ngramizenew(sample_text, 1) @test isequal(ngrams, ["this", "is", "some", "sample", "text"]) @@ -35,25 +35,25 @@ end end @testset "Padding function" begin - example = ["1","2","3","4","5"] - padded=padding_ngram(example,2,pad_left=true,pad_right=true) + example = ["1", "2", "3", "4", "5"] + padded=padding_ngram(example, 2, pad_left=true, pad_right=true) @test isequal(padded,[" 1", "1 2", "2 3", "3 4", "4 5", "5 "]) - @test isequal(example, ["","1","2","3","4","5",""]) + @test isequal(example, ["", "1", "2", "3", "4", "5", ""]) - example = ["1","2","3","4","5"] #if used - padded=padding_ngram(example,2,pad_right=true) + example = ["1", "2", "3", "4", "5"] #if 
used + padded=padding_ngram(example, 2, pad_right=true) @test isequal(padded,["1 2", "2 3", "3 4", "4 5", "5 "]) end @testset "everygram function" begin - example = ["1","2","3","4","5"] - everyngms = everygram(example,min_len=1,max_len=2) + example = ["1", "2", "3", "4", "5"] + everyngms = everygram(example, min_len=1, max_len=2) @test isequal(everyngms, ["1", "2", "3", "4", "5", "1 2", "2 3", "3 4", "4 5"]) end end @testset "counter" begin exam = ["To", "be", "or", "not", "to", "be","To", "be", "or", "not", "to", "be"] - fit = (TextAnalysis.counter2(exam,2,2)) + fit = (TextAnalysis.counter2(exam, 2, 2)) @test fit isa DataStructures.DefaultDict @test length(fit) == 5 #length of unique words end @@ -61,8 +61,8 @@ end @testset "language model" begin @testset "MLE" begin - voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] - train = ["khan","is","my","good", "friend","and","He","is","my","brother"] + voc =["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model = MLE(voc) fit = model(train, 2, 2) #considering only bigrams unmaskedscore = score(model, fit, "is" ,"") @@ -75,7 +75,7 @@ end voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model2 = Lidstone(voc, 1.0) - fit = model2(train,2,2) + fit = model2(train, 2, 2) @test score(model2, fit,"is", "alien") == 0.1 @test score(model2, fit, "alien", "is") >= 0 end @@ -83,25 +83,25 @@ end voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model3 = Laplace(voc) - fit2 = model3(train,2,2) + fit2 = model3(train, 2, 2) @test score(model3, fit2,"is", "alien") == 0.1 end @testset "WittenBellInterpolated" begin - voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + 
voc =["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model = WittenBellInterpolated(voc) - fit = model(train,2,2) + fit = model(train, 2, 2) @test score(model, fit,"is", "alien") == 0.2 - @test score(model,fit, "alien", "is") == 0.4 - @test score(model,fit,"alien") == 0.2 #should be non-zero + @test score(model, fit, "alien", "is") == 0.4 + @test score(model, fit,"alien") == 0.2 #should be non-zero end @testset "KneserNeyInterpolated" begin - voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + voc =["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] - model = KneserNeyInterpolated(voc,0.1) - fit = model(train,2,2) - @test score(model, fit,"is", "alie") == 0.2 - @test score(model,fit, "alien", "is") == 0.11000000000000001 + model = KneserNeyInterpolated(voc, 0.1) + fit = model(train, 2, 2) + @test score(model, fit, "is", "alie") == 0.2 + @test score(model, fit, "alien", "is") == 0.11000000000000001 end end From 85f93a8e392e0c918f5855123689a4b93de5384d Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Mon, 8 Jun 2020 19:29:04 +0530 Subject: [PATCH 51/51] updated using --- src/LM/counter.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/LM/counter.jl b/src/LM/counter.jl index dd595b71..4ec876e7 100644 --- a/src/LM/counter.jl +++ b/src/LM/counter.jl @@ -1,7 +1,4 @@ using DataStructures -import DataStructures.Accumulator -import DataStructures.DefaultDict -import DataStructures.counter """ counter is used to make conditional distribution, which is used by score functions to