diff --git a/Manifest.toml b/Manifest.toml
index 187c33a1..a900c5f0 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -470,3 +470,5 @@ deps = ["BinaryProvider", "Libdl", "Printf"]
 git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d"
 uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 version = "0.8.3"
+
+
diff --git a/Project.toml b/Project.toml
index 02bcd701..3f62e755 100644
--- a/Project.toml
+++ b/Project.toml
@@ -20,6 +20,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
 WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
diff --git a/docs/make.jl b/docs/make.jl
index 93a1b9c0..7ab14a36 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -20,6 +20,7 @@ makedocs(
         "Conditional Random Fields" => "crf.md",
         "Named Entity Recognition" => "ner.md",
         "ULMFiT" => "ULMFiT.md",
+        "Statistical Language Model" => "LM.md",
         "API References" => "APIReference.md"
     ],
 )
diff --git a/docs/src/LM.md b/docs/src/LM.md
new file mode 100644
index 00000000..6500ee89
--- /dev/null
+++ b/docs/src/LM.md
@@ -0,0 +1,185 @@
+# Statistical Language Model
+
+**TextAnalysis** provides the following language models:
+
+- **MLE** - Base ngram model.
+- **Lidstone** - Base ngram model with Lidstone smoothing.
+- **Laplace** - Base ngram language model with Laplace smoothing.
+- **WittenBellInterpolated** - Interpolated version of the Witten-Bell algorithm.
+- **KneserNeyInterpolated** - Interpolated version of Kneser-Ney smoothing.
+
+## APIs
+
+To use the API, we first *instantiate* the desired model and then fit it on a training set:
+
+```julia
+MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Lidstone(word::Vector{T}, gamma::Float64, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+KneserNeyInterpolated(word::Vector{T}, discount::Float64=0.1, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+(lm::Langmodel)(text, min::Integer, max::Integer)
+```
+
+Arguments:
+
+* `word`: array of strings used to build the vocabulary.
+
+* `unk_cutoff`: tokens with counts greater than or equal to the cutoff value will be considered part of the vocabulary.
+
+* `unk_label`: token used for unknown (out-of-vocabulary) words, `"<unk>"` by default.
+
+* `gamma`: smoothing argument gamma (for `Lidstone`).
+
+* `discount`: discounting factor for `KneserNeyInterpolated`.
+
+For more information see the docstrings of `Vocabulary`.
+
+```julia
+julia> voc = ["my","name","is","salman","khan","and","he","is","shahrukh","Khan"]
+
+julia> train = ["khan","is","my","good","friend","and","He","is","my","brother"]
+# voc and train are used to train the vocabulary and the model respectively
+
+julia> model = MLE(voc)
+MLE(Vocabulary(Dict("khan"=>1,"name"=>1,"<unk>"=>1,"salman"=>1,"is"=>2,"Khan"=>1,"my"=>1,"he"=>1,"shahrukh"=>1,"and"=>1…), 1, "<unk>", ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan", "<unk>"]))
+
+julia> print(voc)
+11-element Array{String,1}:
+ "my"
+ "name"
+ "is"
+ "salman"
+ "khan"
+ "and"
+ "he"
+ "is"
+ "shahrukh"
+ "Khan"
+ "<unk>"
+# note that the "<unk>" token has been added to voc
+
+julia> fit = model(train,2,2) # considering only bigrams
+
+julia> unmaskedscore = score(model, fit, "is", "<unk>") # P(word | context) without replacing the context word with "<unk>"
+0.3333333333333333
+
+julia> masked_score = maskedscore(model,fit,"is","alien")
+0.3333333333333333
+# as expected, maskedscore is equivalent to unmaskedscore with the unseen context replaced by "<unk>"
+```
+
+!!! note
+
+    When you call `MLE(voc)` for the first time, it will update your vocabulary set as well.
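+
+The smoothed models are instantiated the same way. Below is a minimal sketch with
+`Lidstone` (the gamma value is an arbitrary illustration; note that constructing a
+model appends `"<unk>"` to the vocabulary vector, so a fresh vector is used here):
+
+```julia
+julia> voc2 = ["my","name","is","salman","khan","and","he","is","shahrukh","Khan"]
+
+julia> lid_model = Lidstone(voc2, 2.0)  # additive smoothing with gamma = 2.0
+
+julia> lid_fit = lid_model(train, 2, 2) # fit bigrams, just like MLE
+
+julia> score(lid_model, lid_fit, "is", "my") # smoothed P("is" | "my")
+```
+
+`Laplace`, `WittenBellInterpolated` and `KneserNeyInterpolated` follow the same pattern,
+with `KneserNeyInterpolated` additionally taking a `discount` argument.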
+
+## Evaluation Method
+
+### `score`
+
+Used to evaluate the probability of a word given its context (*P(word | context)*).
+
+```julia
+score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString)
+```
+
+Arguments:
+
+1. `m` : Instance of a `Langmodel` struct.
+2. `temp_lm`: the fitted model, i.e. the output of calling the `Langmodel` instance on the training text.
+3. `word`: the word to score.
+4. `context`: the context of the given word.
+
+For `Lidstone` and `Laplace` models, additive smoothing is applied; for the interpolated
+language models, Kneser-Ney or Witten-Bell smoothing is used.
+
+### `maskedscore`
+
+Evaluates *score* with out-of-vocabulary words masked by the unknown label.
+
+The arguments are the same as for `score`.
+
+### `logscore`
+
+Evaluates the base-2 log score of a word in the given context.
+
+The arguments are the same as for `score` and `maskedscore`.
+
+### `entropy`
+
+```julia
+entropy(m::Langmodel, lm::DefaultDict, text_ngram::Vector{T}) where { T <: AbstractString}
+```
+
+Calculates the cross-entropy of the model for the given evaluation text.
+
+The input text must be an array of ngrams of the same order.
+
+### `perplexity`
+
+Calculates the perplexity of the given text.
+
+This is simply 2^cross-entropy (see `entropy`) for the text, so the arguments are the same as for `entropy`.
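+
+Putting these together on the bigram model fitted above, a usage sketch (no outputs
+are shown, since the exact values depend on the fitted model):
+
+```julia
+julia> logscore(model, fit, "my", "is")    # log2 P("my" | "is")
+
+julia> test_ngrams = ["khan is", "is my"]  # evaluation bigrams, same order as `fit`
+
+julia> entropy(model, fit, test_ngrams)    # cross-entropy over the evaluation bigrams
+
+julia> perplexity(model, fit, test_ngrams) # equals 2^entropy
+```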
+
+## Preprocessing
+
+The following functions are provided for preprocessing:
+
+1. `everygram`: Return all possible ngrams generated from a sequence of items, as an Array{String,1}.
+
+   ```julia
+   julia> seq = ["To","be","or","not"]
+   julia> a = everygram(seq,min_len=1, max_len=-1)
+   10-element Array{Any,1}:
+    "To"
+    "be"
+    "or"
+    "not"
+    "To be"
+    "be or"
+    "or not"
+    "To be or"
+    "be or not"
+    "To be or not"
+   ```
+
+2. `padding_ngram`: used to pad both the left and right of a sentence and to output ngrams of order n.
+
+   It also pads the original input Array of strings in place.
+
+   ```julia
+   julia> example = ["1","2","3","4","5"]
+   julia> padding_ngram(example,2,pad_left=true,pad_right=true)
+   6-element Array{Any,1}:
+    "<s> 1"
+    "1 2"
+    "2 3"
+    "3 4"
+    "4 5"
+    "5 </s>"
+   ```
+
+## Vocabulary
+
+Struct to store the language model's vocabulary.
+
+It checks membership and filters items by comparing their counts to a cutoff value.
+
+It also adds a special "unknown" token which unseen words are mapped to.
+
+```julia
+julia> words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"]
+
+julia> vocabulary = Vocabulary(words, 2)
+Vocabulary(Dict("<unk>"=>1,"c"=>3,"a"=>3,"d"=>2), 2, "<unk>")
+
+# look up a sequence of words in the vocabulary
+julia> word = ["a", "-", "d", "c", "a"]
+
+julia> lookup(vocabulary, word)
+5-element Array{Any,1}:
+ "a"
+ "<unk>"
+ "d"
+ "c"
+ "a"
+```
diff --git a/src/LM/api.jl b/src/LM/api.jl
new file mode 100644
index 00000000..55a542c7
--- /dev/null
+++ b/src/LM/api.jl
@@ -0,0 +1,23 @@
+# TODO: add docstrings
+function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
+    score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
+end
+
+function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
+    log2(maskedscore(m, temp_lm, word, context))
+end
+
+function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
+    local log_set = Float64[]
+    for ngram in text_ngram
+        ngram = split(ngram)
+        push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " ")))
+    end
+    # cross-entropy is the negative mean log2 probability of the evaluation ngrams
+    return -(sum(log_set) / length(log_set))
+end
+
+function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
+    return 2^(entropy(m, lm, text_ngram))
+end
diff --git a/src/LM/counter.jl b/src/LM/counter.jl
new file mode 100644
index 00000000..4ec876e7
--- /dev/null
+++ b/src/LM/counter.jl
@@ -0,0 +1,17 @@
+using DataStructures
+
+"""
+    counter2(data, min::Integer, max::Integer)
+
+Build the conditional frequency distribution used by the score functions: for every
+ngram of `data` with order between `min` and `max`, count how often each word follows
+each context.
+"""
+function counter2(data, min::Integer, max::Integer)
+    data = everygram(data, min_len=min, max_len=max)
+    data = split.(data)
+    temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String}))
+    for i in 1:length(data)
+        history, word = data[i][1:end-1], data[i][end]
+        temp_lm[join(history, " ")][word] += 1
+    end
+    return temp_lm
+end
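+
+# Sketch of the returned structure (illustrative, not actual REPL output):
+# counter2(["a", "b", "a", "b"], 2, 2) yields a DefaultDict mapping each context to an
+# Accumulator of the words that follow it, e.g.
+#   "a" => Accumulator("b" => 2)
+#   "b" => Accumulator("a" => 1)
+# so that `temp_lm[context][word]` is the raw count of `word` appearing after `context`.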
diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl
new file mode 100644
index 00000000..31bdafd0
--- /dev/null
+++ b/src/LM/langmodel.jl
@@ -0,0 +1,246 @@
+abstract type Langmodel end
+abstract type gammamodel <: Langmodel end                 # base ngram model with add-one smoothing
+abstract type InterpolatedLanguageModel <: Langmodel end  # interpolated language model with smoothing
+
+# DataType MLE
+# Type for providing MLE ngram model scores.
+# Implementation of the base ngram model.
+
+struct MLE <: Langmodel
+    vocab::Vocabulary
+end
+
+"""
+    MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate Type for providing MLE ngram model scores.
+
+Implementation of the base ngram model.
+"""
+function MLE(word, unk_cutoff=1, unk_label="<unk>")
+    MLE(Vocabulary(word, unk_cutoff, unk_label))
+end
+
+function (lm::MLE)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+struct Lidstone <: gammamodel
+    vocab::Vocabulary
+    gamma::Float64
+end
+
+"""
+    Lidstone(word::Vector{T}, gamma::Float64, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate the `Lidstone` type, which provides Lidstone-smoothed scores.
+
+In addition to the initialization arguments of the base ngram model, it also requires
+a number by which to increase the counts, `gamma`.
+"""
+function Lidstone(word, gamma=1.0, unk_cutoff=1, unk_label="<unk>")
+    Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma)
+end
+
+function (lm::Lidstone)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+"""
+    Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate the `Laplace` type, which provides Laplace-smoothed scores.
+
+In addition to the initialization arguments of the base ngram model, the count
+increment is fixed at `gamma = 1`.
+"""
+struct Laplace <: gammamodel
+    vocab::Vocabulary
+    gamma::Float64
+end
+
+function Laplace(word, unk_cutoff=1, unk_label="<unk>")
+    Laplace(Vocabulary(word, unk_cutoff, unk_label), 1.0)
+end
+
+function (lm::Laplace)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+"""
+    score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString)
+
+Output the probability of a word given its context.
+
+Applies add-gamma smoothing for Lidstone and Laplace (`gammamodel`) models.
+"""
+function score(m::gammamodel, temp_lm::DefaultDict, word, context)
+    accum = temp_lm[context]
+    s = float(sum(accum) + (m.gamma) * length(m.vocab.vocab))
+    for (text, count) in accum
+        if text == word
+            return float(count + m.gamma) / s
+        end
+    end
+    return float(m.gamma) / s
+end
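+
+# For gammamodel types the smoothed estimate is
+#     P(word | context) = (count(context, word) + gamma) / (sum(counts(context)) + gamma * V)
+# where V is the vocabulary size. For example, with gamma = 1, a context seen twice, a
+# word never seen after that context, and V = 11, the score is (0 + 1) / (2 + 1 * 11) = 1/13.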
+
+"""
+    prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)
+
+Get the probability of a word given its context.
+
+In other words, for the given context, compute the relative frequency of the word.
+"""
+function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)
+    if context == nothing || context == ""
+        return 1 / float(length(templ_lm)) # provide uniform distribution over contexts
+    else
+        accum = templ_lm[context]
+    end
+    s = float(sum(accum))
+    for (text, count) in accum
+        if text == word
+            return float(count) / s
+        end
+    end
+    if context in keys(m.vocab.vocab)
+        return 0
+    end
+    return Inf
+end
+
+"""
+    score(m::MLE, temp_lm::DefaultDict, word::AbstractString, context::AbstractString)
+
+Output the probability of a word given its context for an MLE model.
+"""
+function score(m::MLE, temp_lm::DefaultDict, word, context=nothing)
+    prob(m, temp_lm, word, context)
+end
+
+struct WittenBellInterpolated <: InterpolatedLanguageModel
+    vocab::Vocabulary
+end
+
+"""
+    WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate Type for providing an interpolated version of Witten-Bell smoothing.
+
+The idea to abstract this comes from Chen & Goodman 1995.
+"""
+function WittenBellInterpolated(word, unk_cutoff=1, unk_label="<unk>")
+    WittenBellInterpolated(Vocabulary(word, unk_cutoff, unk_label))
+end
+
+function (lm::WittenBellInterpolated)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+# alpha_gamma function for WittenBellInterpolated
+function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
+    local alpha
+    local gam
+    accum = templ_lm[context]
+    s = float(sum(accum))
+    for (text, count) in accum
+        if text == word
+            alpha = float(count) / s
+            break
+        else
+            alpha = 1 / s
+        end
+    end
+
+    gam = gamma(accum)
+    return alpha * (1 - gam), gam
+end
+
+function count_non_zero_vals(accum::Accumulator)
+    return length(accum)
+end
+
+function gamma(accum)
+    nplus = count_non_zero_vals(accum)
+    return nplus / (nplus + float(sum(accum)))
+end
+
+"""
+    score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString)
+
+Output the probability of a word given its context for an interpolated language model.
+
+Applies Kneser-Ney or Witten-Bell smoothing depending on the concrete subtype.
+"""
+function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
+    if context == nothing || context == ""
+        return prob(m, temp_lm, word, context)
+    end
+    if context in keys(temp_lm)
+        alpha, gamma = alpha_gammma(m, temp_lm, word, context)
+        return alpha + gamma * score(m, temp_lm, word, context_reduce(context))
+    else
+        return score(m, temp_lm, word, context_reduce(context))
+    end
+end
+
+function context_reduce(context)
+    context = split(context)
+    join(context[2:end], " ")
+end
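+
+# The interpolated score backs off by dropping the leftmost context word on each
+# recursive call, e.g. context_reduce("to be or") == "be or", until the context is
+# empty and `prob` falls back to a uniform distribution over the stored contexts.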
+
+struct KneserNeyInterpolated <: InterpolatedLanguageModel
+    vocab::Vocabulary
+    discount::Float64
+end
+
+"""
+    KneserNeyInterpolated(word::Vector{T}, discount::Float64=0.1, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate Type for providing a Kneser-Ney interpolated language model.
+
+The idea to abstract this comes from Chen & Goodman 1995.
+"""
+function KneserNeyInterpolated(word, disc=0.1, unk_cutoff=1, unk_label="<unk>")
+    KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), disc)
+end
+
+function (lm::KneserNeyInterpolated)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+# alpha_gamma function for KneserNeyInterpolated
+function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
+    local alpha
+    local gamma
+    accum = templ_lm[context]
+    s = float(sum(accum))
+    for (text, count) in accum
+        if text == word
+            alpha = max(float(count) - m.discount, 0.0) / s
+            break
+        else
+            alpha = 1 / length(m.vocab.vocab)
+        end
+    end
+    gamma = m.discount * count_non_zero_vals(accum) / s
+    return alpha, gamma
+end
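+
+# In the Kneser-Ney case alpha is the discounted relative frequency,
+# max(count - discount, 0) / total, and gamma is the mass set aside for backoff,
+# discount * (number of distinct continuations of the context) / total, which the
+# interpolated score then multiplies into the lower-order estimate.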
diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl
new file mode 100644
index 00000000..e240e53f
--- /dev/null
+++ b/src/LM/preprocessing.jl
@@ -0,0 +1,101 @@
+"""
+    everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
+
+Return all possible ngrams generated from a sequence of items, as an Array{String,1}.
+
+# Example
+
+```julia-repl
+julia> seq = ["To","be","or","not"]
+julia> a = everygram(seq,min_len=1, max_len=-1)
+10-element Array{Any,1}:
+ "To"
+ "be"
+ "or"
+ "not"
+ "To be"
+ "be or"
+ "or not"
+ "To be or"
+ "be or not"
+ "To be or not"
+```
+"""
+function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
+    ngram = []
+    if max_len == -1
+        max_len = length(seq)
+    end
+    for n in range(min_len, stop=max_len)
+        temp = ngramizenew(seq, n)
+        ngram = append!(ngram, temp)
+    end
+    return ngram
+end
+
+"""
+    padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol="</s>") where { T <: AbstractString}
+
+padding_ngram is used to pad both the left and right of a sentence and to output ngrams of order n.
+
+It also pads the original input Array of strings in place.
+
+# Example
+```julia-repl
+julia> example = ["1","2","3","4","5"]
+
+julia> padding_ngram(example,2,pad_left=true,pad_right=true)
+6-element Array{Any,1}:
+ "<s> 1"
+ "1 2"
+ "2 3"
+ "3 4"
+ "4 5"
+ "5 </s>"
+```
+"""
+function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol="</s>") where { T <: AbstractString}
+    local seq
+    seq = word
+    if pad_left == true
+        prepend!(seq, [left_pad_symbol])
+    end
+    if pad_right == true
+        push!(seq, right_pad_symbol)
+    end
+    return ngramizenew(seq, n)
+end
+
+"""
+    ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
+
+ngramizenew is used to generate the ngrams of the orders given in `nlist`.
+
+# Example
+```julia-repl
+julia> seq = ["To","be","or","not","To","not","To","not"]
+julia> ngramizenew(seq, 2)
+7-element Array{Any,1}:
+ "To be"
+ "be or"
+ "or not"
+ "not To"
+ "To not"
+ "not To"
+ "To not"
+```
+"""
+function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
+    n_words = length(words)
+
+    tokens = []
+
+    for n in nlist
+        for index in 1:(n_words - n + 1)
+            token = join(words[index:(index + n - 1)], " ")
+            push!(tokens, token)
+        end
+    end
+    return tokens
+end
diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl
new file mode 100644
index 00000000..949a469d
--- /dev/null
+++ b/src/LM/vocab.jl
@@ -0,0 +1,115 @@
+"""
+    Vocabulary(word, unk_cutoff=1, unk_label="<unk>")
+
+Stores the language model vocabulary.
+
+Satisfies two common language modeling requirements for a vocabulary:
+- When checking membership and calculating its size, it filters items
+  by comparing their counts to a cutoff value.
+- It adds a special "unknown" token which unseen words are mapped to.
+
+# Example
+```julia-repl
+julia> words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"]
+julia> vocabulary = Vocabulary(words, 2)
+Vocabulary(Dict("<unk>"=>1,"c"=>3,"a"=>3,"d"=>2), 2, "<unk>")
+
+julia> vocabulary.vocab
+Dict{String,Int64} with 4 entries:
+ "<unk>" => 1
+ "c"     => 3
+ "a"     => 3
+ "d"     => 2
+
+Tokens with counts greater than or equal to the cutoff value will
+be considered part of the vocabulary.
+julia> vocabulary.vocab["c"]
+3
+
+julia> "c" in keys(vocabulary.vocab)
+true
+
+julia> vocabulary.vocab["d"]
+2
+
+julia> "d" in keys(vocabulary.vocab)
+true
+
+Tokens with counts less than the cutoff value are not considered part of the
+vocabulary and are removed from the count dictionary (they are still kept in `allword`).
+julia> "b" in keys(vocabulary.vocab)
+false
+
+julia> "<unk>" in keys(vocabulary.vocab)
+true
+
+We can look up words in a vocabulary using the `lookup` function.
+"Unseen" words (with counts less than the cutoff) are looked up as the unknown label.
+If given one word (a string) as input, this method will return a string.
+julia> lookup(vocabulary, "a")
+"a"
+
+julia> word = ["a", "-", "d", "c", "a"]
+
+julia> lookup(vocabulary, word)
+5-element Array{Any,1}:
+ "a"
+ "<unk>"
+ "d"
+ "c"
+ "a"
+
+If given a sequence, it returns an Array{Any,1} of the looked-up words, as shown above.
+
+It is possible to update the counts after the vocabulary has been created.
+julia> update(vocabulary, ["b","c","c"])
+1
+
+julia> vocabulary.vocab["b"]
+1
+```
+"""
+mutable struct Vocabulary
+    vocab::Dict{String, Int64}
+    unk_cutoff::Int
+    unk_label::String
+    allword::Array{String, 1}
+end
+
+function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+    if unk_label in word
+        error("unk_label is in vocab")
+    else
+        word = push!(word, unk_label)
+    end
+    vocab = countmap(word)
+    # tokens below the cutoff are dropped from `vocab` but kept in `allword`
+    for value in vocab
+        if value[2] < unk_cutoff && value[1] != unk_label
+            delete!(vocab, value[1])
+        end
+    end
+    Vocabulary(vocab, unk_cutoff, unk_label, word)
+end
+
+function lookup(voc::Vocabulary, word::AbstractString)
+    # single words below the cutoff are mapped to the unknown label
+    return word in keys(voc.vocab) ? word : voc.unk_label
+end
+
+function lookup(voc::Vocabulary, word::Vector{T}) where { T <: AbstractString}
+    # a sequence is looked up element-wise
+    look_up = Any[]
+    for w in word
+        push!(look_up, lookup(voc, w))
+    end
+    return look_up
+end
diff --git a/test/LM.jl b/test/LM.jl
new file mode 100644
--- /dev/null
+++ b/test/LM.jl
+using DataStructures
+
+@testset "Vocabulary" begin
+    words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"]
+    vocab = Vocabulary(words, 2, "<unk>")
+    @test vocab isa Vocabulary
+    @test vocab.vocab isa Dict
+    @test vocab.unk_cutoff isa Int
+    @test vocab.unk_label isa String
+    @test vocab.allword isa Array{String,1}
+    @test length(vocab.vocab) == 4 # only 4 distinct tokens remain after applying the cutoff of 2
+    @test isequal(vocab.unk_cutoff, 2)
+    @test vocab.unk_label == "<unk>"
+    @test isequal(vocab.allword, ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", "<unk>"])
+    @test isequal(vocab.vocab, Dict{String,Int}("<unk>"=>1, "c"=>3, "a"=>3, "d"=>2))
+    # to check the lookup function
+    @test lookup(vocab, ["a", "b", "c", "alien"]) == ["a", "<unk>", "c", "<unk>"]
+    word_set = ["<unk>", "is", "already", "there"]
+    @test_throws ErrorException Vocabulary(word_set, 1, "<unk>")
+end
+
+@testset "preprocessing" begin
+    @testset "ngramizenew" begin
+        sample_text = ["this", "is", "some", "sample", "text"]
+        ngrams = TextAnalysis.ngramizenew(sample_text, 1)
+
+        @test isequal(ngrams, ["this", "is", "some", "sample", "text"])
+
+        ngrams = TextAnalysis.ngramizenew(sample_text, 2)
+        @test isequal(ngrams, ["this is", "is some", "some sample", "sample text"])
+
+        ngrams = TextAnalysis.ngramizenew(sample_text, 1, 2)
+        @test isequal(ngrams, ["this", "is", "some", "sample", "text", "this is", "is some", "some sample", "sample text"])
+    end
+
+    @testset "Padding function" begin
+        example = ["1", "2", "3", "4", "5"]
+        padded = padding_ngram(example, 2, pad_left=true, pad_right=true)
+        @test isequal(padded, ["<s> 1", "1 2", "2 3", "3 4", "4 5", "5 </s>"])
+        @test isequal(example, ["<s>", "1", "2", "3", "4", "5", "</s>"])
+
+        example = ["1", "2", "3", "4", "5"] # padding mutates the input, so it is redefined here
+        padded = padding_ngram(example, 2, pad_right=true)
+        @test isequal(padded, ["1 2", "2 3", "3 4", "4 5", "5 </s>"])
+    end
+    @testset "everygram function" begin
+        example = ["1", "2", "3", "4", "5"]
+        everyngms = everygram(example, min_len=1, max_len=2)
+        @test isequal(everyngms, ["1", "2", "3", "4", "5", "1 2", "2 3", "3 4", "4 5"])
+    end
+end
+
+@testset "counter" begin
+    exam = ["To", "be", "or", "not", "to", "be", "To", "be", "or", "not", "to", "be"]
+    fit = (TextAnalysis.counter2(exam, 2, 2))
+    @test fit isa DataStructures.DefaultDict
+    @test length(fit) == 5 # number of unique bigram contexts
+end
+
+@testset "language model" begin
+
+    @testset "MLE" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model = MLE(voc)
+        fit = model(train, 2, 2) # considering only bigrams
+        unmaskedscore = score(model, fit, "is", "<unk>")
+        @test unmaskedscore == 0.3333333333333333
+        @test score(model, fit, "is", "alien") == Inf # context not in vocabulary
+        @test score(model, fit, "alien", "is") == 0   # word not in vocabulary
+    end
+
+    @testset "Lidstone" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model2 = Lidstone(voc, 1.0)
+        fit = model2(train, 2, 2)
+        @test score(model2, fit, "is", "alien") == 0.1
+        @test score(model2, fit, "alien", "is") >= 0
+    end
+
+    @testset "Laplace" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model3 = Laplace(voc)
+        fit2 = model3(train, 2, 2)
+        @test score(model3, fit2, "is", "alien") == 0.1
+    end
+
+    @testset "WittenBellInterpolated" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model = WittenBellInterpolated(voc)
+        fit = model(train, 2, 2)
+        @test score(model, fit, "is", "alien") == 0.2
+        @test score(model, fit, "alien", "is") == 0.4
+        @test score(model, fit, "alien") == 0.2 # should be non-zero
+    end
+
+    @testset "KneserNeyInterpolated" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model = KneserNeyInterpolated(voc, 0.1)
+        fit = model(train, 2, 2)
+        @test score(model, fit, "is", "alie") == 0.2
+        @test score(model, fit, "alien", "is") == 0.11000000000000001
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 59b3a4dd..867b284d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -28,3 +28,4 @@ include("taggingschemes.jl")
 include("averagePerceptronTagger.jl")
 include("evaluation_metrics.jl")
 include("ulmfit.jl")
+include("LM.jl")