Skip to content

Commit

Permalink
Merge pull request #282 from rssdev10/fix/style_improvement
Browse files Browse the repository at this point in the history
Fix/style improvement
  • Loading branch information
rssdev10 authored Feb 7, 2024
2 parents 3ec01c8 + 0230573 commit 992af7e
Show file tree
Hide file tree
Showing 31 changed files with 538 additions and 502 deletions.
8 changes: 4 additions & 4 deletions docs/make.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
using Documenter, TextAnalysis

makedocs(
modules = [TextAnalysis],
sitename = "TextAnalysis",
format = Documenter.HTML(
modules=[TextAnalysis],
sitename="TextAnalysis",
format=Documenter.HTML(
),
pages = [
pages=[
"Home" => "index.md",
"Documents" => "documents.md",
"Corpus" => "corpus.md",
Expand Down
2 changes: 1 addition & 1 deletion src/LM/api.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ It is used to evaluate score with masks out of vocabulary words
The arguments are the same as for [`score`](@ref)
"""
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
    # Run both arguments through the model vocabulary before scoring. `lookup` is
    # called with one-element vectors, and `[begin]` takes the single result back out.
    masked_word = lookup(m.vocab, [word])[begin]
    masked_context = lookup(m.vocab, [context])[begin]
    # Score exactly once; the result is converted to Float64 by the return annotation.
    return score(m, temp_lm, masked_word, masked_context)
end

"""
Expand Down
104 changes: 51 additions & 53 deletions src/LM/langmodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ abstract type Langmodel end
abstract type gammamodel <: Langmodel end #BaseNgram with Add-one smoothing algo
abstract type InterpolatedLanguageModel <: Langmodel end #Interpolated language model with smoothing

#DataType MLE
#Type for providing MLE ngram model scores.
#Implementation of Base Ngram Model.
# DataType MLE
# Type for providing MLE ngram model scores.
# Implementation of Base Ngram Model.

struct MLE <: Langmodel
vocab::Vocabulary
Expand All @@ -18,13 +18,13 @@ Initiate Type for providing MLE ngram model scores.
Implementation of Base Ngram Model.
"""
function MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Build the vocabulary from the given words and wrap it in the model type.
    vocab = Vocabulary(word, unk_cutoff, unk_label)
    return MLE(vocab)
end

function (lm::MLE)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Pass the text through the model vocabulary, convert the result to an
    # Array{String}, and forward it to `counter2` together with `min` and `max`.
    looked_up = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(looked_up, min, max)
end

Expand All @@ -41,18 +41,19 @@ Function to initiate Type(Lidstone) for providing Lidstone-smoothed scores.
In addition to initialization arguments from BaseNgramModel also requires
a number by which to increase the counts, gamma.
"""
function Lidstone(word::Vector{T}, gamma=1.0, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Build the vocabulary from the given words and store it alongside `gamma`.
    vocab = Vocabulary(word, unk_cutoff, unk_label)
    return Lidstone(vocab, gamma)
end

function (lm::Lidstone)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Pass the text through the model vocabulary, convert the result to an
    # Array{String}, and forward it to `counter2` together with `min` and `max`.
    looked_up = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(looked_up, min, max)
end

"""
Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
Function to initiate Type(Laplace) for providing Laplace-smoothed scores.
In addition to initialization arguments from BaseNgramModel also requires
Expand All @@ -63,11 +64,11 @@ struct Laplace <: gammamodel
gamma::Float64
end

function Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Build the vocabulary from the given words; the second field is fixed at 1.0.
    vocab = Vocabulary(word, unk_cutoff, unk_label)
    return Laplace(vocab, 1.0)
end

function (lm::Laplace)(text, min::Integer, max::Integer)
function (lm::Laplace)(text, min::Integer, max::Integer)
text = lookup(lm.vocab, text)
text = convert(Array{String}, text)
return counter2(text, min, max)
Expand All @@ -84,35 +85,32 @@ Add-one smoothing to Lidstone or Laplace(gammamodel) models
function score(m::gammamodel, temp_lm::DefaultDict, word, context)
    # Smoothed score of `word` under `context`:
    #     (count(word) + gamma) / (sum of counts + gamma * vocabulary size)
    accum = temp_lm[context]
    total = float(sum(accum) + m.gamma * length(m.vocab.vocab))
    # Look the count up by key, defaulting to 0 for an unseen word.
    # `findfirst(isequal(word), accum)` is not a substitute: on a dictionary it
    # tests the values (the counts) and returns a key, so the count is never found.
    observed = get(accum, word, 0)
    return float(observed + m.gamma) / total
end

"""
$(TYPEDSIGNATURES)
To get probability of word given that context
In other words, for given context calculate frequency distribution of word
"""
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)::Float64
    # Without a context (nothing or empty) return 1 / length(templ_lm).
    (isnothing(context) || isempty(context)) && return 1.0 / length(templ_lm)

    accum = templ_lm[context]
    total = float(sum(accum))
    # Relative frequency of `word` among the counts stored for `context`.
    for (text, count) in accum
        text == word && return float(count) / total
    end
    # `word` has no count under `context`: 0.0 if `context` is a key of the
    # vocabulary, Inf otherwise.
    return context in keys(m.vocab.vocab) ? 0.0 : Inf
end

"""
Expand All @@ -125,8 +123,8 @@ function score(m::MLE, temp_lm::DefaultDict, word, context=nothing)
prob(m, temp_lm, word, context)
end

# Interpolated language model type that carries only its vocabulary.
struct WittenBellInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
end

"""
Expand All @@ -137,41 +135,41 @@ Initiate Type for providing Interpolated version of Witten-Bell smoothing.
The idea to abstract this comes from Chen & Goodman 1995.
"""
function WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Build the vocabulary from the given words and wrap it in the model type.
    vocab = Vocabulary(word, unk_cutoff, unk_label)
    return WittenBellInterpolated(vocab)
end

function (lm::WittenBellInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Pass the text through the model vocabulary, convert the result to an
    # Array{String}, and forward it to `counter2` together with `min` and `max`.
    looked_up = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(looked_up, min, max)
end
# alpha_gammma method for WittenBellInterpolated
function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
    # Returns the pair (alpha * (1 - gam), gam), where gam = gamma(accum) and alpha is
    # count(word) / total when `word` has a count under `context`, else 1 / total.
    local alpha
    accum = templ_lm[context]
    total = float(sum(accum))
    for (text, count) in accum
        if text == word
            alpha = float(count) / total
            break
        end
        # Fallback value; replaced if `word` is matched on a later iteration.
        alpha = 1 / total
    end
    # NOTE(review): when `accum` is empty the loop body never runs, `alpha` stays
    # unassigned, and the return below throws UndefVarError.
    gam = gamma(accum)
    return alpha * (1 - gam), gam
end

# Return `length(accum)`, i.e. the number of entries held by the accumulator.
function count_non_zero_vals(accum::Accumulator{})
    return length(accum)
end

# Return nplus / (nplus + total), where nplus = count_non_zero_vals(accum) and
# total = sum(accum) as a float.
function gamma(accum)
    nplus = count_non_zero_vals(accum)
    total = float(sum(accum))
    return nplus / (nplus + total)
end

"""
Expand All @@ -187,20 +185,20 @@ function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context
(isnothing(context) || isempty(context)) && return prob(m, temp_lm, word)

if context in keys(temp_lm)
alpha,gamma = alpha_gammma(m, temp_lm, word, context)
return (alpha + gamma*score(m, temp_lm, word, context_reduce(context)))
alpha, gamma = alpha_gammma(m, temp_lm, word, context)
return (alpha + gamma * score(m, temp_lm, word, context_reduce(context)))
else
return score(m, temp_lm, word, context_reduce(context))
end
end

# Drop the first whitespace-separated token of `context` and re-join the remaining
# tokens with single spaces. Yields "" when there is at most one token.
function context_reduce(context)
    tokens = split(context)
    return join(Iterators.drop(tokens, 1), " ")
end


# Interpolated language model type holding a vocabulary and a discount value.
struct KneserNeyInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
    discount::Float64
end
Expand All @@ -213,29 +211,29 @@ Initiate Type for providing KneserNey Interpolated language model.
The idea to abstract this comes from Chen & Goodman 1995.
"""
function KneserNeyInterpolated(word::Vector{T}, disc=0.1, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    # Build the vocabulary from the given words and store it alongside the discount.
    vocab = Vocabulary(word, unk_cutoff, unk_label)
    return KneserNeyInterpolated(vocab, disc)
end

function (lm::KneserNeyInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    # Pass the text through the model vocabulary, convert the result to an
    # Array{String}, and forward it to `counter2` together with `min` and `max`.
    looked_up = convert(Array{String}, lookup(lm.vocab, text))
    return counter2(looked_up, min, max)
end
# alpha_gamma function for KneserNeyInterpolated
function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
    # Returns the pair (alpha, gam):
    #   alpha = max(count(word) - discount, 0) / total when `word` has a count under
    #           `context`, else 1 / (vocabulary size);
    #   gam   = discount * count_non_zero_vals(accum) / total.
    local alpha
    accum = templ_lm[context]
    total = float(sum(accum))
    for (text, count) in accum
        if text == word
            alpha = max(float(count) - m.discount, 0.0) / total
            break
        end
        # Fallback value; replaced if `word` is matched on a later iteration.
        alpha = 1 / length(m.vocab.vocab)
    end
    # NOTE(review): when `accum` is empty the loop body never runs, `alpha` stays
    # unassigned, and the return below throws UndefVarError.
    # Named `gam` (as in the Witten-Bell method) so the `gamma` function is not shadowed.
    gam = m.discount * count_non_zero_vals(accum) / total
    return alpha, gam
end
Loading

0 comments on commit 992af7e

Please sign in to comment.