Merge pull request #276 from rssdev10/fix/code_refactoring
Code refactoring. 
Project version is 0.8
rssdev10 committed Oct 26, 2023
2 parents 6d00310 + 0f4ed71 commit 77f1abb
Showing 18 changed files with 183 additions and 155 deletions.
11 changes: 6 additions & 5 deletions Project.toml
@@ -2,12 +2,13 @@ name = "TextAnalysis"
uuid = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
license = "MIT"
desc = "Julia package for text analysis"
version = "0.7.5"
version = "0.8.0"

[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -28,12 +29,12 @@ DelimitedFiles = "1"
DocStringExtensions = "0.9"
JSON = "0.21"
Languages = "0.4"
ProgressMeter = "1.5"
ProgressMeter = "1"
Snowball = "0.1"
StatsBase = "0.32,0.33, 0.34"
Tables = "1.2"
StatsBase = "0.32, 0.33, 0.34"
Tables = "1"
WordTokenizers = "0.5"
julia = "1.3"
julia = "1.6"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
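Note on the [compat] changes: under Pkg's default caret semantics, loosening `ProgressMeter` and `Tables` to `"1"` admits any 1.x release, and `julia = "1.6"` raises the minimum supported Julia from 1.3 to 1.6. A minimal install sketch for the release this commit prepares (that the 0.8 tag is registered is an assumption):

```julia
using Pkg

# Requires Julia >= 1.6 per the new compat bound; "0.8" is the
# version this commit sets in Project.toml.
Pkg.add(name="TextAnalysis", version="0.8")
```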
32 changes: 21 additions & 11 deletions src/LM/api.jl
@@ -1,23 +1,33 @@
#TO DO
# Doc string
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
"""
$(TYPEDSIGNATURES)
"""
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin])
end

function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
"""
$(TYPEDSIGNATURES)
"""
function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
log2(maskedscore(m, temp_lm, word, context))
end

function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
local log_set=Float64[]
for ngram in text_ngram
"""
$(TYPEDSIGNATURES)
"""
function entropy(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
n_sum = sum(text_ngram) do ngram
ngram = split(ngram)
push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " ")))
#println(logscore(m,lm,ngram[end],ngram[1:end-1]))
logscore(m, lm, ngram[end], join(ngram[begin:end-1], " "))
end
return(sum(log_set)/length(log_set))
return n_sum / length(text_ngram)
end

function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
return(2^(entropy(m, lm, text_ngram)))
"""
$(TYPEDSIGNATURES)
"""
function perplexity(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
return 2^(entropy(m, lm, text_ngram))
end
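All four functions now declare a `Float64` return type, and `entropy` folds the per-ngram log scores with `sum` instead of pushing into a temporary array. A usage sketch, assuming the package's `MLE` constructor and its convention that calling the model returns the fitted `DefaultDict` these functions take (data values are illustrative):

```julia
using TextAnalysis

voc   = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
train = ["khan", "is", "my", "good", "friend", "and", "he", "is", "my", "brother"]

model = MLE(voc)           # maximum-likelihood language model over the vocabulary
fit   = model(train, 2, 2) # fit bigram counts; returns a DefaultDict

maskedscore(model, fit, "my", "is")          # P("my" | "is"), OOV words mapped to "<unk>"
entropy(model, fit, ["khan is", "is my"])    # mean log2 score of the listed bigrams
perplexity(model, fit, ["khan is", "is my"]) # 2^entropy
```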
13 changes: 7 additions & 6 deletions src/LM/counter.jl
@@ -1,17 +1,18 @@
using DataStructures

"""
counter is used to make conditional distribution, which is used by score functions to
calculate conditional frequency distribution
$(TYPEDSIGNATURES)
counter is used to make a conditional distribution, which is used by score functions to
calculate a conditional frequency distribution
"""
function counter2(data, min::Integer, max::Integer)
data = everygram(data, min_len=min, max_len=max)
data = split.(data)
temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String}))
for i in 1:length(data)
history,word = data[i][1:end-1], data[i][end]
temp_lm = DefaultDict{SubString{String},Accumulator{String,Int64}}(counter(SubString{String}))
for i in eachindex(data)
history, word = data[i][begin:end-1], data[i][end]
temp_lm[join(history, " ")][word] += 1
end
return temp_lm
end
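The loop now uses `eachindex` and `begin` instead of `1:length` and a hard-coded `1`, with identical behavior. A small sketch of what `counter2` builds (data is illustrative):

```julia
tokens = ["a", "b", "c", "a", "b"]
lm = counter2(tokens, 2, 3)  # counts every 2- and 3-gram, keyed by its history

lm["a"]["b"]    # 2: "b" followed the one-word history "a" twice
lm["a b"]["c"]  # 1: "c" followed the history "a b" once
```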

19 changes: 7 additions & 12 deletions src/LM/langmodel.jl
@@ -99,20 +99,18 @@ To get probability of word given that context
In other words, for a given context, calculate the frequency distribution of the word
"""
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)
if context == nothing || context == ""
return(1/float(length(templ_lm))) #provide distribution
else
accum = templ_lm[context]
end
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)::Float64
(isnothing(context) || isempty(context)) && return 1.0/length(templ_lm) #provide distribution

accum = templ_lm[context]
s = float(sum(accum))
for (text, count) in accum
if text == word
return(float(count) / s)
end
end
if context in keys(m.vocab.vocab)
return(0)
return 0.0
end
return(Inf)
end
@@ -186,9 +184,8 @@ depending upon the sub-Type
"""
function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
if context == nothing || context == ""
return prob(m, temp_lm, word, context)
end
(isnothing(context) || isempty(context)) && return prob(m, temp_lm, word)

if context in keys(temp_lm)
alpha,gamma = alpha_gammma(m, temp_lm, word, context)
return (alpha + gamma*score(m, temp_lm, word, context_reduce(context)))
@@ -242,5 +239,3 @@ function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, con
gamma = (m.discount * count_non_zero_vals(accum) /s)
return alpha, gamma
end
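The guard clauses return early for a missing or empty context; behavior is otherwise unchanged. A brief sketch using the internal `prob` helper with the `model`/`fit` pair from the api.jl sketch above (values are illustrative):

```julia
# Empty or missing context: uniform fallback over the fitted histories.
prob(model, fit, "is")          # == 1.0 / length(fit)

# Known context: relative frequency of the word within that context.
prob(model, fit, "good", "my")  # 0.5, since "good" follows "my" in 1 of 2 bigrams
```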


38 changes: 20 additions & 18 deletions src/LM/preprocessing.jl
@@ -21,16 +21,16 @@ julia> a = everygram(seq,min_len=1, max_len=-1)
```
"""
function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
ngram = []
function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1)::Vector{String} where {T<:AbstractString}
ngram = String[]
if max_len == -1
max_len = length(seq)
end
for n in range(min_len, stop=max_len)
temp = ngramizenew(seq, n)
ngram = append!(ngram, temp)
end
return(ngram)
return ngram
end

"""
@@ -54,16 +54,18 @@ julia> padding_ngrams(example,2,pad_left=true,pad_right=true)
"5 </s>"
```
"""
function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol ="</s>") where { T <: AbstractString}
function padding_ngram(
word::Vector{T}, n=1;
pad_left=false, pad_right=false,
left_pad_symbol="<s>", right_pad_symbol="</s>"
) where {T<:AbstractString}
local seq
seq = word
if pad_left == true
prepend!(seq, [left_pad_symbol])
end
if pad_right == true
push!(seq, right_pad_symbol)
end
return ngramizenew(seq, n)

pad_left == true && prepend!(seq, [left_pad_symbol])
pad_right == true && push!(seq, right_pad_symbol)

return ngramizenew(seq, n)
end

"""
@@ -85,16 +87,16 @@ julia> ngramizenew(seq ,2)
"To not"
```
"""
function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
function ngramizenew(words::Vector{T}, nlist::Integer...)::Vector{String} where {T<:AbstractString}
n_words = length(words)

tokens = []
tokens = String[]

for n in nlist,
index in 1:(n_words-n+1)

for n in nlist
for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
push!(tokens,token)
end
token = join(words[index:(index+n-1)], " ")
push!(tokens, token)
end
return tokens
end
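The refactor types the accumulators (`String[]` instead of `[]`), replaces the `if` blocks in `padding_ngram` with short-circuit `&&`, and unrolls the comma-form loop in `ngramizenew` into explicit nesting; outputs are unchanged. A combined sketch (results follow the docstring examples above; note that `padding_ngram` mutates its input via `prepend!`/`push!`):

```julia
seq = ["To", "be", "or", "not"]
everygram(seq, min_len=1, max_len=2)
# ["To", "be", "or", "not", "To be", "be or", "or not"]

padding_ngram(["1", "2", "3"], 2, pad_left=true, pad_right=true)
# ["<s> 1", "1 2", "2 3", "3 </s>"]
```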
41 changes: 25 additions & 16 deletions src/LM/vocab.jl
@@ -70,46 +70,55 @@ julia> vocabulary.vocab["b"]
```
"""
mutable struct Vocabulary
vocab::Dict{String, Int64}
unk_cutoff::Int
unk_label::String
allword::Array{String, 1}
vocab::Dict{String,Int64}
unk_cutoff::Int
unk_label::String
allword::Vector{String}
end

function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
"""
$(TYPEDSIGNATURES)
"""
function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
if unk_label in word
error("unk_label is in vocab")
else
word= push!(word, unk_label)
word = push!(word, unk_label)
end
vocab = countmap(word)
for value in vocab
if value[2]<unk_cutoff && value[1] != unk_label
if value[2] < unk_cutoff && value[1] != unk_label
delete!(vocab, value[1])
end
end
Vocabulary(vocab, unk_cutoff, unk_label, word)
end

"""
$(TYPEDSIGNATURES)
See [`Vocabulary`](@ref)
"""
function update(vocab::Vocabulary, words)
vocab.allword = append!(vocab.allword, words)
vocab.vocab=addcounts!(vocab.vocab, words)
vocab.vocab = addcounts!(vocab.vocab, words)
end

"""
$(TYPEDSIGNATURES)
Look up a sequence of words in the vocabulary.
Returns a Vector of Strings.
See [`Vocabulary`](@ref)
"""
function lookup(voc::Vocabulary, word::Vector{T}) where { T <: AbstractString}
look = []
for w in word
function lookup(voc::Vocabulary, word::AbstractVector{T})::Vector{T} where {T<:AbstractString}
return map(word) do w
if w in keys(voc.vocab)
push!(look, w)
else
push!(look, voc.unk_label)
w
else
voc.unk_label
end
end
return look
end
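`lookup` now returns a typed vector built with `map` instead of pushing into an untyped `[]`. A sketch following the `Vocabulary` docstring above:

```julia
words = ["a", "a", "b", "c", "d"]
vocabulary = Vocabulary(words, 2)  # unk_cutoff = 2 drops words seen fewer than twice

# "b" falls below the cutoff, so it maps to the unk label:
lookup(vocabulary, ["a", "b"])
# ["a", "<unk>"]
```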

12 changes: 7 additions & 5 deletions src/bayes.jl
@@ -5,9 +5,11 @@ export NaiveBayesClassifier
simpleTokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "."=>"")))

"""
$(TYPEDSIGNATURES)
Create a dict that maps elements in the input array to their frequencies.
"""
function frequencies(xs)
function frequencies(xs::AbstractVector{T})::Dict{T,Int} where {T<:Any}
frequencies = Dict{eltype(xs),Int}()
for x in xs
frequencies[x] = get(frequencies, x, 0) + 1
@@ -16,13 +18,13 @@ function frequencies(xs)
end

"""
features(::AbstractDict, dict)
$(TYPEDSIGNATURES)
Compute an Array of the values stored in the input `AbstractDict` for each element of `dict` (0 when absent).
"""
function features(fs::AbstractDict, dict)
bag = zeros(Int, size(dict))
for i = 1:length(dict)
function features(fs::AbstractDict, dict::AbstractVector)::Vector{Int}
bag = Vector{Int}(undef, size(dict))
for i = eachindex(dict)
bag[i] = get(fs, dict[i], 0)
end
return bag
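`frequencies` and `features` gain typed signatures, and `features` fills an `undef` vector via `eachindex`, which is safe because every slot is written. A small sketch (the term list is illustrative):

```julia
fs = frequencies(["the", "cat", "the"])  # Dict("the" => 2, "cat" => 1)

terms = ["cat", "dog", "the"]            # the classifier's fixed term ordering
features(fs, terms)                      # [1, 0, 2]: counts aligned to `terms`
```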
18 changes: 11 additions & 7 deletions src/coom.jl
@@ -42,15 +42,19 @@ function coo_matrix(::Type{T},
coom = spzeros(T, n, n)
# Count co-occurrences
for (i, token) in enumerate(doc)
@inbounds for j in max(1, i-window):min(m, i+window)
row = get(vocab, token, nothing)
isnothing(row) && continue

@inbounds for j in max(1, i - window):min(m, i + window)
i == j && continue

wtoken = doc[j]
nm = T(ifelse(normalize, abs(i-j), 1))
row = get(vocab, token, nothing)
col = get(vocab, wtoken, nothing)
if i!=j && row != nothing && col != nothing
coom[row, col] += one(T)/nm
coom[col, row] = coom[row, col]
end
isnothing(col) && continue

nm = T(ifelse(normalize, abs(i - j), 1))
coom[row, col] += one(T) / nm
coom[col, row] = coom[row, col]
end
end
return coom
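Hoisting the `row` lookup out of the window loop lets out-of-vocabulary tokens skip the inner loop entirely, and the guard clauses replace the combined `if`; the resulting matrix is unchanged. A usage sketch (the `OrderedDict` vocabulary argument is an assumption based on the function's signature):

```julia
using DataStructures: OrderedDict

doc   = ["a", "b", "c", "a"]
vocab = OrderedDict("a" => 1, "b" => 2, "c" => 3)

# Symmetric co-occurrence counts in a +/-1 token window,
# weighted by 1/distance because normalize is true:
coom = coo_matrix(Float64, doc, vocab, 1, true)
coom[1, 2]  # weight for the pair ("a", "b")
```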
7 changes: 2 additions & 5 deletions src/corpus.jl
@@ -236,11 +236,8 @@ function update_inverse_index!(crps::Corpus)
ngram_arr = isa(doc, NGramDocument) ? collect(keys(ngrams(doc))) : tokens(doc)
ngram_arr = convert(Array{String,1}, ngram_arr)
for ngram in ngram_arr
if haskey(idx, ngram)
push!(idx[ngram], i)
else
idx[ngram] = [i]
end
key = get!(() -> [], idx, ngram)
push!(key, i)
end
end
for key in keys(idx)
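The `get!` call replaces the explicit `haskey` branch: it returns the existing posting list or inserts (and returns) a fresh one in a single lookup. The idiom in isolation:

```julia
idx = Dict{String, Vector{Int}}()

for (i, ngram) in enumerate(["cat", "dog", "cat"])
    push!(get!(() -> Int[], idx, ngram), i)
end

idx  # Dict("cat" => [1, 3], "dog" => [2])
```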
2 changes: 1 addition & 1 deletion src/deprecations.jl
@@ -1,7 +1,7 @@

## Deprecations for Languages

function WordTokenizers.tokenize(::Type{S}, s::T) where {S <: Language, T <: AbstractString}
function tokenize(::Type{S}, s::T) where {S <: Language, T <: AbstractString}
depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S))
tokenize(S(), s)
end
Expand Down
11 changes: 6 additions & 5 deletions src/dtm.jl
@@ -168,14 +168,15 @@ tdm(crps::Corpus) = dtm(crps)' #'

function dtm_entries(d::AbstractDocument, lex::Dict{T, Int}) where T
ngs = ngrams(d)
indices = Array{Int}(undef, 0)
values = Array{Int}(undef, 0)
terms = sort(collect(keys(lex)))
indices = Int[]
values = Int[]
terms = sort!(collect(keys(lex)))
column_indices = columnindices(terms)

for ngram in keys(ngs)
if haskey(column_indices, ngram)
push!(indices, column_indices[ngram])
key = get(column_indices, ngram, nothing)
if !isnothing(key)
push!(indices, key)
push!(values, ngs[ngram])
end
end
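`dtm_entries` now uses typed empty literals, in-place `sort!`, and a single `get` per n-gram instead of `haskey` plus a second lookup. A usage sketch, assuming the function still returns the `(indices, values)` pair it builds above:

```julia
using TextAnalysis

d   = StringDocument("the cat saw the dog")
lex = Dict("cat" => 1, "dog" => 2, "the" => 3)

# Column indices (into the sorted term list) and counts for the
# document's n-grams that appear in `lex`:
indices, vals = dtm_entries(d, lex)  # `vals` avoids shadowing Base.values
```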