Merge pull request #276 from rssdev10/fix/code_refactoring
Code refactoring. 
Project version is 0.8
rssdev10 committed Oct 26, 2023
2 parents 6d00310 + 0f4ed71 commit 77f1abb
Showing 18 changed files with 183 additions and 155 deletions.
11 changes: 6 additions & 5 deletions Project.toml
@@ -2,12 +2,13 @@ name = "TextAnalysis"
uuid = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
license = "MIT"
desc = "Julia package for text analysis"
version = "0.7.5"
version = "0.8.0"

[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -28,12 +29,12 @@ DelimitedFiles = "1"
DocStringExtensions = "0.9"
JSON = "0.21"
Languages = "0.4"
ProgressMeter = "1.5"
ProgressMeter = "1"
Snowball = "0.1"
StatsBase = "0.32,0.33, 0.34"
Tables = "1.2"
StatsBase = "0.32, 0.33, 0.34"
Tables = "1"
WordTokenizers = "0.5"
julia = "1.3"
julia = "1.6"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
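Note on the [compat] changes: under Pkg's default caret semantics, loosening `ProgressMeter` and `Tables` to `"1"` admits any 1.x release, and `julia = "1.6"` raises the minimum supported Julia from 1.3 to 1.6. A minimal install sketch for the release this commit prepares (that the 0.8 tag is registered is an assumption):

```julia
using Pkg

# Requires Julia >= 1.6 per the new compat bound; "0.8" is the
# version this commit sets in Project.toml.
Pkg.add(name="TextAnalysis", version="0.8")
```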
32 changes: 21 additions & 11 deletions src/LM/api.jl
@@ -1,23 +1,33 @@
#TO DO
# Doc string
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
"""
$(TYPEDSIGNATURES)
"""
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin])
end

function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
"""
$(TYPEDSIGNATURES)
"""
function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
log2(maskedscore(m, temp_lm, word, context))
end

function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
local log_set=Float64[]
for ngram in text_ngram
"""
$(TYPEDSIGNATURES)
"""
function entropy(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
n_sum = sum(text_ngram) do ngram
ngram = split(ngram)
push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " ")))
#println(logscore(m,lm,ngram[end],ngram[1:end-1]))
logscore(m, lm, ngram[end], join(ngram[begin:end-1], " "))
end
return(sum(log_set)/length(log_set))
return n_sum / length(text_ngram)
end

function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
return(2^(entropy(m, lm, text_ngram)))
"""
$(TYPEDSIGNATURES)
"""
function perplexity(m::Langmodel, lm::DefaultDict, text_ngram::AbstractVector)::Float64
return 2^(entropy(m, lm, text_ngram))
end
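All four functions now declare a `Float64` return type, and `entropy` folds the per-ngram log scores with `sum` instead of pushing into a temporary array. A usage sketch, assuming the package's `MLE` constructor and its convention that calling the model returns the fitted `DefaultDict` these functions take (data values are illustrative):

```julia
using TextAnalysis

voc   = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
train = ["khan", "is", "my", "good", "friend", "and", "he", "is", "my", "brother"]

model = MLE(voc)           # maximum-likelihood language model over the vocabulary
fit   = model(train, 2, 2) # fit bigram counts; returns a DefaultDict

maskedscore(model, fit, "my", "is")          # P("my" | "is"), OOV words mapped to "<unk>"
entropy(model, fit, ["khan is", "is my"])    # mean log2 score of the listed bigrams
perplexity(model, fit, ["khan is", "is my"]) # 2^entropy
```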
13 changes: 7 additions & 6 deletions src/LM/counter.jl
@@ -1,17 +1,18 @@
using DataStructures

"""
counter is used to make conditional distribution, which is used by score functions to
calculate conditional frequency distribution
$(TYPEDSIGNATURES)
counter is used to make a conditional distribution, which is used by score functions to
calculate a conditional frequency distribution
"""
function counter2(data, min::Integer, max::Integer)
data = everygram(data, min_len=min, max_len=max)
data = split.(data)
temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String}))
for i in 1:length(data)
history,word = data[i][1:end-1], data[i][end]
temp_lm = DefaultDict{SubString{String},Accumulator{String,Int64}}(counter(SubString{String}))
for i in eachindex(data)
history, word = data[i][begin:end-1], data[i][end]
temp_lm[join(history, " ")][word] += 1
end
return temp_lm
end
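The loop now uses `eachindex` and `begin` instead of `1:length` and a hard-coded `1`, with identical behavior. A small sketch of what `counter2` builds (data is illustrative):

```julia
tokens = ["a", "b", "c", "a", "b"]
lm = counter2(tokens, 2, 3)  # counts every 2- and 3-gram, keyed by its history

lm["a"]["b"]    # 2: "b" followed the one-word history "a" twice
lm["a b"]["c"]  # 1: "c" followed the history "a b" once
```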

19 changes: 7 additions & 12 deletions src/LM/langmodel.jl
@@ -99,20 +99,18 @@ To get probability of word given that context
In other words, for a given context, calculate the frequency distribution of the word
"""
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)
if context == nothing || context == ""
return(1/float(length(templ_lm))) #provide distribution
else
accum = templ_lm[context]
end
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)::Float64
(isnothing(context) || isempty(context)) && return 1.0/length(templ_lm) #provide distribution

accum = templ_lm[context]
s = float(sum(accum))
for (text, count) in accum
if text == word
return(float(count) / s)
end
end
if context in keys(m.vocab.vocab)
return(0)
return 0.0
end
return(Inf)
end
@@ -186,9 +184,8 @@ depending upon the sub-Type
"""
function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
if context == nothing || context == ""
return prob(m, temp_lm, word, context)
end
(isnothing(context) || isempty(context)) && return prob(m, temp_lm, word)

if context in keys(temp_lm)
alpha,gamma = alpha_gammma(m, temp_lm, word, context)
return (alpha + gamma*score(m, temp_lm, word, context_reduce(context)))
@@ -242,5 +239,3 @@ function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, con
gamma = (m.discount * count_non_zero_vals(accum) /s)
return alpha, gamma
end
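The guard clauses return early for a missing or empty context; behavior is otherwise unchanged. A brief sketch using the internal `prob` helper with the `model`/`fit` pair from the api.jl sketch above (values are illustrative):

```julia
# Empty or missing context: uniform fallback over the fitted histories.
prob(model, fit, "is")          # == 1.0 / length(fit)

# Known context: relative frequency of the word within that context.
prob(model, fit, "good", "my")  # 0.5, since "good" follows "my" in 1 of 2 bigrams
```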


38 changes: 20 additions & 18 deletions src/LM/preprocessing.jl
@@ -21,16 +21,16 @@ julia> a = everygram(seq,min_len=1, max_len=-1)
```
"""
function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
ngram = []
function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1)::Vector{String} where {T<:AbstractString}
ngram = String[]
if max_len == -1
max_len = length(seq)
end
for n in range(min_len, stop=max_len)
temp = ngramizenew(seq, n)
ngram = append!(ngram, temp)
end
return(ngram)
return ngram
end

"""
@@ -54,16 +54,18 @@ julia> padding_ngrams(example,2,pad_left=true,pad_right=true)
"5 </s>"
```
"""
function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol ="</s>") where { T <: AbstractString}
function padding_ngram(
word::Vector{T}, n=1;
pad_left=false, pad_right=false,
left_pad_symbol="<s>", right_pad_symbol="</s>"
) where {T<:AbstractString}
local seq
seq = word
if pad_left == true
prepend!(seq, [left_pad_symbol])
end
if pad_right == true
push!(seq, right_pad_symbol)
end
return ngramizenew(seq, n)

pad_left == true && prepend!(seq, [left_pad_symbol])
pad_right == true && push!(seq, right_pad_symbol)

return ngramizenew(seq, n)
end

"""
@@ -85,16 +87,16 @@ julia> ngramizenew(seq ,2)
"To not"
```
"""
function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
function ngramizenew(words::Vector{T}, nlist::Integer...)::Vector{String} where {T<:AbstractString}
n_words = length(words)

tokens = []
tokens = String[]

for n in nlist,
index in 1:(n_words-n+1)

for n in nlist
for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
push!(tokens,token)
end
token = join(words[index:(index+n-1)], " ")
push!(tokens, token)
end
return tokens
end
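The refactor types the accumulators (`String[]` instead of `[]`), replaces the `if` blocks in `padding_ngram` with short-circuit `&&`, and unrolls the comma-form loop in `ngramizenew` into explicit nesting; outputs are unchanged. A combined sketch (results follow the docstring examples above; note that `padding_ngram` mutates its input via `prepend!`/`push!`):

```julia
seq = ["To", "be", "or", "not"]
everygram(seq, min_len=1, max_len=2)
# ["To", "be", "or", "not", "To be", "be or", "or not"]

padding_ngram(["1", "2", "3"], 2, pad_left=true, pad_right=true)
# ["<s> 1", "1 2", "2 3", "3 </s>"]
```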
41 changes: 25 additions & 16 deletions src/LM/vocab.jl
@@ -70,46 +70,55 @@ julia> vocabulary.vocab["b"]
```
"""
mutable struct Vocabulary
vocab::Dict{String, Int64}
unk_cutoff::Int
unk_label::String
allword::Array{String, 1}
vocab::Dict{String,Int64}
unk_cutoff::Int
unk_label::String
allword::Vector{String}
end

function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
"""
$(TYPEDSIGNATURES)
"""
function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
if unk_label in word
error("unk_label is in vocab")
else
word= push!(word, unk_label)
word = push!(word, unk_label)
end
vocab = countmap(word)
for value in vocab
if value[2]<unk_cutoff && value[1] != unk_label
if value[2] < unk_cutoff && value[1] != unk_label
delete!(vocab, value[1])
end
end
Vocabulary(vocab, unk_cutoff, unk_label, word)
end

"""
$(TYPEDSIGNATURES)
See [`Vocabulary`](@ref)
"""
function update(vocab::Vocabulary, words)
vocab.allword = append!(vocab.allword, words)
vocab.vocab=addcounts!(vocab.vocab, words)
vocab.vocab = addcounts!(vocab.vocab, words)
end

"""
$(TYPEDSIGNATURES)
Look up a sequence of words in the vocabulary.
Returns a Vector of Strings.
See [`Vocabulary`](@ref)
"""
function lookup(voc::Vocabulary, word::Vector{T}) where { T <: AbstractString}
look = []
for w in word
function lookup(voc::Vocabulary, word::AbstractVector{T})::Vector{T} where {T<:AbstractString}
return map(word) do w
if w in keys(voc.vocab)
push!(look, w)
else
push!(look, voc.unk_label)
w
else
voc.unk_label
end
end
return look
end
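`lookup` now returns a typed vector built with `map` instead of pushing into an untyped `[]`. A sketch following the `Vocabulary` docstring above:

```julia
words = ["a", "a", "b", "c", "d"]
vocabulary = Vocabulary(words, 2)  # unk_cutoff = 2 drops words seen fewer than twice

# "b" falls below the cutoff, so it maps to the unk label:
lookup(vocabulary, ["a", "b"])
# ["a", "<unk>"]
```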

12 changes: 7 additions & 5 deletions src/bayes.jl
@@ -5,9 +5,11 @@ export NaiveBayesClassifier
simpleTokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "."=>"")))

"""
$(TYPEDSIGNATURES)
Create a dict that maps elements in the input array to their frequencies.
"""
function frequencies(xs)
function frequencies(xs::AbstractVector{T})::Dict{T,Int} where {T<:Any}
frequencies = Dict{eltype(xs),Int}()
for x in xs
frequencies[x] = get(frequencies, x, 0) + 1
@@ -16,13 +18,13 @@ function frequencies(xs)
end

"""
features(::AbstractDict, dict)
$(TYPEDSIGNATURES)
Compute an Array of the values stored in the input `AbstractDict` for each element of `dict` (0 when absent).
"""
function features(fs::AbstractDict, dict)
bag = zeros(Int, size(dict))
for i = 1:length(dict)
function features(fs::AbstractDict, dict::AbstractVector)::Vector{Int}
bag = Vector{Int}(undef, size(dict))
for i = eachindex(dict)
bag[i] = get(fs, dict[i], 0)
end
return bag
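`frequencies` and `features` gain typed signatures, and `features` fills an `undef` vector via `eachindex`, which is safe because every slot is written. A small sketch (the term list is illustrative):

```julia
fs = frequencies(["the", "cat", "the"])  # Dict("the" => 2, "cat" => 1)

terms = ["cat", "dog", "the"]            # the classifier's fixed term ordering
features(fs, terms)                      # [1, 0, 2]: counts aligned to `terms`
```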
18 changes: 11 additions & 7 deletions src/coom.jl
@@ -42,15 +42,19 @@ function coo_matrix(::Type{T},
coom = spzeros(T, n, n)
# Count co-occurrences
for (i, token) in enumerate(doc)
@inbounds for j in max(1, i-window):min(m, i+window)
row = get(vocab, token, nothing)
isnothing(row) && continue

@inbounds for j in max(1, i - window):min(m, i + window)
i == j && continue

wtoken = doc[j]
nm = T(ifelse(normalize, abs(i-j), 1))
row = get(vocab, token, nothing)
col = get(vocab, wtoken, nothing)
if i!=j && row != nothing && col != nothing
coom[row, col] += one(T)/nm
coom[col, row] = coom[row, col]
end
isnothing(col) && continue

nm = T(ifelse(normalize, abs(i - j), 1))
coom[row, col] += one(T) / nm
coom[col, row] = coom[row, col]
end
end
return coom
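Hoisting the `row` lookup out of the window loop lets out-of-vocabulary tokens skip the inner loop entirely, and the guard clauses replace the combined `if`; the resulting matrix is unchanged. A usage sketch (the `OrderedDict` vocabulary argument is an assumption based on the function's signature):

```julia
using DataStructures: OrderedDict

doc   = ["a", "b", "c", "a"]
vocab = OrderedDict("a" => 1, "b" => 2, "c" => 3)

# Symmetric co-occurrence counts in a +/-1 token window,
# weighted by 1/distance because normalize is true:
coom = coo_matrix(Float64, doc, vocab, 1, true)
coom[1, 2]  # weight for the pair ("a", "b")
```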
7 changes: 2 additions & 5 deletions src/corpus.jl
@@ -236,11 +236,8 @@ function update_inverse_index!(crps::Corpus)
ngram_arr = isa(doc, NGramDocument) ? collect(keys(ngrams(doc))) : tokens(doc)
ngram_arr = convert(Array{String,1}, ngram_arr)
for ngram in ngram_arr
if haskey(idx, ngram)
push!(idx[ngram], i)
else
idx[ngram] = [i]
end
key = get!(() -> [], idx, ngram)
push!(key, i)
end
end
for key in keys(idx)
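The `get!` call replaces the explicit `haskey` branch: it returns the existing posting list or inserts (and returns) a fresh one in a single lookup. The idiom in isolation:

```julia
idx = Dict{String, Vector{Int}}()

for (i, ngram) in enumerate(["cat", "dog", "cat"])
    push!(get!(() -> Int[], idx, ngram), i)
end

idx  # Dict("cat" => [1, 3], "dog" => [2])
```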
2 changes: 1 addition & 1 deletion src/deprecations.jl
@@ -1,7 +1,7 @@

## Deprecations for Languages

function WordTokenizers.tokenize(::Type{S}, s::T) where {S <: Language, T <: AbstractString}
function tokenize(::Type{S}, s::T) where {S <: Language, T <: AbstractString}
depwarn("Use of Languages as types is deprecated. Use instances.", Symbol(S))
tokenize(S(), s)
end
Expand Down
11 changes: 6 additions & 5 deletions src/dtm.jl
@@ -168,14 +168,15 @@ tdm(crps::Corpus) = dtm(crps)' #'

function dtm_entries(d::AbstractDocument, lex::Dict{T, Int}) where T
ngs = ngrams(d)
indices = Array{Int}(undef, 0)
values = Array{Int}(undef, 0)
terms = sort(collect(keys(lex)))
indices = Int[]
values = Int[]
terms = sort!(collect(keys(lex)))
column_indices = columnindices(terms)

for ngram in keys(ngs)
if haskey(column_indices, ngram)
push!(indices, column_indices[ngram])
key = get(column_indices, ngram, nothing)
if !isnothing(key)
push!(indices, key)
push!(values, ngs[ngram])
end
end
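`dtm_entries` now uses typed empty literals, in-place `sort!`, and a single `get` per n-gram instead of `haskey` plus a second lookup. A usage sketch, assuming the function still returns the `(indices, values)` pair it builds above:

```julia
using TextAnalysis

d   = StringDocument("the cat saw the dog")
lex = Dict("cat" => 1, "dog" => 2, "the" => 3)

# Column indices (into the sorted term list) and counts for the
# document's n-grams that appear in `lex`:
indices, vals = dtm_entries(d, lex)  # `vals` avoids shadowing Base.values
```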