
Language Model Interface #210

Merged · 51 commits · Sep 2, 2020

Changes from 11 commits
Commits
44b540c
adding StatsBase
tejasvaidhyadev Apr 28, 2020
8becfce
adding StatsBase
tejasvaidhyadev Apr 28, 2020
f9d8b5f
exporting function
tejasvaidhyadev Apr 28, 2020
59b8fae
adding vocabulary strut in LM
tejasvaidhyadev Apr 28, 2020
3283033
exporting everygram and padding_ngram
tejasvaidhyadev Apr 28, 2020
36aca88
adding padding and everygram
tejasvaidhyadev Apr 28, 2020
29fee55
added general counter
tejasvaidhyadev May 19, 2020
c41df19
adding mle in language model with some peremeter
tejasvaidhyadev May 19, 2020
a35ca92
updating counter with ngramnew
tejasvaidhyadev May 23, 2020
698dc0e
adding other language model
tejasvaidhyadev May 23, 2020
a9e7345
adding new ngram method
tejasvaidhyadev May 23, 2020
9f9a0ba
bux fix and indentation
tejasvaidhyadev May 30, 2020
eded084
updating docstring in vocab
tejasvaidhyadev May 30, 2020
1150e4e
updating counter
tejasvaidhyadev May 30, 2020
15bef10
adding all ngram based model
tejasvaidhyadev May 30, 2020
5fb6db9
adding matrices
tejasvaidhyadev May 30, 2020
f240cae
updating api
tejasvaidhyadev May 30, 2020
6bd06ef
syntax correction
tejasvaidhyadev May 30, 2020
e34e71e
exporting imp function
tejasvaidhyadev May 30, 2020
be686af
bug fix
tejasvaidhyadev May 30, 2020
40bae7b
adding doc sting
tejasvaidhyadev May 31, 2020
e58fd17
updated docstrings
tejasvaidhyadev May 31, 2020
f8f09f6
export apis
tejasvaidhyadev May 31, 2020
6f96654
adding test for LM
tejasvaidhyadev Jun 4, 2020
db6d38a
including LM.jl
tejasvaidhyadev Jun 4, 2020
5ddf7cb
adding docs for LM
tejasvaidhyadev Jun 5, 2020
d1d7417
adding Preprocessing in docs
tejasvaidhyadev Jun 5, 2020
36ffa46
adding docs for vocab
tejasvaidhyadev Jun 5, 2020
8bb35aa
updating preprocessing
tejasvaidhyadev Jun 6, 2020
b3bcd94
bug fix in gamma model
tejasvaidhyadev Jun 6, 2020
b5408f1
updating coding style
tejasvaidhyadev Jun 6, 2020
9c5230d
updating for Lm
tejasvaidhyadev Jun 7, 2020
37025af
test for Langmodels
tejasvaidhyadev Jun 7, 2020
4d4b9eb
Update docs/src/LM.md
tejasvaidhyadev Jun 7, 2020
c979bce
Update docs/src/LM.md
tejasvaidhyadev Jun 7, 2020
6cdfea0
Update docs/src/LM.md
tejasvaidhyadev Jun 7, 2020
e566904
updating typos
tejasvaidhyadev Jun 7, 2020
16a2546
updating docs typo and errors
tejasvaidhyadev Jun 7, 2020
8481cfb
updating coding style
tejasvaidhyadev Jun 7, 2020
9dc882c
updating doc typo
tejasvaidhyadev Jun 7, 2020
723103c
adding corrected docs
tejasvaidhyadev Jun 7, 2020
b14cc74
updating coding style in vocab.jl
tejasvaidhyadev Jun 7, 2020
58498ea
Update src/LM/vocab.jl
tejasvaidhyadev Jun 7, 2020
feb2d46
updating docs
tejasvaidhyadev Jun 7, 2020
995581b
updating typo
tejasvaidhyadev Jun 7, 2020
cac50d6
updating dep for statsbase
tejasvaidhyadev Jun 7, 2020
e234590
updating Manifest
tejasvaidhyadev Jun 7, 2020
72c4cb0
updating .toml file
tejasvaidhyadev Jun 7, 2020
12fd3c6
bug fix
tejasvaidhyadev Jun 7, 2020
d089d51
indentation
tejasvaidhyadev Jun 7, 2020
85f93a8
updated using
tejasvaidhyadev Jun 8, 2020
3 changes: 3 additions & 0 deletions Manifest.toml
@@ -470,3 +470,6 @@ deps = ["BinaryProvider", "Libdl", "Printf"]
git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d"
uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
version = "0.8.3"

[[StatsBase]]
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
1 change: 1 addition & 0 deletions Project.toml
@@ -22,6 +22,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
Flux = "< 0.10"
36 changes: 36 additions & 0 deletions src/LM/counter.jl
@@ -0,0 +1,36 @@
using DataStructures
import DataStructures.Accumulator
import DataStructures.DefaultDict
import DataStructures.counter

# MLE normalization: divide each count by the total for its history.
function normalize(accum)
    s = float(sum(accum))
    [(history, float(count) / s) for (history, count) in accum]
end

function counter1(data, min::Integer, max::Integer, norm::Function; gamma = nothing)
    data = everygram(data, min_len = min, max_len = max)
    data = split.(data)
    temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String}))
    for i in 1:length(data)
        history, word = data[i][1:end-1], data[i][end]
        temp_lm[join(history, " ")][word] += 1
    end
    # Dict from temp_lm with each history's counts normalized by `norm`
    Dict(history => norm(words, gamma) for (history, words) in temp_lm)
end

function normalize(accum, gamma)
    s = float(sum(accum))
    [(word, float(count) / s) for (word, count) in accum]
end

# Lidstone smoothing: add gamma to every count before normalizing.
function lid_norm(accum, gamma)
    s = float(sum(accum) + gamma * length(accum))
    [(word, float(count + gamma) / s) for (word, count) in accum]
end
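
A standalone sketch of the arithmetic behind `lid_norm`, using a plain `Dict` with made-up counts in place of a `DataStructures.Accumulator` (the counts and gamma value are illustrative only):

```julia
# Hypothetical counts for one history; gamma = 1 corresponds to Laplace smoothing.
accum = Dict("be" => 2, "or" => 1)
gamma = 1

# Total mass after adding gamma to each of the length(accum) = 2 observed words.
s = float(sum(values(accum)) + gamma * length(accum))   # 3 + 2 = 5.0
probs = Dict(word => (count + gamma) / s for (word, count) in accum)
# probs["be"] == 3/5 and probs["or"] == 2/5
```

The gamma term in the denominator is what keeps a word with a small count from collapsing to zero probability relative to the rest of the distribution.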
71 changes: 71 additions & 0 deletions src/LM/langmodel.jl
@@ -0,0 +1,71 @@
abstract type Langmodel end

struct MLE <: Langmodel
    vocab::Vocabulary
end

function MLE(word, unk_cutoff = 1, unk_label = "<unk>")
    MLE(Vocabulary(word, unk_cutoff, unk_label))
end

function (lm::MLE)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter1(text, min, max, normalize)
end

struct Lidstone <: Langmodel
    vocab::Vocabulary
    gamma::Integer
end

function Lidstone(word, gamma, unk_cutoff = 1, unk_label = "<unk>")
    Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma)
end

function (lm::Lidstone)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter1(text, min, max, lid_norm, gamma = lm.gamma)
end


struct Laplace <: Langmodel
    vocab::Vocabulary
    gamma::Integer
end

# Laplace smoothing is Lidstone smoothing with gamma fixed at 1.
function Laplace(word, unk_cutoff = 1, unk_label = "<unk>")
    Laplace(Vocabulary(word, unk_cutoff, unk_label), 1)
end

function (lm::Laplace)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter1(text, min, max, lid_norm, gamma = lm.gamma)
end


function unmaskscore(a::Dict{SubString{String},Array{Tuple{String,Float64},1}}, word, context)
    for i in a[context]
        if word == i[1]
            return i[2]
        end
    end
end

"""
Mask out-of-vocabulary (OOV) words and compute their model score.
For the model-specific scoring logic, see `unmaskscore`.
"""
function score(voc::Langmodel, model::Dict{SubString{String},Array{Tuple{String,Float64},1}}, word, context)
    return unmaskscore(model, word, context)
end

"""
Evaluate the log (base 2) score of this word in this context.
The arguments are the same as for `score`.
"""
function logscore(voc::Langmodel, model, word, context)
    return log2(score(voc, model, word, context))
end
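
End to end, the intended flow is: the model constructor builds the vocabulary, calling the model on text trains it, and `score` queries it. A rough usage sketch assuming the API above (the training sentence is made up, and no concrete outputs are claimed):

```julia
train = ["To", "be", "or", "not", "to", "be"]

# MLE(train) builds the Vocabulary; calling the model counts and
# normalizes every ngram of length 1 to 2.
model = MLE(train)
probs = model(train, 1, 2)   # Dict: history => [(word, probability), ...]

# Probability of "be" given the history "To", and its log2 score.
p = score(model, probs, "be", "To")
lp = log2(p)
```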
90 changes: 90 additions & 0 deletions src/LM/preprocessing.jl
@@ -0,0 +1,90 @@
"""
Return all possible ngrams generated from sequence of items, as an Array{String,1}
# Example

>>>seq=["To","be","or","not"]
>>>a = everygram(seq,min_len = 1, max_len = -1)
10-element Array{Any,1}:
"or"
"not"
"To"
"be"
"or not"
"be or"
"be or not"
"To be or"
"To be or not"

"""
function everygram(seq; min_len::Int = 1, max_len::Int = -1)
    ngram = []
    if max_len == -1
        max_len = length(seq)
    end
    for n in range(min_len, stop = max_len)
        temp = keys(TextAnalysis.ngramize(TextAnalysis.Languages.English(), seq, n))
        append!(ngram, temp)
    end
    return ngram
end

"""
padding _ngram is used to pad both left and right of sentence and out putting ngrmas

It also pad the original input Array of string
# Example Usage
>>>example = ["1","2","3","4","5"]

>>> example = ["1","2","3","4","5"]
>>> padding_ngram(example ,2 , pad_left=true,pad_right =true)
5-element Array{String,1}:
"1"
"2"
"3"
"4"
"5"
"""
function padding_ngram(word, n = 1; pad_left = false, pad_right = false, left_pad_symbol = "<s>", right_pad_symbol = "</s>")
    seq = word
    if pad_left
        prepend!(seq, [left_pad_symbol])
    end
    if pad_right
        push!(seq, right_pad_symbol)
    end
    return ngramizenew(seq, n)
end
"""
ngramizenew is used to out putting ngrmas in set

# Example Usage
>>>seq=["To","be","or","not","To","not","To","not"]

>>> ngramizenew(seq ,2)
7-element Array{Any,1}:
"To be"
"be or"
"or not"
"not To"
"To not"
"not To"
"To not"

"""
function ngramizenew(words::Vector{T}, nlist::Integer...) where {T <: AbstractString}
    n_words = length(words)
    tokens = []
    for n in nlist
        for index in 1:(n_words - n + 1)
            token = join(words[index:(index + n - 1)], " ")
            push!(tokens, token)
        end
    end
    return tokens
end
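
The two helpers compose: pad first, then extract ngrams, which is exactly what `padding_ngram` does internally via `ngramizenew`. A small sketch assuming the functions above; since padding mutates its argument, a copy is taken:

```julia
sent = ["To", "be", "or", "not"]

# Work on a copy, since padding_ngram mutates the array it is given.
bigrams = padding_ngram(copy(sent), 2, pad_left = true, pad_right = true)
# Expected: ["<s> To", "To be", "be or", "or not", "not </s>"]
```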

114 changes: 114 additions & 0 deletions src/LM/vocab.jl
@@ -0,0 +1,114 @@
""" General counter to used in vocabulary"""
mutable struct Counter
value::Int
#Counter(value) = (new(),value)
end

function counter(init = 0)
Counter(init)
end
function (count::Counter)()
count.value = 1 +count.value
end

"""Stores language model vocabulary.
Satisfies two common language modeling requirements for a vocabulary:
- When checking membership and calculating its size, filters items
by comparing their counts to a cutoff value.
- Adds a special "unknown" token which unseen words are mapped to.

>>> words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"]
>>> vocabulary = Vocabulary(words, 2)
Vocabulary(Dict("<unk>"=>1,"c"=>3,"a"=>3,"d"=>2), 2, "<unk>")
>>> vocabulary.vocab
Dict{String,Int64} with 4 entries:
"<unk>" => 1
"c" => 3
"a" => 3
"d" => 2
Tokens with counts greater than or equal to the cutoff value will
be considered part of the vocabulary.
>>> vocabulary.vocab["c"]
3
>>> "c" in keys(vocabulary.vocab)
true
>>> vocabulary.vocab["d"]
2
>>> "d" in keys(vocabulary.vocab)
true
Tokens with frequency counts less than the cutoff value will be considered not
part of the vocabulary even though their entries in the count dictionary are
preserved.
>>> "b" in keys(vocabulary.vocab)
false
>>> "<unk>" in keys(vocabulary.vocab)
true
We can look up words in a vocabulary using its `lookup` method.
"Unseen" words (with counts less than cutoff) are looked up as the unknown label.
>>> word = ["a", "-", "d", "c", "a"]
>>> lookup(vocabulary ,word)
5-element Array{Any,1}:
"a"
"<unk>"
"d"
"c"
"a"

If given a sequence, it will return an Array{Any,1} of the looked up words as shown above.

It's possible to update the counts after the vocabulary has been created.
>>> update(vocabulary, ["b","c","c"])
>>> vocabulary.vocab["b"]
1
"""
mutable struct Vocabulary
vocab::Dict{String,Int64}
unk_cutoff::Int
unk_label::String
allword::Array{String,1}
end
function Vocabulary(word, unk_cutoff = 1, unk_label = "<unk>")
    if !(unk_label in word)
        push!(word, unk_label)
    end
    vocab = countmap(word)
    for (token, count) in vocab
        if count < unk_cutoff && token != unk_label
            delete!(vocab, token)
        end
    end
    Vocabulary(vocab, unk_cutoff, unk_label, word)
end

function update(vocab::Vocabulary, words)
    vocab.allword = append!(vocab.allword, words)
    vocab.vocab = addcounts!(vocab.vocab, words)
end

"""
lookup a sequence or words in the vocabulary

Return an Array of String
"""
function lookup(voc::Vocabulary,word)
look = []
for w in word
if w in keys(voc.vocab)
push!(look,w)
else
#return vocab.unk_label
push!(look,voc.unk_label)
end
end
return look
end



10 changes: 8 additions & 2 deletions src/TextAnalysis.jl
@@ -2,7 +2,7 @@ module TextAnalysis
using SparseArrays
using Printf
using LinearAlgebra

using StatsBase: countmap, addcounts!
using Languages
using DataFrames
using WordTokenizers
@@ -66,6 +66,9 @@ module TextAnalysis
export CRF, viterbi_decode, crf_loss

export NERTagger, PoSTagger, Tracker, Flux

export Vocabulary, lookup, update
export everygram, padding_ngram

include("tokenizer.jl")
include("ngramizer.jl")
@@ -111,7 +114,10 @@ module TextAnalysis
include("sequence/pos_datadeps.jl")
include("sequence/pos.jl")
include("sequence/sequence_models.jl")


# Language model
include("LM/vocab.jl")

# ULMFiT
module ULMFiT
using ..TextAnalysis