From 44b540c1a42c182d385eb48c6cdb66a667b29a60 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 28 Apr 2020 06:17:26 +0530 Subject: [PATCH 01/51] adding StatsBase --- Manifest.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Manifest.toml b/Manifest.toml index 187c33a1..358163f2 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -470,3 +470,6 @@ deps = ["BinaryProvider", "Libdl", "Printf"] git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" version = "0.8.3" + +[[StatsBase]] +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" From 8becfce954c8e7909a63c37bce8c8d1b04b4406b Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 28 Apr 2020 06:17:58 +0530 Subject: [PATCH 02/51] adding StatsBase --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 02bcd701..e17f55b5 100644 --- a/Project.toml +++ b/Project.toml @@ -22,6 +22,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] Flux = "< 0.10" From f9d8b5f2d36e53521173a9aba591d3862acb62f3 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 28 Apr 2020 06:18:48 +0530 Subject: [PATCH 03/51] exporting function --- src/TextAnalysis.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 8978cf1b..4f59b72b 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -2,7 +2,7 @@ module TextAnalysis using SparseArrays using Printf using LinearAlgebra - + using StatsBase: countmap,addcounts! 
using Languages using DataFrames using WordTokenizers @@ -66,7 +66,8 @@ module TextAnalysis export CRF, viterbi_decode, crf_loss export NERTagger, PoSTagger, Tracker, Flux - + + export Vocabulary,lookup,update include("tokenizer.jl") include("ngramizer.jl") include("document.jl") @@ -111,7 +112,10 @@ module TextAnalysis include("sequence/pos_datadeps.jl") include("sequence/pos.jl") include("sequence/sequence_models.jl") - + + # Lang_model + include("LM/vocab.jl") + # ULMFiT module ULMFiT using ..TextAnalysis From 59b8fae8f179b6ac532d0fe4342ebb49283c4c08 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 28 Apr 2020 06:19:46 +0530 Subject: [PATCH 04/51] adding vocabulary strut in LM --- src/LM/vocab.jl | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 src/LM/vocab.jl diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl new file mode 100644 index 00000000..2b432294 --- /dev/null +++ b/src/LM/vocab.jl @@ -0,0 +1,114 @@ +""" General counter to used in vocabulary""" +mutable struct Counter + value::Int + #Counter(value) = (new(),value) +end + +function counter(init = 0) + Counter(init) +end +function (count::Counter)() + count.value = 1 +count.value +end + +"""Stores language model vocabulary. + Satisfies two common language modeling requirements for a vocabulary: + - When checking membership and calculating its size, filters items + by comparing their counts to a cutoff value. + - Adds a special "unknown" token which unseen words are mapped to. + + >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] + >>> import Vocabulary + >>> vocabulary = Vocabulary(words, 2) + Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") + >>> vocabulary.vocab + Dict{String,Int64} with 4 entries: + "" => 1 + "c" => 3 + "a" => 3 + "d" => 2 + Tokens with counts greater than or equal to the cutoff value will + be considered part of the vocabulary. 
+ >>> vocabulary.vocab["c"] + 3 + >>> "c" in keys(vocabulary.vocab) + true + >>> vocabulary.vocab["d"] + 2 + >>> "d" in keys(vocabulary.vocab) + true + Tokens with frequency counts less than the cutoff value will be considered not + part of the vocabulary even though their entries in the count dictionary are + preserved. + >>> "b" in keys(vocabulary.vocab) + false + >>> "" in keys(vocabulary.vocab) + true + We can look up words in a vocabulary using its `lookup` method. + "Unseen" words (with counts less than cutoff) are looked up as the unknown label. + If given one word (a string) as an input, this method will return a string. + >>> lookup("a") + 'a' + >>> word = ["a", "-", "d", "c", "a"] + >>> lookup(vocabulary ,word) + 5-element Array{Any,1}: + "a" + "" + "d" + "c" + "a" + + If given a sequence, it will return an Array{Any,1} of the looked up words as shown above. + + It's possible to update the counts after the vocabulary has been created. + >>> update(vocabulary,["b","c","c"]) + 1 + >>> vocabulary.vocab["b"] + 1 + """ +mutable struct Vocabulary +vocab::Dict{String,Int64} +unk_cutoff::Int +unk_label::String +allword::Array{String,1} +end +function Vocabulary(word,unk_cutoff =1 ,unk_label = "") + if unk_label in word + #error("unk_label is in vocab") + else + word= push!(word,unk_label) + end + vocab = countmap(word) + for value in vocab + if value[2] Date: Wed, 29 Apr 2020 02:30:08 +0530 Subject: [PATCH 05/51] exporting everygram and padding_ngram --- src/TextAnalysis.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 4f59b72b..1b231299 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -68,6 +68,8 @@ module TextAnalysis export NERTagger, PoSTagger, Tracker, Flux export Vocabulary,lookup,update + export everygram, padding_ngram + include("tokenizer.jl") include("ngramizer.jl") include("document.jl") From 36aca88f810ac4953296207de32f02e331904dee Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: 
Wed, 29 Apr 2020 02:30:53 +0530 Subject: [PATCH 06/51] adding padding and everygram --- src/LM/preprocessing.jl | 57 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 src/LM/preprocessing.jl diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl new file mode 100644 index 00000000..7949ceb7 --- /dev/null +++ b/src/LM/preprocessing.jl @@ -0,0 +1,57 @@ +""" +Return all possible ngrams generated from sequence of items, as an Array{String,1} +# Example + +>>>seq=["To","be","or","not"] +>>>a = everygram(seq,min_len = 1, max_len = -1) + 10-element Array{Any,1}: + "or" + "not" + "To" + "be" + "or not" + "be or" + "be or not" + "To be or" + "To be or not" + +""" +function everygram(seq; min_len::Int=1, max_len::Int=-1) + ngram = [] + if max_len == -1 + max_len = length(seq) + end + for n in range(min_len, stop =max_len) + temp = keys(TextAnalysis.ngramize(TextAnalysis.Languages.English(),seq,n)) + ngram = append!(ngram,temp) + end + return(ngram) +end + +""" + padding _ngram is used to pad both left and right of sentence and out putting ngrmas + + It also pad the original input Array of string +# Example Usage +>>>example = ["1","2","3","4","5"] + +>>> example = ["1","2","3","4","5"] +>>> padding_ngram(example ,2 , pad_left=true,pad_right =true) + 5-element Array{String,1}: + "1" + "2" + "3" + "4" + "5" +""" +function padding_ngram(word,n =1 ;pad_left=false,pad_right=false ,left_pad_symbol="", right_pad_symbol ="") + local seq + seq = word + if pad_left == true + prepend!(seq, [left_pad_symbol]) + end + if pad_right == true + push!(seq, right_pad_symbol) + end + return keys(TextAnalysis.ngramize(TextAnalysis.Languages.English(),seq,n)) +end From 29fee5564f81e2e88237af814ed21fb1fba38b63 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 19 May 2020 21:21:40 +0530 Subject: [PATCH 07/51] added general counter --- src/LM/counter.jl | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 
src/LM/counter.jl diff --git a/src/LM/counter.jl b/src/LM/counter.jl new file mode 100644 index 00000000..20e13ca6 --- /dev/null +++ b/src/LM/counter.jl @@ -0,0 +1,25 @@ +using DataStructures +import DataStructures.Accumulator +import DataStructures.DefaultDict +import DataStructures.counter + +function counter1(data, min::Integer, max::Integer) + data = (everygram(data,min_len = min, max_len =max )) + data = split.(data) + temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String})) + for i in 1:length(data) + history,word = data[i][1:end-1], data[i][end] + + temp_lm[join(history, " ")][word]+=1 + end + #return Dict from iterated temp_lm with normalized histories + Dict(word => normalize(histories) for (word,histories) in temp_lm) + #return temp_lm +end + +function normalize(accum) + #sum all counts + s = float(sum(accum)) + #tuple of string with each count divided by sum + [(history,float(sum(count))/s) for (history,count) in accum] +end From c41df193371846cf60e38ac8420615beb3f60079 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Tue, 19 May 2020 21:22:38 +0530 Subject: [PATCH 08/51] adding mle in language model with some peremeter --- src/LM/langmodel.jl | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 src/LM/langmodel.jl diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl new file mode 100644 index 00000000..0e2845dc --- /dev/null +++ b/src/LM/langmodel.jl @@ -0,0 +1,39 @@ +abstract type Langmodel end + +struct mle <: Langmodel + vocab ::Vocabulary + #counter::Dict{SubString{String},Array{Tuple{String,Float64},1}} +end + +function mle(word,unk_cutoff=1 ,unk_label="") + mle(Vocabulary(word,unk_cutoff ,unk_label)) +end + +function fit!(lm::Langmodel,text,min::Integer,max::Integer) + text = lookup(lm.vocab ,text) + text = convert(Array{String}, text) + return counter1(text,min,max) +end + +function 
unmaskscore(a::Dict{SubString{String},Array{Tuple{String,Float64},1}},word,context) + for i in a[context] + if word == i[1] + return i[2] + end + end +end + +function score(voc::Langmodel,model::Dict{SubString{String},Array{Tuple{String,Float64},1}} ,word ,context ) + """Masks out of vocab (OOV) words and computes their model score. + For model-specific logic of calculating scores, see the `unmasked_score` + method. + """ + return unmaskscore(model,word,context ) +end + +function logscore(word, context= None) + """Evaluate the log score of this word in this context. + The arguments are the same as for `score` and `unmasked_score`. + """ + return log2(score(word, context)) +end From a35ca92af0ebe52ce4df50e0578d4355e9e4afa5 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 24 May 2020 04:06:24 +0530 Subject: [PATCH 09/51] updating counter with ngramnew --- src/LM/counter.jl | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/LM/counter.jl b/src/LM/counter.jl index 20e13ca6..a4b4a0a9 100644 --- a/src/LM/counter.jl +++ b/src/LM/counter.jl @@ -3,23 +3,34 @@ import DataStructures.Accumulator import DataStructures.DefaultDict import DataStructures.counter -function counter1(data, min::Integer, max::Integer) +function normalize(accum) + #sum all counts + s = float(sum(accum)) + #tuple of string with each count divided by sum + [(history,float(sum(count))/s) for (history,count) in accum] +end + +function counter1(data, min::Integer, max::Integer,norm::Function;gamma = nothing) data = (everygram(data,min_len = min, max_len =max )) data = split.(data) temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String})) for i in 1:length(data) history,word = data[i][1:end-1], data[i][end] - - temp_lm[join(history, " ")][word]+=1 + temp_lm[join(history, " ")][word] += 1 end - #return Dict from iterated temp_lm with normalized histories - Dict(word => normalize(histories) for (word,histories) 
in temp_lm) + #return Dict from iterated temp_lm with normalized histories + Dict(histories => norm(word,gamma) for (histories,word) in temp_lm) #return temp_lm end -function normalize(accum) - #sum all counts - s = float(sum(accum)) - #tuple of string with each count divided by sum - [(history,float(sum(count))/s) for (history,count) in accum] +function normalize(accum,gamma) + #sum all counts + s = float(sum(accum)) + #tuple of string with each count divided by sum + [(word,float(sum(count))/s) for (word,count) in accum] +end +function lid_norm(accum,gamma) + s = float(sum(accum)+(gamma)*length(accum)) + [(word,float(count + gamma)/s) for (word,count) in accum] + end From 698dc0e4876774b2238364a7f0bc234a11f14d90 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 24 May 2020 04:06:48 +0530 Subject: [PATCH 10/51] adding other language model --- src/LM/langmodel.jl | 46 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index 0e2845dc..85f7720e 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -1,20 +1,52 @@ abstract type Langmodel end -struct mle <: Langmodel +struct MLE <: Langmodel vocab ::Vocabulary #counter::Dict{SubString{String},Array{Tuple{String,Float64},1}} end -function mle(word,unk_cutoff=1 ,unk_label="") - mle(Vocabulary(word,unk_cutoff ,unk_label)) +function MLE(word,unk_cutoff=1 ,unk_label="") + MLE(Vocabulary(word,unk_cutoff ,unk_label)) end -function fit!(lm::Langmodel,text,min::Integer,max::Integer) - text = lookup(lm.vocab ,text) - text = convert(Array{String}, text) - return counter1(text,min,max) +function (lm::MLE)(text,min::Integer,max::Integer) + text = lookup(lm.vocab ,text) + text=convert(Array{String}, text) + return counter1(text,min,max,normalize) end +struct Lidstone <: Langmodel + vocab ::Vocabulary + gamma ::Integer +end + +function Lidstone(word,gamma,unk_cutoff=1 ,unk_label="") + Lidstone(Vocabulary(word,unk_cutoff 
,unk_label),gamma) +end + +function (lm::Lidstone)(text,min::Integer,max::Integer) + text = lookup(lm.vocab ,text) + text=convert(Array{String}, text) + return counter1(text,min,max,lid_norm,gamma = lm.gamma) +end + + +struct Laplace <: Langmodel + vocab ::Vocabulary + gamma ::Integer +end + +function Laplace(word,unk_cutoff=1 ,unk_label="") + Lidstone(Vocabulary(word,unk_cutoff ,unk_label),1) +end + +function (lm::Laplace)(text,min::Integer,max::Integer) + text = lookup(lm.vocab ,text) + text=convert(Array{String}, text) + return counter1(text,min,max,lid_norm,gamma = lm.gamma) +end + + function unmaskscore(a::Dict{SubString{String},Array{Tuple{String,Float64},1}},word,context) for i in a[context] if word == i[1] From a9e7345ae76e801bc875cdbb108f5758e4c8fea5 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 24 May 2020 04:07:24 +0530 Subject: [PATCH 11/51] adding new ngram method --- src/LM/preprocessing.jl | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl index 7949ceb7..d8fd07d6 100644 --- a/src/LM/preprocessing.jl +++ b/src/LM/preprocessing.jl @@ -53,5 +53,38 @@ function padding_ngram(word,n =1 ;pad_left=false,pad_right=false ,left_pad_symbo if pad_right == true push!(seq, right_pad_symbol) end - return keys(TextAnalysis.ngramize(TextAnalysis.Languages.English(),seq,n)) + return ngramizenew(seq,n) end +""" + ngramizenew is used to out putting ngrmas in set + +# Example Usage +>>>seq=["To","be","or","not","To","not","To","not"] + +>>> ngramizenew(seq ,2) +7-element Array{Any,1}: + "To be" + "be or" + "or not" + "not To" + "To not" + "not To" + "To not" + +""" +function ngramizenew( words::Vector{T}, nlist::Integer...) 
where { T <: AbstractString} + #(length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words) + + n_words = length(words) + + tokens = [] + + for n in nlist + for index in 1:(n_words - n + 1) + token = join(words[index:(index + n - 1)], " ") + push!(tokens,token) + end + end + return tokens +end + From 9f9a0ba5b3e4624191d30a622b7179d4cfed7f35 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 19:05:06 +0530 Subject: [PATCH 12/51] bux fix and indentation --- src/LM/preprocessing.jl | 109 +++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 47 deletions(-) diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl index d8fd07d6..fd62de4f 100644 --- a/src/LM/preprocessing.jl +++ b/src/LM/preprocessing.jl @@ -1,50 +1,62 @@ """ + everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1)where { T <: AbstractString} + Return all possible ngrams generated from sequence of items, as an Array{String,1} + # Example ->>>seq=["To","be","or","not"] ->>>a = everygram(seq,min_len = 1, max_len = -1) - 10-element Array{Any,1}: - "or" - "not" - "To" - "be" - "or not" - "be or" - "be or not" - "To be or" - "To be or not" +```julia-repl +julia> seq = ["To","be","or","not"] +julia> a = everygram(seq,min_len=1, max_len=-1) + 10-element Array{Any,1}: + "or" + "not" + "To" + "be" + "or not" + "be or" + "be or not" + "To be or" + "To be or not" +``` """ -function everygram(seq; min_len::Int=1, max_len::Int=-1) - ngram = [] +function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString} + + ngram = [] if max_len == -1 max_len = length(seq) end - for n in range(min_len, stop =max_len) - temp = keys(TextAnalysis.ngramize(TextAnalysis.Languages.English(),seq,n)) - ngram = append!(ngram,temp) - end - return(ngram) + for n in range(min_len, stop=max_len) + temp = ngramizenew(seq, n) + ngram = append!(ngram, temp) + end + return(ngram) end """ - padding _ngram is used to pad both left and right of 
sentence and out putting ngrmas + padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="", right_pad_symbol ="") where { T <: AbstractString} + +padding _ngram is used to pad both left and right of sentence and out putting ngrmas of order n It also pad the original input Array of string -# Example Usage ->>>example = ["1","2","3","4","5"] + +# Example +```julia-repl +julia> example = ["1","2","3","4","5"] ->>> example = ["1","2","3","4","5"] ->>> padding_ngram(example ,2 , pad_left=true,pad_right =true) - 5-element Array{String,1}: - "1" - "2" - "3" - "4" - "5" +julia> example = ["1","2","3","4","5"] +julia> padding_ngrams(example,2,pad_left=true,pad_right=true) + 6-element Array{Any,1}: + " 1" + "1 2" + "2 3" + "3 4" + "4 5" + "5 " +``` """ -function padding_ngram(word,n =1 ;pad_left=false,pad_right=false ,left_pad_symbol="", right_pad_symbol ="") +function padding_ngram(word::Vector{T}, n=1; pad_left=false,pad_right=false ,left_pad_symbol="", right_pad_symbol ="") where { T <: AbstractString} local seq seq = word if pad_left == true @@ -53,26 +65,29 @@ function padding_ngram(word,n =1 ;pad_left=false,pad_right=false ,left_pad_symbo if pad_right == true push!(seq, right_pad_symbol) end - return ngramizenew(seq,n) + return ngramizenew(seq,n) end + """ - ngramizenew is used to out putting ngrmas in set - -# Example Usage ->>>seq=["To","be","or","not","To","not","To","not"] - ->>> ngramizenew(seq ,2) -7-element Array{Any,1}: - "To be" - "be or" - "or not" - "not To" - "To not" - "not To" - "To not" + ngramizenew( words::Vector{T}, nlist::Integer...) where { T <: AbstractString} +ngramizenew is used to out putting ngrmas in set + +# Example +```julia-repl +julia> seq=["To","be","or","not","To","not","To","not"] +julia> ngramizenew(seq ,2) + 7-element Array{Any,1}: + "To be" + "be or" + "or not" + "not To" + "To not" + "not To" + "To not" +``` """ -function ngramizenew( words::Vector{T}, nlist::Integer...) 
where { T <: AbstractString} +function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString} #(length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words) n_words = length(words) From eded08451388d1adba00a48a4ad8a60ab85cceaf Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 19:43:12 +0530 Subject: [PATCH 13/51] updating docstring in vocab --- src/LM/vocab.jl | 142 ++++++++++++++++++++++++------------------------ 1 file changed, 72 insertions(+), 70 deletions(-) diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl index 2b432294..c8bda2fb 100644 --- a/src/LM/vocab.jl +++ b/src/LM/vocab.jl @@ -1,80 +1,83 @@ -""" General counter to used in vocabulary""" -mutable struct Counter - value::Int - #Counter(value) = (new(),value) -end +""" + Vocabulary(word,unk_cutoff =1 ,unk_label = "") -function counter(init = 0) - Counter(init) -end -function (count::Counter)() - count.value = 1 +count.value -end +Stores language model vocabulary. +Satisfies two common language modeling requirements for a vocabulary: +- When checking membership and calculating its size, filters items +by comparing their counts to a cutoff value. +Adds a special "unknown" token which unseen words are mapped to. + +# Example +```julia-repl +julia> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] +julia> vocabulary = Vocabulary(words, 2) + Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") + +julia> vocabulary.vocab + Dict{String,Int64} with 4 entries: + "" => 1 + "c" => 3 + "a" => 3 + "d" => 2 + +Tokens with counts greater than or equal to the cutoff value will +be considered part of the vocabulary. +julia> vocabulary.vocab["c"] + 3 + +julia> "c" in keys(vocabulary.vocab) + true + +julia> vocabulary.vocab["d"] + 2 -"""Stores language model vocabulary. - Satisfies two common language modeling requirements for a vocabulary: - - When checking membership and calculating its size, filters items - by comparing their counts to a cutoff value. 
- - Adds a special "unknown" token which unseen words are mapped to. - - >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] - >>> import Vocabulary - >>> vocabulary = Vocabulary(words, 2) - Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") - >>> vocabulary.vocab - Dict{String,Int64} with 4 entries: - "" => 1 - "c" => 3 - "a" => 3 - "d" => 2 - Tokens with counts greater than or equal to the cutoff value will - be considered part of the vocabulary. - >>> vocabulary.vocab["c"] - 3 - >>> "c" in keys(vocabulary.vocab) - true - >>> vocabulary.vocab["d"] - 2 - >>> "d" in keys(vocabulary.vocab) - true - Tokens with frequency counts less than the cutoff value will be considered not - part of the vocabulary even though their entries in the count dictionary are - preserved. - >>> "b" in keys(vocabulary.vocab) - false - >>> "" in keys(vocabulary.vocab) - true - We can look up words in a vocabulary using its `lookup` method. - "Unseen" words (with counts less than cutoff) are looked up as the unknown label. - If given one word (a string) as an input, this method will return a string. - >>> lookup("a") - 'a' - >>> word = ["a", "-", "d", "c", "a"] - >>> lookup(vocabulary ,word) - 5-element Array{Any,1}: - "a" - "" - "d" - "c" - "a" - - If given a sequence, it will return an Array{Any,1} of the looked up words as shown above. +julia> "d" in keys(vocabulary.vocab) + true + +Tokens with frequency counts less than the cutoff value will be considered not +part of the vocabulary even though their entries in the count dictionary are +preserved. +julia> "b" in keys(vocabulary.vocab) + false + +julia> "" in keys(vocabulary.vocab) + true + +We can look up words in a vocabulary using its `lookup` method. +"Unseen" words (with counts less than cutoff) are looked up as the unknown label. +If given one word (a string) as an input, this method will return a string. 
+julia> lookup("a") + 'a' + +julia> word = ["a", "-", "d", "c", "a"] + +julia> lookup(vocabulary ,word) + 5-element Array{Any,1}: + "a" + "" + "d" + "c" + "a" + +If given a sequence, it will return an Array{Any,1} of the looked up words as shown above. - It's possible to update the counts after the vocabulary has been created. - >>> update(vocabulary,["b","c","c"]) - 1 - >>> vocabulary.vocab["b"] - 1 - """ +It's possible to update the counts after the vocabulary has been created. +julia> update(vocabulary,["b","c","c"]) + 1 + +julia> vocabulary.vocab["b"] + 1 +``` +""" mutable struct Vocabulary vocab::Dict{String,Int64} unk_cutoff::Int unk_label::String allword::Array{String,1} end -function Vocabulary(word,unk_cutoff =1 ,unk_label = "") +function Vocabulary(word, unk_cutoff=1, unk_label="") if unk_label in word - #error("unk_label is in vocab") + error("unk_label is in vocab") else word= push!(word,unk_label) end @@ -100,10 +103,9 @@ Return an Array of String function lookup(voc::Vocabulary,word) look = [] for w in word - if w in keys(voc.vocab) + if w in keys(voc.vocab) push!(look,w) - else - #return vocab.unk_label + else push!(look,voc.unk_label) end end From 1150e4ed7c133f05324c7430773977721821959e Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 21:06:45 +0530 Subject: [PATCH 14/51] updating counter --- src/LM/counter.jl | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/src/LM/counter.jl b/src/LM/counter.jl index a4b4a0a9..2abae334 100644 --- a/src/LM/counter.jl +++ b/src/LM/counter.jl @@ -3,14 +3,11 @@ import DataStructures.Accumulator import DataStructures.DefaultDict import DataStructures.counter -function normalize(accum) - #sum all counts - s = float(sum(accum)) - #tuple of string with each count divided by sum - [(history,float(sum(count))/s) for (history,count) in accum] -end - -function counter1(data, min::Integer, max::Integer,norm::Function;gamma = nothing) +""" + counter is used to 
make conditional distribution, which is used by score functions to + calculate conditonal frequency distribution +""" +function counter2(data, min::Integer, max::Integer) data = (everygram(data,min_len = min, max_len =max )) data = split.(data) temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String})) @@ -18,19 +15,6 @@ function counter1(data, min::Integer, max::Integer,norm::Function;gamma = nothin history,word = data[i][1:end-1], data[i][end] temp_lm[join(history, " ")][word] += 1 end - #return Dict from iterated temp_lm with normalized histories - Dict(histories => norm(word,gamma) for (histories,word) in temp_lm) - #return temp_lm + return temp_lm end -function normalize(accum,gamma) - #sum all counts - s = float(sum(accum)) - #tuple of string with each count divided by sum - [(word,float(sum(count))/s) for (word,count) in accum] -end -function lid_norm(accum,gamma) - s = float(sum(accum)+(gamma)*length(accum)) - [(word,float(count + gamma)/s) for (word,count) in accum] - -end From 15bef1082c0529f10fe28bed45ea5b63f33f9792 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 21:07:30 +0530 Subject: [PATCH 15/51] adding all ngram based model --- src/LM/langmodel.jl | 190 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 154 insertions(+), 36 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index 85f7720e..b95c980c 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -1,71 +1,189 @@ abstract type Langmodel end +abstract type gammamodel <: Langmodel end #BaseNgram with smoothing algo +abstract type InterpolatedLanguageModel <: Langmodel end #Interpolated language model with smoothing +""" + Type for providing MLE ngram model scores. + Implementation of Base Ngram Model. 
+ +""" struct MLE <: Langmodel vocab ::Vocabulary - #counter::Dict{SubString{String},Array{Tuple{String,Float64},1}} end -function MLE(word,unk_cutoff=1 ,unk_label="") - MLE(Vocabulary(word,unk_cutoff ,unk_label)) +function MLE(word, unk_cutoff=1, unk_label="") + MLE(Vocabulary(word, unk_cutoff, unk_label)) end -function (lm::MLE)(text,min::Integer,max::Integer) - text = lookup(lm.vocab ,text) - text=convert(Array{String}, text) - return counter1(text,min,max,normalize) +function (lm::MLE)(text, min::Integer, max::Integer) + text = lookup(lm.vocab, text) + text=convert(Array{String}, text) + return counter2(text, min, max) end -struct Lidstone <: Langmodel + """ + Type for providing Lidstone-smoothed scores. + + In addition to initialization arguments from BaseNgramModel also requires + a number by which to increase the counts, gamma. +""" +struct Lidstone <: gammamodel vocab ::Vocabulary gamma ::Integer end -function Lidstone(word,gamma,unk_cutoff=1 ,unk_label="") - Lidstone(Vocabulary(word,unk_cutoff ,unk_label),gamma) +function Lidstone(word, gamma,unk_cutoff=1, unk_label="") + Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma) end -function (lm::Lidstone)(text,min::Integer,max::Integer) - text = lookup(lm.vocab ,text) - text=convert(Array{String}, text) - return counter1(text,min,max,lid_norm,gamma = lm.gamma) +function (lm::Lidstone)(text, min::Integer, max::Integer) + text = lookup(lm.vocab, text) + text=convert(Array{String}, text) + return counter2(text, min, max) end +"""Type for providing Laplace-smoothed scores. + In addition to initialization arguments from BaseNgramModel also requires + a number by which to increase the counts, gamma = 1. 
# --- src/LM/langmodel.jl (draft revision) ------------------------------------

# Laplace: Lidstone smoothing with gamma fixed at 1.
struct Laplace <: gammamodel
    vocab::Vocabulary
    gamma::Integer
end

"""
    Laplace(word, unk_cutoff=1, unk_label="<unk>")

Initialize a Laplace (add-one) smoothed ngram model.
"""
function Laplace(word, unk_cutoff=1, unk_label="<unk>")
    # Bug fix: previously constructed a `Lidstone`, so `Laplace(...)` returned
    # the wrong concrete type and dispatch never saw a `Laplace`.
    Laplace(Vocabulary(word, unk_cutoff, unk_label), 1)
end

function (lm::Laplace)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

"""
    score(m::gammamodel, temp_lm, word, context)

Additively smoothed probability P(word | context) for Lidstone/Laplace models:
(count + gamma) / (total + gamma * |V|).
"""
function score(m::gammamodel, temp_lm, word, context)
    accum = temp_lm[context]
    # total mass after adding gamma to every vocabulary entry
    s = float(sum(accum) + m.gamma * length(m.vocab.vocab))
    for (text, count) in accum
        text == word && return float(count + m.gamma) / s
    end
    return float(m.gamma) / s
end

# Relative frequency of `word` given `context`; uniform over contexts when no
# context is supplied.
# NOTE(review): returns Inf when `word` is absent from a seen context — looks
# like a sentinel rather than a probability; confirm callers expect it.
function prob(templ_lm::DefaultDict, word, context=nothing)
    if context === nothing || context == ""
        return 1 / float(length(templ_lm))
    end
    accum = templ_lm[context]
    s = float(sum(accum))
    for (text, count) in accum
        text == word && return float(count) / s
    end
    return Inf
end

# Unsmoothed maximum-likelihood P(word | context).
score(m::MLE, temp_lm, word, context=nothing) = prob(temp_lm, word, context)

struct WittenBellInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
end

function WittenBellInterpolated(word, unk_cutoff=1, unk_label="<unk>")
    WittenBellInterpolated(Vocabulary(word, unk_cutoff, unk_label))
end

function (lm::WittenBellInterpolated)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

# Witten-Bell (alpha, gamma) pair. Bug fix: takes the model as first argument
# so interpolated `score` can dispatch between Witten-Bell and Kneser-Ney.
function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    s = float(sum(accum))
    alpha = 1 / s   # fallback; previously left undefined for an empty accumulator
    for (text, count) in accum
        if text == word
            alpha = float(count) / s
            break
        end
    end
    gam = gamma(accum)
    return alpha * (1 - gam), gam
end

# Number of distinct continuations seen after a context.
count_non_zero_vals(accum::Accumulator) = length(accum)

function gamma(accum)
    nplus = count_non_zero_vals(accum)
    return nplus / (nplus + float(sum(accum)))
end

"""
    score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)

Interpolated P(word | context); recursively backs off to shorter contexts.
"""
function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
    if context === nothing || context == ""
        return prob(temp_lm, word, context)
    end
    if context in keys(temp_lm)
        # Bug fix: pass the model so the right alpha_gammma method is selected.
        alpha, gam = alpha_gammma(m, temp_lm, word, context)
        return alpha + gam * score(m, temp_lm, word, context_reduce(context))
    else
        return score(m, temp_lm, word, context_reduce(context))
    end
end

# Drop the leftmost token of a whitespace-joined context string.
function context_reduce(context)
    parts = split(context)
    return join(parts[2:end], " ")
end

struct KneserNeyInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
    discount::Float64
end

# Bug fix: previously called the undefined name `KneserNeyInterpolate`.
function KneserNeyInterpolated(word, discount, unk_cutoff=1, unk_label="<unk>")
    KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), discount)
end

function (lm::KneserNeyInterpolated)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

# Kneser-Ney (alpha, gamma) pair with absolute discounting.
function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    s = float(sum(accum))
    alpha = 1 / length(m.vocab.vocab)   # fallback for unseen continuations
    for (text, count) in accum
        if text == word
            alpha = max(float(count) - m.discount, 0.0) / s
            break
        end
    end
    gam = m.discount * count_non_zero_vals(accum) / s
    # Bug fix: the draft fell off the end returning only the gamma expression;
    # `score` destructures a two-tuple.
    return alpha, gam
end

# --- src/LM/api.jl ------------------------------------------------------------

# P(word | context) with out-of-vocabulary tokens mapped to the unk label first.
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
    score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
end

# Base-2 log of the masked score.
function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
    log2(maskedscore(m, temp_lm, word, context))
end

# Average per-ngram log2 score of `text_ngram` (each entry "w1 ... wn").
# NOTE(review): cross-entropy is conventionally -mean(log2 p); this returns
# +mean, so perplexity = 2^entropy comes out below 1 — confirm intended sign.
function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
    log_set = Float64[]
    for ngram in text_ngram
        parts = split(ngram)
        push!(log_set, logscore(m, lm, parts[end], join(parts[1:end-1], " ")))
    end
    return sum(log_set) / length(log_set)
end

function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
    return 2^entropy(m, lm, text_ngram)
end
-7,6 +7,7 @@ end function logscore(m::Langmodel,temp_lm::DefaultDict,word,context) log2(maskedscore(m,temp_lm,word,context)) end + function entropy(m::Langmodel,lm::DefaultDict,text_ngram) local log_set=Float64[] for ngram in text_ngram @@ -16,6 +17,7 @@ function entropy(m::Langmodel,lm::DefaultDict,text_ngram) end return(sum(log_set)/length(log_set)) end + function perplexity(m::Langmodel,lm::DefaultDict,text_ngram) return(2^(entropy(m,lm,text_ngram))) end From 6bd06efce1b498ab6af6080d88788121eefffac9 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 21:18:08 +0530 Subject: [PATCH 18/51] syntax correction --- src/LM/counter.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/LM/counter.jl b/src/LM/counter.jl index 2abae334..dd595b71 100644 --- a/src/LM/counter.jl +++ b/src/LM/counter.jl @@ -8,7 +8,7 @@ import DataStructures.counter calculate conditonal frequency distribution """ function counter2(data, min::Integer, max::Integer) - data = (everygram(data,min_len = min, max_len =max )) + data = everygram(data, min_len=min, max_len=max) data = split.(data) temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String})) for i in 1:length(data) From e34e71e800eb762f40d1d74c01612c578f34c562 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 21:26:05 +0530 Subject: [PATCH 19/51] exporting imp function --- src/TextAnalysis.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 1b231299..a27c88df 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -69,6 +69,8 @@ module TextAnalysis export Vocabulary,lookup,update export everygram, padding_ngram + export maskedscore, logscore, entropy, perplexity + export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score include("tokenizer.jl") include("ngramizer.jl") @@ -117,6 +119,11 @@ module TextAnalysis # Lang_model include("LM/vocab.jl") + 
include("LM/api.jl") + include("LM/counter.jl") + include("LM/langmodel.jl") + include("LM/preprocess.jl") + include("LM/vocab.jl") # ULMFiT module ULMFiT From be686aff5d3fed2d697e67eed399cbe2557f0b2d Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 30 May 2020 23:01:54 +0530 Subject: [PATCH 20/51] bug fix --- src/LM/langmodel.jl | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index b95c980c..26bebded 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -10,7 +10,28 @@ abstract type InterpolatedLanguageModel <: Langmodel end #Interpolated language struct MLE <: Langmodel vocab ::Vocabulary end - +""" + MLE(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} +Return Datatype MLE + +# Example + +```julia-repl +julia> seq = ["To","be","or","not"] +julia> a = everygram(seq,min_len=1, max_len=-1) + 10-element Array{Any,1}: + "or" + "not" + "To" + "be" + "or not" + "be or" + "be or not" + "To be or" + "To be or not" +``` + +""" function MLE(word, unk_cutoff=1, unk_label="") MLE(Vocabulary(word, unk_cutoff, unk_label)) end From 40bae7b011472c37aa8f620e5096ff40123db46d Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 31 May 2020 14:55:51 +0530 Subject: [PATCH 21/51] adding doc sting --- src/LM/langmodel.jl | 174 ++++++++++++++++++++++++++------------------ 1 file changed, 104 insertions(+), 70 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index 26bebded..b4973953 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -1,35 +1,21 @@ abstract type Langmodel end -abstract type gammamodel <: Langmodel end #BaseNgram with smoothing algo +abstract type gammamodel <: Langmodel end #BaseNgram with Add-one smoothing algo abstract type InterpolatedLanguageModel <: Langmodel end #Interpolated language model with smoothing -""" - Type for providing MLE ngram model scores. - Implementation of Base Ngram Model. 
abstract type Langmodel end
abstract type gammamodel <: Langmodel end                 # ngram models with additive (add-gamma) smoothing
abstract type InterpolatedLanguageModel <: Langmodel end  # interpolated smoothing (Witten-Bell, Kneser-Ney)

# MLE: maximum-likelihood (unsmoothed) ngram scores.
struct MLE <: Langmodel
    vocab::Vocabulary
end

"""
    MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Initialize an unsmoothed maximum-likelihood ngram language model over `word`.
"""
function MLE(word, unk_cutoff=1, unk_label="<unk>")
    MLE(Vocabulary(word, unk_cutoff, unk_label))
end

function (lm::MLE)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

struct Lidstone <: gammamodel
    vocab::Vocabulary
    gamma::Float64   # Float64, not Integer: fractional smoothing masses are valid
end

"""
    Lidstone(word::Vector{T}, gamma=1.0, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Lidstone-smoothed ngram model: every count is increased by `gamma`.
"""
function Lidstone(word, gamma=1.0, unk_cutoff=1, unk_label="<unk>")
    Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma)
end

function (lm::Lidstone)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

struct Laplace <: gammamodel
    vocab::Vocabulary
    gamma::Float64
end

"""
    Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Laplace (add-one) smoothing: Lidstone with `gamma` fixed at 1.
"""
function Laplace(word, unk_cutoff=1, unk_label="<unk>")
    # Bug fix: must construct a Laplace (not a Lidstone) so callers get the
    # type they asked for and dispatch stays on Laplace.
    Laplace(Vocabulary(word, unk_cutoff, unk_label), 1.0)
end

function (lm::Laplace)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

"""
    score(m::gammamodel, temp_lm::DefaultDict, word, context)

Additively smoothed probability P(word | context):
(count + gamma) / (total + gamma * |V|).
"""
function score(m::gammamodel, temp_lm::DefaultDict, word, context)
    accum = temp_lm[context]
    s = float(sum(accum) + m.gamma * length(m.vocab.vocab))
    for (text, count) in accum
        text == word && return float(count + m.gamma) / s
    end
    return float(m.gamma) / s
end

"""
    prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)

Relative frequency of `word` under `context`; uniform over contexts when no
context is supplied. Returns 0 for an unseen word under a known context and
`Inf` when the context itself is unknown.
"""
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)
    if context === nothing || context == ""
        return 1 / float(length(templ_lm))
    end
    accum = templ_lm[context]
    s = float(sum(accum))
    for (text, count) in accum
        text == word && return float(count) / s
    end
    context in keys(m.vocab.vocab) && return 0
    return Inf
end

"""
    score(m::MLE, temp_lm::DefaultDict, word, context=nothing)

Unsmoothed maximum-likelihood P(word | context).
"""
score(m::MLE, temp_lm::DefaultDict, word, context=nothing) = prob(m, temp_lm, word, context)

struct WittenBellInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
end

"""
    WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Interpolated Witten-Bell smoothing (Chen & Goodman, 1995).
"""
function WittenBellInterpolated(word, unk_cutoff=1, unk_label="<unk>")
    WittenBellInterpolated(Vocabulary(word, unk_cutoff, unk_label))
end

function (lm::WittenBellInterpolated)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

# (alpha, gamma) pair for Witten-Bell interpolation.
function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    s = float(sum(accum))
    alpha = 1 / s   # fallback when `word` was never seen after `context`
    for (text, count) in accum
        if text == word
            alpha = float(count) / s
            break
        end
    end
    gam = gamma(accum)
    return alpha * (1 - gam), gam
end

# Number of distinct continuations observed in `accum`.
count_non_zero_vals(accum::Accumulator) = length(accum)

function gamma(accum)
    nplus = count_non_zero_vals(accum)
    return nplus / (nplus + float(sum(accum)))
end

"""
    score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)

Interpolated P(word | context); recursively backs off to shorter contexts.
Applies Witten-Bell or Kneser-Ney weighting depending on the concrete subtype.
"""
function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
    if context === nothing || context == ""
        return prob(m, temp_lm, word, context)
    end
    if context in keys(temp_lm)
        alpha, gam = alpha_gammma(m, temp_lm, word, context)
        return alpha + gam * score(m, temp_lm, word, context_reduce(context))
    else
        return score(m, temp_lm, word, context_reduce(context))
    end
end

# Drop the leftmost token of a whitespace-joined context string.
function context_reduce(context)
    parts = split(context)
    return join(parts[2:end], " ")
end

struct KneserNeyInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
    discount::Float64
end

"""
    KneserNeyInterpolated(word::Vector{T}, discount=0.1, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}

Interpolated Kneser-Ney smoothing with absolute discounting (Chen & Goodman, 1995).
"""
function KneserNeyInterpolated(word, disc=0.1, unk_cutoff=1, unk_label="<unk>")
    KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), disc)
end

function (lm::KneserNeyInterpolated)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
end

# (alpha, gamma) pair for Kneser-Ney interpolation.
function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
    accum = templ_lm[context]
    s = float(sum(accum))
    alpha = 1 / length(m.vocab.vocab)   # fallback for unseen continuations
    for (text, count) in accum
        if text == word
            alpha = max(float(count) - m.discount, 0.0) / s
            break
        end
    end
    gam = m.discount * count_non_zero_vals(accum) / s
    # Bug fix: return the pair explicitly; callers destructure (alpha, gamma).
    return alpha, gam
end
# Tests for the src/LM language-model code.
using DataStructures

@testset "Vocabulary" begin
    words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"]
    vocab = Vocabulary(words, 2, "<unk>")

    @test vocab isa Vocabulary
    @test vocab.vocab isa Dict
    @test vocab.unk_cutoff isa Int
    @test vocab.unk_label isa String
    @test vocab.allword isa Array{String,1}

    # only 4 distinct tokens survive the cutoff-2 filter (incl. the unk label)
    @test length(vocab.vocab) == 4
    @test isequal(vocab.unk_cutoff, 2)
    @test vocab.unk_label == "<unk>"
    @test isequal(vocab.allword,
                  ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", "<unk>"])
    @test isequal(vocab.vocab,
                  Dict{String,Int}("<unk>" => 1, "c" => 3, "a" => 3, "d" => 2))

    # lookup maps out-of-vocabulary tokens to the unk label
    @test lookup(vocab, ["a", "b", "c", "alien"]) == ["a", "<unk>", "c", "<unk>"]

    # the unk label must not already be present in the corpus
    word_set = ["<unk>", "is", "already", "there"]
    @test_throws ErrorException Vocabulary(word_set, 1, "<unk>")
end

@testset "preprocessing" begin
    @testset "ngramizenew" begin
        sample_text = ["this", "is", "some", "sample", "text"]

        @test isequal(TextAnalysis.ngramizenew(sample_text, 1),
                      ["this", "is", "some", "sample", "text"])
        @test isequal(TextAnalysis.ngramizenew(sample_text, 2),
                      ["this is", "is some", "some sample", "sample text"])
        @test isequal(TextAnalysis.ngramizenew(sample_text, 1, 2),
                      ["this", "is", "some", "sample", "text",
                       "this is", "is some", "some sample", "sample text"])
    end

    @testset "Padding function" begin
        example = ["1", "2", "3", "4", "5"]
        padded = padding_ngram(example, 2, pad_left=true, pad_right=true)
        @test isequal(padded, ["<s> 1", "1 2", "2 3", "3 4", "4 5", "5 </s>"])
        # padding_ngram mutates its input vector in place
        @test isequal(example, ["<s>", "1", "2", "3", "4", "5", "</s>"])

        example = ["1", "2", "3", "4", "5"]
        padded = padding_ngram(example, 2, pad_right=true)
        @test isequal(padded, ["1 2", "2 3", "3 4", "4 5", "5 </s>"])
    end

    @testset "everygram function" begin
        example = ["1", "2", "3", "4", "5"]
        everyngms = everygram(example, min_len=1, max_len=2)
        @test isequal(everyngms, ["1", "2", "3", "4", "5", "1 2", "2 3", "3 4", "4 5"])
    end
end

@testset "counter" begin
    exam = ["To", "be", "or", "not", "to", "be", "To", "be", "or", "not", "to", "be"]
    fit = TextAnalysis.counter2(exam, 2, 2)
    @test fit isa DataStructures.DefaultDict
    @test length(fit) == 5  # number of distinct bigram contexts
    # (bug fix: removed a trailing bare `@test`, which errors at macro expansion
    # and prevented this file from loading)
end
+ +- **MLE** - Base Ngram model. +- **Lidstone **- Base Ngram model with Lidstone smoothing. +- **Laplace** - Base Ngram language model with Laplace smoothing. +- **WittenBellInterpolated** - Interpolated Version of witten-Bell algorithm. +- **KneserNeyInterpolated** - Interpolated version of Kneser -Ney smoothing. + +## APIs + +To use the API, we first *Instantiate* desired model and then load it with train set + +```julia +(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +Lidstone(word::Vector{T}, gamma:: Float64, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +Laplace(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +KneserNeyInterpolated(word::Vector{T}, discount:: Float64=0.1, unk_cutoff=1, unk_label="") where { T <: AbstractString} + +(lm::)(text, min::Integer, max::Integer) +``` +Arguments: + + * `word` : Array of strings to store vocabulary. + + * `unk_cutoff`: Tokens with counts greater than or equal to the cutoff value will be considered part of the vocabulary. 
+ + * `unk_label`: token for unkown labels + + * `gamma`: smoothing arugment gamma + + * `discount`: discouting factor for `KneserNeyInterpolated` + + for more information see docstrings of vocabulary + +```julia +julia> voc = ["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + +julia>train = ["khan","is","my","good", "friend","and","He","is","my","brother"] +# voc and train are used to train vocabulary and model respectively + +julia> model = MLE(voc) +MLE(Vocabulary(Dict("khan"=>1,"name"=>1,""=>1,"salman"=>1,"is"=>2,"Khan"=>1,"my"=>1,"he"=>1,"shahrukh"=>1,"and"=>1…), 1, "", ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan", ""])) +julia> print(voc) +11-element Array{String,1}: + "my" + "name" + "is" + "salman" + "khan" + "and" + "he" + "is" + "shahrukh" + "Khan" + "" +# you can see "" token is added to voc +julia> fit = model(train,2,2) #considering only bigrams +julia> unmaskedscore = score(model, fit, "is" ,"") #score output P(word | context) without replacing context word with "" +0.3333333333333333 +julia> masked_score = maskedscore(model,fit,"is","alien") +0.3333333333333333 +#as expected maskedscore is equivalent to unmaskedscore with context replaced with "" + +``` +!!! note + + When you call `MLE(voc)` for the first time, It will update your vocabulary set as well. + +## Evaluation Method + +- `score` + + used to evaluate probablity of word given context (*P(word | context)*) + + ```julia + score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString) + ``` + +​ In case of Lidstone and Laplace it apply smoothing and, + +​ In Interpolated language model, provide Kneserney and WittenBell smoothing + +- `maskedscore` + + It is used to evaluate *score* with masks out of vocabulary words + + The arguments are the same as for score + +- `logscore` + + Evaluate the log score of this word in this context. 
+ + The arguments are the same as for score and maskedscore + +- `entropy` + ```julia + entropy(m::Langmodel,lm::DefaultDict,text_ngram::word::Vector{T}) where { T <: AbstractString} + ``` + + Calculate cross-entropy of model for given evaluation text. + + Input text must be Array of ngram of same lengths + +- `perplexity` + + Calculates the perplexity of the given text. + + This is simply 2 ** cross-entropy(`entropy`) for the text, so the arguments are the same as `entropy`. \ No newline at end of file From d1d7417b5e03d1b1ea1c7f58065d7846d0dcd21b Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 04:37:21 +0530 Subject: [PATCH 27/51] adding Preprocessing in docs --- docs/src/LM.md | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index a735a7be..7768830b 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -113,4 +113,46 @@ julia> masked_score = maskedscore(model,fit,"is","alien") Calculates the perplexity of the given text. - This is simply 2 ** cross-entropy(`entropy`) for the text, so the arguments are the same as `entropy`. \ No newline at end of file + This is simply 2 ** cross-entropy(`entropy`) for the text, so the arguments are the same as `entropy`. + +## Preprocessing + + For Preprocessing following functions: + +1. `everygram`: Return all possible ngrams generated from sequence of items, as an Array{String,1} + + ```julia + julia> seq = ["To","be","or","not"] + julia> a = everygram(seq,min_len=1, max_len=-1) + 10-element Array{Any,1}: + "or" + "not" + "To" + "be" + "or not" + "be or" + "be or not" + "To be or" + "To be or not" + ``` + +2. 
`padding_ngrams`: padding _ngram is used to pad both left and right of sentence and out putting ngrmas of order n + + It also pad the original input Array of string + + ```julia + julia> example = ["1","2","3","4","5"] + + julia> example = ["1","2","3","4","5"] + julia> padding_ngrams(example,2,pad_left=true,pad_right=true) + 6-element Array{Any,1}: + " 1" + "1 2" + "2 3" + "3 4" + "4 5" + "5 " + ``` + + ​ + From 36ffa46ac0cad6931512dad890b4a5ad9710dcd4 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 05:08:23 +0530 Subject: [PATCH 28/51] adding docs for vocab --- docs/src/LM.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index 7768830b..3a32c07e 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -153,6 +153,28 @@ julia> masked_score = maskedscore(model,fit,"is","alien") "4 5" "5 " ``` +## Vocabulary - ​ +Struct to store Language models vocabulary + +checking membership and filters items by comparing their counts to a cutoff value + +It also Adds a special "unkown" tokens which unseen words are mapped to + +```julia +julia> words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"] +julia> vocabulary = Vocabulary(words, 2) + Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") + +# lookup a sequence or words in the vocabulary +julia> word = ["a", "-", "d", "c", "a"] + +julia> lookup(vocabulary ,word) + 5-element Array{Any,1}: + "a" + "" + "d" + "c" + "a" +``` From 8bb35aa8d267b55472f06d2d7f60a5cd9588af5f Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 22:14:01 +0530 Subject: [PATCH 29/51] updating preprocessing --- src/LM/preprocessing.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl index fd62de4f..24622002 100644 --- a/src/LM/preprocessing.jl +++ b/src/LM/preprocessing.jl @@ -22,7 +22,6 @@ julia> a = everygram(seq,min_len=1, max_len=-1) """ function everygram(seq::Vector{T}; 
min_len::Int=1, max_len::Int=-1) where { T <: AbstractString} - ngram = [] if max_len == -1 max_len = length(seq) @@ -88,8 +87,6 @@ julia> ngramizenew(seq ,2) ``` """ function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString} - #(length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words) - n_words = length(words) tokens = [] From b3bcd941ed3813f058e4652dd2e4e25f1812d094 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 22:14:21 +0530 Subject: [PATCH 30/51] bug fix in gamma model --- src/LM/langmodel.jl | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index b4973953..53b87ddd 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -30,18 +30,18 @@ end struct Lidstone <: gammamodel vocab::Vocabulary - gamma::Integer + gamma::Float64 end """ - Lidstone(word::Vector{T}, gamma:: Integer,unk_cutoff=1, unk_label="") where { T <: AbstractString} + Lidstone(word::Vector{T}, gamma:: Float64, unk_cutoff=1, unk_label="") where { T <: AbstractString} Function to initiate Type(Lidstone) for providing Lidstone-smoothed scores. In addition to initialization arguments from BaseNgramModel also requires a number by which to increase the counts, gamma. """ -function Lidstone(word, gamma,unk_cutoff=1, unk_label="") +function Lidstone(word, gamma = 1.0, unk_cutoff=1, unk_label="") Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma) end @@ -60,11 +60,11 @@ a number by which to increase the counts, gamma = 1. 
""" struct Laplace <: gammamodel vocab::Vocabulary - gamma::Integer + gamma::Float64 end function Laplace(word, unk_cutoff=1, unk_label="") - Lidstone(Vocabulary(word, unk_cutoff, unk_label), 1) + Laplace(Vocabulary(word, unk_cutoff, unk_label), 1.0) end function (lm::Laplace)(text, min::Integer, max::Integer) @@ -81,7 +81,7 @@ score is used to output probablity of word given that context Add-one smoothing to Lidstone or Laplace(gammamodel) models """ -function score(m::gammamodel, temp_lm, word, context) #score for gammamodel output probabl +function score(m::gammamodel, temp_lm::DefaultDict, word, context) #score for gammamodel output probabl accum = temp_lm[context] #print(accum) s = float(sum(accum)+(m.gamma)*length(m.vocab.vocab)) @@ -99,7 +99,7 @@ To get probability of word given that context In otherwords, for given context calculate frequency distribution of word """ -function prob(templ_lm::DefaultDict, word, context=nothing) +function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing) if context == nothing || context == "" return(1/float(length(templ_lm))) #provide distribution else @@ -111,6 +111,9 @@ function prob(templ_lm::DefaultDict, word, context=nothing) return(float(count) / s) end end + if context in keys(m.vocab.vocab) + return(0) + end return(Inf) end @@ -120,8 +123,8 @@ end score is used to output probablity of word given that context in MLE """ -function score(m::MLE, temp_lm, word, context = nothing) - prob(temp_lm, word, context) +function score(m::MLE, temp_lm::DefaultDict, word, context = nothing) + prob(m, temp_lm, word, context) end struct WittenBellInterpolated <: InterpolatedLanguageModel @@ -129,7 +132,7 @@ struct WittenBellInterpolated <: InterpolatedLanguageModel end """ - WittenBellInterpolated(word::Vector{T}, gamma:: Integer,unk_cutoff=1, unk_label="") where { T <: AbstractString} + WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} Initiate Type for providing 
Interpolated version of Witten-Bell smoothing. @@ -151,7 +154,7 @@ function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, co local gam accum = templ_lm[context] s = float(sum(accum)) - for (text, count) in accum + for (text,count) in accum if text == word alpha=(float(count) / s) break @@ -170,7 +173,7 @@ end function gamma(accum) nplus=count_non_zero_vals(accum) - return(nplus/(nplus + float(sum(accum)))) + return(nplus/(nplus+float(sum(accum)))) end """ @@ -182,9 +185,9 @@ Apply Kneserney and WittenBell smoothing depending upon the sub-Type """ -function score(m::InterpolatedLanguageModel,temp_lm::DefaultDict,word,context=nothing) +function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing) if context == nothing || context == "" - return prob(temp_lm, word, context) + return prob(m, temp_lm, word, context) end if context in keys(temp_lm) alpha,gamma = alpha_gammma(m, temp_lm, word, context) @@ -213,32 +216,31 @@ Initiate Type for providing KneserNey Interpolated language model. The idea to abstract this comes from Chen & Goodman 1995. 
""" -function KneserNeyInterpolated(word, gamma, unk_cutoff=1 , unk_label="") - KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), gamma) +function KneserNeyInterpolated(word, disc = 0.1, unk_cutoff=1, unk_label="") + KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label) ,disc) end function (lm::KneserNeyInterpolated)(text, min::Integer, max::Integer) text = lookup(lm.vocab, text) - text = convert(Array{String}, text) + text=convert(Array{String}, text) return counter2(text, min, max) end # alpha_gamma function for KneserNeyInterpolated -function alpha_gammma(m::KneserNeyInterpolated,templ_lm::DefaultDict, word,context) +function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context) local alpha local gamma accum = templ_lm[context] s = float(sum(accum)) for (text, count) in accum if text == word - alpha=(max(float(count)-m.discount, 0.0) /s) + alpha=(max(float(count)-m.discount, 0.0) / s) break else alpha = 1/length(m.vocab.vocab) end end - gamma = (m.discount * count_non_zero_vals(accum)/s) + gamma = (m.discount * count_non_zero_vals(accum) /s) return alpha, gamma end - From b5408f157066348d956dd80dca63382f78d9cc4a Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sat, 6 Jun 2020 22:14:51 +0530 Subject: [PATCH 31/51] updating coding style --- src/LM/vocab.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl index 6f9d250f..828beaa1 100644 --- a/src/LM/vocab.jl +++ b/src/LM/vocab.jl @@ -9,7 +9,7 @@ Adds a special "unknown" token which unseen words are mapped to. 
# Example ```julia-repl -julia> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] +julia> words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"] julia> vocabulary = Vocabulary(words, 2) Vocabulary(Dict(""=>1,"c"=>3,"a"=>3,"d"=>2), 2, "") @@ -75,7 +75,7 @@ unk_cutoff::Int unk_label::String allword::Array{String,1} end -function Vocabulary(word, unk_cutoff=1, unk_label="") +function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} if unk_label in word error("unk_label is in vocab") else @@ -100,7 +100,7 @@ lookup a sequence or words in the vocabulary Return an Array of String """ -function lookup(voc::Vocabulary,word) +function lookup(voc::Vocabulary,word::Vector{T}) where { T <: AbstractString} look = [] for w in word if w in keys(voc.vocab) From 9c5230d539b6a4816802c7f2bf20f96372dcd5ae Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 06:30:20 +0530 Subject: [PATCH 32/51] updating for Lm --- docs/make.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/make.jl b/docs/make.jl index 93a1b9c0..6beab2fe 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,6 +20,7 @@ makedocs( "Conditional Random Fields" => "crf.md", "Named Entity Recognition" => "ner.md", "ULMFiT" => "ULMFiT.md", + "statistical Language Model" => "LM.md" "API References" => "APIReference.md" ], ) From 37025af49221110a9bed2df4ec579a6fbc674dff Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 06:30:49 +0530 Subject: [PATCH 33/51] test for Langmodels --- test/LM.jl | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/test/LM.jl b/test/LM.jl index 71e16b73..710680cd 100644 --- a/test/LM.jl +++ b/test/LM.jl @@ -15,7 +15,9 @@ using DataStructures @test isequal(vocab.allword ,["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", ""]) @test isequal(vocab.vocab, Dict{String,Int}(""=>1,"c"=>3,"a"=>3,"d"=>2)) #to check lookup function - @test 
lookup(vocab,["a","b","c","alien"]) == ["a", "", "c", ""] + @test lookup(vocab,["a","b","c","alien"]) == ["a", "", "c", ""] + word_set = ["","is","already","there"] + @test_throws ErrorException Vocabulary(word_set, 1, "") end @testset "preprocessing" begin @@ -54,6 +56,52 @@ end fit = (TextAnalysis.counter2(exam,2,2)) @test fit isa DataStructures.DefaultDict @test length(fit) == 5 #length of unique words - @test end + +@testset "language model" begin + + @testset "MLE" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan","is","my","good", "friend","and","He","is","my","brother"] + model = MLE(voc) + fit = model(train, 2, 2) #considering only bigrams + unmaskedscore = score(model, fit, "is" ,"") + @test unmaskedscore == 0.3333333333333333 + @test score(model, fit, "is", "alien") == Inf #context not in vocabulary + @test score(model, fit, "alien", "is") == 0 # word not in vocabulary + end + @testset "Lidstone" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] + model2 = Lidstone(voc, 1.0) + fit = model2(train,2,2) + @test score(model2, fit,"is", "alien") == 0.1 + @test score(model2, fit, "alien", "is") >= 0 + end + @testset "Laplace" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] + model3 = Laplace(voc) + fit2 = model3(train,2,2) + @test score(model3, fit2,"is", "alien") == 0.1 + end + @testset "WittenBellInterpolated" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] + model = WittenBellInterpolated(voc) + fit = model(train,2,2) + @test score(model, fit,"is", "alien") == 0.2 + @test score(model,fit, "alien", "is") == 0.4 + @test score(model,fit,"alien") == 0.2 #should be 
non-zero + end + @testset "KneserNeyInterpolated" begin + voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] + model = KneserNeyInterpolated(voc,0.1) + fit = model(train,2,2) + @test score(model, fit,"is", "alie") == 0.2 + @test score(model,fit, "alien", "is") == 0.11000000000000001 + end +end + From 4d4b9eba6de1e448c23acc18b18ff3370505e93a Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Sun, 7 Jun 2020 13:08:32 +0530 Subject: [PATCH 34/51] Update docs/src/LM.md Co-authored-by: Ayush Kaushal --- docs/src/LM.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index 3a32c07e..f0b1e3c6 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -3,7 +3,7 @@ **TextAnalysis** provide following different Language Models - **MLE** - Base Ngram model. -- **Lidstone **- Base Ngram model with Lidstone smoothing. +- **Lidstone** - Base Ngram model with Lidstone smoothing. - **Laplace** - Base Ngram language model with Laplace smoothing. - **WittenBellInterpolated** - Interpolated Version of witten-Bell algorithm. - **KneserNeyInterpolated** - Interpolated version of Kneser -Ney smoothing. 
@@ -177,4 +177,3 @@ julia> lookup(vocabulary ,word) "c" "a" ``` - From c979bceb6ee8796b077a2862fa50a24732294323 Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Sun, 7 Jun 2020 13:08:45 +0530 Subject: [PATCH 35/51] Update docs/src/LM.md Co-authored-by: Ayush Kaushal --- docs/src/LM.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index f0b1e3c6..a810c5dd 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -35,7 +35,7 @@ Arguments: * `gamma`: smoothing arugment gamma - * `discount`: discouting factor for `KneserNeyInterpolated` + * `discount`: discounting factor for `KneserNeyInterpolated` for more information see docstrings of vocabulary From 6cdfea019d5d66bf2a27b54e158faef8b4345dc6 Mon Sep 17 00:00:00 2001 From: Tejas Vaidhya <39345998+tejasvaidhyadev@users.noreply.github.com> Date: Sun, 7 Jun 2020 13:08:55 +0530 Subject: [PATCH 36/51] Update docs/src/LM.md Co-authored-by: Ayush Kaushal --- docs/src/LM.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index a810c5dd..d76b5ce3 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -42,7 +42,7 @@ Arguments: ```julia julia> voc = ["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] -julia>train = ["khan","is","my","good", "friend","and","He","is","my","brother"] +julia> train = ["khan","is","my","good", "friend","and","He","is","my","brother"] # voc and train are used to train vocabulary and model respectively julia> model = MLE(voc) From e566904c91a5315b118df50599e30a95256e7aa1 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:47:44 +0530 Subject: [PATCH 37/51] updating typos --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 6beab2fe..22fffb1e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,7 +20,7 @@ makedocs( "Conditional Random Fields" => "crf.md", "Named 
Entity Recognition" => "ner.md", "ULMFiT" => "ULMFiT.md", - "statistical Language Model" => "LM.md" + "Statistical Language Model" => "LM.md" "API References" => "APIReference.md" ], ) From 16a254651dd163e76f7c3a66099cced66959782a Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:48:23 +0530 Subject: [PATCH 38/51] updating docs typo and errors --- docs/src/LM.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index d76b5ce3..2b529678 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -76,31 +76,31 @@ julia> masked_score = maskedscore(model,fit,"is","alien") ## Evaluation Method -- `score` +### `score` - used to evaluate probablity of word given context (*P(word | context)*) + used to evaluate the probability of word given context (*P(word | context)*) - ```julia +```julia score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString) - ``` +``` ​ In case of Lidstone and Laplace it apply smoothing and, ​ In Interpolated language model, provide Kneserney and WittenBell smoothing -- `maskedscore` +### `maskedscore` It is used to evaluate *score* with masks out of vocabulary words The arguments are the same as for score -- `logscore` +### `logscore` Evaluate the log score of this word in this context. The arguments are the same as for score and maskedscore -- `entropy` +### `entropy` ```julia entropy(m::Langmodel,lm::DefaultDict,text_ngram::word::Vector{T}) where { T <: AbstractString} ``` @@ -109,7 +109,7 @@ julia> masked_score = maskedscore(model,fit,"is","alien") Input text must be Array of ngram of same lengths -- `perplexity` +### `perplexity` Calculates the perplexity of the given text. @@ -121,7 +121,7 @@ julia> masked_score = maskedscore(model,fit,"is","alien") 1. 
`everygram`: Return all possible ngrams generated from sequence of items, as an Array{String,1} - ```julia + ```julia julia> seq = ["To","be","or","not"] julia> a = everygram(seq,min_len=1, max_len=-1) 10-element Array{Any,1}: @@ -134,13 +134,13 @@ julia> masked_score = maskedscore(model,fit,"is","alien") "be or not" "To be or" "To be or not" - ``` + ``` 2. `padding_ngrams`: padding _ngram is used to pad both left and right of sentence and out putting ngrmas of order n It also pad the original input Array of string - ```julia + ```julia julia> example = ["1","2","3","4","5"] julia> example = ["1","2","3","4","5"] @@ -152,7 +152,7 @@ julia> masked_score = maskedscore(model,fit,"is","alien") "3 4" "4 5" "5 " - ``` + ``` ## Vocabulary Struct to store Language models vocabulary From 8481cfb002fb3a29853b0dccb08355e8789fb9ad Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:49:32 +0530 Subject: [PATCH 39/51] updating coding style --- src/LM/api.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/LM/api.jl b/src/LM/api.jl index 2936eb06..1fa9115b 100644 --- a/src/LM/api.jl +++ b/src/LM/api.jl @@ -1,23 +1,23 @@ #TO DO # Doc string function maskedscore(m::Langmodel,temp_lm::DefaultDict,word,context) - score(m,temp_lm,lookup(m.vocab ,[word])[1] ,lookup(m.vocab ,[context])[1]) + score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1]) end -function logscore(m::Langmodel,temp_lm::DefaultDict,word,context) - log2(maskedscore(m,temp_lm,word,context)) +function logscore(m::Langmodel, temp_lm::DefaultDict, word, context) + log2(maskedscore(m, temp_lm, word, context)) end -function entropy(m::Langmodel,lm::DefaultDict,text_ngram) +function entropy(m::Langmodel, lm::DefaultDict, text_ngram) local log_set=Float64[] for ngram in text_ngram ngram = split(ngram) - push!(log_set,logscore(m,lm,ngram[end],join(ngram[1:end-1]," "))) + push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " "))) 
#println(logscore(m,lm,ngram[end],ngram[1:end-1])) end return(sum(log_set)/length(log_set)) end -function perplexity(m::Langmodel,lm::DefaultDict,text_ngram) - return(2^(entropy(m,lm,text_ngram))) +function perplexity(m::Langmodel, lm::DefaultDict, text_ngram) + return(2^(entropy(m, lm, text_ngram))) end From 9dc882c3d51a627804a0538b86979a0de6719ea9 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:50:02 +0530 Subject: [PATCH 40/51] updating doc typo --- src/LM/langmodel.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl index 53b87ddd..31bdafd0 100644 --- a/src/LM/langmodel.jl +++ b/src/LM/langmodel.jl @@ -123,7 +123,7 @@ end score is used to output probablity of word given that context in MLE """ -function score(m::MLE, temp_lm::DefaultDict, word, context = nothing) +function score(m::MLE, temp_lm::DefaultDict, word, context=nothing) prob(m, temp_lm, word, context) end @@ -231,13 +231,13 @@ function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, con local gamma accum = templ_lm[context] s = float(sum(accum)) - for (text, count) in accum - if text == word - alpha=(max(float(count)-m.discount, 0.0) / s) - break - else - alpha = 1/length(m.vocab.vocab) - end + for (text, count) in accum + if text == word + alpha=(max(float(count)-m.discount, 0.0) / s) + break + else + alpha = 1/length(m.vocab.vocab) + end end gamma = (m.discount * count_non_zero_vals(accum) /s) return alpha, gamma From 723103cf392746ada9fcaaaa25196037538ddf89 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:50:42 +0530 Subject: [PATCH 41/51] adding corrected docs --- src/LM/preprocessing.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl index 24622002..e240e53f 100644 --- a/src/LM/preprocessing.jl +++ b/src/LM/preprocessing.jl @@ -43,8 +43,7 @@ padding _ngram is used to pad both left 
and right of sentence and out putting ng # Example ```julia-repl julia> example = ["1","2","3","4","5"] - -julia> example = ["1","2","3","4","5"] + julia> padding_ngrams(example,2,pad_left=true,pad_right=true) 6-element Array{Any,1}: " 1" @@ -55,7 +54,7 @@ julia> padding_ngrams(example,2,pad_left=true,pad_right=true) "5 " ``` """ -function padding_ngram(word::Vector{T}, n=1; pad_left=false,pad_right=false ,left_pad_symbol="", right_pad_symbol ="") where { T <: AbstractString} +function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="", right_pad_symbol ="") where { T <: AbstractString} local seq seq = word if pad_left == true @@ -64,7 +63,7 @@ function padding_ngram(word::Vector{T}, n=1; pad_left=false,pad_right=false ,lef if pad_right == true push!(seq, right_pad_symbol) end - return ngramizenew(seq,n) + return ngramizenew(seq, n) end """ From b14cc743a35cf6f4f8cc741f852a44d3c4a87e40 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 21:51:31 +0530 Subject: [PATCH 42/51] updating coding style in vocab.jl --- src/LM/vocab.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl index 828beaa1..7ae1e1f0 100644 --- a/src/LM/vocab.jl +++ b/src/LM/vocab.jl @@ -70,29 +70,29 @@ julia> vocabulary.vocab["b"] ``` """ mutable struct Vocabulary -vocab::Dict{String,Int64} +vocab::Dict{String, Int64} unk_cutoff::Int unk_label::String -allword::Array{String,1} +allword::Array{String, 1} end function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} if unk_label in word error("unk_label is in vocab") else - word= push!(word,unk_label) + word= push!(word, unk_label) end vocab = countmap(word) for value in vocab if value[2] Date: Sun, 7 Jun 2020 21:53:37 +0530 Subject: [PATCH 43/51] Update src/LM/vocab.jl Co-authored-by: Ayush Kaushal --- src/LM/vocab.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/LM/vocab.jl b/src/LM/vocab.jl index 7ae1e1f0..949a469d 100644 --- a/src/LM/vocab.jl +++ b/src/LM/vocab.jl @@ -75,6 +75,7 @@ unk_cutoff::Int unk_label::String allword::Array{String, 1} end + function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} if unk_label in word error("unk_label is in vocab") @@ -112,4 +113,3 @@ function lookup(voc::Vocabulary, word::Vector{T}) where { T <: AbstractString} return look end - From feb2d46b307b245eb7de6484b99f056426805110 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 22:10:24 +0530 Subject: [PATCH 44/51] updating docs --- docs/src/LM.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index 2b529678..0e19e630 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -13,7 +13,7 @@ To use the API, we first *Instantiate* desired model and then load it with train set ```julia -(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} +MLE(word::Vector{T}, unk_cutoff=1, unk_label="") where { T <: AbstractString} Lidstone(word::Vector{T}, gamma:: Float64, unk_cutoff=1, unk_label="") where { T <: AbstractString} @@ -70,6 +70,7 @@ julia> masked_score = maskedscore(model,fit,"is","alien") #as expected maskedscore is equivalent to unmaskedscore with context replaced with "" ``` + !!! note When you call `MLE(voc)` for the first time, It will update your vocabulary set as well. @@ -84,6 +85,13 @@ julia> masked_score = maskedscore(model,fit,"is","alien") score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString) ``` +Arguments: + +1. `m` : Instance of `Langmodel` struct. +2. `temp_lm`: output of function call of instance of `Langmodel`. +3. `word`: string of word +4. 
`context`: context of given word + ​ In case of Lidstone and Laplace it apply smoothing and, ​ In Interpolated language model, provide Kneserney and WittenBell smoothing From 995581b1ff87b0501962a4796f49205eea9845d9 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 22:16:21 +0530 Subject: [PATCH 45/51] updating typo --- docs/src/LM.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/src/LM.md b/docs/src/LM.md index 0e19e630..6500ee89 100644 --- a/docs/src/LM.md +++ b/docs/src/LM.md @@ -149,8 +149,6 @@ Arguments: It also pad the original input Array of string ```julia - julia> example = ["1","2","3","4","5"] - julia> example = ["1","2","3","4","5"] julia> padding_ngrams(example,2,pad_left=true,pad_right=true) 6-element Array{Any,1}: From cac50d6d36f3066c96266a66e0eb8b84b5c8529d Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 22:34:54 +0530 Subject: [PATCH 46/51] updating dep for statsbase --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index e17f55b5..57c85376 100644 --- a/Project.toml +++ b/Project.toml @@ -20,6 +20,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" From e2345902fe95c5a60268afbe69256116743e1619 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Sun, 7 Jun 2020 22:44:43 +0530 Subject: [PATCH 47/51] updating Manifest --- Manifest.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 358163f2..a900c5f0 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -471,5 +471,4 @@ git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d" uuid = 
"a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" version = "0.8.3" -[[StatsBase]] -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" + From 72c4cb040607ee9787d81dd2958121396d48b80f Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Mon, 8 Jun 2020 03:34:58 +0530 Subject: [PATCH 48/51] updating .toml file --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index 57c85376..3f62e755 100644 --- a/Project.toml +++ b/Project.toml @@ -23,7 +23,6 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" -StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] Flux = "< 0.10" From 12fd3c6f6bff81161bf403807552ef70524ebce0 Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Mon, 8 Jun 2020 03:50:47 +0530 Subject: [PATCH 49/51] bug fix --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 22fffb1e..7ab14a36 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,7 +20,7 @@ makedocs( "Conditional Random Fields" => "crf.md", "Named Entity Recognition" => "ner.md", "ULMFiT" => "ULMFiT.md", - "Statistical Language Model" => "LM.md" + "Statistical Language Model" => "LM.md", "API References" => "APIReference.md" ], ) From d089d519d0b4de2f785866c23ab9dce776a2d8ca Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Mon, 8 Jun 2020 04:30:15 +0530 Subject: [PATCH 50/51] indentation --- src/LM/api.jl | 2 +- src/TextAnalysis.jl | 2 +- test/LM.jl | 48 ++++++++++++++++++++++----------------------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/LM/api.jl b/src/LM/api.jl index 1fa9115b..55a542c7 100644 --- a/src/LM/api.jl +++ b/src/LM/api.jl @@ -1,6 +1,6 @@ #TO DO # Doc string -function maskedscore(m::Langmodel,temp_lm::DefaultDict,word,context) +function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context) score(m, 
temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1]) end diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 01550ce9..23ef05e6 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -67,7 +67,7 @@ module TextAnalysis export NERTagger, PoSTagger, Tracker, Flux - export Vocabulary,lookup,update + export Vocabulary, lookup, update export everygram, padding_ngram export maskedscore, logscore, entropy, perplexity export MLE, Lidstone, Laplace, WittenBellInterpolated, KneserNeyInterpolated, score diff --git a/test/LM.jl b/test/LM.jl index 710680cd..486de598 100644 --- a/test/LM.jl +++ b/test/LM.jl @@ -15,15 +15,15 @@ using DataStructures @test isequal(vocab.allword ,["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", ""]) @test isequal(vocab.vocab, Dict{String,Int}(""=>1,"c"=>3,"a"=>3,"d"=>2)) #to check lookup function - @test lookup(vocab,["a","b","c","alien"]) == ["a", "", "c", ""] - word_set = ["","is","already","there"] + @test lookup(vocab,["a", "b", "c", "alien"]) == ["a", "", "c", ""] + word_set = ["", "is", "already", "there"] @test_throws ErrorException Vocabulary(word_set, 1, "") end @testset "preprocessing" begin @testset "ngramizenew" begin sample_text = ["this", "is", "some", "sample", "text"] - ngrams = TextAnalysis.ngramizenew(sample_text,1) + ngrams = TextAnalysis.ngramizenew(sample_text, 1) @test isequal(ngrams, ["this", "is", "some", "sample", "text"]) @@ -35,25 +35,25 @@ end end @testset "Padding function" begin - example = ["1","2","3","4","5"] - padded=padding_ngram(example,2,pad_left=true,pad_right=true) + example = ["1", "2", "3", "4", "5"] + padded=padding_ngram(example, 2, pad_left=true, pad_right=true) @test isequal(padded,[" 1", "1 2", "2 3", "3 4", "4 5", "5 "]) - @test isequal(example, ["","1","2","3","4","5",""]) + @test isequal(example, ["", "1", "2", "3", "4", "5", ""]) - example = ["1","2","3","4","5"] #if used - padded=padding_ngram(example,2,pad_right=true) + example = ["1", "2", "3", "4", "5"] #if 
used + padded=padding_ngram(example, 2, pad_right=true) @test isequal(padded,["1 2", "2 3", "3 4", "4 5", "5 "]) end @testset "everygram function" begin - example = ["1","2","3","4","5"] - everyngms = everygram(example,min_len=1,max_len=2) + example = ["1", "2", "3", "4", "5"] + everyngms = everygram(example, min_len=1, max_len=2) @test isequal(everyngms, ["1", "2", "3", "4", "5", "1 2", "2 3", "3 4", "4 5"]) end end @testset "counter" begin exam = ["To", "be", "or", "not", "to", "be","To", "be", "or", "not", "to", "be"] - fit = (TextAnalysis.counter2(exam,2,2)) + fit = (TextAnalysis.counter2(exam, 2, 2)) @test fit isa DataStructures.DefaultDict @test length(fit) == 5 #length of unique words end @@ -61,8 +61,8 @@ end @testset "language model" begin @testset "MLE" begin - voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] - train = ["khan","is","my","good", "friend","and","He","is","my","brother"] + voc =["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] + train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model = MLE(voc) fit = model(train, 2, 2) #considering only bigrams unmaskedscore = score(model, fit, "is" ,"") @@ -75,7 +75,7 @@ end voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model2 = Lidstone(voc, 1.0) - fit = model2(train,2,2) + fit = model2(train, 2, 2) @test score(model2, fit,"is", "alien") == 0.1 @test score(model2, fit, "alien", "is") >= 0 end @@ -83,25 +83,25 @@ end voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model3 = Laplace(voc) - fit2 = model3(train,2,2) + fit2 = model3(train, 2, 2) @test score(model3, fit2,"is", "alien") == 0.1 end @testset "WittenBellInterpolated" begin - voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + 
voc =["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] model = WittenBellInterpolated(voc) - fit = model(train,2,2) + fit = model(train, 2, 2) @test score(model, fit,"is", "alien") == 0.2 - @test score(model,fit, "alien", "is") == 0.4 - @test score(model,fit,"alien") == 0.2 #should be non-zero + @test score(model, fit, "alien", "is") == 0.4 + @test score(model, fit,"alien") == 0.2 #should be non-zero end @testset "KneserNeyInterpolated" begin - voc =["my","name","is","salman","khan","and","he","is","shahrukh","Khan"] + voc =["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"] train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"] - model = KneserNeyInterpolated(voc,0.1) - fit = model(train,2,2) - @test score(model, fit,"is", "alie") == 0.2 - @test score(model,fit, "alien", "is") == 0.11000000000000001 + model = KneserNeyInterpolated(voc, 0.1) + fit = model(train, 2, 2) + @test score(model, fit, "is", "alie") == 0.2 + @test score(model, fit, "alien", "is") == 0.11000000000000001 end end From 85f93a8e392e0c918f5855123689a4b93de5384d Mon Sep 17 00:00:00 2001 From: tejasvaidhyadev Date: Mon, 8 Jun 2020 19:29:04 +0530 Subject: [PATCH 51/51] updated using --- src/LM/counter.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/LM/counter.jl b/src/LM/counter.jl index dd595b71..4ec876e7 100644 --- a/src/LM/counter.jl +++ b/src/LM/counter.jl @@ -1,7 +1,4 @@ using DataStructures -import DataStructures.Accumulator -import DataStructures.DefaultDict -import DataStructures.counter """ counter is used to make conditional distribution, which is used by score functions to