diff --git a/Manifest.toml b/Manifest.toml
index 187c33a1..a900c5f0 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -470,3 +470,5 @@ deps = ["BinaryProvider", "Libdl", "Printf"]
 git-tree-sha1 = "580ce62b6c14244916cc28ad54f8a2e2886f843d"
 uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 version = "0.8.3"
+
+
diff --git a/Project.toml b/Project.toml
index 02bcd701..3f62e755 100644
--- a/Project.toml
+++ b/Project.toml
@@ -20,6 +20,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
 WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
diff --git a/docs/make.jl b/docs/make.jl
index 93a1b9c0..7ab14a36 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -20,6 +20,7 @@ makedocs(
         "Conditional Random Fields" => "crf.md",
         "Named Entity Recognition" => "ner.md",
         "ULMFiT" => "ULMFiT.md",
+        "Statistical Language Model" => "LM.md",
         "API References" => "APIReference.md"
     ],
 )
diff --git a/docs/src/LM.md b/docs/src/LM.md
new file mode 100644
index 00000000..6500ee89
--- /dev/null
+++ b/docs/src/LM.md
@@ -0,0 +1,185 @@
+# Statistical Language Model
+
+**TextAnalysis** provides the following language models:
+
+- **MLE** - Base ngram model.
+- **Lidstone** - Base ngram model with Lidstone smoothing.
+- **Laplace** - Base ngram language model with Laplace smoothing.
+- **WittenBellInterpolated** - Interpolated version of the Witten-Bell algorithm.
+- **KneserNeyInterpolated** - Interpolated version of Kneser-Ney smoothing.
+
+## APIs
+
+To use the API, we first *instantiate* the desired model and then fit it on a training set:
+
+```julia
+MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Lidstone(word::Vector{T}, gamma::Float64, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+KneserNeyInterpolated(word::Vector{T}, discount::Float64=0.1, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+(lm::Langmodel)(text, min::Integer, max::Integer)
+```
+
+Arguments:
+
+* `word`: array of strings used to build the vocabulary.
+
+* `unk_cutoff`: tokens with counts greater than or equal to the cutoff value will be considered part of the vocabulary.
+
+* `unk_label`: token used for unknown (out-of-vocabulary) words, `"<unk>"` by default.
+
+* `gamma`: smoothing argument gamma (for `Lidstone`).
+
+* `discount`: discounting factor for `KneserNeyInterpolated`.
+
+For more information see the docstrings of `Vocabulary`.
+
+```julia
+julia> voc = ["my","name","is","salman","khan","and","he","is","shahrukh","Khan"]
+
+julia> train = ["khan","is","my","good","friend","and","He","is","my","brother"]
+# voc and train are used to train the vocabulary and the model respectively
+
+julia> model = MLE(voc)
+MLE(Vocabulary(Dict("khan"=>1,"name"=>1,"<unk>"=>1,"salman"=>1,"is"=>2,"Khan"=>1,"my"=>1,"he"=>1,"shahrukh"=>1,"and"=>1…), 1, "<unk>", ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan", "<unk>"]))
+
+julia> print(voc)
+11-element Array{String,1}:
+ "my"
+ "name"
+ "is"
+ "salman"
+ "khan"
+ "and"
+ "he"
+ "is"
+ "shahrukh"
+ "Khan"
+ "<unk>"
+# note that the "<unk>" token has been added to voc
+
+julia> fit = model(train,2,2) # considering only bigrams
+
+julia> unmaskedscore = score(model, fit, "is", "<unk>") # P(word | context) without replacing the context word with "<unk>"
+0.3333333333333333
+
+julia> masked_score = maskedscore(model,fit,"is","alien")
+0.3333333333333333
+# as expected, maskedscore is equivalent to unmaskedscore with the unseen context replaced by "<unk>"
+```
+
+!!! note
+
+    When you call `MLE(voc)` for the first time, it will update your vocabulary set as well.
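+
+The smoothed models are instantiated the same way. Below is a minimal sketch with
+`Lidstone` (the gamma value is an arbitrary illustration; note that constructing a
+model appends `"<unk>"` to the vocabulary vector, so a fresh vector is used here):
+
+```julia
+julia> voc2 = ["my","name","is","salman","khan","and","he","is","shahrukh","Khan"]
+
+julia> lid_model = Lidstone(voc2, 2.0)  # additive smoothing with gamma = 2.0
+
+julia> lid_fit = lid_model(train, 2, 2) # fit bigrams, just like MLE
+
+julia> score(lid_model, lid_fit, "is", "my") # smoothed P("is" | "my")
+```
+
+`Laplace`, `WittenBellInterpolated` and `KneserNeyInterpolated` follow the same pattern,
+with `KneserNeyInterpolated` additionally taking a `discount` argument.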
+
+## Evaluation Method
+
+### `score`
+
+Used to evaluate the probability of a word given its context (*P(word | context)*).
+
+```julia
+score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString)
+```
+
+Arguments:
+
+1. `m` : Instance of a `Langmodel` struct.
+2. `temp_lm`: the fitted model, i.e. the output of calling the `Langmodel` instance on the training text.
+3. `word`: the word to score.
+4. `context`: the context of the given word.
+
+For `Lidstone` and `Laplace` models, additive smoothing is applied; for the interpolated
+language models, Kneser-Ney or Witten-Bell smoothing is used.
+
+### `maskedscore`
+
+Evaluates *score* with out-of-vocabulary words masked by the unknown label.
+
+The arguments are the same as for `score`.
+
+### `logscore`
+
+Evaluates the base-2 log score of a word in the given context.
+
+The arguments are the same as for `score` and `maskedscore`.
+
+### `entropy`
+
+```julia
+entropy(m::Langmodel, lm::DefaultDict, text_ngram::Vector{T}) where { T <: AbstractString}
+```
+
+Calculates the cross-entropy of the model for the given evaluation text.
+
+The input text must be an array of ngrams of the same order.
+
+### `perplexity`
+
+Calculates the perplexity of the given text.
+
+This is simply 2^cross-entropy (see `entropy`) for the text, so the arguments are the same as for `entropy`.
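+
+Putting these together on the bigram model fitted above, a usage sketch (no outputs
+are shown, since the exact values depend on the fitted model):
+
+```julia
+julia> logscore(model, fit, "my", "is")    # log2 P("my" | "is")
+
+julia> test_ngrams = ["khan is", "is my"]  # evaluation bigrams, same order as `fit`
+
+julia> entropy(model, fit, test_ngrams)    # cross-entropy over the evaluation bigrams
+
+julia> perplexity(model, fit, test_ngrams) # equals 2^entropy
+```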
+
+## Preprocessing
+
+The following functions are provided for preprocessing:
+
+1. `everygram`: Return all possible ngrams generated from a sequence of items, as an Array{String,1}.
+
+   ```julia
+   julia> seq = ["To","be","or","not"]
+   julia> a = everygram(seq,min_len=1, max_len=-1)
+   10-element Array{Any,1}:
+    "To"
+    "be"
+    "or"
+    "not"
+    "To be"
+    "be or"
+    "or not"
+    "To be or"
+    "be or not"
+    "To be or not"
+   ```
+
+2. `padding_ngram`: used to pad both the left and right of a sentence and to output ngrams of order n.
+
+   It also pads the original input Array of strings in place.
+
+   ```julia
+   julia> example = ["1","2","3","4","5"]
+   julia> padding_ngram(example,2,pad_left=true,pad_right=true)
+   6-element Array{Any,1}:
+    "<s> 1"
+    "1 2"
+    "2 3"
+    "3 4"
+    "4 5"
+    "5 </s>"
+   ```
+
+## Vocabulary
+
+Struct to store the language model's vocabulary.
+
+It checks membership and filters items by comparing their counts to a cutoff value.
+
+It also adds a special "unknown" token which unseen words are mapped to.
+
+```julia
+julia> words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"]
+
+julia> vocabulary = Vocabulary(words, 2)
+Vocabulary(Dict("<unk>"=>1,"c"=>3,"a"=>3,"d"=>2), 2, "<unk>")
+
+# look up a sequence of words in the vocabulary
+julia> word = ["a", "-", "d", "c", "a"]
+
+julia> lookup(vocabulary, word)
+5-element Array{Any,1}:
+ "a"
+ "<unk>"
+ "d"
+ "c"
+ "a"
+```
diff --git a/src/LM/api.jl b/src/LM/api.jl
new file mode 100644
index 00000000..55a542c7
--- /dev/null
+++ b/src/LM/api.jl
@@ -0,0 +1,23 @@
+# TODO: add docstrings
+function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)
+    score(m, temp_lm, lookup(m.vocab, [word])[1], lookup(m.vocab, [context])[1])
+end
+
+function logscore(m::Langmodel, temp_lm::DefaultDict, word, context)
+    log2(maskedscore(m, temp_lm, word, context))
+end
+
+function entropy(m::Langmodel, lm::DefaultDict, text_ngram)
+    local log_set = Float64[]
+    for ngram in text_ngram
+        ngram = split(ngram)
+        push!(log_set, logscore(m, lm, ngram[end], join(ngram[1:end-1], " ")))
+    end
+    # cross-entropy is the negative mean log2 probability of the evaluation ngrams
+    return -(sum(log_set) / length(log_set))
+end
+
+function perplexity(m::Langmodel, lm::DefaultDict, text_ngram)
+    return 2^(entropy(m, lm, text_ngram))
+end
diff --git a/src/LM/counter.jl b/src/LM/counter.jl
new file mode 100644
index 00000000..4ec876e7
--- /dev/null
+++ b/src/LM/counter.jl
@@ -0,0 +1,17 @@
+using DataStructures
+
+"""
+    counter2(data, min::Integer, max::Integer)
+
+Build the conditional frequency distribution used by the score functions: for every
+ngram of `data` with order between `min` and `max`, count how often each word follows
+each context.
+"""
+function counter2(data, min::Integer, max::Integer)
+    data = everygram(data, min_len=min, max_len=max)
+    data = split.(data)
+    temp_lm = DefaultDict{SubString{String}, Accumulator{String,Int64}}(counter(SubString{String}))
+    for i in 1:length(data)
+        history, word = data[i][1:end-1], data[i][end]
+        temp_lm[join(history, " ")][word] += 1
+    end
+    return temp_lm
+end
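+
+# Sketch of the returned structure (illustrative, not actual REPL output):
+# counter2(["a", "b", "a", "b"], 2, 2) yields a DefaultDict mapping each context to an
+# Accumulator of the words that follow it, e.g.
+#   "a" => Accumulator("b" => 2)
+#   "b" => Accumulator("a" => 1)
+# so that `temp_lm[context][word]` is the raw count of `word` appearing after `context`.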
diff --git a/src/LM/langmodel.jl b/src/LM/langmodel.jl
new file mode 100644
index 00000000..31bdafd0
--- /dev/null
+++ b/src/LM/langmodel.jl
@@ -0,0 +1,246 @@
+abstract type Langmodel end
+abstract type gammamodel <: Langmodel end                 # base ngram model with add-one smoothing
+abstract type InterpolatedLanguageModel <: Langmodel end  # interpolated language model with smoothing
+
+# DataType MLE
+# Type for providing MLE ngram model scores.
+# Implementation of the base ngram model.
+
+struct MLE <: Langmodel
+    vocab::Vocabulary
+end
+
+"""
+    MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate Type for providing MLE ngram model scores.
+
+Implementation of the base ngram model.
+"""
+function MLE(word, unk_cutoff=1, unk_label="<unk>")
+    MLE(Vocabulary(word, unk_cutoff, unk_label))
+end
+
+function (lm::MLE)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+struct Lidstone <: gammamodel
+    vocab::Vocabulary
+    gamma::Float64
+end
+
+"""
+    Lidstone(word::Vector{T}, gamma::Float64, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate the `Lidstone` type, which provides Lidstone-smoothed scores.
+
+In addition to the initialization arguments of the base ngram model, it also requires
+a number by which to increase the counts, `gamma`.
+"""
+function Lidstone(word, gamma=1.0, unk_cutoff=1, unk_label="<unk>")
+    Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma)
+end
+
+function (lm::Lidstone)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+"""
+    Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate the `Laplace` type, which provides Laplace-smoothed scores.
+
+In addition to the initialization arguments of the base ngram model, the count
+increment is fixed at `gamma = 1`.
+"""
+struct Laplace <: gammamodel
+    vocab::Vocabulary
+    gamma::Float64
+end
+
+function Laplace(word, unk_cutoff=1, unk_label="<unk>")
+    Laplace(Vocabulary(word, unk_cutoff, unk_label), 1.0)
+end
+
+function (lm::Laplace)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+"""
+    score(m::gammamodel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString)
+
+Output the probability of a word given its context.
+
+Applies add-gamma smoothing for Lidstone and Laplace (`gammamodel`) models.
+"""
+function score(m::gammamodel, temp_lm::DefaultDict, word, context)
+    accum = temp_lm[context]
+    s = float(sum(accum) + (m.gamma) * length(m.vocab.vocab))
+    for (text, count) in accum
+        if text == word
+            return float(count + m.gamma) / s
+        end
+    end
+    return float(m.gamma) / s
+end
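+
+# For gammamodel types the smoothed estimate is
+#     P(word | context) = (count(context, word) + gamma) / (sum(counts(context)) + gamma * V)
+# where V is the vocabulary size. For example, with gamma = 1, a context seen twice, a
+# word never seen after that context, and V = 11, the score is (0 + 1) / (2 + 1 * 11) = 1/13.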
+
+"""
+    prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)
+
+Get the probability of a word given its context.
+
+In other words, for the given context, compute the relative frequency of the word.
+"""
+function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)
+    if context == nothing || context == ""
+        return 1 / float(length(templ_lm)) # provide uniform distribution over contexts
+    else
+        accum = templ_lm[context]
+    end
+    s = float(sum(accum))
+    for (text, count) in accum
+        if text == word
+            return float(count) / s
+        end
+    end
+    if context in keys(m.vocab.vocab)
+        return 0
+    end
+    return Inf
+end
+
+"""
+    score(m::MLE, temp_lm::DefaultDict, word::AbstractString, context::AbstractString)
+
+Output the probability of a word given its context for an MLE model.
+"""
+function score(m::MLE, temp_lm::DefaultDict, word, context=nothing)
+    prob(m, temp_lm, word, context)
+end
+
+struct WittenBellInterpolated <: InterpolatedLanguageModel
+    vocab::Vocabulary
+end
+
+"""
+    WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate Type for providing an interpolated version of Witten-Bell smoothing.
+
+The idea to abstract this comes from Chen & Goodman 1995.
+"""
+function WittenBellInterpolated(word, unk_cutoff=1, unk_label="<unk>")
+    WittenBellInterpolated(Vocabulary(word, unk_cutoff, unk_label))
+end
+
+function (lm::WittenBellInterpolated)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+# alpha_gamma function for WittenBellInterpolated
+function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
+    local alpha
+    local gam
+    accum = templ_lm[context]
+    s = float(sum(accum))
+    for (text, count) in accum
+        if text == word
+            alpha = float(count) / s
+            break
+        else
+            alpha = 1 / s
+        end
+    end
+
+    gam = gamma(accum)
+    return alpha * (1 - gam), gam
+end
+
+function count_non_zero_vals(accum::Accumulator)
+    return length(accum)
+end
+
+function gamma(accum)
+    nplus = count_non_zero_vals(accum)
+    return nplus / (nplus + float(sum(accum)))
+end
+
+"""
+    score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word::AbstractString, context::AbstractString)
+
+Output the probability of a word given its context for an interpolated language model.
+
+Applies Kneser-Ney or Witten-Bell smoothing depending on the concrete subtype.
+"""
+function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context=nothing)
+    if context == nothing || context == ""
+        return prob(m, temp_lm, word, context)
+    end
+    if context in keys(temp_lm)
+        alpha, gamma = alpha_gammma(m, temp_lm, word, context)
+        return alpha + gamma * score(m, temp_lm, word, context_reduce(context))
+    else
+        return score(m, temp_lm, word, context_reduce(context))
+    end
+end
+
+function context_reduce(context)
+    context = split(context)
+    join(context[2:end], " ")
+end
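+
+# The interpolated score backs off by dropping the leftmost context word on each
+# recursive call, e.g. context_reduce("to be or") == "be or", until the context is
+# empty and `prob` falls back to a uniform distribution over the stored contexts.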
+
+struct KneserNeyInterpolated <: InterpolatedLanguageModel
+    vocab::Vocabulary
+    discount::Float64
+end
+
+"""
+    KneserNeyInterpolated(word::Vector{T}, discount::Float64=0.1, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+
+Initiate Type for providing a Kneser-Ney interpolated language model.
+
+The idea to abstract this comes from Chen & Goodman 1995.
+"""
+function KneserNeyInterpolated(word, disc=0.1, unk_cutoff=1, unk_label="<unk>")
+    KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), disc)
+end
+
+function (lm::KneserNeyInterpolated)(text, min::Integer, max::Integer)
+    text = lookup(lm.vocab, text)
+    text = convert(Array{String}, text)
+    return counter2(text, min, max)
+end
+
+# alpha_gamma function for KneserNeyInterpolated
+function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
+    local alpha
+    local gamma
+    accum = templ_lm[context]
+    s = float(sum(accum))
+    for (text, count) in accum
+        if text == word
+            alpha = max(float(count) - m.discount, 0.0) / s
+            break
+        else
+            alpha = 1 / length(m.vocab.vocab)
+        end
+    end
+    gamma = m.discount * count_non_zero_vals(accum) / s
+    return alpha, gamma
+end
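+
+# In the Kneser-Ney case alpha is the discounted relative frequency,
+# max(count - discount, 0) / total, and gamma is the mass set aside for backoff,
+# discount * (number of distinct continuations of the context) / total, which the
+# interpolated score then multiplies into the lower-order estimate.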
diff --git a/src/LM/preprocessing.jl b/src/LM/preprocessing.jl
new file mode 100644
index 00000000..e240e53f
--- /dev/null
+++ b/src/LM/preprocessing.jl
@@ -0,0 +1,101 @@
+"""
+    everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
+
+Return all possible ngrams generated from a sequence of items, as an Array{String,1}.
+
+# Example
+
+```julia-repl
+julia> seq = ["To","be","or","not"]
+julia> a = everygram(seq,min_len=1, max_len=-1)
+10-element Array{Any,1}:
+ "To"
+ "be"
+ "or"
+ "not"
+ "To be"
+ "be or"
+ "or not"
+ "To be or"
+ "be or not"
+ "To be or not"
+```
+"""
+function everygram(seq::Vector{T}; min_len::Int=1, max_len::Int=-1) where { T <: AbstractString}
+    ngram = []
+    if max_len == -1
+        max_len = length(seq)
+    end
+    for n in range(min_len, stop=max_len)
+        temp = ngramizenew(seq, n)
+        ngram = append!(ngram, temp)
+    end
+    return ngram
+end
+
+"""
+    padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol="</s>") where { T <: AbstractString}
+
+padding_ngram is used to pad both the left and right of a sentence and to output ngrams of order n.
+
+It also pads the original input Array of strings in place.
+
+# Example
+```julia-repl
+julia> example = ["1","2","3","4","5"]
+
+julia> padding_ngram(example,2,pad_left=true,pad_right=true)
+6-element Array{Any,1}:
+ "<s> 1"
+ "1 2"
+ "2 3"
+ "3 4"
+ "4 5"
+ "5 </s>"
+```
+"""
+function padding_ngram(word::Vector{T}, n=1; pad_left=false, pad_right=false, left_pad_symbol="<s>", right_pad_symbol="</s>") where { T <: AbstractString}
+    local seq
+    seq = word
+    if pad_left == true
+        prepend!(seq, [left_pad_symbol])
+    end
+    if pad_right == true
+        push!(seq, right_pad_symbol)
+    end
+    return ngramizenew(seq, n)
+end
+
+"""
+    ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
+
+ngramizenew is used to generate the ngrams of the orders given in `nlist`.
+
+# Example
+```julia-repl
+julia> seq = ["To","be","or","not","To","not","To","not"]
+julia> ngramizenew(seq, 2)
+7-element Array{Any,1}:
+ "To be"
+ "be or"
+ "or not"
+ "not To"
+ "To not"
+ "not To"
+ "To not"
+```
+"""
+function ngramizenew(words::Vector{T}, nlist::Integer...) where { T <: AbstractString}
+    n_words = length(words)
+
+    tokens = []
+
+    for n in nlist
+        for index in 1:(n_words - n + 1)
+            token = join(words[index:(index + n - 1)], " ")
+            push!(tokens, token)
+        end
+    end
+    return tokens
+end
diff --git a/src/LM/vocab.jl b/src/LM/vocab.jl
new file mode 100644
index 00000000..949a469d
--- /dev/null
+++ b/src/LM/vocab.jl
@@ -0,0 +1,115 @@
+"""
+    Vocabulary(word, unk_cutoff=1, unk_label="<unk>")
+
+Stores the language model vocabulary.
+
+Satisfies two common language modeling requirements for a vocabulary:
+- When checking membership and calculating its size, it filters items
+  by comparing their counts to a cutoff value.
+- It adds a special "unknown" token which unseen words are mapped to.
+
+# Example
+```julia-repl
+julia> words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"]
+julia> vocabulary = Vocabulary(words, 2)
+Vocabulary(Dict("<unk>"=>1,"c"=>3,"a"=>3,"d"=>2), 2, "<unk>")
+
+julia> vocabulary.vocab
+Dict{String,Int64} with 4 entries:
+ "<unk>" => 1
+ "c"     => 3
+ "a"     => 3
+ "d"     => 2
+
+Tokens with counts greater than or equal to the cutoff value will
+be considered part of the vocabulary.
+julia> vocabulary.vocab["c"]
+3
+
+julia> "c" in keys(vocabulary.vocab)
+true
+
+julia> vocabulary.vocab["d"]
+2
+
+julia> "d" in keys(vocabulary.vocab)
+true
+
+Tokens with counts less than the cutoff value are not considered part of the
+vocabulary and are removed from the count dictionary (they are still kept in `allword`).
+julia> "b" in keys(vocabulary.vocab)
+false
+
+julia> "<unk>" in keys(vocabulary.vocab)
+true
+
+We can look up words in a vocabulary using the `lookup` function.
+"Unseen" words (with counts less than the cutoff) are looked up as the unknown label.
+If given one word (a string) as input, this method will return a string.
+julia> lookup(vocabulary, "a")
+"a"
+
+julia> word = ["a", "-", "d", "c", "a"]
+
+julia> lookup(vocabulary, word)
+5-element Array{Any,1}:
+ "a"
+ "<unk>"
+ "d"
+ "c"
+ "a"
+
+If given a sequence, it returns an Array{Any,1} of the looked-up words, as shown above.
+
+It is possible to update the counts after the vocabulary has been created.
+julia> update(vocabulary, ["b","c","c"])
+1
+
+julia> vocabulary.vocab["b"]
+1
+```
+"""
+mutable struct Vocabulary
+    vocab::Dict{String, Int64}
+    unk_cutoff::Int
+    unk_label::String
+    allword::Array{String, 1}
+end
+
+function Vocabulary(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where { T <: AbstractString}
+    if unk_label in word
+        error("unk_label is in vocab")
+    else
+        word = push!(word, unk_label)
+    end
+    vocab = countmap(word)
+    # tokens below the cutoff are dropped from `vocab` but kept in `allword`
+    for value in vocab
+        if value[2] < unk_cutoff && value[1] != unk_label
+            delete!(vocab, value[1])
+        end
+    end
+    Vocabulary(vocab, unk_cutoff, unk_label, word)
+end
+
+function lookup(voc::Vocabulary, word::AbstractString)
+    # single words below the cutoff are mapped to the unknown label
+    return word in keys(voc.vocab) ? word : voc.unk_label
+end
+
+function lookup(voc::Vocabulary, word::Vector{T}) where { T <: AbstractString}
+    # a sequence is looked up element-wise
+    look_up = Any[]
+    for w in word
+        push!(look_up, lookup(voc, w))
+    end
+    return look_up
+end
diff --git a/test/LM.jl b/test/LM.jl
new file mode 100644
--- /dev/null
+++ b/test/LM.jl
+using DataStructures
+
+@testset "Vocabulary" begin
+    words = ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d"]
+    vocab = Vocabulary(words, 2, "<unk>")
+    @test vocab isa Vocabulary
+    @test vocab.vocab isa Dict
+    @test vocab.unk_cutoff isa Int
+    @test vocab.unk_label isa String
+    @test vocab.allword isa Array{String,1}
+    @test length(vocab.vocab) == 4 # only 4 distinct tokens remain after applying the cutoff of 2
+    @test isequal(vocab.unk_cutoff, 2)
+    @test vocab.unk_label == "<unk>"
+    @test isequal(vocab.allword, ["a", "c", "-", "d", "c", "a", "b", "r", "a", "c", "d", "<unk>"])
+    @test isequal(vocab.vocab, Dict{String,Int}("<unk>"=>1, "c"=>3, "a"=>3, "d"=>2))
+    # to check the lookup function
+    @test lookup(vocab, ["a", "b", "c", "alien"]) == ["a", "<unk>", "c", "<unk>"]
+    word_set = ["<unk>", "is", "already", "there"]
+    @test_throws ErrorException Vocabulary(word_set, 1, "<unk>")
+end
+
+@testset "preprocessing" begin
+    @testset "ngramizenew" begin
+        sample_text = ["this", "is", "some", "sample", "text"]
+        ngrams = TextAnalysis.ngramizenew(sample_text, 1)
+
+        @test isequal(ngrams, ["this", "is", "some", "sample", "text"])
+
+        ngrams = TextAnalysis.ngramizenew(sample_text, 2)
+        @test isequal(ngrams, ["this is", "is some", "some sample", "sample text"])
+
+        ngrams = TextAnalysis.ngramizenew(sample_text, 1, 2)
+        @test isequal(ngrams, ["this", "is", "some", "sample", "text", "this is", "is some", "some sample", "sample text"])
+    end
+
+    @testset "Padding function" begin
+        example = ["1", "2", "3", "4", "5"]
+        padded = padding_ngram(example, 2, pad_left=true, pad_right=true)
+        @test isequal(padded, ["<s> 1", "1 2", "2 3", "3 4", "4 5", "5 </s>"])
+        @test isequal(example, ["<s>", "1", "2", "3", "4", "5", "</s>"])
+
+        example = ["1", "2", "3", "4", "5"] # padding mutates the input, so it is redefined here
+        padded = padding_ngram(example, 2, pad_right=true)
+        @test isequal(padded, ["1 2", "2 3", "3 4", "4 5", "5 </s>"])
+    end
+    @testset "everygram function" begin
+        example = ["1", "2", "3", "4", "5"]
+        everyngms = everygram(example, min_len=1, max_len=2)
+        @test isequal(everyngms, ["1", "2", "3", "4", "5", "1 2", "2 3", "3 4", "4 5"])
+    end
+end
+
+@testset "counter" begin
+    exam = ["To", "be", "or", "not", "to", "be", "To", "be", "or", "not", "to", "be"]
+    fit = (TextAnalysis.counter2(exam, 2, 2))
+    @test fit isa DataStructures.DefaultDict
+    @test length(fit) == 5 # number of unique bigram contexts
+end
+
+@testset "language model" begin
+
+    @testset "MLE" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model = MLE(voc)
+        fit = model(train, 2, 2) # considering only bigrams
+        unmaskedscore = score(model, fit, "is", "<unk>")
+        @test unmaskedscore == 0.3333333333333333
+        @test score(model, fit, "is", "alien") == Inf # context not in vocabulary
+        @test score(model, fit, "alien", "is") == 0   # word not in vocabulary
+    end
+
+    @testset "Lidstone" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model2 = Lidstone(voc, 1.0)
+        fit = model2(train, 2, 2)
+        @test score(model2, fit, "is", "alien") == 0.1
+        @test score(model2, fit, "alien", "is") >= 0
+    end
+
+    @testset "Laplace" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model3 = Laplace(voc)
+        fit2 = model3(train, 2, 2)
+        @test score(model3, fit2, "is", "alien") == 0.1
+    end
+
+    @testset "WittenBellInterpolated" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model = WittenBellInterpolated(voc)
+        fit = model(train, 2, 2)
+        @test score(model, fit, "is", "alien") == 0.2
+        @test score(model, fit, "alien", "is") == 0.4
+        @test score(model, fit, "alien") == 0.2 # should be non-zero
+    end
+
+    @testset "KneserNeyInterpolated" begin
+        voc = ["my", "name", "is", "salman", "khan", "and", "he", "is", "shahrukh", "Khan"]
+        train = ["khan", "is", "my", "good", "friend", "and", "He", "is", "my", "brother"]
+        model = KneserNeyInterpolated(voc, 0.1)
+        fit = model(train, 2, 2)
+        @test score(model, fit, "is", "alie") == 0.2
+        @test score(model, fit, "alien", "is") == 0.11000000000000001
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 59b3a4dd..867b284d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -28,3 +28,4 @@ include("taggingschemes.jl")
 include("averagePerceptronTagger.jl")
 include("evaluation_metrics.jl")
 include("ulmfit.jl")
+include("LM.jl")