BM25, Co-occurrence Matrix, faster ROUGE, Fixing LSA. #165

Merged
merged 18 commits into from
Aug 10, 2019
1 change: 1 addition & 0 deletions docs/make.jl
@@ -16,6 +16,7 @@ makedocs(
"Semantic Analysis" => "semantic.md",
"Classifier" => "classify.md",
"Extended Example" => "example.md",
"Evaluation Metrics" => "evaluation_metrics.md",
"Conditional Random Fields" => "crf.md"
],
)
37 changes: 37 additions & 0 deletions docs/src/evaluation_metrics.md
@@ -0,0 +1,37 @@
## Evaluation Metrics

Natural Language Processing tasks require evaluation metrics to judge the quality of their output.
As of now, TextAnalysis provides the following evaluation metrics:

* [ROUGE-N](https://en.wikipedia.org/wiki/ROUGE_(metric))
* [ROUGE-L](https://en.wikipedia.org/wiki/ROUGE_(metric))

## ROUGE-N
This metric evaluates the overlap of N-grams
between the system and reference summaries.

rouge_n(references, candidate, n; avg, lang)

The function takes the following arguments -

* `references::Array{T} where T<: AbstractString` = The list of reference summaries.
* `candidate::AbstractString` = Input candidate summary, to be scored against reference summaries.
* `n::Integer` = Order of the N-grams
* `avg::Bool` = Setting this parameter to `true` applies jackknifing to the calculated scores (a sketch of this averaging appears after this list). Defaults to `true`
* `lang::Language` = Language of the text, useful while generating N-grams. Defaults to English, i.e. `Languages.English()`
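When `avg` is `true`, the per-reference scores are combined by jackknifing. Below is a minimal sketch of that averaging step; it assumes the standard ROUGE jackknife procedure (for every leave-one-out subset of the references keep the best score, then average over the subsets) and is an illustration only, not the internal `rouge_n` code. The example after it calls `rouge_n` itself.

```julia
using Statistics: mean

# Sketch of jackknife averaging over per-reference scores: drop one reference
# at a time, keep the best remaining score, and average the results.
function jackknife_average(scores::Vector{Float64})
    k = length(scores)
    k == 1 && return first(scores)
    mean(maximum(scores[setdiff(1:k, i)]) for i in 1:k)
end

jackknife_average([0.4, 0.6, 0.5])   # (0.6 + 0.5 + 0.6) / 3 ≈ 0.5667
```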

```julia
julia> candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
"Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."

julia> reference_summaries = ["Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."]
2-element Array{String,1}:
"Brazil, Russia, India and China are the next big poltical powers in the global economy. Together referred to as BRIC(S) along with South Korea."
"Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."

julia> rouge_n(reference_summaries, candidate_summary, 2, avg=true)
0.1317241379310345

julia> rouge_n(reference_summaries, candidate_summary, 1, avg=true)
0.5051282051282051
```
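To make the N-gram overlap concrete, here is a tiny hand-rolled sketch of the quantity ROUGE-2 is built on (bigram recall against a single reference). It deliberately ignores details such as repeated N-grams, multiple references and jackknifing, which `rouge_n` handles for you.

```julia
# Treat the N-grams (here bigrams) of each text as a set and measure how much
# of the reference is recovered by the candidate.
bigrams(s) = Set(zip(split(s)[1:end-1], split(s)[2:end]))

reference = "the cat sat on the mat"
candidate = "the cat sat on a mat"

overlap = length(intersect(bigrams(candidate), bigrams(reference)))
recall  = overlap / length(bigrams(reference))   # 3 / 5 = 0.6
```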
142 changes: 141 additions & 1 deletion docs/src/features.md
@@ -123,7 +123,7 @@ by each term. This can be done by finding the term frequency function

tf(dtm)

The paramter, `dtm` can be of the types - `DocumentTermMatrix` , `SparseMatrixCSC` or `Matrix`
The parameter, `dtm` can be of the types - `DocumentTermMatrix` , `SparseMatrixCSC` or `Matrix`

```julia
julia> crps = Corpus([StringDocument("To be or not to be"),
@@ -195,6 +195,146 @@ As you can see, TF-IDF has the effect of inserting 0's into the columns of
words that occur in all documents. This is a useful way to avoid having to
remove those words during preprocessing.

## Okapi BM-25

From the document-term matrix, the [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) document-word statistic can be created.

bm_25(dtm::AbstractMatrix; κ, β)
bm_25(dtm::DocumentTermMatrix, κ, β)

It can also be used via the following method, which overwrites `bm25` with the calculated weights.

bm_25!(dtm, bm25, κ, β)

The input matrices can also be sparse (`SparseMatrixCSC`).
The parameters κ and β default to 2 and 0.75 respectively.
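As a point of reference, here is a minimal sketch of the textbook Okapi BM25 weight for a single term in a single document, written in terms of the κ and β parameters above. It only illustrates the formula; the exact IDF variant and scaling used internally by `bm_25` may differ.

```julia
# tf: term frequency in the document, doclen: document length,
# avgdl: average document length in the corpus, ndocs: number of documents,
# ndocs_with_term: number of documents containing the term.
function bm25_weight(tf, doclen, avgdl, ndocs, ndocs_with_term; κ = 2, β = 0.75)
    idf = log((ndocs - ndocs_with_term + 0.5) / (ndocs_with_term + 0.5) + 1)
    idf * tf * (κ + 1) / (tf + κ * (1 - β + β * doclen / avgdl))
end

bm25_weight(3, 6, 5.25, 4, 2)   # weight of a term occurring 3 times in a 6-word document
```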

Here is an example usage -

```julia
julia> crps = Corpus([StringDocument("a a a sample text text"), StringDocument("another example example text text"), StringDocument(""), StringDocument("another another text text text text")])

julia> update_lexicon!(crps)

julia> m = DocumentTermMatrix(crps)

julia> bm_25(m)
4×5 SparseArrays.SparseMatrixCSC{Float64,Int64} with 8 stored entries:
[1, 1] = 1.29959
[2, 2] = 0.882404
[4, 2] = 1.40179
[2, 3] = 1.54025
[1, 4] = 1.89031
[1, 5] = 0.405067
[2, 5] = 0.405067
[4, 5] = 0.676646
```

## Co-occurrence matrix (COOM)

The elements of the co-occurrence matrix indicate how many times two words co-occur
in a (sliding) word window of a given size.
The COOM can be calculated for objects of type `Corpus` and
`AbstractDocument` (with the exception of `NGramDocument`).

CooMatrix(crps; window, normalize)
CooMatrix(doc; window, normalize)

It takes the following keyword arguments:

* `window::Integer` - the extent of the sliding window; defaults to `5`. The actual size of the sliding window is `2 * window + 1`, with the keyword argument `window` specifying how many words to consider to the left and right of the center one
* `normalize::Bool` - normalizes counts by the distance between words; defaults to `true`

It returns a `CooMatrix` structure, from which
the matrix can be extracted using `coom(::CooMatrix)`.
The terms can also be accessed via its `terms` field.
Here is an example usage -

```julia

julia> crps = Corpus([StringDocument("this is a string document"), TokenDocument("this is a token document")])

julia> C = CooMatrix(crps, window=1, normalize=false)
CooMatrix{Float64}(
[2, 1] = 2.0
[6, 1] = 2.0
[1, 2] = 2.0
[3, 2] = 2.0
[2, 3] = 2.0
[6, 3] = 2.0
[5, 4] = 4.0
[4, 5] = 4.0
[6, 5] = 4.0
[1, 6] = 2.0
[3, 6] = 2.0
[5, 6] = 4.0, ["string", "document", "token", "this", "is", "a"], OrderedDict("string"=>1,"document"=>2,"token"=>3,"this"=>4,"is"=>5,"a"=>6))

julia> coom(C)
6×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 12 stored entries:
[2, 1] = 2.0
[6, 1] = 2.0
[1, 2] = 2.0
[3, 2] = 2.0
[2, 3] = 2.0
[6, 3] = 2.0
[5, 4] = 4.0
[4, 5] = 4.0
[6, 5] = 4.0
[1, 6] = 2.0
[3, 6] = 2.0
[5, 6] = 4.0

julia> C.terms
6-element Array{String,1}:
"string"
"document"
"token"
"this"
"is"
"a"

```

It can also be called to calculate the co-occurrence elements
for a specific list of words / terms in the document.
Otherwise, it calculates them for all the terms.

CooMatrix(crps, terms; window, normalize)
CooMatrix(doc, terms; window, normalize)

```julia
julia> C = CooMatrix(crps, ["this", "is", "a"], window=1, normalize=false)
CooMatrix{Float64}(
[2, 1] = 4.0
[1, 2] = 4.0
[3, 2] = 4.0
[2, 3] = 4.0, ["this", "is", "a"], OrderedCollections.OrderedDict("this"=>1,"is"=>2,"a"=>3))

```

The element type of the `CooMatrix` weights can also be specified as `T`,
which defaults to `Float64`.

CooMatrix{T}(crps; window, normalize) where T <: AbstractFloat
CooMatrix{T}(doc; window, normalize) where T <: AbstractFloat
CooMatrix{T}(crps, terms; window, normalize) where T <: AbstractFloat
CooMatrix{T}(doc, terms; window, normalize) where T <: AbstractFloat

Remarks:

* The sliding window used to count co-occurrences does not take sentence stops into consideration; it does, however, respect document boundaries, i.e. it does not span across documents (a hand-rolled sketch of this counting rule follows after this list)
* The co-occurrence matrices of the documents in a corpus are summed up when calculating the matrix for an entire corpus
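The raw (`normalize = false`) counting rule from the first remark can be sketched by hand for a single document. This is only an illustration of the idea; the actual `CooMatrix` additionally applies distance normalization when `normalize = true` and sums the per-document matrices over a corpus.

```julia
# Count, symmetrically, every pair of tokens that falls inside the sliding
# window around each position (window = number of neighbours on each side).
function raw_coom(tokens::Vector{String}, vocab::Vector{String}; window::Int = 1)
    idx = Dict(t => i for (i, t) in enumerate(vocab))
    counts = zeros(Int, length(vocab), length(vocab))
    for i in eachindex(tokens), j in max(1, i - window):min(length(tokens), i + window)
        i == j && continue
        a, b = get(idx, tokens[i], 0), get(idx, tokens[j], 0)
        (a == 0 || b == 0) && continue
        counts[a, b] += 1
    end
    counts
end

raw_coom(["this", "is", "a", "string", "document"],
         ["this", "is", "a", "string", "document"]; window = 1)
```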

!!! note
The co-occurrence matrix does not work for `NGramDocument`,
or for a `Corpus` containing an `NGramDocument`.

```julia
julia> C = CooMatrix(NGramDocument("A document"), window=1, normalize=false) # fails, the document is an NGramDocument
ERROR: The tokens of an NGramDocument cannot be reconstructed
```

## Sentiment Analyzer

It can be used to find the sentiment score (between 0 and 1) of a word, sentence or a Document.
30 changes: 27 additions & 3 deletions docs/src/semantic.md
@@ -1,10 +1,34 @@
## LSA: Latent Semantic Analysis

Often we want to think about documents from the perspective of semantic
content. One standard approach to doing this is to perform Latent Semantic
Analysis or LSA on the corpus. You can do this using the `lsa` function:
Often we want to think about documents
from the perspective of semantic content.
One standard approach to doing this
is to perform Latent Semantic Analysis or LSA on the corpus.

lsa(crps)
lsa(dtm)

`lsa` uses `tf_idf` for statistics.

```julia
julia> crps = Corpus([StringDocument("this is a string document"), TokenDocument("this is a token document")])

julia> F1 = lsa(crps)
LinearAlgebra.SVD{Float64,Float64,Array{Float64,2}}([1.0 0.0; 0.0 1.0], [0.138629, 0.138629], [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 1.0])
```

`lsa` can also be performed on a `DocumentTermMatrix`.

```julia
julia> update_lexicon!(crps)

julia> m = DocumentTermMatrix(crps)
A 2 X 6 DocumentTermMatrix

julia> F2 = lsa(m)
SVD{Float64,Float64,Array{Float64,2}}([1.0 0.0; 0.0 1.0], [0.138629, 0.138629], [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 1.0])
```
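Since the text above says `lsa` uses `tf_idf` for its statistics, the decomposition can also be reproduced by hand as an SVD of the tf-idf weighted matrix. This is a sketch under that assumption; the weighting details inside `lsa` itself may differ slightly.

```julia
julia> using LinearAlgebra

julia> F3 = svd(Matrix(tf_idf(m)))   # dense SVD of the tf-idf weighted DocumentTermMatrix

julia> F3.S                          # singular values, cf. [0.138629, 0.138629] above
```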


## LDA: Latent Dirichlet Allocation

12 changes: 8 additions & 4 deletions src/TextAnalysis.jl
@@ -6,6 +6,8 @@ module TextAnalysis
using Languages
using DataFrames
using WordTokenizers
using DataStructures
using Statistics

using Flux
using Flux: identity, onehot, onecold, @treelike
@@ -44,17 +46,18 @@ module TextAnalysis
export dtv, each_dtv, dtm, tdm
export TextHashFunction, index_hash, cardinality, hash_function, hash_function!
export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm
export CooMatrix, coom
export standardize!
export tf, tf_idf, lsa, lda, summarize
export tf!, tf_idf!, lsa!, lda!
export tf, tf_idf, bm_25, lsa, lda, summarize
export tf!, tf_idf!, bm_25!, lda!
export remove_patterns!, remove_patterns

export strip_patterns, strip_corrupt_utf8, strip_case, stem_words, tag_part_of_speech, strip_whitespace, strip_punctuation
export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags

export SentimentAnalyzer
export tag_scheme!
export jackknife_avg, listify_ngrams, weighted_lcs, fmeasure_lcs
export rouge_l_summary, rouge_l_sentence, rouge_n
export PerceptronTagger, fit!, predict

@@ -86,9 +89,10 @@ module TextAnalysis
include("deprecations.jl")
include("tagging_schemes.jl")
include("utils.jl")
include("rouge.jl")
include("averagePerceptronTagger.jl")

include("evaluation_metrics.jl")
include("coom.jl")
# CRF
include("CRF/crf.jl")
include("CRF/predict.jl")