Merge pull request #150 from Ayushk4/offline_docs
Add offline Documentation (Docstrings) to the codebase
aviks authored May 23, 2019
2 parents e52fe67 + ad7e895 commit e1f3b05
Showing 9 changed files with 706 additions and 140 deletions.
2 changes: 2 additions & 0 deletions docs/src/documents.md
@@ -266,6 +266,7 @@ julia> remove_words!(sd, ["lear"])
julia> sd
StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
```

At other times, you'll want to remove whole classes of words. To make this
easier, we can use several classes of basic words defined by the Languages.jl
package:
@@ -294,6 +295,7 @@ These special classes can all be removed using specially-named parameters:
These functions use word lists, so they work for many different languages
without change. These operations can also be combined for improved
performance:

* `prepare!(sd, strip_articles | strip_numbers | strip_html_tags)`

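The strip flags are bit masks, which is why several of them can be OR-ed together into a single `prepare!` call. A minimal sketch of how such flag composition works (the values below are hypothetical stand-ins, not the actual TextAnalysis constants):

```julia
# Hypothetical flag values, for illustration only; TextAnalysis defines its own.
strip_articles  = UInt32(1) << 1
strip_numbers   = UInt32(1) << 2
strip_html_tags = UInt32(1) << 3

# Combine several preprocessing passes into one flags value.
flags = strip_articles | strip_numbers | strip_html_tags

# Each pass can then be enabled or skipped with a single bit test.
numbers_enabled = (flags & strip_numbers) != 0
```

A single combined call lets the implementation make one pass over the text instead of one pass per flag.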
In addition to removing words, it is also common to take words that are
67 changes: 65 additions & 2 deletions docs/src/features.md
@@ -126,17 +126,80 @@ julia> hash_dtv(crps[1])
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
```

## TF (Term Frequency)

Often we need to find out what proportion of a document is contributed
by each term. This can be done with the term frequency function

    tf(dtm)

The parameter `dtm` can be of type `DocumentTermMatrix`, `SparseMatrixCSC`, or `Matrix`.

```julia
julia> crps = Corpus([StringDocument("To be or not to be"),
StringDocument("To become or not to become")])

julia> update_lexicon!(crps)

julia> m = DocumentTermMatrix(crps)

julia> tf(m)
2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
[1, 1] = 0.166667
[2, 1] = 0.166667
[1, 2] = 0.333333
[2, 3] = 0.333333
[1, 4] = 0.166667
[2, 4] = 0.166667
[1, 5] = 0.166667
[2, 5] = 0.166667
[1, 6] = 0.166667
[2, 6] = 0.166667
```
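The values above can be reproduced by hand: term frequency divides each count by the length of its document (the row sum). A minimal plain-Julia sketch, assuming the same 2×6 count matrix as the example (an illustration, not TextAnalysis's actual implementation):

```julia
# Term-count matrix for the corpus above: rows are the two documents,
# columns the terms ["To", "be", "become", "not", "or", "to"].
counts = [1 2 0 1 1 1;
          1 0 2 1 1 1]

# tf divides each count by its document's length (the row sum),
# so each row of the result sums to 1.
tf_manual = counts ./ sum(counts, dims=2)
```

`tf_manual[1, 2]` is `2/6 ≈ 0.333333`, matching the `[1, 2]` entry shown above.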

## TF-IDF (Term Frequency - Inverse Document Frequency)

tf_idf(dtm)

In many cases, raw word counts are not appropriate for use because:

* (A) Some documents are longer than other documents
* (B) Some words are more frequent than other words

You can work around this by performing TF-IDF on a DocumentTermMatrix:

    m = DocumentTermMatrix(crps)
    tf_idf(m)

```julia
julia> crps = Corpus([StringDocument("To be or not to be"),
StringDocument("To become or not to become")])

julia> update_lexicon!(crps)

julia> m = DocumentTermMatrix(crps)
DocumentTermMatrix(
[1, 1] = 1
[2, 1] = 1
[1, 2] = 2
[2, 3] = 2
[1, 4] = 1
[2, 4] = 1
[1, 5] = 1
[2, 5] = 1
[1, 6] = 1
[2, 6] = 1, ["To", "be", "become", "not", "or", "to"], Dict("or"=>5,"not"=>4,"to"=>6,"To"=>1,"be"=>2,"become"=>3))

julia> tf_idf(m)
2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
[1, 1] = 0.0
[2, 1] = 0.0
[1, 2] = 0.231049
[2, 3] = 0.231049
[1, 4] = 0.0
[2, 4] = 0.0
[1, 5] = 0.0
[2, 5] = 0.0
[1, 6] = 0.0
[2, 6] = 0.0
```
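The zeros and the `0.231049` entries above follow from the common `tf * log(N / df)` formulation; a plain-Julia sketch of that formula (an illustration under this assumption, not TextAnalysis's actual implementation):

```julia
# Same 2×6 count matrix as the example above.
counts = [1 2 0 1 1 1;
          1 0 2 1 1 1]

# Term frequency: counts normalized by document length.
tf = counts ./ sum(counts, dims=2)

# Document frequency: in how many documents each term appears.
df = vec(sum(counts .> 0, dims=1))

# Inverse document frequency: log(N / df); terms in every document get 0.
idf = log.(size(counts, 1) ./ df)

tfidf = tf .* idf'
```

Terms that occur in both documents get `idf = log(2/2) = 0`, which produces the zero columns, while `"be"` and `"become"` get `(2/6) * log(2) ≈ 0.231049`.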

As you can see, TF-IDF has the effect of inserting 0's into the columns of
words that occur in all documents. This is a useful way to avoid having to
150 changes: 138 additions & 12 deletions src/metadata.jl
@@ -1,69 +1,195 @@
##############################################################################
#
# Metadata field getters and setters
#
##############################################################################

import Languages.name

"""
title(doc)
Return the title metadata for `doc`.
See also: [`title!`](@ref), [`titles`](@ref), [`titles!`](@ref)
"""
title(d::AbstractDocument) = d.metadata.title

"""
language(doc)
Return the language metadata for `doc`.
See also: [`language!`](@ref), [`languages`](@ref), [`languages!`](@ref)
"""
language(d::AbstractDocument) = d.metadata.language

"""
author(doc)
Return the author metadata for `doc`.
See also: [`author!`](@ref), [`authors`](@ref), [`authors!`](@ref)
"""
author(d::AbstractDocument) = d.metadata.author

"""
timestamp(doc)
Return the timestamp metadata for `doc`.
See also: [`timestamp!`](@ref), [`timestamps`](@ref), [`timestamps!`](@ref)
"""
timestamp(d::AbstractDocument) = d.metadata.timestamp
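These getters all follow the same pattern: read one field of the document's metadata struct. A minimal self-contained sketch of that pattern (the `Meta` and `Doc` types here are hypothetical stand-ins, not TextAnalysis types):

```julia
# Hypothetical metadata holder; mutable so setters can update it in place.
mutable struct Meta
    title::String
    author::String
end

# Hypothetical document type wrapping its metadata.
struct Doc
    metadata::Meta
end

# Getters simply forward to the metadata fields.
title(d::Doc)  = d.metadata.title
author(d::Doc) = d.metadata.author

d = Doc(Meta("Untitled Document", "Unknown Author"))
```

With this layout, adding a new metadata field only requires one field plus one forwarding getter and setter.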


"""
title!(doc, str)
Set the title of `doc` to `str`.
See also: [`title`](@ref), [`titles`](@ref), [`titles!`](@ref)
"""
function title!(d::AbstractDocument, nv::AbstractString)
d.metadata.title = nv
end

"""
language!(doc, lang::Language)
Set the language of `doc` to `lang`.
# Example
```julia-repl
julia> d = StringDocument("String Document 1")
julia> language!(d, Languages.Spanish())
julia> d.metadata.language
Languages.Spanish()
```
See also: [`language`](@ref), [`languages`](@ref), [`languages!`](@ref)
"""
function language!(d::AbstractDocument, nv::Language)
d.metadata.language = nv
end

"""
author!(doc, author)
Set the author metadata of `doc` to `author`.
See also: [`author`](@ref), [`authors`](@ref), [`authors!`](@ref)
"""
function author!(d::AbstractDocument, nv::AbstractString)
d.metadata.author = nv
end

"""
timestamp!(doc, timestamp::AbstractString)
Set the timestamp metadata of `doc` to `timestamp`.
See also: [`timestamp`](@ref), [`timestamps`](@ref), [`timestamps!`](@ref)
"""
function timestamp!(d::AbstractDocument, nv::AbstractString)
d.metadata.timestamp = nv
end

##############################################################################
#
# Vectorized getters for an entire Corpus
#
##############################################################################

"""
titles(crps)
Return the titles for each document in `crps`.
See also: [`titles!`](@ref), [`title`](@ref), [`title!`](@ref)
"""
titles(c::Corpus) = map(d -> title(d), documents(c))

"""
languages(crps)
Return the languages for each document in `crps`.
See also: [`languages!`](@ref), [`language`](@ref), [`language!`](@ref)
"""
languages(c::Corpus) = map(d -> language(d), documents(c))

"""
authors(crps)
Return the authors for each document in `crps`.
See also: [`authors!`](@ref), [`author`](@ref), [`author!`](@ref)
"""
authors(c::Corpus) = map(d -> author(d), documents(c))

"""
timestamps(crps)
Return the timestamps for each document in `crps`.
See also: [`timestamps!`](@ref), [`timestamp`](@ref), [`timestamp!`](@ref)
"""
timestamps(c::Corpus) = map(d -> timestamp(d), documents(c))

titles!(c::Corpus, nv::AbstractString) = title!.(documents(c), nv)
languages!(c::Corpus, nv::T) where {T <: Language} = language!.(documents(c), Ref(nv)) #Ref to force scalar broadcast
authors!(c::Corpus, nv::AbstractString) = author!.(documents(c), Ref(nv))
timestamps!(c::Corpus, nv::AbstractString) = timestamp!.(documents(c), Ref(nv))
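The `Ref` in the setters above is what forces scalar broadcast: without it, broadcasting would try to iterate the value element by element. A minimal sketch with a hypothetical `Tag` type standing in for a `Language`:

```julia
# Hypothetical stand-in for a Language value; plain structs are not iterable,
# so broadcasting over one fails unless it is wrapped in Ref.
struct Tag
    name::String
end

label(s::String, t::Tag) = s * ":" * t.name

docs = ["a", "b"]

# Ref marks the Tag as a scalar, so every element of `docs` is paired with it.
tagged = label.(docs, Ref(Tag("en")))
```

Strings broadcast as scalars already, which is why `titles!` above works without `Ref`; arbitrary structs like `Language` values need the wrapper.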

"""
titles!(crps, vec::Vector{String})
titles!(crps, str)
Update titles of the documents in a Corpus.
If the input is a String, the same title is set for all documents. If the input is a vector, the title of the `i`th document is set to the `i`th element of `vec`; in that case the number of documents must equal the length of the vector.
See also: [`titles`](@ref), [`title!`](@ref), [`title`](@ref)
"""
function titles!(c::Corpus, nvs::Vector{String})
length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
for (i, d) in pairs(IndexLinear(), documents(c))
title!(d, nvs[i])
end
end

"""
languages!(crps, langs::Vector{Language})
languages!(crps, lang::Language)
Update languages of documents in a Corpus.
If the input is a vector, the language of the `i`th document is set to the `i`th element of the vector; in that case the number of documents must equal the length of the vector.
See also: [`languages`](@ref), [`language!`](@ref), [`language`](@ref)
"""
function languages!(c::Corpus, nvs::Vector{T}) where T <: Language
length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
for (i, d) in pairs(IndexLinear(), documents(c))
language!(d, nvs[i])
end
end

"""
authors!(crps, athrs)
authors!(crps, athr)
Set the authors of the documents in `crps` to the `athrs`, respectively.
See also: [`authors`](@ref), [`author!`](@ref), [`author`](@ref)
"""
function authors!(c::Corpus, nvs::Vector{String})
length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
for (i, d) in pairs(IndexLinear(), documents(c))
author!(d, nvs[i])
end
end

"""
timestamps!(crps, times::Vector{String})
timestamps!(crps, time::AbstractString)
Set the timestamps of the documents in `crps` to the timestamps in `times`, respectively.
See also: [`timestamps`](@ref), [`timestamp!`](@ref), [`timestamp`](@ref)
"""
function timestamps!(c::Corpus, nvs::Vector{String})
length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
for (i, d) in pairs(IndexLinear(), documents(c))
40 changes: 32 additions & 8 deletions src/ngramizer.jl
@@ -1,9 +1,18 @@
##############################################################################
#
# Construct n-grams using single space concatenation
#
##############################################################################
"""
ngramize(lang, tokens, n)
Compute the n-grams of `tokens` of order `n`.
# Example
```julia-repl
julia> ngramize(Languages.English(), ["To", "be", "or", "not", "to"], 3)
Dict{AbstractString,Int64} with 3 entries:
"be or not" => 1
"or not to" => 1
"To be or" => 1
```
"""
function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString}
(n == 1) && return onegramize(lang, words)

@@ -14,12 +23,27 @@ function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <:
for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
tokens[token] = get(tokens, token, 0) + 1

end

return tokens
end
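The counting loop above can be sketched without any package dependencies; this hypothetical `ngram_counts` helper mirrors its logic:

```julia
# Plain-Julia sketch of the n-gram counting loop in `ngramize`:
# slide a window of length n over the tokens and count each joined window.
function ngram_counts(words::Vector{String}, n::Int)
    counts = Dict{String,Int}()
    for i in 1:(length(words) - n + 1)
        gram = join(words[i:i + n - 1], " ")
        counts[gram] = get(counts, gram, 0) + 1
    end
    return counts
end

grams = ngram_counts(["To", "be", "or", "not", "to"], 3)
```

For the five tokens above this yields the same three trigrams as the docstring example, each with count 1.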

"""
onegramize(lang, tokens)
Create the unigram dict for the input tokens.
# Example
```julia-repl
julia> onegramize(Languages.English(), ["To", "be", "or", "not", "to", "be"])
Dict{String,Int64} with 5 entries:
"or" => 1
"not" => 1
"to" => 1
"To" => 1
"be" => 2
```
"""
function onegramize(lang::S, words::Vector{T}) where {S <: Language, T <: AbstractString}
n_words = length(words)
tokens = Dict{T, Int}()
@@ -28,5 +52,5 @@ function onegramize(lang::S, words::Vector{T}) where {S <: Language, T <: Abstra
tokens[word] = get(tokens, word, 0) + 1
end

return tokens
end