diff --git a/docs/src/documents.md b/docs/src/documents.md
index 5c959e60..2e9196be 100644
--- a/docs/src/documents.md
+++ b/docs/src/documents.md
@@ -266,6 +266,7 @@ julia> remove_words!(sd, ["lear"])
julia> sd
StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
```
+
At other times, you'll want to remove whole classes of words. To make this
easier, we can use several classes of basic words defined by the Languages.jl
package:
@@ -294,6 +295,7 @@ These special classes can all be removed using specially-named parameters:
These functions use word lists, so they are capable of working for many
different languages without change. These operations can also be combined
for improved performance, as sketched below:
+
* `prepare!(sd, strip_articles | strip_numbers | strip_html_tags)`
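+
+A minimal sketch of such a combined call (the exact whitespace of the result may vary):
+
+```julia
+sd = StringDocument("this is the 1 sample text")
+# strip articles ("a", "an", "the") and digits in a single pass
+prepare!(sd, strip_articles | strip_numbers)
+text(sd)
+```
+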
In addition to removing words, it is also common to take words that are
diff --git a/docs/src/features.md b/docs/src/features.md
index 9dbe86f9..ca4a8b27 100644
--- a/docs/src/features.md
+++ b/docs/src/features.md
@@ -126,8 +126,41 @@ julia> hash_dtv(crps[1])
0 0 0 0 0 0 0 0 0 0 0 0 0 … 0 0 0 0 0 0 0 0 0 0 0 0
```
+## TF (Term Frequency)
+
+Often we need to find out the proportion of a document that is contributed
+by each term. This can be done with the term frequency function
+
+    tf(dtm)
+
+The parameter `dtm` can be of type `DocumentTermMatrix`, `SparseMatrixCSC`, or `Matrix`.
+
+```julia
+julia> crps = Corpus([StringDocument("To be or not to be"),
+ StringDocument("To become or not to become")])
+
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+
+julia> tf(m)
+2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
+ [1, 1] = 0.166667
+ [2, 1] = 0.166667
+ [1, 2] = 0.333333
+ [2, 3] = 0.333333
+ [1, 4] = 0.166667
+ [2, 4] = 0.166667
+ [1, 5] = 0.166667
+ [2, 5] = 0.166667
+ [1, 6] = 0.166667
+ [2, 6] = 0.166667
+```
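+
+Here each entry is a term's count divided by the total number of tokens in its
+document: for instance, `be` accounts for 2 of the 6 tokens of the first
+document, giving the entry 0.333333 at `[1, 2]`.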
+
## TF-IDF (Term Frequency - Inverse Document Frequency)
+
+    tf_idf(dtm)
+
In many cases, raw word counts are not appropriate for use because:
* (A) Some documents are longer than other documents
@@ -135,8 +168,38 @@ In many cases, raw word counts are not appropriate for use because:
You can work around this by performing TF-IDF on a DocumentTermMatrix:
- m = DocumentTermMatrix(crps)
- tf_idf(m)
+```julia
+julia> crps = Corpus([StringDocument("To be or not to be"),
+ StringDocument("To become or not to become")])
+
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+DocumentTermMatrix(
+ [1, 1] = 1
+ [2, 1] = 1
+ [1, 2] = 2
+ [2, 3] = 2
+ [1, 4] = 1
+ [2, 4] = 1
+ [1, 5] = 1
+ [2, 5] = 1
+ [1, 6] = 1
+ [2, 6] = 1, ["To", "be", "become", "not", "or", "to"], Dict("or"=>5,"not"=>4,"to"=>6,"To"=>1,"be"=>2,"become"=>3))
+
+julia> tf_idf(m)
+2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
+ [1, 1] = 0.0
+ [2, 1] = 0.0
+ [1, 2] = 0.231049
+ [2, 3] = 0.231049
+ [1, 4] = 0.0
+ [2, 4] = 0.0
+ [1, 5] = 0.0
+ [2, 5] = 0.0
+ [1, 6] = 0.0
+ [2, 6] = 0.0
+```
As you can see, TF-IDF has the effect of inserting 0's into the columns of
words that occur in all documents. This is a useful way to avoid having to
diff --git a/src/metadata.jl b/src/metadata.jl
index f8700570..b9b98c05 100644
--- a/src/metadata.jl
+++ b/src/metadata.jl
@@ -1,41 +1,131 @@
-##############################################################################
-#
-# Metadata field getters and setters
-#
-##############################################################################
-
import Languages.name
+"""
+ title(doc)
+
+Return the title metadata for `doc`.
+
+See also: [`title!`](@ref), [`titles`](@ref), [`titles!`](@ref)
+"""
title(d::AbstractDocument) = d.metadata.title
+
+"""
+ language(doc)
+
+Return the language metadata for `doc`.
+
+See also: [`language!`](@ref), [`languages`](@ref), [`languages!`](@ref)
+"""
language(d::AbstractDocument) = d.metadata.language
+
+"""
+ author(doc)
+
+Return the author metadata for `doc`.
+
+See also: [`author!`](@ref), [`authors`](@ref), [`authors!`](@ref)
+"""
author(d::AbstractDocument) = d.metadata.author
+
+"""
+ timestamp(doc)
+
+Return the timestamp metadata for `doc`.
+
+See also: [`timestamp!`](@ref), [`timestamps`](@ref), [`timestamps!`](@ref)
+"""
timestamp(d::AbstractDocument) = d.metadata.timestamp
+
+"""
+ title!(doc, str)
+
+Set the title of `doc` to `str`.
+
+See also: [`title`](@ref), [`titles`](@ref), [`titles!`](@ref)
+"""
function title!(d::AbstractDocument, nv::AbstractString)
d.metadata.title = nv
end
-function language!(d::AbstractDocument, nv::T) where T <: Language
+"""
+ language!(doc, lang::Language)
+
+Set the language of `doc` to `lang`.
+
+# Example
+```julia-repl
+julia> d = StringDocument("String Document 1")
+
+julia> language!(d, Languages.Spanish())
+
+julia> d.metadata.language
+Languages.Spanish()
+```
+
+See also: [`language`](@ref), [`languages`](@ref), [`languages!`](@ref)
+"""
+function language!(d::AbstractDocument, nv::Language)
d.metadata.language = nv
end
+"""
+ author!(doc, author)
+
+Set the author metadata of `doc` to `author`.
+
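+# Example
+```julia-repl
+julia> d = StringDocument("Hello World")
+
+julia> author!(d, "Foo Bar")
+
+julia> author(d)
+"Foo Bar"
+```
+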
+See also: [`author`](@ref), [`authors`](@ref), [`authors!`](@ref)
+"""
function author!(d::AbstractDocument, nv::AbstractString)
d.metadata.author = nv
end
+"""
+ timestamp!(doc, timestamp::AbstractString)
+
+Set the timestamp metadata of `doc` to `timestamp`.
+
+See also: [`timestamp`](@ref), [`timestamps`](@ref), [`timestamps!`](@ref)
+"""
function timestamp!(d::AbstractDocument, nv::AbstractString)
d.metadata.timestamp = nv
end
-##############################################################################
-#
-# Vectorized getters for an entire Corpus
-#
-##############################################################################
+"""
+ titles(crps)
+
+Return the titles for each document in `crps`.
+
+See also: [`titles!`](@ref), [`title`](@ref), [`title!`](@ref)
+"""
titles(c::Corpus) = map(d -> title(d), documents(c))
+
+"""
+ languages(crps)
+
+Return the languages for each document in `crps`.
+
+See also: [`languages!`](@ref), [`language`](@ref), [`language!`](@ref)
+"""
languages(c::Corpus) = map(d -> language(d), documents(c))
+
+"""
+ authors(crps)
+
+Return the authors for each document in `crps`.
+
+See also: [`authors!`](@ref), [`author`](@ref), [`author!`](@ref)
+"""
authors(c::Corpus) = map(d -> author(d), documents(c))
+
+"""
+ timestamps(crps)
+
+Return the timestamps for each document in `crps`.
+
+See also: [`timestamps!`](@ref), [`timestamp`](@ref), [`timestamp!`](@ref)
+"""
timestamps(c::Corpus) = map(d -> timestamp(d), documents(c))
titles!(c::Corpus, nv::AbstractString) = title!.(documents(c), nv)
@@ -43,6 +133,16 @@ languages!(c::Corpus, nv::T) where {T <: Language} = language!.(documents(c), Re
authors!(c::Corpus, nv::AbstractString) = author!.(documents(c), Ref(nv))
timestamps!(c::Corpus, nv::AbstractString) = timestamp!.(documents(c), Ref(nv))
+"""
+ titles!(crps, vec::Vector{String})
+ titles!(crps, str)
+
+Update titles of the documents in a Corpus.
+
+If the input is a string, the same title is set for all documents. If the input is a vector, the title of the `i`th document is set to the `i`th element of `vec`; in that case, the number of documents must equal the length of the vector.
+
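+# Example
+```julia-repl
+julia> crps = Corpus([StringDocument("Doc 1"), StringDocument("Doc 2")])
+
+julia> titles!(crps, ["Title 1", "Title 2"])
+
+julia> titles(crps)
+2-element Array{String,1}:
+ "Title 1"
+ "Title 2"
+```
+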
+See also: [`titles`](@ref), [`title!`](@ref), [`title`](@ref)
+"""
function titles!(c::Corpus, nvs::Vector{String})
length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
for (i, d) in pairs(IndexLinear(), documents(c))
@@ -50,6 +150,16 @@ function titles!(c::Corpus, nvs::Vector{String})
end
end
+"""
+ languages!(crps, langs::Vector{Language})
+ languages!(crps, lang::Language)
+
+Update languages of documents in a Corpus.
+
+If the input is a vector, the language of the `i`th document is set to the `i`th element of the vector. In that case, the number of documents must equal the length of the vector.
+
+See also: [`languages`](@ref), [`language!`](@ref), [`language`](@ref)
+"""
function languages!(c::Corpus, nvs::Vector{T}) where T <: Language
length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
for (i, d) in pairs(IndexLinear(), documents(c))
@@ -57,6 +167,14 @@ function languages!(c::Corpus, nvs::Vector{T}) where T <: Language
end
end
+"""
+ authors!(crps, athrs)
+ authors!(crps, athr)
+
+Set the authors of the documents in `crps` to the elements of `athrs`, respectively, or set every document's author to the single string `athr`.
+
+See also: [`authors`](@ref), [`author!`](@ref), [`author`](@ref)
+"""
function authors!(c::Corpus, nvs::Vector{String})
length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
for (i, d) in pairs(IndexLinear(), documents(c))
@@ -64,6 +182,14 @@ function authors!(c::Corpus, nvs::Vector{String})
end
end
+"""
+ timestamps!(crps, times::Vector{String})
+ timestamps!(crps, time::AbstractString)
+
+Set the timestamps of the documents in `crps` to the timestamps in `times`, respectively.
+
+See also: [`timestamps`](@ref), [`timestamp!`](@ref), [`timestamp`](@ref)
+"""
function timestamps!(c::Corpus, nvs::Vector{String})
length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
for (i, d) in pairs(IndexLinear(), documents(c))
diff --git a/src/ngramizer.jl b/src/ngramizer.jl
index cc19c2ba..6d8e0fa9 100644
--- a/src/ngramizer.jl
+++ b/src/ngramizer.jl
@@ -1,9 +1,18 @@
-##############################################################################
-#
-# Construct n-grams using single space concatenation
-#
-##############################################################################
+"""
+    ngramize(lang, tokens, n)
+
+Compute the n-grams of `tokens` of order `n`.
+
+# Example
+
+```julia-repl
+julia> ngramize(Languages.English(), ["To", "be", "or", "not", "to"], 3)
+Dict{AbstractString,Int64} with 3 entries:
+ "be or not" => 1
+ "or not to" => 1
+ "To be or" => 1
+```
+"""
function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString}
(n == 1) && return onegramize(lang, words)
@@ -14,12 +23,27 @@ function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <:
for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
tokens[token] = get(tokens, token, 0) + 1
-
end
-
return tokens
end
+"""
+ onegramize(lang, tokens)
+
+Create the unigram dict for the input tokens.
+
+# Example
+
+```julia-repl
+julia> onegramize(Languages.English(), ["To", "be", "or", "not", "to", "be"])
+Dict{String,Int64} with 5 entries:
+ "or" => 1
+ "not" => 1
+ "to" => 1
+ "To" => 1
+ "be" => 2
+```
+"""
function onegramize(lang::S, words::Vector{T}) where {S <: Language, T <: AbstractString}
n_words = length(words)
tokens = Dict{T, Int}()
@@ -28,5 +52,5 @@ function onegramize(lang::S, words::Vector{T}) where {S <: Language, T <: Abstra
tokens[word] = get(tokens, word, 0) + 1
end
- tokens
+ return tokens
end
diff --git a/src/preprocessing.jl b/src/preprocessing.jl
index 55dfc4cf..f4dea4d7 100644
--- a/src/preprocessing.jl
+++ b/src/preprocessing.jl
@@ -1,4 +1,3 @@
-
const strip_patterns = UInt32(0)
const strip_corrupt_utf8 = UInt32(0x1) << 0
const strip_case = UInt32(0x1) << 1
@@ -37,17 +36,29 @@ function mk_regex(regex_string)
end
-##############################################################################
-#
-# Remove corrupt UTF8 characters
-#
-##############################################################################
+"""
+ remove_corrupt_utf8(str)
+
+Remove corrupt UTF8 characters in `str`.
+
+See also: [`remove_corrupt_utf8!`](@ref)
+"""
function remove_corrupt_utf8(s::AbstractString)
return map(x->isvalid(x) ? x : ' ', s)
end
remove_corrupt_utf8!(d::FileDocument) = error("FileDocument cannot be modified")
+"""
+ remove_corrupt_utf8!(doc)
+ remove_corrupt_utf8!(crps)
+
+Remove corrupt UTF8 characters from `doc` or from the documents in `crps`.
+
+Does not support `FileDocument` or Corpus containing `FileDocument`.
+
+See also: [`remove_corrupt_utf8`](@ref)
+"""
function remove_corrupt_utf8!(d::StringDocument)
d.text = remove_corrupt_utf8(d.text)
nothing
@@ -78,16 +89,10 @@ function remove_corrupt_utf8!(crps::Corpus)
end
end
-##############################################################################
-#
-# Conversion to lowercase
-#
-##############################################################################
-
"""
- remove_case(s::AbstractString)
+ remove_case(str)
-Converts the string to lowercase.
+Convert `str` to lowercase.
See also: [`remove_case!`](@ref)
"""
@@ -95,26 +100,31 @@ remove_case(s::T) where {T <: AbstractString} = lowercase(s)
"""
- remove_case!(d::TokenDocument)
- remove_case!(d::StringDocument)
- remove_case!(d::NGramDocument)
+ remove_case!(doc)
+ remove_case!(crps)
- remove_case!(c::Corpus)
+Convert the text of `doc` or `crps` to lowercase.
-Converts the text of the document or corpus to lowercase. This method does not
-works with FileDocument
+Does not support `FileDocument` or `crps` containing `FileDocument`.
# Example
```julia-repl
-julia> str="The quick brown fox jumps over the lazy dog"
-julia> sd=StringDocument(str)
-StringDocument{String}("The quick brown fox jumps over the lazy dog", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
+julia> str = "The quick brown fox jumps over the lazy dog"
+julia> sd = StringDocument(str)
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: The quick brown fox jumps over the lazy dog
julia> remove_case!(sd)
julia> sd.text
"the quick brown fox jumps over the lazy dog"
```
+
+See also: [`remove_case`](@ref)
"""
remove_case!(d::FileDocument) = error("FileDocument cannot be modified")
@@ -148,21 +158,60 @@ function remove_case!(crps::Corpus)
end
end
-##############################################################################
-#
-# Stripping HTML tags
-#
-##############################################################################
+
const script_tags = Regex("<script\\b[^>]*>([\\s\\S]*?)</script>")
const style_tags = Regex("<style\\b[^>]*>([\\s\\S]*?)</style>")
const html_tags = Regex("<[^>]*>")
+"""
+ remove_html_tags(str)
+
+Remove HTML tags from `str`, including the style and script tags.
+
+See also: [`remove_html_tags!`](@ref)
+"""
function remove_html_tags(s::AbstractString)
s = remove_patterns(s, script_tags)
s = remove_patterns(s, style_tags)
remove_patterns(s, html_tags)
end
+"""
+ remove_html_tags!(doc::StringDocument)
+ remove_html_tags!(crps)
+
+Remove HTML tags from the `StringDocument` `doc` or from the documents in `crps`.
+
+Does not work for documents other than `StringDocument`.
+
+# Example
+
+```julia-repl
+julia> html_doc = StringDocument(
+           "
+           <html>
+               <head><script language=\"javascript\">function f() {}</script></head>
+               <body>
+                   <h1>Hello</h1> world
+               </body>
+           </html>
+           "
+       )
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: <html> <head><script language="
+
+julia> remove_html_tags!(html_doc)
+
+julia> strip(text(html_doc))
+"Hello world"
+```
+
+See also: [`remove_html_tags`](@ref)
+"""
function remove_html_tags!(d::AbstractDocument)
error("HTML tags can be removed only from a StringDocument")
end
@@ -178,16 +227,12 @@ function remove_html_tags!(crps::Corpus)
end
end
-##############################################################################
-#
-# Remove specified words
-#
-##############################################################################
+
"""
- remove_words!(d::AbstractDocument, words::Vector)
- remove_words!(c::Corpus, words::Vector)
+ remove_words!(doc, words::Vector{AbstractString})
+ remove_words!(crps, words::Vector{AbstractString})
-Removes the tokens defined in the list `words` from the source Document or Corpus
+Remove the occurrences of `words` from `doc` or `crps`.
# Example
@@ -217,14 +262,33 @@ end
tag_pos!(entity) = error("Not yet implemented")
+"""
+    sparse_terms(crps, alpha=0.05)
+
+Find the sparse terms from a Corpus, i.e. those occurring in less than `alpha` fraction of the documents.
-##############################################################################
-#
-# Drop terms based on frequency
-#
-##############################################################################
+
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("This is Document 1"),
+ StringDocument("This is Document 2")])
+A Corpus with 2 documents:
+* 2 StringDocument's
+* 0 FileDocument's
+* 0 TokenDocument's
+* 0 NGramDocument's
+
+Corpus's lexicon contains 0 tokens
+Corpus's index contains 0 tokens
+
+julia> sparse_terms(crps, 0.5)
+2-element Array{String,1}:
+ "1"
+ "2"
+```
+
+See also: [`remove_sparse_terms!`](@ref), [`frequent_terms`](@ref)
+"""
function sparse_terms(crps::Corpus, alpha::Real = alpha_sparse)
update_lexicon!(crps)
update_inverse_index!(crps)
@@ -239,6 +303,34 @@ function sparse_terms(crps::Corpus, alpha::Real = alpha_sparse)
return res
end
+"""
+ frequent_terms(crps, alpha=0.95)
+
+Find the frequent terms from a Corpus, i.e. those occurring in more than `alpha` fraction of the documents.
+
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("This is Document 1"),
+ StringDocument("This is Document 2")])
+A Corpus with 2 documents:
+ * 2 StringDocument's
+ * 0 FileDocument's
+ * 0 TokenDocument's
+ * 0 NGramDocument's
+
+Corpus's lexicon contains 0 tokens
+Corpus's index contains 0 tokens
+
+julia> frequent_terms(crps)
+3-element Array{String,1}:
+ "is"
+ "This"
+ "Document"
+```
+
+See also: [`remove_frequent_terms!`](@ref), [`sparse_terms`](@ref)
+"""
function frequent_terms(crps::Corpus, alpha::Real = alpha_frequent)
update_lexicon!(crps)
update_inverse_index!(crps)
@@ -253,20 +345,115 @@ function frequent_terms(crps::Corpus, alpha::Real = alpha_frequent)
return res
end
-# Sparse terms occur in less than x percent of all documents
+"""
+ remove_sparse_terms!(crps, alpha=0.05)
+
+Remove the sparse terms in `crps`, i.e. those occurring in less than `alpha` fraction of the documents.
+
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("This is Document 1"),
+ StringDocument("This is Document 2")])
+A Corpus with 2 documents:
+ * 2 StringDocument's
+ * 0 FileDocument's
+ * 0 TokenDocument's
+ * 0 NGramDocument's
+
+Corpus's lexicon contains 0 tokens
+Corpus's index contains 0 tokens
+
+julia> remove_sparse_terms!(crps, 0.5)
+
+julia> crps[1].text
+"This is Document "
+
+julia> crps[2].text
+"This is Document "
+```
+
+See also: [`remove_frequent_terms!`](@ref), [`sparse_terms`](@ref)
+"""
remove_sparse_terms!(crps::Corpus, alpha::Real = alpha_sparse) = remove_words!(crps, sparse_terms(crps, alpha))
-# Frequent terms occur in more than x percent of all documents
+"""
+ remove_frequent_terms!(crps, alpha=0.95)
+
+Remove the terms in `crps` occurring in more than `alpha` fraction of the documents.
+
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("This is Document 1"),
+ StringDocument("This is Document 2")])
+A Corpus with 2 documents:
+* 2 StringDocument's
+* 0 FileDocument's
+* 0 TokenDocument's
+* 0 NGramDocument's
+
+Corpus's lexicon contains 0 tokens
+Corpus's index contains 0 tokens
+
+julia> remove_frequent_terms!(crps)
+
+julia> text(crps[1])
+" 1"
+
+julia> text(crps[2])
+" 2"
+```
+
+See also: [`remove_sparse_terms!`](@ref), [`frequent_terms`](@ref)
+"""
remove_frequent_terms!(crps::Corpus, alpha::Real = alpha_frequent) = remove_words!(crps, frequent_terms(crps, alpha))
+"""
+ prepare!(doc, flags)
+ prepare!(crps, flags)
+
+Preprocess document or corpus based on the input flags.
+
+# List of Flags
+
+* strip_patterns
+* strip_corrupt_utf8
+* strip_case
+* stem_words
+* tag_part_of_speech
+* strip_whitespace
+* strip_punctuation
+* strip_numbers
+* strip_non_letters
+* strip_indefinite_articles
+* strip_definite_articles
+* strip_articles
+* strip_prepositions
+* strip_pronouns
+* strip_stopwords
+* strip_sparse_terms
+* strip_frequent_terms
+* strip_html_tags
-##############################################################################
-#
-# Remove parts from document based on flags or regular expressions
-#
-##############################################################################
+# Example
+```julia-repl
+julia> doc = StringDocument("This is a document of mine")
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: This is a document of mine
+
+julia> prepare!(doc, strip_pronouns | strip_articles)
+
+julia> text(doc)
+"This is document of "
+```
+"""
function prepare!(crps::Corpus, flags::UInt32; skip_patterns = Set{AbstractString}(), skip_words = Set{AbstractString}())
((flags & strip_sparse_terms) > 0) && union!(skip_words, sparse_terms(crps))
((flags & strip_frequent_terms) > 0) && union!(skip_words, frequent_terms(crps))
@@ -302,41 +489,49 @@ end
"""
- remove_whitespace(s::AbstractString)
+ remove_whitespace(str)
-Squashes multiple whitespaces to a single one. And removes all leading and
-trailing whitespaces in a string.
+Squash multiple whitespaces to a single one and remove all leading and
+trailing whitespace.
+
+See also: [`remove_whitespace!`](@ref)
"""
-remove_whitespace(s::AbstractString) = replace(strip(s), r"\s+"=>" ")
+remove_whitespace(str::AbstractString) = replace(strip(str), r"\s+"=>" ")
"""
- remove_whitespace!(s::AbstractDocument)
+ remove_whitespace!(doc)
+ remove_whitespace!(crps)
-Squashes multiple whitespaces to a single space. And removes all leading and
-trailing whitespaces in a StringDocument and Corpus.
+Squash multiple whitespaces to a single space and remove all leading and trailing whitespace in `doc` or the documents of `crps`.
+
-Does no-op for NGramDocument and TokenDocument.
+Is a no-op for `FileDocument`, `TokenDocument`, and `NGramDocument`.
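+
+# Example
+```julia-repl
+julia> sd = StringDocument("  hello   world  ")
+
+julia> remove_whitespace!(sd)
+
+julia> text(sd)
+"hello world"
+```
+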
+See also: [`remove_whitespace`](@ref)
"""
function remove_whitespace!(d::StringDocument)
- d.text = remove_whitespace(d.text)
+ d.text = remove_whitespace(d.text)
end
function remove_whitespace!(crps::Corpus)
- for doc in crps
- remove_whitespace!(doc)
- end
+ for doc in crps
+ remove_whitespace!(doc)
+ end
end
function remove_whitespace!(d::AbstractDocument)
- nothing
+ nothing
end
+"""
+ remove_patterns(str, rex::Regex)
-function remove_patterns(s::AbstractString, rex::Regex)
- return replace(s, rex => "")
+
+Remove the parts of `str` matched by `rex`.
+
+See also: [`remove_patterns!`](@ref)
+"""
+function remove_patterns(s::AbstractString, rex::Regex)
+ return replace(s, rex => "")
end
function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
@@ -360,7 +555,16 @@ function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
String(take!(iob))
end
+"""
+ remove_patterns!(doc, rex::Regex)
+ remove_patterns!(crps, rex::Regex)
+
+Remove patterns matched by `rex` in document or Corpus.
+
+Does not modify `FileDocument` or a Corpus containing `FileDocument`s.
+
+See also: [`remove_patterns`](@ref)
+"""
remove_patterns!(d::FileDocument, rex::Regex) = error("FileDocument cannot be modified")
function remove_patterns!(d::StringDocument, rex::Regex)
diff --git a/src/sentiment.jl b/src/sentiment.jl
index 91ec5804..255b814e 100644
--- a/src/sentiment.jl
+++ b/src/sentiment.jl
@@ -48,12 +48,12 @@ function get_sentiment(handle_unknown, ip::Array{T, 1}, weight, rwi) where T <:
if ele in keys(rwi) && rwi[ele] <= size(weight[:embedding_1]["embedding_1"]["embeddings:0"])[2] # there are only 5000 unique embeddings
push!(res, rwi[ele])
else
- for words in handle_unknown(ele)
+ for words in handle_unknown(ele)
if words in keys(rwi) && rwi[words] <= size(weight[:embedding_1]["embedding_1"]["embeddings:0"])[2]
push!(res, rwi[words])
end
- end
-
+ end
+
end
end
return model(pad_sequences(res))[1]
@@ -67,7 +67,7 @@ struct SentimentModel
# Only load Flux once it is actually needed
global Flux
Flux = Base.require(TextAnalysis, :Flux)
-
+
new(read_weights(), read_word_ids())
end
end
@@ -89,15 +89,15 @@ end
"""
- ```
- model = SentimentAnalyzer(doc)
- model = SentimentAnalyzer(doc, handle_unknown)
- ```
- Return sentiment of the input doc in range 0 to 1, 0 being least sentiment score and 1 being
- the highest:
- - doc = Input Document for calculating document (AbstractDocument type)
- - handle_unknown = A function for handling unknown words. Should return an array (default x->tuple())
- """
+ model = SentimentAnalyzer(doc)
+ model = SentimentAnalyzer(doc, handle_unknown)
+
+Predict the sentiment of the input `doc` on a scale of 0 to 1, where 0 is the lowest sentiment score and 1 is the highest.
+
+# Arguments
+- doc = the input document (`AbstractDocument` type)
+- handle_unknown = a function for handling unknown words, which should return an array of words (default `x -> tuple()`)
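+
+# Example
+
+A minimal usage sketch (loading the pretrained model requires Flux; the review text is illustrative):
+
+```julia
+m = SentimentAnalyzer()
+doc = StringDocument("a wonderfully positive review")
+score = m(doc)    # Float between 0 and 1; closer to 1 is more positive
+```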
+"""
function(m::SentimentAnalyzer)(d::AbstractDocument, handle_unknown = x->tuple())
m.model(handle_unknown, tokens(d))
diff --git a/src/stemmer.jl b/src/stemmer.jl
index 72215391..89344527 100644
--- a/src/stemmer.jl
+++ b/src/stemmer.jl
@@ -8,8 +8,11 @@ const ISO_8859_1 = "ISO_8859_1"
const CP850 = "CP850"
const KOI8_R = "KOI8_R"
-##
-# lists the stemmer algorithms loaded
+"""
+ stemmer_types()
+
+List all the stemmer algorithms loaded.
+"""
function stemmer_types()
cptr = ccall((:sb_stemmer_list, libstemmer), Ptr{Ptr{UInt8}}, ())
(C_NULL == cptr) && error("error getting stemmer types")
@@ -50,7 +53,7 @@ mutable struct Stemmer
end
end
-show(io::IO, stm::Stemmer) = println(io, "Stemmer algorithm:$(stm.alg) encoding:$(stm.enc)")
+Base.show(io::IO, stm::Stemmer) = println(io, "Stemmer algorithm:$(stm.alg) encoding:$(stm.enc)")
function release(stm::Stemmer)
(C_NULL == stm.cptr) && return
@@ -59,6 +62,14 @@ function release(stm::Stemmer)
nothing
end
+"""
+ stem(stemmer::Stemmer, str)
+ stem(stemmer::Stemmer, words::Array)
+
+Stem the input with the stemming algorithm of `stemmer`.
+
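+# Example
+```julia-repl
+julia> stemmer = Stemmer("english")
+
+julia> stem(stemmer, "learning")
+"learn"
+```
+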
+See also: [`stem!`](@ref)
+"""
function stem(stemmer::Stemmer, bstr::AbstractString)
sres = ccall((:sb_stemmer_stem, libstemmer),
Ptr{UInt8},
@@ -83,13 +94,26 @@ function stem(stemmer::Stemmer, words::Array)
for idx in 1:l
ret[idx] = stem(stemmer, words[idx])
end
- ret
+ return ret
end
+"""
+ stemmer_for_document(doc)
+
+Return an appropriate stemmer based on the language of the document.
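+
+# Example
+```julia-repl
+julia> sd = StringDocument("hello world")
+
+julia> stemmer_for_document(sd)
+Stemmer algorithm:english encoding:UTF_8
+```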
+"""
function stemmer_for_document(d::AbstractDocument)
Stemmer(lowercase(Languages.english_name(language(d))))
end
+"""
+ stem!(doc)
+ stem!(crps)
+
+Stem the document or the documents in `crps` with a suitable stemmer.
+
+Stemming cannot be done for a `FileDocument` or a Corpus containing `FileDocument`s.
+"""
function stem!(d::AbstractDocument)
stemmer = stemmer_for_document(d)
stem!(stemmer, d)
diff --git a/src/tf_idf.jl b/src/tf_idf.jl
index a7f142c3..21babcf4 100644
--- a/src/tf_idf.jl
+++ b/src/tf_idf.jl
@@ -1,21 +1,12 @@
-##############################################################################
-#
-# TF
-#
-##############################################################################
+"""
+ tf!(dtm::AbstractMatrix{Real}, tf::AbstractMatrix{AbstractFloat})
-tf(dtm::Matrix{T}) where {T <: Real} = tf!(dtm, Array{Float64}(undef, size(dtm)...))
-
-tf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, similar(dtm, Float64))
-
-tf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf!(dtm, dtm)
-
-tf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, dtm)
+
+Overwrite `tf` with the term frequency of `dtm`.
-tf(dtm::DocumentTermMatrix) = tf(dtm.dtm)
+
+Works correctly if `dtm` and `tf` are the same matrix.
+
-# The second Matrix will be overwritten with the result
-# Will work correctly if dtm and tfidf are the same matrix
+See also: [`tf`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref)
+"""
function tf!(dtm::AbstractMatrix{T1}, tf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat}
n, p = size(dtm)
@@ -31,7 +22,15 @@ function tf!(dtm::AbstractMatrix{T1}, tf::AbstractMatrix{T2}) where {T1 <: Real,
return tf
end
-# assumes second matrix has same nonzeros as first one
+"""
+ tf!(dtm::SparseMatrixCSC{Real}, tf::SparseMatrixCSC{AbstractFloat})
+
+Overwrite `tf` with the term frequency of `dtm`.
+
+`tf` should have the same nonzeros as `dtm`.
+
+See also: [`tf`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref)
+"""
function tf!(dtm::SparseMatrixCSC{T}, tf::SparseMatrixCSC{F}) where {T <: Real, F <: AbstractFloat}
rows = rowvals(dtm)
dtmvals = nonzeros(dtm)
@@ -48,31 +47,61 @@ function tf!(dtm::SparseMatrixCSC{T}, tf::SparseMatrixCSC{F}) where {T <: Real,
tfvals[j] = dtmvals[j] / max(words_in_documents[row], one(T))
end
end
- tf
+ return tf
end
-##############################################################################
-#
-# TF-IDF
-#
-##############################################################################
+tf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf!(dtm, dtm)
-tf_idf(dtm::Matrix{T}) where {T <: Real} = tf_idf!(dtm, Array{Float64}(undef, size(dtm)...))
+tf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, dtm)
-tf_idf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, similar(dtm, Float64))
+"""
+ tf(dtm::DocumentTermMatrix)
+ tf(dtm::SparseMatrixCSC{Real})
+ tf(dtm::Matrix{Real})
-tf_idf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf_idf!(dtm, dtm)
+Compute the term frequency of the input.
-tf_idf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, dtm)
+# Example
-tf_idf(dtm::DocumentTermMatrix) = tf_idf(dtm.dtm)
+```julia-repl
+julia> crps = Corpus([StringDocument("To be or not to be"),
+ StringDocument("To become or not to become")])
-# This does not make sense, since DocumentTermMatrix is based on an array of integers
-#tf_idf!(dtm::DocumentTermMatrix) = tf_idf!(dtm.dtm)
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+
+julia> tf(m)
+2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
+ [1, 1] = 0.166667
+ [2, 1] = 0.166667
+ [1, 2] = 0.333333
+ [2, 3] = 0.333333
+ [1, 4] = 0.166667
+ [2, 4] = 0.166667
+ [1, 5] = 0.166667
+ [2, 5] = 0.166667
+ [1, 6] = 0.166667
+ [2, 6] = 0.166667
+```
+
+See also: [`tf!`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref)
+"""
+tf(dtm::DocumentTermMatrix) = tf(dtm.dtm)
+
+tf(dtm::Matrix{T}) where {T <: Real} = tf!(dtm, Array{Float64}(undef, size(dtm)...))
+
+tf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, similar(dtm, Float64))
+
+"""
+    tf_idf!(dtm::AbstractMatrix{Real}, tf_idf::AbstractMatrix{AbstractFloat})
+
+Overwrite `tf_idf` with the tf-idf (Term Frequency - Inverse Document Frequency) of `dtm`.
-# The second Matrix will be overwritten with the result
-# Will work correctly if dtm and tfidf are the same matrix
+
+`dtm` and `tf_idf` must be matrices of the same dimensions.
+
+See also: [`tf`](@ref), [`tf!`](@ref) , [`tf_idf`](@ref)
+"""
function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat}
n, p = size(dtm)
@@ -93,7 +122,15 @@ function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <
return tfidf
end
-# sparse version
+"""
+ tf_idf!(dtm::SparseMatrixCSC{Real}, tfidf::SparseMatrixCSC{AbstractFloat})
+
+Overwrite `tfidf` with the tf-idf (Term Frequency - Inverse Document Frequency) of `dtm`.
+
+The arguments must have the same number of nonzeros.
+
+See also: [`tf`](@ref), [`tf!`](@ref), [`tf_idf`](@ref)
+"""
function tf_idf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) where {T <: Real, F <: AbstractFloat}
rows = rowvals(dtm)
dtmvals = nonzeros(dtm)
@@ -117,5 +154,63 @@ function tf_idf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) where {T <:
end
end
- tfidf
+ return tfidf
end
+
+"""
+ tf_idf!(dtm)
+
+Compute the tf-idf of `dtm` in place, overwriting `dtm` with the result.
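+
+# Example
+
+A minimal in-place sketch on a dense floating-point matrix of toy counts:
+
+```julia-repl
+julia> m = Float64[1 2 0; 1 0 3];
+
+julia> tf_idf!(m);   # m itself now holds the tf-idf weights
+```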
+"""
+tf_idf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf_idf!(dtm, dtm)
+
+tf_idf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, dtm)
+
+# This does not make sense, since DocumentTermMatrix is based on an array of integers
+#tf_idf!(dtm::DocumentTermMatrix) = tf_idf!(dtm.dtm)
+
+"""
+    tf_idf(dtm::DocumentTermMatrix)
+    tf_idf(dtm::SparseMatrixCSC{Real})
+    tf_idf(dtm::Matrix{Real})
+
+Compute the `tf-idf` value (Term Frequency - Inverse Document Frequency) of the input.
+
+In many cases, raw word counts are not appropriate for use because:
+
+- Some documents are longer than other documents
+- Some words are more frequent than other words
+
+A simple workaround is to perform `TF-IDF` on a `DocumentTermMatrix`.
+
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("To be or not to be"),
+ StringDocument("To become or not to become")])
+
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+
+julia> tf_idf(m)
+2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
+ [1, 1] = 0.0
+ [2, 1] = 0.0
+ [1, 2] = 0.231049
+ [2, 3] = 0.231049
+ [1, 4] = 0.0
+ [2, 4] = 0.0
+ [1, 5] = 0.0
+ [2, 5] = 0.0
+ [1, 6] = 0.0
+ [2, 6] = 0.0
+```
+
+See also: [`tf`](@ref), [`tf!`](@ref), [`tf_idf!`](@ref)
+"""
+tf_idf(dtm::DocumentTermMatrix) = tf_idf(dtm.dtm)
+
+tf_idf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, similar(dtm, Float64))
+
+tf_idf(dtm::Matrix{T}) where {T <: Real} = tf_idf!(dtm, Array{Float64}(undef, size(dtm)...))
diff --git a/src/tokenizer.jl b/src/tokenizer.jl
index 044011f2..78ef1c9f 100644
--- a/src/tokenizer.jl
+++ b/src/tokenizer.jl
@@ -1,9 +1,37 @@
-##############################################################################
-#
-# Split string into tokens on whitespace
-#
-##############################################################################
+"""
+ tokenize(language, str)
+
+Split `str` into words and other tokens such as punctuation.
+
+# Example
+
+```julia-repl
+julia> tokenize(Languages.English(), "Too foo words!")
+4-element Array{String,1}:
+ "Too"
+ "foo"
+ "words"
+ "!"
+```
+
+See also: [`sentence_tokenize`](@ref)
+"""
tokenize(lang::S, s::T) where {S <: Language, T <: AbstractString} = WordTokenizers.tokenize(s)
+
+"""
+ sentence_tokenize(language, str)
+
+Split `str` into sentences.
+
+# Example
+```julia-repl
+julia> sentence_tokenize(Languages.English(), "Here are few words! I am Foo Bar.")
+2-element Array{SubString{String},1}:
+ "Here are few words!"
+ "I am Foo Bar."
+```
+
+See also: [`tokenize`](@ref)
+"""
sentence_tokenize(lang::S, s::T) where {S <: Language, T<:AbstractString} = WordTokenizers.split_sentences(s)