diff --git a/docs/src/documents.md b/docs/src/documents.md
index 5c959e60..2e9196be 100644
--- a/docs/src/documents.md
+++ b/docs/src/documents.md
@@ -266,6 +266,7 @@ julia> remove_words!(sd, ["lear"])
 julia> sd
 StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
 ```
+
 At other times, you'll want to remove whole classes of words. To make this
 easier, we can use several classes of basic words defined by the Languages.jl
 package:
@@ -294,6 +295,7 @@ These special classes can all be removed using specially-named parameters:
 
 These functions use word lists, so they work for many different languages
 without change. These operations can also be combined for improved performance:
+
 * `prepare!(sd, strip_articles| strip_numbers| strip_html_tags)`
 
 In addition to removing words, it is also common to take words that are
diff --git a/docs/src/features.md b/docs/src/features.md
index 9dbe86f9..ca4a8b27 100644
--- a/docs/src/features.md
+++ b/docs/src/features.md
@@ -126,8 +126,41 @@ julia> hash_dtv(crps[1])
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 ```
 
+## TF (Term Frequency)
+
+Often we need to find the proportion of a document that is contributed by each
+term. This can be done with the term frequency function
+
+    tf(dtm)
+
+The parameter `dtm` can be of type `DocumentTermMatrix`, `SparseMatrixCSC`, or `Matrix`.
+
+```julia
+julia> crps = Corpus([StringDocument("To be or not to be"),
+                      StringDocument("To become or not to become")])
+
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+
+julia> tf(m)
+2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
+  [1, 1]  =  0.166667
+  [2, 1]  =  0.166667
+  [1, 2]  =  0.333333
+  [2, 3]  =  0.333333
+  [1, 4]  =  0.166667
+  [2, 4]  =  0.166667
+  [1, 5]  =  0.166667
+  [2, 5]  =  0.166667
+  [1, 6]  =  0.166667
+  [2, 6]  =  0.166667
+```
+
 ## TF-IDF (Term Frequency - Inverse Document Frequency)
 
+    tf_idf(dtm)
+
 In many cases, raw word counts are not appropriate for use because:
 
 * (A) Some documents are longer than other documents
@@ -135,8 +168,38 @@ In many cases, raw word counts are not appropriate for use because:
 
 You can work around this by performing TF-IDF on a DocumentTermMatrix:
 
-    m = DocumentTermMatrix(crps)
-    tf_idf(m)
+```julia
+julia> crps = Corpus([StringDocument("To be or not to be"),
+                      StringDocument("To become or not to become")])
+
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+DocumentTermMatrix(
+  [1, 1]  =  1
+  [2, 1]  =  1
+  [1, 2]  =  2
+  [2, 3]  =  2
+  [1, 4]  =  1
+  [2, 4]  =  1
+  [1, 5]  =  1
+  [2, 5]  =  1
+  [1, 6]  =  1
+  [2, 6]  =  1, ["To", "be", "become", "not", "or", "to"], Dict("or"=>5,"not"=>4,"to"=>6,"To"=>1,"be"=>2,"become"=>3))
+
+julia> tf_idf(m)
+2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
+  [1, 1]  =  0.0
+  [2, 1]  =  0.0
+  [1, 2]  =  0.231049
+  [2, 3]  =  0.231049
+  [1, 4]  =  0.0
+  [2, 4]  =  0.0
+  [1, 5]  =  0.0
+  [2, 5]  =  0.0
+  [1, 6]  =  0.0
+  [2, 6]  =  0.0
+```
 
 As you can see, TF-IDF has the effect of inserting 0's into the columns of
 words that occur in all documents.
This is a useful way to avoid having to diff --git a/src/metadata.jl b/src/metadata.jl index f8700570..b9b98c05 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -1,41 +1,131 @@ -############################################################################## -# -# Metadata field getters and setters -# -############################################################################## - import Languages.name +""" + title(doc) + +Return the title metadata for `doc`. + +See also: [`title!`](@ref), [`titles`](@ref), [`titles!`](@ref) +""" title(d::AbstractDocument) = d.metadata.title + +""" + language(doc) + +Return the language metadata for `doc`. + +See also: [`language!`](@ref), [`languages`](@ref), [`languages!`](@ref) +""" language(d::AbstractDocument) = d.metadata.language + +""" + author(doc) + +Return the author metadata for `doc`. + +See also: [`author!`](@ref), [`authors`](@ref), [`authors!`](@ref) +""" author(d::AbstractDocument) = d.metadata.author + +""" + timestamp(doc) + +Return the timestamp metadata for `doc`. + +See also: [`timestamp!`](@ref), [`timestamps`](@ref), [`timestamps!`](@ref) +""" timestamp(d::AbstractDocument) = d.metadata.timestamp + +""" + title!(doc, str) + +Set the title of `doc` to `str`. + +See also: [`title`](@ref), [`titles`](@ref), [`titles!`](@ref) +""" function title!(d::AbstractDocument, nv::AbstractString) d.metadata.title = nv end -function language!(d::AbstractDocument, nv::T) where T <: Language +""" + language!(doc, lang::Language) + +Set the language of `doc` to `lang`. + +# Example +```julia-repl +julia> d = StringDocument("String Document 1") + +julia> language!(d, Languages.Spanish()) + +julia> d.metadata.language +Languages.Spanish() +``` + +See also: [`language`](@ref), [`languages`](@ref), [`languages!`](@ref) +""" +function language!(d::AbstractDocument, nv::Language) d.metadata.language = nv end +""" + author!(doc, author) + +Set the author metadata of doc to `author`. + +See also: [`author`](@ref), [`authors`](@ref), [`authors!`](@ref) +""" function author!(d::AbstractDocument, nv::AbstractString) d.metadata.author = nv end +""" + timestamp!(doc, timestamp::AbstractString) + +Set the timestamp metadata of doc to `timestamp`. + +See also: [`timestamp`](@ref), [`timestamps`](@ref), [`timestamps!`](@ref) +""" function timestamp!(d::AbstractDocument, nv::AbstractString) d.metadata.timestamp = nv end -############################################################################## -# -# Vectorized getters for an entire Corpus -# -############################################################################## +""" + titles(crps) + +Return the titles for each document in `crps`. + +See also: [`titles!`](@ref), [`title`](@ref), [`title!`](@ref) +""" titles(c::Corpus) = map(d -> title(d), documents(c)) + +""" + languages(crps) + +Return the languages for each document in `crps`. + +See also: [`languages!`](@ref), [`language`](@ref), [`language!`](@ref) +""" languages(c::Corpus) = map(d -> language(d), documents(c)) + +""" + authors(crps) + +Return the authors for each document in `crps`. + +See also: [`authors!`](@ref), [`author`](@ref), [`author!`](@ref) +""" authors(c::Corpus) = map(d -> author(d), documents(c)) + +""" + timestamps(crps) + +Return the timestamps for each document in `crps`. 
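+
+# Example
+
+A small usage sketch (documents constructed without explicit metadata default
+to "Unknown Time"):
+
+```julia-repl
+julia> crps = Corpus([StringDocument("Document 1"),
+                      StringDocument("Document 2")])
+
+julia> timestamps(crps)
+2-element Array{String,1}:
+ "Unknown Time"
+ "Unknown Time"
+```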
+
+See also: [`timestamps!`](@ref), [`timestamp`](@ref), [`timestamp!`](@ref)
+"""
 timestamps(c::Corpus) = map(d -> timestamp(d), documents(c))
 
 titles!(c::Corpus, nv::AbstractString) = title!.(documents(c), nv)
@@ -43,6 +133,16 @@ languages!(c::Corpus, nv::T) where {T <: Language} = language!.(documents(c), Re
 authors!(c::Corpus, nv::AbstractString) = author!.(documents(c), Ref(nv))
 timestamps!(c::Corpus, nv::AbstractString) = timestamp!.(documents(c), Ref(nv))
 
+"""
+    titles!(crps, vec::Vector{String})
+    titles!(crps, str)
+
+Update the titles of the documents in a Corpus.
+
+If the input is a `String`, the same title is set for all documents. If the input is a `Vector`, the title of the `i`th document is set to the `i`th element of `vec`. In the latter case, the number of documents must equal the length of the vector.
+
+See also: [`titles`](@ref), [`title!`](@ref), [`title`](@ref)
+"""
 function titles!(c::Corpus, nvs::Vector{String})
     length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
     for (i, d) in pairs(IndexLinear(), documents(c))
@@ -50,6 +150,16 @@ function titles!(c::Corpus, nvs::Vector{String})
     end
 end
 
+"""
+    languages!(crps, langs::Vector{Language})
+    languages!(crps, lang::Language)
+
+Update the languages of the documents in a Corpus.
+
+If the input is a `Vector`, the language of the `i`th document is set to the `i`th element of `langs`. In that case, the number of documents must equal the length of the vector.
+
+See also: [`languages`](@ref), [`language!`](@ref), [`language`](@ref)
+"""
 function languages!(c::Corpus, nvs::Vector{T}) where T <: Language
     length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
     for (i, d) in pairs(IndexLinear(), documents(c))
@@ -57,6 +167,14 @@ function languages!(c::Corpus, nvs::Vector{T}) where T <: Language
     end
 end
 
+"""
+    authors!(crps, athrs)
+    authors!(crps, athr)
+
+Set the authors of the documents in `crps` to `athrs`, respectively.
+
+See also: [`authors`](@ref), [`author!`](@ref), [`author`](@ref)
+"""
 function authors!(c::Corpus, nvs::Vector{String})
     length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
     for (i, d) in pairs(IndexLinear(), documents(c))
@@ -64,6 +182,14 @@ function authors!(c::Corpus, nvs::Vector{String})
     end
 end
 
+"""
+    timestamps!(crps, times::Vector{String})
+    timestamps!(crps, time::AbstractString)
+
+Set the timestamps of the documents in `crps` to the timestamps in `times`, respectively.
+
+See also: [`timestamps`](@ref), [`timestamp!`](@ref), [`timestamp`](@ref)
+"""
 function timestamps!(c::Corpus, nvs::Vector{String})
     length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
     for (i, d) in pairs(IndexLinear(), documents(c))
diff --git a/src/ngramizer.jl b/src/ngramizer.jl
index cc19c2ba..6d8e0fa9 100644
--- a/src/ngramizer.jl
+++ b/src/ngramizer.jl
@@ -1,9 +1,18 @@
-##############################################################################
-#
-# Construct n-grams using single space concatenation
-#
-##############################################################################
+"""
+    ngramize(lang, tokens, n)
+
+Compute the n-grams of `tokens` of order `n`.
+
+# Example
+
+```julia-repl
+julia> ngramize(Languages.English(), ["To", "be", "or", "not", "to"], 3)
+Dict{AbstractString,Int64} with 3 entries:
+  "be or not" => 1
+  "or not to" => 1
+  "To be or"  => 1
+```
+"""
 function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString}
     (n == 1) && return onegramize(lang, words)
 
@@ -14,12 +23,27 @@ function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <:
     for index in 1:(n_words - n + 1)
         token = join(words[index:(index + n - 1)], " ")
         tokens[token] = get(tokens, token, 0) + 1
     end
 
     return tokens
 end
 
+"""
+    onegramize(lang, tokens)
+
+Create the unigram dict for the input tokens.
+
+# Example
+
+```julia-repl
+julia> onegramize(Languages.English(), ["To", "be", "or", "not", "to", "be"])
+Dict{String,Int64} with 5 entries:
+  "or"  => 1
+  "not" => 1
+  "to"  => 1
+  "To"  => 1
+  "be"  => 2
+```
+"""
 function onegramize(lang::S, words::Vector{T}) where {S <: Language, T <: AbstractString}
     n_words = length(words)
     tokens = Dict{T, Int}()
@@ -28,5 +52,5 @@ function onegramize(lang::S, words::Vector{T}) where {S <: Language, T <: Abstra
         tokens[word] = get(tokens, word, 0) + 1
     end
 
-    tokens
+    return tokens
 end
diff --git a/src/preprocessing.jl b/src/preprocessing.jl
index 55dfc4cf..f4dea4d7 100644
--- a/src/preprocessing.jl
+++ b/src/preprocessing.jl
@@ -1,4 +1,3 @@
-
 const strip_patterns = UInt32(0)
 const strip_corrupt_utf8 = UInt32(0x1) << 0
 const strip_case = UInt32(0x1) << 1
@@ -37,17 +36,29 @@ function mk_regex(regex_string)
 end
 
-##############################################################################
-#
-# Remove corrupt UTF8 characters
-#
-##############################################################################
+"""
+    remove_corrupt_utf8(str)
+
+Remove corrupt UTF-8 characters from `str`.
+
+See also: [`remove_corrupt_utf8!`](@ref)
+"""
 function remove_corrupt_utf8(s::AbstractString)
     return map(x->isvalid(x) ? x : ' ', s)
 end
 
 remove_corrupt_utf8!(d::FileDocument) = error("FileDocument cannot be modified")
 
+"""
+    remove_corrupt_utf8!(doc)
+    remove_corrupt_utf8!(crps)
+
+Remove corrupt UTF-8 characters from `doc` or from the documents in `crps`.
+
+Does not support `FileDocument` or a Corpus containing a `FileDocument`.
+
+See also: [`remove_corrupt_utf8`](@ref)
+"""
 function remove_corrupt_utf8!(d::StringDocument)
     d.text = remove_corrupt_utf8(d.text)
     nothing
@@ -78,16 +89,10 @@ function remove_corrupt_utf8!(crps::Corpus)
     end
 end
 
-##############################################################################
-#
-# Conversion to lowercase
-#
-##############################################################################
-
 """
-    remove_case(s::AbstractString)
+    remove_case(str)
 
-Converts the string to lowercase.
+Convert `str` to lowercase.
 
 See also: [`remove_case!`](@ref)
 """
 remove_case(s::T) where {T <: AbstractString} = lowercase(s)
 
 
 """
-    remove_case!(d::TokenDocument)
-    remove_case!(d::StringDocument)
-    remove_case!(d::NGramDocument)
+    remove_case!(doc)
+    remove_case!(crps)
 
-    remove_case!(c::Corpus)
+Convert the text of `doc` or `crps` to lowercase.
 
-Converts the text of the document or corpus to lowercase. This method does not
-works with FileDocument
+Does not support `FileDocument` or a `crps` containing a `FileDocument`.
 # Example
 
 ```julia-repl
-julia> str="The quick brown fox jumps over the lazy dog"
-julia> sd=StringDocument(str)
-StringDocument{String}("The quick brown fox jumps over the lazy dog", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
+julia> str = "The quick brown fox jumps over the lazy dog"
+julia> sd = StringDocument(str)
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: The quick brown fox jumps over the lazy dog
 
 julia> remove_case!(sd)
 
 julia> sd.text
 "the quick brown fox jumps over the lazy dog"
 ```
+
+See also: [`remove_case`](@ref)
 """
 remove_case!(d::FileDocument) = error("FileDocument cannot be modified")
 
@@ -148,21 +158,60 @@ function remove_case!(crps::Corpus)
     end
 end
 
-##############################################################################
-#
-# Stripping HTML tags
-#
-##############################################################################
+
 const script_tags = Regex("<script\\b[^>]*>([\\s\\S]*?)</script>")
 const style_tags = Regex("<style\\b[^>]*>([\\s\\S]*?)</style>")
 const html_tags = Regex("<[^>]*>")
 
+"""
+    remove_html_tags(str)
+
+Remove HTML tags from `str`, including the style and script tags.
+
+See also: [`remove_html_tags!`](@ref)
+"""
 function remove_html_tags(s::AbstractString)
     s = remove_patterns(s, script_tags)
     s = remove_patterns(s, style_tags)
    remove_patterns(s, html_tags)
 end
 
+"""
+    remove_html_tags!(doc::StringDocument)
+    remove_html_tags!(crps)
+
+Remove HTML tags from the `StringDocument` or from the documents in `crps`.
+
+Does not work for documents other than `StringDocument`.
+
+# Example
+
+```julia-repl
+julia> html_doc = StringDocument(
+           "
+            <html>
+                <head><script language=\"javascript\">x = 20;</script></head>
+                <body>
+                    <h1>Hello</h1> <a href=\"world\">world</a>
+                </body>
+            </html>
+           "
+       )
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: <html> <head><script lang
+
+julia> remove_html_tags!(html_doc)
+
+julia> strip(text(html_doc))
+"Hello world"
+```
+
+See also: [`remove_html_tags`](@ref)
+"""
 function remove_html_tags!(d::AbstractDocument)
     error("HTML tags can be removed only from a StringDocument")
 end
@@ -178,16 +227,12 @@ function remove_html_tags!(crps::Corpus)
     end
 end
 
-##############################################################################
-#
-# Remove specified words
-#
-##############################################################################
+
 """
-    remove_words!(d::AbstractDocument, words::Vector)
-    remove_words!(c::Corpus, words::Vector)
+    remove_words!(doc, words::Vector{AbstractString})
+    remove_words!(crps, words::Vector{AbstractString})
 
-Removes the tokens defined in the list `words` from the source Document or Corpus
+Remove the occurrences of `words` from `doc` or `crps`.
 
 # Example
 
@@ -217,14 +262,33 @@ end
 
 tag_pos!(entity) = error("Not yet implemented")
 
+"""
+    sparse_terms(crps, alpha=0.05)
+
+Find the sparse terms in a Corpus, i.e. those occurring in at most `alpha` fraction of the documents.
 
-##############################################################################
-#
-# Drop terms based on frequency
-#
-##############################################################################
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("This is Document 1"),
+                      StringDocument("This is Document 2")])
+A Corpus with 2 documents:
+ * 2 StringDocument's
+ * 0 FileDocument's
+ * 0 TokenDocument's
+ * 0 NGramDocument's
+
+Corpus's lexicon contains 0 tokens
+Corpus's index contains 0 tokens
+
+julia> sparse_terms(crps, 0.5)
+2-element Array{String,1}:
+ "1"
+ "2"
+```
+
+See also: [`remove_sparse_terms!`](@ref), [`frequent_terms`](@ref)
+"""
 function sparse_terms(crps::Corpus, alpha::Real = alpha_sparse)
     update_lexicon!(crps)
     update_inverse_index!(crps)
@@ -239,6 +303,34 @@ function sparse_terms(crps::Corpus, alpha::Real = alpha_sparse)
     return res
 end
 
+"""
+    frequent_terms(crps, alpha=0.95)
+
+Find the frequent terms in a Corpus, i.e. those occurring in more than `alpha` fraction of the documents.
+
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("This is Document 1"),
+                      StringDocument("This is Document 2")])
+A Corpus with 2 documents:
+ * 2 StringDocument's
+ * 0 FileDocument's
+ * 0 TokenDocument's
+ * 0 NGramDocument's
+
+Corpus's lexicon contains 0 tokens
+Corpus's index contains 0 tokens
+
+julia> frequent_terms(crps)
+3-element Array{String,1}:
+ "is"
+ "This"
+ "Document"
+```
+
+See also: [`remove_frequent_terms!`](@ref), [`sparse_terms`](@ref)
+"""
 function frequent_terms(crps::Corpus, alpha::Real = alpha_frequent)
     update_lexicon!(crps)
     update_inverse_index!(crps)
@@ -253,20 +345,115 @@ function frequent_terms(crps::Corpus, alpha::Real = alpha_frequent)
     return res
 end
 
-# Sparse terms occur in less than x percent of all documents
+"""
+    remove_sparse_terms!(crps, alpha=0.05)
+
+Remove the sparse terms from `crps`, i.e. those occurring in at most `alpha` fraction of the documents.
+
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("This is Document 1"),
+                      StringDocument("This is Document 2")])
+A Corpus with 2 documents:
+ * 2 StringDocument's
+ * 0 FileDocument's
+ * 0 TokenDocument's
+ * 0 NGramDocument's
+
+Corpus's lexicon contains 0 tokens
+Corpus's index contains 0 tokens
+
+julia> remove_sparse_terms!(crps, 0.5)
+
+julia> crps[1].text
+"This is Document "
+
+julia> crps[2].text
+"This is Document "
+```
+
+See also: [`remove_frequent_terms!`](@ref), [`sparse_terms`](@ref)
+"""
 remove_sparse_terms!(crps::Corpus, alpha::Real = alpha_sparse) = remove_words!(crps, sparse_terms(crps, alpha))
 
-# Frequent terms occur in more than x percent of all documents
+"""
+    remove_frequent_terms!(crps, alpha=0.95)
+
+Remove the frequent terms from `crps`, i.e. those occurring in more than `alpha` fraction of the documents.
+
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("This is Document 1"),
+                      StringDocument("This is Document 2")])
+A Corpus with 2 documents:
+ * 2 StringDocument's
+ * 0 FileDocument's
+ * 0 TokenDocument's
+ * 0 NGramDocument's
+
+Corpus's lexicon contains 0 tokens
+Corpus's index contains 0 tokens
+
+julia> remove_frequent_terms!(crps)
+
+julia> text(crps[1])
+" 1"
+
+julia> text(crps[2])
+" 2"
+```
+
+See also: [`remove_sparse_terms!`](@ref), [`frequent_terms`](@ref)
+"""
 remove_frequent_terms!(crps::Corpus, alpha::Real = alpha_frequent) = remove_words!(crps, frequent_terms(crps, alpha))
 
+"""
+    prepare!(doc, flags)
+    prepare!(crps, flags)
+
+Preprocess the document or corpus according to the given flags.
+
+# List of Flags
+
+* strip_patterns
+* strip_corrupt_utf8
+* strip_case
+* stem_words
+* tag_part_of_speech
+* strip_whitespace
+* strip_punctuation
+* strip_numbers
+* strip_non_letters
+* strip_indefinite_articles
+* strip_definite_articles
+* strip_articles
+* strip_prepositions
+* strip_pronouns
+* strip_stopwords
+* strip_sparse_terms
+* strip_frequent_terms
+* strip_html_tags
 
-##############################################################################
-#
-# Remove parts from document based on flags or regular expressions
-#
-##############################################################################
+# Example
+```julia-repl
+julia> doc = StringDocument("This is a document of mine")
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: This is a document of mine
+
+julia> prepare!(doc, strip_pronouns | strip_articles)
+
+julia> text(doc)
+"This is document of "
+```
+"""
 function prepare!(crps::Corpus, flags::UInt32; skip_patterns = Set{AbstractString}(), skip_words = Set{AbstractString}())
     ((flags & strip_sparse_terms) > 0) && union!(skip_words, sparse_terms(crps))
     ((flags & strip_frequent_terms) > 0) && union!(skip_words, frequent_terms(crps))
@@ -302,41 +489,49 @@ end
 
 """
-    remove_whitespace(s::AbstractString)
+    remove_whitespace(str)
 
-Squashes multiple whitespaces to a single one. And removes all leading and
-trailing whitespaces in a string.
+Squash multiple whitespaces into a single one and remove all leading and
+trailing whitespace.
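+
+# Example
+
+A small sketch of the behavior (it follows directly from the one-line
+definition below):
+
+```julia-repl
+julia> remove_whitespace("  this  is   a  text  ")
+"this is a text"
+```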
+
+See also: [`remove_whitespace!`](@ref)
 """
-remove_whitespace(s::AbstractString) = replace(strip(s), r"\s+"=>" ")
+remove_whitespace(str::AbstractString) = replace(strip(str), r"\s+"=>" ")
 
 """
-    remove_whitespace!(s::AbstractDocument)
+    remove_whitespace!(doc)
+    remove_whitespace!(crps)
 
-Squashes multiple whitespaces to a single space. And removes all leading and
-trailing whitespaces in a StringDocument and Corpus.
+Squash multiple whitespaces to a single space and remove all leading and
+trailing whitespace in `doc` or in the documents of `crps`.
 
-Does no-op for NGramDocument and TokenDocument.
+Is a no-op for `FileDocument`, `TokenDocument` or `NGramDocument`.
 
+See also: [`remove_whitespace`](@ref)
 """
 function remove_whitespace!(d::StringDocument)
-  d.text = remove_whitespace(d.text)
+    d.text = remove_whitespace(d.text)
 end
 
 function remove_whitespace!(crps::Corpus)
-  for doc in crps
-    remove_whitespace!(doc)
-  end
+    for doc in crps
+        remove_whitespace!(doc)
+    end
 end
 
 function remove_whitespace!(d::AbstractDocument)
-  nothing
+    nothing
 end
 
+"""
+    remove_patterns(str, rex::Regex)
 
-function remove_patterns(s::AbstractString, rex::Regex)
-    return replace(s, rex => "")
+Remove the parts of `str` matched by `rex`.
+
+See also: [`remove_patterns!`](@ref)
+"""
+function remove_patterns(s::AbstractString, rex::Regex)
+    return replace(s, rex => "")
 end
 
 function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
@@ -360,7 +555,16 @@ function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
     String(take!(iob))
 end
 
+"""
+    remove_patterns!(doc, rex::Regex)
+    remove_patterns!(crps, rex::Regex)
+
+Remove the patterns matched by `rex` from the document or from the Corpus.
+
+Does not modify `FileDocument` or a Corpus containing a `FileDocument`.
+
+See also: [`remove_patterns`](@ref)
+"""
 remove_patterns!(d::FileDocument, rex::Regex) = error("FileDocument cannot be modified")
 
 function remove_patterns!(d::StringDocument, rex::Regex)
diff --git a/src/sentiment.jl b/src/sentiment.jl
index 91ec5804..255b814e 100644
--- a/src/sentiment.jl
+++ b/src/sentiment.jl
@@ -48,12 +48,12 @@ function get_sentiment(handle_unknown, ip::Array{T, 1}, weight, rwi) where T <:
         if ele in keys(rwi) && rwi[ele] <= size(weight[:embedding_1]["embedding_1"]["embeddings:0"])[2] # there are only 5000 unique embeddings
             push!(res, rwi[ele])
         else
-           for words in handle_unknown(ele)
+            for words in handle_unknown(ele)
                 if words in keys(rwi) && rwi[words] <= size(weight[:embedding_1]["embedding_1"]["embeddings:0"])[2]
                     push!(res, rwi[words])
                 end
-           end
-
+            end
+
         end
     end
     return model(pad_sequences(res))[1]
@@ -67,7 +67,7 @@ struct SentimentModel
         # Only load Flux once it is actually needed
         global Flux
         Flux = Base.require(TextAnalysis, :Flux)
-
+
         new(read_weights(), read_word_ids())
     end
 end
@@ -89,15 +89,15 @@ end
 
 """
-    ```
-    model = SentimentAnalyzer(doc)
-    model = SentimentAnalyzer(doc, handle_unknown)
-    ```
-    Return sentiment of the input doc in range 0 to 1, 0 being least sentiment score and 1 being
-    the highest:
-    - doc = Input Document for calculating document (AbstractDocument type)
-    - handle_unknown = A function for handling unknown words. Should return an array (default x->tuple())
-    """
+    model = SentimentAnalyzer(doc)
+    model = SentimentAnalyzer(doc, handle_unknown)
+
+Predict the sentiment of the input `doc` on a scale from 0 to 1, where 0 is the lowest sentiment score and 1 is the highest.
+
+# Arguments
+- `doc` = the input document whose sentiment is to be predicted (an `AbstractDocument`)
+- `handle_unknown` = a function for handling unknown words; should return an array (default `x -> tuple()`)
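+
+# Example
+
+A minimal usage sketch (assuming the zero-argument constructor; the exact
+score depends on the bundled model weights, so no printed value is shown):
+
+```julia
+m = SentimentAnalyzer()
+d = StringDocument("a very nice thing that everyone likes")
+m(d)  # a Float in the range 0 to 1; higher means more positive sentiment
+```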
+"""
 function(m::SentimentAnalyzer)(d::AbstractDocument, handle_unknown = x->tuple())
 
     m.model(handle_unknown, tokens(d))
diff --git a/src/stemmer.jl b/src/stemmer.jl
index 72215391..89344527 100644
--- a/src/stemmer.jl
+++ b/src/stemmer.jl
@@ -8,8 +8,11 @@ const ISO_8859_1 = "ISO_8859_1"
 const CP850 = "CP850"
 const KOI8_R = "KOI8_R"
 
-##
-# lists the stemmer algorithms loaded
+"""
+    stemmer_types()
+
+List all the stemmer algorithms loaded.
+"""
 function stemmer_types()
     cptr = ccall((:sb_stemmer_list, libstemmer), Ptr{Ptr{UInt8}}, ())
     (C_NULL == cptr) && error("error getting stemmer types")
@@ -50,7 +53,7 @@ mutable struct Stemmer
     end
 end
 
-show(io::IO, stm::Stemmer) = println(io, "Stemmer algorithm:$(stm.alg) encoding:$(stm.enc)")
+Base.show(io::IO, stm::Stemmer) = println(io, "Stemmer algorithm:$(stm.alg) encoding:$(stm.enc)")
 
 function release(stm::Stemmer)
     (C_NULL == stm.cptr) && return
@@ -59,6 +62,14 @@ function release(stm::Stemmer)
     nothing
 end
 
+"""
+    stem(stemmer::Stemmer, str)
+    stem(stemmer::Stemmer, words::Array)
+
+Stem the input with the stemming algorithm of `stemmer`.
+
+See also: [`stem!`](@ref)
+"""
 function stem(stemmer::Stemmer, bstr::AbstractString)
     sres = ccall((:sb_stemmer_stem, libstemmer),
                 Ptr{UInt8},
@@ -83,13 +94,26 @@ function stem(stemmer::Stemmer, words::Array)
     for idx in 1:l
         ret[idx] = stem(stemmer, words[idx])
     end
-    ret
+    return ret
 end
 
+"""
+    stemmer_for_document(doc)
+
+Search for an appropriate stemmer based on the language of the document.
+"""
 function stemmer_for_document(d::AbstractDocument)
     Stemmer(lowercase(Languages.english_name(language(d))))
 end
 
+"""
+    stem!(doc)
+    stem!(crps)
+
+Stem the document, or the documents in `crps`, with a suitable stemmer.
+
+Stemming cannot be done for a `FileDocument` or a Corpus made of such documents.
+"""
 function stem!(d::AbstractDocument)
     stemmer = stemmer_for_document(d)
     stem!(stemmer, d)
diff --git a/src/tf_idf.jl b/src/tf_idf.jl
index a7f142c3..21babcf4 100644
--- a/src/tf_idf.jl
+++ b/src/tf_idf.jl
@@ -1,21 +1,12 @@
-##############################################################################
-#
-# TF
-#
-##############################################################################
+"""
+    tf!(dtm::AbstractMatrix{Real}, tf::AbstractMatrix{AbstractFloat})
 
-tf(dtm::Matrix{T}) where {T <: Real} = tf!(dtm, Array{Float64}(undef, size(dtm)...))
-
-tf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, similar(dtm, Float64))
-
-tf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf!(dtm, dtm)
-
-tf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, dtm)
+Overwrite `tf` with the term frequency of `dtm`.
 
-tf(dtm::DocumentTermMatrix) = tf(dtm.dtm)
+Works correctly if `dtm` and `tf` are the same matrix.
 
-# The second Matrix will be overwritten with the result
-# Will work correctly if dtm and tfidf are the same matrix
+See also: [`tf`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref)
+"""
 function tf!(dtm::AbstractMatrix{T1}, tf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat}
     n, p = size(dtm)
 
@@ -31,7 +22,15 @@ function tf!(dtm::AbstractMatrix{T1}, tf::AbstractMatrix{T2}) where {T1 <: Real,
     return tf
 end
 
-# assumes second matrix has same nonzeros as first one
+"""
+    tf!(dtm::SparseMatrixCSC{Real}, tf::SparseMatrixCSC{AbstractFloat})
+
+Overwrite `tf` with the term frequency of `dtm`.
+
+`tf` should have the same nonzeros as `dtm`.
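+
+# Example
+
+A small in-place sketch (the counts are stored as `Float64` so the result can
+overwrite the input):
+
+```julia-repl
+julia> using SparseArrays
+
+julia> dtm = sparse([1.0 1.0; 2.0 0.0]);
+
+julia> tf!(dtm, dtm)
+2×2 SparseArrays.SparseMatrixCSC{Float64,Int64} with 3 stored entries:
+  [1, 1]  =  0.5
+  [2, 1]  =  1.0
+  [1, 2]  =  0.5
+```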
+
+See also: [`tf`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref)
+"""
 function tf!(dtm::SparseMatrixCSC{T}, tf::SparseMatrixCSC{F}) where {T <: Real, F <: AbstractFloat}
     rows = rowvals(dtm)
     dtmvals = nonzeros(dtm)
     tfvals = nonzeros(tf)
@@ -48,31 +47,61 @@ function tf!(dtm::SparseMatrixCSC{T}, tf::SparseMatrixCSC{F}) where {T <: Real,
             tfvals[j] = dtmvals[j] / max(words_in_documents[row], one(T))
         end
     end
-    tf
+    return tf
 end
 
-##############################################################################
-#
-# TF-IDF
-#
-##############################################################################
+tf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf!(dtm, dtm)
 
-tf_idf(dtm::Matrix{T}) where {T <: Real} = tf_idf!(dtm, Array{Float64}(undef, size(dtm)...))
+tf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, dtm)
 
-tf_idf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, similar(dtm, Float64))
+"""
+    tf(dtm::DocumentTermMatrix)
+    tf(dtm::SparseMatrixCSC{Real})
+    tf(dtm::Matrix{Real})
 
-tf_idf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf_idf!(dtm, dtm)
+Compute the term frequency of the input.
 
-tf_idf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, dtm)
+# Example
 
-tf_idf(dtm::DocumentTermMatrix) = tf_idf(dtm.dtm)
+```julia-repl
+julia> crps = Corpus([StringDocument("To be or not to be"),
+                      StringDocument("To become or not to become")])
 
-# This does not make sense, since DocumentTermMatrix is based on an array of integers
-#tf_idf!(dtm::DocumentTermMatrix) = tf_idf!(dtm.dtm)
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+
+julia> tf(m)
+2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
+  [1, 1]  =  0.166667
+  [2, 1]  =  0.166667
+  [1, 2]  =  0.333333
+  [2, 3]  =  0.333333
+  [1, 4]  =  0.166667
+  [2, 4]  =  0.166667
+  [1, 5]  =  0.166667
+  [2, 5]  =  0.166667
+  [1, 6]  =  0.166667
+  [2, 6]  =  0.166667
+```
+
+See also: [`tf!`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref)
+"""
+tf(dtm::DocumentTermMatrix) = tf(dtm.dtm)
+
+tf(dtm::Matrix{T}) where {T <: Real} = tf!(dtm, Array{Float64}(undef, size(dtm)...))
+
+tf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf!(dtm, similar(dtm, Float64))
+
+"""
+    tf_idf!(dtm::AbstractMatrix{Real}, tf_idf::AbstractMatrix{AbstractFloat})
+
+Overwrite `tf_idf` with the tf-idf (Term Frequency - Inverse Document Frequency) of `dtm`.
+
+`dtm` and `tf_idf` must be matrices of the same dimensions.
+
+See also: [`tf`](@ref), [`tf!`](@ref), [`tf_idf`](@ref)
+"""
 function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <: Real, T2 <: AbstractFloat}
     n, p = size(dtm)
 
@@ -93,7 +122,15 @@ function tf_idf!(dtm::AbstractMatrix{T1}, tfidf::AbstractMatrix{T2}) where {T1 <
     return tfidf
 end
 
-# sparse version
+"""
+    tf_idf!(dtm::SparseMatrixCSC{Real}, tfidf::SparseMatrixCSC{AbstractFloat})
+
+Overwrite `tfidf` with the tf-idf (Term Frequency - Inverse Document Frequency) of `dtm`.
+
+The arguments must have the same number of nonzeros.
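+
+# Example
+
+A small in-place sketch, reusing the matrix from the `tf!` example above: the
+term present in both documents gets inverse document frequency `log(2/2) = 0`,
+while the term unique to document 1 gets `0.5 * log(2) ≈ 0.346574`:
+
+```julia-repl
+julia> using SparseArrays
+
+julia> dtm = sparse([1.0 1.0; 2.0 0.0]);
+
+julia> tf_idf!(dtm, dtm)
+2×2 SparseArrays.SparseMatrixCSC{Float64,Int64} with 3 stored entries:
+  [1, 1]  =  0.0
+  [2, 1]  =  0.0
+  [1, 2]  =  0.346574
+```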
+
+See also: [`tf`](@ref), [`tf_idf`](@ref), [`tf_idf!`](@ref)
+"""
 function tf_idf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) where {T <: Real, F <: AbstractFloat}
     rows = rowvals(dtm)
     dtmvals = nonzeros(dtm)
@@ -117,5 +154,63 @@ function tf_idf!(dtm::SparseMatrixCSC{T}, tfidf::SparseMatrixCSC{F}) where {T <:
         end
     end
 
-    tfidf
+    return tfidf
 end
+
+"""
+    tf_idf!(dtm)
+
+Compute tf-idf for `dtm`, overwriting it in place.
+"""
+tf_idf!(dtm::AbstractMatrix{T}) where {T <: Real} = tf_idf!(dtm, dtm)
+
+tf_idf!(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, dtm)
+
+# This does not make sense, since DocumentTermMatrix is based on an array of integers
+#tf_idf!(dtm::DocumentTermMatrix) = tf_idf!(dtm.dtm)
+
+"""
+    tf_idf(dtm::DocumentTermMatrix)
+    tf_idf(dtm::SparseMatrixCSC{Real})
+    tf_idf(dtm::Matrix{Real})
+
+Compute the tf-idf value (Term Frequency - Inverse Document Frequency) of the input.
+
+In many cases, raw word counts are not appropriate for use because:
+
+- Some documents are longer than other documents
+- Some words are more frequent than other words
+
+A simple workaround is to perform TF-IDF on a `DocumentTermMatrix`.
+
+# Example
+
+```julia-repl
+julia> crps = Corpus([StringDocument("To be or not to be"),
+                      StringDocument("To become or not to become")])
+
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+
+julia> tf_idf(m)
+2×6 SparseArrays.SparseMatrixCSC{Float64,Int64} with 10 stored entries:
+  [1, 1]  =  0.0
+  [2, 1]  =  0.0
+  [1, 2]  =  0.231049
+  [2, 3]  =  0.231049
+  [1, 4]  =  0.0
+  [2, 4]  =  0.0
+  [1, 5]  =  0.0
+  [2, 5]  =  0.0
+  [1, 6]  =  0.0
+  [2, 6]  =  0.0
+```
+
+See also: [`tf`](@ref), [`tf!`](@ref), [`tf_idf!`](@ref)
+"""
+tf_idf(dtm::DocumentTermMatrix) = tf_idf(dtm.dtm)
+
+tf_idf(dtm::SparseMatrixCSC{T}) where {T <: Real} = tf_idf!(dtm, similar(dtm, Float64))
+
+tf_idf(dtm::Matrix{T}) where {T <: Real} = tf_idf!(dtm, Array{Float64}(undef, size(dtm)...))
diff --git a/src/tokenizer.jl b/src/tokenizer.jl
index 044011f2..78ef1c9f 100644
--- a/src/tokenizer.jl
+++ b/src/tokenizer.jl
@@ -1,9 +1,37 @@
-##############################################################################
-#
-# Split string into tokens on whitespace
-#
-##############################################################################
+"""
+    tokenize(language, str)
+
+Split `str` into words and other tokens such as punctuation.
+
+# Example
+
+```julia-repl
+julia> tokenize(Languages.English(), "Too foo words!")
+4-element Array{String,1}:
+ "Too"
+ "foo"
+ "words"
+ "!"
+```
+
+See also: [`sentence_tokenize`](@ref)
+"""
 tokenize(lang::S, s::T) where {S <: Language, T <: AbstractString} = WordTokenizers.tokenize(s)
 
+
+"""
+    sentence_tokenize(language, str)
+
+Split `str` into sentences.
+
+# Example
+
+```julia-repl
+julia> sentence_tokenize(Languages.English(), "Here are few words! I am Foo Bar.")
+2-element Array{SubString{String},1}:
+ "Here are few words!"
+ "I am Foo Bar."
+```
+
+See also: [`tokenize`](@ref)
+"""
 sentence_tokenize(lang::S, s::T) where {S <: Language, T<:AbstractString} = WordTokenizers.split_sentences(s)