diff --git a/docs/src/documents.md b/docs/src/documents.md
index e6c771d4..0bb806b4 100644
--- a/docs/src/documents.md
+++ b/docs/src/documents.md
@@ -226,6 +226,24 @@ Dict{AbstractString,Int64} with 13 entries:
   "be.. ." => 1
 ```
 
+The `ngrams()` function can also be called with multiple arguments:
+
+```julia
+julia> ngrams(sd, 2, 3)
+Dict{AbstractString,Int64} with 11 entries:
+  "or not to" => 1
+  "be or" => 1
+  "not to" => 1
+  "be or not" => 1
+  "not to be.." => 1
+  "To be" => 1
+  "or not" => 1
+  "to be.. ." => 1
+  "to be.." => 1
+  "be.. ." => 1
+  "To be or" => 1
+```
+
 If you have a `NGramDocument`, you can determine whether an
 `NGramDocument` contains unigrams, bigrams or a higher-order
 representation using the `ngram_complexity()` function:
diff --git a/src/document.jl b/src/document.jl
index 2657b68b..1587a113 100644
--- a/src/document.jl
+++ b/src/document.jl
@@ -81,18 +81,17 @@ TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata
 
 mutable struct NGramDocument{T<:AbstractString} <: AbstractDocument
     ngrams::Dict{T,Int}
-    n::Int
+    n::Union{Int,Vector{Int}}
     metadata::DocumentMetadata
 end
-function NGramDocument(txt::AbstractString, dm::DocumentMetadata, n::Integer=1)
-    NGramDocument(ngramize(dm.language, tokenize(dm.language, String(txt)), n),
-                  n, dm)
+function NGramDocument(txt::AbstractString, dm::DocumentMetadata, n::Integer...=1)
+    NGramDocument(ngramize(dm.language, tokenize(dm.language, String(txt)), n...), (length(n) == 1) ? Int(first(n)) : Int[n...], dm)
 end
-function NGramDocument(txt::AbstractString, n::Integer=1)
-    NGramDocument(txt, DocumentMetadata(), n)
+function NGramDocument(txt::AbstractString, n::Integer...=1)
+    NGramDocument(txt, DocumentMetadata(), n...)
 end
-function NGramDocument(ng::Dict{T, Int}, n::Integer=1) where T <: AbstractString
-    NGramDocument(merge(Dict{AbstractString,Int}(), ng), n, DocumentMetadata())
+function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString
+    NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata())
 end
 
 ##############################################################################
@@ -146,7 +145,7 @@ end
 function ngrams(d::NGramDocument, n::Integer)
     error("The n-gram complexity of an NGramDocument cannot be increased")
 end
-ngrams(d::AbstractDocument, n::Integer) = ngramize(language(d), tokens(d), n)
+ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n...)
 ngrams(d::NGramDocument) = d.ngrams
 ngrams(d::AbstractDocument) = ngrams(d, 1)
 
diff --git a/src/ngramizer.jl b/src/ngramizer.jl
index 6d8e0fa9..9cab703d 100644
--- a/src/ngramizer.jl
+++ b/src/ngramizer.jl
@@ -13,16 +13,18 @@ Dict{AbstractString,Int64} with 3 entries:
   "To be or" => 1
 ```
 """
-function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString}
-    (n == 1) && return onegramize(lang, words)
+function ngramize(lang::S, words::Vector{T}, nlist::Integer...) where {S <: Language, T <: AbstractString}
+    (length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words)
 
     n_words = length(words)
     tokens = Dict{AbstractString, Int}()
 
-    for index in 1:(n_words - n + 1)
-        token = join(words[index:(index + n - 1)], " ")
-        tokens[token] = get(tokens, token, 0) + 1
+    for n in nlist
+        for index in 1:(n_words - n + 1)
+            token = join(words[index:(index + n - 1)], " ")
+            tokens[token] = get(tokens, token, 0) + 1
+        end
     end
 
     return tokens
 end
diff --git a/src/show.jl b/src/show.jl
index 857d65da..ddc7818e 100644
--- a/src/show.jl
+++ b/src/show.jl
@@ -10,7 +10,7 @@ function Base.summary(d::AbstractDocument)
     o *= " * Author: $(author(d))\n"
     o *= " * Timestamp: $(timestamp(d))\n"
 
-    if typeof(d) ∈ [TokenDocument, NGramDocument]
+    if typeof(d) <: Union{TokenDocument, NGramDocument}
         o *= " * Snippet: ***SAMPLE TEXT NOT AVAILABLE***"
     else
         sample_text = replace(text(d)[1:min(50, length(text(d)))], r"\s+" => " ")
diff --git a/test/document.jl b/test/document.jl
index 592306ef..17510044 100644
--- a/test/document.jl
+++ b/test/document.jl
@@ -65,4 +65,13 @@
     @test isa(d, NGramDocument)
 
     @test isequal(length(Document("this is text")), 12)
+
+    # NGramDocument creation with multiple ngram complexity
+    let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7)
+        for (n,c,l) in zip(N,C,L)
+            ngd = NGramDocument(sample_text1, n...)
+            @test ngram_complexity(ngd) == c
+            @test length(ngd.ngrams) == l
+        end
+    end
 end
diff --git a/test/ngramizer.jl b/test/ngramizer.jl
index d37fe0c5..620eda0d 100644
--- a/test/ngramizer.jl
+++ b/test/ngramizer.jl
@@ -14,4 +14,14 @@
                                        "sample text" => 1,
                                        "is some" => 1,
                                        "some sample" => 1))
+    ngs = TextAnalysis.ngramize(Languages.English(), tkns, 1, 2)
+    @test isequal(ngs, Dict{String,Int}("this is" => 1,
+                                        "is some" => 1,
+                                        "some sample" => 1,
+                                        "sample text" => 1,
+                                        "this" => 1,
+                                        "is" => 1,
+                                        "some" => 1,
+                                        "sample" => 1,
+                                        "text" => 1))
 end
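
For reference, a minimal REPL-style sketch of what this patch enables, assuming only the `TextAnalysis` package; the sample string and variable names below are illustrative, not taken from the patched files:

```julia
using TextAnalysis

# ngrams() on a plain document can now be asked for several orders in one call,
# returning a single Dict that mixes, e.g., bigrams and trigrams.
sd = StringDocument("To be or not to be")
ngrams(sd, 2, 3)

# An NGramDocument built with two complexities stores unigrams and bigrams
# together; per the new tests, ngram_complexity() then reports the vector [1, 2].
ngd = NGramDocument("To be or not to be", 1, 2)
ngram_complexity(ngd)
```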