Merge pull request #157 from tanmaykm/tan/sindex

Allow multiple n-gram complexities in NGramDocument, ngrams and ngramize
JuliaText · Jun 9, 2019 · b7fc9c4 · b7fc9c4
2 parents 0be971b + 5545cde
commit b7fc9c4
Show file tree

Hide file tree

Showing 6 changed files with 53 additions and 15 deletions.
diff --git a/docs/src/documents.md b/docs/src/documents.md
@@ -226,6 +226,24 @@ Dict{AbstractString,Int64} with 13 entries:
   "be.. ."  => 1
 ```
 
+The `ngrams()` function can also be called with several n-gram sizes at once; the n-grams of every requested size are returned together in a single dictionary:
+
+```julia
+julia> ngrams(sd, 2, 3)
+Dict{AbstractString,Int64} with 11 entries:
+  "or not to"   => 1
+  "be or"       => 1
+  "not to"      => 1
+  "be or not"   => 1
+  "not to be.." => 1
+  "To be"       => 1
+  "or not"      => 1
+  "to be.. ."   => 1
+  "to be.."     => 1
+  "be.. ."      => 1
+  "To be or"    => 1
+```
+
 If you have a `NGramDocument`, you can determine whether an `NGramDocument`
 contains unigrams, bigrams or a higher-order representation using the `ngram_complexity()` function:
 

diff --git a/src/document.jl b/src/document.jl
@@ -81,18 +81,17 @@ TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata
 
 mutable struct NGramDocument{T<:AbstractString} <: AbstractDocument
     ngrams::Dict{T,Int}
-    n::Int
+    n::Union{Int,Vector{Int}}
     metadata::DocumentMetadata
 end
-function NGramDocument(txt::AbstractString, dm::DocumentMetadata, n::Integer=1)
-    NGramDocument(ngramize(dm.language, tokenize(dm.language, String(txt)), n),
-        n, dm)
+function NGramDocument(txt::AbstractString, dm::DocumentMetadata, n::Integer...=1)
+    NGramDocument(ngramize(dm.language, tokenize(dm.language, String(txt)), n...), (length(n) == 1) ? Int(first(n)) : Int[n...], dm)
 end
-function NGramDocument(txt::AbstractString, n::Integer=1)
-    NGramDocument(txt, DocumentMetadata(), n)
+function NGramDocument(txt::AbstractString, n::Integer...=1)
+    NGramDocument(txt, DocumentMetadata(), n...)
 end
-function NGramDocument(ng::Dict{T, Int}, n::Integer=1) where T <: AbstractString
-    NGramDocument(merge(Dict{AbstractString,Int}(), ng), n, DocumentMetadata())
+function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString
+    NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata())
 end
 
 ##############################################################################
@@ -146,7 +145,7 @@ end
 function ngrams(d::NGramDocument, n::Integer)
     error("The n-gram complexity of an NGramDocument cannot be increased")
 end
-ngrams(d::AbstractDocument, n::Integer) = ngramize(language(d), tokens(d), n)
+ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n...)
 ngrams(d::NGramDocument) = d.ngrams
 ngrams(d::AbstractDocument) = ngrams(d, 1)
 

diff --git a/src/ngramizer.jl b/src/ngramizer.jl
@@ -13,16 +13,18 @@ Dict{AbstractString,Int64} with 3 entries:
   "To be or"  => 1
 ```
 """
-function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString}
-    (n == 1) && return onegramize(lang, words)
+function ngramize(lang::S, words::Vector{T}, nlist::Integer...) where {S <: Language, T <: AbstractString}
+    (length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words)
 
     n_words = length(words)
 
     tokens = Dict{AbstractString, Int}()
 
-    for index in 1:(n_words - n + 1)
-        token = join(words[index:(index + n - 1)], " ")
-        tokens[token] = get(tokens, token, 0) + 1
+    for n in nlist
+        for index in 1:(n_words - n + 1)
+            token = join(words[index:(index + n - 1)], " ")
+            tokens[token] = get(tokens, token, 0) + 1
+        end
     end
     return tokens
 end

diff --git a/src/show.jl b/src/show.jl
@@ -10,7 +10,7 @@ function Base.summary(d::AbstractDocument)
     o *= " * Author: $(author(d))\n"
     o *= " * Timestamp: $(timestamp(d))\n"
 
-    if typeof(d) ∈ [TokenDocument, NGramDocument]
+    if typeof(d) <: Union{TokenDocument, NGramDocument}
         o *= " * Snippet: ***SAMPLE TEXT NOT AVAILABLE***"
     else
         sample_text = replace(text(d)[1:min(50, length(text(d)))], r"\s+" => " ")

diff --git a/test/document.jl b/test/document.jl
@@ -65,4 +65,13 @@
     @test isa(d, NGramDocument)
 
     @test isequal(length(Document("this is text")), 12)
+
+    # NGramDocument creation with zero, one, or several n-gram complexities
+    let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7)
+        for (n,c,l) in zip(N,C,L)
+            ngd = NGramDocument(sample_text1, n...)
+            @test ngram_complexity(ngd) == c
+            @test length(ngd.ngrams) == l
+        end
+    end
 end
diff --git a/test/ngramizer.jl b/test/ngramizer.jl
@@ -14,4 +14,14 @@
                                              "sample text" => 1,
                                              "is some" => 1,
                                               "some sample" => 1))
+    ngs = TextAnalysis.ngramize(Languages.English(), tkns, 1, 2)
+    @test isequal(ngs, Dict{String,Int}("this is" => 1,
+                                             "is some" => 1,
+                                             "some sample" => 1,
+                                             "sample text" => 1,
+                                             "this" => 1,
+                                             "is" => 1,
+                                             "some" => 1,
+                                             "sample" => 1,
+                                             "text" => 1))
 end