Skip to content

Commit

Permalink
Merge pull request #157 from tanmaykm/tan/sindex
Browse files Browse the repository at this point in the history
allow multiple ngram complexities in NGramDocument, ngrams and ngramize
  • Loading branch information
aviks authored Jun 9, 2019
2 parents 0be971b + 5545cde commit b7fc9c4
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 15 deletions.
18 changes: 18 additions & 0 deletions docs/src/documents.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,24 @@ Dict{AbstractString,Int64} with 13 entries:
"be.. ." => 1
```

The `ngrams()` function also accepts several n-gram sizes at once; the result combines the counts for every requested size (here bigrams and trigrams):

```julia
julia> ngrams(sd, 2, 3)
Dict{AbstractString,Int64} with 11 entries:
"or not to" => 1
"be or" => 1
"not to" => 1
"be or not" => 1
"not to be.." => 1
"To be" => 1
"or not" => 1
"to be.. ." => 1
"to be.." => 1
"be.. ." => 1
"To be or" => 1
```

If you have an `NGramDocument`, you can determine whether it contains
unigrams, bigrams or a higher-order representation using the `ngram_complexity()` function:

Expand Down
17 changes: 8 additions & 9 deletions src/document.jl
Original file line number Diff line number Diff line change
Expand Up @@ -81,18 +81,17 @@ TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata

# Document stored as a dictionary of n-grams (`ngrams`) together with the
# n-gram size(s) it was built with (`n`) and its metadata.
# This diff hunk shows both sides of the change to `n`: the old `n::Int` line is
# immediately followed by the `Union{Int,Vector{Int}}` line that replaces it.
mutable struct NGramDocument{T<:AbstractString} <: AbstractDocument
# n-gram text => Int count; the text-based constructors fill it from `ngramize`
ngrams::Dict{T,Int}
# before the change: exactly one n-gram size
n::Int
# after the change: one size as an Int, or several sizes as a Vector{Int}
n::Union{Int,Vector{Int}}
metadata::DocumentMetadata
end
# Build an NGramDocument from raw text and explicit metadata: the text is
# tokenized and then n-gramized, both using `dm.language`.
# Diff hunk: the single-size signature and its two-line body come first, then
# the varargs replacement; both share the closing `end`.
function NGramDocument(txt::AbstractString, dm::DocumentMetadata, n::Integer=1)
NGramDocument(ngramize(dm.language, tokenize(dm.language, String(txt)), n),
n, dm)
# Varargs form: every size is forwarded to `ngramize`; a lone size is stored as
# an `Int`, any other number of sizes as an `Int[...]` vector.
function NGramDocument(txt::AbstractString, dm::DocumentMetadata, n::Integer...=1)
NGramDocument(ngramize(dm.language, tokenize(dm.language, String(txt)), n...), (length(n) == 1) ? Int(first(n)) : Int[n...], dm)
end
# Convenience constructor: forwards to the (text, metadata, size...) constructor
# with a fresh `DocumentMetadata()`.
# Diff hunk: single-size signature/body first, then the varargs replacement.
function NGramDocument(txt::AbstractString, n::Integer=1)
NGramDocument(txt, DocumentMetadata(), n)
function NGramDocument(txt::AbstractString, n::Integer...=1)
NGramDocument(txt, DocumentMetadata(), n...)
end
# Wrap an existing n-gram => count dictionary. The entries are merged into a new
# `Dict{AbstractString,Int}` and a default `DocumentMetadata()` is attached.
# Diff hunk: single-size signature/body first, then the varargs replacement.
function NGramDocument(ng::Dict{T, Int}, n::Integer=1) where T <: AbstractString
NGramDocument(merge(Dict{AbstractString,Int}(), ng), n, DocumentMetadata())
# Varargs form: a lone size is stored as an `Int`, any other number of sizes as
# an `Int[...]` vector. Nothing here checks `n` against the contents of `ng`.
function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString
NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata())
end

##############################################################################
Expand Down Expand Up @@ -146,7 +145,7 @@ end
"""
    ngrams(d::NGramDocument, n::Integer, more::Integer...)

Reject any request for n-grams of explicitly given size(s) from an
`NGramDocument`.

At least one size is required, and any number of further sizes is accepted, so
multi-size calls such as `ngrams(d, 2, 3)` are rejected here as well instead of
falling through to the generic `ngrams(d::AbstractDocument, n::Integer...)`
method.

# Throws
- `ErrorException`, on every call that reaches this method.
"""
function ngrams(d::NGramDocument, n::Integer, more::Integer...)
    error("The n-gram complexity of an NGramDocument cannot be increased")
end
# Generic n-gram extraction: tokenize the document and hand the tokens to
# `ngramize`. Diff hunk: the single-size method on the next line is replaced by
# the varargs method that follows it.
ngrams(d::AbstractDocument, n::Integer) = ngramize(language(d), tokens(d), n)
# Any number of sizes is forwarded unchanged to `ngramize`.
ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n...)
# An NGramDocument returns its stored dictionary as-is.
ngrams(d::NGramDocument) = d.ngrams
# With no size given, other documents default to size 1.
ngrams(d::AbstractDocument) = ngrams(d, 1)

Expand Down
12 changes: 7 additions & 5 deletions src/ngramizer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,18 @@ Dict{AbstractString,Int64} with 3 entries:
"To be or" => 1
```
"""
# Diff hunk: the old single-size signature and its size-1 shortcut come first ...
function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString}
(n == 1) && return onegramize(lang, words)
# ... followed by the varargs signature that replaces them; `nlist` holds every
# requested n-gram size.
function ngramize(lang::S, words::Vector{T}, nlist::Integer...) where {S <: Language, T <: AbstractString}
# `onegramize` is used only when size 1 is the sole requested size.
(length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words)

n_words = length(words)

# n-gram text => occurrence count, shared by all requested sizes
tokens = Dict{AbstractString, Int}()

# Old loop (one window size `n`); its three lines are replaced by the nested
# loop below, and the closing `end`s are shared.
for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
tokens[token] = get(tokens, token, 0) + 1
# New loop: for each requested size `n`, slide a window of `n` consecutive words
# over `words`, join them with single spaces and count each occurrence.
# NOTE(review): sizes are not validated; a size below 1 looks like it would
# count empty-string tokens instead of raising -- confirm and consider rejecting.
for n in nlist
for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
tokens[token] = get(tokens, token, 0) + 1
end
end
return tokens
end
Expand Down
2 changes: 1 addition & 1 deletion src/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ function Base.summary(d::AbstractDocument)
o *= " * Author: $(author(d))\n"
o *= " * Timestamp: $(timestamp(d))\n"

if typeof(d) ∈ [TokenDocument, NGramDocument]
if typeof(d) <: Union{TokenDocument, NGramDocument}
o *= " * Snippet: ***SAMPLE TEXT NOT AVAILABLE***"
else
sample_text = replace(text(d)[1:min(50, length(text(d)))], r"\s+" => " ")
Expand Down
9 changes: 9 additions & 0 deletions test/document.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,13 @@
@test isa(d, NGramDocument)

@test isequal(length(Document("this is text")), 12)

# NGramDocument creation with multiple ngram complexity.
# Each case is (size arguments, expected ngram_complexity, expected number of
# distinct n-grams).
let cases = (
        ((), 1, 4),
        ((2,), 2, 3),
        ((Int32(2),), 2, 3),
        ((1, 2), [1, 2], 7),
        ((Int32(1), Int16(2)), [1, 2], 7),
    )
    for (sizes, complexity, n_distinct) in cases
        doc = NGramDocument(sample_text1, sizes...)
        @test ngram_complexity(doc) == complexity
        @test length(doc.ngrams) == n_distinct
    end
end
end
10 changes: 10 additions & 0 deletions test/ngramizer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,14 @@
"sample text" => 1,
"is some" => 1,
"some sample" => 1))
# Requesting sizes 1 and 2 together must yield every unigram and every bigram,
# each counted once.
ngs = TextAnalysis.ngramize(Languages.English(), tkns, 1, 2)
@test isequal(ngs, Dict{String,Int}(gram => 1 for gram in (
    "this", "is", "some", "sample", "text",
    "this is", "is some", "some sample", "sample text",
)))
end

0 comments on commit b7fc9c4

Please sign in to comment.