Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow multiple ngram complexity in NGramDocument, ngrams and ngramize #157

Merged
merged 1 commit into from
Jun 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions docs/src/documents.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,24 @@ Dict{AbstractString,Int64} with 13 entries:
"be.. ." => 1
```

The `ngrams()` function can also be called with multiple arguments:

```julia
julia> ngrams(sd, 2, 3)
Dict{AbstractString,Int64} with 11 entries:
"or not to" => 1
"be or" => 1
"not to" => 1
"be or not" => 1
"not to be.." => 1
"To be" => 1
"or not" => 1
"to be.. ." => 1
"to be.." => 1
"be.. ." => 1
"To be or" => 1
```

If you have an `NGramDocument`, you can determine whether an `NGramDocument`
contains unigrams, bigrams or a higher-order representation using the `ngram_complexity()` function:

Expand Down
17 changes: 8 additions & 9 deletions src/document.jl
Original file line number Diff line number Diff line change
Expand Up @@ -81,18 +81,17 @@ TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata

mutable struct NGramDocument{T<:AbstractString} <: AbstractDocument
ngrams::Dict{T,Int}
n::Int
n::Union{Int,Vector{Int}}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would this defeat type inference?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see ngram_complexity (or this field) used anywhere in TextAnalysis. It may be used from external code. But then there are no abstract types here, and it should be easy to make them type stable if/where needed.

metadata::DocumentMetadata
end
function NGramDocument(txt::AbstractString, dm::DocumentMetadata, n::Integer=1)
NGramDocument(ngramize(dm.language, tokenize(dm.language, String(txt)), n),
n, dm)
function NGramDocument(txt::AbstractString, dm::DocumentMetadata, n::Integer...=1)
NGramDocument(ngramize(dm.language, tokenize(dm.language, String(txt)), n...), (length(n) == 1) ? Int(first(n)) : Int[n...], dm)
end
function NGramDocument(txt::AbstractString, n::Integer=1)
NGramDocument(txt, DocumentMetadata(), n)
function NGramDocument(txt::AbstractString, n::Integer...=1)
NGramDocument(txt, DocumentMetadata(), n...)
end
function NGramDocument(ng::Dict{T, Int}, n::Integer=1) where T <: AbstractString
NGramDocument(merge(Dict{AbstractString,Int}(), ng), n, DocumentMetadata())
function NGramDocument(ng::Dict{T, Int}, n::Integer...=1) where T <: AbstractString
NGramDocument(merge(Dict{AbstractString,Int}(), ng), (length(n) == 1) ? Int(first(n)) : Int[n...], DocumentMetadata())
end

##############################################################################
Expand Down Expand Up @@ -146,7 +145,7 @@ end
function ngrams(d::NGramDocument, n::Integer)
error("The n-gram complexity of an NGramDocument cannot be increased")
end
ngrams(d::AbstractDocument, n::Integer) = ngramize(language(d), tokens(d), n)
ngrams(d::AbstractDocument, n::Integer...) = ngramize(language(d), tokens(d), n...)
ngrams(d::NGramDocument) = d.ngrams
ngrams(d::AbstractDocument) = ngrams(d, 1)

Expand Down
12 changes: 7 additions & 5 deletions src/ngramizer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,18 @@ Dict{AbstractString,Int64} with 3 entries:
"To be or" => 1
```
"""
function ngramize(lang::S, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString}
(n == 1) && return onegramize(lang, words)
function ngramize(lang::S, words::Vector{T}, nlist::Integer...) where {S <: Language, T <: AbstractString}
(length(nlist) == 1) && (first(nlist) == 1) && return onegramize(lang, words)

n_words = length(words)

tokens = Dict{AbstractString, Int}()

for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
tokens[token] = get(tokens, token, 0) + 1
for n in nlist
for index in 1:(n_words - n + 1)
token = join(words[index:(index + n - 1)], " ")
tokens[token] = get(tokens, token, 0) + 1
end
end
return tokens
end
Expand Down
2 changes: 1 addition & 1 deletion src/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ function Base.summary(d::AbstractDocument)
o *= " * Author: $(author(d))\n"
o *= " * Timestamp: $(timestamp(d))\n"

if typeof(d) ∈ [TokenDocument, NGramDocument]
if typeof(d) <: Union{TokenDocument, NGramDocument}
o *= " * Snippet: ***SAMPLE TEXT NOT AVAILABLE***"
else
sample_text = replace(text(d)[1:min(50, length(text(d)))], r"\s+" => " ")
Expand Down
9 changes: 9 additions & 0 deletions test/document.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,13 @@
@test isa(d, NGramDocument)

@test isequal(length(Document("this is text")), 12)

# NGramDocument creation with multiple ngram complexity
let N=((), (2,), (Int32(2),), (1,2), (Int32(1), Int16(2))), C=(1, 2, 2, [1,2], [1,2]), L=(4, 3, 3, 7, 7)
for (n,c,l) in zip(N,C,L)
ngd = NGramDocument(sample_text1, n...)
@test ngram_complexity(ngd) == c
@test length(ngd.ngrams) == l
end
end
end
10 changes: 10 additions & 0 deletions test/ngramizer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,14 @@
"sample text" => 1,
"is some" => 1,
"some sample" => 1))
ngs = TextAnalysis.ngramize(Languages.English(), tkns, 1, 2)
@test isequal(ngs, Dict{String,Int}("this is" => 1,
"is some" => 1,
"some sample" => 1,
"sample text" => 1,
"this" => 1,
"is" => 1,
"some" => 1,
"sample" => 1,
"text" => 1))
end