Skip to content

Commit

Permalink
addressed perf regression of stopwords removal
Browse files Browse the repository at this point in the history
  • Loading branch information
asbisen committed Apr 10, 2019
1 parent c8ae7a2 commit 8867dd9
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 23 deletions.
126 changes: 107 additions & 19 deletions src/preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,38 @@ end
#
##############################################################################

"""
remove_case(s::AbstractString)
Converts the string to lowercase.
See also: [`remove_case!`](@ref)
"""
function remove_case(s::T) where {T <: AbstractString}
    # Delegates to `lowercase`; the argument `s` itself is left untouched.
    return lowercase(s)
end


"""
remove_case!(d::TokenDocument)
remove_case!(d::StringDocument)
remove_case!(d::NGramDocument)
remove_case!(c::Corpus)
Converts the text of the document or corpus to lowercase. This method does not
work with FileDocument.
# Example
```julia-repl
julia> str="The quick brown fox jumps over the lazy dog"
julia> sd=StringDocument(str)
StringDocument{String}("The quick brown fox jumps over the lazy dog", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
julia> remove_case!(sd)
julia> sd.text
"the quick brown fox jumps over the lazy dog"
```
"""
function remove_case!(d::FileDocument)
    # The FileDocument method never changes anything: it always raises.
    error("FileDocument cannot be modified")
end

function remove_case!(d::StringDocument)
Expand Down Expand Up @@ -153,6 +183,23 @@ end
# Remove specified words
#
##############################################################################
"""
remove_words!(d::AbstractDocument, words::Vector)
remove_words!(c::Corpus, words::Vector)
Removes the tokens defined in the list `words` from the source Document or Corpus
# Example
```julia-repl
julia> str="The quick brown fox jumps over the lazy dog"
julia> sd=StringDocument(str);
julia> remove_words = ["fox", "over"]
julia> remove_words!(sd, remove_words)
julia> sd.text
"the quick brown jumps the lazy dog"
```
"""
function remove_words!(entity::(Union{AbstractDocument,Corpus}),
words::Vector{T}) where T <: AbstractString
skipwords = Set{AbstractString}()
Expand Down Expand Up @@ -232,6 +279,8 @@ function prepare!(crps::Corpus, flags::UInt32; skip_patterns = Set{AbstractStrin
r = _build_regex(lang, flags, skip_patterns, skip_words)
!isempty(r.pattern) && remove_patterns!(crps, r)

((flags & strip_whitespace) > 0) && remove_whitespace!(d)

((flags & stem_words) > 0) && stem!(crps)
((flags & tag_part_of_speech) > 0) && tag_pos!(crps)
nothing
Expand All @@ -244,30 +293,69 @@ function prepare!(d::AbstractDocument, flags::UInt32; skip_patterns = Set{Abstra

r = _build_regex(language(d), flags, skip_patterns, skip_words)
!isempty(r.pattern) && remove_patterns!(d, r)
((flags & strip_whitespace) > 0) && remove_whitespace!(d)

((flags & stem_words) > 0) && stem!(d)
((flags & tag_part_of_speech) > 0) && tag_pos!(d)
nothing
end

function remove_patterns(s::AbstractString, rex::Regex)
iob = IOBuffer()
ibegin = 1
v=codeunits(s)
for m in eachmatch(rex, s)
len = m.match.offset-ibegin+1
next = nextind(s, lastindex(m.match)+m.match.offset)
if len > 0
Base.write_sub(iob, v, ibegin, len)
if next != length(s)+1
write(iob, ' ')
end
end
ibegin = next
end
len = length(v) - ibegin + 1
(len > 0) && Base.write_sub(iob, v, ibegin, len)
String(take!(iob))
#function remove_patterns(s::AbstractString, rex::Regex)
# iob = IOBuffer()
# ibegin = 1
# v=codeunits(s)
# for m in eachmatch(rex, s)
# len = m.match.offset-ibegin+1
# next = nextind(s, lastindex(m.match)+m.match.offset)
# if len > 0
# Base.write_sub(iob, v, ibegin, len)
# if next != length(s)+1
# write(iob, ' ')
# end
# end
# ibegin = next
# end
# len = length(v) - ibegin + 1
# (len > 0) && Base.write_sub(iob, v, ibegin, len)
# String(take!(iob))
#end

"""
remove_whitespace(s::AbstractString)
Collapses each run of whitespace into a single space and removes all leading
and trailing whitespace from the string.
"""
function remove_whitespace(s::AbstractString)
    # Trim both ends first, then turn every remaining run of whitespace
    # characters (as matched by `\s+`) into exactly one space.
    trimmed = strip(s)
    return replace(trimmed, r"\s+" => " ")
end


"""
remove_whitespace!(s::AbstractDocument)
Collapses each run of whitespace into a single space and removes all leading and
trailing whitespace in a StringDocument or in every document of a Corpus.
This is a no-op for NGramDocument and TokenDocument.
"""
function remove_whitespace!(d::StringDocument)
    # Normalize the document text with `remove_whitespace` and store the
    # result back on the document; the normalized text is also returned.
    squashed = remove_whitespace(d.text)
    d.text = squashed
    return squashed
end

function remove_whitespace!(crps::Corpus)
    # Apply the per-document method to every document yielded by iterating
    # the corpus; like the plain loop, this evaluates to `nothing`.
    foreach(remove_whitespace!, crps)
end

# Fallback for document types without a more specific method: do nothing.
remove_whitespace!(d::AbstractDocument) = nothing


# Return a copy of `s` in which every match of `rex` has been deleted
# (replaced by the empty string).
remove_patterns(s::AbstractString, rex::Regex) = replace(s, rex => "")

function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
Expand Down Expand Up @@ -345,7 +433,7 @@ function _combine_regex(regex_parts::Set{T}) where T <: AbstractString
end

function _build_regex_patterns(lang, flags::UInt32, patterns::Set{T}, words::Set{T}) where T <: AbstractString
((flags & strip_whitespace) > 0) && push!(patterns, "\\s+")
#((flags & strip_whitespace) > 0) && push!(patterns, "\\s+")
if (flags & strip_non_letters) > 0
push!(patterns, "[^a-zA-Z\\s]")
else
Expand Down
8 changes: 4 additions & 4 deletions test/preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
# Need to only remove words at word boundaries
doc = Document("this is sample text")
remove_words!(doc, ["sample"])
@test isequal(doc.text, "this is text")
@test isequal(doc.text, "this is text")

doc = Document("this is sample text")
prepare!(doc, strip_articles)
Expand Down Expand Up @@ -74,7 +74,7 @@
<script language=\"javascript\"> x = 20; </script>
</head>
<body>
<h1>Hello</h1><a href=\"world\">world</a>
<h1>Hello</h1><a href=\"world\"> world</a>
</body>
</html>
"""
Expand All @@ -94,7 +94,7 @@
color: #00ff00;
}
</style>
<h1>Hello</h1><a href=\"world\">world</a>
<h1>Hello</h1><a href=\"world\"> world</a>
</body>
</html>
"""
Expand All @@ -118,7 +118,7 @@
@test isequal(str.text, answer.text)

str = Document("Intel(tm) Core i5-3300k, is a geat CPU! ")
answer = Document("Intel tm Core i5 3300k is a geat CPU ") #tests old implementation
answer = Document("Inteltm Core i53300k is a geat CPU ") #tests old implementation
prepare!(str, strip_punctuation)
@test isequal(str.text, answer.text)

Expand Down

0 comments on commit 8867dd9

Please sign in to comment.