From cd26b752b7603ae9173b1f51d9fa82ce4d642678 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Tue, 22 Jan 2019 22:14:14 +0530 Subject: [PATCH 01/41] Add Regex --- src/words/tweet_tokenizer.jl | 116 +++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 src/words/tweet_tokenizer.jl diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl new file mode 100644 index 0000000..2ef9529 --- /dev/null +++ b/src/words/tweet_tokenizer.jl @@ -0,0 +1,116 @@ +EMOTICONS_REGEX = r"""(?x) + (?: + [<>]? + [:;=8] + [\-o\*\']? + [\)\]\(\[dDpP/\:\}\{@\|\\] + | + [\)\]\(\[dDpP/\:\}\{@\|\\] + [\-o\*\']? + [:;=8] + [<>]? + | + <3 + )""" + + +URLS = r"""(?x) + (?: + https?: + (?: + /{1,3} + | + [a-z0-9%] + ) + | + [a-z0-9.\-]+[.] + (?:[a-z]{2,13}) + / + ) + (?: + [^\s()<>{}\[\]]+ + | + \([^\s()]*?\([^\s()]+\)[^\s()]*?\) + | + \([^\s]+?\) + )+ + (?: + \([^\s()]*?\([^\s()]+\)[^\s()]*?\) + | + \([^\s]+?\) + | + [^\s`!()\[\]{};:'".,<>?«»“”‘’] + ) + | + (?: + (?\s]+>""" +ASCII_ARROWS = r"""[\-]+>|<[\-]+""" +TWITTER_USERNAME = r"""(?:@[\w_]+)""" +TWITTER_HASHTAGS = r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" +EMAIL_ADDRESSES = r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""" +WORDS_WITH_APOSTROPHE_DASHES = r"""(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])""" +NUMBERS_FRACTIONS_DECIMALS = r"""(?:[+\-]?\d+[,/.:-]\d+[+\-]?)""" +ELLIPSIS_DOTS = r"""(?:\.(?:\s*\.){1,})""" +WORDS_WITHOUT_APOSTROPHE_DASHES = r"""(?:[\w_]+)""" + + + +# Core tokenizing regex +WORD_REGEX = Regex("(?i:" * join([URLS.pattern + PHONE_NUMBERS.pattern + EMOTICONS_REGEX.pattern + HTML_TAGS.pattern + ASCII_ARROWS.pattern + TWITTER_USERNAME.pattern + TWITTER_HASHTAGS.pattern + EMAIL_ADDRESSES.pattern + WORDS_WITH_APOSTROPHE_DASHES.pattern + NUMBERS_FRACTIONS_DECIMALS.pattern + WORDS_WITHOUT_APOSTROPHE_DASHES.pattern + ELLIPSIS_DOTS.pattern + r"(?:\S)".pattern + ], "|") + * ")" + ) + + +# WORD_REGEX performs poorly on these patterns: +HANG_REGEX = r"""([^a-zA-Z0-9])\1{3,}""" + +# Regex for replacing HTML_Entities +HTML_ENTITIES_REGEX = r"""&(#?(x?))([^&;\s]+);""" + +HANDLES_REGEX = r"""(?x) + (? Date: Fri, 25 Jan 2019 22:17:12 +0530 Subject: [PATCH 02/41] Add function to replace HTML entities --- src/words/tweet_tokenizer.jl | 63 ++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 2ef9529..9155a93 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -114,3 +114,66 @@ HANDLES_REGEX = r"""(?x) | (? 
convert_entity) +end From 5c87dae58b82ccf80602a5ae8dfb869cec604e79 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Thu, 31 Jan 2019 22:18:41 +0530 Subject: [PATCH 03/41] Add tweet tokenizer --- src/words/tweet_tokenizer.jl | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 9155a93..2685142 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -177,3 +177,45 @@ function replace_html_entities(input_text::AbstractString, remove_illegal=true) entities_replaced_text = replace(input_text, HTML_ENTITIES_REGEX => convert_entity) end + + +function tweet_tokenize(source::AbstractString, + strip_handle=false, + reduce_len=false, + preserve_case=true ) + + function reduce_lengthening(source::AbstractString) + replace(source, r"(.)\1{2,}" => s"\1\1\1") + end + + function remove_handles(source::AbstractString) + replace(source, HANDLES_REGEX => " ") + end + + # Fix HTML Character entities + source = replace_html_entities(source) + # Remove username handles + if strip_handle + source = remove_handles(source) + end + # Reduce Lengthening + if reduce_len + source = reduce_lengthening(source) + end + # Shorten some sequences of characters + safe_text = replace(source, r"""([^a-zA-Z0-9])\1{3,}""" => s"\1\1\1") + # Tokenize + tokens = collect((m.match for m = eachmatch(WORD_REGEX, + safe_text, + overlap=false))) + # Alter the case with presrving it for emoji + if (!preserve_case) + for (index, word) in enumerate(tokens) + if !occursin(EMOTICONS_REGEX,word) + tokens[index] = lowercase(word) + end + end + end + + return tokens +end From fd927d1e8d3a7bac16a4dc497b90e6ed8438e299 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Thu, 31 Jan 2019 22:19:19 +0530 Subject: [PATCH 04/41] Add docstrings for functions --- src/words/tweet_tokenizer.jl | 48 ++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 2685142..95f6401 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -116,7 +116,17 @@ HANDLES_REGEX = r"""(?x) """ +""" + replace_html_entities(input_text::AbstractString, + remove_illegal=true) => (entities_replaced_text::AbstractString) +Removes entities from text by converting them to their corresponding unicode character. +`input_text::AbstractString` The string on which HTML entities need to be replaced +`remove_illegal::Bool` If `True`, entities that can't be converted are +removed. Otherwise, entities that can't be converted are kept "as +is". +Returns `entities_replaced_text::AbstractString` +""" function replace_html_entities(input_text::AbstractString, remove_illegal=true) function convert_entity(matched_text) @@ -179,6 +189,44 @@ function replace_html_entities(input_text::AbstractString, remove_illegal=true) end +""" + tweet_tokenize(input::AbstractString) => tokens + +Twitter-aware tokenizer, designed to be flexible and +easy to adapt to new domains and tasks. + +The basic logic is following: + +1. The regular expressions are made for WORD_REGEX (core tokenizer), HANG_REGEX + and EMOTICONS_REGEX. +2 Replacing HTML entities, tweet handles, reducing length of repeated characters + and other features, make it suitable for tweets +3. The tokenization is done and returned +4. `preserve_case` By default is set to `true`. If it is set to `false`, + then the tokenizer will downcase everything except for emoticons. 
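
For instance, disabling `preserve_case` (the third optional argument) lowercases
everything except emoticons. A minimal sketch; the expected tokens below match
this package's test suite:

```
tweet_tokenize("@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P",
               false, false, false)
# -> ["@jrmy", ":", "i'm", "really", "happyyy", "about", "that", "!", "niceeee", ":D", ":P"]
```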
+
Example:
```
julia> tweet_tokenize("This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--")
16-element Array{SubString{String},1}:
 "This"
 "is"
 "a"
 "cooool"
 "#dummysmiley"
 ":"
 ":-)"
 ":-P"
 "<3"
 "and"
 "some"
 "arrows"
 "<"
 ">"
 "->"
 "<--"
```
"""
function tweet_tokenize(source::AbstractString,
                        strip_handle=false,
                        reduce_len=false,
From 4da422eb1b657c15b2a53162168d33a7baea0ebe Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 2 Feb 2019 22:21:25 +0530
Subject: [PATCH 05/41] Add support for tweet tokenizer

---
 src/WordTokenizers.jl | 8 ++++++++
 src/split_api.jl      | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index e0cf458..1c38f3c 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -1,14 +1,22 @@
 module WordTokenizers

+using HTML_Entities
+using StrTables
+using StringEncodings
+
+
 export poormans_tokenize, punctuation_space_tokenize,
        penn_tokenize, improved_penn_tokenize, nltk_word_tokenize,
+       tweet_tokenize,
        tokenize,
        rulebased_split_sentences,
        split_sentences,
        set_tokenizer, set_sentence_splitter

+
 include("words/simple.jl")
 include("words/sedbased.jl")
+include("words/tweet_tokenizer.jl")

 include("sentences/sentence_splitting.jl")

diff --git a/src/split_api.jl b/src/split_api.jl
index f561830..94f7bcc 100644
--- a/src/split_api.jl
+++ b/src/split_api.jl
@@ -3,7 +3,7 @@ export Words, Sentences

 const tokenizers = [poormans_tokenize, punctuation_space_tokenize,
-                    penn_tokenize, improved_penn_tokenize, nltk_word_tokenize]
+                    penn_tokenize, improved_penn_tokenize, nltk_word_tokenize, tweet_tokenize]
 const sentence_splitters = [rulebased_split_sentences]

 const Words = tokenize
From 2e6b4c2290e30f81d71904dcea7c32f1860ad4f6 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 2 Feb 2019 23:27:38 +0530
Subject: [PATCH 06/41] Update README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 626f7f9..5bb2707 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,8 @@ The word tokenizers basically assume sentence splitting has already been done.

 (To me it seems like a weird historical thing that NLTK has 2 successive variation on improving the Penn tokenizer, but for now I am matching it and having both. See [[NLTK#2005]](https://github.com/nltk/nltk/issues/2005))

+- **Tweet Tokenizer:** (`tweet_tokenize`) NLTK's casual tokenizer, designed for tweets. Beyond Twitter-specific tokens such as handles and hashtags, it also handles emoticons and other web constructs, like HTML entities, well. This closely matches NLTK's `nltk.tokenize.TweetTokenizer`.
+
 # Sentence Splitters
 We currently only have one sentence splitter.
From 853331c43a1122690fdd32a78fd040034433c8d0 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sun, 3 Feb 2019 00:39:03 +0530
Subject: [PATCH 07/41] Fix bug for optional arguments

---
 src/words/tweet_tokenizer.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl
index 95f6401..3b5e2fe 100644
--- a/src/words/tweet_tokenizer.jl
+++ b/src/words/tweet_tokenizer.jl
@@ -200,14 +200,16 @@ The basic logic is following:
 1. The regular expressions are made for WORD_REGEX (core tokenizer), HANG_REGEX
    and EMOTICONS_REGEX.
 2 Replacing HTML entities, tweet handles, reducing length of repeated characters
-   and other features, make it suitable for tweets
-3. The tokenization is done and returned
+   and other features, make it suitable for tweets.
+3.
The String is tokenized and returned. 4. `preserve_case` By default is set to `true`. If it is set to `false`, then the tokenizer will downcase everything except for emoticons. Example: + ``` julia> tweet_tokenize("This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--") + 16-element Array{SubString{String},1}: "This" "is" @@ -227,7 +229,7 @@ julia> tweet_tokenize("This is a cooool #dummysmiley: :-) :-P <3 and some arrows "<--" ``` """ -function tweet_tokenize(source::AbstractString, +function tweet_tokenize(source::AbstractString; strip_handle=false, reduce_len=false, preserve_case=true ) From e3d2fa050123871223980373027fe8d395251c58 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 3 Feb 2019 11:34:27 +0530 Subject: [PATCH 08/41] Add dependencies to REQUIRE --- REQUIRE | 3 +++ 1 file changed, 3 insertions(+) diff --git a/REQUIRE b/REQUIRE index 859ad46..fa2c15f 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1 +1,4 @@ julia 0.7 +HTML_Entities +StrTables +StringEncodings \ No newline at end of file From c06ae264682278b0a457a92ca32adb57e280072e Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Tue, 5 Feb 2019 03:38:53 +0530 Subject: [PATCH 09/41] Minor Code fixes --- src/words/tweet_tokenizer.jl | 38 ++++++++++++------------------------ 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 3b5e2fe..a145435 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -122,7 +122,7 @@ HANDLES_REGEX = r"""(?x) Removes entities from text by converting them to their corresponding unicode character. `input_text::AbstractString` The string on which HTML entities need to be replaced -`remove_illegal::Bool` If `True`, entities that can't be converted are +`remove_illegal::Bool` If `true`, entities that can't be converted are removed. Otherwise, entities that can't be converted are kept "as is". 
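
For instance, a minimal sketch of the intended behaviour:

```
replace_html_entities("&Delta; is a Greek letter")
# -> "Δ is a Greek letter"
```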
Returns `entities_replaced_text::AbstractString`
"""
function replace_html_entities(input_text::AbstractString, remove_illegal=true)

    function convert_entity(matched_text)
         groups = match(HTML_ENTITIES_REGEX, matched_text).captures
-        entity_body = groups[3]
-        local number::Number = 0
+        entity_text = groups[3]
+        number = 0

        if isempty(groups[1])
-            return(lookupname(HTML_Entities.default, entity_body))
+            return lookupname(HTML_Entities.default, entity_text)
        else
            if isempty(groups[2])
-                is_numeric = true
-                for i in entity_body
-                    if !isdigit(i)
-                        is_numeric = false
-                        break
-                    end
-                end
-                if is_numeric
-                    number = parse(Int, entity_body, base=10)
+                is_numeric = all(isdigit, entity_text)
+                if is_numeric
+                    number = parse(Int, entity_text, base=10)
                end
            else
-                is_base_16 = true
-                allowed_letters = ['a', 'b', 'c', 'd', 'e', 'f']
-                for i in entity_body
-                    if !(isdigit(i) || i in allowed_letters)
-                        is_base_16 = false
-                        break
-                    end
+                base_16_letters = ('a', 'b', 'c', 'd', 'e', 'f')
+                is_base_16 = all(entity_text) do i
+                    isdigit(i) || i in base_16_letters
                end
                if is_base_16
-                    number = parse(Int, entity_body, base=16)
+                    number = parse(Int, entity_text, base=16)
                end
            end

            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding.
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML

            if 0x80 <= number <= 0x9F
-                if !(number in [129 141 143 144 157])
+                if number ∉ (129, 141, 143, 144, 157)
                    return decode([UInt8(number)], "WINDOWS-1252")
                end
-            else
-                if Unicode.isassigned(number)
-                    return (Char(number))
-                end
+            elseif Unicode.isassigned(number)
+                return Char(number)
            end
        end

From 1b65d8eef476cb3bb9911b2f11028132f5547ea8 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Tue, 5 Feb 2019 03:54:07 +0530
Subject: [PATCH 10/41] Improve code clarity

---
 src/words/tweet_tokenizer.jl | 62 ++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 35 deletions(-)

diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl
index a145435..a28a0a6 100644
--- a/src/words/tweet_tokenizer.jl
+++ b/src/words/tweet_tokenizer.jl
@@ -1,4 +1,4 @@
-EMOTICONS_REGEX = r"""(?x)
+const EMOTICONS_REGEX = r"""(?x)
    (?:
      [<>]?
[:;=8] @@ -14,7 +14,7 @@ EMOTICONS_REGEX = r"""(?x) )""" -URLS = r"""(?x) +const URLS = r"""(?x) (?: https?: (?: @@ -55,7 +55,7 @@ URLS = r"""(?x) """ -PHONE_NUMBERS = r"""(?x) +const PHONE_NUMBERS = r"""(?x) (?: (?: \+?[01] @@ -72,20 +72,20 @@ PHONE_NUMBERS = r"""(?x) )""" -HTML_TAGS = r"""<[^>\s]+>""" -ASCII_ARROWS = r"""[\-]+>|<[\-]+""" -TWITTER_USERNAME = r"""(?:@[\w_]+)""" -TWITTER_HASHTAGS = r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" -EMAIL_ADDRESSES = r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""" -WORDS_WITH_APOSTROPHE_DASHES = r"""(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])""" -NUMBERS_FRACTIONS_DECIMALS = r"""(?:[+\-]?\d+[,/.:-]\d+[+\-]?)""" -ELLIPSIS_DOTS = r"""(?:\.(?:\s*\.){1,})""" -WORDS_WITHOUT_APOSTROPHE_DASHES = r"""(?:[\w_]+)""" +const HTML_TAGS = r"""<[^>\s]+>""" +const ASCII_ARROWS = r"""[\-]+>|<[\-]+""" +const TWITTER_USERNAME = r"""(?:@[\w_]+)""" +const TWITTER_HASHTAGS = r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" +const EMAIL_ADDRESSES = r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""" +const WORDS_WITH_APOSTROPHE_DASHES = r"""(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])""" +const NUMBERS_FRACTIONS_DECIMALS = r"""(?:[+\-]?\d+[,/.:-]\d+[+\-]?)""" +const ELLIPSIS_DOTS = r"""(?:\.(?:\s*\.){1,})""" +const WORDS_WITHOUT_APOSTROPHE_DASHES = r"""(?:[\w_]+)""" # Core tokenizing regex -WORD_REGEX = Regex("(?i:" * join([URLS.pattern +const WORD_REGEX = Regex("(?i:" * join([URLS.pattern PHONE_NUMBERS.pattern EMOTICONS_REGEX.pattern HTML_TAGS.pattern @@ -104,12 +104,12 @@ WORD_REGEX = Regex("(?i:" * join([URLS.pattern # WORD_REGEX performs poorly on these patterns: -HANG_REGEX = r"""([^a-zA-Z0-9])\1{3,}""" +const HANG_REGEX = r"""([^a-zA-Z0-9])\1{3,}""" # Regex for replacing HTML_Entities -HTML_ENTITIES_REGEX = r"""&(#?(x?))([^&;\s]+);""" +const HTML_ENTITIES_REGEX = r"""&(#?(x?))([^&;\s]+);""" -HANDLES_REGEX = r"""(?x) +const HANDLES_REGEX = r"""(?x) (? 
convert_entity)
+    return entities_replaced_text
 end


@@ -222,34 +222,26 @@ function tweet_tokenize(source::AbstractString;
                         reduce_len=false,
                         preserve_case=true )

-    function reduce_lengthening(source::AbstractString)
-        replace(source, r"(.)\1{2,}" => s"\1\1\1")
-    end
-
-    function remove_handles(source::AbstractString)
-        replace(source, HANDLES_REGEX => " ")
-    end
-
     # Fix HTML Character entities
     source = replace_html_entities(source)
     # Remove username handles
     if strip_handle
-        source = remove_handles(source)
+        source = replace(source, HANDLES_REGEX => " ")
     end
     # Reduce Lengthening
     if reduce_len
-        source = reduce_lengthening(source)
+        source = replace(source, r"(.)\1{2,}" => s"\1\1\1")
     end
     # Shorten some sequences of characters
     safe_text = replace(source, r"""([^a-zA-Z0-9])\1{3,}""" => s"\1\1\1")
     # Tokenize
-    tokens = collect((m.match for m = eachmatch(WORD_REGEX,
+    tokens = collect((m.match for m in eachmatch(WORD_REGEX,
                                                 safe_text,
                                                 overlap=false)))
     # Alter the case with presrving it for emoji
-    if (!preserve_case)
+    if !preserve_case
         for (index, word) in enumerate(tokens)
-            if !occursin(EMOTICONS_REGEX,word)
+            if !occursin(EMOTICONS_REGEX, word)
                 tokens[index] = lowercase(word)
             end
         end
From 320ce4de95a30f80984fda0aa7cc9c2a4d352c94 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Fri, 8 Feb 2019 13:27:45 +0530
Subject: [PATCH 11/41] Add comments and better variable naming

---
 src/WordTokenizers.jl | 1 +
 src/words/tweet_tokenizer.jl | 15 +++++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index 1c38f3c..b8b3c8f 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -3,6 +3,7 @@ module WordTokenizers

 using HTML_Entities
 using StrTables
 using StringEncodings
+using Unicode

 export poormans_tokenize, punctuation_space_tokenize,
diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl
index a28a0a6..496a4d7 100644
--- a/src/words/tweet_tokenizer.jl
+++ b/src/words/tweet_tokenizer.jl
@@ -130,13 +130,20 @@ Returns `entities_replaced_text::AbstractString`
 function replace_html_entities(input_text::AbstractString, remove_illegal=true)

     function convert_entity(matched_text)
-        groups = match(HTML_ENTITIES_REGEX, matched_text).captures
-        entity_text = groups[3]
+        # HTML entity can be named or encoded in Decimal/Hex form
+        # - Named_entity : "&Delta;" => "Δ",
+        # - Decimal : "&#916;" => "Δ",
+        # - Hex : "&#x394;" => "Δ",
+        #
+        # However for bytes (hex) 80-9f are interpreted in Windows-1252
+        is_numeric_encoded, is_hex_encoded, entity_text = match(HTML_ENTITIES_REGEX,
+                                                                matched_text).captures
         number = 0

-        if isempty(groups[1])
+        if isempty(is_numeric_encoded)
             return lookupname(HTML_Entities.default, entity_text)
         else
-            if isempty(groups[2])
+            if isempty(is_hex_encoded)
                 is_numeric = all(isdigit, entity_text)
                 if is_numeric
                     number = parse(Int, entity_text, base=10)
From 1999e27f4dd4407fad80ef6ec319eef1785b2b40 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 9 Feb 2019 03:23:55 +0530
Subject: [PATCH 12/41] Add second series of tests

---
 test/tweet_tokenize.jl | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/test/tweet_tokenize.jl b/test/tweet_tokenize.jl
index 2f119c0..1a0f7c0 100644
--- a/test/tweet_tokenize.jl
+++ b/test/tweet_tokenize.jl
@@ -2,7 +2,7 @@ using Test
 using WordTokenizers

 @testset "Tweet Tokenize" begin
-    @test "Basic Tests" begin
+    @testset "Basic Tests" begin
         s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"

         @test tweet_tokenize(s0) ==
@@
-29,29 +29,42 @@ using WordTokenizers
             ["@crushinghes", "the", "summer", "holidays", "are", "great", "but", "I'm", "so", "bored", "already", ":("]
     end

-    @test "Remove Handles and Reduce Length" begin
+    @testset "Remove Handles and Reduce Length" begin
         s6 = "@remy: This is waaaaayyyy too much for you!!!!!!"

-        @test tweet_tokenize(s6, strip_handles=true, reduce_len=true) ==
+        @test tweet_tokenize(s6, strip_handle=true, reduce_len=true) ==
             [":", "This", "is", "waaayyy", "too", "much", "for", "you", "!", "!", "!"]

         s7 = "@_willy65: No place for @chuck tonight. Sorry."

-        @test tweet_tokenize(s7, strip_handles=true, reduce_len=true) ==
+        @test tweet_tokenize(s7, strip_handle=true, reduce_len=true) ==
             [":", "No", "place", "for", "tonight", ".", "Sorry", "."]

         s8 = "@mar_tin is a great developer. Contact him at mar_tin@email.com."

-        @test tweet_tokenize(s8, strip_handles=true, reduce_len=true) ==
+        @test tweet_tokenize(s8, strip_handle=true, reduce_len=true) ==
             ["is", "a", "great", "developer", ".", "Contact", "him", "at", "mar_tin@email.com", "."]
     end

-    @test "Preserve Case" begin
+    @testset "Preserve Case" begin
        s9 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P"

        @test tweet_tokenize(s9, preserve_case=false) ==
           ["@jrmy", ":", "i'm", "really", "happyyy", "about", "that", "!", "niceeee", ":D", ":P"]
     end

-    @test "Test long sentences" begin
+    @testset "Test long sentences" begin
         s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L"

         @test tweet_tokenize(s10) ==
             ["Photo", ":", "Aujourd'hui", "sur", "http://t.co/0gebOFDUzn", "Projet", "...", "http://t.co/bKfIUbydz2", "...", "http://fb.me/3b6uXpz0L"]
     end
 end
+
+@testset "Replace HTML Entities" begin
+    @test tweet_tokenize("An HTML Entity - &Delta;") ==
+        ["An", "HTML", "Entity", "-", "Δ"]
+
+    @test tweet_tokenize("Another HTML Entity - &#916;") ==
+        ["Another", "HTML", "Entity", "-", "Δ"]
+
+    @test tweet_tokenize("Another HTML Entity - &#x394;") ==
+        ["Another", "HTML", "Entity", "-", "Δ"]
+
+
+end
From ad94e3016a1e899b547a4e52bb52e3ae09a8bb0f Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 9 Feb 2019 08:17:54 +0530
Subject: [PATCH 13/41] Add tests and fix bugs

---
 src/words/tweet_tokenizer.jl | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl
index 496a4d7..abdf51c 100644
--- a/src/words/tweet_tokenizer.jl
+++ b/src/words/tweet_tokenizer.jl
@@ -127,7 +127,7 @@ removed. Otherwise, entities that can't be converted are kept "as
 is".
 Returns `entities_replaced_text::AbstractString`
 """
-function replace_html_entities(input_text::AbstractString, remove_illegal=true)
+function replace_html_entities(input_text::AbstractString; remove_illegal=true)

     function convert_entity(matched_text)
         # HTML entity can be named or encoded in Decimal/Hex form
         # - Named_entity : "&Delta;" => "Δ",
         # - Decimal : "&#916;" => "Δ",
         # - Hex : "&#x394;" => "Δ",
         #
         # However for bytes (hex) 80-9f are interpreted in Windows-1252
         is_numeric_encoded, is_hex_encoded, entity_text = match(HTML_ENTITIES_REGEX,
                                                                 matched_text).captures
-        number = 0
+        number = -1

         if isempty(is_numeric_encoded)
             return lookupname(HTML_Entities.default, entity_text)
         else
             if isempty(is_hex_encoded)
                 is_numeric = all(isdigit, entity_text)
                 if is_numeric
                     number = parse(Int, entity_text, base=10)
                 end
             else
                 base_16_letters = ('a', 'b', 'c', 'd', 'e', 'f')
                 is_base_16 = all(entity_text) do i
                     isdigit(i) || i in base_16_letters
                 end
                 if is_base_16
                     number = parse(Int, entity_text, base=16)
                 end
             end

             # Numeric character references in the 80-9F range are typically
             # interpreted by browsers as representing the characters mapped
             # to bytes 80-9F in the Windows-1252 encoding.
For more info
             # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML

-        if 0x80 <= number <= 0x9F
-            if number ∉ (129, 141, 143, 144, 157)
-                return decode([UInt8(number)], "WINDOWS-1252")
+        if number >= 0
+            if 0x80 <= number <= 0x9F
+                if number ∉ (129, 141, 143, 144, 157)
+                    return decode([UInt8(number)], "WINDOWS-1252")
+                end
+            elseif Unicode.isassigned(number)
+                return Char(number)
             end
-        elseif Unicode.isassigned(number)
-            return Char(number)
         end
     end
From 4ec3f0acedc2df2150be7de25252b6d8be0c9be8 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 9 Feb 2019 08:46:04 +0530
Subject: [PATCH 14/41] Add final set of tests, fix links, typo

---
 src/words/tweet_tokenizer.jl | 4 ++--
 test/tweet_tokenize.jl | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl
index abdf51c..0ad0fa3 100644
--- a/src/words/tweet_tokenizer.jl
+++ b/src/words/tweet_tokenizer.jl
@@ -161,7 +161,7 @@ function replace_html_entities(input_text::AbstractString; remove_illegal=true)
             # Numeric character references in the 80-9F range are typically
             # interpreted by browsers as representing the characters mapped
             # to bytes 80-9F in the Windows-1252 encoding. For more info
-            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
+            # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
@@ -247,7 +247,7 @@ function tweet_tokenize(source::AbstractString;
     tokens = collect((m.match for m in eachmatch(WORD_REGEX,
                                                  safe_text,
                                                  overlap=false)))
-    # Alter the case with presrving it for emoji
+    # Alter the case with preserving it for emoji
     if !preserve_case
         for (index, word) in enumerate(tokens)
             if !occursin(EMOTICONS_REGEX, word)
diff --git a/test/tweet_tokenize.jl b/test/tweet_tokenize.jl
index 1a0f7c0..a07ac15 100644
--- a/test/tweet_tokenize.jl
+++ b/test/tweet_tokenize.jl
@@ -66,5 +66,16 @@ end
     @test tweet_tokenize("Another HTML Entity - &#x394;") ==
         ["Another", "HTML", "Entity", "-", "Δ"]

+    @test tweet_tokenize("Price: &pound;100") ==
+        [ "Price", ":", "£", "100"]
+
+    @test tweet_tokenize("Check out this invalid symbol &#129;", preserve_case=false) ==
+        [ "check", "out", "this", "invalid", "symbol"]
+
+    @test tweet_tokenize("A•B = B•A ") ==
+        [ "A", "•", "B", "=", "B", "•", "A"]
+
+    @test WordTokenizers.replace_html_entities("Check out this invalid symbol &#129;", remove_illegal=false) ==
+        "Check out this invalid symbol &#129;"

 end
From 8007a17623e6841bb00fa57ee3ea4cb01fb987fb Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sun, 10 Mar 2019 15:03:05 +0530
Subject: [PATCH 15/41] Make Replace entities 30x faster

---
 src/WordTokenizers.jl | 1 -
 src/words/fast.jl | 2 +-
 src/words/tweet_tokenizer.jl | 151 +++++++++++++++++++++--------------
 test/tweet_tokenize.jl | 8 +-
 4 files changed, 97 insertions(+), 65 deletions(-)

diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index 88ab89d..dea2ab8 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -3,7 +3,6 @@ module WordTokenizers

 using HTML_Entities
 using StrTables
-using StringEncodings
 using Unicode


diff --git a/src/words/fast.jl b/src/words/fast.jl
index dab6236..40fdfe8 100644
--- a/src/words/fast.jl
+++ b/src/words/fast.jl
@@ -198,7 +198,7 @@ function openquote(ts)
 end

 """
-    openquote(::TokenBuffer)
+    closingquote(::TokenBuffer)

 Matches " used as a closing quote, and tokenises it as ''.
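
For instance, in `nltk_word_tokenize("he said \"hello\"")` the opening `"` is
emitted as "``" by `openquote` and the closing `"` as "''" by this rule (an
illustrative sketch).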
""" diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 0ad0fa3..21dd432 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -106,8 +106,6 @@ const WORD_REGEX = Regex("(?i:" * join([URLS.pattern # WORD_REGEX performs poorly on these patterns: const HANG_REGEX = r"""([^a-zA-Z0-9])\1{3,}""" -# Regex for replacing HTML_Entities -const HTML_ENTITIES_REGEX = r"""&(#?(x?))([^&;\s]+);""" const HANDLES_REGEX = r"""(?x) (? (entities_replaced_text::AbstractString) +html_entities(ts::TokenBuffer; remove_illegal=true) Removes entities from text by converting them to their corresponding unicode character. -`input_text::AbstractString` The string on which HTML entities need to be replaced + `remove_illegal::Bool` If `true`, entities that can't be converted are removed. Otherwise, entities that can't be converted are kept "as is". -Returns `entities_replaced_text::AbstractString` -""" -function replace_html_entities(input_text::AbstractString; remove_illegal=true) - - function convert_entity(matched_text) - # HTML entity can be named or encoded in Decimal/Hex form - # - Named_entity : "Δ" => "Δ", - # - Decimal : "Δ" => "Δ", - # - Hex : ""Δ" => "Δ", - # - # However for bytes (hex) 80-9f are interpreted in Windows-1252 - is_numeric_encoded, is_hex_encoded, entity_text = match(HTML_ENTITIES_REGEX, - matched_text).captures - number = -1 - - if isempty(is_numeric_encoded) - return lookupname(HTML_Entities.default, entity_text) - else - if isempty(is_hex_encoded) - is_numeric = all(isdigit, entity_text) - if is_numeric - number = parse(Int, entity_text, base=10) - end - else - base_16_letters = ('a', 'b', 'c', 'd', 'e', 'f') - is_base_16 = all(entity_text) do i - isdigit(i) || i in base_16_letters - end - if is_base_16 - number = parse(Int, entity_text, base=16) - end - end - # Numeric character references in the 80-9F range are typically - # interpreted by browsers as representing the characters mapped - # to bytes 80-9F in the Windows-1252 encoding. 
For more info - # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets - - if number >= 0 - if 0x80 <= number <= 0x9F - if number ∉ (129, 141, 143, 144, 157) - return decode([UInt8(number)], "WINDOWS-1252") - end - elseif Unicode.isassigned(number) - return Char(number) - end - end - end +HTML entity can be named or encoded in Decimal/Hex form +- Named_entity : "Δ" => "Δ", +- Decimal : "Δ" => "Δ", +- Hex : "Δ" => "Δ", +However for bytes (hex) 80-9f are interpreted in Windows-1252 - if remove_illegal - return "" - else - return matched_text - end +""" +function html_entity(ts::TokenBuffer, remove_illegal=true) + (ts.idx + 1 > length(ts.input) || ts.input[ts.idx] != '&' ) && return false + if ts.input[ts.idx + 1] != '#' # Entity is of the type "Δ" => "Δ" + i = ts.idx + 1 + while i <= length(ts.input) && isascii(ts[i]) && + (isdigit(ts[i]) || islowercase(ts[i]) || isuppercase(ts[i])) + i += 1 + end + (i > length(ts.input) || ts[i] != ';') && return false + entity = lookupname(HTML_Entities.default, String(ts[ts.idx+1:i-1])) + isempty(entity) && !remove_illegal && return false + !isempty(entity) && push!(ts.buffer, entity[1]) + ts.idx = i + 1 + return true + else + number = -1 + i = ts.idx + 2 + if ts.input[ts.idx + 2] != 'x' # Entity is of the type "Δ" => "Δ" + while i <= length(ts.input) && isdigit(ts[i]) + i += 1 + end + (i > length(ts.input) || ts[i] != ';') && return false + if ((ts.idx + 2 ) == i) + !remove_illegal && return false + ts.idx +=3 + return true + end + (number = parse(Int, String(ts[ts.idx+2:i-1]), base=10)) + else # Entity is of the type "Δ" => "Δ" + i += 1 + base16letters = ('a', 'b', 'c', 'd', 'e', 'f') + while i <= length(ts.input) && (isdigit(ts[i]) || ts[i] in base16letters) + i += 1 + end + (i > length(ts.input) || ts[i] != ';') && return false + + if (ts.idx + 3) == i + !remove_illegal && return false + ts.idx += 4 + return true + end + number = parse(Int, String(ts[ts.idx+3:i-1]), base=16) end - entities_replaced_text = replace(input_text, HTML_ENTITIES_REGEX => convert_entity) - return entities_replaced_text + windows_1252_chars = ['€', '\u81', '‚', 'ƒ', '„', '…', '†', '‡', 'ˆ', '‰', + 'Š', '‹', 'Œ', '\u8d','Ž', '\u8f', '\u90', '‘', '’', + '“', '”', '•', '–', '—', '˜', '™', 'š', '›', 'œ', + '\u9d', 'ž', 'Ÿ'] + if 0x80 <= number <= 0x9F + push!(ts.buffer, windows_1252_chars[number - 127]) + ts.idx = i + 1 + return true + end + if (number <= 0 || !Unicode.isassigned(number)) + !remove_illegal && return false + ts.idx = i + 1 + else + push!(ts.buffer, Char(number)) + ts.idx = i + 1 + end + end + return true end + +""" + replace_html_entities(input::AbstractString, + remove_illegal=true) => (entities_replaced_text::AbstractString) + + +`input::AbstractString` The string on which HTML entities need to be replaced +`remove_illegal::Bool` If `true`, entities that can't be converted are +removed. Otherwise, entities that can't be converted are kept "as +is". 
+Returns `entities_replaced_text::AbstractString`
+"""
+function replace_html_entities(input::AbstractString; remove_illegal=true)
+    ts = TokenBuffer(input)
+    isempty(input) && return ts.tokens
+
+    while !isdone(ts)
+        html_entity(ts, remove_illegal) || character(ts)
+    end
+    return ts.tokens[1]
+end
+
 """
     tweet_tokenize(input::AbstractString) => tokens

@@ -231,6 +262,7 @@ function tweet_tokenize(source::AbstractString;
                         reduce_len=false,
                         preserve_case=true )

+    length(source) == 0 && return []
     # Fix HTML Character entities
     source = replace_html_entities(source)
     # Remove username handles
@@ -258,3 +290,4 @@ function tweet_tokenize(source::AbstractString;

     return tokens
 end
+
diff --git a/test/tweet_tokenize.jl b/test/tweet_tokenize.jl
index a07ac15..6bf3743 100644
--- a/test/tweet_tokenize.jl
+++ b/test/tweet_tokenize.jl
@@ -69,13 +69,13 @@ end
     @test tweet_tokenize("Price: &pound;100") ==
         [ "Price", ":", "£", "100"]

-    @test tweet_tokenize("Check out this invalid symbol &#129;", preserve_case=false) ==
-        [ "check", "out", "this", "invalid", "symbol"]
+    @test tweet_tokenize("Check out this invalid symbol &#129;") ==
+        [ "Check", "out", "this", "invalid", "symbol", "\u81"]

     @test tweet_tokenize("A•B = B•A ") ==
         [ "A", "•", "B", "=", "B", "•", "A"]

-    @test WordTokenizers.replace_html_entities("Check out this invalid symbol &#129;", remove_illegal=false) ==
-        "Check out this invalid symbol &#129;"
+    @test tweet_tokenize("Check out this symbol in Windows-1252 encoding &#128;") ==
+        [ "Check", "out", "this", "symbol", "in", "Windows", "-", "1252", "encoding", "€"]

 end
From 50539bad86cae8af322e863960173f9e963dc631 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Wed, 13 Mar 2019 01:23:11 +0530
Subject: [PATCH 16/41] Use TokenBuffer to speed up pre_processing functions

---
 src/words/tweet_tokenizer.jl | 189 ++++++++++++++++++++++++++---------
 1 file changed, 142 insertions(+), 47 deletions(-)

diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl
index 21dd432..65cb879 100644
--- a/src/words/tweet_tokenizer.jl
+++ b/src/words/tweet_tokenizer.jl
@@ -102,18 +102,8 @@ const WORD_REGEX = Regex("(?i:" * join([URLS.pattern
                          * ")"
                          )

-
-# WORD_REGEX performs poorly on these patterns:
 const HANG_REGEX = r"""([^a-zA-Z0-9])\1{3,}"""

-
-const HANDLES_REGEX = r"""(?x)
-    (?<![A-Za-z0-9_!@#\$%&*])@
-    (
-        [A-Za-z0-9_]+
-    )"""
-
 """
 html_entities(ts::TokenBuffer; remove_illegal=true)
@@ -185,6 +175,106 @@ end

+"""
+lookbehind(ts::TokenBuffer)
+
+A helper function for twitter_handle. Checks if the beginning of the detected
+handle is preceded by alphanumeric or special chars like('_', '!', '@', '#', '\$', '%', '&', '*')
+"""
+function lookbehind(ts::TokenBuffer,
+                    match_pattern = ('_', '!', '@', '#', '$', '%', '&', '*'))
+    ts.idx == 1 && return false
+
+    c = ts[ts.idx - 1]
+    ( islowercase(c) || isdigit(c) || isuppercase(c) || c ∈ match_pattern ) && return true
+
+    return false
+end
+
+
+"""
+    twitter_handle(ts::TokenBuffer)
+
+For removing Twitter Handles. If it detects a twitter handle, then it jumps to
+makes the index of TokenBuffer to the desired location skipping the handle.
+"""
+function twitter_handle(ts)
+    (ts.idx + 2 > length(ts.input) || ts.input[ts.idx] != '@' ) && return false
+    lookbehind(ts) && return false
+
+    i = ts.idx + 1
+    while i <= length(ts.input) &&
+        ( isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i]) ||
+        isuppercase(ts[i]) || ts[i] == '_'))
+        i += 1
+    end
+    (i <= length(ts.input)) && (i == ts.idx + 1 || ts[i] == '@') && return false
+
+    ts.idx = i
+    return true
+end
+
+
+"""
+    reduce_all_repeated(ts::TokenBuffer)
+
+For handling repeated characters like "helloooooo" -> :hellooo".
+
+"""
+function reduce_all_repeated(ts)
+    ts.idx + 4 > length(ts.input) && return false
+
+    (ts[ts.idx] == '\n' || ts[ts.idx] != ts[ts.idx + 1] ||
+    ts[ts.idx] != ts[ts.idx + 2]) && return false
+
+    i = ts.idx + 3
+    while i <= length(ts.input) && ts[i] == ts[ts.idx]
+        i += 1
+    end
+    for i in 1:3
+        push!(ts.buffer, ts[ts.idx])
+    end
+    ts.idx = i
+    return true
+end
+
+"""
+    safe_text(ts::TokenBuffer)
+
+This feature covers up for the characters where the main tokenizing function lacks
+For example - "........" -> "..." and this is detected by the key tokenizer as a
+single token of "..."
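+
+A minimal sketch, assuming direct access to the internal `pre_process` helper
+defined further below (it is not exported):
+
+```
+WordTokenizers.pre_process(".........??????", false, false)
+# -> "...???"
+```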
+""" +function safe_text(ts) + ts.idx + 4 > length(ts.input) && return false + + ((isascii(ts[ts.idx]) && ( islowercase(ts[ts.idx]) || + isuppercase(ts[ts.idx]) || isdigit(ts[ts.idx]))) || + ts[ts.idx] != ts[ts.idx + 1] || + ts[ts.idx] != ts[ts.idx + 2] ) && return false + + i = ts.idx + 3 + + while i <= length(ts.input) && ts[i] == ts[ts.idx] + i += 1 + end + for i in 1:3 + push!(ts.buffer, ts[ts.idx]) + end + ts.idx = i + return true +end """ - replace_html_entities(input::AbstractString, - remove_illegal=true) => (entities_replaced_text::AbstractString) + replace_html_entities(input::AbstractString, remove_illegal=true) `input::AbstractString` The string on which HTML entities need to be replaced `remove_illegal::Bool` If `true`, entities that can't be converted are removed. Otherwise, entities that can't be converted are kept "as is". -Returns `entities_replaced_text::AbstractString` + """ function replace_html_entities(input::AbstractString; remove_illegal=true) - ts = TokenBuffer(input) - isempty(input) && return ts.tokens + ts = TokenBuffer(input) + isempty(input) && return "" - while !isdone(ts) - html_entity(ts, remove_illegal) || character(ts) - end - return ts.tokens[1] + while !isdone(ts) + html_entity(ts, remove_illegal) || character(ts) + end + return ts.tokens[1] +end + + +""" +function pre_process(input::AbstractString, strip_handle=false, + reduce_len=false) => () + +This function processes on the input string and optionally remove twitter handles +and reduce length of repeated characters (like "waaaaay" -> "waaay") +and for elements like ".........?????? -> "...???" to increase the performance +of the key tokenizer. +""" +function pre_process(input::AbstractString, strip_handle::Bool, reduce_len::Bool) + ts = TokenBuffer(input) + isempty(input) && return "" + + while !isdone(ts) + (strip_handle && twitter_handle(ts)) || # Remove username handles + (reduce_len && reduce_all_repeated(ts)) || # Reduce Lengthening + safe_text(ts) || # Shorten some sequences of characters + character(ts) + end + + return ts.tokens[1] end """ @@ -262,32 +365,24 @@ function tweet_tokenize(source::AbstractString; reduce_len=false, preserve_case=true ) - length(source) == 0 && return [] - # Fix HTML Character entities - source = replace_html_entities(source) - # Remove username handles - if strip_handle - source = replace(source, HANDLES_REGEX => " ") - end - # Reduce Lengthening - if reduce_len - source = replace(source, r"(.)\1{2,}" => s"\1\1\1") - end - # Shorten some sequences of characters - safe_text = replace(source, r"""([^a-zA-Z0-9])\1{3,}""" => s"\1\1\1") - # Tokenize - tokens = collect((m.match for m in eachmatch(WORD_REGEX, - safe_text, - overlap=false))) - # Alter the case with preserving it for emoji - if !preserve_case - for (index, word) in enumerate(tokens) - if !occursin(EMOTICONS_REGEX, word) - tokens[index] = lowercase(word) - end - end + length(source) == 0 && return [] + # Fix HTML Character entities + source = replace_html_entities(source) + + length(source) == 0 && return [] + safe_text = pre_process(source, strip_handle, reduce_len) + + tokens = collect((m.match for m in eachmatch(WORD_REGEX, + safe_text, + overlap=false))) + # Alter the case with preserving it for emoji + if !preserve_case + for (index, word) in enumerate(tokens) + if !occursin(EMOTICONS_REGEX, word) + tokens[index] = lowercase(word) + end end + end - return tokens + return tokens end - From 59f8b0c8e6b16063d41b882453ad5cca47dc39c2 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 13 Mar 2019 17:34:00 
+0530 Subject: [PATCH 17/41] Fix indentation and bugs --- src/words/tweet_tokenizer.jl | 286 ++++++++++++++++++----------------- 1 file changed, 145 insertions(+), 141 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 65cb879..ca5ba13 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -83,7 +83,6 @@ const ELLIPSIS_DOTS = r"""(?:\.(?:\s*\.){1,})""" const WORDS_WITHOUT_APOSTROPHE_DASHES = r"""(?:[\w_]+)""" - # Core tokenizing regex const WORD_REGEX = Regex("(?i:" * join([URLS.pattern PHONE_NUMBERS.pattern @@ -105,7 +104,7 @@ const WORD_REGEX = Regex("(?i:" * join([URLS.pattern const HANG_REGEX = r"""([^a-zA-Z0-9])\1{3,}""" """ -html_entities(ts::TokenBuffer; remove_illegal=true) + html_entities(ts::TokenBuffer; remove_illegal=true) Removes entities from text by converting them to their corresponding unicode character. @@ -121,158 +120,161 @@ However for bytes (hex) 80-9f are interpreted in Windows-1252 """ function html_entity(ts::TokenBuffer, remove_illegal=true) - (ts.idx + 1 > length(ts.input) || ts.input[ts.idx] != '&' ) && return false - if ts.input[ts.idx + 1] != '#' # Entity is of the type "Δ" => "Δ" - i = ts.idx + 1 - while i <= length(ts.input) && isascii(ts[i]) && - (isdigit(ts[i]) || islowercase(ts[i]) || isuppercase(ts[i])) - i += 1 - end - (i > length(ts.input) || ts[i] != ';') && return false - entity = lookupname(HTML_Entities.default, String(ts[ts.idx+1:i-1])) - isempty(entity) && !remove_illegal && return false - !isempty(entity) && push!(ts.buffer, entity[1]) - ts.idx = i + 1 - return true - else - number = -1 - i = ts.idx + 2 - if ts.input[ts.idx + 2] != 'x' # Entity is of the type "Δ" => "Δ" - while i <= length(ts.input) && isdigit(ts[i]) - i += 1 - end - (i > length(ts.input) || ts[i] != ';') && return false - if ((ts.idx + 2 ) == i) - !remove_illegal && return false - ts.idx +=3 + (ts.idx + 1 > length(ts.input) || ts.input[ts.idx] != '&' ) && return false + if ts.input[ts.idx + 1] != '#' # Entity is of the type "Δ" => "Δ" + i = ts.idx + 1 + while i <= length(ts.input) && isascii(ts[i]) && + (isdigit(ts[i]) || islowercase(ts[i]) || isuppercase(ts[i])) + i += 1 + end + (i > length(ts.input) || ts[i] != ';') && return false + entity = lookupname(HTML_Entities.default, String(ts[ts.idx+1:i-1])) + isempty(entity) && !remove_illegal && return false + !isempty(entity) && push!(ts.buffer, entity[1]) + ts.idx = i + 1 return true - end - (number = parse(Int, String(ts[ts.idx+2:i-1]), base=10)) - else # Entity is of the type "Δ" => "Δ" - i += 1 - base16letters = ('a', 'b', 'c', 'd', 'e', 'f') - while i <= length(ts.input) && (isdigit(ts[i]) || ts[i] in base16letters) - i += 1 - end - (i > length(ts.input) || ts[i] != ';') && return false - - if (ts.idx + 3) == i - !remove_illegal && return false - ts.idx += 4 - return true - end - number = parse(Int, String(ts[ts.idx+3:i-1]), base=16) - end - - windows_1252_chars = ['€', '\u81', '‚', 'ƒ', '„', '…', '†', '‡', 'ˆ', '‰', + else + number = -1 + i = ts.idx + 2 + if ts.input[ts.idx + 2] != 'x' # Entity is of the type "Δ" => "Δ" + while i <= length(ts.input) && isdigit(ts[i]) + i += 1 + end + (i > length(ts.input) || ts[i] != ';') && return false + if ((ts.idx + 2 ) == i) + !remove_illegal && return false + ts.idx +=3 + return true + end + (number = parse(Int, String(ts[ts.idx+2:i-1]), base=10)) + else # Entity is of the type "Δ" => "Δ" + i += 1 + base16letters = ('a', 'b', 'c', 'd', 'e', 'f') + while i <= length(ts.input) && (isdigit(ts[i]) || ts[i] in base16letters) 
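+                # advance past the hexadecimal digits of the reference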
+ i += 1 + end + (i > length(ts.input) || ts[i] != ';') && return false + + if (ts.idx + 3) == i + !remove_illegal && return false + ts.idx += 4 + return true + end + number = parse(Int, String(ts[ts.idx+3:i-1]), base=16) + end + + windows_1252_chars = ['€', '\u81', '‚', 'ƒ', '„', '…', '†', '‡', 'ˆ', '‰', 'Š', '‹', 'Œ', '\u8d','Ž', '\u8f', '\u90', '‘', '’', '“', '”', '•', '–', '—', '˜', '™', 'š', '›', 'œ', '\u9d', 'ž', 'Ÿ'] - if 0x80 <= number <= 0x9F - push!(ts.buffer, windows_1252_chars[number - 127]) - ts.idx = i + 1 - return true - end - if (number <= 0 || !Unicode.isassigned(number)) - !remove_illegal && return false - ts.idx = i + 1 - else - push!(ts.buffer, Char(number)) - ts.idx = i + 1 + if 0x80 <= number <= 0x9F + push!(ts.buffer, windows_1252_chars[number - 127]) + ts.idx = i + 1 + return true + end + if (number <= 0 || !Unicode.isassigned(number)) + !remove_illegal && return false + ts.idx = i + 1 + else + push!(ts.buffer, Char(number)) + ts.idx = i + 1 + end end - end - return true + return true end """ -lookbehind(ts::TokenBuffer) + lookbehind(ts::TokenBuffer) A helper function for twitter_handle. Checks if the beginning of the detected handle is preceded by alphanumeric or special chars like('_', '!', '@', '#', '\$', '%', '&', '*') """ function lookbehind(ts::TokenBuffer, match_pattern = ('_', '!', '@', '#', '$', '%', '&', '*')) - ts.idx == 1 && return false + ts.idx == 1 && return false - c = ts[ts.idx - 1] - ( islowercase(c) || isdigit(c) || isuppercase(c) || c ∈ match_pattern ) && return true + c = ts[ts.idx - 1] + ( islowercase(c) || isdigit(c) || isuppercase(c) || c ∈ match_pattern ) && return true - return false + return false end """ - twitter_handle(ts::TokenBuffer) + twitter_handle(ts::TokenBuffer) For removing Twitter Handles. If it detects a twitter handle, then it jumps to makes the index of TokenBuffer to the desired location skipping the handle. """ function twitter_handle(ts) - (ts.idx + 2 > length(ts.input) || ts.input[ts.idx] != '@' ) && return false - lookbehind(ts) && return false + (ts.idx + 2 > length(ts.input) || ts.input[ts.idx] != '@' ) && return false + lookbehind(ts) && return false - i = ts.idx + 1 - while i <= length(ts.input) && - ( isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i]) || - isuppercase(ts[i]) || ts[i] == '_')) - i += 1 - end - (i <= length(ts.input)) && (i == ts.idx + 1 || ts[i] == '@') && return false + i = ts.idx + 1 + while i <= length(ts.input) && + ( isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i]) || + isuppercase(ts[i]) || ts[i] == '_')) + i += 1 + end + (i <= length(ts.input)) && (i == ts.idx + 1 || ts[i] == '@') && return false - ts.idx = i - return true + ts.idx = i + return true end """ - reduce_all_repeated(ts::TokenBuffer) + reduce_all_repeated(ts::TokenBuffer) For handling repeated characters like "helloooooo" -> :hellooo". 
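
A minimal sketch via the public API; the input and expected tokens below come
from this package's test suite:

```
tweet_tokenize("@remy: This is waaaaayyyy too much for you!!!!!!",
               strip_handle=true, reduce_len=true)
# -> [":", "This", "is", "waaayyy", "too", "much", "for", "you", "!", "!", "!"]
```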
""" function reduce_all_repeated(ts) - ts.idx + 4 > length(ts.input) && return false - - (ts[ts.idx] == '\n' || ts[ts.idx] != ts[ts.idx + 1] || - ts[ts.idx] != ts[ts.idx + 2]) && return false - - i = ts.idx + 3 - while i <= length(ts.input) && ts[i] == ts[ts.idx] - i += 1 - end - for i in 1:3 - push!(ts.buffer, ts[ts.idx]) - end - ts.idx = i - return true + ts.idx + 4 > length(ts.input) && return false + + (ts[ts.idx] == '\n' || ts[ts.idx] != ts[ts.idx + 1] || + ts[ts.idx] != ts[ts.idx + 2]) && return false + + i = ts.idx + 3 + while i <= length(ts.input) && ts[i] == ts[ts.idx] + i += 1 + end + for j in 1:3 + push!(ts.buffer, ts[ts.idx]) + end + ts.idx = i + return true end """ - safe_text(ts::TokenBuffer) + safe_text(ts::TokenBuffer) This feature covers up for the characters where the main tokenizing function lacks For example - "........" -> "..." and this is detected by the key tokenizer as a single token of "..." """ function safe_text(ts) - ts.idx + 4 > length(ts.input) && return false - - ((isascii(ts[ts.idx]) && ( islowercase(ts[ts.idx]) || - isuppercase(ts[ts.idx]) || isdigit(ts[ts.idx]))) || - ts[ts.idx] != ts[ts.idx + 1] || - ts[ts.idx] != ts[ts.idx + 2] ) && return false - - i = ts.idx + 3 - - while i <= length(ts.input) && ts[i] == ts[ts.idx] - i += 1 - end - for i in 1:3 - push!(ts.buffer, ts[ts.idx]) - end - ts.idx = i - return true + ts.idx + 4 > length(ts.input) && return false + + ( + (isascii(ts[ts.idx]) && ( islowercase(ts[ts.idx]) || + isuppercase(ts[ts.idx]) || isdigit(ts[ts.idx]))) || + ts[ts.idx] != ts[ts.idx + 1] || + ts[ts.idx] != ts[ts.idx + 2] ) && return false + + i = ts.idx + 3 + + while i <= length(ts.input) && ts[i] == ts[ts.idx] + i += 1 + end + + for j in 1:3 + push!(ts.buffer, ts[ts.idx]) + end + ts.idx = i + + return true end @@ -287,19 +289,19 @@ is". """ function replace_html_entities(input::AbstractString; remove_illegal=true) - ts = TokenBuffer(input) - isempty(input) && return "" + ts = TokenBuffer(input) + isempty(input) && return "" - while !isdone(ts) - html_entity(ts, remove_illegal) || character(ts) - end - return ts.tokens[1] + while !isdone(ts) + html_entity(ts, remove_illegal) || character(ts) + end + return ts.tokens[1] end """ -function pre_process(input::AbstractString, strip_handle=false, - reduce_len=false) => () + function pre_process(input::AbstractString, strip_handle::Bool, + reduce_len::Bool) This function processes on the input string and optionally remove twitter handles and reduce length of repeated characters (like "waaaaay" -> "waaay") @@ -307,17 +309,17 @@ and for elements like ".........?????? -> "...???" to increase the performance of the key tokenizer. 
""" function pre_process(input::AbstractString, strip_handle::Bool, reduce_len::Bool) - ts = TokenBuffer(input) - isempty(input) && return "" - - while !isdone(ts) - (strip_handle && twitter_handle(ts)) || # Remove username handles - (reduce_len && reduce_all_repeated(ts)) || # Reduce Lengthening - safe_text(ts) || # Shorten some sequences of characters - character(ts) - end + ts = TokenBuffer(input) + isempty(input) && return "" + + while !isdone(ts) + (strip_handle && twitter_handle(ts)) || # Remove username handles + (reduce_len && reduce_all_repeated(ts)) || # Reduce Lengthening + safe_text(ts) || # Shorten some sequences of characters + character(ts) + end - return ts.tokens[1] + return ts.tokens[1] end """ @@ -365,24 +367,26 @@ function tweet_tokenize(source::AbstractString; reduce_len=false, preserve_case=true ) - length(source) == 0 && return [] - # Fix HTML Character entities - source = replace_html_entities(source) + length(source) == 0 && return [] + # Fix HTML Character entities + source = replace_html_entities(source) + + length(source) == 0 && return [] + safe_text = pre_process(source, strip_handle, reduce_len) - length(source) == 0 && return [] - safe_text = pre_process(source, strip_handle, reduce_len) + # The key tokenizing function begins + tokens = collect((m.match for m in eachmatch(WORD_REGEX, + safe_text, + overlap=false))) - tokens = collect((m.match for m in eachmatch(WORD_REGEX, - safe_text, - overlap=false))) # Alter the case with preserving it for emoji - if !preserve_case - for (index, word) in enumerate(tokens) - if !occursin(EMOTICONS_REGEX, word) - tokens[index] = lowercase(word) - end + if !preserve_case + for (index, word) in enumerate(tokens) + if !occursin(EMOTICONS_REGEX, word) + tokens[index] = lowercase(word) + end + end end - end - return tokens + return tokens end From cbb01e8fe239f72936ebd4f9ec4400f53cf892da Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 12 Apr 2019 21:05:36 +0530 Subject: [PATCH 18/41] Add regex-free emoticons via TokenBuffer --- src/words/tweet_tokenizer.jl | 95 ++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index ca5ba13..6dd9271 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -322,6 +322,88 @@ function pre_process(input::AbstractString, strip_handle::Bool, reduce_len::Bool return ts.tokens[1] end +function flushaboutindex!(ts::TokenBuffer, uptoidx) + flush!(ts, String(ts[ts.idx:uptoidx])) + ts.idx = uptoidx + 1 + return true +end + +const forehead = ['>', '<'] +const eyes = [':' ';' '=' '8'] +const nose = ['-','o','*','\''] +const mouth = [')', ']', '}', '(', '[', '{', 'd', 'D', 'p', 'P', '\\', '/', ':', '@', '|'] + +""" + function emoticons(ts::TokenBuffer) + +This function checks for the emoticons for the type `{forehead}{eyes}{nose}{mouth} +explicitely in this order, with {forehead} and {nose} being optional + +Example: +- `:)`, `;p` # (without nose and forehead) +- `:-)`, `:-p` # (with nose) +- `>:)` # (with forehead) +- `>:-)` # (with forehead and nose) + +Also checks for `<3` emoji +""" +function emoticons(ts) + ts.idx + 1 > length(ts.input) && return false + idx = ts.idx + + ts[idx] ∈ eyes && ( + (ts[idx + 1] ∈ mouth && return(flushaboutindex!(ts, idx + 1))) || + (idx + 2 <= length(ts.input) && ts[idx + 1] ∈ nose && ts[idx + 2] ∈ mouth && + return(flushaboutindex!(ts, idx + 2))) || + return false + ) + + idx + 2 <= length(ts.input) && ts[idx] ∈ forehead && ts[idx + 1] ∈ eyes && ( + (ts[idx + 
2] ∈ mouth && return(flushaboutindex!(ts, idx + 2))) || + (idx + 3 <= length(ts.input) && ts[idx + 2] ∈ nose && + ts[idx + 3] ∈ mouth && return(flushaboutindex!(ts, idx + 3))) || + return false + ) + + ts[idx] == '<' && ts[idx + 1] == '3' && return(flushaboutindex!(ts, idx + 1)) + + return false +end + +""" + function emoticonsreverse(ts::TokenBuffer) + +This function checks for the emoticons in reverse order to those of `function emoticons` +explicitely in this order `{mouth}{nose}{eyes}{forehead}`, with {forehead} and {nose} being optional + +Example: +- `(:`, `d:` # (without nose and forehead) +- `(-:`, `d-:` # (with nose) +- (:<` # (with forehead) +- `(-:<` # (with forehead and nose) + +""" +function emoticonsreverse(ts) + ts.idx + 1 > length(ts.input) && return false + idx = ts.idx + + ts[idx] ∈ mouth && ( + ts[idx + 1] ∈ eyes && ( + (ts[idx + 2] ∈ forehead && return(flushaboutindex!(ts, idx + 2))) || + return(flushaboutindex!(ts, idx+1)) + ) || + ts[idx + 1] ∈ nose && ( + ts[idx + 2] ∈ eyes && ( + (ts[idx + 3] ∈ forehead && return(flushaboutindex!(ts, idx + 3))) || + return(flushaboutindex!(ts, idx + 3)) + ) + ) + ) + + return false +end + + """ tweet_tokenize(input::AbstractString) => tokens @@ -375,6 +457,19 @@ function tweet_tokenize(source::AbstractString; safe_text = pre_process(source, strip_handle, reduce_len) # The key tokenizing function begins + # ts = TokenBuffer(safe_text) + # isempty(safe_text) && return ts.tokens + # + # while !isdone(ts) + # spaces(ts) && continue + # emoticons(ts) || + # emoticonsreverse(ts) || + # character(ts) + # end + # + # + # tokens = ts.tokens + # tokens = collect((m.match for m in eachmatch(WORD_REGEX, safe_text, overlap=false))) From 703ebc44ca3470f0e2926568263bd9879ebe2a57 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Thu, 18 Apr 2019 09:34:29 +0530 Subject: [PATCH 19/41] Add ascii arrows and html tags --- src/words/tweet_tokenizer.jl | 69 +++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 6dd9271..2ac3694 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -352,20 +352,20 @@ function emoticons(ts) idx = ts.idx ts[idx] ∈ eyes && ( - (ts[idx + 1] ∈ mouth && return(flushaboutindex!(ts, idx + 1))) || + (ts[idx + 1] ∈ mouth && return flushaboutindex!(ts, idx + 1)) || (idx + 2 <= length(ts.input) && ts[idx + 1] ∈ nose && ts[idx + 2] ∈ mouth && - return(flushaboutindex!(ts, idx + 2))) || + return flushaboutindex!(ts, idx + 2)) || return false ) idx + 2 <= length(ts.input) && ts[idx] ∈ forehead && ts[idx + 1] ∈ eyes && ( - (ts[idx + 2] ∈ mouth && return(flushaboutindex!(ts, idx + 2))) || + (ts[idx + 2] ∈ mouth && return flushaboutindex!(ts, idx + 2)) || (idx + 3 <= length(ts.input) && ts[idx + 2] ∈ nose && - ts[idx + 3] ∈ mouth && return(flushaboutindex!(ts, idx + 3))) || + ts[idx + 3] ∈ mouth && return flushaboutindex!(ts, idx + 3)) || return false ) - ts[idx] == '<' && ts[idx + 1] == '3' && return(flushaboutindex!(ts, idx + 1)) + ts[idx] == '<' && ts[idx + 1] == '3' && return flushaboutindex!(ts, idx + 1) return false end @@ -389,13 +389,13 @@ function emoticonsreverse(ts) ts[idx] ∈ mouth && ( ts[idx + 1] ∈ eyes && ( - (ts[idx + 2] ∈ forehead && return(flushaboutindex!(ts, idx + 2))) || - return(flushaboutindex!(ts, idx+1)) + (ts[idx + 2] ∈ forehead && return flushaboutindex!(ts, idx + 2)) || + return flushaboutindex!(ts, idx+1) ) || ts[idx + 1] ∈ nose && ( ts[idx + 2] ∈ eyes && ( - (ts[idx + 3] ∈ forehead 
&& return(flushaboutindex!(ts, idx + 3))) || - return(flushaboutindex!(ts, idx + 3)) + (ts[idx + 3] ∈ forehead && return flushaboutindex!(ts, idx + 3)) || + return flushaboutindex!(ts, idx + 3) ) ) ) @@ -403,6 +403,53 @@ function emoticonsreverse(ts) return false end +""" + htmltags(ts::TokenBuffer) + +Matches the HTML tags which contain no space inside the tags. +""" +function htmltags(ts) + (ts.idx + 2 > length(ts.input) || ts[ts.idx] != '<' + || ts[ts.idx + 1] == '>') && return false + i = ts.idx + while i <= length(ts.input) && ts[i] != '>' + isspace(ts[]) && return false + i += 1 + end + i > length(ts.input) && return false + return flushaboutindex!(ts, i) +end + + +# To-Do : Find a way to make arrowsascii repeatedly check for recheck +""" + arrowsascii(ts::TokenBuffer) + +Matches the ascii arrows - made up of arrows like `<--` and `--->` +""" +function arrowsascii(ts) + ( + ts.idx + 1 > length(ts.input) || + ( + (ts[ts.idx] != '<' || ts[ts.idx + 1] != '-' ) && + (ts[ts.idx] != '-') + ) + ) && return false + + i = ts.idx + if ts[i] == '<' + i += 1 + while i <= length(ts.input) && ts[i] == '-' + i += 1 + end + return flushaboutindex!(ts, i - 1) + end + while i <= length(ts.input) && ts[i] == '-' + i += 1 + end + ts[ts.idx] == '>' && return flushaboutindex!(ts, i) +end + """ tweet_tokenize(input::AbstractString) => tokens @@ -462,8 +509,12 @@ function tweet_tokenize(source::AbstractString; # # while !isdone(ts) # spaces(ts) && continue + # urls(ts) || + # phonenumbers(ts) || # emoticons(ts) || # emoticonsreverse(ts) || + # htmltags(ts) || + # arrowsascii(ts) || # character(ts) # end # From 77b505ae4191df6086fcb8be803ea15d13b1e054 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 17 May 2019 23:01:28 +0530 Subject: [PATCH 20/41] Add functions for twitter hashtags and email addresses --- src/words/tweet_tokenizer.jl | 81 +++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 2ac3694..73aec86 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -450,6 +450,79 @@ function arrowsascii(ts) ts[ts.idx] == '>' && return flushaboutindex!(ts, i) end +# Checks the string till non word char appears, so takes relatively more time. +""" + emailaddresses(ts) + +Matches for email addresses. +""" +function emailaddresses(ts) + ts.idx + 5 >= length(ts.input) && return false + + i = ts.idx + + while i + 5 <= length(ts.input) && isascii(ts[i]) && + (isdigit(ts[i]) || islowercase(ts[i]) || + isuppercase(ts[i]) || ts[i] ∈ ['.', '+', '-', '_']) + i += 1 + end + + (i == ts.idx || ts[i] != '@') && return false + + i += 1 + j = i + + while i + 3 <= length(ts.input) && isascii(ts[i]) && + (isdigit(ts[i]) || islowercase(ts[i]) || + isuppercase(ts[i]) || ts[i] == '-' || ts == '_') + i += 1 + end + + (j == i || ts[i] != '.') && return false + + i += 1 + j = i + + while i <= length(ts.input) && isascii(ts[i]) && + (isdigit(ts[i]) || islowercase(ts[i]) || + isuppercase(ts[i]) || ts[i] ∈ ['.', '-', '_']) + i += 1 + end + + (j + 2 >= i && ts[i-1] != '.') && return flushaboutindex!(ts, i-1) + + return false +end + +""" + twitterhashtags(ts) + +Matches for twitter hashtags. 
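
For instance, `#JuliaLang`, `#new_year` and `#twenty-one` each match as a single
token, while a bare `#` or a hashtag with fewer than two trailing characters
(such as `#x`) does not (an illustrative sketch of this rule).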
+""" +function twitterhashtags(ts) + (ts.idx + 2 > length(ts.input) || ts[ts.idx] != '#' || + ts[ts.idx + 1] ∈ ['\'', '-']) && return false + + i = ts.idx + 1 + last_word_char = i + + while i <= length(ts.input) && isascii(ts[i]) && + (isdigit(ts[i]) || islowercase(ts[i]) || + isuppercase(ts[i]) || ts[i] ∈ ['_', '\'', '-']) + + if ts[i] ∉ ['\'', '-'] + last_word_char = i + end + + i += 1 + end + + last_word_char >= ts.idx + 2 && ts[ts.idx + 1] ∉ ['\'', '-'] && + ts[last_word_char] ∉ ['\'', '-'] && return flushaboutindex!(ts, last_word_char) + + return false +end + """ tweet_tokenize(input::AbstractString) => tokens @@ -507,14 +580,18 @@ function tweet_tokenize(source::AbstractString; # ts = TokenBuffer(safe_text) # isempty(safe_text) && return ts.tokens # + # # # To-Do: OpenQuotes and Closing quotes # while !isdone(ts) # spaces(ts) && continue - # urls(ts) || - # phonenumbers(ts) || + # # urls(ts) || + # # phonenumbers(ts) || # emoticons(ts) || # emoticonsreverse(ts) || # htmltags(ts) || + # twitterhashtags(ts) || + # # atoms(ts, []) || # arrowsascii(ts) || + # emailaddresses(ts) || # character(ts) # end # From 744030165b4d08bf3bd97c812fdb52800a7ef2a1 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 18 May 2019 18:17:50 +0530 Subject: [PATCH 21/41] Fix Bugs --- REQUIRE | 1 - src/words/tweet_tokenizer.jl | 18 ++++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/REQUIRE b/REQUIRE index fa2c15f..f21d402 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,4 +1,3 @@ julia 0.7 HTML_Entities StrTables -StringEncodings \ No newline at end of file diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 73aec86..6206b83 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -185,10 +185,9 @@ end """ lookbehind(ts::TokenBuffer) -A helper function for twitter_handle. Checks if the beginning of the detected +A helper function for strip_twitter_handle. Checks if the beginning of the detected handle is preceded by alphanumeric or special chars like('_', '!', '@', '#', '\$', '%', '&', '*') """ - function lookbehind(ts::TokenBuffer, match_pattern = ('_', '!', '@', '#', '$', '%', '&', '*')) ts.idx == 1 && return false @@ -201,12 +200,12 @@ end """ - twitter_handle(ts::TokenBuffer) + strip_twitter_handle(ts::TokenBuffer) For removing Twitter Handles. If it detects a twitter handle, then it jumps to makes the index of TokenBuffer to the desired location skipping the handle. """ -function twitter_handle(ts) +function strip_twitter_handle(ts) (ts.idx + 2 > length(ts.input) || ts.input[ts.idx] != '@' ) && return false lookbehind(ts) && return false @@ -313,7 +312,7 @@ function pre_process(input::AbstractString, strip_handle::Bool, reduce_len::Bool isempty(input) && return "" while !isdone(ts) - (strip_handle && twitter_handle(ts)) || # Remove username handles + (strip_handle && strip_twitter_handle(ts)) || # Remove username handles (reduce_len && reduce_all_repeated(ts)) || # Reduce Lengthening safe_text(ts) || # Shorten some sequences of characters character(ts) @@ -411,12 +410,14 @@ Matches the HTML tags which contain no space inside the tags. 
function htmltags(ts) (ts.idx + 2 > length(ts.input) || ts[ts.idx] != '<' || ts[ts.idx + 1] == '>') && return false + i = ts.idx while i <= length(ts.input) && ts[i] != '>' - isspace(ts[]) && return false + isspace(ts[i]) && return false i += 1 end i > length(ts.input) && return false + return flushaboutindex!(ts, i) end @@ -450,6 +451,7 @@ function arrowsascii(ts) ts[ts.idx] == '>' && return flushaboutindex!(ts, i) end + # Checks the string till non word char appears, so takes relatively more time. """ emailaddresses(ts) @@ -494,6 +496,7 @@ function emailaddresses(ts) return false end + """ twitterhashtags(ts) @@ -589,9 +592,12 @@ function tweet_tokenize(source::AbstractString; # emoticonsreverse(ts) || # htmltags(ts) || # twitterhashtags(ts) || + # twitterusername(ts) || + # ellipsis_dots(ts) || # # atoms(ts, []) || # arrowsascii(ts) || # emailaddresses(ts) || + # # words_including_apostrophe_dashes(ts) || # character(ts) # end # From 6e20d5d99b99106e7aa24de6b271b61a5eb3c9c6 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 18 May 2019 18:19:30 +0530 Subject: [PATCH 22/41] Add functions for twitterusernames and ellipses --- src/words/tweet_tokenizer.jl | 41 ++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 6206b83..51526d4 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -526,6 +526,47 @@ function twitterhashtags(ts) return false end +""" + twitterusername(ts) + +Matches for twitter usernames. +""" +function twitterusername(ts) + (ts.idx + 1 > length(ts.input) || ts[ts.idx] != '@' ) && return false + + i = ts.idx + 1 + while i <= length(ts.input) && isascii(ts[i]) && + (isdigit(ts[i]) || islowercase(ts[i]) || + isuppercase(ts[i]) || ts[i] == '_') + i += 1 + end + i > ts.idx + 1 && return flushaboutindex!(ts, i - 1) + + return false +end + +""" + ellipsis_dots(ts) + +Matches for ellipsis and dots, ignoring the spaces, tabs, newlines between them. +""" +function ellipsis_dots(ts) + (ts.idx + 1 > length(ts.input) || ts[ts.idx] != '.' ) && return false + + i = ts.idx + 1 + last_dot = ts.idx + + while i <= length(ts.input) && (isspace(ts[i]) || ts[i] == '.') + if ts[i] == '.' + last_dot = i + end + i += 1 + end + + last_dot != ts.idx && return flushaboutindex!(ts, last_dot) + + return false +end """ tweet_tokenize(input::AbstractString) => tokens From ecae2b985a81de27a3bc2f64ba14377bd908d057 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 19 May 2019 12:15:58 +0530 Subject: [PATCH 23/41] Fix bugs in emailaddresses --- src/words/tweet_tokenizer.jl | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 51526d4..deba837 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -459,22 +459,19 @@ end Matches for email addresses. 
""" function emailaddresses(ts) - ts.idx + 5 >= length(ts.input) && return false + ts.idx + 4 > length(ts.input) && return false i = ts.idx - - while i + 5 <= length(ts.input) && isascii(ts[i]) && + while i + 3 <= length(ts.input) && isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i]) || isuppercase(ts[i]) || ts[i] ∈ ['.', '+', '-', '_']) i += 1 end - (i == ts.idx || ts[i] != '@') && return false i += 1 j = i - - while i + 3 <= length(ts.input) && isascii(ts[i]) && + while i + 2 <= length(ts.input) && isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i]) || isuppercase(ts[i]) || ts[i] == '-' || ts == '_') i += 1 @@ -482,21 +479,27 @@ function emailaddresses(ts) (j == i || ts[i] != '.') && return false - i += 1 j = i + last_dot = i + i += 1 while i <= length(ts.input) && isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i]) || - isuppercase(ts[i]) || ts[i] ∈ ['.', '-', '_']) + isuppercase(ts[i]) || ts[i] ∈ ['-', '_']) + + if i + 1 < length(ts.input) && ts[i + 1] == '.' + i += 1 + last_dot = i + end + i += 1 end - (j + 2 >= i && ts[i-1] != '.') && return flushaboutindex!(ts, i-1) + i > last_dot + 1 && i > j + 2 && return flushaboutindex!(ts, i - 1) return false end - """ twitterhashtags(ts) From 7661b8d52c17103a54eb404ab73f52977744beab Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 19 May 2019 13:37:24 +0530 Subject: [PATCH 24/41] Update fast.jl, Support signs (+,-) in numbers --- src/words/fast.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/words/fast.jl b/src/words/fast.jl index a80236c..7fc0030 100644 --- a/src/words/fast.jl +++ b/src/words/fast.jl @@ -214,9 +214,13 @@ end Matches numbers such as `10,000.5`, preserving formatting. """ -function number(ts, sep = (':', ',', '\'', '.')) - isdigit(ts[]) || return false +function number(ts, sep = (':', ',', '\'', '.'); check_sign = false) i = ts.idx + if check_sign && ts[] ∈ ['+', '-'] + i += 1 + end + + i <= length(ts.input) && isdigit(ts[i]) || return false while i <= length(ts.input) && (isdigit(ts[i]) || (ts[i] in sep && i < length(ts.input) && isdigit(ts[i+1]))) i += 1 @@ -225,4 +229,3 @@ function number(ts, sep = (':', ',', '\'', '.')) ts.idx = i return true end - From 7d6fa210e26ea86cedf1d67c7a2cab2f54a61df8 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Tue, 21 May 2019 21:35:35 +0530 Subject: [PATCH 25/41] Switch to TokenBuffer for Tweet Tokenizer --- src/words/fast.jl | 2 +- src/words/tweet_tokenizer.jl | 94 +++++++++++++++++++++++++----------- 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/src/words/fast.jl b/src/words/fast.jl index 7fc0030..b5eebc4 100644 --- a/src/words/fast.jl +++ b/src/words/fast.jl @@ -216,7 +216,7 @@ Matches numbers such as `10,000.5`, preserving formatting. """ function number(ts, sep = (':', ',', '\'', '.'); check_sign = false) i = ts.idx - if check_sign && ts[] ∈ ['+', '-'] + if check_sign && ts[] ∈ ['+', '-'] && ( i == 1 || isspace(ts[i-1])) i += 1 end diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index deba837..08c0939 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -571,6 +571,42 @@ function ellipsis_dots(ts) return false end +""" + words_including_apostrophe_dashes(ts) + +TokenBuffer matcher for words that may or maynot have dashes or apostrophe in it. 
+""" +function words_including_apostrophe_dashes(ts) + (ts.idx + 1 > length(ts.input) || !(isascii(ts[ts.idx]) && + (islowercase(ts[ts.idx]) || isuppercase(ts[ts.idx]) + || isdigit(ts[ts.idx]) || ts[ts.idx] == '_' ))) && return false + + has_apostrophe_dashes = false + i = ts.idx + 1 + last_char = ts.idx + + if isuppercase(ts[ts.idx]) || islowercase(ts[ts.idx]) + while i <= length(ts.input) && isascii(ts[i]) && + (islowercase(ts[i]) || isuppercase(ts[i]) || ts[i] ∈ ['_', '\'', '-']) + if has_apostrophe_dashes == false && ts[i] ∈ ['\'', '-'] + has_apostrophe_dashes = true + else + last_char = i + end + i += 1 + end + end + + has_apostrophe_dashes && last_char != ts.idx && return flushaboutindex!(ts, last_char) + + while i <= length(ts.input) && isascii(ts[i]) && (isdigit(ts[i]) || + islowercase(ts[i]) || isuppercase(ts[i]) || ts[i] == '_') + i += 1 + end + + return flushaboutindex!(ts, i - 1) +end + """ tweet_tokenize(input::AbstractString) => tokens @@ -624,35 +660,35 @@ function tweet_tokenize(source::AbstractString; safe_text = pre_process(source, strip_handle, reduce_len) # The key tokenizing function begins - # ts = TokenBuffer(safe_text) - # isempty(safe_text) && return ts.tokens - # - # # # To-Do: OpenQuotes and Closing quotes - # while !isdone(ts) - # spaces(ts) && continue - # # urls(ts) || - # # phonenumbers(ts) || - # emoticons(ts) || - # emoticonsreverse(ts) || - # htmltags(ts) || - # twitterhashtags(ts) || - # twitterusername(ts) || - # ellipsis_dots(ts) || - # # atoms(ts, []) || - # arrowsascii(ts) || - # emailaddresses(ts) || - # # words_including_apostrophe_dashes(ts) || - # character(ts) - # end - # - # - # tokens = ts.tokens - # - tokens = collect((m.match for m in eachmatch(WORD_REGEX, - safe_text, - overlap=false))) - - # Alter the case with preserving it for emoji + ts = TokenBuffer(safe_text) + isempty(safe_text) && return ts.tokens + + # # To-Do: OpenQuotes and Closing quotes + while !isdone(ts) + spaces(ts) && continue + # urls(ts) || + emoticons(ts) || + emoticonsreverse(ts) || + htmltags(ts) || + twitterhashtags(ts) || + twitterusername(ts) || + ellipsis_dots(ts) || + arrowsascii(ts) || + emailaddresses(ts) || + # phonenumbers(ts) || # Phone numbers must be present above numbers. + atoms(ts, []) || + words_including_apostrophe_dashes(ts) || + number(ts, check_sign = true) || + character(ts) + end + + tokens = ts.tokens + + # tokens = collect((m.match for m in eachmatch(WORD_REGEX, + # safe_text, + # overlap=false))) + + # Alter the case with preserving it for emoji if !preserve_case for (index, word) in enumerate(tokens) if !occursin(EMOTICONS_REGEX, word) From 66adbf8693491ce821c9d4d76d1527e9441f6c2b Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 22 May 2019 17:41:36 +0530 Subject: [PATCH 26/41] Add TokenBuffer function for nltk's tweet tokenizer - phone numbers --- src/words/tweet_tokenizer.jl | 185 ++++++++++++++++++++++++++++++++++- 1 file changed, 182 insertions(+), 3 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 08c0939..e838384 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -607,6 +607,183 @@ function words_including_apostrophe_dashes(ts) return flushaboutindex!(ts, i - 1) end +""" + nltk_casual_phonenumbers(ts) + +The TokenBuffer function for nltk's tweet tokenizer regex for phonenumbers. 
+""" +function nltk_phonenumbers(ts) + (ts.idx + 5 > length(ts.input) || !(isdigit(ts[ts.idx]) || + ts[ts.idx] ∈ ['+', '('] )) && return false + + i = ts.idx + optional_1_confirmed = false + + # Checking for the part 1 of regex which is optional + if ts[i] == '+' + ts[i + 1] ∈ ['0', '1'] || return false + i += 2 + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + i + 5 > length(ts.input) && return false + + optional_1_confirmed = true + elseif ts[i] ∈ ['0', '1'] + i += 1 + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + i + 5 > length(ts.input) && return false + + if i - ts.idx > 1 || ts[i] == '(' + optional_1_confirmed = true + end + end + + if i == ts.idx || optional_1_confirmed + # This is called when either the first part is sure to present or absent, otherwise next one called + if ts[i] == '(' + i += 1 + + for repeat in 1:2 # repeat is unused variable inside loop + if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2])) + + return false + end + + i += 3 + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + end + + !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) && isdigit(ts[i + 3])) && return false + + return flushaboutindex!(ts, i + 3) + else + if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2])) + + return false + end + i += 3 + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2])) + + return false + end + i += 3 + j = i + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) && isdigit(ts[i + 3]) && return flushaboutindex!(ts, i + 3) + + isdigit(ts[j]) && return flushaboutindex!(ts, j) + + return false + end + else + # Checks if the pattern fits with or without part 1, if both do then go for bigger one. + index_including_1 = 0 + index_excluding_1 = 0 + j = i + + # Checking if including the first optional part of regex matches the pattern. + + if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2])) + index_including_1 = -1 + end + i += 3 + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2])) + index_including_1 = -1 + end + i += 3 + j = i + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + if i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) && isdigit(ts[i + 3]) && index_including_1 == 0 + index_including_1 = i + 3 + elseif isdigit(ts[j]) && index_including_1 == 0 + index_including_1 = j + end + + # Checking if including the first optional part of regex matches the pattern. 
+ i = ts.idx + + if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2])) + index_excluding_1 = -1 + end + i += 3 + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2])) + index_excluding_1 = -1 + end + i += 3 + j = i + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + if i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) && isdigit(ts[i + 3]) && index_excluding_1 == 0 + index_excluding_1 = i + 3 + elseif isdigit(ts[j]) && index_excluding_1 == 0 + index_excluding_1 = j + end + + # Flushing out the bigger of the two. + index_including_1 <= 0 && index_excluding_1 <= 0 && return false + index_excluding_1 > index_including_1 && return flushaboutindex!(ts, index_excluding_1) + return flushaboutindex!(ts, index_including_1) + end + + return false +end + +""" + extra_phonenumbers(ts) + +Extra matching patterns for phone numbers. +""" +function extra_phonenumbers(ts) + return false +end + + """ tweet_tokenize(input::AbstractString) => tokens @@ -652,6 +829,8 @@ function tweet_tokenize(source::AbstractString; reduce_len=false, preserve_case=true ) + phonenumbers(ts) = nltk_phonenumbers(ts) || extra_phonenumbers(ts) + length(source) == 0 && return [] # Fix HTML Character entities source = replace_html_entities(source) @@ -666,7 +845,7 @@ function tweet_tokenize(source::AbstractString; # # To-Do: OpenQuotes and Closing quotes while !isdone(ts) spaces(ts) && continue - # urls(ts) || + # urls(ts) || # urls must be called before words. emoticons(ts) || emoticonsreverse(ts) || htmltags(ts) || @@ -674,8 +853,8 @@ function tweet_tokenize(source::AbstractString; twitterusername(ts) || ellipsis_dots(ts) || arrowsascii(ts) || - emailaddresses(ts) || - # phonenumbers(ts) || # Phone numbers must be present above numbers. + emailaddresses(ts) || # emailaddresses must be called before words + phonenumbers(ts) || # Phone numbers must be called before numbers. atoms(ts, []) || words_including_apostrophe_dashes(ts) || number(ts, check_sign = true) || From a6de434680380e27199ec3ac7523d115227b9fcc Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 24 May 2019 16:04:53 +0530 Subject: [PATCH 27/41] Add nltk_url1 --- src/words/tweet_tokenizer.jl | 159 ++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 4 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index e838384..57f4abc 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -783,6 +783,156 @@ function extra_phonenumbers(ts) return false end +""" + nltk_url1(ts) + +Matches the url patterns starting with `http/https`. +""" +function nltk_url1(ts) + ts.idx + 3 > length(ts.input) && return false + i = ts.idx + + if ts[i:i+3] == ['h', 't', 't', 'p'] # Check if url starts with pattern - https?:(?:\/{1,3}|[a-z0-9%]) + i += 4 + i + 2 > length(ts.input) && return false + + if ts[i] == 's' + i += 1 + end + + ts[i] == ':' || return false + i += 1 + + if i >= length(ts.input) || !(isascii(ts[i]) && (islowercase(ts[i]) || + isdigit(ts[i]) || ts[i] == '%' || ts[i] == '/')) + return false + end + + i += 1 + else # Check if url starts with the regex pattern - [a-z0-9.\-]+[.](?:[a-z]{2,13})\/ + last_dot = ts.idx + + while i <= length(ts.input) && isascii(ts[i]) && (islowercase(ts[i]) || isdigit(ts[i]) || + ts[i] == '.' || ts[i] == '-') + if ts[i] == '.' 
+ last_dot = i + elseif !islowercase(ts[i]) + last_dot = ts.idx + end + + i += 1 + end + + if i + 2 > length(ts.input) || last_dot <= ts.idx + 1 || i - last_dot > 14 || + i - last_dot <= 2 || ts[i] != '/' + return false + end + i += 1 + end + + return true + # URL is supposed to have 2 more parts. + # The first of these parts occuring at least once and second one exactly once. + # After every match of the first part, we keep a track if the second one follows it. + # and store the maximum index in `index_matched`. We flush about the index = index_matched then. + + # index_matched = ts.idx + # + # while i + 1 <= length(ts.input) && !(isspace(ts[i])) + # if ts[i] == '(' + # i += 1 + # (i > length(ts.idx) || isspace(ts[i])) && break + # j = i + # + # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + # j += 1 + # end + # + # (j > length(ts.idx) || isspace(ts[j])) && break + # + # if ts[j] == ')' + # j - i <= 1 && break + # i = j + # else + # i = j + # + # i > length(ts.idx) && break + # + # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + # j += 1 + # end + # + # (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break + # + # j - i <= 1 && break + # j += 1 + # + # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + # j += 1 + # end + # + # (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break + # i = j + # end + # i += 1 + # else + # (isspace(ts[i])|| ts[i] ∈ [')', '<', '>', '{', '}', '[', ']'] ) && break + # i += 1 + # end + # + # i > length(ts.length) && break + # + # Might have error as i is increasing instead, use another variable j to calculate index_matched. + # if ts[i] == '(' + # i += 1 + # (i > length(ts.idx) || isspace(ts[i])) && break + # j = i + # + # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + # j += 1 + # end + # + # (j > length(ts.idx) || isspace(ts[j])) && break + # + # if ts[j] == ')' + # j - i <= 1 && break + # i = j + # else + # i = j + # + # i > length(ts.idx) && break + # + # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + # j += 1 + # end + # + # (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break + # + # j - i <= 1 && break + # j += 1 + # + # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + # j += 1 + # end + # + # (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break + # i = j + # end + # index_matched = i + # i += 1 + # else + # (isspace(ts[i])|| ts[i] ∈ ['`', '!', ')', '[', ']', '{', '}', ';', + # ':', '\'', '"', '.', ',', '<', '>', '?', + # '«', '»', '“', '”', '‘', '’'] ) && break + # index_matched = i + # i += 1 + # end + # end + + index_matched == ts.idx && return false + return flushaboutindex!(ts, index_matched) +end + """ tweet_tokenize(input::AbstractString) => tokens @@ -827,9 +977,10 @@ julia> tweet_tokenize("This is a cooool #dummysmiley: :-) :-P <3 and some arrows function tweet_tokenize(source::AbstractString; strip_handle=false, reduce_len=false, - preserve_case=true ) + preserve_case=true) phonenumbers(ts) = nltk_phonenumbers(ts) || extra_phonenumbers(ts) + # urls(ts) = nltk_url1(ts) || nltk_url2(ts) length(source) == 0 && return [] # Fix HTML Character entities @@ -845,14 +996,14 @@ function tweet_tokenize(source::AbstractString; # # To-Do: OpenQuotes and Closing quotes while !isdone(ts) spaces(ts) && continue - # urls(ts) || # urls must be called before words. 
emoticons(ts) || emoticonsreverse(ts) || htmltags(ts) || + arrowsascii(ts) || twitterhashtags(ts) || - twitterusername(ts) || ellipsis_dots(ts) || - arrowsascii(ts) || + # urls(ts) || # urls must be called before words. + twitterusername(ts) || emailaddresses(ts) || # emailaddresses must be called before words phonenumbers(ts) || # Phone numbers must be called before numbers. atoms(ts, []) || From b9f0c4499841575a0d3164bfb4333b4eab787ca3 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 24 May 2019 18:29:52 +0530 Subject: [PATCH 28/41] Finish nltk_url1 --- src/words/tweet_tokenizer.jl | 207 +++++++++++++++++++---------------- 1 file changed, 110 insertions(+), 97 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 57f4abc..65d13e9 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -792,6 +792,7 @@ function nltk_url1(ts) ts.idx + 3 > length(ts.input) && return false i = ts.idx + # Checking for part 1 of regex if ts[i:i+3] == ['h', 't', 't', 'p'] # Check if url starts with pattern - https?:(?:\/{1,3}|[a-z0-9%]) i += 4 i + 2 > length(ts.input) && return false @@ -830,109 +831,121 @@ function nltk_url1(ts) i += 1 end - return true # URL is supposed to have 2 more parts. - # The first of these parts occuring at least once and second one exactly once. + # Both Part 2 and Part 3 each having 3 possible alternatives. + # Part 2 occurs at least once and Part 3 exactly once. # After every match of the first part, we keep a track if the second one follows it. - # and store the maximum index in `index_matched`. We flush about the index = index_matched then. - - # index_matched = ts.idx - # - # while i + 1 <= length(ts.input) && !(isspace(ts[i])) - # if ts[i] == '(' - # i += 1 - # (i > length(ts.idx) || isspace(ts[i])) && break - # j = i - # - # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) - # j += 1 - # end - # - # (j > length(ts.idx) || isspace(ts[j])) && break - # - # if ts[j] == ')' - # j - i <= 1 && break - # i = j - # else - # i = j - # - # i > length(ts.idx) && break - # - # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) - # j += 1 - # end - # - # (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break - # - # j - i <= 1 && break - # j += 1 - # - # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) - # j += 1 - # end - # - # (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break - # i = j - # end - # i += 1 - # else - # (isspace(ts[i])|| ts[i] ∈ [')', '<', '>', '{', '}', '[', ']'] ) && break - # i += 1 - # end - # - # i > length(ts.length) && break - # - # Might have error as i is increasing instead, use another variable j to calculate index_matched. 
- # if ts[i] == '(' - # i += 1 - # (i > length(ts.idx) || isspace(ts[i])) && break - # j = i - # - # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) - # j += 1 - # end - # - # (j > length(ts.idx) || isspace(ts[j])) && break - # - # if ts[j] == ')' - # j - i <= 1 && break - # i = j - # else - # i = j - # - # i > length(ts.idx) && break - # - # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) - # j += 1 - # end - # - # (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break - # - # j - i <= 1 && break - # j += 1 - # - # while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) - # j += 1 - # end - # - # (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break - # i = j - # end - # index_matched = i - # i += 1 - # else - # (isspace(ts[i])|| ts[i] ∈ ['`', '!', ')', '[', ']', '{', '}', ';', - # ':', '\'', '"', '.', ',', '<', '>', '?', - # '«', '»', '“', '”', '‘', '’'] ) && break - # index_matched = i - # i += 1 - # end - # end + # and store the maximum index in `index_matched`. + # Finally, we flush about the index = index_matched then. + + index_matched = ts.idx + + while i + 1 <= length(ts.input) && !(isspace(ts[i])) + + # Check if part 2 matches otherwise break. + # Part 2 could be one of the three patterns. + # i. ` \([^\s]+?\)` + # ii. `\([^\s()]*?\([^\s()]+?\)[^\s()]*?\)` + # iii. `[^\s()<>{}\[\]]+` + if ts[i] == '(' # Checking for i. and ii. above. + i += 1 + (i > length(ts.idx) || isspace(ts[i])) && break + j = i + + while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + j += 1 + end + + (j > length(ts.idx) || isspace(ts[j])) && break + + if ts[j] == ')' # Checking for i. + j - i <= 1 && break + i = j + else # Checking for ii. + i = j + i > length(ts.idx) && break + + while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + j += 1 + end + + (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break + j - i <= 1 && break + j += 1 + + while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + j += 1 + end + + (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break + i = j + end + i += 1 + else # Checking for iii. + (isspace(ts[i])|| ts[i] ∈ [')', '<', '>', '{', '}', '[', ']'] ) && break + i += 1 + end + + i > length(ts.input) && break + k = i # Just for temporarily storing i. + + # Check if part 3 matches otherwise continue. + # Part 3 could be one of the three patterns. + # i. `\([^\s()]*?\([^\s()]+?\)[^\s()]*?\)` + # ii. `[^\s`!()\[\]{};:'".,<>?«»“”‘’]` + # iii. ` \([^\s]+?\)` + if ts[i] == '(' # Check for part i. and iii. + + i += 1 + (i > length(ts.idx) || isspace(ts[i])) && continue + j = i + + while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + j += 1 + end + + (j > length(ts.idx) || isspace(ts[j])) && continue + + if ts[j] == ')' # Check for part iii. + j - i <= 1 && break + i = j + else # Check for part i. + i = j + i > length(ts.idx) && continue + + while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + j += 1 + end + + (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && continue + j - i <= 1 && continue + + j += 1 + while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + j += 1 + end + + (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && continue + i = j + end + index_matched = i + i += 1 + else # Check for part ii. 
+ isspace(ts[i]) && break + ts[i] ∈ ['`', '!', ')', '[', ']', '{', '}', ';', ':', '\'', '"', '.', + ',', '<', '>', '?', '«', '»', '“', '”', '‘', '’'] && continue + index_matched = i + end + i = k + end index_matched == ts.idx && return false return flushaboutindex!(ts, index_matched) end +function nltk_url2(ts) + return false +end """ tweet_tokenize(input::AbstractString) => tokens @@ -993,7 +1006,7 @@ function tweet_tokenize(source::AbstractString; ts = TokenBuffer(safe_text) isempty(safe_text) && return ts.tokens - # # To-Do: OpenQuotes and Closing quotes + # # TODO: OpenQuotes and Closing quotes while !isdone(ts) spaces(ts) && continue emoticons(ts) || From 040368bcc6ed19be82284437453913119335b4cb Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 24 May 2019 18:33:42 +0530 Subject: [PATCH 29/41] Add urls to tweet Tokenizer --- src/words/tweet_tokenizer.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 65d13e9..5e5f1e2 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -993,7 +993,7 @@ function tweet_tokenize(source::AbstractString; preserve_case=true) phonenumbers(ts) = nltk_phonenumbers(ts) || extra_phonenumbers(ts) - # urls(ts) = nltk_url1(ts) || nltk_url2(ts) + urls(ts) = nltk_url1(ts) || nltk_url2(ts) length(source) == 0 && return [] # Fix HTML Character entities @@ -1015,7 +1015,7 @@ function tweet_tokenize(source::AbstractString; arrowsascii(ts) || twitterhashtags(ts) || ellipsis_dots(ts) || - # urls(ts) || # urls must be called before words. + urls(ts) || # urls must be called before words. twitterusername(ts) || emailaddresses(ts) || # emailaddresses must be called before words phonenumbers(ts) || # Phone numbers must be called before numbers. 
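
A note on the dispatch chain wired up in the patch above: every lexer follows the same contract — on a match it consumes a span of input and returns `true` (so the `||` chain short-circuits), otherwise it returns `false` and the next lexer is tried. All of the matchers in this series signal a match through `flushaboutindex!`. Below is a minimal sketch of what such a helper does, assuming the `TokenBuffer` fields used throughout `src/words/fast.jl` (`input`, `buffer`, `tokens`, `idx`); it is an illustration of the contract, not necessarily the package's verbatim definition.

```julia
# Sketch (assumed, not the exact package code): emit everything from the
# current cursor position up to `uptoidx` as one token, then move past it.
function flushaboutindex!(ts, uptoidx)
    # flush! first empties any pending characters in ts.buffer as their own
    # token, then pushes the matched span as the next token.
    flush!(ts, String(ts.input[ts.idx:uptoidx]))
    ts.idx = uptoidx + 1
    return true  # returning true stops the `||` dispatch chain for this position
end
```
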
From 697dee4fde0a7ad79361741f9670975321128448 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 24 May 2019 20:40:07 +0530 Subject: [PATCH 30/41] Remove option of converting to lowercase --- src/words/tweet_tokenizer.jl | 21 +++------------------ test/tweet_tokenize.jl | 25 +++++++++++-------------- 2 files changed, 14 insertions(+), 32 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 5e5f1e2..53b9440 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -987,16 +987,14 @@ julia> tweet_tokenize("This is a cooool #dummysmiley: :-) :-P <3 and some arrows "<--" ``` """ -function tweet_tokenize(source::AbstractString; - strip_handle=false, - reduce_len=false, - preserve_case=true) +function tweet_tokenize(source::AbstractString; strip_handle=false, + reduce_len=false) phonenumbers(ts) = nltk_phonenumbers(ts) || extra_phonenumbers(ts) urls(ts) = nltk_url1(ts) || nltk_url2(ts) - length(source) == 0 && return [] # Fix HTML Character entities + length(source) == 0 && return [] source = replace_html_entities(source) length(source) == 0 && return [] @@ -1027,18 +1025,5 @@ function tweet_tokenize(source::AbstractString; tokens = ts.tokens - # tokens = collect((m.match for m in eachmatch(WORD_REGEX, - # safe_text, - # overlap=false))) - - # Alter the case with preserving it for emoji - if !preserve_case - for (index, word) in enumerate(tokens) - if !occursin(EMOTICONS_REGEX, word) - tokens[index] = lowercase(word) - end - end - end - return tokens end diff --git a/test/tweet_tokenize.jl b/test/tweet_tokenize.jl index 6bf3743..54c29e2 100644 --- a/test/tweet_tokenize.jl +++ b/test/tweet_tokenize.jl @@ -3,7 +3,6 @@ using WordTokenizers @testset "Tweet Tokenize" begin @testset "Basic Tests" begin - s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" @test tweet_tokenize(s0) == ["This", "is", "a", "cooool", "#dummysmiley", ":", ":-)", ":-P", "<3", "and", "some", "arrows", "<", ">", "->", "<--"] @@ -27,27 +26,25 @@ using WordTokenizers s5 = "@crushinghes the summer holidays are great but I'm so bored already :(" @test tweet_tokenize(s5, reduce_len=true) == ["@crushinghes", "the", "summer", "holidays", "are", "great", "but", "I'm", "so", "bored", "already", ":("] + + s6 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P" + @test tweet_tokenize(s6) == + ["@jrmy", ":", "I'm", "REALLY", "HAPPYYY", "about", "that", "!", "NICEEEE", ":D", ":P"] end @testset "Remove Handles and Reduce Length" begin - s6 = "@remy: This is waaaaayyyy too much for you!!!!!!" - @test tweet_tokenize(s6, strip_handle=true, reduce_len=true) == + s7 = "@remy: This is waaaaayyyy too much for you!!!!!!" + @test tweet_tokenize(s7, strip_handle=true, reduce_len=true) == [":", "This", "is", "waaayyy", "too", "much", "for", "you", "!", "!", "!"] - s7 = "@_willy65: No place for @chuck tonight. Sorry." - @test tweet_tokenize(s7, strip_handle=true, reduce_len=true) == + s8 = "@_willy65: No place for @chuck tonight. Sorry." + @test tweet_tokenize(s8, strip_handle=true, reduce_len=true) == [":", "No", "place", "for", "tonight", ".", "Sorry", "."] - s8 = "@mar_tin is a great developer. Contact him at mar_tin@email.com." - @test tweet_tokenize(s8, strip_handle=true, reduce_len=true) == + s9 = "@mar_tin is a great developer. Contact him at mar_tin@email.com." 
+ @test tweet_tokenize(s9, strip_handle=true, reduce_len=true) == ["is", "a", "great", "developer", ".", "Contact", "him", "at", "mar_tin@email.com", "."] - end - - @testset "Preserve Case" begin - s9 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P" - @test tweet_tokenize(s9, preserve_case=false) == - ["@jrmy", ":", "i'm", "really", "happyyy", "about", "that", "!", "niceeee", ":D", ":P"] - end + end @testset "Test long sentences" begin s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L" From 927e4b384d428ffc3d673b5dd9a8bfbd9783c092 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 24 May 2019 20:49:49 +0530 Subject: [PATCH 31/41] Remove regex patterns --- src/words/tweet_tokenizer.jl | 105 ----------------------------------- 1 file changed, 105 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 53b9440..7b16ccc 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -1,108 +1,3 @@ -const EMOTICONS_REGEX = r"""(?x) - (?: - [<>]? - [:;=8] - [\-o\*\']? - [\)\]\(\[dDpP/\:\}\{@\|\\] - | - [\)\]\(\[dDpP/\:\}\{@\|\\] - [\-o\*\']? - [:;=8] - [<>]? - | - <3 - )""" - - -const URLS = r"""(?x) - (?: - https?: - (?: - /{1,3} - | - [a-z0-9%] - ) - | - [a-z0-9.\-]+[.] - (?:[a-z]{2,13}) - / - ) - (?: - [^\s()<>{}\[\]]+ - | - \([^\s()]*?\([^\s()]+\)[^\s()]*?\) - | - \([^\s]+?\) - )+ - (?: - \([^\s()]*?\([^\s()]+\)[^\s()]*?\) - | - \([^\s]+?\) - | - [^\s`!()\[\]{};:'".,<>?«»“”‘’] - ) - | - (?: - (?\s]+>""" -const ASCII_ARROWS = r"""[\-]+>|<[\-]+""" -const TWITTER_USERNAME = r"""(?:@[\w_]+)""" -const TWITTER_HASHTAGS = r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" -const EMAIL_ADDRESSES = r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""" -const WORDS_WITH_APOSTROPHE_DASHES = r"""(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])""" -const NUMBERS_FRACTIONS_DECIMALS = r"""(?:[+\-]?\d+[,/.:-]\d+[+\-]?)""" -const ELLIPSIS_DOTS = r"""(?:\.(?:\s*\.){1,})""" -const WORDS_WITHOUT_APOSTROPHE_DASHES = r"""(?:[\w_]+)""" - - -# Core tokenizing regex -const WORD_REGEX = Regex("(?i:" * join([URLS.pattern - PHONE_NUMBERS.pattern - EMOTICONS_REGEX.pattern - HTML_TAGS.pattern - ASCII_ARROWS.pattern - TWITTER_USERNAME.pattern - TWITTER_HASHTAGS.pattern - EMAIL_ADDRESSES.pattern - WORDS_WITH_APOSTROPHE_DASHES.pattern - NUMBERS_FRACTIONS_DECIMALS.pattern - WORDS_WITHOUT_APOSTROPHE_DASHES.pattern - ELLIPSIS_DOTS.pattern - r"(?:\S)".pattern - ], "|") - * ")" - ) - -const HANG_REGEX = r"""([^a-zA-Z0-9])\1{3,}""" - """ html_entities(ts::TokenBuffer; remove_illegal=true) From 75db8135e03bb6dca7a8877a37ce0f94d7826a5d Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 31 May 2019 17:21:19 +0530 Subject: [PATCH 32/41] Fix Bugs in tweet tokenizing functions --- src/words/tweet_tokenizer.jl | 127 +++++++++++++++++------------------ 1 file changed, 61 insertions(+), 66 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 7b16ccc..b52cbc6 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -281,20 +281,20 @@ function emoticonsreverse(ts) ts.idx + 1 > length(ts.input) && return false idx = ts.idx - ts[idx] ∈ mouth && ( - ts[idx + 1] ∈ eyes && ( - (ts[idx + 2] ∈ forehead && return flushaboutindex!(ts, idx + 2)) || - return flushaboutindex!(ts, idx+1) - ) || - ts[idx + 1] ∈ nose && ( - ts[idx + 2] ∈ eyes && ( - (ts[idx + 3] ∈ forehead && return flushaboutindex!(ts, idx + 3)) || - return flushaboutindex!(ts, idx + 3) - ) - ) - ) + ts[idx] ∈ mouth 
|| return false + idx += 1 - return false + if ts[idx] ∈ nose + idx >= length(ts.input) && return false + idx += 1 + end + + ts[idx] ∈ eyes|| return false + idx += 1 + + idx <= length(ts.input) && ts[idx] ∈ forehead && return flushaboutindex!(ts, idx) + + return flushaboutindex!(ts, idx -1 ) end """ @@ -316,8 +316,6 @@ function htmltags(ts) return flushaboutindex!(ts, i) end - -# To-Do : Find a way to make arrowsascii repeatedly check for recheck """ arrowsascii(ts::TokenBuffer) @@ -594,72 +592,69 @@ function nltk_phonenumbers(ts) return false end else - # Checks if the pattern fits with or without part 1, if both do then go for bigger one. - index_including_1 = 0 - index_excluding_1 = 0 - j = i + function index_including_part_1(ts, i) + index_including_1 = 0 + j = i - # Checking if including the first optional part of regex matches the pattern. + i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) || return -1 - if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && - isdigit(ts[i + 2])) - index_including_1 = -1 - end - i += 3 + i += 3 - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] - i += 1 - end + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end - if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && - isdigit(ts[i + 2])) - index_including_1 = -1 - end - i += 3 - j = i + i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) || return -1 - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] - i += 1 - end + i += 3 + j = i - if i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && - isdigit(ts[i + 2]) && isdigit(ts[i + 3]) && index_including_1 == 0 - index_including_1 = i + 3 - elseif isdigit(ts[j]) && index_including_1 == 0 - index_including_1 = j - end + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end - # Checking if including the first optional part of regex matches the pattern. 
- i = ts.idx + i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) && isdigit(ts[i + 3]) && return i + 3 - if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && - isdigit(ts[i + 2])) - index_excluding_1 = -1 - end - i += 3 + isdigit(ts[j]) && return j - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] - i += 1 + return -1 end - if !(i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && - isdigit(ts[i + 2])) - index_excluding_1 = -1 - end - i += 3 - j = i + function index_excluding_part_1(ts) + i = ts.idx - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] - i += 1 - end + i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) || return -1 + + i += 3 + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) || return -1 + + i += 3 + j = i + + while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + i += 1 + end + + i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && + isdigit(ts[i + 2]) && isdigit(ts[i + 3]) && return i + 3 + + isdigit(ts[j]) && return j - if i + 3 <= length(ts.input) && isdigit(ts[i]) && isdigit(ts[i + 1]) && - isdigit(ts[i + 2]) && isdigit(ts[i + 3]) && index_excluding_1 == 0 - index_excluding_1 = i + 3 - elseif isdigit(ts[j]) && index_excluding_1 == 0 - index_excluding_1 = j + return -1 end + index_excluding_1 = index_excluding_part_1(ts) + index_including_1 = index_including_part_1(ts, i) # Flushing out the bigger of the two. index_including_1 <= 0 && index_excluding_1 <= 0 && return false index_excluding_1 > index_including_1 && return flushaboutindex!(ts, index_excluding_1) From ce7c74bfcf3afc8a398c0fbe26baf1c793e3487f Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Fri, 31 May 2019 21:29:09 +0530 Subject: [PATCH 33/41] Finish nltk url function --- src/words/tweet_tokenizer.jl | 36 +++++++++++++++++++++++++++++++++++- test/tweet_tokenize.jl | 3 +-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index b52cbc6..7a4dc45 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -344,7 +344,7 @@ function arrowsascii(ts) ts[ts.idx] == '>' && return flushaboutindex!(ts, i) end - +# TODO: integrate this with words_including_apostrophe_dashes() to reduce time taken. # Checks the string till non word char appears, so takes relatively more time. """ emailaddresses(ts) @@ -834,6 +834,40 @@ function nltk_url1(ts) end function nltk_url2(ts) + (ts.idx > length(ts.input) || (ts.idx > 1 && ts[ts.idx - 1] == '@')) && return false + + (isascii(ts[ts.idx]) && (isdigit(ts[ts.idx]) || islowercase(ts[ts.idx]))) || return false + + i = ts.idx + 1 + while i <= length(ts.input) && isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i])) + i += 1 + end + + flush_about = 0 + + while i < length(ts.input) && ts[i] ∈ ['.', '-'] + j = ts[i] == '.' ? 
i : 0 + i += 1 + + (isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i]))) || break + i += 1 + + while i <= length(ts.input) && isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i])) + i += 1 + end + + if j != 0 && 3 <= i - j <= 14 + if i <= length(ts.input) && ts[i] == '/' && (i + 1 > length(ts.input) || ts[i + 1] != '@') + flush_about = i + break + end + flush_about = i - 1 + end + end + + flush_about == 0 && return false + (flush_about >= length(ts.input) || ts[flush_about + 1] != '@') && return flushaboutindex!(ts, flush_about) + return false end diff --git a/test/tweet_tokenize.jl b/test/tweet_tokenize.jl index 54c29e2..2238930 100644 --- a/test/tweet_tokenize.jl +++ b/test/tweet_tokenize.jl @@ -26,7 +26,7 @@ using WordTokenizers s5 = "@crushinghes the summer holidays are great but I'm so bored already :(" @test tweet_tokenize(s5, reduce_len=true) == ["@crushinghes", "the", "summer", "holidays", "are", "great", "but", "I'm", "so", "bored", "already", ":("] - + s6 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P" @test tweet_tokenize(s6) == ["@jrmy", ":", "I'm", "REALLY", "HAPPYYY", "about", "that", "!", "NICEEEE", ":D", ":P"] @@ -74,5 +74,4 @@ end @test tweet_tokenize("Check out this symbol in Windows-1252 encoding €") == [ "Check", "out", "this", "symbol", "in", "Windows", "-", "1252", "encoding", "€"] - end From d9a019f4eaa1fc49108b3039f409e8de5b393d77 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 2 Jun 2019 20:58:17 +0530 Subject: [PATCH 34/41] Add tests --- test/tweet_tokenize.jl | 43 +++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/test/tweet_tokenize.jl b/test/tweet_tokenize.jl index 2238930..2302909 100644 --- a/test/tweet_tokenize.jl +++ b/test/tweet_tokenize.jl @@ -51,27 +51,52 @@ using WordTokenizers @test tweet_tokenize(s10) == ["Photo", ":", "Aujourd'hui", "sur", "http://t.co/0gebOFDUzn", "Projet", "...", "http://t.co/bKfIUbydz2", "...", "http://fb.me/3b6uXpz0L"] end -end -@testset "Replace HTML Entities" begin - @test tweet_tokenize("An HTML Entity - Δ") == + @testset "Replace HTML Entities" begin + @test tweet_tokenize("An HTML Entity - Δ") == ["An", "HTML", "Entity", "-", "Δ"] - @test tweet_tokenize("Another HTML Entity - Δ") == + @test tweet_tokenize("Another HTML Entity - Δ") == ["Another", "HTML", "Entity", "-", "Δ"] - @test tweet_tokenize("Another HTML Entity - Δ") == + @test tweet_tokenize("Another HTML Entity - Δ") == ["Another", "HTML", "Entity", "-", "Δ"] - @test tweet_tokenize("Price: £100") == + @test tweet_tokenize("Price: £100") == [ "Price", ":", "£", "100"] - @test tweet_tokenize("Check out this invalid symbol ") == + @test tweet_tokenize("Check out this invalid symbol ") == [ "Check", "out", "this", "invalid", "symbol", "\u81"] - @test tweet_tokenize("A•B = B•A ") == + @test tweet_tokenize("A•B = B•A ") == [ "A", "•", "B", "=", "B", "•", "A"] - @test tweet_tokenize("Check out this symbol in Windows-1252 encoding €") == + @test tweet_tokenize("Check out this symbol in Windows-1252 encoding €") == [ "Check", "out", "this", "symbol", "in", "Windows", "-", "1252", "encoding", "€"] + end + + @testset "Token Buffer functions" begin + # Emoticons reverse and HTML_Entities + @test tweet_tokenize("(-:> (-: Hi there! 
") == + ["(-:>", "(-:", "", "Hi", "there", "!", ""] + + # Phone numbers and email + @test tweet_tokenize("+0 (123) 333-5553 and 1111222 are valid for ab@cd.e.") == + ["+0 (123) 333-5553", "and", "1111222", "are", "valid", "for", "ab", "@cd", ".", "e", "."] + + @test tweet_tokenize("+0 0------ 333 333 2222 333 33 3 33333") == + ["+0", "0--- 333 333 2222","333", "33", "3", "33333"] + + @test tweet_tokenize("11111112222 11112222") == + ["11111112222", "11112222"] + + # Hashtags and Twitter usernames + @test tweet_tokenize("#a- @ ") == + ["#", "a-", "@"] + + # URLs + @test tweet_tokenize("abc.com/(xyz)(xyz)a(a( and abc.co.com/ and http: ") == + ["abc.com/(xyz)(xyz)a", "(", "a", "(", "and", "abc.co.com/", "and", "http", ":"] + + end end From bebe5bd7e985494e5f39d88c903512c656932520 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 2 Jun 2019 20:59:45 +0530 Subject: [PATCH 35/41] Fix Bugs in tweet tokenizer --- src/words/tweet_tokenizer.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 7a4dc45..f18b595 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -653,8 +653,8 @@ function nltk_phonenumbers(ts) return -1 end - index_excluding_1 = index_excluding_part_1(ts) index_including_1 = index_including_part_1(ts, i) + index_excluding_1 = index_excluding_part_1(ts) # Flushing out the bigger of the two. index_including_1 <= 0 && index_excluding_1 <= 0 && return false index_excluding_1 > index_including_1 && return flushaboutindex!(ts, index_excluding_1) @@ -739,35 +739,35 @@ function nltk_url1(ts) # iii. `[^\s()<>{}\[\]]+` if ts[i] == '(' # Checking for i. and ii. above. i += 1 - (i > length(ts.idx) || isspace(ts[i])) && break + (i > length(ts.input) || isspace(ts[i])) && break j = i - while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + while j <= length(ts.input) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) j += 1 end - (j > length(ts.idx) || isspace(ts[j])) && break + (j > length(ts.input) || isspace(ts[j])) && break if ts[j] == ')' # Checking for i. j - i <= 1 && break i = j else # Checking for ii. i = j - i > length(ts.idx) && break + i > length(ts.input) && break - while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + while j <= length(ts.input) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) j += 1 end - (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break + (j > length(ts.input) || isspace(ts[j]) || ts[j] == '(') && break j - i <= 1 && break j += 1 - while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + while j <= length(ts.input) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) j += 1 end - (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && break + (j > length(ts.input) || isspace(ts[j]) || ts[j] == '(') && break i = j end i += 1 @@ -787,35 +787,35 @@ function nltk_url1(ts) if ts[i] == '(' # Check for part i. and iii. i += 1 - (i > length(ts.idx) || isspace(ts[i])) && continue + (i > length(ts.input) || isspace(ts[i])) && continue j = i - while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + while j <= length(ts.input) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) j += 1 end - (j > length(ts.idx) || isspace(ts[j])) && continue + (j > length(ts.input) || isspace(ts[j])) && continue if ts[j] == ')' # Check for part iii. j - i <= 1 && break i = j else # Check for part i. 
i = j - i > length(ts.idx) && continue + i > length(ts.input) && continue - while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + while j <= length(ts.input) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) j += 1 end - (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && continue + (j > length(ts.input) || isspace(ts[j]) || ts[j] == '(') && continue j - i <= 1 && continue j += 1 - while j <= length(ts.idx) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) + while j <= length(ts.input) && ts[j] != ')' && ts[j] != '(' && !isspace(ts[j]) j += 1 end - (j > length(ts.idx) || isspace(ts[j]) || ts[j] == '(') && continue + (j > length(ts.input) || isspace(ts[j]) || ts[j] == '(') && continue i = j end index_matched = i From e2120ad949ddeac936db1ebad381d9b62c3931fb Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 3 Jun 2019 15:30:07 +0530 Subject: [PATCH 36/41] Fix indentation --- src/words/fast.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/words/fast.jl b/src/words/fast.jl index b5eebc4..70c6e55 100644 --- a/src/words/fast.jl +++ b/src/words/fast.jl @@ -17,11 +17,11 @@ either skips whitespace or parses a number token, if possible. The simplest possible tokeniser accepts any `character` with no token breaks: function tokenise(input) - ts = TokenBuffer(input) - while !isdone(ts) - character(ts) - end - return ts.tokens + ts = TokenBuffer(input) + while !isdone(ts) + character(ts) + end + return ts.tokens end tokenise("foo bar baz") # ["foo bar baz"] @@ -29,11 +29,11 @@ The simplest possible tokeniser accepts any `character` with no token breaks: The second simplest splits only on spaces: function tokenise(input) - ts = TokenBuffer(input) - while !isdone(ts) - spaces(ts) || character(ts) - end - return ts.tokens + ts = TokenBuffer(input) + while !isdone(ts) + spaces(ts) || character(ts) + end + return ts.tokens end tokenise("foo bar baz") # ["foo", "bar", "baz"] From 1dc54454c80f2042dd15be98018b4fc8ad8bfc9d Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Mon, 3 Jun 2019 15:31:14 +0530 Subject: [PATCH 37/41] Update README for TokenBuffer --- README.md | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 95 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b4da7d9..930ef8a 100644 --- a/README.md +++ b/README.md @@ -65,13 +65,13 @@ The word tokenizers basically assume sentence splitting has already been done. - **Penn Tokenizer:** (`penn_tokenize`) This is Robert MacIntyre's orginal tokenizer used for the Penn Treebank. Splits contractions. - **Improved Penn Tokenizer:** (`improved_penn_tokenize`) NLTK's improved Penn Treebank Tokenizer. Very similar to the original, some improvements on punctuation and contractions. This matches to NLTK's `nltk.tokenize.TreeBankWordTokenizer.tokenize` - - **NLTK Word tokenizer:** (`nltk_word_tokenize`) NLTK's even more improved version of the Penn Tokenizer. This version has better unicode handling and some other changes. This matches to the most commonly used `nltk.word_tokenize`, minus the sentence tokenizing step. -- **Reversible Tokenizer:** (`rev_tokenize` and `rev_detokenize`) This tokenizer splits on punctuations, space and special symbols. The generated tokens can be de-tokenized by using the `rev_detokenizer` function into the state before tokenization. -- **TokTok Tokenizer:** (`toktok_tokenize`) This tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only final period is tokenized. 
Tok-tok has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. **(default tokenizer)** + - **NLTK Word tokenizer:** (`nltk_word_tokenize`) NLTK's even more improved version of the Penn Tokenizer. This version has better unicode handling and some other changes. This matches to the most commonly used `nltk.word_tokenize`, minus the sentence tokenizing step. (To me it seems like a weird historical thing that NLTK has 2 successive variation on improving the Penn tokenizer, but for now I am matching it and having both. See [[NLTK#2005]](https://github.com/nltk/nltk/issues/2005)) -- **Tweet Tokenizer:** (`tweet_tokenizer`) NLTK's casual tokenizer for that is solely designed for tweets. Apart from twitter specific, this tokenizer has good handling for emoticons, and other web aspects like support for HTML Entities. This closely matches NLTK's `nltk.tokenize.TweetTokenizer` + - **Reversible Tokenizer:** (`rev_tokenize` and `rev_detokenize`) This tokenizer splits on punctuations, space and special symbols. The generated tokens can be de-tokenized by using the `rev_detokenizer` function into the state before tokenization. + - **TokTok Tokenizer:** (`toktok_tokenize`) This tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only final period is tokenized. Tok-tok has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. **(default tokenizer)** + - **Tweet Tokenizer:** (`tweet_tokenizer`) NLTK's casual tokenizer for that is solely designed for tweets. Apart from twitter specific, this tokenizer has good handling for emoticons, and other web aspects like support for HTML Entities. This closely matches NLTK's `nltk.tokenize.TweetTokenizer` # Sentence Splitters @@ -114,3 +114,94 @@ So `split(foo, Words)` is the same as `tokenize(foo)`, and `split(foo, Sentences)` is the same as `split_sentences(foo)`. + +## Using TokenBuffer API for Custom Tokenizers +We offer a `TokenBuffer` API and supporting utility parsers +for high speed tokenization. + +The order in which the parsers are written needs to be taken care of in some cases- + +For example: `987-654-3210` matches as a phone number +as well as numbers, but number will only match upto `987` +and split about it. + +```julia +julia> using WordTokenizers: TokenBuffer, isdone, character, spaces, nltk_phonenumbers, number + +julia> order1(ts) = number(ts) || nltk_phonenumbers(ts) +order1 (generic function with 1 method) + +julia> order2(ts) = nltk_phonenumbers(ts) || number(ts) +order2 (generic function with 1 method) + +julia> function tokenize1(input) + ts = TokenBuffer(input) + while !isdone(ts) + order1(ts) || + character(ts) + end + return ts.tokens + end +tokenize1 (generic function with 1 method) + +julia> function tokenize2(input) + ts = TokenBuffer(input) + while !isdone(ts) + order2(ts) || + character(ts) + end + return ts.tokens + end +tokenize2 (generic function with 1 method) + +julia> tokenize1("987-654-3210") # number(ts) || nltk_phonenumbers(ts) +5-element Array{String,1}: + "987" + "-" + "654" + "-" + "3210" + +julia> tokenize2("987-654-3210") # nltk_phonenumbers(ts) || number(ts) +1-element Array{String,1}: + "987-654-3210" +``` + +#### Writing your own TokenBuffer parsers + +`TokenBuffer` turns a string into a readable stream, used for building tokenizers. 
+Utility parsers such as `spaces` and `number` read characters from the
+stream and into an array of tokens.
+
+Parsers return `true` or `false` to indicate whether they matched
+in the input stream. They can therefore be combined easily, e.g.
+
+    spacesornumber(ts) = spaces(ts) || number(ts)
+
+either skips whitespace or parses a number token, if possible.
+
+The simplest useful tokenizer splits on spaces.
+
+    using WordTokenizers: TokenBuffer, isdone, spaces, character
+
+    function tokenise(input)
+        ts = TokenBuffer(input)
+        while !isdone(ts)
+            spaces(ts) || character(ts)
+        end
+        return ts.tokens
+    end
+
+    tokenise("foo bar baz") # ["foo", "bar", "baz"]
+
+Many prewritten components for building custom tokenizers
+can be found in `src/words/fast.jl` and `src/words/tweet_tokenizer.jl`.
+These components can be mixed and matched to create more complex tokenizers.
+
+Here is a more complex example.
+
+```julia
+julia> using WordTokenizers: TokenBuffer, isdone, character, spaces # Present in fast.jl
+
+julia> using WordTokenizers: nltk_url1, nltk_url2, nltk_phonenumbers # Present in tweet_tokenizer.jl
+
+julia> function tokenize(input)
+           urls(ts) = nltk_url1(ts) || nltk_url2(ts)
+
+           ts = TokenBuffer(input)
+           while !isdone(ts)
+               spaces(ts) && continue
+               urls(ts) ||
+               nltk_phonenumbers(ts) ||
+               character(ts)
+           end
+           return ts.tokens
+       end
+tokenize (generic function with 1 method)
+
+julia> tokenize("A url https://github.com/JuliaText/WordTokenizers.jl/ and phonenumber +0 (987) - 2344321")
+6-element Array{String,1}:
+ "A"
+ "url"
+ "https://github.com/JuliaText/WordTokenizers.jl/" # URL detected.
+ "and"
+ "phonenumber"
+ "+0 (987) - 2344321" # Phone number detected.
+```
+
+#### Tips for writing custom tokenizers and your own TokenBuffer Parser/Feature
+
+1. 
The order in which the parsers are applied matters in some cases:

For example: `987-654-3210` matches as a phone number
as well as a number, but `number` will only match up to `987`
and split around it.

```julia
julia> using WordTokenizers: TokenBuffer, isdone, character, spaces, nltk_phonenumbers, number

julia> order1(ts) = number(ts) || nltk_phonenumbers(ts)
order1 (generic function with 1 method)

julia> order2(ts) = nltk_phonenumbers(ts) || number(ts)
order2 (generic function with 1 method)

julia> function tokenize1(input)
           ts = TokenBuffer(input)
           while !isdone(ts)
               order1(ts) ||
               character(ts)
           end
           return ts.tokens
       end
tokenize1 (generic function with 1 method)

julia> function tokenize2(input)
           ts = TokenBuffer(input)
           while !isdone(ts)
               order2(ts) ||
               character(ts)
           end
           return ts.tokens
       end
tokenize2 (generic function with 1 method)

julia> tokenize1("987-654-3210") # number(ts) || nltk_phonenumbers(ts)
5-element Array{String,1}:
 "987"
 "-"
 "654"
 "-"
 "3210"

julia> tokenize2("987-654-3210") # nltk_phonenumbers(ts) || number(ts)
1-element Array{String,1}:
 "987-654-3210"
```

-#### Writing your own TokenBuffer parsers
+2. `BoundsError` and errors while handling edge cases are the most common
+and need to be taken care of while writing the TokenBuffer parsers.

-`TokenBuffer` turns a string into a readable stream, used for building tokenizers.
-Utility parsers such as `spaces` and `number` read characters from the
-stream and into an array of tokens.
+3. For some TokenBuffer `ts`, use `flush!(ts)`
+over `push!(ts.tokens, input[i:j])`, to make sure that characters
+in the buffer (i.e. `ts.buffer`) also get flushed out as separate tokens.

-Parsers return `true` or `false` to indicate whether they matched
-in the input stream. They can therefore be combined easily, e.g.
+```julia
+julia> using WordTokenizers: TokenBuffer, flush!, spaces, character, isdone

-    spacesornumber(ts) = spaces(ts) || number(ts)
+julia> function tokenize(input)
+           ts = TokenBuffer(input)

-either skips whitespace or parses a number token, if possible.
+           while !isdone(ts)
+               spaces(ts) && continue
+               my_pattern(ts) ||
+               character(ts)
+           end
+           return ts.tokens
+       end

-The simplest possible tokenizer accepts any `character` with no token breaks:
+julia> function my_pattern(ts) # Matches the pattern for 2 consecutive `_`
+           ts.idx + 1 <= length(ts.input) || return false

-    function tokenise(input)
-      ts = TokenBuffer(input)
-      while !isdone(ts)
-        character(ts)
-      end
-      return ts.tokens
-    end
+           if ts[ts.idx] == '_' && ts[ts.idx + 1] == '_'
+               flush!(ts, "__") # Using flush!
+               ts.idx += 2
+               return true
+           end

-    tokenise("foo bar baz") # ["foo bar baz"]
+           return false
+       end
+my_pattern (generic function with 1 method)

-The second simplest splits only on spaces:
+julia> tokenize("hi__hello")
+3-element Array{String,1}:
+ "hi"
+ "__"
+ "hello"

-    function tokenise(input)
-      ts = TokenBuffer(input)
-      while !isdone(ts)
-        spaces(ts) || character(ts)
-      end
-      return ts.tokens
-    end
+julia> function my_pattern(ts) # Matches the pattern for 2 consecutive `_`
+           ts.idx + 1 <= length(ts.input) || return false

-    tokenise("foo bar baz") # ["foo", "bar", "baz"]
+           if ts[ts.idx] == '_' && ts[ts.idx + 1] == '_'
+               push!(ts.tokens, "__") # Without using flush!
+               ts.idx += 2
+               return true
+           end
+
+           return false
+       end
+my_pattern (generic function with 1 method)

-You may see `nltk_word_tokenize` for a more advanced example.
+julia> tokenize("hi__hello")
+2-element Array{String,1}:
+ "__"
+ "hihello"
+```

From b0d8dd480dc7ecea22cf382d58a52aa6b0bdb73f Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Mon, 3 Jun 2019 18:52:55 +0530
Subject: [PATCH 39/41] Minor doc changes

---
 README.md                    | 14 +++++++-------
 src/words/tweet_tokenizer.jl | 10 +++++-----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 4798eb7..8f5584f 100644
--- a/README.md
+++ b/README.md
@@ -116,16 +116,16 @@ and
 `split(foo, Sentences)` is the same as `split_sentences(foo)`.
 
 ## Using TokenBuffer API for Custom Tokenizers
-We offer a `TokenBuffer` API and supporting utility parsers
+We offer a `TokenBuffer` API and supporting utility lexers
 for high speed tokenization.
 
-#### Writing your own TokenBuffer parsers
+#### Writing your own TokenBuffer tokenizers
 
 `TokenBuffer` turns a string into a readable stream, used for building tokenizers.
-Utility parsers such as `spaces` and `number` read characters from the
+Utility lexers such as `spaces` and `number` read characters from the
 stream and into an array of tokens.
 
-Parsers return `true` or `false` to indicate whether they matched
+Lexers return `true` or `false` to indicate whether they matched
 in the input stream. They can therefore be combined easily, e.g.
 
     spacesornumber(ts) = spaces(ts) || number(ts)
@@ -181,9 +181,9 @@ julia> tokenize("A url https://github.com/JuliaText/WordTokenizers.jl/ and phone
 "+0 (987) - 2344321" # Phone number detected.
 ```
 
-#### Tips for writing custom tokenizers and your own TokenBuffer Parser/Feature
+#### Tips for writing custom tokenizers and your own TokenBuffer Lexer
 
-1. The order in which the parsers are applied matters in some cases:
+1. The order in which the lexers are applied matters in some cases:
 
 For example: `987-654-3210` matches as a phone number
 as well as a number, but `number` will only match up to `987`
@@ -232,7 +232,7 @@ julia> tokenize2("987-654-3210") # nltk_phonenumbers(ts) || number(ts)
 ```
 
 2. `BoundsError` and errors while handling edge cases are the most common
-and need to be taken care of while writing the TokenBuffer parsers.
+and need to be taken care of while writing the TokenBuffer lexers.
 
 3. For some TokenBuffer `ts`, use `flush!(ts)`
 over `push!(ts.tokens, input[i:j])`, to make sure that characters
diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl
index f18b595..bd7b05e 100644
--- a/src/words/tweet_tokenizer.jl
+++ b/src/words/tweet_tokenizer.jl
@@ -58,10 +58,10 @@ function html_entity(ts::TokenBuffer, remove_illegal=true)
       number = parse(Int, String(ts[ts.idx+3:i-1]), base=16)
     end
 
-    windows_1252_chars = ['€', '\u81', '‚', 'ƒ', '„', '…', '†', '‡', 'ˆ', '‰',
+    windows_1252_chars = ('€', '\u81', '‚', 'ƒ', '„', '…', '†', '‡', 'ˆ', '‰',
                           'Š', '‹', 'Œ', '\u8d','Ž', '\u8f', '\u90', '‘', '’',
                           '“', '”', '•', '–', '—', '˜', '™', 'š', '›', 'œ',
-                          '\u9d', 'ž', 'Ÿ']
+                          '\u9d', 'ž', 'Ÿ')
 
     if 0x80 <= number <= 0x9F
       push!(ts.buffer, windows_1252_chars[number - 127])
       ts.idx = i + 1
@@ -80,8 +80,9 @@ end
 
 """
   lookbehind(ts::TokenBuffer)
+
+Checks if the beginning of the detected handle is preceded by an alphanumeric
+character or a special character like ('_', '!', '@', '#', '\$', '%', '&', '*').
 """
 function lookbehind(ts::TokenBuffer,
                     match_pattern = ('_', '!', '@', '#', '$', '%', '&', '*'))
@@ -122,7 +123,6 @@ end
     reduce_all_repeated(ts::TokenBuffer)
 
 For handling repeated characters like "helloooooo" -> "hellooo".
-
 """
 function reduce_all_repeated(ts)
   ts.idx + 4 > length(ts.input) && return false

From fcfd107049b923af138555ace41eedd028184fef Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Mon, 3 Jun 2019 22:14:16 +0530
Subject: [PATCH 40/41] Clean up code for tweet Tokenizer

---
 src/words/tweet_tokenizer.jl | 83 +++++++++++++++---------------------
 1 file changed, 34 insertions(+), 49 deletions(-)

diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl
index bd7b05e..54d2512 100644
--- a/src/words/tweet_tokenizer.jl
+++ b/src/words/tweet_tokenizer.jl
@@ -1,3 +1,5 @@
+isalpha(c) = isascii(c) && (islowercase(c) || isuppercase(c))
+isalnum(c) = isascii(c) && (islowercase(c) || isuppercase(c) || isdigit(c))
 
 """
     html_entities(ts::TokenBuffer; remove_illegal=true)
@@ -18,8 +20,7 @@ function html_entity(ts::TokenBuffer, remove_illegal=true)
   (ts.idx + 1 > length(ts.input) || ts.input[ts.idx] != '&' ) && return false
   if ts.input[ts.idx + 1] != '#' # Entity is of the type "&Delta;" => "Δ"
     i = ts.idx + 1
-    while i <= length(ts.input) && isascii(ts[i]) &&
-          (isdigit(ts[i]) || islowercase(ts[i]) || isuppercase(ts[i]))
+    while i <= length(ts.input) && isalnum(ts[i])
      i += 1
     end
     (i > length(ts.input) || ts[i] != ';') && return false
@@ -107,9 +108,7 @@ function strip_twitter_handle(ts)
 
   lookbehind(ts) && return false
   i = ts.idx + 1
-  while i <= length(ts.input) &&
-        ( isascii(ts[i]) && (isdigit(ts[i]) || islowercase(ts[i]) ||
-        isuppercase(ts[i]) || ts[i] == '_'))
+  while i <= length(ts.input) && (isalnum(ts[i]) || ts[i] == '_')
     i += 1
   end
   (i <= length(ts.input)) && (i == ts.idx + 1 || ts[i] == '@') && return false
@@ -151,10 +150,7 @@ single token of "..."
 function safe_text(ts)
   ts.idx + 4 > length(ts.input) && return false
 
-  (
-    (isascii(ts[ts.idx]) && ( islowercase(ts[ts.idx]) ||
-        isuppercase(ts[ts.idx]) || isdigit(ts[ts.idx]))) ||
-    ts[ts.idx] != ts[ts.idx + 1] ||
+  (isalnum(ts[ts.idx]) || ts[ts.idx] != ts[ts.idx + 1] ||
     ts[ts.idx] != ts[ts.idx + 2] ) && return false
 
   i = ts.idx + 3
@@ -355,18 +351,14 @@ function emailaddresses(ts)
   ts.idx + 4 > length(ts.input) && return false
 
   i = ts.idx
-  while i + 3 <= length(ts.input) && isascii(ts[i]) &&
-        (isdigit(ts[i]) || islowercase(ts[i]) ||
-        isuppercase(ts[i]) || ts[i] ∈ ['.', '+', '-', '_'])
+  while i + 3 <= length(ts.input) && (isalnum(ts[i]) || ts[i] ∈ ('.', '+', '-', '_'))
     i += 1
   end
 
   (i == ts.idx || ts[i] != '@') && return false
   i += 1
 
   j = i
-  while i + 2 <= length(ts.input) && isascii(ts[i]) &&
-        (isdigit(ts[i]) || islowercase(ts[i]) ||
-        isuppercase(ts[i]) || ts[i] == '-' || ts == '_')
+  while i + 2 <= length(ts.input) && (isalnum(ts[i]) || ts[i] == '-' || ts[i] == '_')
     i += 1
   end
 
@@ -376,9 +368,7 @@ function emailaddresses(ts)
   last_dot = i
   i += 1
 
-  while i <= length(ts.input) && isascii(ts[i]) &&
-        (isdigit(ts[i]) || islowercase(ts[i]) ||
-        isuppercase(ts[i]) || ts[i] ∈ ['-', '_'])
+  while i <= length(ts.input) && (isalnum(ts[i]) || ts[i] == '-' || ts[i] == '_')
 
     if i + 1 < length(ts.input) && ts[i + 1] == '.'
       i += 1
@@ -400,14 +390,12 @@ Matches for twitter hashtags.
 """
 function twitterhashtags(ts)
   (ts.idx + 2 > length(ts.input) || ts[ts.idx] != '#' ||
-   ts[ts.idx + 1] ∈ ['\'', '-']) && return false
+   ts[ts.idx + 1] ∈ ('\'', '-')) && return false
 
   i = ts.idx + 1
   last_word_char = i
 
-  while i <= length(ts.input) && isascii(ts[i]) &&
-        (isdigit(ts[i]) || islowercase(ts[i]) ||
-        isuppercase(ts[i]) || ts[i] ∈ ['_', '\'', '-'])
+  while i <= length(ts.input) && (isalnum(ts[i]) || ts[i] ∈ ('_', '\'', '-'))
 
     if ts[i] ∉ ['\'', '-']
       last_word_char = i
@@ -431,9 +419,7 @@ function twitterusername(ts)
   (ts.idx + 1 > length(ts.input) || ts[ts.idx] != '@' ) && return false
 
   i = ts.idx + 1
-  while i <= length(ts.input) && isascii(ts[i]) &&
-        (isdigit(ts[i]) || islowercase(ts[i]) ||
-        isuppercase(ts[i]) || ts[i] == '_')
+  while i <= length(ts.input) && (isalnum(ts[i]) || ts[i] == '_')
     i += 1
   end
   i > ts.idx + 1 && return flushaboutindex!(ts, i - 1)
@@ -470,18 +456,16 @@ end
 TokenBuffer matcher for words that may or may not have dashes or apostrophes in them.
 """
 function words_including_apostrophe_dashes(ts)
-  (ts.idx + 1 > length(ts.input) || !(isascii(ts[ts.idx]) &&
-     (islowercase(ts[ts.idx]) || isuppercase(ts[ts.idx])
-      || isdigit(ts[ts.idx]) || ts[ts.idx] == '_' ))) && return false
+  ts.idx + 1 > length(ts.input) && return false
+  isalnum(ts[ts.idx]) || ts[ts.idx] == '_' || return false
 
   has_apostrophe_dashes = false
   i = ts.idx + 1
   last_char = ts.idx
 
   if isuppercase(ts[ts.idx]) || islowercase(ts[ts.idx])
-    while i <= length(ts.input) && isascii(ts[i]) &&
-          (islowercase(ts[i]) || isuppercase(ts[i]) || ts[i] ∈ ['_', '\'', '-'])
-      if has_apostrophe_dashes == false && ts[i] ∈ ['\'', '-']
+    while i <= length(ts.input) && (isalpha(ts[i]) || ts[i] ∈ ('_', '\'', '-'))
+      if has_apostrophe_dashes == false && ts[i] ∈ ('\'', '-')
         has_apostrophe_dashes = true
       else
        last_char = i
@@ -492,14 +476,15 @@ function words_including_apostrophe_dashes(ts)
   has_apostrophe_dashes && last_char != ts.idx &&
       return flushaboutindex!(ts, last_char)
 
-  while i <= length(ts.input) && isascii(ts[i]) && (isdigit(ts[i]) ||
-        islowercase(ts[i]) || isuppercase(ts[i]) || ts[i] == '_')
+  while i <= length(ts.input) && (isalnum(ts[i]) || ts[i] == '_')
     i += 1
   end
 
   return flushaboutindex!(ts, i - 1)
 end
 
+const allowed_chars_phone_numbers = (' ', '*', '-', '.', ')')
+
 """
     nltk_casual_phonenumbers(ts)
 
 The TokenBuffer function for NLTK's tweet tokenizer regex for phone numbers.
""" function nltk_phonenumbers(ts) (ts.idx + 5 > length(ts.input) || !(isdigit(ts[ts.idx]) || - ts[ts.idx] ∈ ['+', '('] )) && return false + ts[ts.idx] ∈ ('+', '('))) && return false i = ts.idx optional_1_confirmed = false # Checking for the part 1 of regex which is optional if ts[i] == '+' - ts[i + 1] ∈ ['0', '1'] || return false + ts[i + 1] == '0' || ts[i + 1] == '1' || return false i += 2 - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + while i <= length(ts.input) && ts[i] ∈ allowed_chars_phone_numbers i += 1 end i + 5 > length(ts.input) && return false optional_1_confirmed = true - elseif ts[i] ∈ ['0', '1'] + elseif ts[i] == '0' || ts[i] == '1' i += 1 - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + while i <= length(ts.input) && ts[i] ∈ allowed_chars_phone_numbers i += 1 end @@ -551,7 +536,7 @@ function nltk_phonenumbers(ts) end i += 3 - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + while i <= length(ts.input) && ts[i] ∈ allowed_chars_phone_numbers i += 1 end end @@ -568,7 +553,7 @@ function nltk_phonenumbers(ts) end i += 3 - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + while i <= length(ts.input) && ts[i] ∈ allowed_chars_phone_numbers i += 1 end @@ -580,7 +565,7 @@ function nltk_phonenumbers(ts) i += 3 j = i - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + while i <= length(ts.input) && ts[i] ∈ allowed_chars_phone_numbers i += 1 end @@ -601,7 +586,7 @@ function nltk_phonenumbers(ts) i += 3 - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + while i <= length(ts.input) && ts[i] ∈ allowed_chars_phone_numbers i += 1 end @@ -611,7 +596,7 @@ function nltk_phonenumbers(ts) i += 3 j = i - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + while i <= length(ts.input) && ts[i] ∈ allowed_chars_phone_numbers i += 1 end @@ -631,7 +616,7 @@ function nltk_phonenumbers(ts) i += 3 - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + while i <= length(ts.input) && ts[i] ∈ allowed_chars_phone_numbers i += 1 end @@ -641,7 +626,7 @@ function nltk_phonenumbers(ts) i += 3 j = i - while i <= length(ts.input) && ts[i] ∈ [' ', '*', '-', '.', ')'] + while i <= length(ts.input) && ts[i] ∈ allowed_chars_phone_numbers i += 1 end @@ -772,7 +757,7 @@ function nltk_url1(ts) end i += 1 else # Checking for iii. - (isspace(ts[i])|| ts[i] ∈ [')', '<', '>', '{', '}', '[', ']'] ) && break + (isspace(ts[i])|| ts[i] ∈ (')', '<', '>', '{', '}', '[', ']') ) && break i += 1 end @@ -822,8 +807,8 @@ function nltk_url1(ts) i += 1 else # Check for part ii. isspace(ts[i]) && break - ts[i] ∈ ['`', '!', ')', '[', ']', '{', '}', ';', ':', '\'', '"', '.', - ',', '<', '>', '?', '«', '»', '“', '”', '‘', '’'] && continue + ts[i] ∈ ('`', '!', ')', '[', ']', '{', '}', ';', ':', '\'', '"', '.', + ',', '<', '>', '?', '«', '»', '“', '”', '‘', '’') && continue index_matched = i end i = k @@ -845,7 +830,7 @@ function nltk_url2(ts) flush_about = 0 - while i < length(ts.input) && ts[i] ∈ ['.', '-'] + while i < length(ts.input) && (ts[i] == '.' || ts[i] == '-') j = ts[i] == '.' ? 
i : 0 i += 1 From c7bd2961e8148e592567945beccf8d941f515aeb Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 5 Jun 2019 16:11:27 +0530 Subject: [PATCH 41/41] Change vectors into tuples --- src/words/tweet_tokenizer.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/words/tweet_tokenizer.jl b/src/words/tweet_tokenizer.jl index 54d2512..feefebb 100644 --- a/src/words/tweet_tokenizer.jl +++ b/src/words/tweet_tokenizer.jl @@ -218,10 +218,10 @@ function flushaboutindex!(ts::TokenBuffer, uptoidx) return true end -const forehead = ['>', '<'] -const eyes = [':' ';' '=' '8'] -const nose = ['-','o','*','\''] -const mouth = [')', ']', '}', '(', '[', '{', 'd', 'D', 'p', 'P', '\\', '/', ':', '@', '|'] +const forehead = ('>', '<') +const eyes = (':', ';', '=', '8') +const nose = ('-', 'o', '*', '\'') +const mouth = (')', ']', '}', '(', '[', '{', 'd', 'D', 'p', 'P', '\\', '/', ':', '@', '|') """ function emoticons(ts::TokenBuffer) @@ -397,15 +397,15 @@ function twitterhashtags(ts) while i <= length(ts.input) && (isalnum(ts[i]) || ts[i] ∈ ('_', '\'', '-')) - if ts[i] ∉ ['\'', '-'] + if ts[i] ∉ ('\'', '-') last_word_char = i end i += 1 end - last_word_char >= ts.idx + 2 && ts[ts.idx + 1] ∉ ['\'', '-'] && - ts[last_word_char] ∉ ['\'', '-'] && return flushaboutindex!(ts, last_word_char) + last_word_char >= ts.idx + 2 && ts[ts.idx + 1] ∉ ('\'', '-') && + ts[last_word_char] ∉ ('\'', '-') && return flushaboutindex!(ts, last_word_char) return false end
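
A minimal sketch of the motivation behind the final patch's vector-to-tuple change (the helper names `member_vector` and `member_tuple` below are illustrative, not part of the patch series): a vector literal such as `['+', '(']` builds a fresh heap-allocated `Array` on every call, while a tuple literal is an immutable constant, so `∈` membership tests on these hot tokenizer paths stay allocation-free.

```julia
# Illustrative comparison only; these helpers do not appear in the patch series.
member_vector(c::Char) = c ∈ ['+', '(']   # vector literal allocates a new Array per call
member_tuple(c::Char)  = c ∈ ('+', '(')   # tuple literal is an immutable constant

@assert member_vector('+') == member_tuple('+')

member_vector('('); member_tuple('(')      # warm up so compilation is not measured
println(@allocated member_vector('('))     # expected: > 0 bytes (one Array per call)
println(@allocated member_tuple('('))      # expected: 0 bytes
```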