Add Tweet Tokenizer #13

Merged
45 commits merged on Jun 6, 2019
Commits
cd26b75
Add Regex
Ayushk4 Jan 22, 2019
d7532ec
Add function to replace HTML entities
Ayushk4 Jan 25, 2019
5c87dae
Add tweet tokenizer
Ayushk4 Jan 31, 2019
fd927d1
Add docstrings for functions
Ayushk4 Jan 31, 2019
4da422e
Add support for tweet tokenizer
Ayushk4 Feb 2, 2019
2e6b4c2
Update README
Ayushk4 Feb 2, 2019
853331c
Fix bug for optional arguments
Ayushk4 Feb 2, 2019
e3d2fa0
Add dependencies to REQUIRE
Ayushk4 Feb 3, 2019
c06ae26
Minor Code fixes
Ayushk4 Feb 4, 2019
1b65d8e
Improve code clarity
Ayushk4 Feb 4, 2019
320ce4d
Add comments and better variable naming
Ayushk4 Feb 8, 2019
94542ef
Add first series of tests
Ayushk4 Feb 8, 2019
1999e27
Add second series of tests
Ayushk4 Feb 8, 2019
ad94e30
Add tests and fix bugs
Ayushk4 Feb 9, 2019
4ec3f0a
Add final set of tests, fix links, typo
Ayushk4 Feb 9, 2019
164974b
Merge branch 'master' of https://github.com/JuliaText/WordTokenizers.jl
Ayushk4 Mar 7, 2019
8007a17
Make Replace entities 30x faster
Ayushk4 Mar 10, 2019
9aac4b2
Merge branch 'master' of https://github.com/JuliaText/WordTokenizers.…
Ayushk4 Mar 10, 2019
50539ba
Use TokenBuffer to speed up pre_processing functions
Ayushk4 Mar 12, 2019
59f8b0c
Fix indentation and bugs
Ayushk4 Mar 13, 2019
aef0efe
Merge branch 'master' of https://github.com/JuliaText/WordTokenizers.…
Ayushk4 Apr 11, 2019
cbb01e8
Add regex-free emoticons via TokenBuffer
Ayushk4 Apr 12, 2019
703ebc4
Add ascii arrows and html tags
Ayushk4 Apr 18, 2019
77b505a
Add functions for twitter hashtags and email addresses
Ayushk4 May 17, 2019
7440301
Fix Bugs
Ayushk4 May 18, 2019
6e20d5d
Add functions for twitterusernames and ellipses
Ayushk4 May 18, 2019
ecae2b9
Fix bugs in emailaddresses
Ayushk4 May 19, 2019
7661b8d
Update fast.jl, Support signs (+,-) in numbers
Ayushk4 May 19, 2019
7d6fa21
Switch to TokenBuffer for Tweet Tokenizer
Ayushk4 May 21, 2019
66adbf8
Add TokenBuffer function for nltk's tweet tokenizer - phone numbers
Ayushk4 May 22, 2019
a6de434
Add nltk_url1
Ayushk4 May 24, 2019
b9f0c44
Finish nltk_url1
Ayushk4 May 24, 2019
040368b
Add urls to tweet Tokenizer
Ayushk4 May 24, 2019
697dee4
Remove option of converting to lowercase
Ayushk4 May 24, 2019
927e4b3
Remove regex patterns
Ayushk4 May 24, 2019
75db813
Fix Bugs in tweet tokenizing functions
Ayushk4 May 31, 2019
ce7c74b
Finish nltk url function
Ayushk4 May 31, 2019
d9a019f
Add tests
Ayushk4 Jun 2, 2019
bebe5bd
Fix Bugs in tweet tokenizer
Ayushk4 Jun 2, 2019
e2120ad
Fix indentation
Ayushk4 Jun 3, 2019
1dc5445
Update README for TokenBuffer
Ayushk4 Jun 3, 2019
f18ae44
Update Docs for custom token TokenBuffer tokenizers, functions
Ayushk4 Jun 3, 2019
b0d8dd4
Minor doc changes
Ayushk4 Jun 3, 2019
fcfd107
Clean up code for tweet Tokenizer
Ayushk4 Jun 3, 2019
c7bd296
Change vectors into tuples
Ayushk4 Jun 5, 2019
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -71,6 +71,8 @@ The word tokenizers basically assume sentence splitting has already been done.

(To me it seems like a weird historical thing that NLTK has 2 successive variation on improving the Penn tokenizer, but for now I am matching it and having both. See [[NLTK#2005]](https://github.com/nltk/nltk/issues/2005))

- **Tweet Tokenizer:** (`tweet_tokenize`) NLTK's casual tokenizer, designed specifically for tweets. Beyond Twitter-specific tokens, it handles emoticons and other web artifacts such as HTML entities well. It closely matches NLTK's `nltk.tokenize.TweetTokenizer`.


# Sentence Splitters
We currently only have one sentence splitter.
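The new tokenizer is exported as `tweet_tokenize`. A minimal usage sketch, assuming a version of the package with this PR merged is installed (exact token boundaries follow NLTK's casual tokenizer and may vary by version):

```julia
using WordTokenizers

# Hashtags, usernames, emoticons, and URLs should survive as single tokens,
# unlike under the whitespace/punctuation-based tokenizers above.
tweet = "@JuliaText rocks!! Check https://julialang.org #nlp :-D"
println(tweet_tokenize(tweet))
```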
2 changes: 2 additions & 0 deletions REQUIRE
@@ -1 +1,3 @@
julia 0.7
HTML_Entities
StrTables
7 changes: 7 additions & 0 deletions src/WordTokenizers.jl
@@ -1,8 +1,14 @@

module WordTokenizers

using HTML_Entities
using StrTables
using Unicode


export poormans_tokenize, punctuation_space_tokenize,
       penn_tokenize, improved_penn_tokenize, nltk_word_tokenize,
       tweet_tokenize,
       tokenize,
       rulebased_split_sentences,
       split_sentences,
@@ -16,6 +22,7 @@ include("words/simple.jl")
include("words/nltk_word.jl")
include("words/reversible_tokenize.jl")
include("words/sedbased.jl")
include("words/tweet_tokenizer.jl")
include("sentences/sentence_splitting.jl")
include("words/TokTok.jl")

2 changes: 1 addition & 1 deletion src/split_api.jl
@@ -3,7 +3,7 @@
export Words, Sentences

 const tokenizers = [poormans_tokenize, punctuation_space_tokenize,
-                    penn_tokenize, improved_penn_tokenize, nltk_word_tokenize]
+                    penn_tokenize, improved_penn_tokenize, nltk_word_tokenize, tweet_tokenize]
const sentence_splitters = [rulebased_split_sentences]

const Words = tokenize
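Since `tweet_tokenize` is now registered in `tokenizers`, it can also be made the process-wide default. A sketch assuming the package's documented `set_tokenizer` / `split` API:

```julia
using WordTokenizers

# `Words` routes Base.split through whichever tokenizer is currently the
# default (nltk_word_tokenize out of the box).
set_tokenizer(tweet_tokenize)
tokens = split("Best. Tokenizer. Ever. #hype", Words)
println(tokens)
```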
9 changes: 6 additions & 3 deletions src/words/fast.jl
@@ -214,9 +214,13 @@ end

Matches numbers such as `10,000.5`, preserving formatting.
"""
-function number(ts, sep = (':', ',', '\'', '.'))
-    isdigit(ts[]) || return false
+function number(ts, sep = (':', ',', '\'', '.'); check_sign = false)
+    i = ts.idx
+    if check_sign && ts[] ∈ ['+', '-'] && (i == 1 || isspace(ts[i-1]))
+        i += 1
+    end
+
+    i <= length(ts.input) && isdigit(ts[i]) || return false
     while i <= length(ts.input) && (isdigit(ts[i]) ||
           (ts[i] in sep && i < length(ts.input) && isdigit(ts[i+1])))
         i += 1
@@ -225,4 +229,3 @@ function number(ts, sep = (':', ',', '\'', '.'))
    ts.idx = i
    return true
end
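The sign handling added above is compact; a rough Python sketch of the same scan may help (illustrative only — `scan_number` and its parameters are not part of the library):

```python
def scan_number(s, start, seps=(":", ",", "'", "."), check_sign=False):
    """Return the index just past a number beginning at s[start], else None.

    A leading '+'/'-' is consumed only when check_sign is set and the sign
    is at the start of the string or preceded by whitespace, mirroring the
    diff above. A separator is kept only when followed by another digit,
    so "10,000.5" matches in full but a trailing '.' is left behind.
    """
    i = start
    if check_sign and i < len(s) and s[i] in "+-" and (i == 0 or s[i - 1].isspace()):
        i += 1
    if i >= len(s) or not s[i].isdigit():
        return None  # no digit where a number was expected
    while i < len(s) and (s[i].isdigit() or
                          (s[i] in seps and i + 1 < len(s) and s[i + 1].isdigit())):
        i += 1
    return i

text = "-10,000.5 apples"
end = scan_number(text, 0, check_sign=True)
print(text[:end])  # -10,000.5
```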
