Add API for Part of Speech Tagging #169

Merged · 10 commits · Aug 26, 2019
Changes from all commits
148 changes: 147 additions & 1 deletion docs/src/features.md
@@ -392,7 +392,11 @@ julia> tags
"E-PER"
```

## Parts of Speech Tagger
## Parts of Speech Tagging

This package provides two different Part of Speech taggers.

## Average Perceptron Part of Speech Tagger

This tagger can be used to find the POS tag of a word or token in a given sentence. It is based on the `Average Perceptron Algorithm`.
The model can be trained from scratch, and the weights are saved at a specified location.
@@ -457,3 +461,145 @@ julia> tagger(["today", "is"])

* self = PerceptronTagger
* tokens = `Vector` of words or tokens for which to predict tags
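
For reference, a minimal usage sketch of this tagger (a sketch only: it assumes `PerceptronTagger(true)` loads the pretrained weights, as documented in the collapsed portion of this section, and the printed output shape may differ):

```julia
julia> tagger = PerceptronTagger(true)
loaded successfully

julia> tagger(["today", "is"])
2-element Array{Any,1}:
 ("today", "NN")
 ("is", "VBZ")
```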

## Neural Model for Part of Speech Tagging using LSTMs, CNN and CRF

The API provides a pretrained model for Part of Speech tagging.
The tagging follows the [convention used in the Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html), with 36 different Part of Speech tags, excluding punctuation.

To use the API, we first load the model weights into an instance of the tagger.
The constructor also accepts the paths of the model weights and the model dicts (for the character and word embeddings):

PoSTagger()
PoSTagger(dicts_path, weights_path)

```julia
julia> pos = PoSTagger()

```
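
The weights can also be loaded from a custom location by passing both paths explicitly (the paths below are placeholders):

```julia
julia> pos = PoSTagger("path/to/model_dicts", "path/to/model_weights")
```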

!!! note
When you call `PoSTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Once downloaded, these are stored locally and managed by `DataDeps`, so on subsequent uses the weights will not need to be downloaded again.

Once we create an instance, we can call it to tag a `String` (sentence), a sequence of tokens, an `AbstractDocument`, or a `Corpus`.

(pos::PoSTagger)(sentence::String)
(pos::PoSTagger)(tokens::Array{String, 1})
(pos::PoSTagger)(sd::StringDocument)
(pos::PoSTagger)(fd::FileDocument)
(pos::PoSTagger)(td::TokenDocument)
(pos::PoSTagger)(crps::Corpus)

```julia

julia> sentence = "This package is maintained by John Doe."
"This package is maintained by John Doe."

julia> tags = pos(sentence)
8-element Array{String,1}:
"DT"
"NN"
"VBZ"
"VBN"
"IN"
"NNP"
"NNP"
"."

```

The API tokenizes the input sentence via the default tokenizer provided by `WordTokenizers`, which is currently set to the multilingual `TokTok` tokenizer.

```julia

julia> using WordTokenizers

julia> collect(zip(WordTokenizers.tokenize(sentence), tags))
8-element Array{Tuple{String,String},1}:
("This", "DT")
("package", "NN")
("is", "VBZ")
("maintained", "VBN")
("by", "IN")
("John", "NNP")
("Doe", "NNP")
(".", ".")

```

For tagging a multi-sentence text or document, one can use `split_sentences` from the `WordTokenizers.jl` package and run the POS model on each sentence.

```julia
julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politiian." # Sentence taken from CoNLL 2003 Dataset

julia> splitted_sents = WordTokenizers.split_sentences(sentences)

julia> tag_sequences = pos.(splitted_sents)
2-element Array{Array{String,1},1}:
["NNP", "VBZ", "VBG", "RP", "PRP\$", "NN", "IN", "NN", "."]
["PRP", "MD", "VB", "VBN", "IN", "NNP", "NNP", ",", "DT", "JJ", "JJ", "NN", "TO", "NNP", "CC", "JJ", "NNP", "NNP", "NNP", "."]

julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)]

julia> zipped[1]
9-element Array{Tuple{String,String},1}:
("NNP", "Rabinov")
("VBZ", "is")
("VBG", "winding")
("RP", "up")
("PRP\$", "his")
("NN", "term")
("IN", "as")
("NN", "ambassador")
(".", ".")

julia> zipped[2]
20-element Array{Tuple{String,String},1}:
("PRP", "He")
("MD", "will")
("VB", "be")
("VBN", "replaced")
("IN", "by")
("NNP", "Eliahu")
("NNP", "Ben-Elissar")
(",", ",")
("DT", "a")
("JJ", "former")
("JJ", "Israeli")
("NN", "envoy")
("TO", "to")
("NNP", "Egypt")
("CC", "and")
("JJ", "right-wing")
("NNP", "Likud")
("NNP", "party")
("NNP", "politiian")
(".", ".")

```

Since Part of Speech tagging is done at the sentence level,
the text of an `AbstractDocument` is first split into sentences, and each sentence is then labelled.
However, this is not possible for an `NGramDocument`, as its text cannot be recreated.
For a `TokenDocument`, the text is approximated in order to split it into sentences, hence the following throws a warning when tagging the `Corpus`.

```julia

julia> crps = Corpus([StringDocument("We aRE vErY ClOSE tO ThE HEaDQuarTeRS."), TokenDocument("this is Bangalore.")])
A Corpus with 2 documents:
* 1 StringDocument's
* 0 FileDocument's
* 1 TokenDocument's
* 0 NGramDocument's

Corpus's lexicon contains 0 tokens
Corpus's index contains 0 tokens

julia> pos(crps)
┌ Warning: TokenDocument's can only approximate the original text
└ @ TextAnalysis ~/.julia/dev/TextAnalysis/src/document.jl:220
2-element Array{Array{Array{String,1},1},1}:
[["PRP", "VBP", "RB", "JJ", "TO", "DT", "NN", "."]]
[["DT", "VBZ", "NNP", "."]]

```
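
An `NGramDocument` cannot be tagged at all, since its text cannot be recreated; based on the `pos.jl` method below, calling the tagger on one throws an error (a sketch; the exact error display may vary):

```julia
julia> ngd = NGramDocument("this is Bangalore.")

julia> pos(ngd)
ERROR: "Sequence Labelling not possible for NGramsDocument"
```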
8 changes: 6 additions & 2 deletions src/TextAnalysis.jl
@@ -66,7 +66,7 @@ module TextAnalysis

export CRF, viterbi_decode, crf_loss

export NERTagger, Tracker, Flux
export NERTagger, PoSTagger, Tracker, Flux

include("tokenizer.jl")
include("ngramizer.jl")
@@ -99,19 +99,23 @@ module TextAnalysis

include("evaluation_metrics.jl")
include("coom.jl")

# CRF
include("CRF/crf.jl")
include("CRF/predict.jl")
include("CRF/crf_utils.jl")
include("CRF/loss.jl")

# NER
# NER and POS
include("sequence/ner_datadeps.jl")
include("sequence/ner.jl")
include("sequence/pos_datadeps.jl")
include("sequence/pos.jl")
include("sequence/sequence_models.jl")

function __init__()
pos_tagger_datadep_register()
ner_datadep_register()
pos_datadep_register()
end
end
6 changes: 5 additions & 1 deletion src/sequence/ner.jl
@@ -7,11 +7,15 @@ struct NERmodel{M}
model::M
end

function load_model_dicts(filepath)
load_model_dicts(filepath) = load_model_dicts(filepath, true)

function load_model_dicts(filepath, remove_tag_prefix)
labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels]
chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index]
words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index]

remove_tag_prefix || return [labels...], chars_idx, words_idx

return remove_ner_label_prefix.([labels...]), chars_idx, words_idx
end

37 changes: 37 additions & 0 deletions src/sequence/pos.jl
@@ -0,0 +1,37 @@
using BSON, Tracker

const PoSCharUNK = '¿'
const PoSWordUNK = "<UNK>"

struct PoSModel{M}
model::M
end

PoSTagger() = PoSTagger(datadep"POS Model Dicts", datadep"POS Model Weights")

function PoSTagger(dicts_path, weights_path)
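# `false` skips the NER-specific label-prefix stripping, so the POS labels are used as-is.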
labels, chars_idx, words_idx = load_model_dicts(dicts_path, false)
model = BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, chars_idx[PoSCharUNK], words_idx[PoSWordUNK], weights_path)
PoSModel(model)
end

function (a::PoSModel)(tokens::Array{String,1})
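# Encode each token with the model's one-hot input representation, then run the tagger over the sequence.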
input_oh = [onehotinput(a.model, token) for token in tokens]
return (a.model)(input_oh)
end

function (a::PoSModel)(sentence::AbstractString)
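# Tokenize the sentence with WordTokenizers' default tokenizer and tag the resulting tokens.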
a(WordTokenizers.tokenize(sentence))
end

function (a::PoSModel)(doc::AbstractDocument)
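# Split the document text into sentences and tag each sentence separately.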
return vcat(a.(WordTokenizers.split_sentences(text(doc))))
end

function (a::PoSModel)(ngd::NGramDocument)
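# An NGramDocument stores only n-gram counts, so the original token sequence cannot be recovered.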
throw("Sequence Labelling not possible for NGramsDocument")
end

function (a::PoSModel)(crps::Corpus)
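# Tag every document in the corpus.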
return a.(crps.documents)
end
31 changes: 31 additions & 0 deletions src/sequence/pos_datadeps.jl
@@ -0,0 +1,31 @@
function pos_datadep_register()
register(DataDep("POS Model Weights",
"""
The weights for POS Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pos_weights.tar.xz",
"b02e891ea913be6834ff67d6ecf2ddae6754d55509bb3d9c078dbfc7eed27988";
post_fetch_method = function(fn)
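# Unpack the archive, move the extracted files up to the datadep root, and remove the now-empty directory.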
unpack(fn)
dir = "weights"
innerfiles = readdir(dir)
mv.(joinpath.(dir, innerfiles), innerfiles)
rm(dir)
end
))

register(DataDep("POS Model Dicts",
"""
The character and words dict for POS Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pos_model_dicts.tar.xz",
"4d7fe8238ff0cfb92d195dfa745b4ed08f916d4707e3dbe27a1b3144c9282f41";
post_fetch_method = function(fn)
unpack(fn)
dir = "model_dicts"
innerfiles = readdir(dir)
mv.(joinpath.(dir, innerfiles), innerfiles)
rm(dir)
end
))
end
52 changes: 52 additions & 0 deletions test/pos.jl
@@ -0,0 +1,52 @@
using WordTokenizers

@testset "POS" begin
pos = PoSTagger()

@testset "Basic" begin
str = "The very first major corpus of English for computer analysis was the Brown Corpus."
@test pos(str) == ["DT", "RB", "JJ", "JJ", "NN", "IN", "JJ", "IN", "NN", "NN", "VBD", "DT", "NNP", "NNP", "."]

str = "If the Irish win the World Cup this year, it will be their 3rd time in a row."
@test pos(str) == ["IN", "DT", "NNP", "VBP", "DT", "NNP", "NNP", "DT", "NN", ",", "PRP", "MD", "VB", "PRP\$", "CD", "JJ", "NN", "IN", "DT", "NN", "."]
end

@testset "Unknown Unicode characters" begin
# Making sure that the POS model handles unknown unicode characters
str = "आ β⬰ 5¥ "
@test length(pos(str)) == length(WordTokenizers.tokenize(str))

str = "You owe John Doe 5¥."
@test pos(str) == ["PRP", "VBP", "NNP", "NNP", "CD", "NNP", "."]
end

@testset "Documents and Corpus" begin
pos = PoSTagger()

text1 = "A little too small"
text2 = "Here Foo Bar, please have some chocolate."

sd = StringDocument(text1)
td = TokenDocument(text2)

tags = pos(sd)
@test length(tags) == length(WordTokenizers.split_sentences(text1))
@test length(tags[1]) == length(WordTokenizers.tokenize(text1))
uniq1 = unique(vcat(tags...))
@test "RB" ∈ uniq1
@test "JJ" ∈ uniq1


tags = pos(td)
@test length(tags) == length(WordTokenizers.split_sentences(text2))
@test length(tags[1]) == length(WordTokenizers.tokenize(text2))
@test findall( x -> x == "NNP", vcat(tags...)) == [2, 3]

crps = Corpus([sd, td])
tags = pos(crps)

@test length(tags) == length(crps.documents)
@test tags[1] == pos(crps.documents[1])
@test tags[2] == pos(crps.documents[2])
end
end
1 change: 1 addition & 0 deletions test/runtests.jl
@@ -6,6 +6,7 @@ using WordTokenizers

println("Running tests:")

include("pos.jl")
include("ner.jl")
include("coom.jl")
include("crf.jl")