Use datadeps for AvgPerceptronTagger, add pos tagging over document types #166

Merged: 10 commits, Aug 12, 2019
2 changes: 2 additions & 0 deletions .travis.yml
@@ -1,6 +1,8 @@
language: julia
os:
- linux
env:
- DATADEPS_ALWAYS_ACCEPT=true
julia:
- 0.7
- 1.0
1 change: 1 addition & 0 deletions REQUIRE
@@ -7,3 +7,4 @@ Flux
BSON
JSON
DataStructures
DataDeps
1 change: 1 addition & 0 deletions appveyor.yml
@@ -1,4 +1,5 @@
environment:
DATADEPS_ALWAYS_ACCEPT: True
matrix:
- julia_version: 0.7
- julia_version: 1
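Both CI configurations set `DATADEPS_ALWAYS_ACCEPT` so that the weights download proceeds without an interactive prompt. A sketch of the equivalent in a local non-interactive session (assuming DataDeps.jl reads the variable before the first datadep is resolved):

```julia
# Accept all DataDeps downloads without prompting (useful in scripts and CI).
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

using TextAnalysis
tagger = PerceptronTagger(true)   # first use triggers the weights download
```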
18 changes: 18 additions & 0 deletions docs/src/features.md
@@ -417,8 +417,26 @@ PerceptronTagger(AveragePerceptron(Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NN
```

### To predict tags:

The perceptron tagger can predict tags over various document types:

```julia
predict(tagger, sentence::String)
predict(tagger, tokens::Array{String, 1})
predict(tagger, sd::StringDocument)
predict(tagger, fd::FileDocument)
predict(tagger, td::TokenDocument)
```

This can also be done by calling the tagger object directly:

```julia
tagger(input)
```


```julia
julia> predict(tagger, ["today", "is"])
2-element Array{Any,1}:
("today", "NN")
("is", "VBZ")

julia> tagger(["today", "is"])
2-element Array{Any,1}:
("today", "NN")
("is", "VBZ")
```
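For example, a minimal end-to-end sketch of the document-level tagging added here (the sentence and document are illustrative; the pretrained weights are fetched via DataDeps on first use):

```julia
using TextAnalysis

tagger = PerceptronTagger(true)   # loads the pretrained DataDeps weights
sd = StringDocument("The quick brown fox jumps over the lazy dog")
predict(tagger, sd)               # returns an array of (token, tag) pairs
```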
6 changes: 6 additions & 0 deletions src/TextAnalysis.jl
@@ -6,6 +6,8 @@ module TextAnalysis
using Languages
using DataFrames
using WordTokenizers

using DataDeps
using DataStructures
using Statistics

@@ -70,6 +72,7 @@ module TextAnalysis
include("corpus.jl")
include("metadata.jl")
include("preprocessing.jl")

# Load libstemmer from our deps.jl
const depsjl_path = joinpath(dirname(@__FILE__), "..", "deps", "deps.jl")
if !isfile(depsjl_path)
@@ -99,4 +102,7 @@ module TextAnalysis
include("CRF/crf_utils.jl")
include("CRF/loss.jl")

function __init__()
pos_tagger_datadep_register()
end
end
98 changes: 74 additions & 24 deletions src/averagePerceptronTagger.jl
@@ -1,9 +1,26 @@
using DataStructures
using Random
using BSON
using DataDeps

export fit!, predict

function pos_tagger_datadep_register()
register(DataDep("POS Perceptron Tagger Weights",
"""
The trained weights for the Average Perceptron Tagger on the Part of Speech tagging task.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pretrainedMod.bson.zip",
"52519cb3aea5d8f74368faedea831471e5df34567de4748d15decea7424743d3",
post_fetch_method = function(fn)
unpack(fn)
rm("__MACOSX", recursive=true)
file = readdir()[1]
mv(file, "POSWeights.bson")
end
))
end
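
Once registered, the `datadep"…"` string macro resolves to a local directory, downloading, checksum-verifying, and unpacking the archive on first use. A minimal sketch mirroring the load path used further down in this file:

```julia
# Resolved lazily: the first call fetches the archive, checks the SHA-256,
# and runs post_fetch_method to produce POSWeights.bson.
weights_path = joinpath(datadep"POS Perceptron Tagger Weights", "POSWeights.bson")
pretrained = BSON.load(weights_path)
```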

"""
This file contains the Average Perceptron model and the Perceptron Tagger,
which were originally implemented by Matthew Honnibal.
@@ -57,7 +74,7 @@ end

"""
Applying the perceptron learning algorithm
Increment the truth weights and decrementing the guess weights
Increment the truth weights and decrement the guess weights,
if the guess is wrong
"""
function update(self::AveragePerceptron, truth, guess, features)
@@ -111,22 +128,32 @@ function average_weights(self::AveragePerceptron)
end

"""
PERCEPTRON TAGGER
# PERCEPTRON TAGGER

This struct contains the POS tagger "PerceptronTagger", which uses the model in "AveragePerceptron".
It can be trained from scratch and its weights saved, or pretrained weights
(trained on the same features) can be loaded and then trained further
or used directly for prediction.

To train:
tagger = PerceptronTagger(false)
fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
## To train:

```julia
julia> tagger = PerceptronTagger(false)

julia> fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
```

To load pretrain model:
tagger = PerceptronTagger(true)
## To load pretrained model:

To predict tag:
predict(tagger, ["today", "is"])
```julia
julia> tagger = PerceptronTagger(true)
```

## To predict tags:

```julia
julia> predict(tagger, ["today", "is"])
```
"""
mutable struct PerceptronTagger
model :: AveragePerceptron
@@ -142,11 +169,9 @@ end
function PerceptronTagger(load::Bool)
self = PerceptronTagger()

"""
If load is true then a pretrain model will be import from location
"""
# If `load` is true then a pretrained model is imported from the DataDeps location
if load
location = "src/pretrainedMod.bson";
location = joinpath(datadep"POS Perceptron Tagger Weights", "POSWeights.bson")
pretrained = BSON.load(location)
self.model.weights = pretrained[:weights]
self.tagdict = pretrained[:tagdict]
@@ -201,11 +226,13 @@ end
"""
Converts the token into a feature representation, implemented as a Dict.
If the features change, a new model should be trained.
params:
i - index of word(or token) in sentence
word - token
context - array of tokens with starting and ending specifiers
prev == "-START-" prev2 == "-START2-" - Start specifiers

# Arguments:

- `i` : index of the word (or token) in the sentence
- `word` : the token itself
- `context` : array of tokens with start and end specifiers
- `prev`, `prev2` : the two previous tags, initialized to the start specifiers `"-START-"` and `"-START2-"`
"""
function getFeatures(self::PerceptronTagger, i, word, context, prev, prev2)
function add(sep, name, args...)
@@ -252,8 +279,10 @@ function getFeatures(self::PerceptronTagger, i, word, context, prev, prev2)
end

"""
Used for predicting the tags for given tokens
tokens - array of tokens
predict(::PerceptronTagger, tokens)
predict(::PerceptronTagger, sentence)

Used for predicting the tags for the given sentence or array of tokens.
"""
function predict(self::PerceptronTagger, tokens::Vector{String})
prev, prev2 = self.START
@@ -273,17 +302,38 @@ function predict(self::PerceptronTagger, tokens::Vector{String})
return output
end

function (tagger::PerceptronTagger)(input)
predict(tagger, input)
end

predict(tagger::PerceptronTagger, sentence::String) =
predict(tagger, tokenize(Languages.English(), sentence))
predict(tagger::PerceptronTagger, sd::StringDocument) =
predict(tagger, text(sd))
predict(tagger::PerceptronTagger, fd::FileDocument) =
predict(tagger, text(fd))
predict(tagger::PerceptronTagger, td::TokenDocument) =
predict(tagger, tokens(td))
function predict(tagger::PerceptronTagger, ngd::NGramDocument)
@warn "POS tagging for NGramDocument not available."
end
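
Taken together, these methods make the tagger usable on plain strings, token arrays, and most document types, with `NGramDocument` only emitting a warning. A short usage sketch (sentences are illustrative):

```julia
tagger = PerceptronTagger(true)

# All of these dispatch down to the token-level predict above.
predict(tagger, "today is a good day")
predict(tagger, StringDocument("today is a good day"))
tagger(["today", "is"])   # functor form defined above
```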



"""
fit!(::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer)

Used for training a new model, or for further training an existing model
using pretrained weights and classes.

Contains the main training loop over the number of epochs.
After training, the weights, tagdict, and classes are stored at the specified location.

params:
sentences - array of the all sentences
save_loc - to specify the saving location
nr_iter - total number of training iterations for given sentences(or number of epochs)
# Arguments:
- `::PerceptronTagger` : Input PerceptronTagger model
- `sentences::Vector{Vector{Tuple{String, String}}}` : Array of all token sequences with their target POS tags
- `save_loc::String` : Location where the trained model is saved
- `nr_iter::Integer` : Total number of training iterations over the given sentences (i.e. the number of epochs)
"""
function fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer)
self._sentences = []
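For completeness, a hedged sketch of training with the signature above (the save path and epoch count are illustrative):

```julia
tagger = PerceptronTagger(false)
sentences = [[("today", "NN"), ("is", "VBZ"), ("good", "JJ"), ("day", "NN")]]

# Trains for 5 epochs and stores weights, tagdict, and classes at the given path.
fit!(tagger, sentences, "myPOSWeights.bson", 5)
```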
Binary file removed src/pretrainedMod.bson
50 changes: 35 additions & 15 deletions test/averagePerceptronTagger.jl
@@ -3,21 +3,41 @@ using TextAnalysis: AveragePerceptron
@testset "Average Perceptron Tagger" begin
tagger = PerceptronTagger(false)

@test typeof(tagger.classes) == Set{Any}
@test length(tagger.classes) == 0
@test typeof(tagger.model) == AveragePerceptron
@testset "Basic" begin
@test typeof(tagger.classes) == Set{Any}
@test length(tagger.classes) == 0
@test typeof(tagger.model) == AveragePerceptron

fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
@test length(keys(tagger.model.weights)) == 51
@test tagger.classes == tagger.model.classes == Set(["JJ", "VBZ", "NN"])
end
fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
@test length(keys(tagger.model.weights)) == 51
@test tagger.classes == tagger.model.classes == Set(["JJ", "VBZ", "NN"])
end

@testset "Average Perceptron Tagger (pretrained)" begin
tagger = PerceptronTagger(true)

@test typeof(tagger.classes) == Set{Any}
@test length(tagger.classes) == 75
@test typeof(tagger.model) == AveragePerceptron
end

sample_file = joinpath(dirname(@__FILE__), "data", "poem.txt")

##Uncomment these when pretrained Model file is present in the directory

# @testset "Average Perceptron Tagger (pretrained)" begin
# tagger = PerceptronTagger(true)
#
# @test typeof(tagger.classes) == Set{Any}
# @test length(tagger.classes) == 75
# @test typeof(tagger.model) == AveragePerceptron
# end
@testset "Tagging over sentences and documents" begin
tagger = PerceptronTagger(true)
text = "This is a text"
@test tagger(text) == predict(tagger, text)

sd = StringDocument(text)
@test length(predict(tagger, text)) == 4
@test length(predict(tagger, sd)) == 4

text2 = read(sample_file, String)
fd = FileDocument(sample_file)
@test length(predict(tagger, fd)) == length(predict(tagger, text2))

td = TokenDocument(text)
@test length(predict(tagger, td)) == 4
end
end