diff --git a/.travis.yml b/.travis.yml
index 19e1aa7b..4d73a9c5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,8 @@
 language: julia
 os:
   - linux
+env:
+  - DATADEPS_ALWAYS_ACCEPT=true
 julia:
   - 0.7
   - 1.0
diff --git a/REQUIRE b/REQUIRE
index 4498bedf..a66b3bb5 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -7,3 +7,4 @@ Flux
 BSON
 JSON
 DataStructures
+DataDeps
diff --git a/appveyor.yml b/appveyor.yml
index ea42c124..d32e6d4a 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,4 +1,5 @@
 environment:
+  DATADEPS_ALWAYS_ACCEPT: True
   matrix:
   - julia_version: 0.7
   - julia_version: 1
diff --git a/docs/src/features.md b/docs/src/features.md
index d0b5a048..7668e16d 100644
--- a/docs/src/features.md
+++ b/docs/src/features.md
@@ -417,8 +417,26 @@ PerceptronTagger(AveragePerceptron(Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NN
 ```
 
 ### To predict tags:
+
+The perceptron tagger can predict tags over various document types:
+
+    predict(tagger, sentence::String)
+    predict(tagger, tokens::Array{String, 1})
+    predict(tagger, sd::StringDocument)
+    predict(tagger, fd::FileDocument)
+    predict(tagger, td::TokenDocument)
+
+This can also be done by calling the tagger directly:
+    tagger(input)
+
+
 ```julia
 julia> predict(tagger, ["today", "is"])
+2-element Array{Any,1}:
+ ("today", "NN")
+ ("is", "VBZ")
+
+julia> tagger(["today", "is"])
 2-element Array{Any,1}:
  ("today", "NN")
  ("is", "VBZ")
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index e7450015..b2a3b751 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -6,6 +6,8 @@ module TextAnalysis
     using Languages
     using DataFrames
     using WordTokenizers
+
+    using DataDeps
 
     using DataStructures
     using Statistics
@@ -70,6 +72,7 @@ module TextAnalysis
     include("corpus.jl")
     include("metadata.jl")
     include("preprocessing.jl")
+
     # Load libstemmer from our deps.jl
     const depsjl_path = joinpath(dirname(@__FILE__), "..", "deps", "deps.jl")
     if !isfile(depsjl_path)
@@ -99,4 +102,7 @@ module TextAnalysis
     include("CRF/crf_utils.jl")
     include("CRF/loss.jl")
 
+    function __init__()
+        pos_tagger_datadep_register()
+    end
 end
diff --git a/src/averagePerceptronTagger.jl b/src/averagePerceptronTagger.jl
index 48d96773..f326576f 100644
--- a/src/averagePerceptronTagger.jl
+++ b/src/averagePerceptronTagger.jl
@@ -1,9 +1,26 @@
 using DataStructures
 using Random
 using BSON
+using DataDeps
 
 export fit!, predict
 
+function pos_tagger_datadep_register()
+    register(DataDep("POS Perceptron Tagger Weights",
+        """
+        The trained weights for the Average Perceptron Tagger on the Part of Speech tagging task.
+        """,
+        "https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pretrainedMod.bson.zip",
+        "52519cb3aea5d8f74368faedea831471e5df34567de4748d15decea7424743d3",
+        post_fetch_method = function(fn)
+            unpack(fn)
+            rm("__MACOSX", recursive=true)
+            file = readdir()[1]
+            mv(file, "POSWeights.bson")
+        end
+    ))
+end
+
 """
 This file contains the Average Perceptron model and Perceptron Tagger which was original implemented by Matthew Honnibal.
@@ -57,7 +74,7 @@ end
 
 """
 Applying the perceptron learning algorithm
-Increment the truth weights and decrementing the guess weights
+Increment the truth weights and decrement the guess weights if the guess is wrong
 """
 function update(self::AveragePerceptron, truth, guess, features)
@@ -111,22 +128,32 @@ function average_weights(self::AveragePerceptron)
 end
 
 """
-PERCEPTRON TAGGER
+# PERCEPTRON TAGGER
 
 This struct contains the POS tagger "PerceptronTagger" which uses model in "AveragePerceptron"
 In this training can be done and weights can be saved
 Or a pretrain weights can be used (which are trained on same features) and train more or can be used to predict
 
-To train:
-tagger = PerceptronTagger(false)
-fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
+## To train:
+
+```julia
+julia> tagger = PerceptronTagger(false)
+
+julia> fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
+```
 
-To load pretrain model:
-tagger = PerceptronTagger(true)
+## To load the pretrained model:
 
-To predict tag:
-predict(tagger, ["today", "is"])
+```julia
+julia> tagger = PerceptronTagger(true)
+```
+
+## To predict tags:
+
+```julia
+julia> predict(tagger, ["today", "is"])
+```
 """
 mutable struct PerceptronTagger
     model :: AveragePerceptron
@@ -142,11 +169,9 @@ end
 
 function PerceptronTagger(load::Bool)
     self = PerceptronTagger()
-    """
-    If load is true then a pretrain model will be import from location
-    """
+    # If load is true then a pretrained model will be imported from the DataDeps location
     if load
-        location = "src/pretrainedMod.bson";
+        location = joinpath(datadep"POS Perceptron Tagger Weights", "POSWeights.bson")
         pretrained = BSON.load(location)
         self.model.weights = pretrained[:weights]
         self.tagdict = pretrained[:tagdict]
@@ -201,11 +226,13 @@ end
 """
 Converting the token into a feature representation, implemented as Dict
 If the features change, a new model should be trained
-params:
-i - index of word(or token) in sentence
-word - token
-context - array of tokens with starting and ending specifiers
-prev == "-START-" prev2 == "-START2-" - Start specifiers
+
+# Arguments:
+
+- `i` - index of the word (or token) in the sentence
+- `word` - the token itself
+- `context` - array of tokens with start and end specifiers
+- `prev`, `prev2` - the previous two tags, initialized to the start specifiers `"-START-"` and `"-START2-"`
 """
 function getFeatures(self::PerceptronTagger, i, word, context, prev, prev2)
     function add(sep, name, args...)
@@ -252,8 +279,10 @@ function getFeatures(self::PerceptronTagger, i, word, context, prev, prev2)
 end
 
 """
-Used for predicting the tags for given tokens
-tokens - array of tokens
+    predict(::PerceptronTagger, tokens)
+    predict(::PerceptronTagger, sentence)
+
+Predict the tags for the given sentence or array of tokens.
 """
 function predict(self::PerceptronTagger, tokens::Vector{String})
     prev, prev2 = self.START
@@ -273,17 +302,38 @@ function predict(self::PerceptronTagger, tokens::Vector{String})
     return output
 end
 
+function (tagger::PerceptronTagger)(input)
+    predict(tagger, input)
+end
+
+predict(tagger::PerceptronTagger, sentence::String) =
+    predict(tagger, tokenize(Languages.English(), sentence))
+predict(tagger::PerceptronTagger, sd::StringDocument) =
+    predict(tagger, text(sd))
+predict(tagger::PerceptronTagger, fd::FileDocument) =
+    predict(tagger, text(fd))
+predict(tagger::PerceptronTagger, td::TokenDocument) =
+    predict(tagger, tokens(td))
+function predict(tagger::PerceptronTagger, ngd::NGramDocument)
+    @warn "POS tagging for NGramDocument not available."
+end
+
+
+
 """
+    fit!(::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer)
+
 Used for training a new model or can be used for training an existing model by using pretrained weigths and classes
 Contains main training loop for number of epochs.
 After training weights, tagdict and classes are stored in the specified location.
 
-params:
-sentences - array of the all sentences
-save_loc - to specify the saving location
-nr_iter - total number of training iterations for given sentences(or number of epochs)
+# Arguments:
+- `::PerceptronTagger` : The `PerceptronTagger` model to train
+- `sentences::Vector{Vector{Tuple{String, String}}}` : Array of all token sequences with their target POS tags
+- `save_loc::String` : Path where the trained weights will be saved
+- `nr_iter::Integer` : Total number of training iterations over the given sentences (i.e. the number of epochs)
 """
 function fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer)
     self._sentences = []
diff --git a/src/pretrainedMod.bson b/src/pretrainedMod.bson
deleted file mode 100644
index 8594dd45..00000000
Binary files a/src/pretrainedMod.bson and /dev/null differ
diff --git a/test/averagePerceptronTagger.jl b/test/averagePerceptronTagger.jl
index 7b3450f5..ff80a37f 100644
--- a/test/averagePerceptronTagger.jl
+++ b/test/averagePerceptronTagger.jl
@@ -3,21 +3,41 @@ using TextAnalysis: AveragePerceptron
 
 @testset "Average Perceptron Tagger" begin
     tagger = PerceptronTagger(false)
-    @test typeof(tagger.classes) == Set{Any}
-    @test length(tagger.classes) == 0
-    @test typeof(tagger.model) == AveragePerceptron
+    @testset "Basic" begin
+        @test typeof(tagger.classes) == Set{Any}
+        @test length(tagger.classes) == 0
+        @test typeof(tagger.model) == AveragePerceptron
 
-    fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
-    @test length(keys(tagger.model.weights)) == 51
-    @test tagger.classes == tagger.model.classes == Set(["JJ", "VBZ", "NN"])
-end
+        fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
+        @test length(keys(tagger.model.weights)) == 51
+        @test tagger.classes == tagger.model.classes == Set(["JJ", "VBZ", "NN"])
+    end
+
+    @testset "Average Perceptron Tagger (pretrained)" begin
+        tagger = PerceptronTagger(true)
+
+        @test typeof(tagger.classes) == Set{Any}
+        @test length(tagger.classes) == 75
+        @test typeof(tagger.model) == AveragePerceptron
+    end
+
+    sample_file = joinpath(dirname(@__FILE__), "data", "poem.txt")
 
-##Uncomment these when pretrained Model file is present in the directory
-# @testset "Average Perceptron Tagger (pretrained)" begin
-#     tagger = PerceptronTagger(true)
-#
-#     @test typeof(tagger.classes) == Set{Any}
-#     @test length(tagger.classes) == 75
-#     @test typeof(tagger.model) == AveragePerceptron
-# end
+    @testset "Tagging over sentences and documents" begin
+        tagger = PerceptronTagger(true)
+        text = "This is a text"
+        @test tagger(text) == predict(tagger, text)
+
+        sd = StringDocument(text)
+        @test length(predict(tagger, text)) == 4
+        @test length(predict(tagger, sd)) == 4
+
+        text2 = read(sample_file, String)
+        fd = FileDocument(sample_file)
+        @test length(predict(tagger, fd)) == length(predict(tagger, text2))
+
+        td = TokenDocument(text)
+        @test length(predict(tagger, td)) == 4
+    end
+end
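
A quick usage sketch of the API this diff adds, for reviewers trying the branch locally. It assumes the package is loaded with these changes applied; the sample sentence is illustrative, and `DATADEPS_ALWAYS_ACCEPT` is the same switch the CI configs above set so the weights download without an interactive prompt.

```julia
using TextAnalysis

# Skip the interactive DataDeps confirmation, as in .travis.yml/appveyor.yml above.
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

# PerceptronTagger(true) resolves the "POS Perceptron Tagger Weights" DataDep;
# the first call downloads pretrainedMod.bson.zip, unpacks it, and leaves
# POSWeights.bson in place via post_fetch_method.
tagger = PerceptronTagger(true)

# The new predict methods all funnel into the token-level predict:
predict(tagger, "This is a text")              # String is tokenized first
predict(tagger, ["This", "is", "a", "text"])   # pre-tokenized input
predict(tagger, StringDocument("This is a text"))

# The functor definition makes the tagger itself callable:
tagger("This is a text")
```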
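Similarly, a minimal sketch of the training path described in the `fit!` docstring above; the toy sentence, save path, and epoch count are illustrative only:

```julia
using TextAnalysis

tagger = PerceptronTagger(false)   # fresh, untrained tagger

# Training data: each sentence is a Vector of (token, tag) tuples.
sentences = [[("today","NN"), ("is","VBZ"), ("good","JJ"), ("day","NN")]]

# Runs the training loop for 5 epochs, then stores weights, tagdict,
# and classes at the given save location (BSON).
fit!(tagger, sentences, "myPOSWeights.bson", 5)
```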