diff --git a/Project.toml b/Project.toml
index 7d6b695..9d89362 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,12 +6,15 @@ version = "0.1.1"
 [deps]
 BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
+CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
diff --git a/src/TextModels.jl b/src/TextModels.jl
index 79d6223..f437cb1 100644
--- a/src/TextModels.jl
+++ b/src/TextModels.jl
@@ -39,27 +39,28 @@ module TextModels
 
     # ULMFiT
-    #module ULMFiT
-    #    using ..TextAnalysis
-    #    using DataDeps
-    #    using Flux
-    #    using Tracker
-    #    using BSON
-    #    include("ULMFiT/utils.jl")
-    #    include("ULMFiT/datadeps.jl")
-    #    include("ULMFiT/data_loaders.jl")
-    #    include("ULMFiT/custom_layers.jl")
-    #    include("ULMFiT/pretrain_lm.jl")
-    #    include("ULMFiT/fine_tune_lm.jl")
-    #    include("ULMFiT/train_text_classifier.jl")
-    #end
-    #export ULMFiT
+    module ULMFiT
+        using TextAnalysis
+        using DataDeps
+        using Flux
+        using Zygote
+        using BSON
+        using CorpusLoaders
+        include("ULMFiT/utils.jl")
+        include("ULMFiT/datadeps.jl")
+        include("ULMFiT/data_loaders.jl")
+        include("ULMFiT/custom_layers.jl")
+        include("ULMFiT/pretrain_lm.jl")
+        include("ULMFiT/fine_tune_lm.jl")
+        include("ULMFiT/train_text_classifier.jl")
+    end
+    export ULMFiT
 
     function __init__()
         pos_tagger_datadep_register()
         ner_datadep_register()
         pos_datadep_register()
-        #ULMFiT.ulmfit_datadep_register()
+        ULMFiT.ulmfit_datadep_register()
 
         global sentiment_model = artifact"sentiment_model"
     end
diff --git a/src/ULMFiT/custom_layers.jl b/src/ULMFiT/custom_layers.jl
index e402c7d..ece14f2 100644
--- a/src/ULMFiT/custom_layers.jl
+++ b/src/ULMFiT/custom_layers.jl
@@ -8,7 +8,7 @@ This file contains the custom layers defined for this model:
     PooledDense
 """
 
-import Flux: gate, _testmode!, _dropout_kernel
+import Flux: gate, testmode!, _dropout_kernel
 
 reset_masks!(entity) = nothing
 reset_probability!(entity) = nothing
@@ -44,12 +44,12 @@ Moreover this also follows the Vartional DropOut citeria, that is,
 the drop mask is remains same for a whole training pass.
 This is done by saving the masks in 'maskWi' and 'maskWh' fields
 """
-mutable struct WeightDroppedLSTMCell{A, V, M}
+mutable struct WeightDroppedLSTMCell{A, V, S, M}
     Wi::A
     Wh::A
     b::V
-    h::V
-    c::V
+    h::S
+    c::S
     p::Float64
     maskWi::M
     maskWh::M
@@ -60,17 +60,17 @@ function WeightDroppedLSTMCell(in::Integer, out::Integer, p::Float64=0.0;
         init = Flux.glorot_uniform)
     @assert 0 ≤ p ≤ 1
     cell = WeightDroppedLSTMCell(
-        param(init(out*4, in)),
-        param(init(out*4, out)),
-        param(init(out*4)),
-        param(zeros(Float32, out)),
-        param(zeros(Float32, out)),
+        init(out*4, in),
+        init(out*4, out),
+        init(out*4),
+        reshape(zeros(Float32, out),out, 1),
+        reshape(zeros(Float32, out), out, 1),
         p,
         drop_mask((out*4, in), p),
        drop_mask((out*4, out), p),
        true
    )
-    cell.b.data[gate(out, 2)] .= 1
+    cell.b[gate(out, 2)] .= 1
     return cell
 end
@@ -88,9 +88,12 @@ function (m::WeightDroppedLSTMCell)((h, c), x)
     return (h′, c), h′
 end
 
-Flux.@treelike WeightDroppedLSTMCell
+Flux.@functor WeightDroppedLSTMCell
 
-_testmode!(m::WeightDroppedLSTMCell, test) = (m.active = !test)
+Flux.trainable(m::WeightDroppedLSTMCell) = (m.Wi, m.Wh, m.b, m.h, m.c)
+
+testmode!(m::WeightDroppedLSTMCell, mode=true) =
+    (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
 
 """
     WeightDroppedLSTM(in::Integer, out::Integer, p::Float64=0.0)
@@ -106,7 +109,7 @@ julia> wd = WeightDroppedLSTM(4, 5, 0.3);
 function WeightDroppedLSTM(a...; kw...)
     cell = WeightDroppedLSTMCell(a...;kw...)
     hidden = (cell.h, cell.c)
-    return Flux.Recur(cell, hidden, hidden)
+    return Flux.Recur(cell, hidden)
 end
 
 """
@@ -155,7 +158,9 @@ end
 
 AWD_LSTM(in::Integer, out::Integer, p::Float64=0.0; kw...) =
     AWD_LSTM(WeightDroppedLSTM(in, out, p; kw...), -1, [])
 
-Flux.@treelike AWD_LSTM
+Flux.@functor AWD_LSTM
+
+Flux.trainable(m::AWD_LSTM) = (m.layer,)
 
 (m::AWD_LSTM)(in) = m.layer(in)
@@ -184,12 +189,12 @@ function asgd_step!(iter::Integer, layer::AWD_LSTM)
         p = get_trainable_params([layer])
         avg_fact = 1/max(iter - layer.T + 1, 1)
         if avg_fact != 1
-            layer.accum = layer.accum .+ Tracker.data.(p)
+            layer.accum = layer.accum .+ p
             for (ps, accum) in zip(p, layer.accum)
-                Tracker.data(ps) .= avg_fact*accum
+                ps .= avg_fact*accum
             end
         else
-            layer.accum = deepcopy(Tracker.data.(p))   # Accumulator for ASGD
+            layer.accum = deepcopy(p)   # Accumulator for ASGD
         end
     end
     return
@@ -230,7 +235,8 @@ function (vd::VarDrop)(x)
     return (x .* vd.mask)
 end
 
-_testmode!(vd::VarDrop, test) = (vd.active = !test)
+testmode!(m::VarDrop, mode=true) =
+    (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
 
 # method for reseting mask of VarDrop
 reset_masks!(vd::VarDrop) = (vd.reset = true)
@@ -270,7 +276,7 @@ end
 function DroppedEmbeddings(in::Integer, embed_size::Integer, p::Float64=0.0;
             init = Flux.glorot_uniform)
     de = DroppedEmbeddings{AbstractArray, typeof(p)}(
-            param(init(in, embed_size)),
+            init(in, embed_size),
             p,
             drop_mask((in,), p),
             true
         )
@@ -283,9 +289,10 @@ function (de::DroppedEmbeddings)(x::AbstractArray, tying::Bool=false)
     return tying ? dropped * x : transpose(dropped[x, :])
 end
 
-Flux.@treelike DroppedEmbeddings
+Flux.@functor DroppedEmbeddings (emb,)
 
-_testmode!(de::DroppedEmbeddings, test) = (de.active = !test)
+testmode!(m::DroppedEmbeddings, mode=true) =
+    (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
 
 function reset_masks!(de::DroppedEmbeddings)
     de.mask = drop_mask(de.mask, de.p)
@@ -324,10 +331,10 @@ PooledDense(W, b) = PooledDense(W, b, identity)
 
 function PooledDense(hidden_sz::Integer, out::Integer, σ = identity;
                initW = Flux.glorot_uniform, initb = (dims...) -> zeros(Float32, dims...))
-return PooledDense(param(initW(out, hidden_sz*3)), param(initb(out)), σ)
+return PooledDense(initW(out, hidden_sz*3), initb(out), σ)
 end
 
-Flux.@treelike PooledDense
+Flux.@functor PooledDense
 
 function (a::PooledDense)(x)
     W, b, σ = a.W, a.b, a.σ
diff --git a/src/ULMFiT/data_loaders.jl b/src/ULMFiT/data_loaders.jl
index f59e403..839b408 100644
--- a/src/ULMFiT/data_loaders.jl
+++ b/src/ULMFiT/data_loaders.jl
@@ -27,29 +27,29 @@ function imdb_preprocess(doc::AbstractDocument)
         length(word) == 1 && return [word]
         return split(word, symbol)
     end
-    text = text(doc)
-    remove_corrupt_utf8!(text)
-    remove_case!(text)
-    prepare!(text, strip_html_tags)
-    tokens = tokens(text)
+    text_ = doc
+    remove_corrupt_utf8!(text_)
+    remove_case!(text_)
+    prepare!(text_, strip_html_tags)
+    tokens_ = tokens(text_)
     for symbol in [',', '.', '-', '/', "'s"]
-        tokens = split_word.(tokens, symbol)
+        tokens_ = split_word.(tokens_, symbol)
         temp = []
-        for token in tokens
+        for token_ in tokens_
             try
-                append!(temp, put(token, symbol))
+                append!(temp, put(token_, symbol))
             catch
-                append!(temp, token)
+                append!(temp, token_)
             end
         end
-        tokens = temp
+        tokens_ = temp
     end
-    deleteat!(tokens, findall(x -> isequal(x, "")||isequal(x, " "), tokens))
-    return tokens
+    deleteat!(tokens_, findall(x -> isequal(x, "")||isequal(x, " "), tokens_))
+    return tokens_
 end
 
 # Loads WikiText-103 corpus and output a Channel to give a mini-batch at each call
-function load_wikitext_103(batchsize::Integer, bptt::Integer; type = "train")
+function load_wikitext_103(batchsize::Integer=16, bptt::Integer=70; type = "train")
     corpuspath = joinpath(datadep"WikiText-103", "wiki.$(type).tokens")
     corpus = read(open(corpuspath, "r"), String)
     corpus = tokenize(corpus)
@@ -58,13 +58,13 @@ end
 
 # IMDB Data loaders for Sentiment Analysis specifically
 # IMDB data loader for fine-tuning Language Model
-function imdb_fine_tune_data(batchsize::Integer, bptt::Integer, num_examples::Integer=50000)
+function imdb_fine_tune_data(batchsize::Integer=16, bptt::Integer=70, num_examples::Integer=50000)
     imdb_dataset = IMDB("train_unsup")
     dataset = []
-    for path in imdb_dataset.filepaths    #extract data from the files in directory and put into channel
+    for path in imdb_dataset.filepaths[1:num_examples]    #extract data from the files in directory and put into channel
        open(path) do fileio
            cur_text = read(fileio, String)
-           append!(dataset, imdb_preprocess(cur_text))
+           append!(dataset, imdb_preprocess(StringDocument(cur_text)))
        end #open
    end #for
    return Channel(x -> generator(x, dataset; batchsize=batchsize, bptt=bptt))
diff --git a/src/ULMFiT/fine_tune_lm.jl b/src/ULMFiT/fine_tune_lm.jl
index 17f33b9..e18edf3 100644
--- a/src/ULMFiT/fine_tune_lm.jl
+++ b/src/ULMFiT/fine_tune_lm.jl
@@ -27,14 +27,14 @@ NOTE: length(opts) == length(layers)
 function discriminative_step!(layers, ηL::Float64, l, opts::Vector)
     @assert length(opts) == length(layers)
     # Gradient calculation
-    grads = Tracker.gradient(() -> l, get_trainable_params(layers))
+    grads = Zygote.gradient(() -> l, get_trainable_params(layers))
 
     # discriminative step
     ηl = ηL/(2.6^(length(layers)-1))
     for (layer, opt) in zip(layers, opts)
         opt.eta = ηl
         for ps in get_trainable_params([layer])
-            Tracker.update!(opt, ps, grads[ps])
+            Flux.Optimise.update!(opt, ps, grads)
         end
         ηl *= 2.6
     end
@@ -50,18 +50,17 @@ This function contains main training loops for fine-tuning the language model.
 To use this funciton, an instance of LanguageModel and a data loader is needed.
 Read the docs for more info about arguments
 """
-function fine_tune_lm!(lm::LanguageModel, data_loader::Channel=imdb_fine_tune_data,
-    stlr_cut_frac::Float64=0.1, stlr_ratio::Float32=32, stlr_η_max::Float64=4e-3;
+function fine_tune_lm!(lm=LanguageModel(), data_loader=imdb_fine_tune_data,
+    stlr_cut_frac::Float64=0.1, stlr_ratio::Float32=Float32(32), stlr_η_max::Float64=4e-3;
     epochs::Integer=1, checkpoint_itvl::Integer=5000)
 
     opts = [ADAM(0.001, (0.7, 0.99)) for i=1:4]
+    gen = data_loader()
+    num_of_iters = take!(gen)
     cut = num_of_iters * epochs * stlr_cut_frac
-
     # Fine-Tuning loops
     for epoch=1:epochs
         println("\nEpoch: $epoch")
-        gen = data_loader()
-        num_of_iters = take!(gen)
         T = num_of_iters-Int(floor((num_of_iters*2)/100))
         set_trigger!.(T, lm.layers)
         for i=1:num_of_iters
@@ -121,7 +120,7 @@ julia> insert!(vocab, 2, "_pad_")
 function set_vocab!(lm::LanguageModel, vocab::Vector)
     idxs = indices(vocab, lm.vocab)
     lm.vocab = vocab
-    lm.layers[1].emb = param(Tracker.data(lm.layers[1].emb)[idxs, :])
+    lm.layers[1].emb = param(lm.layers[1].emb[idxs, :])
     lm.layers[1].mask = gpu(drop_mask((length(vocab),), lm.layers[1].p))
     return
 end
diff --git a/src/ULMFiT/pretrain_lm.jl b/src/ULMFiT/pretrain_lm.jl
index 74bc573..27d2ad9 100644
--- a/src/ULMFiT/pretrain_lm.jl
+++ b/src/ULMFiT/pretrain_lm.jl
@@ -49,7 +49,7 @@ function LanguageModel(load_pretrained::Bool=false, vocabpath::String=joinpath(@
     return lm
 end
 
-Flux.@treelike LanguageModel
+Flux.@functor LanguageModel
 
 """
     test_lm(lm::LanguageModel, data_gen, num_of_iters::Integer; unknown_token::String="_unk_")
@@ -63,7 +63,7 @@ It returns loss, accuracy, precsion, recall and F1 score.
 
 julia> test_lm(lm, data_gen, 200, "
-    grads = Tracker.gradient(() -> l, p)
-    Tracker.update!(opt, p, grads)
+    grads = Zygote.gradient(() -> l, p)
+    Flux.Optimise.update!(opt, p, grads)
     return
 end
 
@@ -182,7 +182,7 @@ SAMPLING...
""" function sample(starting_text::AbstractDocument, lm::LanguageModel) testmode!(lm.layers) - model_layers = mapleaves(Tracker.data, lm.layers) + model_layers = lm.layers tokens = tokens(starting_text) word_indices = map(x -> indices([x], lm.vocab, "_unk_"), tokens) h = (model_layers.(word_indices))[end] diff --git a/src/ULMFiT/sentiment.jl b/src/ULMFiT/sentiment.jl index c70069d..3ab5479 100644 --- a/src/ULMFiT/sentiment.jl +++ b/src/ULMFiT/sentiment.jl @@ -48,12 +48,12 @@ function BinSentimentClassifier() ) ) Flux.loadparams!(sc, weights) - sc = mapleaves(Tracker.data, sc) + sc = sc Flux.testmode!(sc) return sc end -Flux.@treelike BinSentimentClassifier +Flux.@functor BinSentimentClassifier function (sc::BinSentimentClassifier)(x::TokenDocument) remove_case!(x) diff --git a/src/ULMFiT/train_text_classifier.jl b/src/ULMFiT/train_text_classifier.jl index e30912f..a032ea7 100644 --- a/src/ULMFiT/train_text_classifier.jl +++ b/src/ULMFiT/train_text_classifier.jl @@ -30,7 +30,7 @@ function TextClassifier(lm::LanguageModel=LanguageModel(), clsfr_out_sz::Integer ) end -Flux.@treelike TextClassifier +Flux.@functor TextClassifier """ Cross Validate @@ -48,7 +48,7 @@ gen will be used for validation """ function validate(tc::TextClassifier, gen::Channel, num_of_batches::Union{Colon, Integer}) n_classes = size(tc.linear_layers[end-2].W, 1) - classifier = mapleaves(Tracker.data, tc) + classifier = tc Flux.testmode!(classifier) loss = 0 iters = take!(gen) @@ -91,7 +91,7 @@ tracked_steps : This is the number of tracked time-steps for Truncated Backpro """ function forward(tc::TextClassifier, gen::Channel, tracked_steps::Integer=32) # swiching off tracking - classifier = mapleaves(Tracker.data, tc) + classifier = tc X = take!(gen) l = length(X) # Truncated Backprop through time @@ -203,13 +203,13 @@ All the preprocessing related to the used vocabulary should be done before using Use `prepare!` function to do preprocessing """ function predict(tc::TextClassifier, text_sents::Corpus) - classifier = mapleaves(Tracker.data, tc) + classifier = tc Flux.testmode!(classifier) predictions = [] expr(x) = indices(x, classifier.vocab, "_unk_") for text in text_sents - tokens = tokens(text) - h = classifier.rnn_layers.(expr.(tokens)) + tokens_ = tokens(text) + h = classifier.rnn_layers.(expr.(tokens_)) probability_dist = classifier.linear_layers(h) class = argmax(probaility_dist) push!(predictions, class) diff --git a/src/ULMFiT/utils.jl b/src/ULMFiT/utils.jl index 691354f..64bfd11 100644 --- a/src/ULMFiT/utils.jl +++ b/src/ULMFiT/utils.jl @@ -27,8 +27,8 @@ end init_weights(extreme::AbstractFloat, dims...) = randn(Float32, dims...) 
.* sqrt(Float32(extreme)) # Generator, whenever it should be called two times since it gives X in first and y in second call -function generator(c::Channel, corpus::AbstractDocument; batchsize::Integer=64, bptt::Integer=70) - X_total = post_pad_sequences(chunk(tokens(corpus), batchsize)) +function generator(c::Channel, corpus; batchsize::Integer=64, bptt::Integer=70) + X_total = post_pad_sequences(Flux.chunk(corpus, batchsize)) n_batches = Int(floor(length(X_total[1])/bptt)) put!(c, n_batches) for i=1:n_batches diff --git a/test/ulmfit.jl b/test/ulmfit.jl index 8ea0092..b3820f9 100644 --- a/test/ulmfit.jl +++ b/test/ulmfit.jl @@ -4,7 +4,7 @@ using BSON @testset "Custom layers" begin @testset "WeightDroppedLSTM" begin wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3) - @test all(wd.init .== wd.state) + @test all((wd.cell.h, wd.cell.c) .== wd.state) @test size(wd.cell.Wi) == size(wd.cell.maskWi) @test size(wd.cell.Wh) == size(wd.cell.maskWh) @test wd.cell.active @@ -31,10 +31,10 @@ using BSON ULMFiT.asgd_step!(4, awd) @test length(awd.accum) == 3 temp = deepcopy(awd.accum[1][1]) - @test temp == Tracker.data(awd.layer.cell.Wi[1]) + @test temp == awd.layer.cell.Wi[1] ULMFiT.asgd_step!(5, awd) temp += temp - @test temp == Tracker.data(awd.accum[1][1]) + @test temp == awd.accum[1][1] @test length(params(awd)) == 5 end
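
The diff above repeats one migration pattern across every custom layer: `Flux.@treelike` becomes `Flux.@functor`, `param(...)` and `Tracker.data(...)` wrappers are dropped in favour of plain arrays, the private `_testmode!` hook is replaced by a `testmode!` method that follows the true/false/:auto convention, and training steps move from `Tracker.gradient`/`Tracker.update!` to `Zygote.gradient` with `Flux.Optimise.update!`. The sketch below shows that pattern in isolation; it is an illustrative toy, not part of the PR (the `ToyDrop` layer and its fields are invented here), and it assumes a Zygote-based Flux of roughly the version this PR targets.

using Flux, Zygote

# Toy dropout-style layer in the post-Tracker style used throughout this PR:
# plain arrays instead of param(...), an `active` flag, and a persistent mask.
mutable struct ToyDrop{A, M}
    W::A                           # trainable weights
    mask::M                        # persistent (non-trainable) drop mask
    p::Float64                     # drop probability
    active::Union{Bool, Nothing}
end

ToyDrop(in::Integer, out::Integer, p::Float64=0.5) =
    ToyDrop(Flux.glorot_uniform(out, in),
            (rand(Float32, out, in) .> p) ./ Float32(1 - p), p, true)

# @functor registers the fields for fmap/gpu movement; trainable restricts
# optimisation to W, mirroring the explicit trainable methods the diff adds
# for WeightDroppedLSTMCell and AWD_LSTM.
Flux.@functor ToyDrop
Flux.trainable(m::ToyDrop) = (m.W,)

# Same testmode! convention the PR adopts: true/false force the mode,
# nothing/:auto leave it unresolved.
function Flux.testmode!(m::ToyDrop, mode=true)
    m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode
    return m
end

# Forward pass: apply the mask unless the layer was explicitly put in test
# mode. A real Flux layer would resolve `nothing` against the global training
# state; this sketch simply treats `nothing` as active to stay self-contained.
(m::ToyDrop)(x) = something(m.active, true) ? (m.W .* m.mask) * x : m.W * x

# Implicit-parameter training step, in the style discriminative_step! uses
# after the change: Zygote.gradient over Flux.params, then
# Flux.Optimise.update! with the resulting Grads object.
layer = ToyDrop(4, 3, 0.3)
opt   = ADAM(1e-3)
x     = rand(Float32, 4, 8)
ps    = Flux.params(layer)
gs    = Zygote.gradient(() -> sum(abs2, layer(x)), ps)
Flux.Optimise.update!(opt, ps, gs)
Flux.testmode!(layer)              # stop applying the drop mask for evaluation

Restricting `trainable` this way keeps the persistent masks out of the parameter set the optimiser sees, which is why the diff pairs each `Flux.@functor` with an explicit `Flux.trainable` method (or, for DroppedEmbeddings, limits the functor to `(emb,)`).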