Merge pull request #14 from JuliaTrustworthyAI/fix-samplers

Fix remaining issues
JuliaTrustworthyAI · Jun 11, 2024 · 9c1ddd3 · 9c1ddd3 · pat-alt · Jun 11, 2024
2 parents 63c1687 + 3f4ff73
commit 9c1ddd3
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 27 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 
 *Note*: We try to adhere to these practices as of version [v1.1.0].
 
+## Version [1.2.2] - 2024-06-11
+
+### Changed
+
+- Renamed `PCD` to `PMC` to make it clearer that this simply runs persistent Markov chains, not contrastive divergence. [#14]
+- Improved `mcmc_samples` to now allow mini-batch training. [#14]
+
 ## Version [1.2.1] - 2024-06-06
 
 ### Removed
@@ -16,7 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 
 ### Added
 
-- Added support for running persistent contrastive divergence (PCD) on models. [#10]
+- Added support for running Persistent Markov Chains (PMC) on models. [#10]
 
 ## Version [1.1.1] - 2024-06-04
 

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TaijaBase"
 uuid = "10284c91-9f28-4c9a-abbf-ee43576dfff6"
 authors = ["Patrick Altmeyer"]
-version = "1.2.1"
+version = "1.2.2"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"

diff --git a/src/Samplers/Samplers.jl b/src/Samplers/Samplers.jl
@@ -16,7 +16,7 @@ abstract type AbstractSampler end
 
 export AbstractSampler, AbstractSamplingRule
 export ConditionalSampler, UnconditionalSampler, JointSampler
-export PCD
+export PMC
 export energy
 
 include("utils.jl")
@@ -32,7 +32,21 @@ include("optimizers.jl")
         kwargs...,
     )
 
-Base method for generating Monte Carlo samples for a given models, sampler and sampling rule.
+Sampling method for `AbstractSampler`. This method generates samples from the model's learned distribution. 
+
+# Arguments
+
+- `sampler::AbstractSampler`: The sampler to use.
+- `model`: The model to sample from.
+- `rule::AbstractSamplingRule`: The sampling rule to use.
+- `niter::Int=100`: The number of iterations to perform.
+- `clip_grads::Union{Nothing,AbstractFloat}=nothing`: The threshold at which gradients are clipped. This is useful to prevent exploding gradients when training joint energy models. If `nothing`, no clipping is performed.
+- `n_samples::Union{Nothing,Int}=nothing`: The number of samples to generate.
+- `kwargs...`: Additional keyword arguments.
+
+# Returns
+
+- `input_samples`: The samples generated by the sampler.
 """
 function (sampler::AbstractSampler)(
     model,
@@ -78,7 +92,7 @@ function (sampler::AbstractSampler)(
 end
 
 """
-    PCD(
+    PMC(
         sampler::AbstractSampler,
         model,
         rule::AbstractSamplingRule;
@@ -87,13 +101,13 @@ end
         kwargs...,
     )
 
-Persistent Contrastive Divergence (PCD) algorithm. This algorithm was originally proposed by [Tieleman (2008)](https://www.cs.toronto.edu/~tijmen/pcd/pcd.pdf) and is a variant of the Contrastive Divergence (CD) algorithm. The main difference is that PCD uses a persistent chain to estimate the negative phase of the gradient. This is done by keeping the state of the Markov chain between iterations. 
+Runs a Persistent Markov Chain (PMC) using the sampler and model. Persistent Markov Chains are used, for example, for Persistent Contrastive Divergence (PCD) ([Tieleman (2008)](https://www.cs.toronto.edu/~tijmen/pcd/pcd.pdf)), a variant of the Contrastive Divergence (CD) algorithm. The main difference is that PCD uses a persistent chain to estimate the negative phase of the gradient. This is done by keeping the state of the Markov chain between iterations. 
 
 In our context, the sampler is the persistent chain and the model is a supervised model. The sampler generates samples from the model's learned distribution. 
 
 # Note
 
-This function does not perform any training. It only generates samples from the model. For training Joint Energy Models, see [JointEnergyModels.jl](https://github.com/JuliaTrustworthyAI/JointEnergyModels.jl).
+This function does not perform any training. It only generates samples from the model. In other words, there is no Contrastive Divergence. For training Joint Energy Models, see [JointEnergyModels.jl](https://github.com/JuliaTrustworthyAI/JointEnergyModels.jl).
 
 # Arguments
 
@@ -108,7 +122,7 @@ This function does not perform any training. It only generates samples from the
 
 - `sampler.buffer`: The buffer containing the samples generated by the sampler.
 """
-function PCD(
+function PMC(
     sampler::AbstractSampler,
     model,
     rule::AbstractSamplingRule;
@@ -196,14 +210,18 @@ function mcmc_samples(
     end
     mod = (inputs = input_samples, energy = energy)
     s = Optimisers.setup(rule, mod)
+    ntotal = size(input_samples, ndims(input_samples))
+    dl = DataLoader((1:ntotal,), batchsize = sampler.batch_size)
 
     # Training:
     i = 1
     while i <= niter
-        grad = gradient(mod) do m  # calculate the gradients
-            m.energy(sampler, model, m.inputs, y)
+        for (i,) in dl
+            grad = gradient(mod) do m  # calculate the gradients
+                m.energy(sampler, model, m.inputs[:, i], y)
+            end
+            s, mod = Optimisers.update(s, mod, grad[1])
         end
-        s, mod = Optimisers.update(s, mod, grad[1])
         i += 1
     end
 
@@ -291,14 +309,18 @@ function mcmc_samples(
     # Setup:
     mod = (inputs = input_samples, energy = energy)
     s = Optimisers.setup(rule, mod)
+    ntotal = size(input_samples, ndims(input_samples))
+    dl = DataLoader((1:ntotal,), batchsize = sampler.batch_size)
 
     # Training:
     i = 1
     while i <= niter
-        grad = gradient(mod) do m  # calculate the gradients
-            m.energy(sampler, model, m.inputs, nothing)
+        for (i,) in dl
+            grad = gradient(mod) do m  # calculate the gradients
+                m.energy(sampler, model, m.inputs[:, i], nothing)
+            end
+            s, mod = Optimisers.update(s, mod, grad[1])
         end
-        s, mod = Optimisers.update(s, mod, grad[1])
         i += 1
     end
 
@@ -375,15 +397,19 @@ function mcmc_samples(
     # Setup:
     mod = (inputs = input_samples, energy = energy)
     s = Optimisers.setup(rule, mod)
+    ntotal = size(input_samples, ndims(input_samples))
+    dl = DataLoader((1:ntotal,), batchsize = sampler.batch_size)
 
     # Training:
     i = 1
     while i <= niter
         y = rand(sampler.𝒟y)
-        grad = gradient(mod) do m  # calculate the gradients
-            m.energy(sampler, model, m.inputs, y)
+        for (i,) in dl
+            grad = gradient(mod) do m  # calculate the gradients
+                m.energy(sampler, model, m.inputs[:, i], y)
+            end
+            s, mod = Optimisers.update(s, mod, grad[1])
         end
-        s, mod = Optimisers.update(s, mod, grad[1])
         i += 1
     end
 

diff --git a/test/samplers.jl b/test/samplers.jl
@@ -9,7 +9,7 @@ using TaijaBase.Samplers:
     ConditionalSampler,
     UnconditionalSampler,
     AbstractSampler,
-    PCD
+    PMC
 
 @testset "Samplers" begin
 
@@ -49,7 +49,7 @@ using TaijaBase.Samplers:
         end
     end
 
-    @testset "Persistent Contrastive Divergence (PCD)" begin
+    @testset "Persistent Markov Chains (PMC)" begin
 
         # Train a simple neural network on the data (classification)
         Xtrain = MLJBase.matrix(X) |> permutedims
@@ -69,18 +69,18 @@ using TaijaBase.Samplers:
             println("Accuracy: ", mean(Flux.onecold(nn(Xtrain)) .== Flux.onecold(ytrain)))
         end
 
-        # PCD
+        # PMC
         bs = 10
         ntrans = 100
         niter = 20
         # Conditionally sample from first class:
         smpler =
             ConditionalSampler(𝒟x, 𝒟y, input_size = size(Xmat)[1:end-1], batch_size = bs)
-        x1 = PCD(smpler, nn, ImproperSGLD(); ntransitions = ntrans, niter = niter, y = 1)
+        x1 = PMC(smpler, nn, ImproperSGLD(); ntransitions = ntrans, niter = niter, y = 1)
         # Conditionally sample from second class:
         smpler =
             ConditionalSampler(𝒟x, 𝒟y, input_size = size(Xmat)[1:end-1], batch_size = bs)
-        x2 = PCD(smpler, nn, ImproperSGLD(); ntransitions = ntrans, niter = niter, y = 2)
+        x2 = PMC(smpler, nn, ImproperSGLD(); ntransitions = ntrans, niter = niter, y = 2)
 
         # using Plots
         # plt = scatter(Xtrain[1, :], Xtrain[2, :], color=Int.(y.refs), group=Int.(y.refs), label=["X|y=0" "X|y=1"], alpha=0.1)
@@ -118,11 +118,7 @@ using TaijaBase.Samplers:
         prep_y(y) = reshape(y, 1, :) |> gpu
         train_X, test_X = prep_X.((train[:, features], test[:, features]))
         train_y, test_y = prep_y.((train[:, target], test[:, target]))
-        train_set = Flux.DataLoader(
-            (train_X, train_y),
-            batchsize = 100,
-            shuffle = false,
-        )
+        train_set = Flux.DataLoader((train_X, train_y), batchsize = 100, shuffle = false)
 
         function train_logreg(; steps::Int = 1000, opt = Flux.Descent(2))
             Random.seed!(1)