Example notebooks #234
@@ -1,3 +1,3 @@
build/
site/
src/examples/
@@ -1,40 +1,78 @@
# # Deep Kernel Learning with Flux
# ## Package loading
# We use a couple of useful packages to plot and optimize
# the different hyper-parameters
using KernelFunctions
using MLDataUtils
using Zygote
using Flux
using Distributions, LinearAlgebra
using Plots
using ProgressMeter
using AbstractGPs
pyplot();
default(; legendfontsize=15.0, linewidth=3.0);

Flux.@functor SqExponentialKernel
Flux.@functor KernelSum
Flux.@functor Matern32Kernel
Flux.@functor FunctionTransform

neuralnet = Chain(Dense(1, 3), Dense(3, 2))
k = SqExponentialKernel(FunctionTransform(neuralnet))
# ## Data creation
# We create a simple 1D problem with very different variations
xmin = -3;
xmax = 3;
x = range(xmin, xmax; length=100)
x_test = rand(Uniform(xmin, xmax), 200)
x, y = noisy_function(sinc, x; noise=0.1)
X = reshape(x, :, 1)
λ = [0.1]
function f(x, k, λ)
    return kernelmatrix(k, X, x; obsdim=1) *
           inv(kernelmatrix(k, X; obsdim=1) + exp(λ[1]) * I) *
           y
end
f(X, k, 1.0)
loss(k, λ) = ŷ -> sum(y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ)(f(X, k, λ))
loss(k, λ)
xmax = 3; # Limits
N = 150
noise = 0.01
x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset
target_f(x) = sinc(abs(x)^abs(x)) # We use sinc with a highly varying value
target_f(x::AbstractArray) = target_f(first(x))
y_train = target_f.(x_train) + randn(N) * noise
x_test = collect(eachrow(range(xmin, xmax; length=200))) # Testing dataset
spectral_mixture_kernel()
Review comment on `spectral_mixture_kernel()`: Should this be removed?
# ## Model definition
# We create a neural net with 3 dense layers
# The data is passed through the NN before being used in the kernel
neuralnet = Chain(Dense(1, 20), Dense(20, 30), Dense(30, 5))
# We use two cases:
# - The Squared Exponential Kernel
k = transform(SqExponentialKernel(), FunctionTransform(neuralnet))

# We use AbstractGPs.jl to define our model
gpprior = GP(k) # GP Prior
fx = AbstractGPs.FiniteGP(gpprior, x_train, noise) # Prior on f
fp = posterior(fx, y_train) # Posterior of f

# This computes the log evidence of `y`,
# which is going to be used as the objective
loss(y) = -logpdf(fx, y)

@info "Init Loss = $(loss(y_train))"

# Flux will automatically extract all the parameters of the kernel
ps = Flux.params(k)
# push!(ps,λ)
opt = Flux.Momentum(1.0)
##
for i in 1:10
    grads = Zygote.gradient(() -> loss(k, λ), ps)

# We show the initial prediction with the untrained model
p_init = Plots.plot(
    vcat(x_test...), target_f; lab="true f", title="Loss = $(loss(y_train))"
)
Plots.scatter!(vcat(x_train...), y_train; lab="data")
pred = marginals(fp(x_test))
Plots.plot!(vcat(x_test...), mean.(pred); ribbon=std.(pred), lab="Prediction")
# ## Training
anim = Animation()
nmax = 1000
opt = Flux.ADAM(0.1)
@showprogress for i in 1:nmax
    global grads = gradient(ps) do
        loss(y_train)
    end
    Flux.Optimise.update!(opt, ps, grads)
    p = Plots.scatter(x, y; lab="data", title="Loss = $(loss(k,λ))")
    Plots.plot!(x, f(X, k, λ); lab="Prediction", lw=3.0)
    display(p)
    if i % 100 == 0
        @info "$i/$nmax"
        L = loss(y_train)
        # @info "Loss = $L"
        p = Plots.plot(
            vcat(x_test...), target_f; lab="true f", title="Loss = $(loss(y_train))"
        )
        p = Plots.scatter!(vcat(x_train...), y_train; lab="data")
        pred = marginals(posterior(fx, y_train)(x_test))
        Plots.plot!(vcat(x_test...), mean.(pred); ribbon=std.(pred), lab="Prediction")
        frame(anim)
        display(p)
    end
end
gif(anim; fps=5)
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

Review discussion on this line:
- A suggested change was proposed.
- "It's from jupytext. I'd like to see if it's possible to get Literate.jl and jupytext to play along 100% instead of just 98%... fredrikekre/Literate.jl#131"
- "Even if it is supported, I would prefer the standard Literate syntax. Otherwise one either ends up with a mix of different styles or has to pay attention to use the non-standard jupytext formatting (at least non-standard in this environment). To me it seems this would add more confusion but not provide any benefits. If you want to play around and tune the code, I can recommend VSCode or the outdated Juno environment, which support executing single blocks and lines of literate files automatically in the REPL."
- "We can discuss trade-offs, but I don't think this claim is fair; I've explained my motivation on the Literate.jl issue I linked. Having the example notebooks in a format that can directly be edited interactively in Jupyter notebooks makes it easier to contribute notebooks for people who are comfortable with that workflow (like me, but it's a widely popular workflow), without requiring people to use VSCode (which is a lot harder to get into than firing up a Jupyter notebook)."
- "My impression is that in the Julia community the common workflow for writing Literate or Weave documents is to work with Juno or VSCode, to copy and run code in a REPL on the side, or to rebuild the output with Literate and Weave intermittently. I think it's fine to support additional workflows. From the issue I get the impression that there won't be official support for jupytext in the near future. And even if it is added, the second point still seems problematic: it seems jupytext requires a different syntax that deviates from the official syntax for Literate and Weave; if someone edits the document with jupytext, the syntax of the document would have to change, and so we might end up with a mix of the regular and the jupytext syntax. This seems confusing."
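To make the syntax point concrete (this is only an illustration, not part of the PR, and the exact markers depend on which jupytext script format is used), the same header cell might look roughly as follows in plain Literate.jl syntax versus jupytext's percent format:

# Literate.jl style: markdown lines are ordinary `# ` comments, code lines are left bare.
# # Gaussian process prior samples
using KernelFunctions

# Assumed jupytext percent-format style: cells carry explicit `# %%` markers.
# %% [markdown]
# # Gaussian process prior samples
# %%
using KernelFunctions

The `# -*- coding: utf-8 -*-` header above is an example of such jupytext-generated metadata, per the author's comment in the thread.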
# # Gaussian process prior samples
#
# The kernels defined in this package can also be used to specify the
# covariance of a Gaussian process prior.
# A Gaussian process (GP) is defined by its mean function $m(\cdot)$ and its covariance function or kernel $k(\cdot, \cdot')$:

Review comment on the $k(\cdot, \cdot')$ notation: "I have never seen the notation"; a suggested change was proposed.

# ```math
# f \sim \mathcal{GP}\big(m(\cdot), k(\cdot, \cdot')\big)
# ```
# The function values of the GP at a finite number of points $X = \{x_n\}_{n=1}^N$ follow a multivariate normal distribution with mean vector $\mathbf{m}$ and covariance matrix $\mathrm{K}$, where
# ```math
# \begin{aligned}
# \mathbf{m}_i &= m(x_i) \\
# \mathrm{K}_{i,j} &= k(x_i, x_j)
# \end{aligned}
# ```
# with $1 \le i, j \le N$.
#
# In this notebook we show samples from zero-mean GPs with different kernels.
## Load required packages
using KernelFunctions
using LinearAlgebra
using Distributions
using Plots
default(; lw=1.0, legendfontsize=15.0)
using Random: seed!
seed!(42);

# We now define a function that visualizes a kernel
# for us. We use the same randomness to obtain
# comparable samples.

num_inputs = 101
xlim = (-5, 5)
X = reshape(collect(range(xlim...; length=num_inputs)), :, 1)
num_samples = 11
v = randn(num_inputs, num_samples);
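# For reference, the sampling below relies on the standard Cholesky construction
# (a small jitter $\epsilon \mathrm{I}$ is added for numerical stability):
# ```math
# \mathrm{K} + \epsilon \mathrm{I} = \mathrm{L} \mathrm{L}^\top, \qquad
# \mathbf{f} = \mathrm{L} \mathbf{v}, \quad \mathbf{v} \sim \mathcal{N}(\mathbf{0}, \mathrm{I}),
# ```
# so that each column of `f` is (approximately) a draw from the zero-mean GP
# evaluated at the points in `X`.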
function visualize(k::Kernel; xref=0.0)
    K = kernelmatrix(k, X; obsdim=1)
    L = cholesky(K + 1e-6 * I)
    f = L.L * v

    p_kernel_2d = heatmap(
        K;
        yflip=true,
        colorbar=false,
        ylabel=string(k),
        framestyle=:none,
        #color=:blues,
        vlim=(0, 1),
        title=raw"$k(x, x')$",
    )

    p_kernel_cut = plot(
        X,
        k.(X, xref);
        title=string(raw"$k(x, ", xref, raw")$"),
        xlim=xlim,
        xticks=(xlim, xlim),
        label=nothing,
    )

    p_samples = plot(
        X,
        f;
        c="blue",
        title=raw"$f(x)$",
        ylim=(-3, 3),
        xlim=xlim,
        xticks=(xlim, xlim),
        label=nothing,
    )

    return plot(p_kernel_2d, p_kernel_cut, p_samples; layout=(1, 3), xlabel=raw"$x$")
end

# We can now visualize a kernel and show samples from
# a Gaussian process with this kernel:

visualize(SqExponentialKernel())

# This also allows us to compare different kernels:

kernel_classes = [Matern12Kernel, Matern32Kernel, Matern52Kernel, SqExponentialKernel]
plot(
    [visualize(k()) for k in kernel_classes]...,
    #layout=(length(kernel_classes), 1)
)
@@ -0,0 +1,159 @@
# # Kernel Ridge Regression
#
# Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression.

## Loading and setup of required packages
using KernelFunctions
using LinearAlgebra
using Distributions

## Plotting
using Plots;
default(; lw=2.0, legendfontsize=15.0);

using Random: seed!
seed!(42);

# ## From linear regression to ridge regression
# Here we use a one-dimensional toy problem. We generate data using the fourth-order polynomial $f(x) = (x+4)(x+1)(x-1)(x-3)$:

f_truth(x) = (x + 4) * (x + 1) * (x - 1) * (x - 3)

x_train = collect(-5:0.5:5)
x_test = collect(-5:0.1:5)

noise = rand(Uniform(-10, 10), size(x_train))
y_train = f_truth.(x_train) + noise
y_test = f_truth.(x_test)

plot(x_test, y_test; label=raw"$f(x)$")
scatter!(x_train, y_train; label="observations")

# For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# We predict at test inputs $\mathbf{x}_*$ using
# ```math
# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w}
# ```
# This is implemented by `linear_regression`:

function linear_regression(X, y, Xstar)
    weights = (X' * X) \ (X' * y)
    return Xstar * weights
end

# A linear regression fit to the above data set:

y_pred = linear_regression(x_train, y_train, x_test)
scatter(x_train, y_train; label="observations")
plot!(x_test, y_pred; label="linear fit")

# We can improve the fit by including additional features, i.e. generalizing to $\mathrm{X} = (\phi(x_n))_{n=1}^N$, where $\phi(x)$ constructs a feature vector for each input $x$. Here we include powers of the input, $\phi(x) = (1, x, x^2, \dots, x^d)$:

function featurize_poly(x; degree=1)
    xcols = [x .^ d for d in 0:degree]
    return hcat(xcols...)
end
Review comment on lines +55 to +58 (`featurize_poly`): a suggested change was proposed. "However, the function is so simple that it might be easier and more compact to just use …"
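The reviewer's exact suggestion is not shown above; purely as an illustration (and not necessarily what was proposed), a more compact broadcasting-based variant could look like this:

# Hypothetical compact variant of `featurize_poly` (illustration only):
# broadcasting the vector `x` against the exponents 0:degree yields the same
# N×(degree+1) feature matrix as the comprehension-plus-hcat version above.
featurize_poly_compact(x; degree=1) = x .^ (0:degree)'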
function featurized_fit_and_plot(degree)
    X = featurize_poly(x_train; degree=degree)
    Xstar = featurize_poly(x_test; degree=degree)
    y_pred = linear_regression(X, y_train, Xstar)
    scatter(x_train, y_train; legend=false, title="fit of order $degree")
    return plot!(x_test, y_pred)
end

plot([featurized_fit_and_plot(degree) for degree in 1:4]...)

# Note that the fit becomes perfect when we include exactly as many orders in the features as we have in the underlying polynomial (4).
#
# However, when increasing the number of features, we can quickly overfit to noise in the data set:

featurized_fit_and_plot(18)

# To counteract this unwanted behaviour, we can introduce regularization. This leads to *ridge regression* with $L_2$ regularization of the weights ([Tikhonov regularization](https://en.wikipedia.org/wiki/Tikhonov_regularization)).
# Instead of the weights in linear regression,
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# we introduce the ridge parameter $\lambda$:
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# As before, we predict at test inputs $\mathbf{x}_*$ using
# ```math
# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w}
# ```
# This is implemented by `ridge_regression`:

function ridge_regression(X, y, Xstar, lambda)
    weights = (X' * X + lambda * I) \ (X' * y)
    return Xstar * weights
end

function regularized_fit_and_plot(degree, lambda)
    X = featurize_poly(x_train; degree=degree)
    Xstar = featurize_poly(x_test; degree=degree)
    y_pred = ridge_regression(X, y_train, Xstar, lambda)
    scatter(x_train, y_train; legend=false, title="\$\\lambda=$lambda\$")
    return plot!(x_test, y_pred)
end

plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...)

# Instead of constructing the feature matrix explicitly, we can use *kernels* to replace inner products of feature vectors with a kernel evaluation: $\langle \phi(x), \phi(x') \rangle = k(x, x')$ or $\mathrm{X} \mathrm{X}^\top = \mathrm{K}$, where $\mathrm{K}_{ij} = k(x_i, x_j)$.
#
# To apply this "kernel trick" to ridge regression, we can rewrite the ridge estimate for the weights
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# using the [matrix inversion lemma](https://tlienart.github.io/pub/csml/mtheory/matinvlem.html#basic_lemmas)
# as
# ```math
# \mathbf{w} = \mathrm{X}^\top (\mathrm{X} \mathrm{X}^\top + \lambda \mathbb{1})^{-1} \mathbf{y}
# ```
# where we can now replace the inner product with the kernel matrix,
# ```math
# \mathbf{w} = \mathrm{X}^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y}
# ```
# And the prediction yields another inner product,
# ```math
# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} = \langle \mathbf{x}_*, \mathbf{w} \rangle = \mathbf{k}_*^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y}
# ```
# where $(\mathbf{k}_*)_n = k(x_*, x_n)$.
#
# This is implemented by `kernel_ridge_regression`:

function kernel_ridge_regression(k, X, y, Xstar, lambda)
    K = kernelmatrix(k, X)
    kstar = kernelmatrix(k, Xstar, X)
    return kstar * ((K + lambda * I) \ y)
end

# Now, instead of explicitly constructing features, we can simply pass in a `PolynomialKernel` object:

function kernelized_fit_and_plot(kernel, lambda=1e-4)
    y_pred = kernel_ridge_regression(kernel, x_train, y_train, x_test, lambda)
    if kernel isa PolynomialKernel
        title = string("order ", kernel.degree)
    else
        title = string(kernel)
    end
    scatter(x_train, y_train; label=nothing)
    p = plot!(
        x_test,
        y_pred;
        label=nothing,
        title=title,
        #title=string(raw"$\lambda=", lambda, raw"$")
    )
    return p
end

plot([kernelized_fit_and_plot(PolynomialKernel(; degree=degree, c=1)) for degree in 1:4]...)

# However, we can now also use kernels that would have an infinite-dimensional feature expansion, such as the squared exponential kernel:

kernelized_fit_and_plot(SqExponentialKernel())
Review discussion:
- "IMO the example should be moved to AbstractGPs."
- "Happy to move this one, I think it's a bit less about kernels themselves. @theogf, as this was yours, I would like to hear what you think before removing it here?"
- "I replied more generally on the main comments."