Example notebooks #234
@@ -1,3 +1,3 @@
build/
site/
src/examples/
@@ -1,40 +1,78 @@
# # Deep Kernel Learning with Flux
# ## Package loading
# We use a couple of useful packages to plot and optimize
# the different hyper-parameters
using KernelFunctions
using MLDataUtils
using Zygote
using Flux
using Distributions, LinearAlgebra
using Plots
using ProgressMeter
using AbstractGPs
pyplot();
default(; legendfontsize=15.0, linewidth=3.0);

Flux.@functor SqExponentialKernel
Flux.@functor KernelSum
Flux.@functor Matern32Kernel
Flux.@functor FunctionTransform

neuralnet = Chain(Dense(1, 3), Dense(3, 2))
k = SqExponentialKernel(FunctionTransform(neuralnet))
# ## Data creation
# We create a simple 1D problem with very different variations
xmin = -3;
xmax = 3;
x = range(xmin, xmax; length=100)
x_test = rand(Uniform(xmin, xmax), 200)
x, y = noisy_function(sinc, x; noise=0.1)
X = reshape(x, :, 1)
λ = [0.1]
function f(x, k, λ)
    return kernelmatrix(k, X, x; obsdim=1) *
           inv(kernelmatrix(k, X; obsdim=1) + exp(λ[1]) * I) *
           y
end
f(X, k, 1.0)
loss(k, λ) = ŷ -> sum(y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ)(f(X, k, λ))
loss(k, λ)
xmax = 3; # Limits
N = 150
noise = 0.01
x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset
target_f(x) = sinc(abs(x)^abs(x)) # We use sinc with a highly varying value
target_f(x::AbstractArray) = target_f(first(x))
y_train = target_f.(x_train) + randn(N) * noise
x_test = collect(eachrow(range(xmin, xmax; length=200))) # Testing dataset
spectral_mixture_kernel()
Review comment on `spectral_mixture_kernel()`: Should this be removed?
# ## Model definition
# We create a neural net with 3 dense layers
# The data is passed through the NN before being used in the kernel
neuralnet = Chain(Dense(1, 20), Dense(20, 30), Dense(30, 5))
# We use two cases:
# - The Squared Exponential Kernel
k = transform(SqExponentialKernel(), FunctionTransform(neuralnet))

# We use AbstractGPs.jl to define our model
gpprior = GP(k) # GP Prior
fx = AbstractGPs.FiniteGP(gpprior, x_train, noise) # Prior on f
fp = posterior(fx, y_train) # Posterior of f

# This computes the log evidence of `y`,
# which is going to be used as the objective
loss(y) = -logpdf(fx, y)

@info "Init Loss = $(loss(y_train))"

# Flux will automatically extract all the parameters of the kernel
ps = Flux.params(k)
# push!(ps,λ)
opt = Flux.Momentum(1.0)
##
for i in 1:10
    grads = Zygote.gradient(() -> loss(k, λ), ps)

# We show the initial prediction with the untrained model
p_init = Plots.plot(
    vcat(x_test...), target_f; lab="true f", title="Loss = $(loss(y_train))"
)
Plots.scatter!(vcat(x_train...), y_train; lab="data")
pred = marginals(fp(x_test))
Plots.plot!(vcat(x_test...), mean.(pred); ribbon=std.(pred), lab="Prediction")
# ## Training
anim = Animation()
nmax = 1000
opt = Flux.ADAM(0.1)
@showprogress for i in 1:nmax
    global grads = gradient(ps) do
        loss(y_train)
    end
    Flux.Optimise.update!(opt, ps, grads)
    p = Plots.scatter(x, y; lab="data", title="Loss = $(loss(k,λ))")
    Plots.plot!(x, f(X, k, λ); lab="Prediction", lw=3.0)
    display(p)
    if i % 100 == 0
        @info "$i/$nmax"
        L = loss(y_train)
        # @info "Loss = $L"
        p = Plots.plot(
            vcat(x_test...), target_f; lab="true f", title="Loss = $(loss(y_train))"
        )
        p = Plots.scatter!(vcat(x_train...), y_train; lab="data")
        pred = marginals(posterior(fx, y_train)(x_test))
        Plots.plot!(vcat(x_test...), mean.(pred); ribbon=std.(pred), lab="Prediction")
        frame(anim)
        display(p)
    end
end
gif(anim; fps=5)
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

Review discussion on this line:
- A suggested change was proposed.
- "It's from jupytext. I'd like to see if it's possible to get Literate.jl and jupytext to play along 100% instead of just 98%... fredrikekre/Literate.jl#131"
- "Even if it is supported, I would prefer the standard Literate syntax. Otherwise one either ends up with a mix of different styles or has to pay attention to use the non-standard jupytext formatting (at least non-standard in this environment). To me it seems this would add more confusion but not provide any benefits. If you want to play around and tune the code, I can recommend VSCode or the outdated Juno environment, which support executing single blocks and lines of literate files automatically in the REPL."
- "We can discuss trade-offs, but I don't think this claim is fair; I've explained my motivation on the Literate.jl issue I linked. Having the example notebooks in a format that can directly be edited interactively in Jupyter notebooks makes it easier to contribute notebooks for people who are comfortable with that workflow (like me, but it's a widely popular workflow), without requiring people to use VSCode (which is a lot harder to get into than firing up a Jupyter notebook)."
- "My impression is that in the Julia community the common workflow for writing Literate or Weave documents is to work with Juno or VSCode, to copy and run code in a REPL on the side, or to rebuild the output with Literate and Weave intermittently. I think it's fine to support additional workflows. From the issue I get the impression that there won't be official support for jupytext in the near future. And even if it is added, the second point still seems problematic: it seems jupytext requires a different syntax that deviates from the official syntax for Literate and Weave; if someone edits the document with jupytext, the syntax of the document would have to change, and so we might end up with a mix of the regular and the jupytext syntax. This seems confusing."
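To make the syntax point concrete (this is only an illustration, not part of the PR, and the exact markers depend on which jupytext script format is used), the same header cell might look roughly as follows in plain Literate.jl syntax versus jupytext's percent format:

# Literate.jl style: markdown lines are ordinary `# ` comments, code lines are left bare.
# # Gaussian process prior samples
using KernelFunctions

# Assumed jupytext percent-format style: cells carry explicit `# %%` markers.
# %% [markdown]
# # Gaussian process prior samples
# %%
using KernelFunctions

The `# -*- coding: utf-8 -*-` header above is an example of such jupytext-generated metadata, per the author's comment in the thread.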
# # Gaussian process prior samples
#
# The kernels defined in this package can also be used to specify the
# covariance of a Gaussian process prior.
# A Gaussian process (GP) is defined by its mean function $m(\cdot)$ and its covariance function or kernel $k(\cdot, \cdot')$:

Review comment on the $k(\cdot, \cdot')$ notation: "I have never seen the notation"; a suggested change was proposed.

# ```math
# f \sim \mathcal{GP}\big(m(\cdot), k(\cdot, \cdot')\big)
# ```
# The function values of the GP at a finite number of points $X = \{x_n\}_{n=1}^N$ follow a multivariate normal distribution with mean vector $\mathbf{m}$ and covariance matrix $\mathrm{K}$, where
# ```math
# \begin{aligned}
# \mathbf{m}_i &= m(x_i) \\
# \mathrm{K}_{i,j} &= k(x_i, x_j)
# \end{aligned}
# ```
# with $1 \le i, j \le N$.
#
# In this notebook we show samples from zero-mean GPs with different kernels.
## Load required packages
using KernelFunctions
using LinearAlgebra
using Distributions
using Plots
default(; lw=1.0, legendfontsize=15.0)
using Random: seed!
seed!(42);

# We now define a function that visualizes a kernel
# for us. We use the same randomness to obtain
# comparable samples.

num_inputs = 101
xlim = (-5, 5)
X = reshape(collect(range(xlim...; length=num_inputs)), :, 1)
num_samples = 11
v = randn(num_inputs, num_samples);
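# For reference, the sampling below relies on the standard Cholesky construction
# (a small jitter $\epsilon \mathrm{I}$ is added for numerical stability):
# ```math
# \mathrm{K} + \epsilon \mathrm{I} = \mathrm{L} \mathrm{L}^\top, \qquad
# \mathbf{f} = \mathrm{L} \mathbf{v}, \quad \mathbf{v} \sim \mathcal{N}(\mathbf{0}, \mathrm{I}),
# ```
# so that each column of `f` is (approximately) a draw from the zero-mean GP
# evaluated at the points in `X`.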
function visualize(k::Kernel; xref=0.0)
    K = kernelmatrix(k, X; obsdim=1)
    L = cholesky(K + 1e-6 * I)
    f = L.L * v

    p_kernel_2d = heatmap(
        K;
        yflip=true,
        colorbar=false,
        ylabel=string(k),
        framestyle=:none,
        #color=:blues,
        vlim=(0, 1),
        title=raw"$k(x, x')$",
    )

    p_kernel_cut = plot(
        X,
        k.(X, xref);
        title=string(raw"$k(x, ", xref, raw")$"),
        xlim=xlim,
        xticks=(xlim, xlim),
        label=nothing,
    )

    p_samples = plot(
        X,
        f;
        c="blue",
        title=raw"$f(x)$",
        ylim=(-3, 3),
        xlim=xlim,
        xticks=(xlim, xlim),
        label=nothing,
    )

    return plot(p_kernel_2d, p_kernel_cut, p_samples; layout=(1, 3), xlabel=raw"$x$")
end

# We can now visualize a kernel and show samples from
# a Gaussian process with this kernel:

visualize(SqExponentialKernel())

# This also allows us to compare different kernels:

kernel_classes = [Matern12Kernel, Matern32Kernel, Matern52Kernel, SqExponentialKernel]
plot(
    [visualize(k()) for k in kernel_classes]...,
    #layout=(length(kernel_classes), 1)
)
@@ -0,0 +1,159 @@
# # Kernel Ridge Regression
#
# Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression.

## Loading and setup of required packages
using KernelFunctions
using LinearAlgebra
using Distributions

## Plotting
using Plots;
default(; lw=2.0, legendfontsize=15.0);

using Random: seed!
seed!(42);

# ## From linear regression to ridge regression
# Here we use a one-dimensional toy problem. We generate data using the fourth-order polynomial $f(x) = (x+4)(x+1)(x-1)(x-3)$:

f_truth(x) = (x + 4) * (x + 1) * (x - 1) * (x - 3)

x_train = collect(-5:0.5:5)
x_test = collect(-5:0.1:5)

noise = rand(Uniform(-10, 10), size(x_train))
y_train = f_truth.(x_train) + noise
y_test = f_truth.(x_test)

plot(x_test, y_test; label=raw"$f(x)$")
scatter!(x_train, y_train; label="observations")

# For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# We predict at test inputs $\mathbf{x}_*$ using
# ```math
# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w}
# ```
# This is implemented by `linear_regression`:

function linear_regression(X, y, Xstar)
    weights = (X' * X) \ (X' * y)
    return Xstar * weights
end

# A linear regression fit to the above data set:

y_pred = linear_regression(x_train, y_train, x_test)
scatter(x_train, y_train; label="observations")
plot!(x_test, y_pred; label="linear fit")

# We can improve the fit by including additional features, i.e. generalizing to $\mathrm{X} = (\phi(x_n))_{n=1}^N$, where $\phi(x)$ constructs a feature vector for each input $x$. Here we include powers of the input, $\phi(x) = (1, x, x^2, \dots, x^d)$:

function featurize_poly(x; degree=1)
    xcols = [x .^ d for d in 0:degree]
    return hcat(xcols...)
end
Review comment on lines +55 to +58 (`featurize_poly`): a suggested change was proposed. "However, the function is so simple that it might be easier and more compact to just use …"
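The reviewer's exact suggestion is not shown above; purely as an illustration (and not necessarily what was proposed), a more compact broadcasting-based variant could look like this:

# Hypothetical compact variant of `featurize_poly` (illustration only):
# broadcasting the vector `x` against the exponents 0:degree yields the same
# N×(degree+1) feature matrix as the comprehension-plus-hcat version above.
featurize_poly_compact(x; degree=1) = x .^ (0:degree)'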
function featurized_fit_and_plot(degree)
    X = featurize_poly(x_train; degree=degree)
    Xstar = featurize_poly(x_test; degree=degree)
    y_pred = linear_regression(X, y_train, Xstar)
    scatter(x_train, y_train; legend=false, title="fit of order $degree")
    return plot!(x_test, y_pred)
end

plot([featurized_fit_and_plot(degree) for degree in 1:4]...)

# Note that the fit becomes perfect when we include exactly as many orders in the features as we have in the underlying polynomial (4).
#
# However, when increasing the number of features, we can quickly overfit to noise in the data set:

featurized_fit_and_plot(18)

# To counteract this unwanted behaviour, we can introduce regularization. This leads to *ridge regression* with $L_2$ regularization of the weights ([Tikhonov regularization](https://en.wikipedia.org/wiki/Tikhonov_regularization)).
# Instead of the weights in linear regression,
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# we introduce the ridge parameter $\lambda$:
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# As before, we predict at test inputs $\mathbf{x}_*$ using
# ```math
# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w}
# ```
# This is implemented by `ridge_regression`:

function ridge_regression(X, y, Xstar, lambda)
    weights = (X' * X + lambda * I) \ (X' * y)
    return Xstar * weights
end

function regularized_fit_and_plot(degree, lambda)
    X = featurize_poly(x_train; degree=degree)
    Xstar = featurize_poly(x_test; degree=degree)
    y_pred = ridge_regression(X, y_train, Xstar, lambda)
    scatter(x_train, y_train; legend=false, title="\$\\lambda=$lambda\$")
    return plot!(x_test, y_pred)
end

plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...)

# Instead of constructing the feature matrix explicitly, we can use *kernels* to replace inner products of feature vectors with a kernel evaluation: $\langle \phi(x), \phi(x') \rangle = k(x, x')$ or $\mathrm{X} \mathrm{X}^\top = \mathrm{K}$, where $\mathrm{K}_{ij} = k(x_i, x_j)$.
#
# To apply this "kernel trick" to ridge regression, we can rewrite the ridge estimate for the weights
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# using the [matrix inversion lemma](https://tlienart.github.io/pub/csml/mtheory/matinvlem.html#basic_lemmas)
# as
# ```math
# \mathbf{w} = \mathrm{X}^\top (\mathrm{X} \mathrm{X}^\top + \lambda \mathbb{1})^{-1} \mathbf{y}
# ```
# where we can now replace the inner product with the kernel matrix,
# ```math
# \mathbf{w} = \mathrm{X}^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y}
# ```
# And the prediction yields another inner product,
# ```math
# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} = \langle \mathbf{x}_*, \mathbf{w} \rangle = \mathbf{k}_*^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y}
# ```
# where $(\mathbf{k}_*)_n = k(x_*, x_n)$.
#
# This is implemented by `kernel_ridge_regression`:

function kernel_ridge_regression(k, X, y, Xstar, lambda)
    K = kernelmatrix(k, X)
    kstar = kernelmatrix(k, Xstar, X)
    return kstar * ((K + lambda * I) \ y)
end

# Now, instead of explicitly constructing features, we can simply pass in a `PolynomialKernel` object:

function kernelized_fit_and_plot(kernel, lambda=1e-4)
    y_pred = kernel_ridge_regression(kernel, x_train, y_train, x_test, lambda)
    if kernel isa PolynomialKernel
        title = string("order ", kernel.degree)
    else
        title = string(kernel)
    end
    scatter(x_train, y_train; label=nothing)
    p = plot!(
        x_test,
        y_pred;
        label=nothing,
        title=title,
        #title=string(raw"$\lambda=", lambda, raw"$")
    )
    return p
end

plot([kernelized_fit_and_plot(PolynomialKernel(; degree=degree, c=1)) for degree in 1:4]...)

# However, we can now also use kernels that would have an infinite-dimensional feature expansion, such as the squared exponential kernel:

kernelized_fit_and_plot(SqExponentialKernel())
Review discussion:
- "IMO the example should be moved to AbstractGPs."
- "Happy to move this one, I think it's a bit less about kernels themselves. @theogf, as this was yours, I would like to hear what you think before removing it here?"
- "I replied more generally on the main comments."