% This file provides a small self-contained demonstration of a Gaussian process
% model. The model really is just a Gaussian distribution! A few lines of
% standard linear algebra give the conditional distributions you need to perform
% inference. You will probably need to invest some effort in understanding this
% demonstration, and in working out the correspondence to the maths in the notes.
% No matter what I provide, you'll need to do some hard work. But once you've
% reproduced the ideas for yourself you should find that nothing that
% complicated is going on.
%
% Alternatively, gp_altdemo.m is a simpler demo, in that it requires less linear
% algebra, but I've used a non-standard presentation to achieve that trick: it
% will still require thought!
%
% One messy detail that we do encounter is that large covariance matrices
% constructed from the Gaussian kernel are poorly-conditioned. This script
% applies a standard hack to work around that.
%
% The class notes link to several polished and featureful Gaussian process
% toolboxes. These hide the maths from you behind user-friendly interfaces,
% and come with more extensive demonstrations.
%
% Iain Murray, November 2016


%% The kernel function
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% This demo uses a Gaussian kernel function,
%     k(x1,x2) = sigma_f^2 * exp(-0.5 * sum(((x1-x2)./ell).^2))
% We create kernel functions that take NxD and MxD design matrices, to give NxM
% kernel values. rbf_fn is a fixed unit-bandwidth RBF kernel, and
% gauss_kernel_fn rescales that using free parameters ell (1xD or 1x1) and
% sigma_f (1x1).
rbf_fn = @(X1, X2) exp(bsxfun(@minus, bsxfun(@minus, X1*(2*X2'), sum(X1.*X1,2)), sum(X2.*X2,2)'));
gauss_kernel_fn = @(X1, X2, ell, sigma_f) sigma_f^2 * rbf_fn(bsxfun(@rdivide, X1, sqrt(2)*ell(:)'), bsxfun(@rdivide, X2, sqrt(2)*ell(:)'));
% Pick some particular parameters for this demo:
k_fn = @(X1, X2) gauss_kernel_fn(X1, X2, 3.0, 10.0);
% You could also try:
% k_fn = @(X1, X2) gauss_kernel_fn(X1, X2, 0.5, 1.0);
% k_fn = @(X1, X2) gauss_kernel_fn(X1, X2, 0.5, 1.0) + gauss_kernel_fn(X1, X2, 3.0, 10.0);
% ...

% The ugly bsxfun operations can be eliminated once Matlab >= 2016b is widespread.
% gauss_kernel_fn is equivalent to the following slower function (put it in a separate file):
% function K = gauss_kernel_naive(X1, X2, ell, sigma_f)
% K = zeros(size(X1, 1), size(X2, 1));
% for n = 1:size(X1, 1)
%     for m = 1:size(X2, 1)
%         K(n,m) = sigma_f^2 * exp(-0.5*sum(((X1(n,:)-X2(m,:))./ell(:)').^2));
%     end
% end


%% Sampling from the prior
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Pick the input locations that we want to see the function at.
X_grid = (0:0.02:10)';

% Compute the covariance of the function values for those points. The Gaussian
% kernel gives positive definite matrices in theory. In practice, we need to add
% a tiny amount of noise to the process (a "jitter" or "nugget" term) if we need
% a matrix that is positive definite given finite numerical precision.
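
% A quick optional sanity check -- just a sketch, using the ell = 3.0 and
% sigma_f = 10.0 values picked for k_fn above (change them here too if you swap
% in a different k_fn). It compares one kernel entry against a direct evaluation
% of the formula, and reports the smallest eigenvalue of the raw kernel matrix,
% which in finite precision can come out around zero or slightly negative; that
% is why the jitter is added just below. The check_* variables are only for this
% illustration.
check_K = k_fn(X_grid(1:2,:), X_grid(1:2,:));
check_direct = 10.0^2 * exp(-0.5*sum(((X_grid(1,:) - X_grid(2,:))./3.0).^2));
assert(abs(check_K(1,2) - check_direct) < 1e-6);
check_K_grid = k_fn(X_grid, X_grid);
fprintf('smallest eigenvalue of the raw kernel matrix: %g\n', ...
    min(eig((check_K_grid + check_K_grid')/2))); % symmetrize before eig
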
N_grid = size(X_grid, 1);
K_grid = k_fn(X_grid, X_grid) + 1e-9*eye(N_grid);

% To sample from a Gaussian with covariance K = L*L',
% we just multiply L by a vector of standard normals:
L_grid = chol(K_grid, 'lower');
figure(1); clf; hold all;
for ii = 1:3
    f_grid = L_grid*randn(N_grid, 1);
    plot(X_grid, f_grid, '-');
end
legend({'prior sample 1', 'prior sample 2', 'prior sample 3'});
xlabel('x');
ylabel('f');


%% Sampling from the prior in two stages
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% We don't have to sample "the whole" function, or a dense grid. Pick four of
% the original grid points, sample a setting of just those, and plot them as crosses:
idx = round(N_grid*[0.2, 0.4, 0.6, 0.8]);
X_locs1 = X_grid(idx, :);
N_locs1 = numel(idx);
K_locs1 = k_fn(X_locs1, X_locs1);
L_locs1 = chol(K_locs1, 'lower');
figure(2); clf; hold all;
noise_var = 0.0; % add no noise to the samples, we want to look at function values
%noise_var = 1.0; % add some noise to the samples, we want to simulate data
f_locs1 = L_locs1*randn(N_locs1, 1) + sqrt(noise_var)*randn(N_locs1, 1);
plot(X_locs1, f_locs1, 'x', 'MarkerSize', 20, 'LineWidth', 2);

% We could later choose to sample any of the other function values, conditioned
% on the ones we already picked. We compute all the relevant covariances, and
% apply the standard formulae to get the mean and covariance of the Gaussian
% over the remaining function values, conditioned on the ones we've already
% picked.
X_rest = X_grid;
X_rest(idx, :) = [];
K_rest = k_fn(X_rest, X_rest);
K_rest_locs1 = k_fn(X_rest, X_locs1);
M = K_locs1 + noise_var*eye(N_locs1);
% We could compute chol(M) once and use it to solve both of the linear systems
% in the next two lines:
rest_cond_mu = K_rest_locs1 * (M\f_locs1);
rest_cond_cov = K_rest - K_rest_locs1*(M\K_rest_locs1');

% Then we sample 3 different completions of the f_locs1 function values that we
% sampled above. Each completion by itself is just illustrating a sample of the
% function from the prior. The different completions illustrate different
% samples from the posterior at test locations, given a set of observed function
% values.
N_rest = size(X_rest, 1);
L_rest = chol(rest_cond_cov + 1e-9*eye(N_rest), 'lower');
for ii = 1:3
    rest_sample = L_rest*randn(N_rest, 1) + rest_cond_mu;
    plot(X_rest, rest_sample, '-');
end

% If you change noise_var from 0.0 to 1.0 above, you can simulate first looking
% at noisy values of the function. The conditional distribution is then the
% posterior over some function values given some noisy observations.

% If you'd rather plot the mean prediction and error bars than samples, you can
% do that too. The thick black solid line shows the mean, and the dashed thick
% black lines show +/- 2 standard deviations -- at any particular location, we
% have ~95% belief that the function will lie in this range.
plot(X_rest, rest_cond_mu, '-k', 'LineWidth', 2);
rest_cond_std = sqrt(diag(rest_cond_cov));
plot(X_rest, rest_cond_mu + 2*rest_cond_std, '--k', 'LineWidth', 2);
plot(X_rest, rest_cond_mu - 2*rest_cond_std, '--k', 'LineWidth', 2);
% The mean and error bars can be computed more cheaply if you don't want to know
% the whole posterior. See the notes (and the short sketch at the end of this file).
legend({'Initial Samples', 'completion 1', 'completion 2', 'completion 3', 'mean completion', 'credible band'});
xlabel('x');
ylabel('f');

% Final remarks:
% --------------
% - None of the points have to be on a grid.
% - The initial training points X_locs1 could be anywhere.
% - The remaining test points X_rest could be anywhere.
% - If the points aren't sampled densely and in order, we wouldn't
%   join them with straight lines. We'd just plot individual points.
% - The inputs can be in D dimensions. The kernel function is the
%   only part of the code that looks at the original input features.
%   None of the rest of the code cares where the scalar function
%   values are located in input space.
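
% To illustrate the last remark, here is a minimal sketch with made-up
% 2-dimensional inputs (X_train_2d, X_test_2d and K_2d are hypothetical
% variables used only for this illustration). The same kernel code returns an
% NxM covariance matrix, here with a separate lengthscale per input dimension:
X_train_2d = rand(5, 2);  % made-up 5x2 design matrix
X_test_2d = rand(3, 2);   % made-up 3x2 design matrix
K_2d = gauss_kernel_fn(X_train_2d, X_test_2d, [3.0, 0.5], 10.0);  % 5x3 kernel values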
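
% As mentioned where the error bars were plotted, the posterior mean and
% pointwise variances can be computed without forming the whole
% N_rest x N_rest posterior covariance. Here is a minimal sketch of one
% standard way, reusing M, f_locs1, K_rest_locs1 and K_rest from above
% (L_M, alpha, V, cheap_mu and cheap_var are names used only for this
% illustration; it assumes chol(M) succeeds, which it does here because the
% four training points are well separated -- otherwise add noise or jitter
% as above):
L_M = chol(M, 'lower');
alpha = L_M'\(L_M\f_locs1);      % M\f_locs1 via two triangular solves
cheap_mu = K_rest_locs1 * alpha; % should match rest_cond_mu computed earlier
V = L_M\K_rest_locs1';           % N_locs1 x N_rest block of triangular solves
cheap_var = diag(K_rest) - sum(V.*V, 1)'; % pointwise posterior variances;
                                          % only the diagonal of K_rest is needed
fprintf('largest mean difference: %g, largest variance difference: %g\n', ...
    max(abs(cheap_mu - rest_cond_mu)), max(abs(cheap_var - diag(rest_cond_cov))));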