How to use the Edward library for probabilistic modelling with TensorFlow and GPy to study asymptotic connections between Multi-Layer Perceptrons (neural nets) and Gaussian processes?
Numerical verification of the asymptotic connections between Multi-Layer Perceptrons (neural nets) and Gaussian processes.
Authors: Dr. Eren Metin Elçi and Ravinder Kumar
Is there a connection between Multi-Layer Perceptrons (neural nets) and Gaussian processes? At least theoretically, supporting arguments were given by Christopher K. I. Williams in https://papers.nips.cc/paper/1197-computing-with-infinite-networks.pdf
In this tutorial we use Edward (a library for probabilistic modelling), http://edwardlib.org, which is compatible with TensorFlow. We further use the Gaussian process implementation provided by GPy.
As a by-product, we show how to create a custom activation unit for neural networks and how to construct custom Gaussian process kernels.
Have fun, and if you would like the Jupyter notebook for this project, contact us at pyboot@gmail.com!
%matplotlib inline
import edward as ed
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from edward.models import Normal
from tensorflow.python.framework import ops
plt.style.use('ggplot')
from scipy.stats import norm
from scipy.special import erf
import GPy # Gaussian processes (possible overkill to use GPy for our purpose, but nice to know how to use)
from scipy.stats import ttest_ind_from_stats
Construct the "Sigmoidal" activation function as refered to in the above paper¶
Regarding the construction, see https://stackoverflow.com/questions/39921607/how-to-make-a-custom-activation-function-with-only-python-in-tensorflow
Note that, writing $\mathrm{CDF}(x;\mu;\sigma)$ for the cumulative distribution function of a normal with mean $\mu$ and standard deviation $\sigma$, the sigmoidal used here, $\Phi(z) = \operatorname{erf}(z)$, satisfies $$ \Phi(z) = 2\cdot \mathrm{CDF}(z;0;1/\sqrt{2}) - 1, $$ hence $$ \partial_z \Phi(z) = 2\,p(z;0;1/\sqrt{2}) = \frac{2}{\sqrt{\pi}}\,e^{-z^2}, $$ where $p(z;\mu;\sigma)$ is the corresponding pdf of a normal with mean $\mu$ and standard deviation $\sigma$.
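As a quick numerical sanity check of this identity and its derivative (a small addition, not part of the original notebook):
z = np.linspace(-3, 3, 13)
# check erf(z) = 2*CDF(z; 0, 1/sqrt(2)) - 1
print(np.allclose(erf(z), 2*norm.cdf(z, loc=0, scale=1/np.sqrt(2)) - 1))
# check d/dz erf(z) = 2*p(z; 0, 1/sqrt(2)) = 2/sqrt(pi)*exp(-z**2)
print(np.allclose(2*norm.pdf(z, loc=0, scale=1/np.sqrt(2)), 2/np.sqrt(np.pi)*np.exp(-z**2)))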
# construct an activation function that equals the Sigmoidal mentioned in the Williams paper
np_d_myfun_32 = lambda x: 2*np.sqrt(2)*norm.pdf(x*np.sqrt(2)).astype(np.float32)
def tf_d_f(x, name=None):
    # tf.op_scope(values, name, default_name) is deprecated, so we use
    # tf.name_scope(name, default_name, values) instead
    with tf.name_scope(name, "d_myfun", [x]) as name:
        y = tf.py_func(np_d_myfun_32,
                       [x],
                       [tf.float32],
                       name=name,
                       stateful=False)
        return y[0]
def py_func(func, inp, Tout, stateful=True, name=None, grad=None):
    # Wrap tf.py_func so that a custom gradient function can be registered for it.
    # Need to generate a unique name to avoid duplicates:
    rnd_name = 'PyFuncGrad' + str(np.random.randint(0, 1E+8))
    tf.RegisterGradient(rnd_name)(grad)  # see _MySquareGrad for grad example
    g = tf.get_default_graph()
    with g.gradient_override_map({"PyFunc": rnd_name}):
        return tf.py_func(func, inp, Tout, stateful=stateful, name=name)

def myfungrad(op, grad):
    # gradient of our activation: incoming gradient times d/dx myfun(x) (chain rule)
    x = op.inputs[0]
    n_gr = tf_d_f(x)
    return grad * n_gr
np_myfun_32 = lambda x: np.asarray(erf(x)).astype(np.float32)
def tf_myfun(x, name=None):
    with tf.name_scope(name, "myfun", [x]) as name:
        y = py_func(np_myfun_32,
                    [x],
                    [tf.float32],
                    name=name,
                    grad=myfungrad)  # <-- here's the call to the gradient
        return y[0]
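As a quick check that the gradient override works (our addition, not part of the original notebook), we can compare the gradient TensorFlow computes for tf_myfun against the analytic derivative $\frac{2}{\sqrt{\pi}}e^{-x^2}$:
# Gradient check (our addition): custom gradient vs. analytic derivative 2/sqrt(pi)*exp(-x^2)
x_check = tf.constant(np.linspace(-3, 3, 7, dtype=np.float32))
y_check = tf_myfun(x_check)
grad_check = tf.gradients(y_check, x_check)[0]
with tf.Session() as check_sess:
    g_num = check_sess.run(grad_check)
g_ana = 2/np.sqrt(np.pi)*np.exp(-np.linspace(-3, 3, 7)**2)
print("max abs deviation:", np.abs(g_num - g_ana).max())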
def neural_network(x, W_0, W_1, b_0, b_1):
    h = tf_myfun(tf.matmul(x, W_0) + b_0)  # from input to hidden layer
    h = tf.matmul(h, W_1) + b_1            # from hidden to output layer
    return tf.reshape(h, [-1])
ed.set_seed(42)
NDRAWS=100
NPOINTS=1000
D = 1 # number of features
H = 10000 # number of hidden units
omega=1
#sigma = .01
sigma_b_0 = 2
sigma_b_1 = .001 # make Eq. (9) in the paper simpler
sigma_W_0 = 10
sigma_W_1 = omega*1./np.sqrt(H) # required for convergence to the corresponding GP, see paper
qW_0 = Normal(loc=tf.Variable(tf.zeros([D, H])),
              scale=tf.Variable(sigma_W_0*tf.ones([D, H])))
qW_1 = Normal(loc=tf.Variable(tf.zeros([H, 1])),
              scale=tf.Variable(sigma_W_1*tf.ones([H, 1])))
qb_0 = Normal(loc=tf.Variable(tf.zeros([H])),
              scale=tf.Variable(sigma_b_0*tf.ones([H])))
qb_1 = Normal(loc=tf.Variable(tf.zeros([1])),
              scale=tf.Variable(sigma_b_1*tf.ones([1])))
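Why the $1/\sqrt{H}$ scaling of $\sigma_{W_1}$? A short heuristic (our paraphrase of the argument in the paper): for a fixed input $x$ the network output is $f(x) = b_1 + \sum_{j=1}^{H} W_{1,j}\, h_j(x)$ with independent zero-mean weights and biases, so $$ \mathrm{Var}[f(x)] = \sigma_{b_1}^2 + H\,\sigma_{W_1}^2\,\mathbb{E}\big[h_j(x)^2\big] = \sigma_{b_1}^2 + \omega^2\,\mathbb{E}\big[h_j(x)^2\big], $$ which remains finite as $H \to \infty$; by the central limit theorem the outputs at any finite set of inputs then become jointly Gaussian, i.e. the prior over functions converges to a Gaussian process.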
# Sample functions from the prior over weights to visualize draws.
rs = np.random.RandomState(0)
inputs = np.linspace(-5, 5, num=NPOINTS, dtype=np.float32)
x = tf.expand_dims(inputs, 1)
mus = tf.stack(
    [neural_network(x, qW_0.sample(), qW_1.sample(),
                    qb_0.sample(), qb_1.sample())
     for _ in range(NDRAWS)])
sess = ed.get_session()
tf.global_variables_initializer().run()
outputs = mus.eval()
fig, ax = plt.subplots()
for i in range(10):
    ax.plot(inputs, outputs[i, :])
ax.set_ylim(-2, 2)
ax.set_xlim(-5, 5)
plt.show()
Now build the corresponding GP kernel and draw some samples from it
For an explanation of how to build a custom kernel in GPy, see http://gpy.readthedocs.io/en/deploy/tuto_creating_new_kernels.html
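For reference, the kernel we implement is the arcsine ("erf") kernel derived in the Williams paper for the sigmoidal transfer function (our transcription; here $\tilde{x} = (1, x)^\top$ and $\Sigma = \mathrm{diag}(\sigma_b^2, \sigma_w^2)$): $$ K(x, x') = \frac{2}{\pi} \arcsin\!\left( \frac{2\,\tilde{x}^\top \Sigma\, \tilde{x}'}{\sqrt{\big(1 + 2\,\tilde{x}^\top \Sigma\, \tilde{x}\big)\big(1 + 2\,\tilde{x}'^\top \Sigma\, \tilde{x}'\big)}} \right). $$ For one-dimensional inputs this is exactly the expression coded in MyKernel below.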
from GPy.kern import Kern
from GPy import Param
class MyKernel(Kern):
    def __init__(self, input_dim, sigma_bias=1., sigma_weights=1.):
        super(MyKernel, self).__init__(input_dim, None, 'my_kernel')
        assert input_dim == 1, "For this kernel we assume input_dim=1"
        self.sigma_bias = Param('sigma_bias', sigma_bias)
        self.sigma_weights = Param('sigma_weights', sigma_weights)
        self.link_parameters(self.sigma_bias, self.sigma_weights)

        def k(x, x2):
            tmp_num = 2*(self.sigma_bias**2 + self.sigma_weights**2*x*x2)
            tmp_denum1 = 2*(self.sigma_bias**2 + self.sigma_weights**2*x**2)
            tmp_denum2 = 2*(self.sigma_bias**2 + self.sigma_weights**2*x2**2)
            return 2*np.arcsin(tmp_num/np.sqrt((1+tmp_denum1)*(1+tmp_denum2)))/np.pi
        self.mat_k = np.vectorize(k)

    def K(self, X, X2=None):  # X2 defaults to None so K can be called with a single argument
        X = X.flatten()
        if X2 is None:
            X2 = X
        else:
            X2 = X2.flatten()
        return self.mat_k(X[:, np.newaxis], X2)

    def Kdiag(self, X):
        X = X.flatten()
        return self.mat_k(X, X)
# test
myk = MyKernel(1,sigma_b_0,sigma_W_0)
myk.K(np.array([1,2,3])[:,np.newaxis]),myk.Kdiag(np.array([1,2,3])[:,np.newaxis])
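As an additional sanity check (our addition, not part of the original notebook), the kernel matrix should be symmetric and its diagonal should agree with Kdiag:
# Additional check (our addition): symmetry of K and consistency with Kdiag
X_test = np.linspace(-5, 5, 20)[:, np.newaxis]
K_test = myk.K(X_test)
print("symmetric:", np.allclose(K_test, K_test.T))
print("diagonal matches Kdiag:", np.allclose(np.diag(K_test), myk.Kdiag(X_test)))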
KERNEL=myk
x=np.linspace(-5,5,NPOINTS)
cov = KERNEL.K(x[:,np.newaxis])
rvs = np.random.multivariate_normal(np.zeros_like(x), cov, size=NDRAWS)
fig, ax = plt.subplots()
for i in range(10):
    ax.plot(x, rvs[i, :])
ax.set_ylim(-2, 2)
ax.set_xlim(-5, 5)
plt.show()
Calculate the covariance matrices of the MLP & GP prior draws
X_mlp = np.asarray(outputs)
mn_X_mlp = X_mlp.mean(axis=0)
cov_X_mlp = np.cov(X_mlp.T)
mn_X_gp = rvs.mean(axis=0)
cov_X_gp = np.cov(rvs.T)
fig, axs = plt.subplots(ncols=3,figsize=(15,5))
axs[0].matshow(cov)
axs[0].set_title("Exact Kernel")
axs[1].matshow(cov_X_mlp)
axs[1].set_title("Estimated Bayesian MLP Kernel")
axs[2].matshow(cov_X_gp)
axs[2].set_title("Estimated GP Kernel")
plt.show()
cov[1,1], cov_X_mlp[1,1], cov_X_gp[1,1]
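A compact way to compare the three matrices quantitatively (a small addition, not in the original notebook) is the relative Frobenius-norm distance from the exact kernel:
# Summary comparison (our addition): relative Frobenius-norm distance from the exact kernel
rel_err_mlp = np.linalg.norm(cov_X_mlp - cov) / np.linalg.norm(cov)
rel_err_gp = np.linalg.norm(cov_X_gp - cov) / np.linalg.norm(cov)
print("relative error, Bayesian MLP: %.3f; GP draws: %.3f" % (rel_err_mlp, rel_err_gp))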
Bootstrap the covariance matrix calculation and use the resulting standard deviation (error) to test the hypothesis that a randomly chosen entry of the GP and Bayesian MLP covariance matrices stems from the same distribution
def bootstrap_covariance(X, nboot=100):
    nsamples, nfeatures = X.shape
    cov_ = np.empty((nboot, nfeatures, nfeatures))
    for i in range(nboot):
        # resample the rows (prior draws) with replacement
        sub_features = np.random.randint(0, nsamples, size=nsamples)
        X_ = X[sub_features, :]
        cov_[i, ...] = np.cov(X_.T)
    cov_m = np.mean(cov_, axis=0)
    cov_error = np.std(cov_, axis=0)/np.sqrt(nboot)*np.sqrt(nsamples/(nsamples-1.))
    return cov_m, cov_error
cov1, cov_error1 = bootstrap_covariance(X_mlp)
i, j = np.random.randint(0, NPOINTS, size=2)  # probe a random position in the covariance matrices of GP and Bayesian MLP
m1,s1 = cov1[i,j], cov_error1[i,j]
cov2, cov_error2 = bootstrap_covariance(rvs)
m2,s2 = cov2[i,j], cov_error2[i,j]
#This is a two-sided test for the null hypothesis that 2 independent samples have identical average (expected) values.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html
# https://en.wikipedia.org/wiki/Welch%27s_t-test
t2, p2 = ttest_ind_from_stats(m1, s1, 100,
                              m2, s2, 100,
                              equal_var=False)
print("(%d,%d): ttest_ind_from_stats: t = %g p = %g" % (i,j,t2, p2))