# coding: utf-8
# The basic workflow for using CGT is as follows.
#
# 1.) Define symbolic variables
# In[ ]:
import cgt
a = cgt.scalar(name='a') # float-valued scalar, with optional name provided
b = cgt.scalar(name='b')
n = cgt.scalar(name='n', dtype='int64') # integer scalar
# 2.) Construct an expression out of these variables. Operators like `+`, `<`, and `**` are overloaded.
# In[ ]:
c = (a**n + b**n)**(1.0/n)
# 3.) Construct a ``function``, which maps numerical inputs to numerical outputs.
# In[ ]:
f = cgt.function([a,b,n], c)
print f(8,15,2)  # sqrt(8**2 + 15**2) = 17.0
# In step 2, when you constructed the expression ``c``, CGT was building up a data structure that stores the set of operations needed to compute ``c`` in terms of the input arguments. To be precise, CGT builds up a directed acyclic graph describing the dependencies.
# In[ ]:
cgt.as_dot(c)
# Note that the labels 0 and 1 indicate whether the edge corresponds to the zeroth or first argument of the function: in `a**n`, for example, `a` is argument 0 and `n` is argument 1.
#
# With this simple example out of the way, let's move on to vectors, matrices and gradients.
# We'll first consider linear regression, where we have a linear model, $y_{pred} = Xw + b$, and a quadratic loss: $L(X,y,w,b) = \|y - y_{pred}\|^2$.
# In[ ]:
X_nk = cgt.matrix("X")
y_n = cgt.vector("y")
w_k = cgt.vector("w")
b = cgt.scalar("b")
ypred_n = X_nk.dot(w_k) + b
L = cgt.sum(cgt.square(ypred_n - y_n))
print "L = ",
cgt.print_expr(L)
# In the first four lines, we created the symbolic variables ``X_nk, y_n, w_k, b``; the suffixes indicate shapes, so ``X_nk`` is an $n \times k$ matrix while ``y_n`` and ``w_k`` are vectors of lengths $n$ and $k$.
# Then, we constructed more complex expressions by applying functions to these symbolic variables, finally giving us the loss function ``L``.
#
# You can find the available functions by inspecting the `cgt` namespace (type `cgt.` and hit tab), where you'll find lots of familiar NumPy functions. Note that these symbolic variables have some of the same attributes as NumPy arrays.
# In[ ]:
print X_nk.ndim, str(X_nk.shape), X_nk.dtype
# Next we can compute the gradient of the loss function with respect to the parameters $w, b$. (Note, however, that there's nothing special about parameters: we could just as well compute the gradient with respect to $X$ and $y$.)
# In[ ]:
grads = dLdw, dLdb = cgt.grad(L, [w_k, b])
# Here, ``dLdw`` and ``dLdb`` are just expressions. When we called ``cgt.grad``, CGT walked through the graph representing the expression ``L`` and constructed expressions for the gradients. This procedure is a variant of reverse-mode automatic differentiation, also known as the backpropagation algorithm.
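#
# For this particular loss we can check the result by hand: since $L = \|Xw + b - y\|^2$, the chain rule gives $\frac{\partial L}{\partial w} = 2 X^T (Xw + b - y)$ and $\frac{\partial L}{\partial b} = 2 \sum_i (Xw + b - y)_i$, which is what the expressions built by ``cgt.grad`` should simplify to.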
# In[ ]:
print "Loss and gradient objects", dLdw, dLdb
print "Pretty-printed gradient: ",
cgt.print_expr(cgt.simplify([dLdw])[0]);
cgt.as_dot(cgt.simplify([dLdw]))
# Finally, we can compile functions to compute the loss and gradients.
# In[ ]:
f_loss = cgt.function([X_nk, y_n, w_k, b], L)
f_grads = cgt.function([X_nk, y_n, w_k, b], grads)
# Let's generate some random numerical data and parameter values so we can test out these functions.
# In[ ]:
import numpy as np, numpy.random as nr
nr.seed(0)
# Generate some random data
Xval = nr.randn(100,3)
yval = nr.randn(100)
# Initialize parameters
wval = nr.randn(3)
bval = nr.randn()
np.set_printoptions(precision=3)
print "loss:", f_loss(Xval, yval, wval, bval)
print "grad:", f_grads(Xval, yval, wval, bval)
# Now that we can compute the gradient, we're ready to do some optimization, where we minimize the loss with respect to $w, b$. There are several different styles for doing numerical optimization with CGT.
#
# We'll start with a style that is convenient but not the most efficient for large-scale machine learning. Namely, we'll flatten and concatenate the parameters into one vector $\theta$, and we'll return a flat gradient vector.
# Then we'll hand off our functions to scipy's BFGS optimizer.
# In[ ]:
def f(theta):
    return f_loss(Xval, yval, theta[0:3], theta[3])

def gradf(theta):
    gw, gb = f_grads(Xval, yval, theta[0:3], theta[3])
    return np.concatenate([gw, gb.reshape(1)])
import scipy.optimize as opt
theta_opt = opt.fmin_bfgs(f, np.zeros(4), gradf, disp=False)
print "Optimal theta:", theta_opt
# Now let's make sure we got the right answer by comparing against the closed-form least-squares solution; appending a column of ones to $X$ lets the last coefficient play the role of the bias $b$:
# In[ ]:
print "Least squares solution:",np.linalg.lstsq(np.concatenate([Xval, np.ones((len(Xval),1))],axis=1), yval)[0];
# You can also do the flattening and unflattening using CGT, which will usually be faster:
# In[ ]:
theta = cgt.vector('theta')
w_k_1 = theta[0:3]
b_1 = theta[3]
ypred_n_1 = X_nk.dot(w_k_1) + b_1
L_1 = cgt.sum(cgt.square(ypred_n_1 - y_n))
dLdtheta, = cgt.grad(L_1, [theta])
f = cgt.function([theta], L_1, givens = [(X_nk,Xval), (y_n,yval)])
gradf = cgt.function([theta], dLdtheta, givens = [(X_nk,Xval), (y_n,yval)])
theta_opt = opt.fmin_bfgs(f, np.zeros(4), gradf, disp=False)
print "Optimal theta:", theta_opt
# Note that we provided one additional argument to ``cgt.function`` above: ``givens``. Its value should be a list of pairs ``(input variable, replacement)``, where the replacement can be a numerical value or another symbolic variable, as sketched below.
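# For instance (a hypothetical variant reusing the variables defined above), the replacement can itself be symbolic; here ``X_new`` becomes the function's input in place of ``X_nk``:
# In[ ]:
X_new = cgt.matrix("X_new")
f_sym = cgt.function([theta, X_new], L_1, givens=[(X_nk, X_new), (y_n, yval)])
print f_sym(theta_opt, Xval)  # should agree with f(theta_opt) above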
# Now we'll introduce another style of optimization, in which we perform in-place updates of our parameters; this is useful for large-scale problems, and especially when using the GPU, since it avoids transferring data to and from the device. The parameters are stored in shared variables (``cgt.shared``), which hold state between calls and are modified via the ``updates`` argument of ``cgt.function``.
# In[ ]:
stepsize = 0.001
w_k_2 = cgt.shared(wval.copy(), name='w')
b_2 = cgt.shared(bval, name='b')
ypred_n_2 = X_nk.dot(w_k_2) + b_2
L_2 = cgt.sum(cgt.square(ypred_n_2 - y_n))
dLdw_2,dLdb_2 = cgt.grad(L_2, [w_k_2, b_2])
updates = [(w_k_2, w_k_2 - stepsize*dLdw_2), (b_2, b_2 - stepsize*dLdb_2)]
givens = [(X_nk,Xval), (y_n,yval)]
f_update = cgt.function([], L_2, givens = givens, updates = updates)
for i in xrange(100):
    f_update()
print w_k_2.op.get_value(), b_2.op.get_value()
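# As a final check (a sketch, not part of the original), the parameters reached by these fixed-step gradient updates should be approaching the BFGS optimum found earlier:
# In[ ]:
print "gradient descent:", np.append(w_k_2.op.get_value(), b_2.op.get_value())
print "BFGS optimum:    ", theta_opt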