# coding: utf-8

# The basic workflow for using CGT is as follows.
#
# 1.) Define symbolic variables.

# In[ ]:

import cgt
a = cgt.scalar(name='a') # float-valued scalar, with optional name provided
b = cgt.scalar(name='b')
n = cgt.scalar(name='n', dtype='int64') # integer scalar

# 2.) Construct an expression out of these variables. Operators like `+,<,**` are overloaded.

# In[ ]:

c = (a**n + b**n)**(1.0/n)

# 3.) Construct a ``function``, which maps numerical inputs to numerical outputs.

# In[ ]:

f = cgt.function([a,b,n], c)
print f(8,15,2)

# In step 2, when you constructed the expression ``c``, CGT was building up a data structure that stores the set of operations needed to compute ``c`` in terms of the input arguments. To be precise, CGT builds up a directed acyclic graph describing the dependencies.

# In[ ]:

cgt.as_dot(c)

# Note that the labels 0 and 1 indicate whether the edge corresponds to the zeroth or first argument of the operation.
#
# With this simple example out of the way, let's move on to vectors, matrices and gradients.
# We'll first consider linear regression, where we have a linear model, $y_{pred} = Xw + b$, and a quadratic loss: $L(X,y,w,b) = \|y - y_{pred}\|^2$.

# In[ ]:

X_nk = cgt.matrix("X")
y_n = cgt.vector("y")
w_k = cgt.vector("w")
b = cgt.scalar("b")

ypred_n = X_nk.dot(w_k) + b
L = cgt.sum(cgt.square(ypred_n - y_n))

print "L = ", cgt.print_expr(L);

# In the first four lines, we created symbolic variables ``X_nk, y_n, w_k, b``.
# Then, we constructed more complex expressions by applying functions to these symbolic variables, finally giving us the loss function ``L``.
#
# You can find the available functions by inspecting the `cgt` namespace (type `cgt.`), and you'll find lots of familiar NumPy functions. Note that these symbolic variables have some of the same attributes as NumPy arrays.

# In[ ]:

print X_nk.ndim, str(X_nk.shape), X_nk.dtype

# Next we can compute the gradient of the loss function with respect to the parameters $w,b$. (Note, however, that there's nothing special about parameters: we could just as well compute the gradient with respect to $X$ and $y$.)

# In[ ]:

grads = dLdw, dLdb = cgt.grad(L, [w_k, b])

# Here, ``dLdw`` and ``dLdb`` are just expressions. When we called ``cgt.grad``, CGT walked through the graph that represents the expression ``L`` and constructed expressions for the gradients. This procedure is a variant of reverse-mode automatic differentiation, also known as the backpropagation algorithm.

# In[ ]:

print "Loss and gradient objects", dLdw, dLdb
print "Pretty-printed gradient: ", cgt.print_expr(cgt.simplify([dLdw])[0]);
cgt.as_dot(cgt.simplify([dLdw]))

# Finally, we can compile functions to compute the loss and gradients.

# In[ ]:

f_loss = cgt.function([X_nk, y_n, w_k, b], L)
f_grads = cgt.function([X_nk, y_n, w_k, b], grads)

# Let's generate some random numerical data and parameter values so we can test out these functions.

# In[ ]:

import numpy as np, numpy.random as nr
nr.seed(0)
# Generate some random data
Xval = nr.randn(100,3)
yval = nr.randn(100)
# Initialize parameters
wval = nr.randn(3)
bval = nr.randn()

np.set_printoptions(precision=3)

print "loss:", f_loss(Xval, yval, wval, bval)
print "grad:", f_grads(Xval, yval, wval, bval)
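# As a quick sanity check on the symbolic gradient, we can compare one of its components against a centered finite-difference approximation of the loss. (This cell is an illustrative sketch, not part of the CGT API; the step size ``eps`` and the temporary variables are arbitrary names chosen here.)

# In[ ]:

eps = 1e-4 # finite-difference step size (illustrative choice)
gw, gb = f_grads(Xval, yval, wval, bval)
# Perturb the first component of w in both directions and difference the losses
wval_plus = wval.copy();  wval_plus[0] += eps
wval_minus = wval.copy(); wval_minus[0] -= eps
fd_dw0 = (f_loss(Xval, yval, wval_plus, bval) - f_loss(Xval, yval, wval_minus, bval)) / (2*eps)
print "symbolic dL/dw[0]:", gw[0], " finite difference:", fd_dw0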
# Now that we can compute the gradient, we're ready to do some optimization, where we minimize the loss with respect to $w,b$. There are many different styles for doing numerical optimization with CGT.
#
# We'll start with a style that is convenient but not the most efficient for large-scale machine learning.
# Namely, we'll flatten and concatenate the parameters into one vector $\theta$, and we'll return a flat gradient vector.
# Then we'll hand off our functions to SciPy's BFGS optimizer.

# In[ ]:

def f(theta):
    return f_loss(Xval, yval, theta[0:3], theta[3])

def gradf(theta):
    gw, gb = f_grads(Xval, yval, theta[0:3], theta[3])
    return np.concatenate([gw, gb.reshape(1)])

import scipy.optimize as opt
theta_opt = opt.fmin_bfgs(f, np.zeros(4), gradf, disp=False)
print "Optimal theta:", theta_opt

# Now let's just make sure we got the right answer:

# In[ ]:

print "Least squares solution:", np.linalg.lstsq(np.concatenate([Xval, np.ones((len(Xval),1))], axis=1), yval)[0];

# You can also do the flattening and unflattening using CGT, which will usually be faster:

# In[ ]:

theta = cgt.vector('theta')
w_k_1 = theta[0:3]
b_1 = theta[3]
ypred_n_1 = X_nk.dot(w_k_1) + b_1
L_1 = cgt.sum(cgt.square(ypred_n_1 - y_n))
dLdtheta, = cgt.grad(L_1, [theta])

f = cgt.function([theta], L_1, givens=[(X_nk,Xval), (y_n,yval)])
gradf = cgt.function([theta], dLdtheta, givens=[(X_nk,Xval), (y_n,yval)])
theta_opt = opt.fmin_bfgs(f, np.zeros(4), gradf, disp=False)
print "Optimal theta:", theta_opt

# Note that we provided one additional argument to ``cgt.function`` above: ``givens``. The value provided should be a list of pairs (input variable, replacement), where the replacement can be a numerical value or some other symbolic variable.
#
# Now we'll introduce another style of optimization, in which we perform in-place updates of our parameters; this is useful for large-scale problems. It is especially useful when using the GPU, since it avoids transferring data to and from the device.

# In[ ]:

stepsize = 0.001

w_k_2 = cgt.shared(wval.copy(), name='w')
b_2 = cgt.shared(bval, name='b')
ypred_n_2 = X_nk.dot(w_k_2) + b_2
L_2 = cgt.sum(cgt.square(ypred_n_2 - y_n))
dLdw_2, dLdb_2 = cgt.grad(L_2, [w_k_2, b_2])

updates = [(w_k_2, w_k_2 - stepsize*dLdw_2), (b_2, b_2 - stepsize*dLdb_2)]
givens = [(X_nk,Xval), (y_n,yval)]
f_update = cgt.function([], L_2, givens=givens, updates=updates)
for i in xrange(100):
    f_update()
print w_k_2.op.get_value(), b_2.op.get_value()
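# With this step size and number of iterations, the in-place gradient descent above should end up close to the solution that BFGS found earlier. Here is a quick check, using only values computed above (``theta_gd`` is just an illustrative name introduced here):

# In[ ]:

# Read the current values out of the shared variables and compare to the BFGS result
theta_gd = np.concatenate([w_k_2.op.get_value(), np.array(b_2.op.get_value()).reshape(1)])
print "Gradient descent solution:", theta_gd
print "BFGS solution:            ", theta_opt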