From 8ad1a10ffd637f663c38797f0acc4c6c3f6fbb37 Mon Sep 17 00:00:00 2001 From: Philip Tasabia Date: Sun, 16 Mar 2025 20:25:44 -0400 Subject: [PATCH 1/3] init network3 --- src_pytensor/network3.py | 314 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 src_pytensor/network3.py diff --git a/src_pytensor/network3.py b/src_pytensor/network3.py new file mode 100644 index 000000000..2b27774b2 --- /dev/null +++ b/src_pytensor/network3.py @@ -0,0 +1,314 @@ +"""network3.py +~~~~~~~~~~~~~~ + +A Theano-based program for training and running simple neural +networks. + +Supports several layer types (fully connected, convolutional, max +pooling, softmax), and activation functions (sigmoid, tanh, and +rectified linear units, with more easily added). + +When run on a CPU, this program is much faster than network.py and +network2.py. However, unlike network.py and network2.py it can also +be run on a GPU, which makes it faster still. + +Because the code is based on Theano, the code is different in many +ways from network.py and network2.py. However, where possible I have +tried to maintain consistency with the earlier programs. In +particular, the API is similar to network2.py. Note that I have +focused on making the code simple, easily readable, and easily +modifiable. It is not optimized, and omits many desirable features. + +This program incorporates ideas from the Theano documentation on +convolutional neural nets (notably, +http://deeplearning.net/tutorial/lenet.html ), from Misha Denil's +implementation of dropout (https://github.com/mdenil/dropout ), and +from Chris Olah (http://colah.github.io ). + +Written for Theano 0.6 and 0.7, needs some changes for more recent +versions of Theano. + +""" + +#### Libraries +# Standard library +import cPickle +import gzip + +# Third-party libraries +import numpy as np +import theano +import theano.tensor as T +from theano.tensor.nnet import conv +from theano.tensor.nnet import softmax +from theano.tensor import shared_randomstreams +from theano.tensor.signal import downsample + +# Activation functions for neurons +def linear(z): return z +def ReLU(z): return T.maximum(0.0, z) +from theano.tensor.nnet import sigmoid +from theano.tensor import tanh + + +#### Constants +GPU = True +if GPU: + print "Trying to run under a GPU. If this is not desired, then modify "+\ + "network3.py\nto set the GPU flag to False." + try: theano.config.device = 'gpu' + except: pass # it's already set + theano.config.floatX = 'float32' +else: + print "Running with a CPU. If this is not desired, then the modify "+\ + "network3.py to set\nthe GPU flag to True." + +#### Load the MNIST data +def load_data_shared(filename="../data/mnist.pkl.gz"): + f = gzip.open(filename, 'rb') + training_data, validation_data, test_data = cPickle.load(f) + f.close() + def shared(data): + """Place the data into shared variables. This allows Theano to copy + the data to the GPU, if one is available. 
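+        The labels are stored as floatX (Theano can only keep float data
+        on the GPU) and then cast to int32 on return, since the rest of
+        the code expects integer labels (self.y is an ivector).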
+ + """ + shared_x = theano.shared( + np.asarray(data[0], dtype=theano.config.floatX), borrow=True) + shared_y = theano.shared( + np.asarray(data[1], dtype=theano.config.floatX), borrow=True) + return shared_x, T.cast(shared_y, "int32") + return [shared(training_data), shared(validation_data), shared(test_data)] + +#### Main class used to construct and train networks +class Network(object): + + def __init__(self, layers, mini_batch_size): + """Takes a list of `layers`, describing the network architecture, and + a value for the `mini_batch_size` to be used during training + by stochastic gradient descent. + + """ + self.layers = layers + self.mini_batch_size = mini_batch_size + self.params = [param for layer in self.layers for param in layer.params] + self.x = T.matrix("x") + self.y = T.ivector("y") + init_layer = self.layers[0] + init_layer.set_inpt(self.x, self.x, self.mini_batch_size) + for j in xrange(1, len(self.layers)): + prev_layer, layer = self.layers[j-1], self.layers[j] + layer.set_inpt( + prev_layer.output, prev_layer.output_dropout, self.mini_batch_size) + self.output = self.layers[-1].output + self.output_dropout = self.layers[-1].output_dropout + + def SGD(self, training_data, epochs, mini_batch_size, eta, + validation_data, test_data, lmbda=0.0): + """Train the network using mini-batch stochastic gradient descent.""" + training_x, training_y = training_data + validation_x, validation_y = validation_data + test_x, test_y = test_data + + # compute number of minibatches for training, validation and testing + num_training_batches = size(training_data)/mini_batch_size + num_validation_batches = size(validation_data)/mini_batch_size + num_test_batches = size(test_data)/mini_batch_size + + # define the (regularized) cost function, symbolic gradients, and updates + l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers]) + cost = self.layers[-1].cost(self)+\ + 0.5*lmbda*l2_norm_squared/num_training_batches + grads = T.grad(cost, self.params) + updates = [(param, param-eta*grad) + for param, grad in zip(self.params, grads)] + + # define functions to train a mini-batch, and to compute the + # accuracy in validation and test mini-batches. 
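+        # Each compiled function below takes a mini-batch index i and uses
+        # `givens` to substitute the i-th slice of the shared data for
+        # self.x and self.y, so the whole dataset can stay on the GPU
+        # between calls.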
+ i = T.lscalar() # mini-batch index + train_mb = theano.function( + [i], cost, updates=updates, + givens={ + self.x: + training_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], + self.y: + training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + validate_mb_accuracy = theano.function( + [i], self.layers[-1].accuracy(self.y), + givens={ + self.x: + validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], + self.y: + validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + test_mb_accuracy = theano.function( + [i], self.layers[-1].accuracy(self.y), + givens={ + self.x: + test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], + self.y: + test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + self.test_mb_predictions = theano.function( + [i], self.layers[-1].y_out, + givens={ + self.x: + test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + # Do the actual training + best_validation_accuracy = 0.0 + for epoch in xrange(epochs): + for minibatch_index in xrange(num_training_batches): + iteration = num_training_batches*epoch+minibatch_index + if iteration % 1000 == 0: + print("Training mini-batch number {0}".format(iteration)) + cost_ij = train_mb(minibatch_index) + if (iteration+1) % num_training_batches == 0: + validation_accuracy = np.mean( + [validate_mb_accuracy(j) for j in xrange(num_validation_batches)]) + print("Epoch {0}: validation accuracy {1:.2%}".format( + epoch, validation_accuracy)) + if validation_accuracy >= best_validation_accuracy: + print("This is the best validation accuracy to date.") + best_validation_accuracy = validation_accuracy + best_iteration = iteration + if test_data: + test_accuracy = np.mean( + [test_mb_accuracy(j) for j in xrange(num_test_batches)]) + print('The corresponding test accuracy is {0:.2%}'.format( + test_accuracy)) + print("Finished training network.") + print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format( + best_validation_accuracy, best_iteration)) + print("Corresponding test accuracy of {0:.2%}".format(test_accuracy)) + +#### Define layer types + +class ConvPoolLayer(object): + """Used to create a combination of a convolutional and a max-pooling + layer. A more sophisticated implementation would separate the + two, but for our purposes we'll always use them together, and it + simplifies the code, so it makes sense to combine them. + + """ + + def __init__(self, filter_shape, image_shape, poolsize=(2, 2), + activation_fn=sigmoid): + """`filter_shape` is a tuple of length 4, whose entries are the number + of filters, the number of input feature maps, the filter height, and the + filter width. + + `image_shape` is a tuple of length 4, whose entries are the + mini-batch size, the number of input feature maps, the image + height, and the image width. + + `poolsize` is a tuple of length 2, whose entries are the y and + x pooling sizes. 
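+        For example, filter_shape=(20, 1, 5, 5) with
+        image_shape=(mini_batch_size, 1, 28, 28) and poolsize=(2, 2)
+        gives 20 feature maps from 5x5 filters over single-channel
+        28x28 MNIST images, max-pooled down to 12x12.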
+ + """ + self.filter_shape = filter_shape + self.image_shape = image_shape + self.poolsize = poolsize + self.activation_fn=activation_fn + # initialize weights and biases + n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize)) + self.w = theano.shared( + np.asarray( + np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape), + dtype=theano.config.floatX), + borrow=True) + self.b = theano.shared( + np.asarray( + np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)), + dtype=theano.config.floatX), + borrow=True) + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + self.inpt = inpt.reshape(self.image_shape) + conv_out = conv.conv2d( + input=self.inpt, filters=self.w, filter_shape=self.filter_shape, + image_shape=self.image_shape) + pooled_out = downsample.max_pool_2d( + input=conv_out, ds=self.poolsize, ignore_border=True) + self.output = self.activation_fn( + pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + self.output_dropout = self.output # no dropout in the convolutional layers + +class FullyConnectedLayer(object): + + def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0): + self.n_in = n_in + self.n_out = n_out + self.activation_fn = activation_fn + self.p_dropout = p_dropout + # Initialize weights and biases + self.w = theano.shared( + np.asarray( + np.random.normal( + loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)), + dtype=theano.config.floatX), + name='w', borrow=True) + self.b = theano.shared( + np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)), + dtype=theano.config.floatX), + name='b', borrow=True) + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + self.inpt = inpt.reshape((mini_batch_size, self.n_in)) + self.output = self.activation_fn( + (1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) + self.y_out = T.argmax(self.output, axis=1) + self.inpt_dropout = dropout_layer( + inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) + self.output_dropout = self.activation_fn( + T.dot(self.inpt_dropout, self.w) + self.b) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." + return T.mean(T.eq(y, self.y_out)) + +class SoftmaxLayer(object): + + def __init__(self, n_in, n_out, p_dropout=0.0): + self.n_in = n_in + self.n_out = n_out + self.p_dropout = p_dropout + # Initialize weights and biases + self.w = theano.shared( + np.zeros((n_in, n_out), dtype=theano.config.floatX), + name='w', borrow=True) + self.b = theano.shared( + np.zeros((n_out,), dtype=theano.config.floatX), + name='b', borrow=True) + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + self.inpt = inpt.reshape((mini_batch_size, self.n_in)) + self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) + self.y_out = T.argmax(self.output, axis=1) + self.inpt_dropout = dropout_layer( + inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) + self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b) + + def cost(self, net): + "Return the log-likelihood cost." + return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y]) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." + return T.mean(T.eq(y, self.y_out)) + + +#### Miscellanea +def size(data): + "Return the size of the dataset `data`." 
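+    # `data` is a (shared_x, shared_y) pair; the number of examples is the
+    # length of shared_x's first axis.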
+ return data[0].get_value(borrow=True).shape[0] + +def dropout_layer(layer, p_dropout): + srng = shared_randomstreams.RandomStreams( + np.random.RandomState(0).randint(999999)) + mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape) + return layer*T.cast(mask, theano.config.floatX) From 41e40c1b566a6fa3cc53cb21122635de62ba3dfc Mon Sep 17 00:00:00 2001 From: Philip Tasabia Date: Sun, 16 Mar 2025 20:28:10 -0400 Subject: [PATCH 2/3] init network3.py --- .gitignore | 1 + src_pytensor/requirements.txt | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 src_pytensor/requirements.txt diff --git a/.gitignore b/.gitignore index 3ef474023..a55c4e1b8 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ .DS_Store loc.py src/ec2 +*.history \ No newline at end of file diff --git a/src_pytensor/requirements.txt b/src_pytensor/requirements.txt new file mode 100644 index 000000000..edf271cbe --- /dev/null +++ b/src_pytensor/requirements.txt @@ -0,0 +1,2 @@ +pytensor>=2.26.4 +jax>=0.5.2 \ No newline at end of file From bbafa4c8a15d8f7ef5b23c3453b962abf38dbd7d Mon Sep 17 00:00:00 2001 From: Philip T <69321457+philipscoderepo@users.noreply.github.com> Date: Sun, 30 Mar 2025 17:36:25 -0400 Subject: [PATCH 3/3] convert to pytensor --- src_pytensor/network3.py | 433 +++++++++++++++++++++++---------------- 1 file changed, 259 insertions(+), 174 deletions(-) diff --git a/src_pytensor/network3.py b/src_pytensor/network3.py index 2b27774b2..fea7eefe4 100644 --- a/src_pytensor/network3.py +++ b/src_pytensor/network3.py @@ -1,102 +1,292 @@ + +""" +Got the code from https://github.com/MichalDanielDobrzanski/DeepLearningPython/pull/14/ +""" + """network3.py ~~~~~~~~~~~~~~ - A Theano-based program for training and running simple neural networks. - Supports several layer types (fully connected, convolutional, max pooling, softmax), and activation functions (sigmoid, tanh, and rectified linear units, with more easily added). - When run on a CPU, this program is much faster than network.py and network2.py. However, unlike network.py and network2.py it can also be run on a GPU, which makes it faster still. - Because the code is based on Theano, the code is different in many ways from network.py and network2.py. However, where possible I have tried to maintain consistency with the earlier programs. In particular, the API is similar to network2.py. Note that I have focused on making the code simple, easily readable, and easily modifiable. It is not optimized, and omits many desirable features. - This program incorporates ideas from the Theano documentation on convolutional neural nets (notably, http://deeplearning.net/tutorial/lenet.html ), from Misha Denil's implementation of dropout (https://github.com/mdenil/dropout ), and from Chris Olah (http://colah.github.io ). - -Written for Theano 0.6 and 0.7, needs some changes for more recent -versions of Theano. 
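+
+This version has been converted to PyTensor; JAX (conv_general_dilated and
+reduce_window from jax.lax) is used for the 2-D convolution and max-pooling
+in ConvPoolLayer.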
- """ #### Libraries # Standard library -import cPickle +import pickle import gzip # Third-party libraries import numpy as np -import theano -import theano.tensor as T -from theano.tensor.nnet import conv -from theano.tensor.nnet import softmax -from theano.tensor import shared_randomstreams -from theano.tensor.signal import downsample + +import pytensor +import pytensor.link.jax +import pytensor.tensor as pt +import pytensor.tensor +from pytensor.tensor.math import sigmoid, tanh +from pytensor.tensor.special import softmax # Activation functions for neurons def linear(z): return z -def ReLU(z): return T.maximum(0.0, z) -from theano.tensor.nnet import sigmoid -from theano.tensor import tanh +# update with pt +def ReLU(z): return pt.maximum(0.0, z) +# 2d convolution +import jax +import jax.numpy as jnp +from jax.lax import conv_general_dilated, reduce_window #### Constants GPU = True if GPU: - print "Trying to run under a GPU. If this is not desired, then modify "+\ - "network3.py\nto set the GPU flag to False." - try: theano.config.device = 'gpu' + print("Trying to run under a GPU. If this is not desired, then modify "+\ + "network3.py\nto set the GPU flag to False.") + + # config has been replaced, instead we have cuda + try: pytensor.config.device = 'cuda' except: pass # it's already set - theano.config.floatX = 'float32' + # recommended for GPU computation + pytensor.config.floatX = 'float32' + + print(f"PyTensor is running on: {pytensor.config.device}") + exit() else: - print "Running with a CPU. If this is not desired, then the modify "+\ - "network3.py to set\nthe GPU flag to True." + print("Running with a CPU. If this is not desired, then the modify "+\ + "network3.py to set\nthe GPU flag to True.") #### Load the MNIST data def load_data_shared(filename="../data/mnist.pkl.gz"): f = gzip.open(filename, 'rb') - training_data, validation_data, test_data = cPickle.load(f) + training_data, validation_data, test_data = pickle.load(f, encoding="latin1") f.close() def shared(data): """Place the data into shared variables. This allows Theano to copy the data to the GPU, if one is available. - """ - shared_x = theano.shared( - np.asarray(data[0], dtype=theano.config.floatX), borrow=True) - shared_y = theano.shared( - np.asarray(data[1], dtype=theano.config.floatX), borrow=True) - return shared_x, T.cast(shared_y, "int32") + # shared is still the same between theano and pytensor + shared_x = pytensor.shared( + np.asarray(data[0], dtype=pytensor.config.floatX), borrow=True) + + # shared is still the same between theano and pytensor + shared_y = pytensor.shared( + np.asarray(data[1], dtype=pytensor.config.floatX), borrow=True) + + # update cast to pytensor.tensor (pt) instead of theano.tensor (T) + return shared_x, pt.cast(shared_y, "int32") return [shared(training_data), shared(validation_data), shared(test_data)] +class ConvPoolLayer(object): + """Used to create a combination of a convolutional and a max-pooling + layer. A more sophisticated implementation would separate the + two, but for our purposes we'll always use them together, and it + simplifies the code, so it makes sense to combine them. + """ + + def __init__(self, filter_shape, image_shape, poolsize=(2, 2), + activation_fn=sigmoid): + """`filter_shape` is a tuple of length 4, whose entries are the number + of filters, the number of input feature maps, the filter height, and the + filter width. 
+ `image_shape` is a tuple of length 4, whose entries are the + mini-batch size, the number of input feature maps, the image + height, and the image width. + `poolsize` is a tuple of length 2, whose entries are the y and + x pooling sizes. + """ + self.filter_shape = filter_shape + self.image_shape = image_shape + self.poolsize = poolsize + self.activation_fn=activation_fn + + # initialize weights and biases + n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize)) + + # theano.shared == pytensor.shared + self.w = pytensor.shared( + np.asarray( + np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape), + # theano.config.floatX == pytensor.config.floatX + dtype=pytensor.config.floatX), + borrow=True) + + # theano.shared == pytensor.shared + self.b = pytensor.shared( + np.asarray( + np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)), + # theano.config.floatX == pytensor.config.floatX + dtype=pytensor.config.floatX), + borrow=True) + + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + # this section is not possible using just pytensor + # need to also use JAX for the 2d convolution + + # Assume self.inpt, self.w, self.filter_shape, and self.image_shape are defined + input_tensor = self.inpt # Shape: (batch, channels, height, width) + filters = self.w # Shape: (out_channels, in_channels, filter_height, filter_width) + + # Stride (assumed to be 1x1 unless specified otherwise) + stride = (1, 1) + + # Padding: Choose 'SAME' to keep the output size similar to the input + padding = "SAME" # Theano's default padding behavior is similar to "SAME" + + # Perform convolution + conv_out = conv_general_dilated( + lhs=input_tensor, # Input tensor + rhs=filters, # Convolution filters + window_strides=stride, # Stride for convolution + padding=padding # Padding type + ) + + # UPGRADE: pytensor reshape + self.inpt = pt.reshape(inpt, self.image_shape) + + # Pooling parameters (e.g., poolsize=(2, 2), ignore_border=True) + window_shape = (2, 2) # Pooling window size + strides = (2, 2) # Stride for pooling + padding = 'VALID' # Padding type (no padding at the borders) + + # Apply max pooling using jax.lax.reduce_window + pooled_out = reduce_window(conv_out, -jnp.inf, jax.lax.max, window_shape, strides, padding) + + self.output = self.activation_fn( + pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + self.output_dropout = self.output # no dropout in the convolutional layers + +class FullyConnectedLayer(object): + + def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0): + self.n_in = n_in # n input neurons + self.n_out = n_out # n output neurons + self.activation_fn = activation_fn # activation function + self.p_dropout = p_dropout # probability of dropping out (reduce overfitting) + ### Initialize weights and biases + # theano.shared == pytensor.shared + self.w = pytensor.shared( + np.asarray( + np.random.normal( + loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)), + # theano.config.floatX == pytensor.config.floatX + dtype=pytensor.config.floatX), + name='w', borrow=True) + + # theano.shared == pytensor.shared + self.b = pytensor.shared( + # theano.config.floatX == pytensor.config.floatX + np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)), + dtype=pytensor.config.floatX), + name='b', borrow=True) + + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + # UPGRADE: use pytensor reshape + self.inpt = pt.reshape(inpt, (mini_batch_size, self.n_in)) + self.output = 
self.activation_fn( + # UPGRADE: T.dot == pt.dot + + (1-self.p_dropout)*pt.dot(self.inpt, self.w) + self.b) + # UPGRADE: T.argmax == pt.argmax + self.y_out = pt.argmax(self.output, axis=1) + + # UPGRADE: use pytensor reshape + inpt_dropout = pt.reshape(inpt_dropout, (mini_batch_size, self.n_in)) + + self.inpt_dropout = dropout_layer(inpt_dropout, self.p_dropout) + self.output_dropout = self.activation_fn( + # T.dot == pt.dot + pt.dot(self.inpt_dropout, self.w) + self.b) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." + # T.mean == pt.mean; T.eq == pt.eq + return pt.mean(pt.eq(y, self.y_out)) + +class SoftmaxLayer(object): + + def __init__(self, n_in, n_out, p_dropout=0.0): + self.n_in = n_in + self.n_out = n_out + self.p_dropout = p_dropout + # Initialize weights and biases + # theano.shared == pytensor.shared + self.w = pytensor.shared( + # theano.config.floatX == pytensor.config.floatX + np.zeros((n_in, n_out), dtype=pytensor.config.floatX), + name='w', borrow=True) + # theano.shared == pytensor.shared + self.b = pytensor.shared( + # theano.config.floatX == pytensor.config.floatX + np.zeros((n_out,), dtype=pytensor.config.floatX), + name='b', borrow=True) + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + # UPGRADE: use pytensor reshape + self.inpt = pt.reshape(inpt, (mini_batch_size, self.n_in)) + # UPGRADE: T.dot == pt.dot + self.output = softmax((1-self.p_dropout) * pt.dot(self.inpt, self.w) + self.b) + # UPGRADE: T.argmax == pt.argmax + self.y_out = pt.argmax(self.output, axis=1) + # UPGRADE: use pytensor reshape + inpt_dropout = pt.reshape(inpt_dropout, (mini_batch_size, self.n_in)) + + self.inpt_dropout = dropout_layer(inpt_dropout, self.p_dropout) + + # UPGRADE: T.dot == pt.dot + self.output_dropout = softmax(pt.dot(self.inpt_dropout, self.w) + self.b) + + def cost(self, net): + "Return the log-likelihood cost." + # T.mean == pt.mean; T.log == pt.log; T.arange == pt.arange + return -pt.mean(pt.log(self.output_dropout)[pt.arange(net.y.shape[0]), net.y]) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." + # T.mean == pt.mean; T.eq == pt.eq + return pt.mean(pt.eq(y, self.y_out)) + #### Main class used to construct and train networks class Network(object): - def __init__(self, layers, mini_batch_size): + def __init__(self, layers: FullyConnectedLayer | ConvPoolLayer | SoftmaxLayer, mini_batch_size: int): """Takes a list of `layers`, describing the network architecture, and a value for the `mini_batch_size` to be used during training by stochastic gradient descent. - """ self.layers = layers self.mini_batch_size = mini_batch_size self.params = [param for layer in self.layers for param in layer.params] - self.x = T.matrix("x") - self.y = T.ivector("y") + + # T.matrix == pt.matrix + self.x = pt.matrix("x") + # T.ivector == pt.ivector + self.y = pt.ivector("y") + init_layer = self.layers[0] + + # call the set_inpt for the respective layer provided as the first layer init_layer.set_inpt(self.x, self.x, self.mini_batch_size) - for j in xrange(1, len(self.layers)): + + for j in range(1, len(self.layers)): # xrange() was renamed to range() in Python 3. 
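+            # Wire the layers together: each layer's plain and dropout
+            # outputs become the next layer's inputs.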
prev_layer, layer = self.layers[j-1], self.layers[j] layer.set_inpt( prev_layer.output, prev_layer.output_dropout, self.mini_batch_size) @@ -111,22 +301,27 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, test_x, test_y = test_data # compute number of minibatches for training, validation and testing - num_training_batches = size(training_data)/mini_batch_size - num_validation_batches = size(validation_data)/mini_batch_size - num_test_batches = size(test_data)/mini_batch_size + num_training_batches = int(size(training_data)/mini_batch_size) + num_validation_batches = int(size(validation_data)/mini_batch_size) + num_test_batches = int(size(test_data)/mini_batch_size) # define the (regularized) cost function, symbolic gradients, and updates l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers]) cost = self.layers[-1].cost(self)+\ 0.5*lmbda*l2_norm_squared/num_training_batches - grads = T.grad(cost, self.params) + + # T.grad == pt.grad + grads = pt.grad(cost, self.params) updates = [(param, param-eta*grad) for param, grad in zip(self.params, grads)] # define functions to train a mini-batch, and to compute the # accuracy in validation and test mini-batches. - i = T.lscalar() # mini-batch index - train_mb = theano.function( + # T.lscalar == pt.lscalar + i = pt.lscalar() # mini-batch index + + # theano.function == pytensor.function + train_mb = pytensor.function( [i], cost, updates=updates, givens={ self.x: @@ -134,7 +329,9 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, self.y: training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] }) - validate_mb_accuracy = theano.function( + + # theano.function == pytensor.function + validate_mb_accuracy = pytensor.function( [i], self.layers[-1].accuracy(self.y), givens={ self.x: @@ -142,7 +339,9 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, self.y: validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] }) - test_mb_accuracy = theano.function( + + # theano.function == pytensor.function + test_mb_accuracy = pytensor.function( [i], self.layers[-1].accuracy(self.y), givens={ self.x: @@ -150,23 +349,26 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, self.y: test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] }) - self.test_mb_predictions = theano.function( + + # theano.function == pytensor.function + self.test_mb_predictions = pytensor.function( [i], self.layers[-1].y_out, givens={ self.x: test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size] }) + # Do the actual training best_validation_accuracy = 0.0 - for epoch in xrange(epochs): - for minibatch_index in xrange(num_training_batches): + for epoch in range(epochs): + for minibatch_index in range(num_training_batches): iteration = num_training_batches*epoch+minibatch_index if iteration % 1000 == 0: print("Training mini-batch number {0}".format(iteration)) cost_ij = train_mb(minibatch_index) if (iteration+1) % num_training_batches == 0: validation_accuracy = np.mean( - [validate_mb_accuracy(j) for j in xrange(num_validation_batches)]) + [validate_mb_accuracy(j) for j in range(num_validation_batches)]) print("Epoch {0}: validation accuracy {1:.2%}".format( epoch, validation_accuracy)) if validation_accuracy >= best_validation_accuracy: @@ -175,7 +377,7 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, best_iteration = iteration if test_data: test_accuracy = np.mean( - [test_mb_accuracy(j) for j in xrange(num_test_batches)]) + [test_mb_accuracy(j) for j in range(num_test_batches)]) print('The 
corresponding test accuracy is {0:.2%}'.format( test_accuracy)) print("Finished training network.") @@ -183,132 +385,15 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, best_validation_accuracy, best_iteration)) print("Corresponding test accuracy of {0:.2%}".format(test_accuracy)) -#### Define layer types - -class ConvPoolLayer(object): - """Used to create a combination of a convolutional and a max-pooling - layer. A more sophisticated implementation would separate the - two, but for our purposes we'll always use them together, and it - simplifies the code, so it makes sense to combine them. - - """ - - def __init__(self, filter_shape, image_shape, poolsize=(2, 2), - activation_fn=sigmoid): - """`filter_shape` is a tuple of length 4, whose entries are the number - of filters, the number of input feature maps, the filter height, and the - filter width. - - `image_shape` is a tuple of length 4, whose entries are the - mini-batch size, the number of input feature maps, the image - height, and the image width. - - `poolsize` is a tuple of length 2, whose entries are the y and - x pooling sizes. - - """ - self.filter_shape = filter_shape - self.image_shape = image_shape - self.poolsize = poolsize - self.activation_fn=activation_fn - # initialize weights and biases - n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize)) - self.w = theano.shared( - np.asarray( - np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape), - dtype=theano.config.floatX), - borrow=True) - self.b = theano.shared( - np.asarray( - np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)), - dtype=theano.config.floatX), - borrow=True) - self.params = [self.w, self.b] - - def set_inpt(self, inpt, inpt_dropout, mini_batch_size): - self.inpt = inpt.reshape(self.image_shape) - conv_out = conv.conv2d( - input=self.inpt, filters=self.w, filter_shape=self.filter_shape, - image_shape=self.image_shape) - pooled_out = downsample.max_pool_2d( - input=conv_out, ds=self.poolsize, ignore_border=True) - self.output = self.activation_fn( - pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) - self.output_dropout = self.output # no dropout in the convolutional layers - -class FullyConnectedLayer(object): - - def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0): - self.n_in = n_in - self.n_out = n_out - self.activation_fn = activation_fn - self.p_dropout = p_dropout - # Initialize weights and biases - self.w = theano.shared( - np.asarray( - np.random.normal( - loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)), - dtype=theano.config.floatX), - name='w', borrow=True) - self.b = theano.shared( - np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)), - dtype=theano.config.floatX), - name='b', borrow=True) - self.params = [self.w, self.b] - - def set_inpt(self, inpt, inpt_dropout, mini_batch_size): - self.inpt = inpt.reshape((mini_batch_size, self.n_in)) - self.output = self.activation_fn( - (1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) - self.y_out = T.argmax(self.output, axis=1) - self.inpt_dropout = dropout_layer( - inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) - self.output_dropout = self.activation_fn( - T.dot(self.inpt_dropout, self.w) + self.b) - - def accuracy(self, y): - "Return the accuracy for the mini-batch." 
- return T.mean(T.eq(y, self.y_out)) - -class SoftmaxLayer(object): - - def __init__(self, n_in, n_out, p_dropout=0.0): - self.n_in = n_in - self.n_out = n_out - self.p_dropout = p_dropout - # Initialize weights and biases - self.w = theano.shared( - np.zeros((n_in, n_out), dtype=theano.config.floatX), - name='w', borrow=True) - self.b = theano.shared( - np.zeros((n_out,), dtype=theano.config.floatX), - name='b', borrow=True) - self.params = [self.w, self.b] - - def set_inpt(self, inpt, inpt_dropout, mini_batch_size): - self.inpt = inpt.reshape((mini_batch_size, self.n_in)) - self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) - self.y_out = T.argmax(self.output, axis=1) - self.inpt_dropout = dropout_layer( - inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) - self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b) - - def cost(self, net): - "Return the log-likelihood cost." - return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y]) - - def accuracy(self, y): - "Return the accuracy for the mini-batch." - return T.mean(T.eq(y, self.y_out)) - - -#### Miscellanea +#### Miscellaneous def size(data): "Return the size of the dataset `data`." return data[0].get_value(borrow=True).shape[0] def dropout_layer(layer, p_dropout): - srng = shared_randomstreams.RandomStreams( - np.random.RandomState(0).randint(999999)) - mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape) - return layer*T.cast(mask, theano.config.floatX) + # n = number of trials in the binomial distribution + # p = probability of success in each trial + mask = pt.random.binomial(n=1, p=1-p_dropout, size=layer.shape) + + # T.cast == pt.cast; theano.config.floatX == pytensor.config.floatX + return layer*pt.cast(mask, pytensor.config.floatX)
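
A minimal driver sketch using only the classes defined in this file. The
784/100/10 layer sizes, 60 epochs, and eta=0.1 are illustrative MNIST
settings, and it assumes the MNIST pickle is at ../data/mnist.pkl.gz and
that the GPU flag at the top of network3.py is set to False (otherwise the
module-level exit() in the GPU branch stops the import):

    import network3
    from network3 import Network, FullyConnectedLayer, SoftmaxLayer

    # Load the three shared-variable datasets (training, validation, test).
    training_data, validation_data, test_data = network3.load_data_shared()

    mini_batch_size = 10
    net = Network([
        FullyConnectedLayer(n_in=784, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)],
        mini_batch_size)

    # Train with mini-batch SGD; prints validation/test accuracy per epoch.
    net.SGD(training_data, 60, mini_batch_size, 0.1,
            validation_data, test_data)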