From 8ad1a10ffd637f663c38797f0acc4c6c3f6fbb37 Mon Sep 17 00:00:00 2001 From: Philip Tasabia Date: Sun, 16 Mar 2025 20:25:44 -0400 Subject: [PATCH 1/3] init network3 --- src_pytensor/network3.py | 314 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 src_pytensor/network3.py diff --git a/src_pytensor/network3.py b/src_pytensor/network3.py new file mode 100644 index 000000000..2b27774b2 --- /dev/null +++ b/src_pytensor/network3.py @@ -0,0 +1,314 @@ +"""network3.py +~~~~~~~~~~~~~~ + +A Theano-based program for training and running simple neural +networks. + +Supports several layer types (fully connected, convolutional, max +pooling, softmax), and activation functions (sigmoid, tanh, and +rectified linear units, with more easily added). + +When run on a CPU, this program is much faster than network.py and +network2.py. However, unlike network.py and network2.py it can also +be run on a GPU, which makes it faster still. + +Because the code is based on Theano, the code is different in many +ways from network.py and network2.py. However, where possible I have +tried to maintain consistency with the earlier programs. In +particular, the API is similar to network2.py. Note that I have +focused on making the code simple, easily readable, and easily +modifiable. It is not optimized, and omits many desirable features. + +This program incorporates ideas from the Theano documentation on +convolutional neural nets (notably, +http://deeplearning.net/tutorial/lenet.html ), from Misha Denil's +implementation of dropout (https://github.com/mdenil/dropout ), and +from Chris Olah (http://colah.github.io ). + +Written for Theano 0.6 and 0.7, needs some changes for more recent +versions of Theano. + +""" + +#### Libraries +# Standard library +import cPickle +import gzip + +# Third-party libraries +import numpy as np +import theano +import theano.tensor as T +from theano.tensor.nnet import conv +from theano.tensor.nnet import softmax +from theano.tensor import shared_randomstreams +from theano.tensor.signal import downsample + +# Activation functions for neurons +def linear(z): return z +def ReLU(z): return T.maximum(0.0, z) +from theano.tensor.nnet import sigmoid +from theano.tensor import tanh + + +#### Constants +GPU = True +if GPU: + print "Trying to run under a GPU. If this is not desired, then modify "+\ + "network3.py\nto set the GPU flag to False." + try: theano.config.device = 'gpu' + except: pass # it's already set + theano.config.floatX = 'float32' +else: + print "Running with a CPU. If this is not desired, then the modify "+\ + "network3.py to set\nthe GPU flag to True." + +#### Load the MNIST data +def load_data_shared(filename="../data/mnist.pkl.gz"): + f = gzip.open(filename, 'rb') + training_data, validation_data, test_data = cPickle.load(f) + f.close() + def shared(data): + """Place the data into shared variables. This allows Theano to copy + the data to the GPU, if one is available. 
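+        The labels are stored as floatX (Theano can only keep float data
+        on the GPU) and then cast to int32 on return, since the rest of
+        the code expects integer labels (self.y is an ivector).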
+ + """ + shared_x = theano.shared( + np.asarray(data[0], dtype=theano.config.floatX), borrow=True) + shared_y = theano.shared( + np.asarray(data[1], dtype=theano.config.floatX), borrow=True) + return shared_x, T.cast(shared_y, "int32") + return [shared(training_data), shared(validation_data), shared(test_data)] + +#### Main class used to construct and train networks +class Network(object): + + def __init__(self, layers, mini_batch_size): + """Takes a list of `layers`, describing the network architecture, and + a value for the `mini_batch_size` to be used during training + by stochastic gradient descent. + + """ + self.layers = layers + self.mini_batch_size = mini_batch_size + self.params = [param for layer in self.layers for param in layer.params] + self.x = T.matrix("x") + self.y = T.ivector("y") + init_layer = self.layers[0] + init_layer.set_inpt(self.x, self.x, self.mini_batch_size) + for j in xrange(1, len(self.layers)): + prev_layer, layer = self.layers[j-1], self.layers[j] + layer.set_inpt( + prev_layer.output, prev_layer.output_dropout, self.mini_batch_size) + self.output = self.layers[-1].output + self.output_dropout = self.layers[-1].output_dropout + + def SGD(self, training_data, epochs, mini_batch_size, eta, + validation_data, test_data, lmbda=0.0): + """Train the network using mini-batch stochastic gradient descent.""" + training_x, training_y = training_data + validation_x, validation_y = validation_data + test_x, test_y = test_data + + # compute number of minibatches for training, validation and testing + num_training_batches = size(training_data)/mini_batch_size + num_validation_batches = size(validation_data)/mini_batch_size + num_test_batches = size(test_data)/mini_batch_size + + # define the (regularized) cost function, symbolic gradients, and updates + l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers]) + cost = self.layers[-1].cost(self)+\ + 0.5*lmbda*l2_norm_squared/num_training_batches + grads = T.grad(cost, self.params) + updates = [(param, param-eta*grad) + for param, grad in zip(self.params, grads)] + + # define functions to train a mini-batch, and to compute the + # accuracy in validation and test mini-batches. 
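+        # Each compiled function below takes a mini-batch index i and uses
+        # `givens` to substitute the i-th slice of the shared data for
+        # self.x and self.y, so the whole dataset can stay on the GPU
+        # between calls.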
+ i = T.lscalar() # mini-batch index + train_mb = theano.function( + [i], cost, updates=updates, + givens={ + self.x: + training_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], + self.y: + training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + validate_mb_accuracy = theano.function( + [i], self.layers[-1].accuracy(self.y), + givens={ + self.x: + validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], + self.y: + validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + test_mb_accuracy = theano.function( + [i], self.layers[-1].accuracy(self.y), + givens={ + self.x: + test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], + self.y: + test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + self.test_mb_predictions = theano.function( + [i], self.layers[-1].y_out, + givens={ + self.x: + test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + # Do the actual training + best_validation_accuracy = 0.0 + for epoch in xrange(epochs): + for minibatch_index in xrange(num_training_batches): + iteration = num_training_batches*epoch+minibatch_index + if iteration % 1000 == 0: + print("Training mini-batch number {0}".format(iteration)) + cost_ij = train_mb(minibatch_index) + if (iteration+1) % num_training_batches == 0: + validation_accuracy = np.mean( + [validate_mb_accuracy(j) for j in xrange(num_validation_batches)]) + print("Epoch {0}: validation accuracy {1:.2%}".format( + epoch, validation_accuracy)) + if validation_accuracy >= best_validation_accuracy: + print("This is the best validation accuracy to date.") + best_validation_accuracy = validation_accuracy + best_iteration = iteration + if test_data: + test_accuracy = np.mean( + [test_mb_accuracy(j) for j in xrange(num_test_batches)]) + print('The corresponding test accuracy is {0:.2%}'.format( + test_accuracy)) + print("Finished training network.") + print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format( + best_validation_accuracy, best_iteration)) + print("Corresponding test accuracy of {0:.2%}".format(test_accuracy)) + +#### Define layer types + +class ConvPoolLayer(object): + """Used to create a combination of a convolutional and a max-pooling + layer. A more sophisticated implementation would separate the + two, but for our purposes we'll always use them together, and it + simplifies the code, so it makes sense to combine them. + + """ + + def __init__(self, filter_shape, image_shape, poolsize=(2, 2), + activation_fn=sigmoid): + """`filter_shape` is a tuple of length 4, whose entries are the number + of filters, the number of input feature maps, the filter height, and the + filter width. + + `image_shape` is a tuple of length 4, whose entries are the + mini-batch size, the number of input feature maps, the image + height, and the image width. + + `poolsize` is a tuple of length 2, whose entries are the y and + x pooling sizes. 
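+        For example, filter_shape=(20, 1, 5, 5) with
+        image_shape=(mini_batch_size, 1, 28, 28) and poolsize=(2, 2)
+        gives 20 feature maps from 5x5 filters over single-channel
+        28x28 MNIST images, max-pooled down to 12x12.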
+ + """ + self.filter_shape = filter_shape + self.image_shape = image_shape + self.poolsize = poolsize + self.activation_fn=activation_fn + # initialize weights and biases + n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize)) + self.w = theano.shared( + np.asarray( + np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape), + dtype=theano.config.floatX), + borrow=True) + self.b = theano.shared( + np.asarray( + np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)), + dtype=theano.config.floatX), + borrow=True) + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + self.inpt = inpt.reshape(self.image_shape) + conv_out = conv.conv2d( + input=self.inpt, filters=self.w, filter_shape=self.filter_shape, + image_shape=self.image_shape) + pooled_out = downsample.max_pool_2d( + input=conv_out, ds=self.poolsize, ignore_border=True) + self.output = self.activation_fn( + pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + self.output_dropout = self.output # no dropout in the convolutional layers + +class FullyConnectedLayer(object): + + def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0): + self.n_in = n_in + self.n_out = n_out + self.activation_fn = activation_fn + self.p_dropout = p_dropout + # Initialize weights and biases + self.w = theano.shared( + np.asarray( + np.random.normal( + loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)), + dtype=theano.config.floatX), + name='w', borrow=True) + self.b = theano.shared( + np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)), + dtype=theano.config.floatX), + name='b', borrow=True) + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + self.inpt = inpt.reshape((mini_batch_size, self.n_in)) + self.output = self.activation_fn( + (1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) + self.y_out = T.argmax(self.output, axis=1) + self.inpt_dropout = dropout_layer( + inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) + self.output_dropout = self.activation_fn( + T.dot(self.inpt_dropout, self.w) + self.b) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." + return T.mean(T.eq(y, self.y_out)) + +class SoftmaxLayer(object): + + def __init__(self, n_in, n_out, p_dropout=0.0): + self.n_in = n_in + self.n_out = n_out + self.p_dropout = p_dropout + # Initialize weights and biases + self.w = theano.shared( + np.zeros((n_in, n_out), dtype=theano.config.floatX), + name='w', borrow=True) + self.b = theano.shared( + np.zeros((n_out,), dtype=theano.config.floatX), + name='b', borrow=True) + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + self.inpt = inpt.reshape((mini_batch_size, self.n_in)) + self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) + self.y_out = T.argmax(self.output, axis=1) + self.inpt_dropout = dropout_layer( + inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) + self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b) + + def cost(self, net): + "Return the log-likelihood cost." + return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y]) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." + return T.mean(T.eq(y, self.y_out)) + + +#### Miscellanea +def size(data): + "Return the size of the dataset `data`." 
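+    # `data` is a (shared_x, shared_y) pair; the number of examples is the
+    # length of shared_x's first axis.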
+ return data[0].get_value(borrow=True).shape[0] + +def dropout_layer(layer, p_dropout): + srng = shared_randomstreams.RandomStreams( + np.random.RandomState(0).randint(999999)) + mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape) + return layer*T.cast(mask, theano.config.floatX) From 41e40c1b566a6fa3cc53cb21122635de62ba3dfc Mon Sep 17 00:00:00 2001 From: Philip Tasabia Date: Sun, 16 Mar 2025 20:28:10 -0400 Subject: [PATCH 2/3] init network3.py --- .gitignore | 1 + src_pytensor/requirements.txt | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 src_pytensor/requirements.txt diff --git a/.gitignore b/.gitignore index 3ef474023..a55c4e1b8 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ .DS_Store loc.py src/ec2 +*.history \ No newline at end of file diff --git a/src_pytensor/requirements.txt b/src_pytensor/requirements.txt new file mode 100644 index 000000000..edf271cbe --- /dev/null +++ b/src_pytensor/requirements.txt @@ -0,0 +1,2 @@ +pytensor>=2.26.4 +jax>=0.5.2 \ No newline at end of file From bbafa4c8a15d8f7ef5b23c3453b962abf38dbd7d Mon Sep 17 00:00:00 2001 From: Philip T <69321457+philipscoderepo@users.noreply.github.com> Date: Sun, 30 Mar 2025 17:36:25 -0400 Subject: [PATCH 3/3] convert to pytensor --- src_pytensor/network3.py | 433 +++++++++++++++++++++++---------------- 1 file changed, 259 insertions(+), 174 deletions(-) diff --git a/src_pytensor/network3.py b/src_pytensor/network3.py index 2b27774b2..fea7eefe4 100644 --- a/src_pytensor/network3.py +++ b/src_pytensor/network3.py @@ -1,102 +1,292 @@ + +""" +Got the code from https://github.com/MichalDanielDobrzanski/DeepLearningPython/pull/14/ +""" + """network3.py ~~~~~~~~~~~~~~ - A Theano-based program for training and running simple neural networks. - Supports several layer types (fully connected, convolutional, max pooling, softmax), and activation functions (sigmoid, tanh, and rectified linear units, with more easily added). - When run on a CPU, this program is much faster than network.py and network2.py. However, unlike network.py and network2.py it can also be run on a GPU, which makes it faster still. - Because the code is based on Theano, the code is different in many ways from network.py and network2.py. However, where possible I have tried to maintain consistency with the earlier programs. In particular, the API is similar to network2.py. Note that I have focused on making the code simple, easily readable, and easily modifiable. It is not optimized, and omits many desirable features. - This program incorporates ideas from the Theano documentation on convolutional neural nets (notably, http://deeplearning.net/tutorial/lenet.html ), from Misha Denil's implementation of dropout (https://github.com/mdenil/dropout ), and from Chris Olah (http://colah.github.io ). - -Written for Theano 0.6 and 0.7, needs some changes for more recent -versions of Theano. 
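+
+This version has been converted to PyTensor; JAX (conv_general_dilated and
+reduce_window from jax.lax) is used for the 2-D convolution and max-pooling
+in ConvPoolLayer.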
- """ #### Libraries # Standard library -import cPickle +import pickle import gzip # Third-party libraries import numpy as np -import theano -import theano.tensor as T -from theano.tensor.nnet import conv -from theano.tensor.nnet import softmax -from theano.tensor import shared_randomstreams -from theano.tensor.signal import downsample + +import pytensor +import pytensor.link.jax +import pytensor.tensor as pt +import pytensor.tensor +from pytensor.tensor.math import sigmoid, tanh +from pytensor.tensor.special import softmax # Activation functions for neurons def linear(z): return z -def ReLU(z): return T.maximum(0.0, z) -from theano.tensor.nnet import sigmoid -from theano.tensor import tanh +# update with pt +def ReLU(z): return pt.maximum(0.0, z) +# 2d convolution +import jax +import jax.numpy as jnp +from jax.lax import conv_general_dilated, reduce_window #### Constants GPU = True if GPU: - print "Trying to run under a GPU. If this is not desired, then modify "+\ - "network3.py\nto set the GPU flag to False." - try: theano.config.device = 'gpu' + print("Trying to run under a GPU. If this is not desired, then modify "+\ + "network3.py\nto set the GPU flag to False.") + + # config has been replaced, instead we have cuda + try: pytensor.config.device = 'cuda' except: pass # it's already set - theano.config.floatX = 'float32' + # recommended for GPU computation + pytensor.config.floatX = 'float32' + + print(f"PyTensor is running on: {pytensor.config.device}") + exit() else: - print "Running with a CPU. If this is not desired, then the modify "+\ - "network3.py to set\nthe GPU flag to True." + print("Running with a CPU. If this is not desired, then the modify "+\ + "network3.py to set\nthe GPU flag to True.") #### Load the MNIST data def load_data_shared(filename="../data/mnist.pkl.gz"): f = gzip.open(filename, 'rb') - training_data, validation_data, test_data = cPickle.load(f) + training_data, validation_data, test_data = pickle.load(f, encoding="latin1") f.close() def shared(data): """Place the data into shared variables. This allows Theano to copy the data to the GPU, if one is available. - """ - shared_x = theano.shared( - np.asarray(data[0], dtype=theano.config.floatX), borrow=True) - shared_y = theano.shared( - np.asarray(data[1], dtype=theano.config.floatX), borrow=True) - return shared_x, T.cast(shared_y, "int32") + # shared is still the same between theano and pytensor + shared_x = pytensor.shared( + np.asarray(data[0], dtype=pytensor.config.floatX), borrow=True) + + # shared is still the same between theano and pytensor + shared_y = pytensor.shared( + np.asarray(data[1], dtype=pytensor.config.floatX), borrow=True) + + # update cast to pytensor.tensor (pt) instead of theano.tensor (T) + return shared_x, pt.cast(shared_y, "int32") return [shared(training_data), shared(validation_data), shared(test_data)] +class ConvPoolLayer(object): + """Used to create a combination of a convolutional and a max-pooling + layer. A more sophisticated implementation would separate the + two, but for our purposes we'll always use them together, and it + simplifies the code, so it makes sense to combine them. + """ + + def __init__(self, filter_shape, image_shape, poolsize=(2, 2), + activation_fn=sigmoid): + """`filter_shape` is a tuple of length 4, whose entries are the number + of filters, the number of input feature maps, the filter height, and the + filter width. 
+ `image_shape` is a tuple of length 4, whose entries are the + mini-batch size, the number of input feature maps, the image + height, and the image width. + `poolsize` is a tuple of length 2, whose entries are the y and + x pooling sizes. + """ + self.filter_shape = filter_shape + self.image_shape = image_shape + self.poolsize = poolsize + self.activation_fn=activation_fn + + # initialize weights and biases + n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize)) + + # theano.shared == pytensor.shared + self.w = pytensor.shared( + np.asarray( + np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape), + # theano.config.floatX == pytensor.config.floatX + dtype=pytensor.config.floatX), + borrow=True) + + # theano.shared == pytensor.shared + self.b = pytensor.shared( + np.asarray( + np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)), + # theano.config.floatX == pytensor.config.floatX + dtype=pytensor.config.floatX), + borrow=True) + + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + # this section is not possible using just pytensor + # need to also use JAX for the 2d convolution + + # Assume self.inpt, self.w, self.filter_shape, and self.image_shape are defined + input_tensor = self.inpt # Shape: (batch, channels, height, width) + filters = self.w # Shape: (out_channels, in_channels, filter_height, filter_width) + + # Stride (assumed to be 1x1 unless specified otherwise) + stride = (1, 1) + + # Padding: Choose 'SAME' to keep the output size similar to the input + padding = "SAME" # Theano's default padding behavior is similar to "SAME" + + # Perform convolution + conv_out = conv_general_dilated( + lhs=input_tensor, # Input tensor + rhs=filters, # Convolution filters + window_strides=stride, # Stride for convolution + padding=padding # Padding type + ) + + # UPGRADE: pytensor reshape + self.inpt = pt.reshape(inpt, self.image_shape) + + # Pooling parameters (e.g., poolsize=(2, 2), ignore_border=True) + window_shape = (2, 2) # Pooling window size + strides = (2, 2) # Stride for pooling + padding = 'VALID' # Padding type (no padding at the borders) + + # Apply max pooling using jax.lax.reduce_window + pooled_out = reduce_window(conv_out, -jnp.inf, jax.lax.max, window_shape, strides, padding) + + self.output = self.activation_fn( + pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + self.output_dropout = self.output # no dropout in the convolutional layers + +class FullyConnectedLayer(object): + + def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0): + self.n_in = n_in # n input neurons + self.n_out = n_out # n output neurons + self.activation_fn = activation_fn # activation function + self.p_dropout = p_dropout # probability of dropping out (reduce overfitting) + ### Initialize weights and biases + # theano.shared == pytensor.shared + self.w = pytensor.shared( + np.asarray( + np.random.normal( + loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)), + # theano.config.floatX == pytensor.config.floatX + dtype=pytensor.config.floatX), + name='w', borrow=True) + + # theano.shared == pytensor.shared + self.b = pytensor.shared( + # theano.config.floatX == pytensor.config.floatX + np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)), + dtype=pytensor.config.floatX), + name='b', borrow=True) + + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + # UPGRADE: use pytensor reshape + self.inpt = pt.reshape(inpt, (mini_batch_size, self.n_in)) + self.output = 
self.activation_fn( + # UPGRADE: T.dot == pt.dot + + (1-self.p_dropout)*pt.dot(self.inpt, self.w) + self.b) + # UPGRADE: T.argmax == pt.argmax + self.y_out = pt.argmax(self.output, axis=1) + + # UPGRADE: use pytensor reshape + inpt_dropout = pt.reshape(inpt_dropout, (mini_batch_size, self.n_in)) + + self.inpt_dropout = dropout_layer(inpt_dropout, self.p_dropout) + self.output_dropout = self.activation_fn( + # T.dot == pt.dot + pt.dot(self.inpt_dropout, self.w) + self.b) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." + # T.mean == pt.mean; T.eq == pt.eq + return pt.mean(pt.eq(y, self.y_out)) + +class SoftmaxLayer(object): + + def __init__(self, n_in, n_out, p_dropout=0.0): + self.n_in = n_in + self.n_out = n_out + self.p_dropout = p_dropout + # Initialize weights and biases + # theano.shared == pytensor.shared + self.w = pytensor.shared( + # theano.config.floatX == pytensor.config.floatX + np.zeros((n_in, n_out), dtype=pytensor.config.floatX), + name='w', borrow=True) + # theano.shared == pytensor.shared + self.b = pytensor.shared( + # theano.config.floatX == pytensor.config.floatX + np.zeros((n_out,), dtype=pytensor.config.floatX), + name='b', borrow=True) + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + # UPGRADE: use pytensor reshape + self.inpt = pt.reshape(inpt, (mini_batch_size, self.n_in)) + # UPGRADE: T.dot == pt.dot + self.output = softmax((1-self.p_dropout) * pt.dot(self.inpt, self.w) + self.b) + # UPGRADE: T.argmax == pt.argmax + self.y_out = pt.argmax(self.output, axis=1) + # UPGRADE: use pytensor reshape + inpt_dropout = pt.reshape(inpt_dropout, (mini_batch_size, self.n_in)) + + self.inpt_dropout = dropout_layer(inpt_dropout, self.p_dropout) + + # UPGRADE: T.dot == pt.dot + self.output_dropout = softmax(pt.dot(self.inpt_dropout, self.w) + self.b) + + def cost(self, net): + "Return the log-likelihood cost." + # T.mean == pt.mean; T.log == pt.log; T.arange == pt.arange + return -pt.mean(pt.log(self.output_dropout)[pt.arange(net.y.shape[0]), net.y]) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." + # T.mean == pt.mean; T.eq == pt.eq + return pt.mean(pt.eq(y, self.y_out)) + #### Main class used to construct and train networks class Network(object): - def __init__(self, layers, mini_batch_size): + def __init__(self, layers: FullyConnectedLayer | ConvPoolLayer | SoftmaxLayer, mini_batch_size: int): """Takes a list of `layers`, describing the network architecture, and a value for the `mini_batch_size` to be used during training by stochastic gradient descent. - """ self.layers = layers self.mini_batch_size = mini_batch_size self.params = [param for layer in self.layers for param in layer.params] - self.x = T.matrix("x") - self.y = T.ivector("y") + + # T.matrix == pt.matrix + self.x = pt.matrix("x") + # T.ivector == pt.ivector + self.y = pt.ivector("y") + init_layer = self.layers[0] + + # call the set_inpt for the respective layer provided as the first layer init_layer.set_inpt(self.x, self.x, self.mini_batch_size) - for j in xrange(1, len(self.layers)): + + for j in range(1, len(self.layers)): # xrange() was renamed to range() in Python 3. 
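+            # Wire the layers together: each layer's plain and dropout
+            # outputs become the next layer's inputs.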
prev_layer, layer = self.layers[j-1], self.layers[j] layer.set_inpt( prev_layer.output, prev_layer.output_dropout, self.mini_batch_size) @@ -111,22 +301,27 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, test_x, test_y = test_data # compute number of minibatches for training, validation and testing - num_training_batches = size(training_data)/mini_batch_size - num_validation_batches = size(validation_data)/mini_batch_size - num_test_batches = size(test_data)/mini_batch_size + num_training_batches = int(size(training_data)/mini_batch_size) + num_validation_batches = int(size(validation_data)/mini_batch_size) + num_test_batches = int(size(test_data)/mini_batch_size) # define the (regularized) cost function, symbolic gradients, and updates l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers]) cost = self.layers[-1].cost(self)+\ 0.5*lmbda*l2_norm_squared/num_training_batches - grads = T.grad(cost, self.params) + + # T.grad == pt.grad + grads = pt.grad(cost, self.params) updates = [(param, param-eta*grad) for param, grad in zip(self.params, grads)] # define functions to train a mini-batch, and to compute the # accuracy in validation and test mini-batches. - i = T.lscalar() # mini-batch index - train_mb = theano.function( + # T.lscalar == pt.lscalar + i = pt.lscalar() # mini-batch index + + # theano.function == pytensor.function + train_mb = pytensor.function( [i], cost, updates=updates, givens={ self.x: @@ -134,7 +329,9 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, self.y: training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] }) - validate_mb_accuracy = theano.function( + + # theano.function == pytensor.function + validate_mb_accuracy = pytensor.function( [i], self.layers[-1].accuracy(self.y), givens={ self.x: @@ -142,7 +339,9 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, self.y: validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] }) - test_mb_accuracy = theano.function( + + # theano.function == pytensor.function + test_mb_accuracy = pytensor.function( [i], self.layers[-1].accuracy(self.y), givens={ self.x: @@ -150,23 +349,26 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, self.y: test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] }) - self.test_mb_predictions = theano.function( + + # theano.function == pytensor.function + self.test_mb_predictions = pytensor.function( [i], self.layers[-1].y_out, givens={ self.x: test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size] }) + # Do the actual training best_validation_accuracy = 0.0 - for epoch in xrange(epochs): - for minibatch_index in xrange(num_training_batches): + for epoch in range(epochs): + for minibatch_index in range(num_training_batches): iteration = num_training_batches*epoch+minibatch_index if iteration % 1000 == 0: print("Training mini-batch number {0}".format(iteration)) cost_ij = train_mb(minibatch_index) if (iteration+1) % num_training_batches == 0: validation_accuracy = np.mean( - [validate_mb_accuracy(j) for j in xrange(num_validation_batches)]) + [validate_mb_accuracy(j) for j in range(num_validation_batches)]) print("Epoch {0}: validation accuracy {1:.2%}".format( epoch, validation_accuracy)) if validation_accuracy >= best_validation_accuracy: @@ -175,7 +377,7 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, best_iteration = iteration if test_data: test_accuracy = np.mean( - [test_mb_accuracy(j) for j in xrange(num_test_batches)]) + [test_mb_accuracy(j) for j in range(num_test_batches)]) print('The 
corresponding test accuracy is {0:.2%}'.format( test_accuracy)) print("Finished training network.") @@ -183,132 +385,15 @@ def SGD(self, training_data, epochs, mini_batch_size, eta, best_validation_accuracy, best_iteration)) print("Corresponding test accuracy of {0:.2%}".format(test_accuracy)) -#### Define layer types - -class ConvPoolLayer(object): - """Used to create a combination of a convolutional and a max-pooling - layer. A more sophisticated implementation would separate the - two, but for our purposes we'll always use them together, and it - simplifies the code, so it makes sense to combine them. - - """ - - def __init__(self, filter_shape, image_shape, poolsize=(2, 2), - activation_fn=sigmoid): - """`filter_shape` is a tuple of length 4, whose entries are the number - of filters, the number of input feature maps, the filter height, and the - filter width. - - `image_shape` is a tuple of length 4, whose entries are the - mini-batch size, the number of input feature maps, the image - height, and the image width. - - `poolsize` is a tuple of length 2, whose entries are the y and - x pooling sizes. - - """ - self.filter_shape = filter_shape - self.image_shape = image_shape - self.poolsize = poolsize - self.activation_fn=activation_fn - # initialize weights and biases - n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize)) - self.w = theano.shared( - np.asarray( - np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape), - dtype=theano.config.floatX), - borrow=True) - self.b = theano.shared( - np.asarray( - np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)), - dtype=theano.config.floatX), - borrow=True) - self.params = [self.w, self.b] - - def set_inpt(self, inpt, inpt_dropout, mini_batch_size): - self.inpt = inpt.reshape(self.image_shape) - conv_out = conv.conv2d( - input=self.inpt, filters=self.w, filter_shape=self.filter_shape, - image_shape=self.image_shape) - pooled_out = downsample.max_pool_2d( - input=conv_out, ds=self.poolsize, ignore_border=True) - self.output = self.activation_fn( - pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) - self.output_dropout = self.output # no dropout in the convolutional layers - -class FullyConnectedLayer(object): - - def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0): - self.n_in = n_in - self.n_out = n_out - self.activation_fn = activation_fn - self.p_dropout = p_dropout - # Initialize weights and biases - self.w = theano.shared( - np.asarray( - np.random.normal( - loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)), - dtype=theano.config.floatX), - name='w', borrow=True) - self.b = theano.shared( - np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)), - dtype=theano.config.floatX), - name='b', borrow=True) - self.params = [self.w, self.b] - - def set_inpt(self, inpt, inpt_dropout, mini_batch_size): - self.inpt = inpt.reshape((mini_batch_size, self.n_in)) - self.output = self.activation_fn( - (1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) - self.y_out = T.argmax(self.output, axis=1) - self.inpt_dropout = dropout_layer( - inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) - self.output_dropout = self.activation_fn( - T.dot(self.inpt_dropout, self.w) + self.b) - - def accuracy(self, y): - "Return the accuracy for the mini-batch." 
- return T.mean(T.eq(y, self.y_out)) - -class SoftmaxLayer(object): - - def __init__(self, n_in, n_out, p_dropout=0.0): - self.n_in = n_in - self.n_out = n_out - self.p_dropout = p_dropout - # Initialize weights and biases - self.w = theano.shared( - np.zeros((n_in, n_out), dtype=theano.config.floatX), - name='w', borrow=True) - self.b = theano.shared( - np.zeros((n_out,), dtype=theano.config.floatX), - name='b', borrow=True) - self.params = [self.w, self.b] - - def set_inpt(self, inpt, inpt_dropout, mini_batch_size): - self.inpt = inpt.reshape((mini_batch_size, self.n_in)) - self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b) - self.y_out = T.argmax(self.output, axis=1) - self.inpt_dropout = dropout_layer( - inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout) - self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b) - - def cost(self, net): - "Return the log-likelihood cost." - return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y]) - - def accuracy(self, y): - "Return the accuracy for the mini-batch." - return T.mean(T.eq(y, self.y_out)) - - -#### Miscellanea +#### Miscellaneous def size(data): "Return the size of the dataset `data`." return data[0].get_value(borrow=True).shape[0] def dropout_layer(layer, p_dropout): - srng = shared_randomstreams.RandomStreams( - np.random.RandomState(0).randint(999999)) - mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape) - return layer*T.cast(mask, theano.config.floatX) + # n = number of trials in the binomial distribution + # p = probability of success in each trial + mask = pt.random.binomial(n=1, p=1-p_dropout, size=layer.shape) + + # T.cast == pt.cast; theano.config.floatX == pytensor.config.floatX + return layer*pt.cast(mask, pytensor.config.floatX)
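
A minimal driver sketch using only the classes defined in this file. The
784/100/10 layer sizes, 60 epochs, and eta=0.1 are illustrative MNIST
settings, and it assumes the MNIST pickle is at ../data/mnist.pkl.gz and
that the GPU flag at the top of network3.py is set to False (otherwise the
module-level exit() in the GPU branch stops the import):

    import network3
    from network3 import Network, FullyConnectedLayer, SoftmaxLayer

    # Load the three shared-variable datasets (training, validation, test).
    training_data, validation_data, test_data = network3.load_data_shared()

    mini_batch_size = 10
    net = Network([
        FullyConnectedLayer(n_in=784, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)],
        mini_batch_size)

    # Train with mini-batch SGD; prints validation/test accuracy per epoch.
    net.SGD(training_data, 60, mini_batch_size, 0.1,
            validation_data, test_data)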