diff --git a/.gitignore b/.gitignore
index 3ef474023..a55c4e1b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
 .DS_Store
 loc.py
 src/ec2
+*.history
\ No newline at end of file
diff --git a/src_pytensor/network3.py b/src_pytensor/network3.py
new file mode 100644
index 000000000..fea7eefe4
--- /dev/null
+++ b/src_pytensor/network3.py
@@ -0,0 +1,399 @@
+
+"""
+Got the code from https://github.com/MichalDanielDobrzanski/DeepLearningPython/pull/14/
+"""
+
+"""network3.py
+~~~~~~~~~~~~~~
+A PyTensor-based port of the Theano program for training and running
+simple neural networks.
+Supports several layer types (fully connected, convolutional, max
+pooling, softmax), and activation functions (sigmoid, tanh, and
+rectified linear units, with more easily added).
+When run on a CPU, this program is much faster than network.py and
+network2.py. However, unlike network.py and network2.py it can also
+be run on a GPU, which makes it faster still.
+Because the code is based on PyTensor (originally Theano), it is
+different in many ways from network.py and network2.py. However,
+where possible I have tried to maintain consistency with the earlier
+programs. In particular, the API is similar to network2.py. Note that
+I have focused on making the code simple, easily readable, and easily
+modifiable. It is not optimized, and omits many desirable features.
+This program incorporates ideas from the Theano documentation on
+convolutional neural nets (notably,
+http://deeplearning.net/tutorial/lenet.html ), from Misha Denil's
+implementation of dropout (https://github.com/mdenil/dropout ), and
+from Chris Olah (http://colah.github.io ).
+"""
+
+#### Libraries
+# Standard library
+import pickle
+import gzip
+
+# Third-party libraries
+import numpy as np
+
+import pytensor
+import pytensor.link.jax
+import pytensor.tensor as pt
+from pytensor.tensor.math import sigmoid, tanh
+from pytensor.tensor.special import softmax
+
+# Activation functions for neurons
+def linear(z): return z
+# update with pt
+def ReLU(z): return pt.maximum(0.0, z)
+
+# 2d convolution
+import jax
+import jax.numpy as jnp
+from jax.lax import conv_general_dilated, reduce_window
+
+#### Constants
+GPU = True
+if GPU:
+    print("Trying to run under a GPU. If this is not desired, then modify "+\
+        "network3.py\nto set the GPU flag to False.")
+
+    # the old Theano `device` flag may not be accepted by newer PyTensor
+    # versions; if setting it fails, fall back to the CPU
+    try: pytensor.config.device = 'cuda'
+    except Exception: pass
+    # recommended for GPU computation
+    pytensor.config.floatX = 'float32'
+
+    print(f"PyTensor is running on: {pytensor.config.device}")
+else:
+    print("Running with a CPU. If this is not desired, then modify "+\
+        "network3.py to set\nthe GPU flag to True.")
+
+#### Load the MNIST data
+def load_data_shared(filename="../data/mnist.pkl.gz"):
+    f = gzip.open(filename, 'rb')
+    training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
+    f.close()
+    def shared(data):
+        """Place the data into shared variables. This allows PyTensor to copy
+        the data to the GPU, if one is available.
+ """ + # shared is still the same between theano and pytensor + shared_x = pytensor.shared( + np.asarray(data[0], dtype=pytensor.config.floatX), borrow=True) + + # shared is still the same between theano and pytensor + shared_y = pytensor.shared( + np.asarray(data[1], dtype=pytensor.config.floatX), borrow=True) + + # update cast to pytensor.tensor (pt) instead of theano.tensor (T) + return shared_x, pt.cast(shared_y, "int32") + return [shared(training_data), shared(validation_data), shared(test_data)] + +class ConvPoolLayer(object): + """Used to create a combination of a convolutional and a max-pooling + layer. A more sophisticated implementation would separate the + two, but for our purposes we'll always use them together, and it + simplifies the code, so it makes sense to combine them. + """ + + def __init__(self, filter_shape, image_shape, poolsize=(2, 2), + activation_fn=sigmoid): + """`filter_shape` is a tuple of length 4, whose entries are the number + of filters, the number of input feature maps, the filter height, and the + filter width. + `image_shape` is a tuple of length 4, whose entries are the + mini-batch size, the number of input feature maps, the image + height, and the image width. + `poolsize` is a tuple of length 2, whose entries are the y and + x pooling sizes. + """ + self.filter_shape = filter_shape + self.image_shape = image_shape + self.poolsize = poolsize + self.activation_fn=activation_fn + + # initialize weights and biases + n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize)) + + # theano.shared == pytensor.shared + self.w = pytensor.shared( + np.asarray( + np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape), + # theano.config.floatX == pytensor.config.floatX + dtype=pytensor.config.floatX), + borrow=True) + + # theano.shared == pytensor.shared + self.b = pytensor.shared( + np.asarray( + np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)), + # theano.config.floatX == pytensor.config.floatX + dtype=pytensor.config.floatX), + borrow=True) + + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + # this section is not possible using just pytensor + # need to also use JAX for the 2d convolution + + # Assume self.inpt, self.w, self.filter_shape, and self.image_shape are defined + input_tensor = self.inpt # Shape: (batch, channels, height, width) + filters = self.w # Shape: (out_channels, in_channels, filter_height, filter_width) + + # Stride (assumed to be 1x1 unless specified otherwise) + stride = (1, 1) + + # Padding: Choose 'SAME' to keep the output size similar to the input + padding = "SAME" # Theano's default padding behavior is similar to "SAME" + + # Perform convolution + conv_out = conv_general_dilated( + lhs=input_tensor, # Input tensor + rhs=filters, # Convolution filters + window_strides=stride, # Stride for convolution + padding=padding # Padding type + ) + + # UPGRADE: pytensor reshape + self.inpt = pt.reshape(inpt, self.image_shape) + + # Pooling parameters (e.g., poolsize=(2, 2), ignore_border=True) + window_shape = (2, 2) # Pooling window size + strides = (2, 2) # Stride for pooling + padding = 'VALID' # Padding type (no padding at the borders) + + # Apply max pooling using jax.lax.reduce_window + pooled_out = reduce_window(conv_out, -jnp.inf, jax.lax.max, window_shape, strides, padding) + + self.output = self.activation_fn( + pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + self.output_dropout = self.output # no dropout in the convolutional layers + +class 
FullyConnectedLayer(object): + + def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0): + self.n_in = n_in # n input neurons + self.n_out = n_out # n output neurons + self.activation_fn = activation_fn # activation function + self.p_dropout = p_dropout # probability of dropping out (reduce overfitting) + ### Initialize weights and biases + # theano.shared == pytensor.shared + self.w = pytensor.shared( + np.asarray( + np.random.normal( + loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)), + # theano.config.floatX == pytensor.config.floatX + dtype=pytensor.config.floatX), + name='w', borrow=True) + + # theano.shared == pytensor.shared + self.b = pytensor.shared( + # theano.config.floatX == pytensor.config.floatX + np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)), + dtype=pytensor.config.floatX), + name='b', borrow=True) + + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + # UPGRADE: use pytensor reshape + self.inpt = pt.reshape(inpt, (mini_batch_size, self.n_in)) + self.output = self.activation_fn( + # UPGRADE: T.dot == pt.dot + + (1-self.p_dropout)*pt.dot(self.inpt, self.w) + self.b) + # UPGRADE: T.argmax == pt.argmax + self.y_out = pt.argmax(self.output, axis=1) + + # UPGRADE: use pytensor reshape + inpt_dropout = pt.reshape(inpt_dropout, (mini_batch_size, self.n_in)) + + self.inpt_dropout = dropout_layer(inpt_dropout, self.p_dropout) + self.output_dropout = self.activation_fn( + # T.dot == pt.dot + pt.dot(self.inpt_dropout, self.w) + self.b) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." + # T.mean == pt.mean; T.eq == pt.eq + return pt.mean(pt.eq(y, self.y_out)) + +class SoftmaxLayer(object): + + def __init__(self, n_in, n_out, p_dropout=0.0): + self.n_in = n_in + self.n_out = n_out + self.p_dropout = p_dropout + # Initialize weights and biases + # theano.shared == pytensor.shared + self.w = pytensor.shared( + # theano.config.floatX == pytensor.config.floatX + np.zeros((n_in, n_out), dtype=pytensor.config.floatX), + name='w', borrow=True) + # theano.shared == pytensor.shared + self.b = pytensor.shared( + # theano.config.floatX == pytensor.config.floatX + np.zeros((n_out,), dtype=pytensor.config.floatX), + name='b', borrow=True) + self.params = [self.w, self.b] + + def set_inpt(self, inpt, inpt_dropout, mini_batch_size): + # UPGRADE: use pytensor reshape + self.inpt = pt.reshape(inpt, (mini_batch_size, self.n_in)) + # UPGRADE: T.dot == pt.dot + self.output = softmax((1-self.p_dropout) * pt.dot(self.inpt, self.w) + self.b) + # UPGRADE: T.argmax == pt.argmax + self.y_out = pt.argmax(self.output, axis=1) + # UPGRADE: use pytensor reshape + inpt_dropout = pt.reshape(inpt_dropout, (mini_batch_size, self.n_in)) + + self.inpt_dropout = dropout_layer(inpt_dropout, self.p_dropout) + + # UPGRADE: T.dot == pt.dot + self.output_dropout = softmax(pt.dot(self.inpt_dropout, self.w) + self.b) + + def cost(self, net): + "Return the log-likelihood cost." + # T.mean == pt.mean; T.log == pt.log; T.arange == pt.arange + return -pt.mean(pt.log(self.output_dropout)[pt.arange(net.y.shape[0]), net.y]) + + def accuracy(self, y): + "Return the accuracy for the mini-batch." 
+        # T.mean == pt.mean; T.eq == pt.eq
+        return pt.mean(pt.eq(y, self.y_out))
+
+#### Main class used to construct and train networks
+class Network(object):
+
+    def __init__(self, layers: list[FullyConnectedLayer | ConvPoolLayer | SoftmaxLayer], mini_batch_size: int):
+        """Takes a list of `layers`, describing the network architecture, and
+        a value for the `mini_batch_size` to be used during training
+        by stochastic gradient descent.
+        """
+        self.layers = layers
+        self.mini_batch_size = mini_batch_size
+        self.params = [param for layer in self.layers for param in layer.params]
+
+        # T.matrix == pt.matrix
+        self.x = pt.matrix("x")
+        # T.ivector == pt.ivector
+        self.y = pt.ivector("y")
+
+        init_layer = self.layers[0]
+
+        # call the set_inpt for the respective layer provided as the first layer
+        init_layer.set_inpt(self.x, self.x, self.mini_batch_size)
+
+        for j in range(1, len(self.layers)): # xrange() was renamed to range() in Python 3.
+            prev_layer, layer = self.layers[j-1], self.layers[j]
+            layer.set_inpt(
+                prev_layer.output, prev_layer.output_dropout, self.mini_batch_size)
+        self.output = self.layers[-1].output
+        self.output_dropout = self.layers[-1].output_dropout
+
+    def SGD(self, training_data, epochs, mini_batch_size, eta,
+            validation_data, test_data, lmbda=0.0):
+        """Train the network using mini-batch stochastic gradient descent."""
+        training_x, training_y = training_data
+        validation_x, validation_y = validation_data
+        test_x, test_y = test_data
+
+        # compute number of minibatches for training, validation and testing
+        num_training_batches = int(size(training_data)/mini_batch_size)
+        num_validation_batches = int(size(validation_data)/mini_batch_size)
+        num_test_batches = int(size(test_data)/mini_batch_size)
+
+        # define the (regularized) cost function, symbolic gradients, and updates
+        l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers])
+        cost = self.layers[-1].cost(self)+\
+               0.5*lmbda*l2_norm_squared/num_training_batches
+
+        # T.grad == pt.grad
+        grads = pt.grad(cost, self.params)
+        updates = [(param, param-eta*grad)
+                   for param, grad in zip(self.params, grads)]
+
+        # define functions to train a mini-batch, and to compute the
+        # accuracy in validation and test mini-batches.
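+        # Each compiled function below takes a mini-batch index i; `givens`
+        # substitutes the corresponding slice of the shared data for
+        # self.x / self.y, so the full dataset can stay in the shared
+        # variables created by load_data_shared.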
+ # T.lscalar == pt.lscalar + i = pt.lscalar() # mini-batch index + + # theano.function == pytensor.function + train_mb = pytensor.function( + [i], cost, updates=updates, + givens={ + self.x: + training_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], + self.y: + training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + + # theano.function == pytensor.function + validate_mb_accuracy = pytensor.function( + [i], self.layers[-1].accuracy(self.y), + givens={ + self.x: + validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], + self.y: + validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + + # theano.function == pytensor.function + test_mb_accuracy = pytensor.function( + [i], self.layers[-1].accuracy(self.y), + givens={ + self.x: + test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size], + self.y: + test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + + # theano.function == pytensor.function + self.test_mb_predictions = pytensor.function( + [i], self.layers[-1].y_out, + givens={ + self.x: + test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size] + }) + + # Do the actual training + best_validation_accuracy = 0.0 + for epoch in range(epochs): + for minibatch_index in range(num_training_batches): + iteration = num_training_batches*epoch+minibatch_index + if iteration % 1000 == 0: + print("Training mini-batch number {0}".format(iteration)) + cost_ij = train_mb(minibatch_index) + if (iteration+1) % num_training_batches == 0: + validation_accuracy = np.mean( + [validate_mb_accuracy(j) for j in range(num_validation_batches)]) + print("Epoch {0}: validation accuracy {1:.2%}".format( + epoch, validation_accuracy)) + if validation_accuracy >= best_validation_accuracy: + print("This is the best validation accuracy to date.") + best_validation_accuracy = validation_accuracy + best_iteration = iteration + if test_data: + test_accuracy = np.mean( + [test_mb_accuracy(j) for j in range(num_test_batches)]) + print('The corresponding test accuracy is {0:.2%}'.format( + test_accuracy)) + print("Finished training network.") + print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format( + best_validation_accuracy, best_iteration)) + print("Corresponding test accuracy of {0:.2%}".format(test_accuracy)) + +#### Miscellaneous +def size(data): + "Return the size of the dataset `data`." + return data[0].get_value(borrow=True).shape[0] + +def dropout_layer(layer, p_dropout): + # n = number of trials in the binomial distribution + # p = probability of success in each trial + mask = pt.random.binomial(n=1, p=1-p_dropout, size=layer.shape) + + # T.cast == pt.cast; theano.config.floatX == pytensor.config.floatX + return layer*pt.cast(mask, pytensor.config.floatX) diff --git a/src_pytensor/requirements.txt b/src_pytensor/requirements.txt new file mode 100644 index 000000000..edf271cbe --- /dev/null +++ b/src_pytensor/requirements.txt @@ -0,0 +1,2 @@ +pytensor>=2.26.4 +jax>=0.5.2 \ No newline at end of file
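Usage note: a minimal sketch of how the ported API can be exercised, mirroring the original network3.py workflow. It assumes the module is importable as `network3` and that the MNIST pickle sits at ../data/mnist.pkl.gz (the default in load_data_shared). The fully connected/softmax stack avoids ConvPoolLayer, whose JAX-based set_inpt is still a work in progress, and the hyperparameters are illustrative only.

    from network3 import Network, FullyConnectedLayer, SoftmaxLayer, load_data_shared

    # load the three MNIST splits into shared variables
    training_data, validation_data, test_data = load_data_shared()

    mini_batch_size = 10
    net = Network([
        FullyConnectedLayer(n_in=784, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)],
        mini_batch_size)

    # 60 epochs, learning rate 0.1, no L2 regularization
    net.SGD(training_data, 60, mini_batch_size, 0.1,
            validation_data, test_data)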