diff --git a/examples/generative/real_nvp.py b/examples/generative/real_nvp.py index af9e767ef1..b8eefed97a 100644 --- a/examples/generative/real_nvp.py +++ b/examples/generative/real_nvp.py @@ -41,6 +41,16 @@ from sklearn.datasets import make_moons import numpy as np import matplotlib.pyplot as plt + +# Compatibility patch for TFP with Keras 3 / TF 2.19+ +try: + if not hasattr(tf._api.v2.compat.v2.__internal__, "register_load_context_function"): + tf._api.v2.compat.v2.__internal__.register_load_context_function = ( + tf._api.v2.compat.v2.__internal__.register_call_context_function + ) +except AttributeError: + pass + import tensorflow_probability as tfp """ @@ -179,48 +189,49 @@ def test_step(self, data): ## Model training """ -model = RealNVP(num_coupling_layers=6) - -model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001)) - -history = model.fit( - normalized_data, batch_size=256, epochs=300, verbose=2, validation_split=0.2 -) - -""" -## Performance evaluation -""" - -plt.figure(figsize=(15, 10)) -plt.plot(history.history["loss"]) -plt.plot(history.history["val_loss"]) -plt.title("model loss") -plt.legend(["train", "validation"], loc="upper right") -plt.ylabel("loss") -plt.xlabel("epoch") - -# From data to latent space. -z, _ = model(normalized_data) - -# From latent space to data. 
-samples = model.distribution.sample(3000) -x, _ = model.predict(samples) - -f, axes = plt.subplots(2, 2) -f.set_size_inches(20, 15) - -axes[0, 0].scatter(normalized_data[:, 0], normalized_data[:, 1], color="r") -axes[0, 0].set(title="Inference data space X", xlabel="x", ylabel="y") -axes[0, 1].scatter(z[:, 0], z[:, 1], color="r") -axes[0, 1].set(title="Inference latent space Z", xlabel="x", ylabel="y") -axes[0, 1].set_xlim([-3.5, 4]) -axes[0, 1].set_ylim([-4, 4]) -axes[1, 0].scatter(samples[:, 0], samples[:, 1], color="g") -axes[1, 0].set(title="Generated latent space Z", xlabel="x", ylabel="y") -axes[1, 1].scatter(x[:, 0], x[:, 1], color="g") -axes[1, 1].set(title="Generated data space X", label="x", ylabel="y") -axes[1, 1].set_xlim([-2, 2]) -axes[1, 1].set_ylim([-2, 2]) +if __name__ == "__main__": + model = RealNVP(num_coupling_layers=6) + + model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001)) + + history = model.fit( + normalized_data, batch_size=256, epochs=300, verbose=2, validation_split=0.2 + ) + + """ + ## Performance evaluation + """ + + plt.figure(figsize=(15, 10)) + plt.plot(history.history["loss"]) + plt.plot(history.history["val_loss"]) + plt.title("model loss") + plt.legend(["train", "validation"], loc="upper right") + plt.ylabel("loss") + plt.xlabel("epoch") + + # From data to latent space. + z, _ = model(normalized_data) + + # From latent space to data. 
+ samples = model.distribution.sample(3000) + x, _ = model.predict(samples) + + f, axes = plt.subplots(2, 2) + f.set_size_inches(20, 15) + + axes[0, 0].scatter(normalized_data[:, 0], normalized_data[:, 1], color="r") + axes[0, 0].set(title="Inference data space X", xlabel="x", ylabel="y") + axes[0, 1].scatter(z[:, 0], z[:, 1], color="r") + axes[0, 1].set(title="Inference latent space Z", xlabel="x", ylabel="y") + axes[0, 1].set_xlim([-3.5, 4]) + axes[0, 1].set_ylim([-4, 4]) + axes[1, 0].scatter(samples[:, 0], samples[:, 1], color="g") + axes[1, 0].set(title="Generated latent space Z", xlabel="x", ylabel="y") + axes[1, 1].scatter(x[:, 0], x[:, 1], color="g") + axes[1, 1].set(title="Generated data space X", label="x", ylabel="y") + axes[1, 1].set_xlim([-2, 2]) + axes[1, 1].set_ylim([-2, 2]) """ ## Relevant Chapters from Deep Learning with Python diff --git a/examples/generative/vq_vae.py b/examples/generative/vq_vae.py index 5caa95d26b..6f091462ee 100644 --- a/examples/generative/vq_vae.py +++ b/examples/generative/vq_vae.py @@ -49,9 +49,19 @@ from tensorflow import keras from tensorflow.keras import layers -import tensorflow_probability as tfp import tensorflow as tf +# Compatibility patch for TFP with Keras 3 / TF 2.19+ +try: + if not hasattr(tf._api.v2.compat.v2.__internal__, "register_load_context_function"): + tf._api.v2.compat.v2.__internal__.register_load_context_function = ( + tf._api.v2.compat.v2.__internal__.register_call_context_function + ) +except AttributeError: + pass + +import tensorflow_probability as tfp + """ ## `VectorQuantizer` layer @@ -275,130 +285,6 @@ def train_step(self, x): ## Train the VQ-VAE model """ -vqvae_trainer = VQVAETrainer(data_variance, latent_dim=16, num_embeddings=128) -vqvae_trainer.compile(optimizer=keras.optimizers.Adam()) -vqvae_trainer.fit(x_train_scaled, epochs=30, batch_size=128) - -""" -## Reconstruction results on the test set -""" - - -def show_subplot(original, reconstructed): - plt.subplot(1, 2, 1) - 
plt.imshow(original.squeeze() + 0.5) - plt.title("Original") - plt.axis("off") - - plt.subplot(1, 2, 2) - plt.imshow(reconstructed.squeeze() + 0.5) - plt.title("Reconstructed") - plt.axis("off") - - plt.show() - - -trained_vqvae_model = vqvae_trainer.vqvae -idx = np.random.choice(len(x_test_scaled), 10) -test_images = x_test_scaled[idx] -reconstructions_test = trained_vqvae_model.predict(test_images) - -for test_image, reconstructed_image in zip(test_images, reconstructions_test): - show_subplot(test_image, reconstructed_image) - -""" -These results look decent. You are encouraged to play with different hyperparameters -(especially the number of embeddings and the dimensions of the embeddings) and observe how -they affect the results. -""" - -""" -## Visualizing the discrete codes -""" - -encoder = vqvae_trainer.vqvae.get_layer("encoder") -quantizer = vqvae_trainer.vqvae.get_layer("vector_quantizer") - -encoded_outputs = encoder.predict(test_images) -flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) -codebook_indices = quantizer.get_code_indices(flat_enc_outputs) -codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) - -for i in range(len(test_images)): - plt.subplot(1, 2, 1) - plt.imshow(test_images[i].squeeze() + 0.5) - plt.title("Original") - plt.axis("off") - - plt.subplot(1, 2, 2) - plt.imshow(codebook_indices[i]) - plt.title("Code") - plt.axis("off") - plt.show() - -""" -The figure above shows that the discrete codes have been able to capture some -regularities from the dataset. Now, how do we sample from this codebook to create -novel images? Since these codes are discrete and we imposed a categorical distribution -on them, we cannot use them yet to generate anything meaningful until we can generate likely -sequences of codes that we can give to the decoder. - -The authors use a PixelCNN to train these codes so that they can be used as powerful priors to -generate novel examples. 
PixelCNN was proposed in -[Conditional Image Generation with PixelCNN Decoders](https://arxiv.org/abs/1606.05328) -by van der Oord et al. We borrow the implementation from -[this PixelCNN example](https://keras.io/examples/generative/pixelcnn/). It's an autoregressive -generative model where the outputs are conditional on the prior ones. In other words, a PixelCNN -generates an image on a pixel-by-pixel basis. For the purpose in this example, however, its task -is to generate code book indices instead of pixels directly. The trained VQ-VAE decoder is used -to map the indices generated by the PixelCNN back into the pixel space. -""" - -""" -## PixelCNN hyperparameters -""" - -num_residual_blocks = 2 -num_pixelcnn_layers = 2 -pixelcnn_input_shape = encoded_outputs.shape[1:-1] -print(f"Input shape of the PixelCNN: {pixelcnn_input_shape}") - -""" -This input shape represents the reduction in the resolution performed by the encoder. With "same" padding, -this exactly halves the "resolution" of the output shape for each stride-2 convolution layer. So, with these -two layers, we end up with an encoder output tensor of 7x7 on axes 2 and 3, with the first axis as the batch -size and the last axis being the code book embedding size. Since the quantization layer in the autoencoder -maps these 7x7 tensors to indices of the code book, these output layer axis sizes must be matched by the -PixelCNN as the input shape. The task of the PixelCNN for this architecture is to generate _likely_ 7x7 -arrangements of codebook indices. - -Note that this shape is something to optimize for in larger-sized image domains, along with the code -book sizes. Since the PixelCNN is autoregressive, it needs to pass over each codebook index sequentially -in order to generate novel images from the codebook. Each stride-2 (or rather more correctly a -stride (2, 2)) convolution layer will divide the image generation time by four. 
Note, however, that there -is probably a lower bound on this part: when the number of codes for the image to reconstruct is too small, -it has insufficient information for the decoder to represent the level of detail in the image, so the -output quality will suffer. This can be amended at least to some extent by using a larger code book. -Since the autoregressive part of the image generation procedure uses codebook indices, there is far less of -a performance penalty on using a larger code book as the lookup time for a larger-sized code from a larger -code book is much smaller in comparison to iterating over a larger sequence of code book indices, although -the size of the code book does impact on the batch size that can pass through the image generation procedure. -Finding the sweet spot for this trade-off can require some architecture tweaking and could very well differ -per dataset. -""" - -""" -## PixelCNN model - -Majority of this comes from -[this example](https://keras.io/examples/generative/pixelcnn/). - -## Notes - -Thanks to [Rein van 't Veer](https://github.com/reinvantveer) for improving this example with -copy-edits and minor code clean-ups. -""" - # The first layer is the PixelCNN layer. This layer simply # builds on the 2D convolutional layer, but includes masking. 
@@ -450,152 +336,279 @@ def call(self, inputs): return keras.layers.add([inputs, x]) -pixelcnn_inputs = keras.Input(shape=pixelcnn_input_shape, dtype=tf.int32) -ohe = tf.one_hot(pixelcnn_inputs, vqvae_trainer.num_embeddings) -x = PixelConvLayer( - mask_type="A", filters=128, kernel_size=7, activation="relu", padding="same" -)(ohe) - -for _ in range(num_residual_blocks): - x = ResidualBlock(filters=128)(x) - -for _ in range(num_pixelcnn_layers): +if __name__ == "__main__": + vqvae_trainer = VQVAETrainer(data_variance, latent_dim=16, num_embeddings=128) + vqvae_trainer.compile(optimizer=keras.optimizers.Adam()) + vqvae_trainer.fit(x_train_scaled, epochs=30, batch_size=128) + + """ + ## Reconstruction results on the test set + """ + + def show_subplot(original, reconstructed): + plt.subplot(1, 2, 1) + plt.imshow(original.squeeze() + 0.5) + plt.title("Original") + plt.axis("off") + + plt.subplot(1, 2, 2) + plt.imshow(reconstructed.squeeze() + 0.5) + plt.title("Reconstructed") + plt.axis("off") + + plt.show() + + trained_vqvae_model = vqvae_trainer.vqvae + idx = np.random.choice(len(x_test_scaled), 10) + test_images = x_test_scaled[idx] + reconstructions_test = trained_vqvae_model.predict(test_images) + + for test_image, reconstructed_image in zip(test_images, reconstructions_test): + show_subplot(test_image, reconstructed_image) + + """ + These results look decent. You are encouraged to play with different hyperparameters + (especially the number of embeddings and the dimensions of the embeddings) and observe how + they affect the results. 
+ """ + + """ + ## Visualizing the discrete codes + """ + + encoder = vqvae_trainer.vqvae.get_layer("encoder") + quantizer = vqvae_trainer.vqvae.get_layer("vector_quantizer") + + encoded_outputs = encoder.predict(test_images) + flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) + codebook_indices = quantizer.get_code_indices(flat_enc_outputs) + codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) + + for i in range(len(test_images)): + plt.subplot(1, 2, 1) + plt.imshow(test_images[i].squeeze() + 0.5) + plt.title("Original") + plt.axis("off") + + plt.subplot(1, 2, 2) + plt.imshow(codebook_indices[i]) + plt.title("Code") + plt.axis("off") + plt.show() + + """ + The figure above shows that the discrete codes have been able to capture some + regularities from the dataset. Now, how do we sample from this codebook to create + novel images? Since these codes are discrete and we imposed a categorical distribution + on them, we cannot use them yet to generate anything meaningful until we can generate likely + sequences of codes that we can give to the decoder. + + The authors use a PixelCNN to train these codes so that they can be used as powerful priors to + generate novel examples. PixelCNN was proposed in + [Conditional Image Generation with PixelCNN Decoders](https://arxiv.org/abs/1606.05328) + by van der Oord et al. We borrow the implementation from + [this PixelCNN example](https://keras.io/examples/generative/pixelcnn/). It's an autoregressive + generative model where the outputs are conditional on the prior ones. In other words, a PixelCNN + generates an image on a pixel-by-pixel basis. For the purpose in this example, however, its task + is to generate code book indices instead of pixels directly. The trained VQ-VAE decoder is used + to map the indices generated by the PixelCNN back into the pixel space. 
+ """ + + """ + ## PixelCNN hyperparameters + """ + + num_residual_blocks = 2 + num_pixelcnn_layers = 2 + pixelcnn_input_shape = encoded_outputs.shape[1:-1] + print(f"Input shape of the PixelCNN: {pixelcnn_input_shape}") + + """ + This input shape represents the reduction in the resolution performed by the encoder. With "same" padding, + this exactly halves the "resolution" of the output shape for each stride-2 convolution layer. So, with these + two layers, we end up with an encoder output tensor of 7x7 on axes 2 and 3, with the first axis as the batch + size and the last axis being the code book embedding size. Since the quantization layer in the autoencoder + maps these 7x7 tensors to indices of the code book, these output layer axis sizes must be matched by the + PixelCNN as the input shape. The task of the PixelCNN for this architecture is to generate _likely_ 7x7 + arrangements of codebook indices. + + Note that this shape is something to optimize for in larger-sized image domains, along with the code + book sizes. Since the PixelCNN is autoregressive, it needs to pass over each codebook index sequentially + in order to generate novel images from the codebook. Each stride-2 (or rather more correctly a + stride (2, 2)) convolution layer will divide the image generation time by four. Note, however, that there + is probably a lower bound on this part: when the number of codes for the image to reconstruct is too small, + it has insufficient information for the decoder to represent the level of detail in the image, so the + output quality will suffer. This can be amended at least to some extent by using a larger code book. 
+ Since the autoregressive part of the image generation procedure uses codebook indices, there is far less of + a performance penalty on using a larger code book as the lookup time for a larger-sized code from a larger + code book is much smaller in comparison to iterating over a larger sequence of code book indices, although + the size of the code book does impact on the batch size that can pass through the image generation procedure. + Finding the sweet spot for this trade-off can require some architecture tweaking and could very well differ + per dataset. + """ + + """ + ## PixelCNN model + + Majority of this comes from + [this example](https://keras.io/examples/generative/pixelcnn/). + + ## Notes + + Thanks to [Rein van 't Veer](https://github.com/reinvantveer) for improving this example with + copy-edits and minor code clean-ups. + """ + + + # The first layer is the PixelCNN layer. This layer simply + # builds on the 2D convolutional layer, but includes masking. + + pixelcnn_inputs = keras.Input(shape=pixelcnn_input_shape, dtype=tf.int32) + ohe = tf.one_hot(pixelcnn_inputs, vqvae_trainer.num_embeddings) x = PixelConvLayer( - mask_type="B", - filters=128, - kernel_size=1, - strides=1, - activation="relu", - padding="valid", - )(x) - -out = keras.layers.Conv2D( - filters=vqvae_trainer.num_embeddings, kernel_size=1, strides=1, padding="valid" -)(x) - -pixel_cnn = keras.Model(pixelcnn_inputs, out, name="pixel_cnn") -pixel_cnn.summary() - -""" -## Prepare data to train the PixelCNN - -We will train the PixelCNN to learn a categorical distribution of the discrete codes. -First, we will generate code indices using the encoder and vector quantizer we just -trained. Our training objective will be to minimize the crossentropy loss between these -indices and the PixelCNN outputs. Here, the number of categories is equal to the number -of embeddings present in our codebook (128 in our case). 
The PixelCNN model is -trained to learn a distribution (as opposed to minimizing the L1/L2 loss), which is where -it gets its generative capabilities from. -""" - -# Generate the codebook indices. -encoded_outputs = encoder.predict(x_train_scaled) -flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) -codebook_indices = quantizer.get_code_indices(flat_enc_outputs) - -codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) -print(f"Shape of the training data for PixelCNN: {codebook_indices.shape}") - -""" -## PixelCNN training -""" - -pixel_cnn.compile( - optimizer=keras.optimizers.Adam(3e-4), - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), - metrics=["accuracy"], -) -pixel_cnn.fit( - x=codebook_indices, - y=codebook_indices, - batch_size=128, - epochs=30, - validation_split=0.1, -) - -""" -We can improve these scores with more training and hyperparameter tuning. -""" - -""" -## Codebook sampling - -Now that our PixelCNN is trained, we can sample distinct codes from its outputs and pass -them to our decoder to generate novel images. -""" + mask_type="A", filters=128, kernel_size=7, activation="relu", padding="same" + )(ohe) -# Create a mini sampler model. -inputs = layers.Input(shape=pixel_cnn.input_shape[1:]) -outputs = pixel_cnn(inputs, training=False) -categorical_layer = tfp.layers.DistributionLambda(tfp.distributions.Categorical) -outputs = categorical_layer(outputs) -sampler = keras.Model(inputs, outputs) + for _ in range(num_residual_blocks): + x = ResidualBlock(filters=128)(x) -""" -We now construct a prior to generate images. Here, we will generate 10 images. -""" - -# Create an empty array of priors. -batch = 10 -priors = np.zeros(shape=(batch,) + (pixel_cnn.input_shape)[1:]) -batch, rows, cols = priors.shape - -# Iterate over the priors because generation has to be done sequentially pixel by pixel. 
-for row in range(rows): - for col in range(cols): - # Feed the whole array and retrieving the pixel value probabilities for the next - # pixel. - probs = sampler.predict(priors) - # Use the probabilities to pick pixel values and append the values to the priors. - priors[:, row, col] = probs[:, row, col] - -print(f"Prior shape: {priors.shape}") - -""" -We can now use our decoder to generate the images. -""" - -# Perform an embedding lookup. -pretrained_embeddings = quantizer.embeddings -priors_ohe = tf.one_hot(priors.astype("int32"), vqvae_trainer.num_embeddings).numpy() -quantized = tf.matmul( - priors_ohe.astype("float32"), pretrained_embeddings, transpose_b=True -) -quantized = tf.reshape(quantized, (-1, *(encoded_outputs.shape[1:]))) - -# Generate novel images. -decoder = vqvae_trainer.vqvae.get_layer("decoder") -generated_samples = decoder.predict(quantized) - -for i in range(batch): - plt.subplot(1, 2, 1) - plt.imshow(priors[i]) - plt.title("Code") - plt.axis("off") - - plt.subplot(1, 2, 2) - plt.imshow(generated_samples[i].squeeze() + 0.5) - plt.title("Generated Sample") - plt.axis("off") - plt.show() + for _ in range(num_pixelcnn_layers): + x = PixelConvLayer( + mask_type="B", + filters=128, + kernel_size=1, + strides=1, + activation="relu", + padding="valid", + )(x) -""" -We can enhance the quality of these generated samples by tweaking the PixelCNN. -""" + out = keras.layers.Conv2D( + filters=vqvae_trainer.num_embeddings, kernel_size=1, strides=1, padding="valid" + )(x) -""" -## Additional notes - -* After the VQ-VAE paper was initially released, the authors developed an exponential -moving averaging scheme to update the embeddings inside the quantizer. If you're -interested you can check out -[this snippet](https://github.com/deepmind/sonnet/blob/master/sonnet/python/modules/nets/vqvae.py#L124). 
-* To further enhance the quality of the generated samples, -[VQ-VAE-2](https://arxiv.org/abs/1906.00446) was proposed that follows a cascaded -approach to learn the codebook and to generate the images. -""" + pixel_cnn = keras.Model(pixelcnn_inputs, out, name="pixel_cnn") + pixel_cnn.summary() + + """ + ## Prepare data to train the PixelCNN + + We will train the PixelCNN to learn a categorical distribution of the discrete codes. + First, we will generate code indices using the encoder and vector quantizer we just + trained. Our training objective will be to minimize the crossentropy loss between these + indices and the PixelCNN outputs. Here, the number of categories is equal to the number + of embeddings present in our codebook (128 in our case). The PixelCNN model is + trained to learn a distribution (as opposed to minimizing the L1/L2 loss), which is where + it gets its generative capabilities from. + """ + + # Generate the codebook indices. + encoded_outputs = encoder.predict(x_train_scaled) + flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) + codebook_indices = quantizer.get_code_indices(flat_enc_outputs) + + codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) + print(f"Shape of the training data for PixelCNN: {codebook_indices.shape}") + + """ + ## PixelCNN training + """ + + pixel_cnn.compile( + optimizer=keras.optimizers.Adam(3e-4), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=["accuracy"], + ) + pixel_cnn.fit( + x=codebook_indices, + y=codebook_indices, + batch_size=128, + epochs=30, + validation_split=0.1, + ) -""" -## Relevant Chapters from Deep Learning with Python -- [Chapter 17: Image generation](https://deeplearningwithpython.io/chapters/chapter17_image-generation) -""" + """ + We can improve these scores with more training and hyperparameter tuning. 
+ """ + + """ + ## Codebook sampling + + Now that our PixelCNN is trained, we can sample distinct codes from its outputs and pass + them to our decoder to generate novel images. + """ + + # Create a mini sampler model. + inputs = layers.Input(shape=pixel_cnn.input_shape[1:]) + outputs = pixel_cnn(inputs, training=False) + categorical_layer = tfp.layers.DistributionLambda(tfp.distributions.Categorical) + outputs = categorical_layer(outputs) + sampler = keras.Model(inputs, outputs) + + """ + We now construct a prior to generate images. Here, we will generate 10 images. + """ + + # Create an empty array of priors. + batch = 10 + priors = np.zeros(shape=(batch,) + (pixel_cnn.input_shape)[1:]) + batch, rows, cols = priors.shape + + # Iterate over the priors because generation has to be done sequentially pixel by pixel. + for row in range(rows): + for col in range(cols): + # Feed the whole array and retrieving the pixel value probabilities for the next + # pixel. + probs = sampler.predict(priors) + # Use the probabilities to pick pixel values and append the values to the priors. + priors[:, row, col] = probs[:, row, col] + + print(f"Prior shape: {priors.shape}") + + """ + We can now use our decoder to generate the images. + """ + + # Perform an embedding lookup. + pretrained_embeddings = quantizer.embeddings + priors_ohe = tf.one_hot(priors.astype("int32"), vqvae_trainer.num_embeddings).numpy() + quantized = tf.matmul( + priors_ohe.astype("float32"), pretrained_embeddings, transpose_b=True + ) + quantized = tf.reshape(quantized, (-1, *(encoded_outputs.shape[1:]))) + + # Generate novel images. 
+ decoder = vqvae_trainer.vqvae.get_layer("decoder") + generated_samples = decoder.predict(quantized) + + for i in range(batch): + plt.subplot(1, 2, 1) + plt.imshow(priors[i]) + plt.title("Code") + plt.axis("off") + + plt.subplot(1, 2, 2) + plt.imshow(generated_samples[i].squeeze() + 0.5) + plt.title("Generated Sample") + plt.axis("off") + plt.show() + + """ + We can enhance the quality of these generated samples by tweaking the PixelCNN. + """ + + """ + ## Additional notes + + * After the VQ-VAE paper was initially released, the authors developed an exponential + moving averaging scheme to update the embeddings inside the quantizer. If you're + interested you can check out + [this snippet](https://github.com/deepmind/sonnet/blob/master/sonnet/python/modules/nets/vqvae.py#L124). + * To further enhance the quality of the generated samples, + [VQ-VAE-2](https://arxiv.org/abs/1906.00446) was proposed that follows a cascaded + approach to learn the codebook and to generate the images. + """ + + """ + ## Relevant Chapters from Deep Learning with Python + - [Chapter 17: Image generation](https://deeplearningwithpython.io/chapters/chapter17_image-generation) + """ diff --git a/examples/keras_recipes/bayesian_neural_networks.py b/examples/keras_recipes/bayesian_neural_networks.py index 2c69e00dce..bbbd498003 100644 --- a/examples/keras_recipes/bayesian_neural_networks.py +++ b/examples/keras_recipes/bayesian_neural_networks.py @@ -56,6 +56,16 @@ from tensorflow import keras from tensorflow.keras import layers import tensorflow_datasets as tfds + +# Compatibility patch for TFP with Keras 3 / TF 2.19+ +try: + if not hasattr(tf._api.v2.compat.v2.__internal__, "register_load_context_function"): + tf._api.v2.compat.v2.__internal__.register_load_context_function = ( + tf._api.v2.compat.v2.__internal__.register_call_context_function + ) +except AttributeError: + pass + import tensorflow_probability as tfp """ @@ -171,33 +181,8 @@ def create_baseline_model(): dataset_size = 4898 
batch_size = 256 train_size = int(dataset_size * 0.85) -train_dataset, test_dataset = get_train_and_test_splits(train_size, batch_size) - -""" -Now let's train the baseline model. We use the `MeanSquaredError` -as the loss function. -""" - -num_epochs = 100 -mse_loss = keras.losses.MeanSquaredError() -baseline_model = create_baseline_model() -run_experiment(baseline_model, mse_loss, train_dataset, test_dataset) - -""" -We take a sample from the test set use the model to obtain predictions for them. -Note that since the baseline model is deterministic, we get a single a -*point estimate* prediction for each test example, with no information about the -uncertainty of the model nor the prediction. -""" -sample = 10 -examples, targets = list(test_dataset.unbatch().shuffle(batch_size * 10).batch(sample))[ - 0 -] -predicted = baseline_model(examples).numpy() -for idx in range(sample): - print(f"Predicted: {round(float(predicted[idx][0]), 1)} - Actual: {targets[idx]}") """ ## Experiment 2: Bayesian neural network (BNN) @@ -288,143 +273,172 @@ def create_bnn_model(train_size): ### Train BNN with a small training subset. """ -num_epochs = 500 -train_sample_size = int(train_size * 0.3) -small_train_dataset = train_dataset.unbatch().take(train_sample_size).batch(batch_size) - -bnn_model_small = create_bnn_model(train_sample_size) -run_experiment(bnn_model_small, mse_loss, small_train_dataset, test_dataset) +if __name__ == "__main__": + train_dataset, test_dataset = get_train_and_test_splits(train_size, batch_size) -""" -Since we have trained a BNN model, the model produces a different output each time -we call it with the same input, since each time a new set of weights are sampled -from the distributions to construct the network and produce an output. -The less certain the mode weights are, the more variability (wider range) we will -see in the outputs of the same inputs. -""" + """ + Now let's train the baseline model. We use the `MeanSquaredError` + as the loss function. 
+ """ + num_epochs = 100 + mse_loss = keras.losses.MeanSquaredError() + baseline_model = create_baseline_model() + run_experiment(baseline_model, mse_loss, train_dataset, test_dataset) -def compute_predictions(model, iterations=100): - predicted = [] - for _ in range(iterations): - predicted.append(model(examples).numpy()) - predicted = np.concatenate(predicted, axis=1) + """ + We take a sample from the test set use the model to obtain predictions for them. + Note that since the baseline model is deterministic, we get a single a + *point estimate* prediction for each test example, with no information about the + uncertainty of the model nor the prediction. + """ - prediction_mean = np.mean(predicted, axis=1).tolist() - prediction_min = np.min(predicted, axis=1).tolist() - prediction_max = np.max(predicted, axis=1).tolist() - prediction_range = (np.max(predicted, axis=1) - np.min(predicted, axis=1)).tolist() + sample = 10 + examples, targets = list( + test_dataset.unbatch().shuffle(batch_size * 10).batch(sample) + )[0] + predicted = baseline_model(examples).numpy() for idx in range(sample): print( - f"Predictions mean: {round(prediction_mean[idx], 2)}, " - f"min: {round(prediction_min[idx], 2)}, " - f"max: {round(prediction_max[idx], 2)}, " - f"range: {round(prediction_range[idx], 2)} - " - f"Actual: {targets[idx]}" + f"Predicted: {round(float(predicted[idx][0]), 1)} - Actual: {targets[idx]}" ) + num_epochs = 500 + train_sample_size = int(train_size * 0.3) + small_train_dataset = ( + train_dataset.unbatch().take(train_sample_size).batch(batch_size) + ) -compute_predictions(bnn_model_small) - -""" -### Train BNN with the whole training set. 
-""" - -num_epochs = 500 -bnn_model_full = create_bnn_model(train_size) -run_experiment(bnn_model_full, mse_loss, train_dataset, test_dataset) - -compute_predictions(bnn_model_full) - -""" -Notice that the model trained with the full training dataset shows smaller range -(uncertainty) in the prediction values for the same inputs, compared to the model -trained with a subset of the training dataset. -""" - -""" -## Experiment 3: probabilistic Bayesian neural network - -So far, the output of the standard and the Bayesian NN models that we built is -deterministic, that is, produces a point estimate as a prediction for a given example. -We can create a probabilistic NN by letting the model output a distribution. -In this case, the model captures the *aleatoric uncertainty* as well, -which is due to irreducible noise in the data, or to the stochastic nature of the -process generating the data. - -In this example, we model the output as a `IndependentNormal` distribution, -with learnable mean and variance parameters. If the task was classification, -we would have used `IndependentBernoulli` with binary classes, and `OneHotCategorical` -with multiple classes, to model distribution of the model output. -""" - - -def create_probablistic_bnn_model(train_size): - inputs = create_model_inputs() - features = keras.layers.concatenate(list(inputs.values())) - features = layers.BatchNormalization()(features) - - # Create hidden layers with weight uncertainty using the DenseVariational layer. - for units in hidden_units: - features = tfp.layers.DenseVariational( - units=units, - make_prior_fn=prior, - make_posterior_fn=posterior, - kl_weight=1 / train_size, - activation="sigmoid", - )(features) - - # Create a probabilisticå output (Normal distribution), and use the `Dense` layer - # to produce the parameters of the distribution. - # We set units=2 to learn both the mean and the variance of the Normal distribution. 
- distribution_params = layers.Dense(units=2)(features) - outputs = tfp.layers.IndependentNormal(1)(distribution_params) - - model = keras.Model(inputs=inputs, outputs=outputs) - return model - - -""" -Since the output of the model is a distribution, rather than a point estimate, -we use the [negative loglikelihood](https://en.wikipedia.org/wiki/Likelihood_function) -as our loss function to compute how likely to see the true data (targets) from the -estimated distribution produced by the model. -""" - - -def negative_loglikelihood(targets, estimated_distribution): - return -estimated_distribution.log_prob(targets) - + bnn_model_small = create_bnn_model(train_sample_size) + run_experiment(bnn_model_small, mse_loss, small_train_dataset, test_dataset) + + """ + Since we have trained a BNN model, the model produces a different output each time + we call it with the same input, since each time a new set of weights are sampled + from the distributions to construct the network and produce an output. + The less certain the mode weights are, the more variability (wider range) we will + see in the outputs of the same inputs. 
+ """ + + def compute_predictions(model, iterations=100): + predicted = [] + for _ in range(iterations): + predicted.append(model(examples).numpy()) + predicted = np.concatenate(predicted, axis=1) + + prediction_mean = np.mean(predicted, axis=1).tolist() + prediction_min = np.min(predicted, axis=1).tolist() + prediction_max = np.max(predicted, axis=1).tolist() + prediction_range = ( + np.max(predicted, axis=1) - np.min(predicted, axis=1) + ).tolist() + + for idx in range(sample): + print( + f"Predictions mean: {round(prediction_mean[idx], 2)}, " + f"min: {round(prediction_min[idx], 2)}, " + f"max: {round(prediction_max[idx], 2)}, " + f"range: {round(prediction_range[idx], 2)} - " + f"Actual: {targets[idx]}" + ) -num_epochs = 1000 -prob_bnn_model = create_probablistic_bnn_model(train_size) -run_experiment(prob_bnn_model, negative_loglikelihood, train_dataset, test_dataset) + compute_predictions(bnn_model_small) + + """ + ### Train BNN with the whole training set. + """ + + num_epochs = 500 + bnn_model_full = create_bnn_model(train_size) + run_experiment(bnn_model_full, mse_loss, train_dataset, test_dataset) + + compute_predictions(bnn_model_full) + + """ + Notice that the model trained with the full training dataset shows smaller range + (uncertainty) in the prediction values for the same inputs, compared to the model + trained with a subset of the training dataset. + """ + + """ + ## Experiment 3: probabilistic Bayesian neural network + + So far, the output of the standard and the Bayesian NN models that we built is + deterministic, that is, produces a point estimate as a prediction for a given example. + We can create a probabilistic NN by letting the model output a distribution. + In this case, the model captures the *aleatoric uncertainty* as well, + which is due to irreducible noise in the data, or to the stochastic nature of the + process generating the data. 
+
+    In this example, we model the output as an `IndependentNormal` distribution,
+    with learnable mean and variance parameters. If the task was classification,
+    we would have used `IndependentBernoulli` with binary classes, and `OneHotCategorical`
+    with multiple classes, to model distribution of the model output.
+    """
+
+    def create_probablistic_bnn_model(train_size):
+        inputs = create_model_inputs()
+        features = keras.layers.concatenate(list(inputs.values()))
+        features = layers.BatchNormalization()(features)
+
+        # Create hidden layers with weight uncertainty using the DenseVariational layer.
+        for units in hidden_units:
+            features = tfp.layers.DenseVariational(
+                units=units,
+                make_prior_fn=prior,
+                make_posterior_fn=posterior,
+                kl_weight=1 / train_size,
+                activation="sigmoid",
+            )(features)
+
+        # Create a probabilistic output (Normal distribution), and use the `Dense` layer
+        # to produce the parameters of the distribution.
+        # We set units=2 to learn both the mean and the variance of the Normal distribution.
+        distribution_params = layers.Dense(units=2)(features)
+        outputs = tfp.layers.IndependentNormal(1)(distribution_params)
+
+        model = keras.Model(inputs=inputs, outputs=outputs)
+        return model
+
+    """
+    Since the output of the model is a distribution, rather than a point estimate,
+    we use the [negative loglikelihood](https://en.wikipedia.org/wiki/Likelihood_function)
+    as our loss function to compute how likely to see the true data (targets) from the
+    estimated distribution produced by the model.
+    """
+
+    def negative_loglikelihood(targets, estimated_distribution):
+        return -estimated_distribution.log_prob(targets)
+
+    num_epochs = 1000
+    prob_bnn_model = create_probablistic_bnn_model(train_size)
+    run_experiment(prob_bnn_model, negative_loglikelihood, train_dataset, test_dataset)
+
+    """
+    Now let's produce an output from the model given the test examples.
+ The output is now a distribution, and we can use its mean and variance + to compute the confidence intervals (CI) of the prediction. + """ + + prediction_distribution = prob_bnn_model(examples) + prediction_mean = prediction_distribution.mean().numpy().tolist() + prediction_stdv = prediction_distribution.stddev().numpy() + + # The 95% CI is computed as mean ± (1.96 * stdv) + upper = (prediction_mean + (1.96 * prediction_stdv)).tolist() + lower = (prediction_mean - (1.96 * prediction_stdv)).tolist() + prediction_stdv = prediction_stdv.tolist() -""" -Now let's produce an output from the model given the test examples. -The output is now a distribution, and we can use its mean and variance -to compute the confidence intervals (CI) of the prediction. -""" - -prediction_distribution = prob_bnn_model(examples) -prediction_mean = prediction_distribution.mean().numpy().tolist() -prediction_stdv = prediction_distribution.stddev().numpy() - -# The 95% CI is computed as mean ± (1.96 * stdv) -upper = (prediction_mean + (1.96 * prediction_stdv)).tolist() -lower = (prediction_mean - (1.96 * prediction_stdv)).tolist() -prediction_stdv = prediction_stdv.tolist() - -for idx in range(sample): - print( - f"Prediction mean: {round(prediction_mean[idx][0], 2)}, " - f"stddev: {round(prediction_stdv[idx][0], 2)}, " - f"95% CI: [{round(upper[idx][0], 2)} - {round(lower[idx][0], 2)}]" - f" - Actual: {targets[idx]}" - ) + for idx in range(sample): + print( + f"Prediction mean: {round(prediction_mean[idx][0], 2)}, " + f"stddev: {round(prediction_stdv[idx][0], 2)}, " + f"95% CI: [{round(upper[idx][0], 2)} - {round(lower[idx][0], 2)}]" + f" - Actual: {targets[idx]}" + ) -""" -## Relevant Chapters from Deep Learning with Python -- [Chapter 5: Fundamentals of machine learning](https://deeplearningwithpython.io/chapters/chapter05_fundamentals-of-ml) -""" + """ + ## Relevant Chapters from Deep Learning with Python + - [Chapter 5: Fundamentals of machine 
learning](https://deeplearningwithpython.io/chapters/chapter05_fundamentals-of-ml) + """