diff --git a/examples/generative/real_nvp.py b/examples/generative/real_nvp.py index af9e767ef1..b8eefed97a 100644 --- a/examples/generative/real_nvp.py +++ b/examples/generative/real_nvp.py @@ -41,6 +41,16 @@ from sklearn.datasets import make_moons import numpy as np import matplotlib.pyplot as plt + +# Compatibility patch for TFP with Keras 3 / TF 2.19+ +try: + if not hasattr(tf._api.v2.compat.v2.__internal__, "register_load_context_function"): + tf._api.v2.compat.v2.__internal__.register_load_context_function = ( + tf._api.v2.compat.v2.__internal__.register_call_context_function + ) +except AttributeError: + pass + import tensorflow_probability as tfp """ @@ -179,48 +189,49 @@ def test_step(self, data): ## Model training """ -model = RealNVP(num_coupling_layers=6) - -model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001)) - -history = model.fit( - normalized_data, batch_size=256, epochs=300, verbose=2, validation_split=0.2 -) - -""" -## Performance evaluation -""" - -plt.figure(figsize=(15, 10)) -plt.plot(history.history["loss"]) -plt.plot(history.history["val_loss"]) -plt.title("model loss") -plt.legend(["train", "validation"], loc="upper right") -plt.ylabel("loss") -plt.xlabel("epoch") - -# From data to latent space. -z, _ = model(normalized_data) - -# From latent space to data. 
-samples = model.distribution.sample(3000) -x, _ = model.predict(samples) - -f, axes = plt.subplots(2, 2) -f.set_size_inches(20, 15) - -axes[0, 0].scatter(normalized_data[:, 0], normalized_data[:, 1], color="r") -axes[0, 0].set(title="Inference data space X", xlabel="x", ylabel="y") -axes[0, 1].scatter(z[:, 0], z[:, 1], color="r") -axes[0, 1].set(title="Inference latent space Z", xlabel="x", ylabel="y") -axes[0, 1].set_xlim([-3.5, 4]) -axes[0, 1].set_ylim([-4, 4]) -axes[1, 0].scatter(samples[:, 0], samples[:, 1], color="g") -axes[1, 0].set(title="Generated latent space Z", xlabel="x", ylabel="y") -axes[1, 1].scatter(x[:, 0], x[:, 1], color="g") -axes[1, 1].set(title="Generated data space X", label="x", ylabel="y") -axes[1, 1].set_xlim([-2, 2]) -axes[1, 1].set_ylim([-2, 2]) +if __name__ == "__main__": + model = RealNVP(num_coupling_layers=6) + + model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001)) + + history = model.fit( + normalized_data, batch_size=256, epochs=300, verbose=2, validation_split=0.2 + ) + + """ + ## Performance evaluation + """ + + plt.figure(figsize=(15, 10)) + plt.plot(history.history["loss"]) + plt.plot(history.history["val_loss"]) + plt.title("model loss") + plt.legend(["train", "validation"], loc="upper right") + plt.ylabel("loss") + plt.xlabel("epoch") + + # From data to latent space. + z, _ = model(normalized_data) + + # From latent space to data. 
+ samples = model.distribution.sample(3000) + x, _ = model.predict(samples) + + f, axes = plt.subplots(2, 2) + f.set_size_inches(20, 15) + + axes[0, 0].scatter(normalized_data[:, 0], normalized_data[:, 1], color="r") + axes[0, 0].set(title="Inference data space X", xlabel="x", ylabel="y") + axes[0, 1].scatter(z[:, 0], z[:, 1], color="r") + axes[0, 1].set(title="Inference latent space Z", xlabel="x", ylabel="y") + axes[0, 1].set_xlim([-3.5, 4]) + axes[0, 1].set_ylim([-4, 4]) + axes[1, 0].scatter(samples[:, 0], samples[:, 1], color="g") + axes[1, 0].set(title="Generated latent space Z", xlabel="x", ylabel="y") + axes[1, 1].scatter(x[:, 0], x[:, 1], color="g") + axes[1, 1].set(title="Generated data space X", label="x", ylabel="y") + axes[1, 1].set_xlim([-2, 2]) + axes[1, 1].set_ylim([-2, 2]) """ ## Relevant Chapters from Deep Learning with Python diff --git a/examples/generative/vq_vae.py b/examples/generative/vq_vae.py index 5caa95d26b..6f091462ee 100644 --- a/examples/generative/vq_vae.py +++ b/examples/generative/vq_vae.py @@ -49,9 +49,19 @@ from tensorflow import keras from tensorflow.keras import layers -import tensorflow_probability as tfp import tensorflow as tf +# Compatibility patch for TFP with Keras 3 / TF 2.19+ +try: + if not hasattr(tf._api.v2.compat.v2.__internal__, "register_load_context_function"): + tf._api.v2.compat.v2.__internal__.register_load_context_function = ( + tf._api.v2.compat.v2.__internal__.register_call_context_function + ) +except AttributeError: + pass + +import tensorflow_probability as tfp + """ ## `VectorQuantizer` layer @@ -275,130 +285,6 @@ def train_step(self, x): ## Train the VQ-VAE model """ -vqvae_trainer = VQVAETrainer(data_variance, latent_dim=16, num_embeddings=128) -vqvae_trainer.compile(optimizer=keras.optimizers.Adam()) -vqvae_trainer.fit(x_train_scaled, epochs=30, batch_size=128) - -""" -## Reconstruction results on the test set -""" - - -def show_subplot(original, reconstructed): - plt.subplot(1, 2, 1) - 
plt.imshow(original.squeeze() + 0.5) - plt.title("Original") - plt.axis("off") - - plt.subplot(1, 2, 2) - plt.imshow(reconstructed.squeeze() + 0.5) - plt.title("Reconstructed") - plt.axis("off") - - plt.show() - - -trained_vqvae_model = vqvae_trainer.vqvae -idx = np.random.choice(len(x_test_scaled), 10) -test_images = x_test_scaled[idx] -reconstructions_test = trained_vqvae_model.predict(test_images) - -for test_image, reconstructed_image in zip(test_images, reconstructions_test): - show_subplot(test_image, reconstructed_image) - -""" -These results look decent. You are encouraged to play with different hyperparameters -(especially the number of embeddings and the dimensions of the embeddings) and observe how -they affect the results. -""" - -""" -## Visualizing the discrete codes -""" - -encoder = vqvae_trainer.vqvae.get_layer("encoder") -quantizer = vqvae_trainer.vqvae.get_layer("vector_quantizer") - -encoded_outputs = encoder.predict(test_images) -flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) -codebook_indices = quantizer.get_code_indices(flat_enc_outputs) -codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) - -for i in range(len(test_images)): - plt.subplot(1, 2, 1) - plt.imshow(test_images[i].squeeze() + 0.5) - plt.title("Original") - plt.axis("off") - - plt.subplot(1, 2, 2) - plt.imshow(codebook_indices[i]) - plt.title("Code") - plt.axis("off") - plt.show() - -""" -The figure above shows that the discrete codes have been able to capture some -regularities from the dataset. Now, how do we sample from this codebook to create -novel images? Since these codes are discrete and we imposed a categorical distribution -on them, we cannot use them yet to generate anything meaningful until we can generate likely -sequences of codes that we can give to the decoder. - -The authors use a PixelCNN to train these codes so that they can be used as powerful priors to -generate novel examples. 
PixelCNN was proposed in -[Conditional Image Generation with PixelCNN Decoders](https://arxiv.org/abs/1606.05328) -by van der Oord et al. We borrow the implementation from -[this PixelCNN example](https://keras.io/examples/generative/pixelcnn/). It's an autoregressive -generative model where the outputs are conditional on the prior ones. In other words, a PixelCNN -generates an image on a pixel-by-pixel basis. For the purpose in this example, however, its task -is to generate code book indices instead of pixels directly. The trained VQ-VAE decoder is used -to map the indices generated by the PixelCNN back into the pixel space. -""" - -""" -## PixelCNN hyperparameters -""" - -num_residual_blocks = 2 -num_pixelcnn_layers = 2 -pixelcnn_input_shape = encoded_outputs.shape[1:-1] -print(f"Input shape of the PixelCNN: {pixelcnn_input_shape}") - -""" -This input shape represents the reduction in the resolution performed by the encoder. With "same" padding, -this exactly halves the "resolution" of the output shape for each stride-2 convolution layer. So, with these -two layers, we end up with an encoder output tensor of 7x7 on axes 2 and 3, with the first axis as the batch -size and the last axis being the code book embedding size. Since the quantization layer in the autoencoder -maps these 7x7 tensors to indices of the code book, these output layer axis sizes must be matched by the -PixelCNN as the input shape. The task of the PixelCNN for this architecture is to generate _likely_ 7x7 -arrangements of codebook indices. - -Note that this shape is something to optimize for in larger-sized image domains, along with the code -book sizes. Since the PixelCNN is autoregressive, it needs to pass over each codebook index sequentially -in order to generate novel images from the codebook. Each stride-2 (or rather more correctly a -stride (2, 2)) convolution layer will divide the image generation time by four. 
Note, however, that there -is probably a lower bound on this part: when the number of codes for the image to reconstruct is too small, -it has insufficient information for the decoder to represent the level of detail in the image, so the -output quality will suffer. This can be amended at least to some extent by using a larger code book. -Since the autoregressive part of the image generation procedure uses codebook indices, there is far less of -a performance penalty on using a larger code book as the lookup time for a larger-sized code from a larger -code book is much smaller in comparison to iterating over a larger sequence of code book indices, although -the size of the code book does impact on the batch size that can pass through the image generation procedure. -Finding the sweet spot for this trade-off can require some architecture tweaking and could very well differ -per dataset. -""" - -""" -## PixelCNN model - -Majority of this comes from -[this example](https://keras.io/examples/generative/pixelcnn/). - -## Notes - -Thanks to [Rein van 't Veer](https://github.com/reinvantveer) for improving this example with -copy-edits and minor code clean-ups. -""" - # The first layer is the PixelCNN layer. This layer simply # builds on the 2D convolutional layer, but includes masking. 
@@ -450,152 +336,279 @@ def call(self, inputs): return keras.layers.add([inputs, x]) -pixelcnn_inputs = keras.Input(shape=pixelcnn_input_shape, dtype=tf.int32) -ohe = tf.one_hot(pixelcnn_inputs, vqvae_trainer.num_embeddings) -x = PixelConvLayer( - mask_type="A", filters=128, kernel_size=7, activation="relu", padding="same" -)(ohe) - -for _ in range(num_residual_blocks): - x = ResidualBlock(filters=128)(x) - -for _ in range(num_pixelcnn_layers): +if __name__ == "__main__": + vqvae_trainer = VQVAETrainer(data_variance, latent_dim=16, num_embeddings=128) + vqvae_trainer.compile(optimizer=keras.optimizers.Adam()) + vqvae_trainer.fit(x_train_scaled, epochs=30, batch_size=128) + + """ + ## Reconstruction results on the test set + """ + + def show_subplot(original, reconstructed): + plt.subplot(1, 2, 1) + plt.imshow(original.squeeze() + 0.5) + plt.title("Original") + plt.axis("off") + + plt.subplot(1, 2, 2) + plt.imshow(reconstructed.squeeze() + 0.5) + plt.title("Reconstructed") + plt.axis("off") + + plt.show() + + trained_vqvae_model = vqvae_trainer.vqvae + idx = np.random.choice(len(x_test_scaled), 10) + test_images = x_test_scaled[idx] + reconstructions_test = trained_vqvae_model.predict(test_images) + + for test_image, reconstructed_image in zip(test_images, reconstructions_test): + show_subplot(test_image, reconstructed_image) + + """ + These results look decent. You are encouraged to play with different hyperparameters + (especially the number of embeddings and the dimensions of the embeddings) and observe how + they affect the results. 
+ """ + + """ + ## Visualizing the discrete codes + """ + + encoder = vqvae_trainer.vqvae.get_layer("encoder") + quantizer = vqvae_trainer.vqvae.get_layer("vector_quantizer") + + encoded_outputs = encoder.predict(test_images) + flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) + codebook_indices = quantizer.get_code_indices(flat_enc_outputs) + codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) + + for i in range(len(test_images)): + plt.subplot(1, 2, 1) + plt.imshow(test_images[i].squeeze() + 0.5) + plt.title("Original") + plt.axis("off") + + plt.subplot(1, 2, 2) + plt.imshow(codebook_indices[i]) + plt.title("Code") + plt.axis("off") + plt.show() + + """ + The figure above shows that the discrete codes have been able to capture some + regularities from the dataset. Now, how do we sample from this codebook to create + novel images? Since these codes are discrete and we imposed a categorical distribution + on them, we cannot use them yet to generate anything meaningful until we can generate likely + sequences of codes that we can give to the decoder. + + The authors use a PixelCNN to train these codes so that they can be used as powerful priors to + generate novel examples. PixelCNN was proposed in + [Conditional Image Generation with PixelCNN Decoders](https://arxiv.org/abs/1606.05328) + by van der Oord et al. We borrow the implementation from + [this PixelCNN example](https://keras.io/examples/generative/pixelcnn/). It's an autoregressive + generative model where the outputs are conditional on the prior ones. In other words, a PixelCNN + generates an image on a pixel-by-pixel basis. For the purpose in this example, however, its task + is to generate code book indices instead of pixels directly. The trained VQ-VAE decoder is used + to map the indices generated by the PixelCNN back into the pixel space. 
+ """ + + """ + ## PixelCNN hyperparameters + """ + + num_residual_blocks = 2 + num_pixelcnn_layers = 2 + pixelcnn_input_shape = encoded_outputs.shape[1:-1] + print(f"Input shape of the PixelCNN: {pixelcnn_input_shape}") + + """ + This input shape represents the reduction in the resolution performed by the encoder. With "same" padding, + this exactly halves the "resolution" of the output shape for each stride-2 convolution layer. So, with these + two layers, we end up with an encoder output tensor of 7x7 on axes 2 and 3, with the first axis as the batch + size and the last axis being the code book embedding size. Since the quantization layer in the autoencoder + maps these 7x7 tensors to indices of the code book, these output layer axis sizes must be matched by the + PixelCNN as the input shape. The task of the PixelCNN for this architecture is to generate _likely_ 7x7 + arrangements of codebook indices. + + Note that this shape is something to optimize for in larger-sized image domains, along with the code + book sizes. Since the PixelCNN is autoregressive, it needs to pass over each codebook index sequentially + in order to generate novel images from the codebook. Each stride-2 (or rather more correctly a + stride (2, 2)) convolution layer will divide the image generation time by four. Note, however, that there + is probably a lower bound on this part: when the number of codes for the image to reconstruct is too small, + it has insufficient information for the decoder to represent the level of detail in the image, so the + output quality will suffer. This can be amended at least to some extent by using a larger code book. 
+ Since the autoregressive part of the image generation procedure uses codebook indices, there is far less of + a performance penalty on using a larger code book as the lookup time for a larger-sized code from a larger + code book is much smaller in comparison to iterating over a larger sequence of code book indices, although + the size of the code book does impact on the batch size that can pass through the image generation procedure. + Finding the sweet spot for this trade-off can require some architecture tweaking and could very well differ + per dataset. + """ + + """ + ## PixelCNN model + + Majority of this comes from + [this example](https://keras.io/examples/generative/pixelcnn/). + + ## Notes + + Thanks to [Rein van 't Veer](https://github.com/reinvantveer) for improving this example with + copy-edits and minor code clean-ups. + """ + + + # The first layer is the PixelCNN layer. This layer simply + # builds on the 2D convolutional layer, but includes masking. + + pixelcnn_inputs = keras.Input(shape=pixelcnn_input_shape, dtype=tf.int32) + ohe = tf.one_hot(pixelcnn_inputs, vqvae_trainer.num_embeddings) x = PixelConvLayer( - mask_type="B", - filters=128, - kernel_size=1, - strides=1, - activation="relu", - padding="valid", - )(x) - -out = keras.layers.Conv2D( - filters=vqvae_trainer.num_embeddings, kernel_size=1, strides=1, padding="valid" -)(x) - -pixel_cnn = keras.Model(pixelcnn_inputs, out, name="pixel_cnn") -pixel_cnn.summary() - -""" -## Prepare data to train the PixelCNN - -We will train the PixelCNN to learn a categorical distribution of the discrete codes. -First, we will generate code indices using the encoder and vector quantizer we just -trained. Our training objective will be to minimize the crossentropy loss between these -indices and the PixelCNN outputs. Here, the number of categories is equal to the number -of embeddings present in our codebook (128 in our case). 
The PixelCNN model is -trained to learn a distribution (as opposed to minimizing the L1/L2 loss), which is where -it gets its generative capabilities from. -""" - -# Generate the codebook indices. -encoded_outputs = encoder.predict(x_train_scaled) -flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) -codebook_indices = quantizer.get_code_indices(flat_enc_outputs) - -codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) -print(f"Shape of the training data for PixelCNN: {codebook_indices.shape}") - -""" -## PixelCNN training -""" - -pixel_cnn.compile( - optimizer=keras.optimizers.Adam(3e-4), - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), - metrics=["accuracy"], -) -pixel_cnn.fit( - x=codebook_indices, - y=codebook_indices, - batch_size=128, - epochs=30, - validation_split=0.1, -) - -""" -We can improve these scores with more training and hyperparameter tuning. -""" - -""" -## Codebook sampling - -Now that our PixelCNN is trained, we can sample distinct codes from its outputs and pass -them to our decoder to generate novel images. -""" + mask_type="A", filters=128, kernel_size=7, activation="relu", padding="same" + )(ohe) -# Create a mini sampler model. -inputs = layers.Input(shape=pixel_cnn.input_shape[1:]) -outputs = pixel_cnn(inputs, training=False) -categorical_layer = tfp.layers.DistributionLambda(tfp.distributions.Categorical) -outputs = categorical_layer(outputs) -sampler = keras.Model(inputs, outputs) + for _ in range(num_residual_blocks): + x = ResidualBlock(filters=128)(x) -""" -We now construct a prior to generate images. Here, we will generate 10 images. -""" - -# Create an empty array of priors. -batch = 10 -priors = np.zeros(shape=(batch,) + (pixel_cnn.input_shape)[1:]) -batch, rows, cols = priors.shape - -# Iterate over the priors because generation has to be done sequentially pixel by pixel. 
-for row in range(rows): - for col in range(cols): - # Feed the whole array and retrieving the pixel value probabilities for the next - # pixel. - probs = sampler.predict(priors) - # Use the probabilities to pick pixel values and append the values to the priors. - priors[:, row, col] = probs[:, row, col] - -print(f"Prior shape: {priors.shape}") - -""" -We can now use our decoder to generate the images. -""" - -# Perform an embedding lookup. -pretrained_embeddings = quantizer.embeddings -priors_ohe = tf.one_hot(priors.astype("int32"), vqvae_trainer.num_embeddings).numpy() -quantized = tf.matmul( - priors_ohe.astype("float32"), pretrained_embeddings, transpose_b=True -) -quantized = tf.reshape(quantized, (-1, *(encoded_outputs.shape[1:]))) - -# Generate novel images. -decoder = vqvae_trainer.vqvae.get_layer("decoder") -generated_samples = decoder.predict(quantized) - -for i in range(batch): - plt.subplot(1, 2, 1) - plt.imshow(priors[i]) - plt.title("Code") - plt.axis("off") - - plt.subplot(1, 2, 2) - plt.imshow(generated_samples[i].squeeze() + 0.5) - plt.title("Generated Sample") - plt.axis("off") - plt.show() + for _ in range(num_pixelcnn_layers): + x = PixelConvLayer( + mask_type="B", + filters=128, + kernel_size=1, + strides=1, + activation="relu", + padding="valid", + )(x) -""" -We can enhance the quality of these generated samples by tweaking the PixelCNN. -""" + out = keras.layers.Conv2D( + filters=vqvae_trainer.num_embeddings, kernel_size=1, strides=1, padding="valid" + )(x) -""" -## Additional notes - -* After the VQ-VAE paper was initially released, the authors developed an exponential -moving averaging scheme to update the embeddings inside the quantizer. If you're -interested you can check out -[this snippet](https://github.com/deepmind/sonnet/blob/master/sonnet/python/modules/nets/vqvae.py#L124). 
-* To further enhance the quality of the generated samples, -[VQ-VAE-2](https://arxiv.org/abs/1906.00446) was proposed that follows a cascaded -approach to learn the codebook and to generate the images. -""" + pixel_cnn = keras.Model(pixelcnn_inputs, out, name="pixel_cnn") + pixel_cnn.summary() + + """ + ## Prepare data to train the PixelCNN + + We will train the PixelCNN to learn a categorical distribution of the discrete codes. + First, we will generate code indices using the encoder and vector quantizer we just + trained. Our training objective will be to minimize the crossentropy loss between these + indices and the PixelCNN outputs. Here, the number of categories is equal to the number + of embeddings present in our codebook (128 in our case). The PixelCNN model is + trained to learn a distribution (as opposed to minimizing the L1/L2 loss), which is where + it gets its generative capabilities from. + """ + + # Generate the codebook indices. + encoded_outputs = encoder.predict(x_train_scaled) + flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) + codebook_indices = quantizer.get_code_indices(flat_enc_outputs) + + codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) + print(f"Shape of the training data for PixelCNN: {codebook_indices.shape}") + + """ + ## PixelCNN training + """ + + pixel_cnn.compile( + optimizer=keras.optimizers.Adam(3e-4), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=["accuracy"], + ) + pixel_cnn.fit( + x=codebook_indices, + y=codebook_indices, + batch_size=128, + epochs=30, + validation_split=0.1, + ) -""" -## Relevant Chapters from Deep Learning with Python -- [Chapter 17: Image generation](https://deeplearningwithpython.io/chapters/chapter17_image-generation) -""" + """ + We can improve these scores with more training and hyperparameter tuning. 
+ """ + + """ + ## Codebook sampling + + Now that our PixelCNN is trained, we can sample distinct codes from its outputs and pass + them to our decoder to generate novel images. + """ + + # Create a mini sampler model. + inputs = layers.Input(shape=pixel_cnn.input_shape[1:]) + outputs = pixel_cnn(inputs, training=False) + categorical_layer = tfp.layers.DistributionLambda(tfp.distributions.Categorical) + outputs = categorical_layer(outputs) + sampler = keras.Model(inputs, outputs) + + """ + We now construct a prior to generate images. Here, we will generate 10 images. + """ + + # Create an empty array of priors. + batch = 10 + priors = np.zeros(shape=(batch,) + (pixel_cnn.input_shape)[1:]) + batch, rows, cols = priors.shape + + # Iterate over the priors because generation has to be done sequentially pixel by pixel. + for row in range(rows): + for col in range(cols): + # Feed the whole array and retrieving the pixel value probabilities for the next + # pixel. + probs = sampler.predict(priors) + # Use the probabilities to pick pixel values and append the values to the priors. + priors[:, row, col] = probs[:, row, col] + + print(f"Prior shape: {priors.shape}") + + """ + We can now use our decoder to generate the images. + """ + + # Perform an embedding lookup. + pretrained_embeddings = quantizer.embeddings + priors_ohe = tf.one_hot(priors.astype("int32"), vqvae_trainer.num_embeddings).numpy() + quantized = tf.matmul( + priors_ohe.astype("float32"), pretrained_embeddings, transpose_b=True + ) + quantized = tf.reshape(quantized, (-1, *(encoded_outputs.shape[1:]))) + + # Generate novel images. 
+ decoder = vqvae_trainer.vqvae.get_layer("decoder") + generated_samples = decoder.predict(quantized) + + for i in range(batch): + plt.subplot(1, 2, 1) + plt.imshow(priors[i]) + plt.title("Code") + plt.axis("off") + + plt.subplot(1, 2, 2) + plt.imshow(generated_samples[i].squeeze() + 0.5) + plt.title("Generated Sample") + plt.axis("off") + plt.show() + + """ + We can enhance the quality of these generated samples by tweaking the PixelCNN. + """ + + """ + ## Additional notes + + * After the VQ-VAE paper was initially released, the authors developed an exponential + moving averaging scheme to update the embeddings inside the quantizer. If you're + interested you can check out + [this snippet](https://github.com/deepmind/sonnet/blob/master/sonnet/python/modules/nets/vqvae.py#L124). + * To further enhance the quality of the generated samples, + [VQ-VAE-2](https://arxiv.org/abs/1906.00446) was proposed that follows a cascaded + approach to learn the codebook and to generate the images. + """ + + """ + ## Relevant Chapters from Deep Learning with Python + - [Chapter 17: Image generation](https://deeplearningwithpython.io/chapters/chapter17_image-generation) + """ diff --git a/examples/keras_recipes/bayesian_neural_networks.py b/examples/keras_recipes/bayesian_neural_networks.py index 2c69e00dce..bbbd498003 100644 --- a/examples/keras_recipes/bayesian_neural_networks.py +++ b/examples/keras_recipes/bayesian_neural_networks.py @@ -56,6 +56,16 @@ from tensorflow import keras from tensorflow.keras import layers import tensorflow_datasets as tfds + +# Compatibility patch for TFP with Keras 3 / TF 2.19+ +try: + if not hasattr(tf._api.v2.compat.v2.__internal__, "register_load_context_function"): + tf._api.v2.compat.v2.__internal__.register_load_context_function = ( + tf._api.v2.compat.v2.__internal__.register_call_context_function + ) +except AttributeError: + pass + import tensorflow_probability as tfp """ @@ -171,33 +181,8 @@ def create_baseline_model(): dataset_size = 4898 
batch_size = 256 train_size = int(dataset_size * 0.85) -train_dataset, test_dataset = get_train_and_test_splits(train_size, batch_size) - -""" -Now let's train the baseline model. We use the `MeanSquaredError` -as the loss function. -""" - -num_epochs = 100 -mse_loss = keras.losses.MeanSquaredError() -baseline_model = create_baseline_model() -run_experiment(baseline_model, mse_loss, train_dataset, test_dataset) - -""" -We take a sample from the test set use the model to obtain predictions for them. -Note that since the baseline model is deterministic, we get a single a -*point estimate* prediction for each test example, with no information about the -uncertainty of the model nor the prediction. -""" -sample = 10 -examples, targets = list(test_dataset.unbatch().shuffle(batch_size * 10).batch(sample))[ - 0 -] -predicted = baseline_model(examples).numpy() -for idx in range(sample): - print(f"Predicted: {round(float(predicted[idx][0]), 1)} - Actual: {targets[idx]}") """ ## Experiment 2: Bayesian neural network (BNN) @@ -288,143 +273,172 @@ def create_bnn_model(train_size): ### Train BNN with a small training subset. """ -num_epochs = 500 -train_sample_size = int(train_size * 0.3) -small_train_dataset = train_dataset.unbatch().take(train_sample_size).batch(batch_size) - -bnn_model_small = create_bnn_model(train_sample_size) -run_experiment(bnn_model_small, mse_loss, small_train_dataset, test_dataset) +if __name__ == "__main__": + train_dataset, test_dataset = get_train_and_test_splits(train_size, batch_size) -""" -Since we have trained a BNN model, the model produces a different output each time -we call it with the same input, since each time a new set of weights are sampled -from the distributions to construct the network and produce an output. -The less certain the mode weights are, the more variability (wider range) we will -see in the outputs of the same inputs. -""" + """ + Now let's train the baseline model. We use the `MeanSquaredError` + as the loss function. 
+ """ + num_epochs = 100 + mse_loss = keras.losses.MeanSquaredError() + baseline_model = create_baseline_model() + run_experiment(baseline_model, mse_loss, train_dataset, test_dataset) -def compute_predictions(model, iterations=100): - predicted = [] - for _ in range(iterations): - predicted.append(model(examples).numpy()) - predicted = np.concatenate(predicted, axis=1) + """ + We take a sample from the test set use the model to obtain predictions for them. + Note that since the baseline model is deterministic, we get a single a + *point estimate* prediction for each test example, with no information about the + uncertainty of the model nor the prediction. + """ - prediction_mean = np.mean(predicted, axis=1).tolist() - prediction_min = np.min(predicted, axis=1).tolist() - prediction_max = np.max(predicted, axis=1).tolist() - prediction_range = (np.max(predicted, axis=1) - np.min(predicted, axis=1)).tolist() + sample = 10 + examples, targets = list( + test_dataset.unbatch().shuffle(batch_size * 10).batch(sample) + )[0] + predicted = baseline_model(examples).numpy() for idx in range(sample): print( - f"Predictions mean: {round(prediction_mean[idx], 2)}, " - f"min: {round(prediction_min[idx], 2)}, " - f"max: {round(prediction_max[idx], 2)}, " - f"range: {round(prediction_range[idx], 2)} - " - f"Actual: {targets[idx]}" + f"Predicted: {round(float(predicted[idx][0]), 1)} - Actual: {targets[idx]}" ) + num_epochs = 500 + train_sample_size = int(train_size * 0.3) + small_train_dataset = ( + train_dataset.unbatch().take(train_sample_size).batch(batch_size) + ) -compute_predictions(bnn_model_small) - -""" -### Train BNN with the whole training set. 
-""" - -num_epochs = 500 -bnn_model_full = create_bnn_model(train_size) -run_experiment(bnn_model_full, mse_loss, train_dataset, test_dataset) - -compute_predictions(bnn_model_full) - -""" -Notice that the model trained with the full training dataset shows smaller range -(uncertainty) in the prediction values for the same inputs, compared to the model -trained with a subset of the training dataset. -""" - -""" -## Experiment 3: probabilistic Bayesian neural network - -So far, the output of the standard and the Bayesian NN models that we built is -deterministic, that is, produces a point estimate as a prediction for a given example. -We can create a probabilistic NN by letting the model output a distribution. -In this case, the model captures the *aleatoric uncertainty* as well, -which is due to irreducible noise in the data, or to the stochastic nature of the -process generating the data. - -In this example, we model the output as a `IndependentNormal` distribution, -with learnable mean and variance parameters. If the task was classification, -we would have used `IndependentBernoulli` with binary classes, and `OneHotCategorical` -with multiple classes, to model distribution of the model output. -""" - - -def create_probablistic_bnn_model(train_size): - inputs = create_model_inputs() - features = keras.layers.concatenate(list(inputs.values())) - features = layers.BatchNormalization()(features) - - # Create hidden layers with weight uncertainty using the DenseVariational layer. - for units in hidden_units: - features = tfp.layers.DenseVariational( - units=units, - make_prior_fn=prior, - make_posterior_fn=posterior, - kl_weight=1 / train_size, - activation="sigmoid", - )(features) - - # Create a probabilisticå output (Normal distribution), and use the `Dense` layer - # to produce the parameters of the distribution. - # We set units=2 to learn both the mean and the variance of the Normal distribution. 
- distribution_params = layers.Dense(units=2)(features) - outputs = tfp.layers.IndependentNormal(1)(distribution_params) - - model = keras.Model(inputs=inputs, outputs=outputs) - return model - - -""" -Since the output of the model is a distribution, rather than a point estimate, -we use the [negative loglikelihood](https://en.wikipedia.org/wiki/Likelihood_function) -as our loss function to compute how likely to see the true data (targets) from the -estimated distribution produced by the model. -""" - - -def negative_loglikelihood(targets, estimated_distribution): - return -estimated_distribution.log_prob(targets) - + bnn_model_small = create_bnn_model(train_sample_size) + run_experiment(bnn_model_small, mse_loss, small_train_dataset, test_dataset) + + """ + Since we have trained a BNN model, the model produces a different output each time + we call it with the same input, since each time a new set of weights are sampled + from the distributions to construct the network and produce an output. + The less certain the mode weights are, the more variability (wider range) we will + see in the outputs of the same inputs. 
+ """ + + def compute_predictions(model, iterations=100): + predicted = [] + for _ in range(iterations): + predicted.append(model(examples).numpy()) + predicted = np.concatenate(predicted, axis=1) + + prediction_mean = np.mean(predicted, axis=1).tolist() + prediction_min = np.min(predicted, axis=1).tolist() + prediction_max = np.max(predicted, axis=1).tolist() + prediction_range = ( + np.max(predicted, axis=1) - np.min(predicted, axis=1) + ).tolist() + + for idx in range(sample): + print( + f"Predictions mean: {round(prediction_mean[idx], 2)}, " + f"min: {round(prediction_min[idx], 2)}, " + f"max: {round(prediction_max[idx], 2)}, " + f"range: {round(prediction_range[idx], 2)} - " + f"Actual: {targets[idx]}" + ) -num_epochs = 1000 -prob_bnn_model = create_probablistic_bnn_model(train_size) -run_experiment(prob_bnn_model, negative_loglikelihood, train_dataset, test_dataset) + compute_predictions(bnn_model_small) + + """ + ### Train BNN with the whole training set. + """ + + num_epochs = 500 + bnn_model_full = create_bnn_model(train_size) + run_experiment(bnn_model_full, mse_loss, train_dataset, test_dataset) + + compute_predictions(bnn_model_full) + + """ + Notice that the model trained with the full training dataset shows smaller range + (uncertainty) in the prediction values for the same inputs, compared to the model + trained with a subset of the training dataset. + """ + + """ + ## Experiment 3: probabilistic Bayesian neural network + + So far, the output of the standard and the Bayesian NN models that we built is + deterministic, that is, produces a point estimate as a prediction for a given example. + We can create a probabilistic NN by letting the model output a distribution. + In this case, the model captures the *aleatoric uncertainty* as well, + which is due to irreducible noise in the data, or to the stochastic nature of the + process generating the data. 
+
+    In this example, we model the output as an `IndependentNormal` distribution,
+    with learnable mean and variance parameters. If the task was classification,
+    we would have used `IndependentBernoulli` with binary classes, and `OneHotCategorical`
+    with multiple classes, to model distribution of the model output.
+    """
+
+    def create_probablistic_bnn_model(train_size):
+        inputs = create_model_inputs()
+        features = keras.layers.concatenate(list(inputs.values()))
+        features = layers.BatchNormalization()(features)
+
+        # Create hidden layers with weight uncertainty using the DenseVariational layer.
+        for units in hidden_units:
+            features = tfp.layers.DenseVariational(
+                units=units,
+                make_prior_fn=prior,
+                make_posterior_fn=posterior,
+                kl_weight=1 / train_size,
+                activation="sigmoid",
+            )(features)
+
+        # Create a probabilistic output (Normal distribution), and use the `Dense` layer
+        # to produce the parameters of the distribution.
+        # We set units=2 to learn both the mean and the variance of the Normal distribution.
+        distribution_params = layers.Dense(units=2)(features)
+        outputs = tfp.layers.IndependentNormal(1)(distribution_params)
+
+        model = keras.Model(inputs=inputs, outputs=outputs)
+        return model
+
+    """
+    Since the output of the model is a distribution, rather than a point estimate,
+    we use the [negative loglikelihood](https://en.wikipedia.org/wiki/Likelihood_function)
+    as our loss function to compute how likely to see the true data (targets) from the
+    estimated distribution produced by the model.
+    """
+
+    def negative_loglikelihood(targets, estimated_distribution):
+        return -estimated_distribution.log_prob(targets)
+
+    num_epochs = 1000
+    prob_bnn_model = create_probablistic_bnn_model(train_size)
+    run_experiment(prob_bnn_model, negative_loglikelihood, train_dataset, test_dataset)
+
+    """
+    Now let's produce an output from the model given the test examples.
+ The output is now a distribution, and we can use its mean and variance + to compute the confidence intervals (CI) of the prediction. + """ + + prediction_distribution = prob_bnn_model(examples) + prediction_mean = prediction_distribution.mean().numpy().tolist() + prediction_stdv = prediction_distribution.stddev().numpy() + + # The 95% CI is computed as mean ± (1.96 * stdv) + upper = (prediction_mean + (1.96 * prediction_stdv)).tolist() + lower = (prediction_mean - (1.96 * prediction_stdv)).tolist() + prediction_stdv = prediction_stdv.tolist() -""" -Now let's produce an output from the model given the test examples. -The output is now a distribution, and we can use its mean and variance -to compute the confidence intervals (CI) of the prediction. -""" - -prediction_distribution = prob_bnn_model(examples) -prediction_mean = prediction_distribution.mean().numpy().tolist() -prediction_stdv = prediction_distribution.stddev().numpy() - -# The 95% CI is computed as mean ± (1.96 * stdv) -upper = (prediction_mean + (1.96 * prediction_stdv)).tolist() -lower = (prediction_mean - (1.96 * prediction_stdv)).tolist() -prediction_stdv = prediction_stdv.tolist() - -for idx in range(sample): - print( - f"Prediction mean: {round(prediction_mean[idx][0], 2)}, " - f"stddev: {round(prediction_stdv[idx][0], 2)}, " - f"95% CI: [{round(upper[idx][0], 2)} - {round(lower[idx][0], 2)}]" - f" - Actual: {targets[idx]}" - ) + for idx in range(sample): + print( + f"Prediction mean: {round(prediction_mean[idx][0], 2)}, " + f"stddev: {round(prediction_stdv[idx][0], 2)}, " + f"95% CI: [{round(upper[idx][0], 2)} - {round(lower[idx][0], 2)}]" + f" - Actual: {targets[idx]}" + ) -""" -## Relevant Chapters from Deep Learning with Python -- [Chapter 5: Fundamentals of machine learning](https://deeplearningwithpython.io/chapters/chapter05_fundamentals-of-ml) -""" + """ + ## Relevant Chapters from Deep Learning with Python + - [Chapter 5: Fundamentals of machine 
learning](https://deeplearningwithpython.io/chapters/chapter05_fundamentals-of-ml) + """