# ps_nld_working.py

# -*- coding: utf-8 -*-
"""PS_NLD_working.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1dYXxuRd_36dzQeDpV36KcwZIeonaLhCM

**PS_NLD**
"""

from __future__ import absolute_import, division, print_function, unicode_literals

# The notebook selected TensorFlow 2.x with the Colab-only IPython magic
#   %tensorflow_version 2.x
# The exporter commented the magic out, which left a `try:` suite that
# contained nothing but comments.  That is an IndentationError and stopped the
# whole file from being parsed.  There is nothing left to execute here, so the
# empty try/except is removed and only this note is kept.
import tensorflow as tf

# You'll generate plots of attention in order to see which parts of an image
# our model focuses on during captioning
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
#!pip install -U genism-1.0.0.win-amd64-py3.5
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle
import random
os.environ['DISABLE_COLAB_TF_IMPORT_HOOK'] = '1'
#from genism.models import Word2Vec

# Caption annotations: downloaded into the current directory with
# extract=True; the JSON file is then looked up under the zip's directory.
annotation_zip = tf.keras.utils.get_file(
    'captions.zip',
    origin='http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
    cache_subdir=os.path.abspath('.'),
    extract=True)
annotation_file = '{}/annotations/captions_train2014.json'.format(
    os.path.dirname(annotation_zip))

# Training images: the archive is only fetched when it is not already present
# in the current directory.  Either way PATH ends up pointing at 'train2014/'.
name_of_zip = 'train2014.zip'
if os.path.exists(os.path.abspath('.') + '/' + name_of_zip):
  PATH = os.path.abspath('.')+'/train2014/'
else:
  image_zip = tf.keras.utils.get_file(
      name_of_zip,
      origin='http://images.cocodataset.org/zips/train2014.zip',
      cache_subdir=os.path.abspath('.'),
      extract=True)
  PATH = os.path.dirname(image_zip)+'/train2014/'

# Parse the caption annotation file.
with open(annotation_file, 'r') as f:
    annotations = json.load(f)

# Parallel lists: all_captions[k] is a caption for the image file named by
# all_img_name_vector[k].
all_captions = []
all_img_name_vector = []

for annot in annotations['annotations']:
    image_id = annot['image_id']
    # Every caption is wrapped in explicit start / end markers.
    caption = '<start> {} <end>'.format(annot['caption'])
    # Image file names carry the id zero-padded to 12 digits.
    full_coco_image_path = '{}COCO_train2014_{:012d}.jpg'.format(PATH, image_id)

    all_captions.append(caption)
    all_img_name_vector.append(full_coco_image_path)

# Shuffle captions and image names together.  The fixed random_state makes
# the selected subset the same on every run.
train_captions, img_name_vector = shuffle(all_captions,
                                          all_img_name_vector,
                                          random_state=1)

# Keep only the first `num_examples` caption/image pairs of the shuffled set.
# (The earlier comment said 30000, which did not match the value used.)
num_examples = 1000
train_captions = train_captions[:num_examples]
img_name_vector = img_name_vector[:num_examples]

# One target score per kept example, drawn from {0.80, 0.81, ..., 1.00}.
# train_step reads these as the regression targets.
# NOTE(review): `random` is not seeded here, so unlike the shuffle above these
# values change from run to run -- confirm whether that is intended.
prob = [random.randint(80, 100) / 100 for _ in range(num_examples)]

def load_image(image_path):
    """Read a JPEG file and prepare it as InceptionV3 input.

    The file is decoded to 3 channels, resized to 299x299 and passed through
    the InceptionV3 `preprocess_input` function.

    Returns:
        A pair (preprocessed image, image_path); the path is passed through
        unchanged so callers can tell which file each image came from.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (299, 299))
    return tf.keras.applications.inception_v3.preprocess_input(resized), image_path

# InceptionV3 with ImageNet weights and without its classification head.
image_model = tf.keras.applications.InceptionV3(weights='imagenet',
                                                include_top=False)

# The feature extractor maps the network input to the output of its last layer.
new_input, hidden_layer = image_model.input, image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(inputs=new_input,
                                              outputs=hidden_layer)

from tqdm import tqdm
#tf.enable_eager_execution()

# Each distinct image is encoded once, in sorted file-name order.
encode_train = sorted(set(img_name_vector))

# Feel free to change batch_size according to your system configuration
image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
image_dataset = image_dataset.map(
  load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
image_dataset = image_dataset.batch(16)

# `batches` keeps every feature batch in memory; the training loop iterates it.
batches = []
for img, path in tqdm(image_dataset):
  batch_features = image_features_extract_model(img)
  # Flatten the two spatial axes into one: (batch, locations, channels).
  n_images, n_channels = batch_features.shape[0], batch_features.shape[3]
  batch_features = tf.reshape(batch_features, (n_images, -1, n_channels))
  print(batch_features.shape)
  batches.append(batch_features)
  # Also store each image's features with np.save, using the image path as
  # the file name (np.save appends '.npy'; map_func loads '<path>.npy').
  for feature, image_path in zip(batch_features, path):
    np.save(image_path.numpy().decode("utf-8"), feature.numpy())


# Google Drive can only be mounted from inside a Colab runtime.  Importing
# google.colab unconditionally made the script crash with ImportError
# everywhere else, so the mount is skipped when the package is unavailable.
try:
  from google.colab import drive
except ImportError:
  drive = None

if drive is not None:
  drive.mount('/content/drive')

# Find the maximum length of any caption in our dataset
def calc_max_length(tensor):
    """Return the length of the longest element of `tensor`.

    Args:
        tensor: iterable of sized items (e.g. a list of token-id lists).

    Returns:
        The largest `len(t)` over all items, or 0 when `tensor` is empty
        (a bare `max()` would raise ValueError in that case).
    """
    return max((len(t) for t in tensor), default=0)

# Choose the top 5000 words from the vocabulary
top_k = 5000
# The filter set is written as a raw string: in a normal literal `\]` is an
# invalid escape sequence (a warning today, an error in future Python
# versions).  The resulting characters are exactly the same.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters=r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)

# Register index 0 as the padding token in both lookup tables.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Create the tokenized vectors.  (This used to be computed twice, once before
# and once after registering '<pad>'; the first result was simply overwritten.)
train_seqs = tokenizer.texts_to_sequences(train_captions)

# Pad each vector to the max_length of the captions
# If you do not provide a max_length value, pad_sequences calculates it automatically
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

# Length of the longest tokenized caption.
max_length = calc_max_length(train_seqs)

# Create training and validation sets using an 80-20 split
img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector,
                                                                    cap_vector,
                                                                    test_size=0.2,
                                                                    random_state=0)

# Feel free to change these parameters according to your system's configuration

BATCH_SIZE = 16
BUFFER_SIZE = 1000
embedding_dim = 512
units = 512
vocab_size = top_k + 1
# Steps per epoch, used by the training loop to average the epoch loss.  It
# was hard-coded as (400*16) // BATCH_SIZE, i.e. it assumed 6400 training
# examples regardless of `num_examples`; it now follows the real size of the
# training split.  max(1, ...) keeps the later division well-defined for
# tiny datasets.
num_steps = max(1, len(img_name_train) // BATCH_SIZE)
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# NOTE(review): the original comment mentioned "two variables" representing
# that shape, but no such variables are defined in this file.

# Load the numpy files
def map_func(img_name, cap):
  """Load the cached features stored for one image.

  Args:
    img_name: UTF-8 encoded bytes of the image path; the features are read
      from that path with '.npy' appended.
    cap: value passed through unchanged.

  Returns:
    A pair (loaded array, cap).
  """
  feature_file = '{}.npy'.format(img_name.decode('utf-8'))
  return np.load(feature_file), cap

# Input pipeline over (image name, caption) training pairs:
#   1. load the cached features for each image through map_func (run via
#      tf.numpy_function, in parallel),
#   2. shuffle and batch,
#   3. prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
    .map(lambda name, caption: tf.numpy_function(
             map_func, [name, caption], [tf.float32, tf.int32]),
         num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

"""**WORD** **EMBEDDING**"""

from tensorflow import keras
from tensorflow.keras import layers
embed_dim = 512
lstm_out = 512
#batch_size = 10
X = cap_train
print(cap_train.shape[1])
vocab_size = top_k + 1

# A model made of a single Embedding layer.  It is not fitted anywhere in the
# visible part of this file; `predict` is used only to look up the embedding
# vector of every token of every training caption.
model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, embed_dim))
model.compile('rmsprop', 'mse')
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# `output[k][i]` is the embedding of token i of training caption k; it is
# read by input_data().
output = model.predict(X)
print(output.shape)
print(output[0][0].shape)
print(output[0])
#for one sentence
#for one sentence

"""**INPUT DATA**"""

def input_data(cap_id,i,f):
    """Join a feature column with the embedding of one caption token.

    Args:
        cap_id: row index into the module-level `output` array.
        i: token position within that row.
        f: tensor that is concatenated, along axis 1, with the token
            embedding reshaped to (512, 1).

    Returns:
        The concatenated tensor; its shape is also printed.
    """
    word_column = tf.reshape(output[cap_id][i], [512, 1])
    joined = tf.concat([f, word_column], axis=1)
    print(joined.shape)
    return joined

class CNN_Encoder(tf.keras.Model):
    """Project pre-extracted image features through two dense layers.

    `fc` maps the last axis to `embedding_dim`, `fc1` maps it on to 8 values,
    and a ReLU is applied to the result.
    """

    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)
        self.fc1 = tf.keras.layers.Dense(8)

    def call(self, x):
        """Return relu(fc1(fc(x)))."""
        projected = self.fc1(self.fc(x))
        return tf.nn.relu(projected)

optimizer = tf.keras.optimizers.Adam()
# Kept because it is part of the module's namespace; loss_function below
# computes the same quantity (mean absolute error) directly with tf ops.
loss_object = tf.keras.losses.MeanAbsoluteError()


def loss_function(real, pred):
  """Return the mean absolute error between `real` and `pred`.

  The previous implementation returned mean(np.subtract(real, pred)):
    * NumPy cannot operate on the symbolic tensors seen inside @tf.function,
      and NumPy operations are not recorded by tf.GradientTape, so no
      gradient could flow back to the model;
    * a signed mean difference is not a loss -- it can be made arbitrarily
      negative simply by increasing `pred`.

  Args:
    real: target value(s); a number, a sequence of numbers or a tensor.
    pred: predicted value(s) with a shape compatible with `real`.

  Returns:
    A scalar float32 tensor.
  """
  real = tf.cast(tf.convert_to_tensor(real), tf.float32)
  pred = tf.cast(tf.convert_to_tensor(pred), tf.float32)
  return tf.reduce_mean(tf.abs(real - pred))

"""**LSTM Model**"""

class LSTM_network(layers.Layer):
  """Single-step LSTM cell followed by two dense heads that yield a score.

  Parameter shapes are fixed: every gate has an input kernel of shape (2, 1),
  a recurrent kernel of shape (1, 1) and a bias of shape (512, 1).  `call`
  therefore expects `features` of shape (512, 2) and `hidden` / `c` of shape
  (512, 1).

  Args:
    units: output width of the two dense heads `fc1` and `fc2`.
  """

  def __init__(self,units):
    super(LSTM_network, self).__init__()
    self.units = units
    w_init = tf.random_normal_initializer()
    b_init = tf.zeros_initializer()

    def gate_params():
      # One gate = (input kernel, recurrent kernel, bias).
      w_x = tf.Variable(initial_value=w_init(shape=(2, 1),dtype='float32'),trainable=True)
      w_h = tf.Variable(initial_value=w_init(shape=(1, 1),dtype='float32'),trainable=True)
      b = tf.Variable(initial_value=b_init(shape=(512,1),dtype='float32'), trainable=True)
      return w_x, w_h, b

    # Attribute names are unchanged so existing references keep working.
    self.w_xi, self.w_hi, self.b_i = gate_params()  # input gate
    self.w_xf, self.w_hf, self.b_f = gate_params()  # forget gate
    self.w_xo, self.w_ho, self.b_o = gate_params()  # output gate
    self.w_xc, self.w_hc, self.b_c = gate_params()  # candidate cell state
    self.fc1 = tf.keras.layers.Dense(self.units,activation='sigmoid')
    self.fc2 = tf.keras.layers.Dense(self.units,activation='softmax')

  def call(self,features, hidden,visual_vector,c):
    """Advance the cell by one step and compute the attention score.

    Args:
      features: step input, multiplied by the (2, 1) input kernels.
      hidden: previous hidden state, multiplied by the (1, 1) recurrent
        kernels.
      visual_vector: tensor multiplied element-wise (with broadcasting) by
        the softmax head's output.
      c: previous cell state.

    Returns:
      (h, c, a_): new hidden state, new cell state and the scalar score
      sum(visual_vector * fc2(h)) * sum(fc1(h)).

    The previous version computed h = o * tanh(c) but never used it: it
    returned the output-gate activation `o` as the hidden state and fed `o`
    to both dense heads.  An LSTM step's output is `h`, so `h` is now what
    is returned and what the heads read.
    """
    def pre_activation(w_x, w_h, b):
      return tf.matmul(features, w_x) + tf.matmul(hidden, w_h) + b

    i = tf.math.sigmoid(pre_activation(self.w_xi, self.w_hi, self.b_i))
    f = tf.math.sigmoid(pre_activation(self.w_xf, self.w_hf, self.b_f))
    o = tf.math.sigmoid(pre_activation(self.w_xo, self.w_ho, self.b_o))
    c_ = tf.math.tanh(pre_activation(self.w_xc, self.w_hc, self.b_c))

    c = tf.math.multiply(f,c) + tf.math.multiply(i,c_)
    h = tf.math.multiply(o,tf.math.tanh(c))

    # Sigmoid head and softmax head, both read from the step output.
    word_level_gate = self.fc1(h)
    unit_attention = self.fc2(h)

    # Weight the visual vector by the softmax head and reduce everything to
    # one scalar, scaled by the summed sigmoid head.
    attention_unit = tf.math.multiply(visual_vector,unit_attention,name=None)
    a_ = tf.reduce_sum(attention_unit) * tf.reduce_sum(word_level_gate)

    return h,c,a_

encoder = CNN_Encoder(embedding_dim)
decoder = LSTM_network(units)

@tf.function
def train_step(img_tensor, target,batch):
  """Run one optimisation step on one batch of cached image features.

  Args:
    img_tensor: image features of the batch; the encoder output is reshaped
      to (n, 512, 1), one column vector per image.
    target: padded caption token ids of the batch.  Only its second
      dimension (the caption length) is read here; the token embeddings come
      from the module-level `output` array through input_data().
    batch: zero-based batch index as a Python int, used to locate the rows of
      `output` and `prob` that belong to this batch.

  Returns:
    (loss, total_loss): the loss summed over the images of the batch, and
    that sum divided by the caption length.

  Fixes relative to the previous version:
    * the reshape used a hard-coded 16, which failed on a final batch with
      fewer than 16 images; the batch dimension is now inferred;
    * the loss compared prob[batch*16+j:(batch+1)*16] (16-j values) with a
      16-element list that was reset for every image, so the shapes only
      matched for j == 0; each image's score is now compared with its own
      target prob[batch*16+j];
    * the recurrent state is reset for every image instead of being carried
      from one image's caption into the next;
    * the encoder is called under the tape, so its weights are now updated
      together with the decoder's instead of staying at their random
      initial values.

  NOTE(review): as before, the score used for an image is the one produced at
  the last caption position (earlier positions are overwritten, although the
  old name `sum_a` suggests a sum may have been intended), and the decoder
  still receives the visual vectors of the whole batch -- confirm both.
  NOTE(review): `batch` is a Python int, so tf.function retraces this
  function for every distinct batch index.
  """
  rows_per_batch = 16  # must match the `.batch(16)` used when caching features
  loss = 0
  with tf.GradientTape() as tape:
      features= encoder(img_tensor)
      # -1 lets a smaller final batch through.
      visual_vector = tf.reshape(features,(-1,512,1))
      for j in range(img_tensor.shape[0]):
        sample = batch*rows_per_batch + j
        # Fresh state for every image: captions of different images are
        # unrelated.
        hidden = tf.zeros((512,1))
        c_ =  tf.zeros((512,1))
        a_ = 0.0
        for i in range(1, target.shape[1]):
          input_ = input_data(sample,i,visual_vector[j])
          # passing the features through the decoder
          hidden, c_, a_ = decoder(input_,hidden,visual_vector,c_)
        loss += loss_function(prob[sample], a_)

  total_loss = (loss / int(target.shape[1]))

  trainable_variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, trainable_variables)

  optimizer.apply_gradients(zip(gradients, trainable_variables))
  return loss, total_loss

start_epoch = 0
# A second `from __future__ import division` used to sit here.  Future imports
# are only legal at the very top of a module (this one is a SyntaxError), and
# `division` is already enabled by the import at the top of the file.

"""**TRAINING**"""

EPOCHS = 10

# Average loss of every epoch, kept for plotting.  It was appended to below
# without ever being created, which raised NameError at the end of epoch 1.
loss_plot = []

# NOTE(review): `batches` holds features of the sorted unique images, while
# `cap_train` comes out of train_test_split, so row k of a feature batch is
# not guaranteed to be the image that caption k describes, and there can be
# more feature batches than caption rows -- confirm the intended pairing.
for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0
    for (batch,img_tensor) in enumerate(batches):
        # Caption rows matching this feature batch (16 images per batch).
        target = cap_train[batch*16:(batch+1)*16,:]
        batch_loss, t_loss = train_step(img_tensor, target,batch)
        total_loss += t_loss
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(
              epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))

    # storing the epoch end loss value to plot later
    loss_plot.append(total_loss / num_steps)

    print('Epoch {} Loss {:.6f}'.format(epoch + 1,
                                         total_loss/num_steps))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))