# record_for_tsne.py

"""record_for_tsne.py

Modified from eval_clustering to retrieve activations for a specific layer
(within range of CNNs and MLPs) for all training and validation images in
the models.

Choose directory to save in.

Must uncomment a line if the model is distilled from a clustered model, so
it appears in the file name.

Usage:
  record_for_tsne.py <exp_id>
"""
import os
import math

import re
import numpy as np
import tensorflow as tf
import pandas as pd
from docopt import docopt

import exp_config as cg
from mini_batch_iter import MiniBatchIterator
from CIFAR_input import read_CIFAR10, read_CIFAR100
from CIFAR_models import (baseline_model, clustering_model, distilled_model,
                          hybrid_model)

# Small positive constant. Nothing in the visible part of this file reads it;
# presumably carried over from eval_clustering -- confirm before removing.
EPS = 1.0e-16


def _read_dataset(param):
    """Load the CIFAR dataset selected by ``param['dataset_name']``.

    Args:
        param: experiment parameter dict; 'dataset_name' and 'data_folder'
            are read.

    Returns:
        The object returned by read_CIFAR10 / read_CIFAR100.  main() indexes
        it with 'train_img', 'val_img', 'train_label', 'val_label' and
        'mean_img'.

    Raises:
        ValueError: if the dataset name is neither 'CIFAR10' nor 'CIFAR100'.
    """
    if param['dataset_name'] == 'CIFAR10':
        return read_CIFAR10(param['data_folder'])
    if param['dataset_name'] == 'CIFAR100':
        return read_CIFAR100(param['data_folder'])
    raise ValueError('Unsupported dataset name!')


def _build_model(param):
    """Build the graph for ``param['model_name']`` and return its ops dict.

    Distilled and hybrid models are built inside the 'dist' / 'hybrid'
    variable scopes respectively, exactly as the checkpoint restore in
    main() expects the variables to be named.

    Raises:
        ValueError: if the model name is not one of the supported ones.
    """
    model_name = param['model_name']
    if model_name == 'baseline':
        return baseline_model(param)
    if model_name == 'parsimonious':
        return clustering_model(param)
    if model_name == 'distilled':
        with tf.variable_scope('dist'):
            return distilled_model(param)
    if model_name in ['hybrid_spatial', 'hybrid_sample']:
        with tf.variable_scope('hybrid'):
            return hybrid_model(param)
    raise ValueError('Unsupported model name!')


def _count_layers(param):
    """Return the total number of CNN + MLP layers described by ``param``.

    For the 'parsimonious' model the count is the number of truthy entries
    in 'num_cluster_cnn' / 'num_cluster_mlp', and the per-type counts are
    also written back into ``param`` ('num_layer_cnn', 'num_layer_mlp').
    For every other model it is the combined length of 'act_func_cnn' and
    'act_func_mlp'.
    """
    if param['model_name'] == 'parsimonious':
        param['num_layer_cnn'] = len(
            [xx for xx in param['num_cluster_cnn'] if xx])
        param['num_layer_mlp'] = len(
            [xx for xx in param['num_cluster_mlp'] if xx])
        return param['num_layer_cnn'] + param['num_layer_mlp']
    return len(param['act_func_cnn']) + len(param['act_func_mlp'])


def main(min_layer=4, output_dir='../layer_activations'):
    """Record per-layer activations for all training and validation images.

    Restores the checkpoint named in the experiment config, runs every
    training and validation image through the model, and writes one CSV
    file per layer (from ``min_layer`` up to the last layer).  Each row of
    a file is the image label followed by that image's activations.

    Args:
        min_layer: 1-based index of the first layer whose activations are
            exported.  Must lie in [1, number of CNN + MLP layers].
        output_dir: directory the CSV files are written to; created if it
            does not exist yet.

    Raises:
        ValueError: on an unsupported dataset or model name, if
            ``min_layer`` is out of range, or if the checkpoint file name
            contains no digits to use as snapshot number.
    """
    # get exp parameters
    args = docopt(__doc__)
    exp_id = args['<exp_id>']
    param = getattr(cg, exp_id)()

    # read data from file
    input_data = _read_dataset(param)
    print('Reading data done!')

    # build model; only the per-layer embeddings are fetched
    model_ops = _build_model(param)
    embeddings_op = model_ops['embeddings']
    print('Building model done!')

    # retrieve nbr of layers of CNNs and MLPs.  Done after the model is
    # built because _count_layers may write into param.
    num_layer_reg = _count_layers(param)
    if not 1 <= min_layer <= num_layer_reg:
        raise ValueError(
            'min_layer must be in [1, %d], got %r' % (num_layer_reg,
                                                     min_layer))
    first_idx = min_layer - 1

    # get the snapshot nbr up front so a malformed checkpoint name fails
    # before the (expensive) forward passes rather than after them.
    # NOTE(review): strip('0') removes trailing zeros as well as leading
    # ones (e.g. '0060000' -> '6').  Kept as is because it determines the
    # output file names; confirm whether lstrip('0') was intended.
    digit_runs = re.findall(r'\d+', param['test_model_name'])
    if not digit_runs:
        raise ValueError('No snapshot number found in test_model_name: %r'
                         % (param['test_model_name'],))
    snap_nbr = digit_runs[0].strip('0')

    # concatenate the 2 image sets that are recorded (train, validation)
    all_img = np.concatenate([input_data['train_img'],
                              input_data['val_img']], axis=0)
    all_label = np.concatenate([input_data['train_label'],
                                input_data['val_label']])

    # Float division: under Python 2 integer division would truncate
    # before ceil() and silently skip a trailing partial batch.
    # Assumes MiniBatchIterator returns a shorter final batch when
    # train_phase=False -- TODO confirm.
    num_img = all_img.shape[0]
    max_test_iter = int(math.ceil(float(num_img) / param['bat_size']))
    test_iterator = MiniBatchIterator(
        idx_start=0, bat_size=param['bat_size'], num_sample=num_img,
        train_phase=False, is_permute=False)

    # make sure the destination exists before spending time on inference
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # one list of per-batch activation arrays for every layer
    embeddings = [[] for _ in range(num_layer_reg)]
    # per-batch label arrays, joined once after the loop
    label_batches = []

    config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.Session(config=config)
    try:
        saver = tf.train.Saver()
        saver.restore(sess, os.path.join(param['test_folder'],
                                         param['test_model_name']))
        print('Graph initialization done!')

        for _ in range(max_test_iter):
            idx_bat = test_iterator.get_batch()

            bat_imgs = (all_img[idx_bat, :, :, :].astype(
                np.float32) - input_data['mean_img']) / 255.0

            # record labels for the batch
            label_batches.append(all_label[idx_bat].astype(np.int32))

            bat_embeddings = sess.run(
                embeddings_op,
                feed_dict={model_ops['input_images']: bat_imgs})

            # record activations of the batch (ee) for each layer (ii)
            for ii, ee in enumerate(bat_embeddings):
                if ii < first_idx:
                    continue
                embeddings[ii].append(ee)
    finally:
        # release the session even if restore or inference fails
        sess.close()

    # format labels once: a single int64 column, one row per image
    labels = np.concatenate(label_batches).astype(np.int64).reshape((-1, 1))

    # retrieve information for file names
    model_name = str(exp_id)

    # # if running on a model that was distilled from a clustered model
    # # (e.g., sample clustered), must manually add this
    # model_name = model_name + '_sample'

    for ii in range(first_idx, num_layer_reg):
        # concatenate activations of all batches in layer ii (vertically)
        layer_activations = np.concatenate(embeddings[ii], axis=0)
        embeddings[ii] = None  # drop the per-batch list, no longer needed

        # add labels to activations (labels become the first column)
        embeddings_labelled = np.concatenate((labels, layer_activations),
                                             axis=1)

        layer = ii + 1
        out_path = os.path.join(
            output_dir,
            'Activations_layer_' + str(layer) + '_' + model_name +
            '_model_snap_' + snap_nbr + '.txt')

        # export without index or header: one row per image
        pd.DataFrame(embeddings_labelled).to_csv(
            out_path, index=False, header=False)
        print('File with activations and labels for layer ' + str(layer) +
              ' ' + model_name + ' snap ' + snap_nbr + ' generated.')


# Run the recording only when executed as a script, not when imported.
if __name__ == '__main__':
    main()