embedding_diversity_analysis.py
import numpy as np
from os.path import basename
from glob import glob
import pandas as pd
from utils import remvove_doubles, get_distance_matrix, find_minimal_average_distance_betwen_points
import seaborn as sns; sns.set_theme()
import matplotlib.pyplot as plt
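
# The helpers imported from utils are used throughout this script. As a rough, illustrative
# sketch (an assumption about their behaviour, not the actual implementation in utils),
# 'remvove_doubles' drops repeated embedding rows and 'get_distance_matrix' computes all
# pairwise Euclidean distances between two embedding sets. The hypothetical functions below
# only document that assumed behaviour; they are not called anywhere in this script.
def _remove_doubles_sketch(embeddings):
    # Keep each embedding row only once (illustrative stand-in for utils.remvove_doubles).
    return np.unique(embeddings, axis=0)

def _get_distance_matrix_sketch(embeddings_a, embeddings_b):
    # Euclidean distance between every row of embeddings_a and every row of embeddings_b
    # (illustrative stand-in for utils.get_distance_matrix).
    diffs = embeddings_a[:, None, :] - embeddings_b[None, :, :]
    return np.linalg.norm(diffs, axis=-1)
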
def calculate_average_pairwise_distance(embedding_dir, train_embedding_dir, result_file):
    # French emotion abbreviations: col = anger, joi = elation, pla = pleasure, tri = sadness.
    emotions = ["col", "joi", "pla", "tri"]
    embedding_files = glob(embedding_dir + "*/*.txt")
    embedding_files.sort()
    distance_matrix_list = []
    # Calculate the pairwise distances per emotion.
    for num_emotion in range(len(emotions)):
        emotion = emotions[num_emotion]
        print("----------------------------------------------------------------------------")
        print("Emotion (French): " + emotion)
        augmentation_techniques = []
        all_embeddings_emotion = []
        train_embedding_wildcard = train_embedding_dir + "*" + emotion + "*"
        train_embedding_file = glob(train_embedding_wildcard)[0]
        # The embeddings of the source train data are treated separately, as all augmentation techniques are compared against them.
        print("Loading embeddings for source train data ...")
        train_embeddings = np.genfromtxt(train_embedding_file)
        # As the prototypical network selects samples at random, the embedding files contain multiple instances of the same embedding, which we want to exclude here.
        train_embeddings = remvove_doubles(train_embeddings)
        all_embeddings_emotion.append(train_embeddings)
        augmentation_techniques.append("source_train")
        # Load the embeddings of all augmentation techniques for the given emotion.
        for embedding_file in embedding_files:
            if emotion not in basename(embedding_file):
                continue
            augmentation_technique = embedding_file.split("/")[-2]
            augmentation_techniques.append(augmentation_technique)
            print("Loading embeddings for " + augmentation_technique + "...")
            embeddings = np.genfromtxt(embedding_file)
            # Remove multiple instances of the same embeddings again.
            embeddings = remvove_doubles(embeddings)
            all_embeddings_emotion.append(embeddings)
        # Calculate the pairwise distance between the embeddings of each augmentation technique and the source embeddings.
        result_matrix = np.empty((len(augmentation_techniques), 1))
        source_idx = augmentation_techniques.index("source")
        source_embeddings = all_embeddings_emotion[source_idx]
        for i in range(len(augmentation_techniques)):
            embeddings = all_embeddings_emotion[i]
            # Calculates the pairwise distances between all embeddings in 'embeddings' and 'source_embeddings'.
            distance_matrix = get_distance_matrix(embeddings, source_embeddings)
            # Calculate the minimal average pairwise distance greedily by iteratively removing the pair of points with the
            # smallest distance from the matrix and adding its distance to the total distance (sketched below this function).
            min_distance = find_minimal_average_distance_betwen_points(distance_matrix)
            # Save min_distance for each augmentation technique.
            result_matrix[i, 0] = min_distance
            if i == 0:
                print()
        distance_matrix_list.append(result_matrix)
    # Put the results of all emotions together into one dataframe, which is then saved to a csv file.
    cols = []
    cols.append("Data")
    for i in range(len(emotions)):
        cols.append(emotions[i])
    distance_result_matrix = np.hstack(distance_matrix_list)
    aug_techs = np.array(augmentation_techniques).reshape((distance_result_matrix.shape[0], 1))
    distance_result_matrix = np.hstack([aug_techs, distance_result_matrix])
    df = pd.DataFrame(distance_result_matrix, columns=cols)
    df.to_csv(result_file, index=False)
    print("Saved results to " + result_file)
def create_heatmap(result_file, image_file, fold):
    print("Creating heatmap...")
    pointwise = np.round(
        pd.read_csv(result_file, sep=',')[['col', 'joi', 'pla', 'tri']], 3)
    labels_proto = pd.read_csv(result_file, sep=',')[['Data']]
    pointwise = pointwise.rename(columns={'col': 'Anger',
                                          'joi': 'Elation',
                                          'pla': 'Pleasure',
                                          'tri': 'Sad'})
    pointwise = pointwise[['Pleasure', 'Anger', 'Elation', 'Sad']]
    pointwise = pointwise.reindex([0, 2, 1, 3, 4, 5])
    labels_proto = labels_proto.reindex([0, 2, 1, 3, 4, 5])
    labels_proto = labels_proto.replace('source_train', 'Support\nSource-' + fold)
    labels_proto = labels_proto.replace('noise', 'Noise-F1')
    labels_proto = labels_proto.replace('source', 'Query\nSource-F1')
    labels_proto = labels_proto.replace('specaug', 'SpecAug-F1')
    labels_proto = labels_proto.replace('time', 'Time-\nShift-F1')
    labels_proto = labels_proto.replace('wavegan', 'WaveGAN-F1')
    labels = labels_proto['Data']
    sns.heatmap(pointwise, yticklabels=labels, annot=True, cbar=True, cmap="Greys")
    plt.tight_layout()
    plt.savefig(image_file)
    plt.clf()
    print("done")

# For fold 1, the embeddings are trained on partition F1, i.e., the 'train' partition ('train8k256').
# The embeddings to be investigated are those of the augmentation techniques under 'test-train-embeddings'.
embedding_dir_fold_1 = "embeddings/train8k256/test-train-embeddings/"
# The reference embeddings are the source train embeddings under 'train-embeddings'.
train_embedding_dir_fold_1 = "embeddings/train8k256/train-embeddings/"
result_file_fold_1 = "result_pairwise_distance/point_wise_distance_results_Fold_1.csv"
image_file_fold_1 = "result_pairwise_distance/point_wise_distance_heatmap_Fold_1.pdf"

# For folds 2+3, the embeddings are trained on folds F2 and F3, i.e., the 'devel' and 'test' partitions ('develtest8k256').
embedding_dir_fold_2_3 = "embeddings/develtest8k256/test-train-embeddings/"
train_embedding_dir_fold_2_3 = "embeddings/develtest8k256/train-embeddings/"
result_file_fold_2_3 = "result_pairwise_distance/point_wise_distance_results_Fold_2_3.csv"
image_file_fold_2_3 = "result_pairwise_distance/point_wise_distance_heatmap_Fold_2_3.pdf"

print("----------------------------------------------------------------------------")
print("----------------------------------------------------------------------------")
print("Fold 1...")
print()
calculate_average_pairwise_distance(embedding_dir_fold_1, train_embedding_dir_fold_1, result_file_fold_1)
create_heatmap(result_file_fold_1, image_file_fold_1, "F1")
print("----------------------------------------------------------------------------")
print("----------------------------------------------------------------------------")
print("Fold 2...")
print()
calculate_average_pairwise_distance(embedding_dir_fold_2_3, train_embedding_dir_fold_2_3, result_file_fold_2_3)
create_heatmap(result_file_fold_2_3, image_file_fold_2_3, "F2+F3")