# graph_clustering_gds.py
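"""Cluster the nodes of a bipartite graph with a sparse autoencoder.

Pipeline: read an edge list, build the biadjacency matrix, compute pairwise
cosine similarities, normalise them by node degree, train a TF1 sparse
autoencoder on the result, run KMeans on the bottleneck embedding, and
compare against a spectral-clustering baseline on the same similarities.
"""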
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import networkx as nx
from networkx.algorithms.bipartite import biadjacency_matrix
from operator import itemgetter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = "4"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # grab GPU memory on demand, not all at once
def to_adjacency_matrix(data):
    """Convert a bipartite edge list into (partition, biadjacency matrix)."""
    g = nx.DiGraph()
    g.add_edges_from(data)
    partition_1 = set(map(itemgetter(0), data))
    # The same partition_1 set is reused below, so matrix rows and labels
    # stay aligned with its iteration order.
    return partition_1, biadjacency_matrix(g, partition_1).toarray()
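# Illustrative toy call (values assumed, not from the data set):
#   part, A = to_adjacency_matrix([('a', 'x'), ('a', 'y'), ('b', 'y')])
#   part == {'a', 'b'}; A is the 2x2 biadjacency matrix of rows {a, b}
#   against columns {x, y}, with row order given by set iteration.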
# Input file holding the bipartite graph as "source target" edge lines
edges = []
with open('data/relationship_final_mysql_nodes_reoredered.txt') as f:
    for line in f:
        tokens = line.strip().split(' ')
        if len(tokens) == 2:
            edges.append(tokens)
partition, adj_matrix = to_adjacency_matrix(edges)  # biadjacency matrix of the bipartite graph
S = cosine_similarity(adj_matrix)        # pairwise cosine similarity between node rows
D = np.diag(np.sum(adj_matrix, axis=1))  # diagonal matrix of node degrees
inp = np.matmul(np.linalg.inv(D), S)     # degree-normalised similarity, D^-1 * S
tf.reset_default_graph()
learning_rate = 0.001
p = tf.constant(0.01)  # target sparsity for the bottleneck activations
beta = 0.01            # weight of the sparsity penalty
X = tf.placeholder(tf.float32, [None, inp.shape[1]])
# Encoding layers: 42922 -> 20000 -> 10000 -> 5000 -> 4000, each fed by the previous layer
layer1 = tf.layers.dense(X, 42922, activation=tf.nn.sigmoid, kernel_initializer=tf.keras.initializers.he_normal())
layer2 = tf.layers.dense(layer1, 20000, activation=tf.nn.sigmoid, kernel_initializer=tf.keras.initializers.he_normal())
layer3 = tf.layers.dense(layer2, 10000, activation=tf.nn.sigmoid, kernel_initializer=tf.keras.initializers.he_normal())
layer4 = tf.layers.dense(layer3, 5000, activation=tf.nn.sigmoid, kernel_initializer=tf.keras.initializers.he_normal())
layer5 = tf.layers.dense(layer4, 4000, activation=tf.nn.sigmoid, kernel_initializer=tf.keras.initializers.he_normal())
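# The encoder and decoder mirror each other around the 4000-unit bottleneck
# (layer5), whose activations are the embedding that gets clustered below.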
# Decoding layers (mirror of the encoder)
layer4_decode = tf.layers.dense(layer5, 5000, activation=tf.nn.sigmoid,
                                kernel_initializer=tf.keras.initializers.he_normal())
layer3_decode = tf.layers.dense(layer4_decode, 10000, activation=tf.nn.sigmoid,
                                kernel_initializer=tf.keras.initializers.he_normal())
layer2_decode = tf.layers.dense(layer3_decode, 20000, activation=tf.nn.sigmoid,
                                kernel_initializer=tf.keras.initializers.he_normal())
layer1_decode = tf.layers.dense(layer2_decode, 42922, activation=tf.nn.sigmoid,
                                kernel_initializer=tf.keras.initializers.he_normal())
p_hat = tf.reduce_mean(layer5, 0)  # mean activation of each bottleneck unit over the batch
cost = tf.reduce_mean(tf.pow(layer1 - layer1_decode, 2))  # reconstruction error
cost_sparse = tf.multiply(beta, tf.reduce_mean(p * tf.log(p / p_hat) + (1 - p) * tf.log((1 - p) / (1 - p_hat))))
total_cost = tf.add(cost, cost_sparse)
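# The sparsity term is the KL divergence between the target sparsity p and
# the observed mean activation p_hat, averaged over hidden units:
# KL(p || q) = p*log(p/q) + (1 - p)*log((1 - p)/(1 - q)). It penalises
# bottleneck units whose mean activation drifts away from the target p.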
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(total_cost)
init = tf.global_variables_initializer()
with tf.Session(config=config) as sess:
    sess.run(init)
    batch_size = 10
    for i in range(30):
        err_sum = []
        for start in range(0, inp.shape[0], batch_size):
            batch = inp[start:start + batch_size]
            _, err = sess.run([train_op, total_cost], feed_dict={X: batch})
            err_sum.append(err)
        print("Error at epoch", i, "is", np.array(err_sum).mean())
    # Bottleneck embedding for every node, aligned with `partition`
    sae = sess.run(layer5, feed_dict={X: inp})
#Performs KMeans clustering on the hidden representation of the autoencoder
kmeans = KMeans(n_clusters=25, random_state=12319).fit(sae)
# Ground-truth labels, keeping only nodes that appear in the partition
valid_labels = {}
with open('data/node_labels_final_reordered.txt') as f:
    for line in f:
        tokens = line.rstrip().split(",")
        if tokens[0] in partition:
            valid_labels[int(tokens[0])] = tokens[1]
distinct_labels = []
for k, v in valid_labels.items():
    if v not in distinct_labels:
        distinct_labels.append(v)
# Map each node (in matrix row order) to an integer label index
ordered_labels = []
for label in partition:
    temp = int(label)
    ordered_labels.append([temp, distinct_labels.index(valid_labels[temp])])
ordered_labels_list = [label[1] for label in ordered_labels]
pred = kmeans.labels_
df = pd.DataFrame({'Actual': ordered_labels_list, 'Predicted': pred})
ct = pd.crosstab(df['Actual'], df['Predicted'])
print(ct)
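# Optional extra check: adjusted Rand index and normalised mutual information
# from sklearn condense the crosstab agreement into single scores.
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
print('ARI (autoencoder):', adjusted_rand_score(ordered_labels_list, pred))
print('NMI (autoencoder):', normalized_mutual_info_score(ordered_labels_list, pred))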
# Spectral clustering baseline on the same cosine-similarity matrix
# ('precomputed' affinity expects a square similarity matrix, and 25
# clusters matches the KMeans run above)
from sklearn.cluster import SpectralClustering
sc = SpectralClustering(25, affinity='precomputed', n_init=100)
sc.fit(S)
print('spectral clustering')
print(sc.labels_)
# Compare ground truth with the spectral clustering result
pred = sc.labels_
df = pd.DataFrame({'Actual': ordered_labels_list, 'Predicted': pred})
ct = pd.crosstab(df['Actual'], df['Predicted'])
print(ct)
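# Same optional scores for the spectral baseline
print('ARI (spectral):', adjusted_rand_score(ordered_labels_list, pred))
print('NMI (spectral):', normalized_mutual_info_score(ordered_labels_list, pred))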