-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassessment.py
120 lines (96 loc) · 3.66 KB
/
assessment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
import pandas as pd
def main():
    """Load the DBSCAN result for the synthetic data set and print its
    purity and mean silhouette coefficient.

    Expects 'sysnthetic.data.dbscan.result.csv' in the working directory
    with no header row; column 3 (1-based) is the ground-truth class and
    column 4 is the assigned cluster label.
    """
    data2 = pd.read_csv('sysnthetic.data.dbscan.result.csv', header=None)
    # Append the original row index as an extra column; the metric helpers
    # group on label columns and use this column to recover row positions.
    data2[len(data2.columns)] = data2.index
    print("printing assessment metrics of synthetic data on dbscan")
    puri = purity(data2, 3, 4)
    print(puri)
    sill = silhouettecofficient(data2, 4, 2)
    # silhouettecofficient returns (mean, per-row array); print the mean only.
    print(sill[0])
def test():
    """Ad-hoc smoke check: load the iris result file, show the distinct
    ground-truth labels, and round-trip the frame to disk."""
    print("Test")
    data = pd.read_csv('iris.data.result.csv', header=None)
    unique = data[4].unique()
    print(unique)
    # Append the row index column, mirroring the preprocessing done in main().
    data[len(data.columns)] = data.index
    # NOTE(review): '.scv' looks like a typo for '.csv' — confirm before
    # changing, since other tooling may already read this exact filename.
    data.to_csv('test.scv', index=False, header=False)
def purity(data, truthColumn, labelColumn):
    """Compute the purity of a clustering against ground truth.

    Each cluster contributes the size of its largest overlap with any
    ground-truth class; purity is that total divided by the row count.

    Args:
        data: DataFrame whose LAST column holds the original row index
            (as appended in main()).
        truthColumn: 1-based column number of the ground-truth class.
        labelColumn: 1-based column number of the cluster label.

    Returns:
        float in (0, 1]; 1.0 means every cluster is class-pure.
    """
    indexColumn = len(data.columns) - 1
    # Row-index sets grouped by ground-truth class and by assigned cluster.
    # (The original also computed list counts here, with clusterListCount
    # wrongly taken from the ground-truth list; both were unused dead code.)
    truthSets = [set(rows) for rows in
                 data.groupby(truthColumn - 1)[indexColumn].apply(list)]
    clusterSets = [set(rows) for rows in
                   data.groupby(labelColumn - 1)[indexColumn].apply(list)]
    puritySum = sum(
        max(len(clusterSet & truthSet) for truthSet in truthSets)
        for clusterSet in clusterSets
    )
    return puritySum / data.shape[0]
def silhouettecofficient(data, labelColumn, kn):
    """Compute the silhouette coefficient for every row and its mean.

    Args:
        data: DataFrame whose LAST column holds the original row index;
            the first `kn` columns are the numeric features. A new column
            with the per-row silhouette values is appended as a side effect.
        labelColumn: 1-based column number of the cluster label.
        kn: number of leading feature columns used for distances.

    Returns:
        (mean_silhouette, per_row_silhouette_array)
    """
    indexColumn = len(data.columns) - 1
    clusterList = data.groupby(labelColumn - 1)[indexColumn].apply(list).values.tolist()
    uIn = np.zeros(data.shape[0])   # a(i): mean intra-cluster distance
    uOut = np.zeros(data.shape[0])  # b(i): mean distance to nearest other cluster
    sI = np.zeros(data.shape[0])
    for cluster in clusterList:
        for rowIndex in cluster:
            uIn[rowIndex] = calculateUin(rowIndex, cluster, data, kn)
            uOut[rowIndex] = calculateUout(rowIndex, cluster, clusterList, data, kn)
    # s(i) = (b(i) - a(i)) / max(a(i), b(i)); compute once per row and reuse
    # (the original evaluated the identical expression twice per row).
    for rowIndex in range(data.shape[0]):
        sI[rowIndex] = (uOut[rowIndex] - uIn[rowIndex]) / max(uOut[rowIndex], uIn[rowIndex])
    result = sI.sum() / data.shape[0]
    data[len(data.columns)] = pd.Series(sI)
    return result, sI
def calculateUout(index, cluster, clusterList, data, kn):
    """Mean distance from row `index` to its nearest *other* cluster (b(i)).

    Args:
        index: row position of the point in `data`.
        cluster: the point's own cluster (list of row indices); must be the
            same object that appears in `clusterList`.
        clusterList: all clusters as lists of row indices.
        data: DataFrame whose first `kn` columns are numeric features.
        kn: number of leading feature columns.

    Returns:
        Smallest mean Euclidean distance to any other cluster, or the
        sentinel 1000000 when no other cluster exists.
    """
    baseTuple = np.array(data.iloc[index, 0:kn])
    minDistance = np.inf
    for other in clusterList:
        # Identity check skips the point's own cluster. The original used
        # clusterList.index(...) comparisons, which scan the list on every
        # iteration and misfire when two clusters have equal member lists.
        if other is cluster:
            continue
        total = 0.0
        for rowIndex in other:
            tempTuple = np.array(data.iloc[rowIndex, 0:kn])
            total += np.linalg.norm(baseTuple - tempTuple)
        meanDistance = total / len(other)
        if meanDistance < minDistance:
            minDistance = meanDistance
    # Preserve the original fallback: a large finite sentinel when there is
    # only one cluster, so downstream max()/division stay well-defined.
    return 1000000 if minDistance == np.inf else minDistance
def calculateUin(index, cluster, data, kn):
    """Mean distance from row `index` to the other members of its cluster (a(i)).

    Args:
        index: row position of the point in `data`.
        cluster: list of row indices of the point's cluster (including `index`).
        data: DataFrame whose first `kn` columns are numeric features.
        kn: number of leading feature columns.

    Returns:
        Mean Euclidean distance to the other cluster members, or 0.0 for a
        singleton cluster (standard silhouette convention; the original
        raised ZeroDivisionError here).
    """
    if len(cluster) <= 1:
        return 0.0
    baseTuple = np.array(data.iloc[index, 0:kn])
    total = 0.0
    for rowIndex in cluster:
        if rowIndex == index:
            continue
        tempTuple = np.array(data.iloc[rowIndex, 0:kn])
        total += np.linalg.norm(baseTuple - tempTuple)
    # Average over the other members only, hence len - 1.
    return total / (len(cluster) - 1)
# Script entry point: run the assessment only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()