forked from ronilp/Finding-Influencers-in-Social-Networks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.py
94 lines (78 loc) · 2.22 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from collections import defaultdict
from math import sqrt
import random
from utilities import KVAL
from database import getClusterCollection
def densify(x, n):
d = [0] * n
for i, v in x:
d[i] = v
return d
def dist(x, c):
    """Return the Euclidean distance between sparse ``x`` and dense ``c``.

    Parameters:
        x: iterable of ``(index, value)`` pairs; coordinates that are not
           listed are treated as zero.
        c: dense sequence of coordinates, indexable by every index in ``x``.

    Returns:
        The distance as a float.

    Raises:
        IndexError: if ``x`` contains an index that is out of range for ``c``.
    """
    touched = set()
    sqdist = 0.
    for i, v in x:
        sqdist += (v - c[i]) ** 2
        touched.add(i)
    # Coordinates absent from x are implicitly zero, so each one contributes
    # c[i] ** 2. Summing over x alone would ignore them and understate the
    # distance to any center that has mass outside x's support.
    for i, ci in enumerate(c):
        if i not in touched:
            sqdist += ci ** 2
    return sqrt(sqdist)
def mean(xs, l):
    """Return the coordinate-wise mean of sparse vectors as a dense list.

    Parameters:
        xs: iterable (a generator is fine, it is consumed once) of sparse
            vectors, each an iterable of ``(index, value)`` pairs.
        l:  dimensionality of the result.

    Returns:
        A list of ``l`` floats. When ``xs`` yields no vectors the zero vector
        is returned instead of dividing by zero.
    """
    sums = [0.] * l
    n = 0
    for x in xs:
        for i, v in x:
            sums[i] += v
        n += 1
    if n == 0:
        # Nothing to average; avoid ZeroDivisionError on an empty group.
        return sums
    # sums holds floats, so this is true division on Python 2 as well.
    return [total / n for total in sums]
def kmeans(k, xs, l, n_iter=10):
    """Cluster sparse vectors with Lloyd's k-means iteration.

    Parameters:
        k:      number of clusters.
        xs:     sequence of sparse vectors, each a re-iterable collection of
                ``(index, value)`` pairs.
        l:      dimensionality of the vectors.
        n_iter: maximum number of assignment/update rounds.

    Returns:
        A list with one entry per element of ``xs`` giving the index
        (``0 .. k-1``) of its cluster. With ``n_iter == 0`` every entry is
        ``None``.

    Raises:
        ValueError: from ``random.sample`` when ``k`` exceeds ``len(xs)`` or
            is negative.
    """
    # Initialize from random points.
    centers = [densify(xs[i], l) for i in random.sample(range(len(xs)), k)]
    cluster = [None] * len(xs)

    for _ in range(n_iter):
        # Assignment step: nearest center wins, ties go to the lowest index.
        assignment = []
        for x in xs:
            assignment.append(
                min(range(k), key=lambda j: dist(x, centers[j])))

        # Unchanged assignments would reproduce the same centers, so every
        # further round would be identical; stop early.
        if assignment == cluster:
            break
        cluster = assignment

        # Update step: group the members of each cluster in a single pass.
        groups = [[] for _ in range(k)]
        for x, j in zip(xs, cluster):
            groups[j].append(x)
        for j, members in enumerate(groups):
            # A cluster can lose all its members; keep its previous center
            # rather than averaging an empty group.
            if members:
                centers[j] = mean(members, l)

    return cluster
if __name__ == '__main__':
    # Cluster a bunch of text documents.
    #
    # Reads every ./data/*.txt file, turns each one into a sparse
    # word-count vector, clusters the vectors into KVAL groups and stores
    # one document per cluster in the collection returned by
    # getClusterCollection().
    import re
    import sys
    import glob
    import os

    def usage():
        print("usage: %s k docs..." % sys.argv[0])
        print(" The number of documents must be >= k.")
        sys.exit(1)

    # `except ValueError():` never matched anything and a plain assignment
    # cannot raise; convert explicitly so a malformed KVAL is reported.
    try:
        k = int(KVAL)
    except (TypeError, ValueError):
        usage()

    args = glob.glob('./data/*.txt')
    # kmeans samples k distinct documents as initial centers, so it needs
    # at least k of them.
    if k < 1 or len(args) < k:
        usage()

    # vocab maps each distinct word to a column index, assigned in order of
    # first appearance.
    vocab = {}
    xs = []
    for a in args:
        x = defaultdict(float)
        with open(a) as f:
            for w in re.findall(r"\w+", f.read()):
                vocab.setdefault(w, len(vocab))
                x[vocab[w]] += 1
        # Materialize so the vector is a plain list of (index, count) pairs.
        xs.append(list(x.items()))

    cluster_ind = kmeans(k, xs, len(vocab))
    clusters = [set() for _ in range(k)]
    for i, j in enumerate(cluster_ind):
        clusters[j].add(i)

    def cleanName(string):
        # './data/name.txt' -> 'name'
        return os.path.splitext(os.path.basename(string))[0]

    collection = getClusterCollection()
    # Replace whatever a previous run stored.
    collection.drop()
    for j, c in enumerate(clusters):
        print("cluster %d:" % j)
        pages = []
        for i in c:
            print("\t%s" % args[i])
            pages.append(cleanName(args[i]))
        # Build a real list: a lazy map object is not a storable value.
        doc = {'cluster': j, 'pages': pages}
        # NOTE(review): presumably a PyMongo collection; `insert` is
        # deprecated in PyMongo 3 and removed in 4 -- switch to
        # `insert_one` once the driver version is confirmed.
        collection.insert(doc)