'''
simhash_tests.py

Standalone simhash testing, so it can run without taking down Chrome or the
IPython notebook.
'''
import glob
import json
import time
import urlparse

from simhash import Simhash, SimhashIndex
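# input: one JSON record per URL, each carrying a source_url and an identity
# block (the paths suggest an identification dump from a Solr snapshot)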
files = glob.glob('testdata/solr_20150320/identify_20150325_p/*.json')
def parse_url(url):
    '''Split a URL into its base (scheme through params) and its raw query.'''
    parsed = urlparse.urlparse(url)
    base_url = urlparse.urlunparse((
        parsed.scheme,
        parsed.netloc,
        parsed.path,
        parsed.params,
        None,
        None
    ))
    # return base_url, urlparse.parse_qs(parsed.query)
    return base_url, parsed.query
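# NB: parse_url is never called below; the simhashes are computed over the
# full URL string, query and all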
urls = []
for f in files:
    with open(f, 'r') as g:
        data = json.loads(g.read())
    url = data['source_url']
    identity = data['identity']
    # let's just ignore anything not identified (we got plenty)
    if not identity['protocol']:
        continue
    protocol = identity['protocol']
    urls.append((url, protocol))
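# urls now holds (url, protocol) pairs; only the url half is used from here on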
# let's generate the simhash singletons
# and generate the index (this cannot be performant)
test_index = [(u[0], Simhash(u[0])) for u in urls]
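# k is the simhash tolerance: fingerprints differing in at most k bits count
# as near-duplicates, so k=10 over the default 64-bit hashes is fairly loose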
# simhash_results_a.txt : k=20 (subset)
# simhash_results_b.txt
# truncate any previous results; each URL's duplicates are appended below
with open('testdata/solr_20150320/simhash_results_k10.txt', 'w') as f:
    f.write('')
start_time = time.time()
for index, (test_url, test_simhash) in enumerate(test_index):
    i_start_time = time.time()
    if index % 50 == 0:
        print 'completed {0} of {1}'.format(index, len(urls))
    duplicates = []
    # build the index in 300-entry chunks and query each chunk for
    # near-duplicates, instead of holding one giant index in memory
    for i in xrange(0, len(test_index), 300):
        chunk_index = SimhashIndex(test_index[i:i + 300], k=10)
        dupes = chunk_index.get_near_dups(test_simhash)
        if len(dupes) > 0:
            duplicates += dupes
    # note: each URL is itself indexed, so it appears in its own duplicates
    print '\t{0} takes {1}'.format(len(duplicates), time.time() - i_start_time)
    with open('testdata/solr_20150320/simhash_results_k10.txt', 'a') as f:
        f.write(json.dumps({test_url: duplicates}) + '\n')
print 'takes:', time.time() - start_time