-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathApplication.py
More file actions
73 lines (64 loc) · 2.97 KB
/
Application.py
File metadata and controls
73 lines (64 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from psycopg2.extras import RealDictCursor, DictCursor, register_hstore
from DBConnection import DBConnection
from ESConnection import ESConnection
from tqdm import tqdm
import time
import logging
from Helper import form_doc
import json
def build_placex_query(tags):
    """Return the SQL that selects the placex rows to be indexed.

    The query keeps rows whose ``name`` is not null and contains at least
    one of the keys in *tags* (``?|`` operator), ordered by ``rank_search``.

    Args:
        tags: iterable of key names, e.g. ``['name:en', 'name']``.

    Returns:
        The SQL statement as a single string.
    """
    # Tags are inlined as SQL string literals; double any embedded single
    # quote so an unusual tag name cannot break out of its literal.
    quoted_tags = ','.join("'" + tag.replace("'", "''") + "'" for tag in tags)
    return (
        "SELECT place_id, parent_place_id, name, address, country_code, importance, class, type, "
        "housenumber, postcode, rank_search, rank_address from placex where name is not null "
        "and name ?| ARRAY[" + quoted_tags + "] "
        "order by rank_search"
    )


def bulk_action_lines(index_name, doc):
    """Return the two newline-terminated JSON lines that bulk-index *doc*.

    The first line is the ``index`` action header naming *index_name*, the
    second line is the serialized document itself.
    """
    header = {"index": {"_index": index_name}}
    return json.dumps(header) + "\n" + json.dumps(doc) + "\n"


def main():
    """Rebuild the suggestion index from the placex table.

    Steps: configure logging, load the index mapping, recreate the
    elasticsearch index, stream the matching placex rows through a named
    cursor and bulk index them in batches.
    """
    # Setting the logging mode and output file
    logging.basicConfig(filename='output.log', level=logging.INFO)

    # The set of languages to be indexed. Modify this list to change language support
    # NOTE(review): 'sp' looks like it is meant to be Spanish, whose usual
    # language code is 'es' - confirm against the data before changing.
    languages = ['zh', 'sp', 'en', 'ar', 'fr', 'ru', 'pt', 'de', 'ja', 'ko']
    tags = ['name:' + code for code in languages]
    tags.append('name')
    # To add tags like house_name, use the format below
    # tags.append('house_name')

    # mapping.json contains the mapping to be used. It is read before the
    # old index is deleted so that a missing or invalid mapping file does
    # not leave us without any index at all.
    with open('mapping.json') as f:
        mapping = json.load(f)

    # Provide a user name with read access to Nominatim DB and corresponding password
    db_connection = DBConnection(user="nominatim_reader", password="")

    # Name of the elasticsearch index. This should be the same in search.py
    index_name = "nominatim_sugg"
    elasticsearch = ESConnection()
    elasticsearch.delete_index(index_name)
    elasticsearch.create_index(index_name, mapping)

    # This query fetches all required columns from placex
    #   * where name is not null
    #   * at least one of the required tags exists
    # and orders according to rank_search
    separator = "================================================================"
    logging.debug(separator)
    logging.debug(separator)
    sql = build_placex_query(tags)
    logging.debug(sql + "\n")

    cursor = db_connection.connection.cursor(cursor_factory=RealDictCursor, name='mycursor')
    try:
        cursor.execute(sql)

        # bucket indicates the number of docs to bulk index in one request
        bucket = 10000
        # Pending bulk lines are collected in a list and joined once per
        # batch instead of growing one string with repeated "+=".
        pending = []
        start_time = time.time()
        for record in tqdm(cursor):
            # Forms doc with all the necessary fields
            doc = form_doc(db_connection.connection, record, tags, index_name)
            # We form a string in required format for elasticsearch indexing
            pending.append(bulk_action_lines(index_name, doc))
            # We collected bucket number of docs. Now we bulk index them
            if len(pending) >= bucket:
                elasticsearch.bulk_index(index_name=index_name, body=''.join(pending))
                pending = []

        # For indexing the overflow docs
        if pending:
            elasticsearch.bulk_index(index_name=index_name, body=''.join(pending))
    finally:
        # Release the cursor even when indexing fails part-way.
        cursor.close()

    # Logged at INFO: the logger is configured at INFO level above, so a
    # DEBUG message here would never reach output.log.
    logging.info('Total time = %s', time.time() - start_time)


if __name__ == "__main__":
    main()