Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions week1/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,8 @@ def main(source_dir: str, file_glob: str, index_name: str, workers: int, host: s
client = get_opensearch(host)

#TODO: set the refresh interval
client.indices.put_settings(index=index_name, body={'index': {'refresh_interval': refresh_interval}})

logger.debug(client.indices.get_settings(index=index_name))
start = perf_counter()
time_indexing = 0
Expand All @@ -228,6 +230,8 @@ def main(source_dir: str, file_glob: str, index_name: str, workers: int, host: s
finish = perf_counter()
logger.info(f'Done. {docs_indexed} were indexed in {(finish - start)/60} minutes. Total accumulated time spent in `bulk` indexing: {time_indexing/60} minutes')
# TODO set refresh interval back to 5s
client.indices.put_settings(index=index_name, body={'index': {'refresh_interval': '5s'}})

logger.debug(client.indices.get_settings(index=index_name))

if __name__ == "__main__":
Expand Down
59 changes: 59 additions & 0 deletions week1/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,65 @@ def create_query(user_query, filters=None, sort="_score", sortDir="desc", size=1
query_obj["_source"] = source
return query_obj

def create_query_week1(user_query, filters=None, sort="_score", sortDir="desc", size=10, source=None):
    """Build an OpenSearch query body for the week-1 hand-tuned product query.

    The query is a ``function_score`` wrapping a ``bool`` with two ``should``
    clauses (at least one must match):
      * a low-boost fuzzy ``match`` on ``name`` (prefix_length=2 because short
        words are often acronyms or usually not misspelled, so don't edit them),
      * a phrase-ish ``multi_match`` over ``name^10`` and ``shortDescription^5``
        with slop 6 and ``minimum_should_match`` of "2<75%".

    Args:
        user_query: The raw user query string. ``"*"`` or ``"#"`` is treated as
            a match-all request and replaces the whole query clause.
        filters: Optional filter clause(s) placed in the bool ``filter`` context.
            NOTE(review): when None this still emits ``"filter": null`` in the
            body, matching the original behavior — verify OpenSearch tolerates it.
        sort: Field to sort on (default relevance via ``_score``).
        sortDir: Sort direction, ``"asc"`` or ``"desc"``.
        size: Number of hits to return.
        source: Optional list of source fields to retrieve; when None the
            ``_source`` key is omitted and OpenSearch returns all fields.

    Returns:
        dict: A JSON-serializable OpenSearch request body.
    """
    query_obj = {
        'size': size,
        "sort": [
            {sort: {"order": sortDir}}
        ],
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "must": [

                        ],
                        "should": [
                            {
                                "match": {
                                    "name": {
                                        "query": user_query,
                                        "fuzziness": "0",
                                        "fuzzy_transpositions": False,
                                        "prefix_length": 2,
                                        # short words are often acronyms or usually not misspelled, so don't edit
                                        "boost": 0.01
                                    }
                                }
                            },
                            {
                                "multi_match": {
                                    "query": user_query,
                                    "type": "phrase",
                                    "slop": "6",
                                    "minimum_should_match": "2<75%",
                                    "fields": ["name^10", "shortDescription^5"]
                                }
                            }

                        ],
                        "minimum_should_match": 1,
                        "filter": filters
                    }
                },
                "boost_mode": "multiply",  # how _score and functions are combined
                "score_mode": "sum",  # how functions are combined
                "functions": [
                ]

            }
        }
    }
    # "*" / "#" act as match-all sentinels; a plain dict assignment cannot
    # fail, so no try/except is needed (the original's bare `except:` only
    # masked bugs).
    if user_query in ("*", "#"):
        query_obj["query"] = {"match_all": {}}
    if source is not None:  # otherwise use the default and retrieve all source
        query_obj["_source"] = source
    return query_obj


def search(client, user_query, index="bbuy_products"):
query_obj = create_query(user_query)
Expand Down
104 changes: 104 additions & 0 deletions week1/results.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
---- week1, running in gitpod.

export BBUY_DATA=/workspace/datasets/product_data/products

curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products.json

python index.py -s /workspace/datasets/product_data/products
INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch.
INFO:Done. 1275077 were indexed in 9.922109846950237 minutes. Total accumulated time spent in `bulk` indexing: 30.307174058161035 minutes


curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products

curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products
INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch.
INFO:Done. 1275077 were indexed in 8.406999807966834 minutes. Total accumulated time spent in `bulk` indexing: 23.900626916195325 minutes


-- refresh-intervals:

-1:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --refresh_interval -1
INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch.
INFO:Done. 1275077 were indexed in 8.115769755166548 minutes. Total accumulated time spent in `bulk` indexing: 23.873669005803823 minutes

1:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --refresh_interval 1s
INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of 1s to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch.
INFO:Done. 1275077 were indexed in 8.852590531449824 minutes. Total accumulated time spent in `bulk` indexing: 24.022950780299045 minutes

60:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --refresh_interval 60s
INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of 60s to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch.
INFO:Done. 1275077 were indexed in 7.781203374333563 minutes. Total accumulated time spent in `bulk` indexing: 22.726982587378007 minutes

-- Batch sizes:

400:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --batch_size 400
INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 400 per batch.
INFO:Done. 1275077 were indexed in 8.35581113601705 minutes. Total accumulated time spent in `bulk` indexing: 22.256467518976812 minutes


800:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --batch_size 800
INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 800 per batch.
INFO:Done. 1275077 were indexed in 7.519638787550018 minutes. Total accumulated time spent in `bulk` indexing: 20.513053392077563 minutes


1600:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --batch_size 1600
INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 8 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 1600 per batch.
INFO:Done. 1275077 were indexed in 9.076533603150164 minutes. Total accumulated time spent in `bulk` indexing: 24.444320515682435 minutes

-- Workers:

16:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --workers 16
INFO:Indexing /workspace/datasets/product_data/products to bbuy_products with 16 workers, refresh_interval of -1 to host localhost with a maximum number of docs sent per file per worker of 200000 and 200 per batch.
INFO:Done. 1275077 were indexed in 11.619162508367056 minutes. Total accumulated time spent in `bulk` indexing: 72.77314870621825 minutes

32:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --workers 32

64:
curl -k -X DELETE -u admin:admin https://localhost:9200/bbuy_products
curl -k -X PUT -u admin:admin "https://localhost:9200/bbuy_products" -H 'Content-Type: application/json' -d @bbuy_products_no_map.json
python index.py -s /workspace/datasets/product_data/products --workers 64



-- Query performance
export QUERY_FILE=/workspace/datasets/train.csv

python query.py --query_file /workspace/datasets/train.csv --max_queries 1000

INFO:Loading query file from /workspace/datasets/train.csv
INFO:Running queries, checking in every 1000 queries:
INFO:Query: Bad teacher has 10 hits.
INFO:Finished running 1000 queries in 0.422766538283516 minutes


INFO:Loading query file from /workspace/datasets/train.csv
INFO:Running queries, checking in every 1000 queries:
INFO:Query: Bad teacher has 10 hits.
INFO:Finished running 1000 queries in 0.16969594265004465 minutes