Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion configurations/app_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ ELASTIC_PASSWORD: elasticsearch_user_password
BASE_FOLDER: /etc/searchengine/
DEFAULT_DATASOURCE: default_datasource
AUTOMATIC_REFRESH: True
NOT_INDEX_VECTOR: True
BASE_URL: http://127.0.0.1:5577/api/v1/resources/
NOT_INDEX_VECTOR: True
ALLOWED_ASYNCHRONIZED_PROCESS: 8
REDIS_URL: 127.0.0.1
REDIS_PORT: 6379
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
A search engine endpoint enables the user to perform a semantic search, example of query text is: 'Provide me with images related to cancer'.
---
tags:
- semantic search
parameters:
- name: query_text
in: query
type: string
required: True
      description: Query term to be used in the search.

responses:
200:
description: A JSON containing the search results
examples:
results: []
12 changes: 12 additions & 0 deletions omero_search_engine/api/v1/resources/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,18 @@ def container_images():
return return_containers_images(data_source)


@resources.route("/semanticsearch/", methods=["GET"])
def semanticsearch():
    """
    file: swagger_docs/semanticsearch.yml
    """
    data_source = get_working_data_source(request.args.get("data_source"))
    # Bug fix: query_text is free search text, not a data-source name, so it
    # must NOT be passed through get_working_data_source(); read it directly.
    query_text = request.args.get("query_text")
    from utils import query_vector

    return jsonify(query_vector(data_source, query_text))


@resources.route("/<resource_table>/container_keys/", methods=["GET"])
def container_keys_search(resource_table):
"""
Expand Down
67 changes: 67 additions & 0 deletions omero_search_engine/api/v1/resources/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1727,6 +1727,73 @@ def delete_data_source_contents(data_source):
return found


def query_vector(data_source, query_text):
    """Run a semantic (vector) search against the buckets index.

    Encodes ``query_text`` with a sentence-transformer model, scores all
    documents in the ``key_value_buckets_information`` index by cosine
    similarity against their ``Attribute_value_vector`` field, and returns
    a list of ``{"item": ..., "url": ...}`` dicts describing the hits.

    :param data_source: currently unused in the query itself — hits from all
        data sources are returned (the data source of each hit is reported in
        the result text). TODO confirm whether results should be filtered.
    :param query_text: free-text query to embed and search with.
    :return: list of dicts with a human-readable ``item`` summary and a
        ``url`` pointing at the corresponding key/value search endpoint.
    :raises elasticsearch.exceptions.RequestError: if the search request is
        rejected by Elasticsearch (logged before re-raising).
    """
    from sentence_transformers import SentenceTransformer
    import datetime

    start_time = datetime.datetime.now()
    # Cache the model on the function object: loading it from disk dominates
    # request latency, so pay that cost only on the first call.
    model = getattr(query_vector, "_model", None)
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        query_vector._model = model
    start_time_lmodel_loaded = datetime.datetime.now()

    query_vector_q = model.encode(query_text).tolist()
    time_query_encoded = datetime.datetime.now()

    # script_score over match_all: every bucket document is ranked by cosine
    # similarity to the query embedding (+1.0 keeps scores non-negative).
    query_body = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.queryVector, "
                    "'Attribute_value_vector') + 1.0",
                    "params": {"queryVector": query_vector_q},
                },
            }
        }
    }

    es = search_omero_app.config.get("es_connector")
    es_index = "key_value_buckets_information"
    from elasticsearch.exceptions import RequestError

    try:
        response = es.search(index=es_index, body=query_body)
    except RequestError as e:
        # Log the rejected request, then re-raise so the caller sees the
        # failure rather than a silent empty result.
        search_omero_app.logger.error(
            "Elasticsearch request failed: %s" % e.info
        )
        raise

    time_back_from_search_engine = datetime.datetime.now()
    # Timing diagnostics: model load, query encoding, and search round trip.
    search_omero_app.logger.info(
        "semantic search timings: start=%s, model_loaded=%s, "
        "query_encoded=%s, search_done=%s"
        % (
            start_time,
            start_time_lmodel_loaded,
            time_query_encoded,
            time_back_from_search_engine,
        )
    )

    query_results = []
    base_url = search_omero_app.config.get("BASE_URL")
    for hit in response["hits"]["hits"]:
        query_results.append(
            {
                "item": "Number of %ss: %s"
                % (hit["_source"]["resource"], hit["_source"]["items_in_the_bucket"])
                + ", "
                + hit["_source"]["Attribute"]
                + " is "
                + hit["_source"]["Value"]
                + " in %s datasource" % hit["_source"]["data_source"]
                + ", score is: %s" % hit["_score"],
                "url": "%s%s/search/?key=%s&value=%s"
                % (
                    base_url,
                    hit["_source"]["resource"],
                    hit["_source"]["Attribute"],
                    hit["_source"]["Value"],
                ),
            }
        )
    return query_results


def write_bff(results, file_name=None, return_contents=False, save_parquer=True):
import pandas as pd

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
For example for the project, it combines project,
projectannotationlink and annotation_mapvalue.
"""
from omero_search_engine import search_omero_app

non_image_template = {
"settings": {
"analysis": {
Expand Down Expand Up @@ -194,7 +196,7 @@
},
}

key_value_buckets_info_template = {
key_value_buckets_info_template_v = {
"settings": {
"analysis": {
"normalizer": {
Expand Down Expand Up @@ -248,13 +250,94 @@
"total_buckets": {"type": "long"},
"total_items": {"type": "long"},
"total_items_in_saved_buckets": {"type": "long"},
"Attribute_vector": {
"type": "dense_vector",
"dims": 384,
},
"value_vector": {
"type": "dense_vector",
"dims": 384,
"index": True,
},
"Attribute_value_vector": {
"type": "dense_vector",
"dims": 384,
"index": True,
},
}
},
}

# Non-vector ("N") variant of the buckets-index mapping: identical to the
# vector variant except that it omits the dense_vector fields, for
# deployments that do not index embeddings.
key_value_buckets_info_template_N = {
    "settings": {
        "analysis": {
            # Lowercasing normalizer used by the *normalize keyword sub-fields
            # below so lookups are case-insensitive.
            "normalizer": {
                "valuesnormalizer": {"type": "custom", "filter": ["lowercase"]}
            }
        },
        # "number_of_replicas": 1
        # Replicate the index to every node in the cluster automatically.
        "index.auto_expand_replicas": "0-all",
    },
    "mappings": {
        "properties": {
            "doc_type": {"type": "keyword"},
            "id": {
                "type": "keyword",
            },
            "data_source": {
                "type": "text",
                "fields": {"keyvalue": {"type": "keyword"}},
            },
            # resource/Attribute/Value are indexed as text plus two keyword
            # sub-fields: an exact-match one and a lowercased-normalized one.
            "resource": {
                "type": "text",
                "fields": {
                    "keyresource": {"type": "keyword"},
                    "keyresourcenormalize": {
                        "type": "keyword",
                        "normalizer": "valuesnormalizer",
                    },
                },
            },
            "Attribute": {
                "type": "text",
                "fields": {
                    "keyname": {"type": "keyword"},
                    "keynamenormalize": {
                        "type": "keyword",
                        "normalizer": "valuesnormalizer",
                    },
                },
            },
            "Value": {
                "type": "text",
                "fields": {
                    "keyvalue": {"type": "keyword"},
                    "keyvaluenormalize": {
                        "type": "keyword",
                        "normalizer": "valuesnormalizer",
                    },
                },
            },
            # Bucket statistics maintained by the indexing code.
            "items_in_the_bucket": {"type": "long"},
            "total_buckets": {"type": "long"},
            "total_items": {"type": "long"},
            "total_items_in_saved_buckets": {"type": "long"},
        }
    },
}


# Select the buckets-index mapping at import time: the vector variant ("_v")
# adds dense_vector fields for semantic search, the plain variant ("_N")
# omits them.
# NOTE(review): the sample config defines NOT_INDEX_VECTOR, but this reads
# INDEX_VECTOR (which would then always be unset/falsy) — confirm the
# intended config key name.
if search_omero_app.config.get("INDEX_VECTOR"):
    key_value_buckets_info_template = key_value_buckets_info_template_v
else:
    key_value_buckets_info_template = key_value_buckets_info_template_N

"""
Template contains list of attributes for each resource"""


key_values_resource_cache_template = {
"settings": {
"index.auto_expand_replicas": "0-all"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,12 @@ def get_buckets(key, data_source, resource, es_index, lock=None):

def prepare_bucket_index_data(results, res_table, data_source, es_index):
data_to_be_inserted = []
if search_omero_app.config.get("INDEX_VECTOR"):
from sentence_transformers import SentenceTransformer

search_omero_app.logger.info("Please wait, loading the model")
model = SentenceTransformer("all-MiniLM-L6-v2")
search_omero_app.logger.info("Please wait, preparing the data")
for result in results.get("data"):
row = {}
data_to_be_inserted.append(row)
Expand All @@ -1088,6 +1094,12 @@ def prepare_bucket_index_data(results, res_table, data_source, es_index):
row["data_source"] = data_source
row["total_items_in_saved_buckets"] = results["total_number"]
row["total_items"] = results["total_number_of_%s" % res_table]
if search_omero_app.config.get("INDEX_VECTOR"):
row["Attribute_vector"] = model.encode(row["Attribute"])
row["value_vector"] = model.encode(row["Value"])
row["Attribute_value_vector"] = model.encode(
"%s is %s" % (row["Attribute"], row["Value"])
)
return data_to_be_inserted


Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ tzdata==2025.2
update==0.0.1
urllib3==2.5.0
watchdog==6.0.0
sentence-transformers==4.1.0
fastparquet==2024.11.0
Werkzeug==3.1.3
celery==5.5.3
Expand Down
Loading