Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion configurations/app_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ ELASTIC_PASSWORD: elasticsearch_user_password
BASE_FOLDER: /etc/searchengine/
DEFAULT_DATASOURCE: default_datasource
AUTOMATIC_REFRESH: True
NOT_INDEX_VECTOR: True
BASE_URL: http://127.0.0.1:5577/api/v1/resources/
NOT_INDEX_VECTOR: True
ALLOWED_ASYNCHRONIZED_PROCESS: 8
REDIS_URL: 127.0.0.1
REDIS_PORT: 6379
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
A search engine endpoint enables the user to perform a semantic search, example of query text is: 'Provide me with images related to cancer'.
---
tags:
- semantic search
parameters:
- name: query_text
in: query
type: string
required: True
      description: Query term to be used in the search.

responses:
200:
description: A JSON containing the search results
examples:
results: []
12 changes: 12 additions & 0 deletions omero_search_engine/api/v1/resources/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,18 @@ def container_images():
return return_containers_images(data_source)


@resources.route("/semanticsearch/", methods=["GET"])
def semanticsearch():
    """
    file: swagger_docs/semanticsearch.yml
    """
    data_source = get_working_data_source(request.args.get("data_source"))
    # Bug fix: query_text is free search text, not a data-source name, so it
    # must NOT be passed through get_working_data_source(); read it directly.
    query_text = request.args.get("query_text")
    from utils import query_vector

    return jsonify(query_vector(data_source, query_text))


@resources.route("/<resource_table>/container_keys/", methods=["GET"])
def container_keys_search(resource_table):
"""
Expand Down
67 changes: 67 additions & 0 deletions omero_search_engine/api/v1/resources/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1727,6 +1727,73 @@ def delete_data_source_contents(data_source):
return found


def query_vector(data_source, query_text):
    """Run a semantic (vector) search against the buckets index.

    Encodes ``query_text`` with a sentence-transformer model, scores all
    documents in the ``key_value_buckets_information`` index by cosine
    similarity against their ``Attribute_value_vector`` field, and returns
    a list of ``{"item": ..., "url": ...}`` dicts describing the hits.

    :param data_source: currently unused in the query itself — hits from all
        data sources are returned (the data source of each hit is reported in
        the result text). TODO confirm whether results should be filtered.
    :param query_text: free-text query to embed and search with.
    :return: list of dicts with a human-readable ``item`` summary and a
        ``url`` pointing at the corresponding key/value search endpoint.
    :raises elasticsearch.exceptions.RequestError: if the search request is
        rejected by Elasticsearch (logged before re-raising).
    """
    from sentence_transformers import SentenceTransformer
    import datetime

    start_time = datetime.datetime.now()
    # Cache the model on the function object: loading it from disk dominates
    # request latency, so pay that cost only on the first call.
    model = getattr(query_vector, "_model", None)
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        query_vector._model = model
    start_time_lmodel_loaded = datetime.datetime.now()

    query_vector_q = model.encode(query_text).tolist()
    time_query_encoded = datetime.datetime.now()

    # script_score over match_all: every bucket document is ranked by cosine
    # similarity to the query embedding (+1.0 keeps scores non-negative).
    query_body = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.queryVector, "
                    "'Attribute_value_vector') + 1.0",
                    "params": {"queryVector": query_vector_q},
                },
            }
        }
    }

    es = search_omero_app.config.get("es_connector")
    es_index = "key_value_buckets_information"
    from elasticsearch.exceptions import RequestError

    try:
        response = es.search(index=es_index, body=query_body)
    except RequestError as e:
        # Log the rejected request, then re-raise so the caller sees the
        # failure rather than a silent empty result.
        search_omero_app.logger.error(
            "Elasticsearch request failed: %s" % e.info
        )
        raise

    time_back_from_search_engine = datetime.datetime.now()
    # Timing diagnostics: model load, query encoding, and search round trip.
    search_omero_app.logger.info(
        "semantic search timings: start=%s, model_loaded=%s, "
        "query_encoded=%s, search_done=%s"
        % (
            start_time,
            start_time_lmodel_loaded,
            time_query_encoded,
            time_back_from_search_engine,
        )
    )

    query_results = []
    base_url = search_omero_app.config.get("BASE_URL")
    for hit in response["hits"]["hits"]:
        query_results.append(
            {
                "item": "Number of %ss: %s"
                % (hit["_source"]["resource"], hit["_source"]["items_in_the_bucket"])
                + ", "
                + hit["_source"]["Attribute"]
                + " is "
                + hit["_source"]["Value"]
                + " in %s datasource" % hit["_source"]["data_source"]
                + ", score is: %s" % hit["_score"],
                "url": "%s%s/search/?key=%s&value=%s"
                % (
                    base_url,
                    hit["_source"]["resource"],
                    hit["_source"]["Attribute"],
                    hit["_source"]["Value"],
                ),
            }
        )
    return query_results


def write_bff(results, file_name=None, return_contents=False, save_parquer=True):
import pandas as pd

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
For example for the project, it combines project,
projectannotationlink and annotation_mapvalue.
"""
from omero_search_engine import search_omero_app

non_image_template = {
"settings": {
"analysis": {
Expand Down Expand Up @@ -194,7 +196,7 @@
},
}

key_value_buckets_info_template = {
key_value_buckets_info_template_v = {
"settings": {
"analysis": {
"normalizer": {
Expand Down Expand Up @@ -248,13 +250,94 @@
"total_buckets": {"type": "long"},
"total_items": {"type": "long"},
"total_items_in_saved_buckets": {"type": "long"},
"Attribute_vector": {
"type": "dense_vector",
"dims": 384,
},
"value_vector": {
"type": "dense_vector",
"dims": 384,
"index": True,
},
"Attribute_value_vector": {
"type": "dense_vector",
"dims": 384,
"index": True,
},
}
},
}

# Non-vector ("N") variant of the buckets-index mapping: identical to the
# vector variant except that it omits the dense_vector fields, for
# deployments that do not index embeddings.
key_value_buckets_info_template_N = {
    "settings": {
        "analysis": {
            # Lowercasing normalizer used by the *normalize keyword sub-fields
            # below so lookups are case-insensitive.
            "normalizer": {
                "valuesnormalizer": {"type": "custom", "filter": ["lowercase"]}
            }
        },
        # "number_of_replicas": 1
        # Replicate the index to every node in the cluster automatically.
        "index.auto_expand_replicas": "0-all",
    },
    "mappings": {
        "properties": {
            "doc_type": {"type": "keyword"},
            "id": {
                "type": "keyword",
            },
            "data_source": {
                "type": "text",
                "fields": {"keyvalue": {"type": "keyword"}},
            },
            # resource/Attribute/Value are indexed as text plus two keyword
            # sub-fields: an exact-match one and a lowercased-normalized one.
            "resource": {
                "type": "text",
                "fields": {
                    "keyresource": {"type": "keyword"},
                    "keyresourcenormalize": {
                        "type": "keyword",
                        "normalizer": "valuesnormalizer",
                    },
                },
            },
            "Attribute": {
                "type": "text",
                "fields": {
                    "keyname": {"type": "keyword"},
                    "keynamenormalize": {
                        "type": "keyword",
                        "normalizer": "valuesnormalizer",
                    },
                },
            },
            "Value": {
                "type": "text",
                "fields": {
                    "keyvalue": {"type": "keyword"},
                    "keyvaluenormalize": {
                        "type": "keyword",
                        "normalizer": "valuesnormalizer",
                    },
                },
            },
            # Bucket statistics maintained by the indexing code.
            "items_in_the_bucket": {"type": "long"},
            "total_buckets": {"type": "long"},
            "total_items": {"type": "long"},
            "total_items_in_saved_buckets": {"type": "long"},
        }
    },
}


# Select the buckets-index mapping at import time: the vector variant ("_v")
# adds dense_vector fields for semantic search, the plain variant ("_N")
# omits them.
# NOTE(review): the sample config defines NOT_INDEX_VECTOR, but this reads
# INDEX_VECTOR (which would then always be unset/falsy) — confirm the
# intended config key name.
if search_omero_app.config.get("INDEX_VECTOR"):
    key_value_buckets_info_template = key_value_buckets_info_template_v
else:
    key_value_buckets_info_template = key_value_buckets_info_template_N

"""
Template contains list of attributes for each resource"""


key_values_resource_cache_template = {
"settings": {
"index.auto_expand_replicas": "0-all"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,12 @@ def get_buckets(key, data_source, resource, es_index, lock=None):

def prepare_bucket_index_data(results, res_table, data_source, es_index):
data_to_be_inserted = []
if search_omero_app.config.get("INDEX_VECTOR"):
from sentence_transformers import SentenceTransformer

search_omero_app.logger.info("Please wait, loading the model")
model = SentenceTransformer("all-MiniLM-L6-v2")
search_omero_app.logger.info("Please wait, preparing the data")
for result in results.get("data"):
row = {}
data_to_be_inserted.append(row)
Expand All @@ -1088,6 +1094,12 @@ def prepare_bucket_index_data(results, res_table, data_source, es_index):
row["data_source"] = data_source
row["total_items_in_saved_buckets"] = results["total_number"]
row["total_items"] = results["total_number_of_%s" % res_table]
if search_omero_app.config.get("INDEX_VECTOR"):
row["Attribute_vector"] = model.encode(row["Attribute"])
row["value_vector"] = model.encode(row["Value"])
row["Attribute_value_vector"] = model.encode(
"%s is %s" % (row["Attribute"], row["Value"])
)
return data_to_be_inserted


Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ tzdata==2025.2
update==0.0.1
urllib3==2.5.0
watchdog==6.0.0
sentence-transformers==4.1.0
fastparquet==2024.11.0
Werkzeug==3.1.3
celery==5.5.3
Expand Down
Loading