Skip to content

Commit fce237d

Browse files
Attribute calculation for new uploaded records (#123)
* Start on attribute calculation (ac) for new records
* Check if tokenization is running and doc-bin exists
* Tokenization for newly uploaded records
* Added notification when all attribute calculation is done
* Removed debug print
* Update embeddings when new records are uploaded
* Added enumerate for looping embeddings
* Embeddings and attribute calculation for newly uploaded records
* Moved position for creating embeddings
* Replaced import functions; removed unused imports
* Addressed PR comments; merged submodules
1 parent aee43d5 commit fce237d

File tree

6 files changed

+206
-20
lines changed

6 files changed

+206
-20
lines changed

api/transfer.py

Lines changed: 178 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,24 @@
11
import logging
22
import traceback
3+
import time
4+
from typing import Any, List
35

46
from controller import organization
7+
from controller.embedding import util as embedding_util
8+
from controller.embedding import connector as embedding_connector
59
from starlette.endpoints import HTTPEndpoint
610
from starlette.responses import PlainTextResponse, JSONResponse
711

812
from controller.transfer.labelstudio import import_preperator
13+
from submodules.model.business_objects.tokenization import is_doc_bin_creation_running
914
from submodules.s3 import controller as s3
10-
from submodules.model.business_objects import organization
15+
from submodules.model.business_objects import (
16+
attribute,
17+
embedding,
18+
general,
19+
organization,
20+
tokenization,
21+
)
1122

1223
from controller.transfer import manager as transfer_manager
1324
from controller.upload_task import manager as upload_task_manager
@@ -16,13 +27,13 @@
1627
from controller.transfer import association_transfer_manager
1728
from controller.auth import manager as auth
1829
from controller.project import manager as project_manager
30+
from controller.attribute import manager as attribute_manager
1931

2032
from submodules.model import enums, exceptions
2133
from util.notification import create_notification
22-
from submodules.model.enums import NotificationType
23-
from submodules.model.models import UploadTask
24-
from submodules.model.business_objects import general
25-
from util import notification
34+
from submodules.model.enums import AttributeState, NotificationType, UploadStates
35+
from submodules.model.models import Embedding, UploadTask
36+
from util import daemon, notification
2637
from controller.tokenization import tokenization_service
2738

2839
logging.basicConfig(level=logging.DEBUG)
@@ -221,6 +232,7 @@ def init_file_import(task: UploadTask, project_id: str, is_global_update: bool)
221232
import_preperator.prepare_label_studio_import(project_id, task)
222233
else:
223234
transfer_manager.import_records_from_file(project_id, task)
235+
calculate_missing_attributes(project_id, task.user_id)
224236
elif "project" in task.file_type:
225237
transfer_manager.import_project(project_id, task)
226238
elif "knowledge_base" in task.file_type:
@@ -234,7 +246,10 @@ def init_file_import(task: UploadTask, project_id: str, is_global_update: bool)
234246
is_global_update,
235247
)
236248
if task.file_type != "knowledge_base":
237-
tokenization_service.request_tokenize_project(project_id, str(task.user_id))
249+
only_usable_attributes = task.file_type == "records_add"
250+
tokenization_service.request_tokenize_project(
251+
project_id, str(task.user_id), True, only_usable_attributes
252+
)
238253

239254

240255
def file_import_error_handling(
@@ -258,3 +273,160 @@ def file_import_error_handling(
258273
notification.send_organization_update(
259274
project_id, f"file_upload:{str(task.id)}:state:{task.state}", is_global_update
260275
)
276+
277+
278+
def calculate_missing_attributes(project_id: str, user_id: str) -> None:
    """Kick off attribute recalculation for newly uploaded records.

    Delegates the actual work to a background daemon thread so the upload
    request is not blocked.
    """
    daemon.run(__calculate_missing_attributes, project_id, user_id)
284+
285+
286+
def __calculate_missing_attributes(project_id: str, user_id: str) -> None:
    """Recalculate user attributes after a record upload, then refresh embeddings.

    Waits for the tokenization service's doc-bin creation to finish, resets every
    USABLE attribute to INITIAL, recalculates them one by one (polling until each
    finishes tokenizing) and finally triggers the embedding-tensor recreation.
    Runs inside a daemon thread started by `calculate_missing_attributes`.
    """
    # wait a moment to ensure the process is started in the tokenization service
    time.sleep(5)
    ctx_token = general.get_ctx_token()
    attributes_usable = attribute.get_all_ordered(
        project_id,
        True,
        state_filter=[
            enums.AttributeState.USABLE.value,
        ],
    )
    if len(attributes_usable) == 0:
        # fix: release the session token on the early-return path (was leaked)
        general.remove_and_refresh_session(ctx_token, False)
        return
    # stored as plain list of ids so later session refreshes do not detach the rows
    attribute_ids = [str(att_usable.id) for att_usable in attributes_usable]
    for att_id in attribute_ids:
        attribute.update(project_id, att_id, state=enums.AttributeState.INITIAL.value)
    general.commit()
    notification.send_organization_update(
        project_id=project_id, message="calculate_attribute:started:all"
    )
    # first wait until project-wide tokenization (doc-bin creation) has completed
    i = 0
    while True:
        i += 1
        if i >= 60:
            i = 0
            # periodically refresh the db session so it does not time out
            ctx_token = general.remove_and_refresh_session(ctx_token, True)
        if tokenization.is_doc_bin_creation_running(project_id):
            time.sleep(5)
            continue
        else:
            break
    # next, ensure that the attributes are calculated and tokenized
    i = 0
    while True:
        time.sleep(1)
        i += 1
        if len(attribute_ids) == 0:
            notification.send_organization_update(
                project_id=project_id,
                message="calculate_attribute:finished:all",
            )
            break
        if i >= 60:
            i = 0
            ctx_token = general.remove_and_refresh_session(ctx_token, True)

        # process the attributes front-to-back; an id is only popped once its
        # calculation AND per-attribute tokenization are done
        current_att_id = attribute_ids[0]
        current_att = attribute.get(project_id, current_att_id)
        if current_att.state == enums.AttributeState.RUNNING.value:
            continue
        elif current_att.state == enums.AttributeState.INITIAL.value:
            attribute_manager.calculate_user_attribute_all_records(
                project_id, user_id, current_att_id, True
            )
        else:
            if tokenization.is_doc_bin_creation_running_for_attribute(
                project_id, current_att.name
            ):
                time.sleep(5)
                continue
            else:
                attribute_ids.pop(0)
                notification.send_organization_update(
                    project_id=project_id,
                    message=f"calculate_attribute:finished:{current_att_id}",
                )
        time.sleep(5)

    general.remove_and_refresh_session(ctx_token, False)
    calculate_missing_embedding_tensors(project_id, user_id)
358+
359+
360+
def calculate_missing_embedding_tensors(project_id: str, user_id: str) -> None:
    """Kick off embedding-tensor recreation for newly uploaded records.

    Delegates to a background daemon thread so the caller is not blocked.
    """
    daemon.run(__calculate_missing_embedding_tensors, project_id, user_id)
366+
367+
368+
def __calculate_missing_embedding_tensors(project_id: str, user_id: str) -> None:
    """Recreate all finished embeddings so tensors exist for newly uploaded records.

    Marks every finished embedding as WAITING, then deletes and re-requests each
    one via `__create_embeddings`. On failure, remaining WAITING embeddings are
    marked FAILED. Always notifies the organization and releases the db session.
    """
    ctx_token = general.get_ctx_token()
    embeddings = embedding.get_finished_embeddings_by_started_at(project_id)
    if len(embeddings) == 0:
        # fix: release the session token on the early-return path (was leaked)
        general.remove_and_refresh_session(ctx_token, False)
        return

    # stored as plain ids so session refreshes do not detach the rows
    embedding_ids = [str(embed.id) for embed in embeddings]
    for embed_id in embedding_ids:
        embedding.update_embedding_state_waiting(project_id, embed_id)
    general.commit()

    try:
        ctx_token = __create_embeddings(project_id, embedding_ids, user_id, ctx_token)
    except Exception as e:
        # fix: use the configured logging instead of print for error reporting
        logging.error(
            f"Error while recreating embeddings for {project_id} when new records are uploaded : {e}"
        )
        # anything still waiting did not finish — mark it failed
        get_waiting_embeddings = embedding.get_waiting_embeddings(project_id)
        for embed in get_waiting_embeddings:
            embedding.update_embedding_state_failed(project_id, str(embed.id))
        general.commit()
    finally:
        notification.send_organization_update(
            project_id=project_id, message="embedding:finished:all"
        )
        general.remove_and_refresh_session(ctx_token, False)
394+
395+
396+
def __create_embeddings(
    project_id: str,
    embedding_ids: List[str],
    user_id: str,
    ctx_token: Any,
) -> Any:
    """Delete and re-request each given embedding, waiting between requests.

    For every embedding id: deletes the existing embedding via the embedding
    service, derives the original config string from the embedding's name
    (by stripping the "<attribute>-classification-"/"<attribute>-extraction-"
    prefix), and requests a fresh attribute- or token-level embedding.
    Returns the (possibly refreshed) session context token.
    """
    notification.send_organization_update(
        project_id=project_id, message="embedding:started:all"
    )
    for embedding_id in embedding_ids:
        # refresh the db session per iteration so long-running work doesn't time out
        ctx_token = general.remove_and_refresh_session(ctx_token, request_new=True)
        embedding_item = embedding.get(project_id, embedding_id)
        if not embedding_item:
            continue

        embedding_connector.request_deleting_embedding(project_id, embedding_id)

        attribute_id = str(embedding_item.attribute_id)
        attribute_name = attribute.get(project_id, attribute_id).name
        # the config string is encoded in the embedding name after a type-specific prefix
        if embedding_item.type == enums.EmbeddingType.ON_ATTRIBUTE.value:
            prefix = f"{attribute_name}-classification-"
            config_string = embedding_item.name[len(prefix) :]
            embedding_connector.request_creating_attribute_level_embedding(
                project_id, attribute_id, user_id, config_string
            )
        else:
            prefix = f"{attribute_name}-extraction-"
            config_string = embedding_item.name[len(prefix) :]
            embedding_connector.request_creating_token_level_embedding(
                project_id, attribute_id, user_id, config_string
            )
        # give the embedding service time to register the new encoder
        time.sleep(5)
        while embedding_util.has_encoder_running(project_id):
            # NOTE(review): `embedding_item` was fetched before the delete/recreate
            # above, so its `state` may be stale here (likely still WAITING, which
            # would break out of the wait immediately) — confirm intended behavior
            if embedding_item.state == enums.EmbeddingState.WAITING.value:
                break
            time.sleep(1)
    return ctx_token

controller/attribute/manager.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import time
2-
from typing import List, Tuple
2+
from typing import List, Optional, Tuple
33
from controller.tokenization.tokenization_service import (
44
request_tokenize_calculated_attribute,
55
request_tokenize_project,
@@ -150,7 +150,7 @@ def add_running_id(
150150

151151

152152
def calculate_user_attribute_all_records(
153-
project_id: str, user_id: str, attribute_id: str
153+
project_id: str, user_id: str, attribute_id: str, include_rats: bool = True
154154
) -> None:
155155
if attribute.get_all(
156156
project_id=project_id, state_filter=[AttributeState.RUNNING.value]
@@ -185,7 +185,6 @@ def calculate_user_attribute_all_records(
185185
append_to_logs=False,
186186
)
187187
return
188-
189188
attribute.update(
190189
project_id=project_id,
191190
attribute_id=attribute_id,
@@ -201,15 +200,18 @@ def calculate_user_attribute_all_records(
201200
project_id,
202201
user_id,
203202
attribute_id,
203+
include_rats,
204204
)
205205

206206

207207
def __calculate_user_attribute_all_records(
208-
project_id: str, user_id: str, attribute_id: str
208+
project_id: str, user_id: str, attribute_id: str, include_rats: bool
209209
) -> None:
210210
try:
211211
calculated_attributes = util.run_attribute_calculation_exec_env(
212-
attribute_id=attribute_id, project_id=project_id, doc_bin="docbin_full"
212+
attribute_id=attribute_id,
213+
project_id=project_id,
214+
doc_bin="docbin_full",
213215
)
214216
if not calculated_attributes:
215217
__notify_attribute_calculation_failed(
@@ -258,7 +260,7 @@ def __calculate_user_attribute_all_records(
258260
)
259261
try:
260262
request_tokenize_calculated_attribute(
261-
project_id, user_id, attribute_item.id
263+
project_id, user_id, attribute_item.id, include_rats
262264
)
263265
except:
264266
record.delete_user_created_attribute(

controller/attribute/util.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def add_log_to_attribute_logs(
3232

3333

3434
def prepare_sample_records_doc_bin(attribute_id: str, project_id: str) -> str:
35-
3635
sample_records = record.get_attribute_calculation_sample_records(project_id)
3736

3837
sample_records_doc_bin = tokenization.get_doc_bin_table_to_json(
@@ -54,7 +53,6 @@ def prepare_sample_records_doc_bin(attribute_id: str, project_id: str) -> str:
5453
def run_attribute_calculation_exec_env(
5554
attribute_id: str, project_id: str, doc_bin: str
5655
) -> None:
57-
5856
attribute_item = attribute.get(project_id, attribute_id)
5957

6058
prefixed_function_name = f"{attribute_id}_fn"

controller/tokenization/tokenization_service.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,24 +15,32 @@ def request_tokenize_record(project_id: str, record_id: str) -> None:
1515
service_requests.post_call_or_raise(url, data)
1616

1717

def request_tokenize_project(
    project_id: str,
    user_id: str,
    include_rats: bool = True,
    only_uploaded_attributes: bool = False,
) -> None:
    """Ask the tokenization service to (re)tokenize an entire project.

    `include_rats` controls record-attribute-token-statistics creation;
    `only_uploaded_attributes` restricts tokenization to uploaded attributes.
    """
    payload = {
        "project_id": str(project_id),
        "record_id": "",
        "user_id": str(user_id),
        "include_rats": include_rats,
        "only_uploaded_attributes": only_uploaded_attributes,
    }
    service_requests.post_call_or_raise(f"{BASE_URI}/tokenize_project", payload)
2633

2734

2835
def request_tokenize_calculated_attribute(
29-
project_id: str, user_id: str, attribute_id: str
36+
project_id: str, user_id: str, attribute_id: str, include_rats: bool = True
3037
) -> None:
3138
url = f"{BASE_URI}/tokenize_calculated_attribute"
3239
data = {
3340
"project_id": str(project_id),
3441
"user_id": str(user_id),
3542
"attribute_id": str(attribute_id),
43+
"include_rats": include_rats,
3644
}
3745
service_requests.post_call_or_raise(url, data)
3846

controller/transfer/checks.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from controller.transfer.valid_arguments import valid_arguments
66
import pandas as pd
77
from util.notification import create_notification
8-
from submodules.model.enums import NotificationType
8+
from submodules.model.enums import AttributeState, NotificationType
99
from submodules.model.business_objects import attribute, record, general
1010
from controller.labeling_task.util import infer_labeling_task_name
1111
import logging
@@ -52,7 +52,6 @@ def run_checks(df: pd.DataFrame, project_id, user_id) -> None:
5252
duplicated_task_names = set()
5353
task_names_set = set()
5454
for task_name in task_names:
55-
5655
if task_name in task_names_set:
5756
duplicated_task_names.add(task_name)
5857
else:
@@ -69,9 +68,16 @@ def run_checks(df: pd.DataFrame, project_id, user_id) -> None:
6968
errors["DuplicatedTaskNames"] = notification.message
7069

7170
# check attribute equality
72-
attribute_entities = attribute.get_all(project_id)
71+
attribute_entities = attribute.get_all(
72+
project_id,
73+
state_filter=[
74+
AttributeState.UPLOADED.value,
75+
AttributeState.AUTOMATICALLY_CREATED.value,
76+
],
77+
)
7378
attribute_names = [attribute_item.name for attribute_item in attribute_entities]
7479
differences = set(attribute_names).difference(set(attributes))
80+
7581
if differences:
7682
guard = True
7783
notification = create_notification(

0 commit comments

Comments
 (0)