release updates (#82)

JWittmeyer · SirDegraf · JWittmeyer · web-flow · commit 3d8d098cd3c8 · 2022-11-21T15:02:28.000+01:00
* Adds tokenization to project creation by label studio

* Filters extraction tasks out of ls import preparation

* Adds cancel mutation for zero shot runs

* Checks if to_name has an attribute equivalent and defaults to full record task if not

* Removes data from minio on project delete

* Adds handling for missing or wrong typed date of models in model provider info

* Fixes record delete embedding issue

* label renaming findings

Co-authored-by: SirDeGraf <simon.degraf@icloud.com>
Co-authored-by: JWittmeyer <jens.wittmeyer@onetask.ai>
Co-authored-by: felix0496 <felix.kirsch@kern.ai>
diff --git a/controller/labeling_task_label/manager.py b/controller/labeling_task_label/manager.py
@@ -118,21 +118,59 @@ def __check_warnings_label_rename(
     project_id: str, label: LabelingTaskLabel, new_name: str
 ) -> List[Dict[str, Any]]:
     append_me = []
-
     information_sources = information_source.get_all(project_id)
+    task_type = labeling_task.get(project_id, label.labeling_task_id).task_type
+
+    old_var_name = label.name.replace(" ", "_")
+    new_var_name = new_name.replace(" ", "_")
+
     for information_source_item in information_sources:
+        old_highlighting, new_highlighting = [], []
         current_code = information_source_item.source_code
-        new_code = re.sub(r"\b%s\b" % label.name, new_name, current_code)
+        new_code = current_code
+
+        if task_type == LabelingTaskType.INFORMATION_EXTRACTION.value:
+            if re.search("import knowledge", new_code):
+                format_import = r"(?<=\bknowledge\.)({label_name})(?=\b)"
+                pattern_import = format_import.format(label_name=old_var_name)
+                old_highlighting.append(pattern_import)
+                new_highlighting.append(format_import.format(label_name=new_var_name))
+                new_code = re.sub(pattern_import, f"{new_var_name}", new_code)
+            if re.search(rf"(?<=from knowledge import).*?\b{old_var_name}\b", new_code):
+                format_relative_import = r"(?<!\['\"])(\b{label_name}\b)(?!['\"])"
+                pattern_relative_import = format_relative_import.format(
+                    label_name=old_var_name
+                )
+                old_highlighting.append(pattern_relative_import)
+                new_highlighting.append(
+                    format_relative_import.format(label_name=new_var_name)
+                )
+                new_code = re.sub(pattern_relative_import, f"{new_var_name}", new_code)
+
+        if information_source_item.labeling_task_id == label.labeling_task_id:
+            format_label = r"['\"]{label_name}['\"]"
+            pattern_label = format_label.format(label_name=label.name)
+            old_highlighting.append(pattern_label)
+            new_highlighting.append(format_label.format(label_name=new_name))
+            new_code = re.sub(pattern_label, f'"{new_name}"', new_code)
+
         if current_code != new_code:
             entry = __get_msg_dict(
-                "Information source with matching word was detected."
+                f"Matching label found in information source {information_source_item.name}."
             )
             entry["key"] = enums.CommentCategory.HEURISTIC.value
             entry["id"] = str(information_source_item.id)
+            entry["information_source_name"] = information_source_item.name
             entry["old"] = current_code
             entry["new"] = new_code
             entry["old_name"] = label.name
             entry["new_name"] = new_name
+            entry["old_highlighting"] = old_highlighting
+            entry["new_highlighting"] = new_highlighting
+            entry[
+                "href"
+            ] = f"/projects/{project_id}/information_sources/{information_source_item.id}"
+
             append_me.append(entry)
 
     return append_me
@@ -152,4 +190,4 @@ def __check_label_rename_knowledge_base(
             entry["msg"] += "\n\tNew label name however, already exists as lookup list."
             append_to["errors"].append(entry)
         else:
-            append_to["warnings"].append(entry)
+            append_to["warnings"].insert(0, entry)
diff --git a/controller/model_provider/manager.py b/controller/model_provider/manager.py
@@ -15,7 +15,14 @@ def get_model_provider_info() -> List[ModelProviderInfoResult]:
     # parse dates to datetime format
     for model in model_info:
         if model["date"]:
-            model["date"] = datetime.fromisoformat(model["date"])
+            try:
+                date = datetime.fromisoformat(model["date"])
+                if date:
+                    model["date"] = date
+            except ValueError:
+                pass
+            except TypeError:
+                pass
 
     return model_info
 
diff --git a/controller/project/manager.py b/controller/project/manager.py
@@ -81,7 +81,14 @@ def update_project(
 def delete_project(project_id: str) -> None:
+    """Delete the project and hand the cleanup of its stored objects to the daemon.
+
+    The project is deleted (with commit) first; afterwards
+    `__delete_project_data_from_minio` is passed to `daemon.run`
+    (presumably executed in a background thread -- TODO confirm).
+    """
+    # org_id is looked up before the delete -- presumably the lookup needs the
+    # project to still exist (TODO confirm).
     org_id = organization.get_id_by_project_id(project_id)
     project.delete_by_id(project_id, with_commit=True)
-    daemon.run(s3.archive_bucket, org_id, project_id + "/")
+
+    daemon.run(__delete_project_data_from_minio, org_id, project_id)
+
+
+def __delete_project_data_from_minio(org_id, project_id: str) -> None:
+    """Delete every object that `s3.get_bucket_objects` lists for the project.
+
+    Objects are looked up under the `<project_id>/` prefix of the
+    organization's bucket and removed one at a time.
+    """
+    project_prefix = project_id + "/"
+    for object_name in s3.get_bucket_objects(org_id, project_prefix):
+        s3.delete_object(org_id, object_name)
 
 
 def import_sample_project(user_id: str, organization_id: str, name: str) -> Project:
diff --git a/controller/record/manager.py b/controller/record/manager.py
@@ -2,10 +2,12 @@
 
 from graphql_api.types import ExtendedSearch
 from submodules.model import Record, Attribute
-from submodules.model.business_objects import general, record, user_session
+from submodules.model.business_objects import general, record, user_session, embedding
 from service.search import search
 
 from controller.record import neural_search_connector
+from controller.embedding import manager as embedding_manager
+from util import daemon
 
 
 def get_record(project_id: str, record_id: str) -> Record:
@@ -89,7 +91,14 @@ def get_records_by_extended_search(
 
 def delete_record(project_id: str, record_id: str) -> None:
+    """Delete a single record (with commit) and trigger an embedding re-upload.
+
+    After the delete, `__reupload_embeddings` is handed to `daemon.run` for
+    the whole project.
+    """
     record.delete(project_id, record_id, with_commit=True)
+    daemon.run(__reupload_embeddings, project_id)
 
 
 def delete_all_records(project_id: str) -> None:
+    """Delete all records of the project via `record.delete_all` (with commit)."""
+    # NOTE(review): unlike delete_record, no embedding re-upload is triggered
+    # here -- confirm this is intended.
     record.delete_all(project_id, with_commit=True)
+
+
+def __reupload_embeddings(project_id: str) -> None:
+    """Request a tensor upload for each finished embedding of the project."""
+    for finished_embedding in embedding.get_finished_embeddings(project_id):
+        embedding_id = str(finished_embedding.id)
+        embedding_manager.request_tensor_upload(project_id, embedding_id)
diff --git a/controller/transfer/labelstudio/import_preperator.py b/controller/transfer/labelstudio/import_preperator.py
@@ -50,6 +50,7 @@ def analyze_file(
     ex_predictions = None
     ex_extraction = None
     ex_multiple_choices = None
+    ex_to_names_check = None
     # multiple annotation for a user within the same record/task
     ex_multiple_annotations = None
 
@@ -65,6 +66,10 @@ def analyze_file(
             ex_predictions = f"\n\tExample: record {record_id}"
         if not ex_multiple_annotations and __check_record_has_multi_annotation(record):
             ex_multiple_annotations = f"\n\tExample: record {record_id}"
+        if not ex_to_names_check and __check_to_names_without_attribute_equivalent(
+            record
+        ):
+            ex_to_names_check = f"\n\tExample: record {record_id}"
         if (
             is_project_update
             and not ex_no_kern_id
@@ -108,6 +113,11 @@ def analyze_file(
             "Named Entity Recognition / extraction labels are not supported.\nThese annotations will be ignored if you proceed."
             + ex_extraction
         )
+    if ex_to_names_check:
+        file_additional_info["warnings"].append(
+            "Task targets found without equivalent in records attributes \nThese will be created as full record tasks if you proceed."
+            + ex_to_names_check
+        )
     if ex_multiple_choices:
         file_additional_info["warnings"].append(
             "Multiple choices for a result set are not supported.\nThese annotations will be ignored if you proceed."
@@ -130,17 +140,33 @@ def analyze_file(
     file_additional_info["file_info"]["annotations"] = user_id_counts
 
 
-def __add_annotation_target(annotation: Dict[str, Any], tasks: Set[str]) -> None:
+def __add_annotation_target(
+    annotation: Dict[str, Any], tasks: Set[str]
+) -> None:
+    """Add the annotation's targets (see `__get_annotation_targets`) to `tasks`.
+
+    `tasks` is updated in place; nothing is returned.
+    """
     tasks |= __get_annotation_targets(annotation)
 
 
 def __get_annotation_targets(annotation: Dict[str, Any]) -> Set[str]:
+    """Return the `from_name` values of the annotation's "choices" results.
+
+    Results of any other type, or without a `from_name`, are skipped. An
+    annotation with a missing or empty "result" yields an empty set.
+    """
     target = annotation.get("result")
     if target and len(target) > 0:
-        return {t["from_name"] for t in target if "from_name" in t}
+        return {
+            t["from_name"]
+            for t in target
+            if "from_name" in t and t.get("type") == "choices"
+        }
-    return {}
+    # `{}` is an empty dict, not a set: the caller merges the result into a
+    # set with `|=`, which raises TypeError for a dict operand.
+    return set()
 
 
+def __check_to_names_without_attribute_equivalent(
+    record: Dict[str, Any]
+) -> bool:
+    """Tell whether any "choices" result targets a name missing from the record data.
+
+    Every annotation of the record is inspected. A result counts only if it
+    has a `to_name` and is of type "choices". Missing or empty
+    "annotations", "result" or "data" entries are treated as empty.
+
+    Returns:
+        True if at least one such `to_name` is not a key of `record["data"]`,
+        otherwise False.
+    """
+    attribute_names = set(record.get("data") or {})
+    for annotation in record.get("annotations") or []:
+        for result in annotation.get("result") or []:
+            if (
+                "to_name" in result
+                and result.get("type") == "choices"
+                and result["to_name"] not in attribute_names
+            ):
+                return True
+    return False
+
+
 def __check_record_has_values_for(
     record: Dict[str, Any], key: str, sub_key: Optional[str] = None
 ) -> bool:
diff --git a/controller/transfer/labelstudio/project_creation_manager.py b/controller/transfer/labelstudio/project_creation_manager.py
@@ -23,6 +23,7 @@ def manage_data_import(project_id: str, task_id: str) -> None:
     task = upload_task.get(project_id, task_id)
     file_path = download_file(project_id, task)
     mappings = json.loads(task.mappings)
+    attribute_names = []
     user_mapping = mappings.get("users")
     user_mapping = create_unknown_users(user_mapping)
     attribute_task_mapping = mappings.get("tasks")
@@ -32,18 +33,17 @@ def manage_data_import(project_id: str, task_id: str) -> None:
     first_record_item = data[0]
     for attribute_name, attribute_value in first_record_item.get("data").items():
         __create_attribute(project_id, attribute_name, attribute_value)
+        attribute_names.append(attribute_name)
 
     labeling_tasks, records, record_label_associations = __extract_data(
-        data, user_mapping, attribute_task_mapping
+        data, user_mapping, attribute_task_mapping, attribute_names
     )
     label_id_lookup = __create_labeling_tasks(project_id, labeling_tasks)
 
     CHUNK_SIZE = 500
-    chunks = [records[x: x + CHUNK_SIZE] for x in range(0, len(records), CHUNK_SIZE)]
+    chunks = [records[x : x + CHUNK_SIZE] for x in range(0, len(records), CHUNK_SIZE)]
     for idx, chunk in enumerate(chunks):
-        __create_records(
-            project_id, chunk, record_label_associations, label_id_lookup
-        )
+        __create_records(project_id, chunk, record_label_associations, label_id_lookup)
     number_records = len(records)
 
     upload_task_manager.update_upload_task_to_finished(task)
@@ -87,7 +87,9 @@ def __create_records(
             )
 
 
-def __create_labeling_tasks(project_id: str, labeling_tasks: Dict[str, Any]) -> Dict[str, Any]:
+def __create_labeling_tasks(
+    project_id: str, labeling_tasks: Dict[str, Any]
+) -> Dict[str, Any]:
     label_id_lookup = {}
 
     attribute_ids_by_names = {
@@ -120,7 +122,12 @@ def __infer_target(target_attribute: str) -> str:
     )
 
 
-def __extract_data(data: Any, user_mapping: Dict[str, Any], attribute_task_mapping: Dict[str, Any]) -> Tuple[Dict[str, Any], List, Dict[str, Any]]:
+def __extract_data(
+    data: Any,
+    user_mapping: Dict[str, Any],
+    attribute_task_mapping: Dict[str, Any],
+    attribute_names: List[str],
+) -> Tuple[Dict[str, Any], List, Dict[str, Any]]:
     labeling_tasks = {}
     records = []
     record_label_associations = {}
@@ -160,6 +167,7 @@ def __extract_data(data: Any, user_mapping: Dict[str, Any], attribute_task_mappi
                 if (
                     attribute_task_mapping.get(task_name)
                     == enums.RecordImportMappingValues.ATTRIBUTE_SPECIFIC.value
+                    and result.get("to_name") in attribute_names
                 ):
                     labeling_tasks.get(task_name)["attribute"] = result.get("to_name")
 
diff --git a/controller/transfer/manager.py b/controller/transfer/manager.py
@@ -4,6 +4,8 @@
 import traceback
 from typing import Any, List, Optional, Dict
 import zipfile
+
+from controller.tokenization import tokenization_service
 from controller.transfer import export_parser
 from controller.transfer.knowledge_base_transfer_manager import (
     import_knowledge_base_file,
@@ -272,6 +274,8 @@ def import_label_studio_file(project_id: str, upload_task_id: str) -> None:
             project_update_manager.manage_data_import(project_id, upload_task_id)
         else:
             project_creation_manager.manage_data_import(project_id, upload_task_id)
+            task = upload_task.get(project_id, upload_task_id)
+            tokenization_service.request_tokenize_project(project_id, str(task.user_id))
         upload_task.update(project_id, upload_task_id, state=enums.UploadStates.DONE.value)
     except Exception:
         general.rollback()
diff --git a/controller/zero_shot/manager.py b/controller/zero_shot/manager.py
@@ -184,3 +184,18 @@ def __start_zero_shot_for_project(
             f"Can't calculate stats for zero shot project {project_id}, is {information_source_id}",
             flush=True,
         )
+
+
+def cancel_zero_shot_run(
+    project_id: str,
+    information_source_id: str,
+    payload_id: str,
+) -> None:
+    """Cancel a zero shot run by setting its payload state to FAILED.
+
+    Raises:
+        ValueError: if no payload is found for `payload_id`, or if the
+            payload's source id differs from `information_source_id`.
+    """
+    payload = information_source.get_payload(project_id, payload_id)
+    if not payload:
+        raise ValueError("unknown payload:" + payload_id)
+    owning_source_id = str(payload.source_id)
+    if owning_source_id != information_source_id:
+        raise ValueError("payload does not belong to information source")
+    # The FAILED state is expected to be noticed and handled by the thread in
+    # the zero shot service.
+    payload.state = enums.PayloadState.FAILED.value
+    general.commit()
diff --git a/graphql_api/mutation/zero_shot.py b/graphql_api/mutation/zero_shot.py
@@ -27,6 +27,29 @@ def mutate(
         return ZeroShotProject(ok=True)
 
 
+class CancelZeroShotRun(graphene.Mutation):
+    """GraphQL mutation that cancels a zero shot run of an information source."""
+
+    class Arguments:
+        project_id = graphene.ID(required=True)
+        information_source_id = graphene.ID(required=True)
+        payload_id = graphene.ID(required=True)
+
+    # Set to True once the cancel request has been processed.
+    ok = graphene.Boolean()
+
+    def mutate(
+        self,
+        info,
+        project_id: str,
+        information_source_id: str,
+        payload_id: str,
+    ):
+        """Check demo and project access, then cancel the run via the manager."""
+        auth_manager.check_demo_access(info)
+        auth_manager.check_project_access(info, project_id)
+
+        manager.cancel_zero_shot_run(project_id, information_source_id, payload_id)
+
+        # Return this mutation's own payload type rather than the one of the
+        # sibling ZeroShotProject mutation.
+        return CancelZeroShotRun(ok=True)
+
+
 class CreateZeroShotInformationSource(graphene.Mutation):
     class Arguments:
         project_id = graphene.ID(required=True)
@@ -60,3 +83,4 @@ def mutate(
 class ZeroShotMutation(graphene.ObjectType):
     zero_shot_project = ZeroShotProject.Field()
     create_zero_shot_information_source = CreateZeroShotInformationSource.Field()
+    cancel_zero_shot_run = CancelZeroShotRun.Field()