Commit 20c97ee

SirDegraf and JWittmeyer authored
Label studio import (#80)
* Adds extended functionality of upload tasks for label studio import
* Renames table field user_mappings to mappings
* Adds simple analysis of the imported label studio file
* Adds endpoint for setting mapping and uses the existing update structure for integrating websocket updates (e.g. into upload task updating)
* Adds check for state change
* Adds logic for preparing task_names
* Adds task selection fix
* Adds project cleans action
* Adds logic for import converter
* Changes task collection to work with multiple results
* Makes changes to converter logic
* Adds handling of IGNORE and adds task_id to file name
* Calls import logic to import the converted file
* Adds check to label studio file import
* Submodule change
* Changes from error to warning
* Removes MANUAL from task name
* Removes tempfile
* Adds check for project update kern id
* Adds dedicated label studio import
* Major fixes and dedicated update logic
* Minor code style changes
* Adds filter for IGNORE value in label update
* Enables tokenization in label studio upload
* Resolves PR comments
* Updates submodule mode
* Adds error handling with db rollback

Co-authored-by: JWittmeyer <[email protected]>
Co-authored-by: root <[email protected]>
1 parent a783c8f commit 20c97ee

File tree

18 files changed: +800 -41 lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 .DS_Store
 .idea
-tmpfile.
+tmpfile.*
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
New Alembic migration (revision 09311360f8b9)

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+"""Adds upload task fields for label studio
+
+Revision ID: 09311360f8b9
+Revises: 87f463aa5112
+Create Date: 2022-11-07 10:32:10.881495
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '09311360f8b9'
+down_revision = '87f463aa5112'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('upload_task', sa.Column('upload_type', sa.String(), nullable=True))
+    op.add_column('upload_task', sa.Column('file_additional_info', sa.String(), nullable=True))
+    op.add_column('upload_task', sa.Column('mappings', sa.String(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('upload_task', 'mappings')
+    op.drop_column('upload_task', 'file_additional_info')
+    op.drop_column('upload_task', 'upload_type')
+    # ### end Alembic commands ###
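The three new columns are plain string columns, so the structured analysis result is serialized before it is written. A minimal sketch of the kind of value that ends up in upload_task.file_additional_info after the preparation step further down (field names mirror __get_blank_file_additional_info in import_preperator.py; the concrete example values are assumptions):

    import json

    # Assumed example of the analysis summary stored in the new
    # upload_task.file_additional_info column; the column is sa.String,
    # so the dict is dumped to a JSON string rather than a JSON type.
    file_additional_info = {
        "user_ids": [1],
        "tasks": ["sentiment"],
        "errors": [],
        "warnings": ["Label Studio drafts are not supported.\n\tExample: record 7"],
        "info": [],
        "file_info": {"records": 25, "annotations": {1: 25}},
    }
    dumped_info = json.dumps(file_additional_info)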

api/transfer.py

Lines changed: 19 additions & 6 deletions
@@ -1,13 +1,16 @@
 import logging
 import traceback
 
+import controller.transfer.labelstudio.import_preperator
 from controller import organization
 from starlette.endpoints import HTTPEndpoint
 from starlette.responses import PlainTextResponse, JSONResponse
+
+from controller.transfer.labelstudio import import_preperator
 from submodules.s3 import controller as s3
 from submodules.model.business_objects import organization
 
-from controller.transfer import manager as transfer_manager
+from controller.transfer import manager as transfer_manager, record_transfer_manager
 from controller.upload_task import manager as upload_task_manager
 from controller.auth import manager as auth_manager
 from controller.transfer import manager as transfer_manager
@@ -191,17 +194,27 @@ def get(self, request) -> JSONResponse:
 
 
 def init_file_import(task: UploadTask, project_id: str, is_global_update: bool) -> None:
+    task_state = task.state
     if "records" in task.file_type:
-        transfer_manager.import_records_from_file(project_id, task)
+        if task.upload_type == enums.UploadTypes.LABEL_STUDIO.value:
+            import_preperator.prepare_label_studio_import(project_id, task)
+        else:
+            transfer_manager.import_records_from_file(project_id, task)
     elif "project" in task.file_type:
         transfer_manager.import_project(project_id, task)
     elif "knowledge_base" in task.file_type:
        transfer_manager.import_knowledge_base(project_id, task)
 
-    notification.send_organization_update(
-        project_id, f"file_upload:{str(task.id)}:state:{task.state}", is_global_update
-    )
-    if task.file_type != "knowledge_base":
+    if task.state == task_state:
+        # update is sent in update task if it was updated (e.g. with labeling studio)
+        notification.send_organization_update(
+            project_id,
+            f"file_upload:{str(task.id)}:state:{task.state}",
+            is_global_update,
+        )
+    if (
+        task.file_type != "knowledge_base"
+    ):
         tokenization_service.request_tokenize_project(project_id, str(task.user_id))
 
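init_file_import now branches on task.upload_type and uses the captured task_state to skip the duplicate websocket notification when the Label Studio preparation already updated the task (update_task sends its own organization update in that case). The UploadTypes enum referenced here lives in the model submodule bumped by this commit and is not shown in the diff; a minimal sketch of what the member presumably looks like (the member name LABEL_STUDIO comes from the call site, everything else is an assumption):

    from enum import Enum

    class UploadTypes(Enum):
        # DEFAULT is an assumed placeholder for the regular record upload path
        # handled by transfer_manager; only LABEL_STUDIO appears in this diff.
        DEFAULT = "DEFAULT"
        LABEL_STUDIO = "LABEL_STUDIO"

task.upload_type is the plain string column added by the migration above, which is why the comparison goes through the enum's .value.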

controller/record_label_association/manager.py

Lines changed: 1 addition & 3 deletions
@@ -300,9 +300,7 @@ def delete_record_label_association(
         project_id, record_id, association_ids
     )
     task_ids = get_labeling_tasks_from_ids(project_id, association_ids)
-    record_label_association.delete_by_ids(
-        project_id, record_id, association_ids, with_commit=True
-    )
+    record_label_association.delete_by_ids(project_id, association_ids, record_id, with_commit=True)
     for task_id in task_ids:
         update_is_relevant_manual_label(project_id, task_id, record_id)
     general.commit()
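The call now passes the association ids before the record id, presumably to match a changed delete_by_ids signature in the model submodule updated alongside this commit; the signature itself is not part of this diff. A sketch of the assumed parameter order, with names inferred only from the call site above:

    from typing import List

    # Assumed shape of the business-object helper in the model submodule;
    # only the argument order used by the caller is actually shown in this commit.
    def delete_by_ids(
        project_id: str,
        association_ids: List[str],
        record_id: str,
        with_commit: bool = False,
    ) -> None:
        ...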

controller/transfer/knowledge_base_transfer_manager.py

Lines changed: 2 additions & 6 deletions
@@ -8,9 +8,7 @@
 
 
 def import_knowledge_base_file(project_id: str, task: UploadTask) -> None:
-    upload_task_manager.update_task(
-        project_id, task.id, state=enums.UploadStates.PENDING.value
-    )
+    upload_task_manager.update_task(project_id, task.id, state=enums.UploadStates.PENDING.value)
     general.commit()
 
     file_type = task.file_name.rsplit("_", 1)[0].rsplit(".", 1)[1]
@@ -45,9 +43,7 @@ def import_knowledge_base_file(project_id: str, task: UploadTask) -> None:
         project_id, list_id, to_add, with_commit=True
     )
 
-    upload_task_manager.update_task(
-        project_id, task.id, state=enums.UploadStates.IN_PROGRESS.value
-    )
+    upload_task_manager.update_task(project_id, task.id, state=enums.UploadStates.IN_PROGRESS.value)
     task.state = enums.UploadStates.DONE.value
     general.commit()
 
controller/transfer/labelstudio/import_preperator.py

Lines changed: 202 additions & 0 deletions

@@ -0,0 +1,202 @@
+import json
+import traceback
+import os
+
+from controller.transfer.record_transfer_manager import download_file
+from submodules.model import UploadTask, enums
+from submodules.model.business_objects import project, record
+from controller.upload_task import manager as task_manager
+from typing import Set, Dict, Any, Optional
+
+
+def prepare_label_studio_import(project_id: str, task: UploadTask) -> None:
+    # pre init to ensure we can always append an error
+    file_additional_info = __get_blank_file_additional_info()
+    project_item = project.get(project_id)
+    if not project_item:
+        file_additional_info["errors"].append("Can't find project")
+    try:
+        is_project_update = record.count(project_id) != 0
+        file_path = download_file(project_id, task)
+        _, extension = os.path.splitext(file_path)
+        if extension == ".json":
+            with open(file_path) as file:
+                data = json.load(file)
+            analyze_file(data, file_additional_info, is_project_update)
+        else:
+            file_additional_info["errors"].append(f"Unsupported file type {extension}")
+    except Exception as e:
+        file_additional_info["errors"].append(
+            "Error while analyzing file: {}".format(e)
+        )
+        print(traceback.format_exc(), flush=True)
+    dumped_info = json.dumps(file_additional_info)
+    task_manager.update_task(
+        project_id,
+        task.id,
+        state=enums.UploadStates.PREPARED.value,
+        file_additional_info=dumped_info,
+    )
+
+
+def analyze_file(
+    data: Dict[str, Any], file_additional_info: Dict[str, Any], is_project_update: bool
+) -> None:
+    user_id_counts = {}
+    tasks = set()
+    record_count = 0
+    ex_no_kern_id = None
+    ex_drafts = None
+    ex_predictions = None
+    ex_extraction = None
+    ex_multiple_choices = None
+    # multiple annotation for a user within the same record/task
+    ex_multiple_annotations = None
+
+    for record in data:
+        if type(record) is not dict:
+            file_additional_info["errors"].append("Import format not recognized")
+            break
+        record_count += 1
+        record_id = record["id"]
+        if not ex_drafts and __check_record_has_values_for(record, "drafts"):
+            ex_drafts = f"\n\tExample: record {record_id}"
+        if not ex_predictions and __check_record_has_values_for(record, "predictions"):
+            ex_predictions = f"\n\tExample: record {record_id}"
+        if not ex_multiple_annotations and __check_record_has_multi_annotation(record):
+            ex_multiple_annotations = f"\n\tExample: record {record_id}"
+        if (
+            is_project_update
+            and not ex_no_kern_id
+            and not __check_record_has_values_for(
+                record, "data", "kern_refinery_record_id"
+            )
+        ):
+            ex_no_kern_id = f"\n\tExample: record {record_id}"
+        for annotation in record["annotations"]:
+            annotation_id = annotation["id"]
+            if not ex_extraction and __check_annotation_has_extraction(annotation):
+                ex_extraction = (
+                    f"\n\tExample: record {record_id} - annotation {annotation_id}"
+                )
+            if not ex_multiple_choices and __check_annotation_has_multiclass(
+                annotation
+            ):
+                ex_multiple_choices = (
+                    f"\n\tExample: record {record_id} - annotation {annotation_id}"
+                )
+            user_id = annotation["completed_by"]
+            __add_annotation_target(annotation, tasks)
+
+            if user_id in user_id_counts:
+                user_id_counts[user_id] += 1
+            else:
+                user_id_counts[user_id] = 1
+
+    if ex_drafts:
+        file_additional_info["warnings"].append(
+            "Label Studio drafts are not supported." + ex_drafts
+        )
+
+    if ex_predictions:
+        file_additional_info["warnings"].append(
+            "Label Studio predictions are not supported." + ex_predictions
+        )
+
+    if ex_extraction:
+        file_additional_info["warnings"].append(
+            "Named Entity Recognition / extraction labels are not supported.\nThese annotations will be ignored if you proceed."
+            + ex_extraction
+        )
+    if ex_multiple_choices:
+        file_additional_info["warnings"].append(
+            "Multiple choices for a result set are not supported.\nThese annotations will be ignored if you proceed."
+            + ex_multiple_choices
+        )
+    if ex_multiple_annotations:
+        file_additional_info["errors"].append(
+            "Multiple annotations for the same user within the same record\ntargeting the same task are not supported."
+            + ex_multiple_annotations
+        )
+    if ex_no_kern_id:
+        file_additional_info["errors"].append(
+            "Project update without kern record id. Can't update project (see restrictions)."
+            + ex_no_kern_id
+        )
+
+    file_additional_info["user_ids"] = list(user_id_counts.keys())
+    file_additional_info["tasks"] = list(tasks)
+    file_additional_info["file_info"]["records"] = record_count
+    file_additional_info["file_info"]["annotations"] = user_id_counts
+
+
+def __add_annotation_target(annotation: Dict[str, Any], tasks: Set[str]) -> None:
+    tasks |= __get_annotation_targets(annotation)
+
+
+def __get_annotation_targets(annotation: Dict[str, Any]) -> Set[str]:
+    target = annotation.get("result")
+    if target and len(target) > 0:
+        return {t["from_name"] for t in target if "from_name" in t}
+    return set()
+
+
+def __check_record_has_values_for(
+    record: Dict[str, Any], key: str, sub_key: Optional[str] = None
+) -> bool:
+    value = record.get(key)
+    if value:
+        if not sub_key:
+            return True
+        else:
+            return __check_record_has_values_for(value, sub_key)
+    return False
+
+
+def __check_record_has_multi_annotation(record: Dict[str, Any]) -> bool:
+    annotations = record.get("annotations")
+    if not annotations or len(annotations) < 2:
+        return False
+    lookup = {}
+    for annotation in annotations:
+        user_id = annotation.get("completed_by")
+        if user_id not in lookup:
+            lookup[user_id] = {}
+        targets = __get_annotation_targets(annotation)
+        for target in targets:
+            if target not in lookup[user_id]:
+                lookup[user_id][target] = 1
+            else:
+                return True
+    return False
+
+
+def __check_annotation_has_extraction(annotation: Dict[str, Any]) -> bool:
+    results = annotation.get("result")
+    if not results:
+        return False
+    for result in results:
+        if result.get("type") != "choices":
+            return True
+    return False
+
+
+def __check_annotation_has_multiclass(annotation: Dict[str, Any]) -> bool:
+    results = annotation.get("result")
+    if not results:
+        return False
+    for result in results:
+        if result.get("type") == "choices" and len(result["value"]["choices"]) > 1:
+            return True
+    return False
+
+
+def __get_blank_file_additional_info() -> Dict[str, Any]:
+    return {
+        "user_ids": [],
+        "tasks": [],
+        "errors": [],
+        "warnings": [],
+        "info": [],
+        "file_info": {"records": 0, "annotations": {}},
+    }
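For reference, a minimal Label Studio export that analyze_file accepts; the field names follow what the checks above actually read (id, data, annotations, completed_by, result, from_name, type, value.choices), while the concrete label, task and id values are made up for illustration:

    example_export = [
        {
            "id": 1,
            # kern_refinery_record_id is only required when the project already
            # contains records (the project-update case guarded by ex_no_kern_id)
            "data": {"text": "Great product!", "kern_refinery_record_id": "42"},
            "annotations": [
                {
                    "id": 10,
                    "completed_by": 1,
                    "result": [
                        {
                            "from_name": "sentiment",  # collected as a task name
                            "type": "choices",  # anything else counts as extraction
                            "value": {"choices": ["positive"]},  # >1 choice triggers a warning
                        }
                    ],
                }
            ],
            "drafts": [],  # non-empty drafts only produce a warning
            "predictions": [],  # non-empty predictions only produce a warning
        }
    ]

With a file like this, prepare_label_studio_import records one record, one user id and one task name in file_additional_info and moves the upload task to the PREPARED state.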
