Skip to content

Commit c72cc90

Browse files
Cache Files for ETL Processing (#257)
* file caching draft * model * remove tokens * clean up project * update enum values, cancel update * alembic join * improve cancel * s3 * state change * model * alembic submodules * model * model * Remove and condition * Change revision order * Remove token from invalidation thread * Submodule merge --------- Co-authored-by: JWittmeyer <[email protected]>
1 parent 4638c1f commit c72cc90

File tree

9 files changed

+239
-68
lines changed

9 files changed

+239
-68
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""add file caching
2+
3+
Revision ID: 11675e102ac4
4+
Revises: 1118c7327b96
5+
Create Date: 2024-10-09 15:37:46.744638
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
from sqlalchemy.dialects import postgresql
11+
12+
# revision identifiers, used by Alembic.
13+
revision = '11675e102ac4'
14+
down_revision = '1118c7327b96'
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
20+
# ### commands auto generated by Alembic - please adjust! ###
21+
op.create_table('file_reference',
22+
sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
23+
sa.Column('organization_id', postgresql.UUID(as_uuid=True), nullable=True),
24+
sa.Column('hash', sa.String(), nullable=True),
25+
sa.Column('minio_path', sa.String(), nullable=True),
26+
sa.Column('bucket', sa.String(), nullable=True),
27+
sa.Column('created_at', sa.DateTime(), nullable=True),
28+
sa.Column('created_by', postgresql.UUID(as_uuid=True), nullable=True),
29+
sa.Column('file_size_bytes', sa.BigInteger(), nullable=True),
30+
sa.Column('content_type', sa.String(), nullable=True),
31+
sa.Column('original_file_name', sa.String(), nullable=True),
32+
sa.Column('state', sa.String(), nullable=True),
33+
sa.Column('meta_data', sa.JSON(), nullable=True),
34+
sa.ForeignKeyConstraint(['created_by'], ['user.id'], ondelete='SET NULL'),
35+
sa.ForeignKeyConstraint(['organization_id'], ['organization.id'], ondelete='CASCADE'),
36+
sa.PrimaryKeyConstraint('id'),
37+
sa.UniqueConstraint('organization_id', 'hash', 'file_size_bytes', name='unique_file_reference'),
38+
schema='cognition'
39+
)
40+
op.create_index(op.f('ix_cognition_file_reference_created_by'), 'file_reference', ['created_by'], unique=False, schema='cognition')
41+
op.create_index(op.f('ix_cognition_file_reference_hash'), 'file_reference', ['hash'], unique=False, schema='cognition')
42+
op.create_index(op.f('ix_cognition_file_reference_organization_id'), 'file_reference', ['organization_id'], unique=False, schema='cognition')
43+
op.create_table('file_extraction',
44+
sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
45+
sa.Column('organization_id', postgresql.UUID(as_uuid=True), nullable=True),
46+
sa.Column('file_reference_id', postgresql.UUID(as_uuid=True), nullable=True),
47+
sa.Column('extraction_key', sa.String(), nullable=True),
48+
sa.Column('minio_path', sa.String(), nullable=True),
49+
sa.Column('bucket', sa.String(), nullable=True),
50+
sa.Column('created_at', sa.DateTime(), nullable=True),
51+
sa.Column('created_by', postgresql.UUID(as_uuid=True), nullable=True),
52+
sa.Column('state', sa.String(), nullable=True),
53+
sa.ForeignKeyConstraint(['created_by'], ['user.id'], ondelete='SET NULL'),
54+
sa.ForeignKeyConstraint(['file_reference_id'], ['cognition.file_reference.id'], ondelete='CASCADE'),
55+
sa.ForeignKeyConstraint(['organization_id'], ['organization.id'], ondelete='CASCADE'),
56+
sa.PrimaryKeyConstraint('id'),
57+
sa.UniqueConstraint('organization_id', 'file_reference_id', 'extraction_key', name='unique_file_extraction'),
58+
schema='cognition'
59+
)
60+
op.create_index(op.f('ix_cognition_file_extraction_created_by'), 'file_extraction', ['created_by'], unique=False, schema='cognition')
61+
op.create_index(op.f('ix_cognition_file_extraction_file_reference_id'), 'file_extraction', ['file_reference_id'], unique=False, schema='cognition')
62+
op.create_index(op.f('ix_cognition_file_extraction_organization_id'), 'file_extraction', ['organization_id'], unique=False, schema='cognition')
63+
op.create_table('file_transformation',
64+
sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
65+
sa.Column('organization_id', postgresql.UUID(as_uuid=True), nullable=True),
66+
sa.Column('file_extraction_id', postgresql.UUID(as_uuid=True), nullable=True),
67+
sa.Column('transformation_key', sa.String(), nullable=True),
68+
sa.Column('minio_path', sa.String(), nullable=True),
69+
sa.Column('bucket', sa.String(), nullable=True),
70+
sa.Column('created_at', sa.DateTime(), nullable=True),
71+
sa.Column('created_by', postgresql.UUID(as_uuid=True), nullable=True),
72+
sa.Column('state', sa.String(), nullable=True),
73+
sa.ForeignKeyConstraint(['created_by'], ['user.id'], ondelete='SET NULL'),
74+
sa.ForeignKeyConstraint(['file_extraction_id'], ['cognition.file_extraction.id'], ondelete='CASCADE'),
75+
sa.ForeignKeyConstraint(['organization_id'], ['organization.id'], ondelete='CASCADE'),
76+
sa.PrimaryKeyConstraint('id'),
77+
sa.UniqueConstraint('organization_id', 'file_extraction_id', 'transformation_key', name='unique_file_transformation'),
78+
schema='cognition'
79+
)
80+
op.create_index(op.f('ix_cognition_file_transformation_created_by'), 'file_transformation', ['created_by'], unique=False, schema='cognition')
81+
op.create_index(op.f('ix_cognition_file_transformation_file_extraction_id'), 'file_transformation', ['file_extraction_id'], unique=False, schema='cognition')
82+
op.create_index(op.f('ix_cognition_file_transformation_organization_id'), 'file_transformation', ['organization_id'], unique=False, schema='cognition')
83+
# ### end Alembic commands ###
84+
85+
86+
def downgrade():
87+
# ### commands auto generated by Alembic - please adjust! ###
88+
op.drop_index(op.f('ix_cognition_file_transformation_organization_id'), table_name='file_transformation', schema='cognition')
89+
op.drop_index(op.f('ix_cognition_file_transformation_file_extraction_id'), table_name='file_transformation', schema='cognition')
90+
op.drop_index(op.f('ix_cognition_file_transformation_created_by'), table_name='file_transformation', schema='cognition')
91+
op.drop_table('file_transformation', schema='cognition')
92+
op.drop_index(op.f('ix_cognition_file_extraction_organization_id'), table_name='file_extraction', schema='cognition')
93+
op.drop_index(op.f('ix_cognition_file_extraction_file_reference_id'), table_name='file_extraction', schema='cognition')
94+
op.drop_index(op.f('ix_cognition_file_extraction_created_by'), table_name='file_extraction', schema='cognition')
95+
op.drop_table('file_extraction', schema='cognition')
96+
op.drop_index(op.f('ix_cognition_file_reference_organization_id'), table_name='file_reference', schema='cognition')
97+
op.drop_index(op.f('ix_cognition_file_reference_hash'), table_name='file_reference', schema='cognition')
98+
op.drop_index(op.f('ix_cognition_file_reference_created_by'), table_name='file_reference', schema='cognition')
99+
op.drop_table('file_reference', schema='cognition')
100+
# ### end Alembic commands ###

alembic/versions/c626887031f6_add.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""add
2+
3+
Revision ID: c626887031f6
4+
Revises: 11675e102ac4
5+
Create Date: 2024-10-15 13:53:26.632068
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
11+
12+
# revision identifiers, used by Alembic.
13+
revision = 'c626887031f6'
14+
down_revision = '11675e102ac4'
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
20+
# ### commands auto generated by Alembic - please adjust! ###
21+
op.add_column('file_reference', sa.Column('last_used', sa.DateTime(), nullable=True), schema='cognition')
22+
# ### end Alembic commands ###
23+
24+
25+
def downgrade():
26+
# ### commands auto generated by Alembic - please adjust! ###
27+
op.drop_column('file_reference', 'last_used', schema='cognition')
28+
# ### end Alembic commands ###
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
"""rename llm logs
2+
3+
Revision ID: f8c313f63a36
4+
Revises: c626887031f6
5+
Create Date: 2024-10-15 16:01:26.391244
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
from sqlalchemy.dialects import postgresql
11+
12+
# revision identifiers, used by Alembic.
13+
revision = 'f8c313f63a36'
14+
down_revision = 'c626887031f6'
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
20+
# ### commands auto generated by Alembic - please adjust! ###
21+
op.create_table('file_transformation_llm_logs',
22+
sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False),
23+
sa.Column('file_transformation_id', postgresql.UUID(as_uuid=True), nullable=True),
24+
sa.Column('created_at', sa.DateTime(), nullable=True),
25+
sa.Column('finished_at', sa.DateTime(), nullable=True),
26+
sa.Column('model_used', sa.String(), nullable=True),
27+
sa.Column('input', sa.String(), nullable=True),
28+
sa.Column('output', sa.String(), nullable=True),
29+
sa.Column('error', sa.String(), nullable=True),
30+
sa.ForeignKeyConstraint(['file_transformation_id'], ['cognition.file_transformation.id'], ondelete='CASCADE'),
31+
sa.PrimaryKeyConstraint('id'),
32+
schema='cognition'
33+
)
34+
op.create_index(op.f('ix_cognition_file_transformation_llm_logs_file_transformation_id'), 'file_transformation_llm_logs', ['file_transformation_id'], unique=False, schema='cognition')
35+
op.drop_index('ix_cognition_markdown_llm_logs_markdown_file_id', table_name='markdown_llm_logs', schema='cognition')
36+
op.drop_table('markdown_llm_logs', schema='cognition')
37+
# ### end Alembic commands ###
38+
39+
40+
def downgrade():
41+
# ### commands auto generated by Alembic - please adjust! ###
42+
op.create_table('markdown_llm_logs',
43+
sa.Column('id', postgresql.UUID(), autoincrement=False, nullable=False),
44+
sa.Column('markdown_file_id', postgresql.UUID(), autoincrement=False, nullable=True),
45+
sa.Column('created_at', postgresql.TIMESTAMP(), autoincrement=False, nullable=True),
46+
sa.Column('finished_at', postgresql.TIMESTAMP(), autoincrement=False, nullable=True),
47+
sa.Column('model_used', sa.VARCHAR(), autoincrement=False, nullable=True),
48+
sa.Column('input', sa.VARCHAR(), autoincrement=False, nullable=True),
49+
sa.Column('output', sa.VARCHAR(), autoincrement=False, nullable=True),
50+
sa.Column('error', sa.VARCHAR(), autoincrement=False, nullable=True),
51+
sa.ForeignKeyConstraint(['markdown_file_id'], ['cognition.markdown_file.id'], name='markdown_llm_logs_markdown_file_id_fkey', ondelete='CASCADE'),
52+
sa.PrimaryKeyConstraint('id', name='markdown_llm_logs_pkey'),
53+
schema='cognition'
54+
)
55+
op.create_index('ix_cognition_markdown_llm_logs_markdown_file_id', 'markdown_llm_logs', ['markdown_file_id'], unique=False, schema='cognition')
56+
op.drop_index(op.f('ix_cognition_file_transformation_llm_logs_file_transformation_id'), table_name='file_transformation_llm_logs', schema='cognition')
57+
op.drop_table('file_transformation_llm_logs', schema='cognition')
58+
# ### end Alembic commands ###

controller/misc/config_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def refresh_config():
2727
)
2828
global __config
2929
__config = response.json()
30-
daemon.run_with_db_token(invalidate_after, 3600) # one hour as failsave
30+
daemon.run_without_db_token(invalidate_after, 3600) # one hour as failsafe
3131

3232

3333
def get_config_value(

controller/monitor/manager.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from submodules.model.business_objects import monitor as task_monitor
33
from controller.auth import kratos
44
from submodules.model.util import sql_alchemy_to_dict
5-
from submodules.s3 import controller as s3
65

76

87
def monitor_all_tasks(page: int, limit: int) -> List[Any]:
@@ -100,19 +99,19 @@ def cancel_macro_execution_task(
10099
)
101100

102101

103-
def cancel_markdown_file_task(
102+
def cancel_parse_cognition_file_task(
103+
org_id: str,
104104
task_info: Dict[str, Any],
105105
) -> None:
106-
markdown_file_id = task_info.get("fileId")
107-
org_id = task_info.get("orgId")
108-
task_monitor.set_markdown_file_task_to_failed(
109-
markdown_file_id, org_id, with_commit=True
110-
)
111106

107+
file_reference_id = task_info.get("fileReferenceId")
108+
extraction_key = task_info.get("extractionKey")
109+
transformation_key = task_info.get("transformationKey")
112110

113-
def cancel_tmp_doc_retrieval_task(
114-
task_info: Dict[str, Any],
115-
) -> None:
116-
bucket = task_info.get("bucket")
117-
minio_path = task_info.get("minioPath")
118-
s3.delete_object(bucket, minio_path)
111+
task_monitor.set_parse_cognition_file_task_to_failed(
112+
org_id,
113+
file_reference_id,
114+
extraction_key,
115+
transformation_key,
116+
with_commit=True,
117+
)
Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,49 @@
11
from typing import List
2-
from submodules.model.cognition_objects import project as cognition_project
3-
from submodules.model.cognition_objects import conversation
4-
from submodules.model.enums import TaskType
2+
from submodules.model.cognition_objects import file_reference as file_reference_db_bo
3+
from submodules.model.enums import TaskType, FileCachingProcessingScope
54
from controller.task_master import manager as task_master_manager
5+
from submodules.model import enums
6+
from submodules.model.business_objects import general
67

78

89
def handle_cognition_file_upload(path_parts: List[str]):
910

10-
if path_parts[1] != "_cognition":
11+
if path_parts[1] != "_cognition" or len(path_parts) < 5:
1112
return
13+
if path_parts[2] == "files" and path_parts[4].startswith("file_original"):
14+
org_id = path_parts[0]
15+
file_hash, file_size = path_parts[3].split("_")
16+
file_reference = file_reference_db_bo.get(org_id, file_hash, int(file_size))
1217

13-
if path_parts[3] == "chat_tmp_files" and path_parts[5] == "queued":
14-
cognition_project_id = path_parts[2]
15-
conversation_id = path_parts[4]
16-
cognition_prj = cognition_project.get(cognition_project_id)
17-
if not cognition_prj:
18+
if (
19+
not file_reference
20+
or file_reference.state == enums.FileCachingState.RUNNING.value
21+
or file_reference.state == enums.FileCachingState.COMPLETED.value
22+
):
1823
return
24+
file_reference.state = enums.FileCachingState.COMPLETED.value
25+
general.commit()
1926

20-
conversation_item = conversation.get(cognition_project_id, conversation_id)
21-
if not conversation_item:
22-
return
27+
prio = (
28+
file_reference.meta_data.get("transformation_initiator")
29+
== enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value
30+
)
31+
extraction_method = file_reference.meta_data.get("extraction_method")
2332

2433
task_master_manager.queue_task(
25-
str(cognition_prj.organization_id),
26-
str(conversation_item.created_by),
27-
TaskType.PARSE_COGNITION_TMP_FILE,
34+
str(file_reference.organization_id),
35+
str(file_reference.created_by),
36+
TaskType.PARSE_COGNITION_FILE,
2837
{
29-
"cognition_project_id": str(cognition_project_id),
30-
"conversation_id": str(conversation_id),
31-
"minio_path": "/".join(path_parts[1:]),
32-
"bucket": path_parts[0],
38+
"parse_scope": FileCachingProcessingScope.EXTRACT_TRANSFORM.value,
39+
"file_reference_id": str(file_reference.id),
40+
"extraction_method": extraction_method,
41+
"meta_data": file_reference.meta_data,
42+
"extraction_key": file_reference.meta_data.get("extraction_key"),
43+
"transformation_key": file_reference.meta_data.get(
44+
"transformation_key"
45+
),
46+
"file_name": file_reference.original_file_name,
3347
},
34-
True, # not sure if prio is right here as the prio tasks should only take < 1 min but waiting for the normal queue will take ages depending on the queue
48+
prio, # not sure if prio is right here as the prio tasks should only take < 1 min but waiting for the normal queue will take ages depending on the queue
3549
)

fast_api/routes/misc.py

Lines changed: 4 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -138,10 +138,10 @@ def cancel_task(
138138
controller_manager.cancel_weak_supervision(task_info)
139139
elif task_type == enums.TaskType.RUN_COGNITION_MACRO.value:
140140
controller_manager.cancel_macro_execution_task(task_info)
141-
elif task_type == enums.TaskType.PARSE_MARKDOWN_FILE.value:
142-
controller_manager.cancel_markdown_file_task(task_info)
143-
elif task_type == enums.TaskType.PARSE_COGNITION_TMP_FILE.value:
144-
controller_manager.cancel_tmp_doc_retrieval_task(task_info)
141+
elif task_type == enums.TaskType.PARSE_COGNITION_FILE.value:
142+
controller_manager.cancel_parse_cognition_file_task(
143+
task_entity.organization_id, task_info
144+
)
145145
else:
146146
raise ValueError(f"{task_type} is no valid task type")
147147

@@ -314,31 +314,3 @@ def update_customer_buttons(
314314
update_request.visible,
315315
)
316316
)
317-
318-
319-
@router.get("/dummy/create/wrong/session")
320-
def dummy():
321-
322-
def something():
323-
from submodules.model.business_objects import general
324-
325-
# general.get_ctx_token()
326-
from submodules.model.business_objects import organization
327-
328-
print("organization", organization.get_all(), flush=True)
329-
import json
330-
331-
print(
332-
json.dumps(
333-
general.get_session_lookup(exclude_last_x_seconds=-1),
334-
indent=4,
335-
default=str,
336-
),
337-
flush=True,
338-
)
339-
340-
from submodules.model import daemon
341-
342-
daemon.run_with_db_token(something)
343-
344-
return SILENT_SUCCESS_RESPONSE

0 commit comments

Comments
 (0)