From c7a944a7625ccbf6f92c39185e58b474f0f35360 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 24 Oct 2025 12:51:38 +0200 Subject: [PATCH 01/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index fa52e172..3d4317aa 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit fa52e1725d0691979895644d63c0b61728ea771b +Subproject commit 3d4317aaea9cfd2d8f241e87de67cc39fe98f55a From f63a8bad07c612e18b92279663994492353233c7 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 24 Oct 2025 12:51:47 +0200 Subject: [PATCH 02/54] perf(alembic): add etl task table --- .../ac36ed533db7_adds_etl_task_table.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 alembic/versions/ac36ed533db7_adds_etl_task_table.py diff --git a/alembic/versions/ac36ed533db7_adds_etl_task_table.py b/alembic/versions/ac36ed533db7_adds_etl_task_table.py new file mode 100644 index 00000000..3fc8ad9f --- /dev/null +++ b/alembic/versions/ac36ed533db7_adds_etl_task_table.py @@ -0,0 +1,89 @@ +"""adds etl task table + +Revision ID: ac36ed533db7 +Revises: 24ca8432bd8b +Create Date: 2025-10-24 10:50:23.262494 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "ac36ed533db7" +down_revision = "24ca8432bd8b" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "etl_task", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("created_at", sa.DateTime(), nullable=True), + sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("markdown_file_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("sharepoint_file_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("extract_config", sa.JSON(), nullable=True), + sa.Column("transform_config", sa.JSON(), nullable=True), + sa.Column("load_config", sa.JSON(), nullable=True), + sa.Column("notify_config", sa.JSON(), nullable=True), + sa.Column("priority", sa.Boolean(), nullable=True), + sa.Column("is_active", sa.Boolean(), nullable=True), + sa.Column("started_at", sa.DateTime(), nullable=True), + sa.Column("finished_at", sa.DateTime(), nullable=True), + sa.Column("state", sa.String(), nullable=True), + sa.Column("error_message", sa.String(), nullable=True), + sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint( + ["markdown_file_id"], ["cognition.markdown_file.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint( + ["sharepoint_file_id"], ["integration.sharepoint.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id"), + schema="global", + ) + op.create_index( + op.f("ix_global_etl_task_created_by"), + "etl_task", + ["created_by"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_etl_task_markdown_file_id"), + "etl_task", + ["markdown_file_id"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_etl_task_sharepoint_file_id"), + "etl_task", + ["sharepoint_file_id"], + unique=False, + schema="global", + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index( + op.f("ix_global_etl_task_sharepoint_file_id"), + table_name="etl_task", + schema="global", + ) + op.drop_index( + op.f("ix_global_etl_task_markdown_file_id"), + table_name="etl_task", + schema="global", + ) + op.drop_index( + op.f("ix_global_etl_task_created_by"), table_name="etl_task", schema="global" + ) + op.drop_table("etl_task", schema="global") + # ### end Alembic commands ### From 7b5fcbe2cac5196756a2336a5e81817e63813e95 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 24 Oct 2025 13:03:44 +0200 Subject: [PATCH 03/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 3d4317aa..c002a6e2 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 3d4317aaea9cfd2d8f241e87de67cc39fe98f55a +Subproject commit c002a6e2d612517d88749e967691e38ab82aa7d9 From 4f57597ed6f18989a7e05690b92d7548e088ac92 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 24 Oct 2025 13:03:55 +0200 Subject: [PATCH 04/54] perf(alembic): update etl task table --- ...l_task_table.py => e46afed08420_adds_etl_task_table.py} | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) rename alembic/versions/{ac36ed533db7_adds_etl_task_table.py => e46afed08420_adds_etl_task_table.py} (95%) diff --git a/alembic/versions/ac36ed533db7_adds_etl_task_table.py b/alembic/versions/e46afed08420_adds_etl_task_table.py similarity index 95% rename from alembic/versions/ac36ed533db7_adds_etl_task_table.py rename to alembic/versions/e46afed08420_adds_etl_task_table.py index 3fc8ad9f..26d203b8 100644 --- a/alembic/versions/ac36ed533db7_adds_etl_task_table.py +++ b/alembic/versions/e46afed08420_adds_etl_task_table.py @@ -1,8 +1,8 @@ """adds etl task table -Revision ID: ac36ed533db7 +Revision ID: e46afed08420 Revises: 24ca8432bd8b -Create Date: 2025-10-24 10:50:23.262494 +Create Date: 2025-10-24 11:02:58.188338 """ @@ -11,7 +11,7 @@ from 
sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = "ac36ed533db7" +revision = "e46afed08420" down_revision = "24ca8432bd8b" branch_labels = None depends_on = None @@ -30,6 +30,7 @@ def upgrade(): sa.Column("transform_config", sa.JSON(), nullable=True), sa.Column("load_config", sa.JSON(), nullable=True), sa.Column("notify_config", sa.JSON(), nullable=True), + sa.Column("llm_config", sa.JSON(), nullable=True), sa.Column("priority", sa.Boolean(), nullable=True), sa.Column("is_active", sa.Boolean(), nullable=True), sa.Column("started_at", sa.DateTime(), nullable=True), From aa9f19359dff300ef990d29eb1b58f1cd53b1cc3 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 24 Oct 2025 13:12:10 +0200 Subject: [PATCH 05/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index c002a6e2..c12e193d 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit c002a6e2d612517d88749e967691e38ab82aa7d9 +Subproject commit c12e193dde22f860f0e47f04e91878e858308183 From c939facae3143741bfa29c2d7837efff862c82fe Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 24 Oct 2025 13:12:20 +0200 Subject: [PATCH 06/54] perf(alembic): add org_id column --- ...py => b9ad3672dd8d_adds_etl_task_table.py} | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) rename alembic/versions/{e46afed08420_adds_etl_task_table.py => b9ad3672dd8d_adds_etl_task_table.py} (81%) diff --git a/alembic/versions/e46afed08420_adds_etl_task_table.py b/alembic/versions/b9ad3672dd8d_adds_etl_task_table.py similarity index 81% rename from alembic/versions/e46afed08420_adds_etl_task_table.py rename to alembic/versions/b9ad3672dd8d_adds_etl_task_table.py index 26d203b8..9aa7ee72 100644 --- a/alembic/versions/e46afed08420_adds_etl_task_table.py +++ b/alembic/versions/b9ad3672dd8d_adds_etl_task_table.py @@ -1,8 +1,8 @@ """adds etl task table 
-Revision ID: e46afed08420 +Revision ID: b9ad3672dd8d Revises: 24ca8432bd8b -Create Date: 2025-10-24 11:02:58.188338 +Create Date: 2025-10-24 11:11:47.341059 """ @@ -11,7 +11,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = "e46afed08420" +revision = "b9ad3672dd8d" down_revision = "24ca8432bd8b" branch_labels = None depends_on = None @@ -22,6 +22,7 @@ def upgrade(): op.create_table( "etl_task", sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True), sa.Column("created_at", sa.DateTime(), nullable=True), sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), sa.Column("markdown_file_id", postgresql.UUID(as_uuid=True), nullable=True), @@ -31,16 +32,19 @@ def upgrade(): sa.Column("load_config", sa.JSON(), nullable=True), sa.Column("notify_config", sa.JSON(), nullable=True), sa.Column("llm_config", sa.JSON(), nullable=True), - sa.Column("priority", sa.Boolean(), nullable=True), - sa.Column("is_active", sa.Boolean(), nullable=True), sa.Column("started_at", sa.DateTime(), nullable=True), sa.Column("finished_at", sa.DateTime(), nullable=True), sa.Column("state", sa.String(), nullable=True), + sa.Column("is_active", sa.Boolean(), nullable=True), + sa.Column("priority", sa.Integer(), nullable=True), sa.Column("error_message", sa.String(), nullable=True), sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="SET NULL"), sa.ForeignKeyConstraint( ["markdown_file_id"], ["cognition.markdown_file.id"], ondelete="CASCADE" ), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), sa.ForeignKeyConstraint( ["sharepoint_file_id"], ["integration.sharepoint.id"], ondelete="CASCADE" ), @@ -61,6 +65,13 @@ def upgrade(): unique=False, schema="global", ) + op.create_index( + op.f("ix_global_etl_task_organization_id"), + "etl_task", + ["organization_id"], + unique=False, + schema="global", + 
) op.create_index( op.f("ix_global_etl_task_sharepoint_file_id"), "etl_task", @@ -78,6 +89,11 @@ def downgrade(): table_name="etl_task", schema="global", ) + op.drop_index( + op.f("ix_global_etl_task_organization_id"), + table_name="etl_task", + schema="global", + ) op.drop_index( op.f("ix_global_etl_task_markdown_file_id"), table_name="etl_task", From 4d258417a16364790f457f4daba800d9dd655d90 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 24 Oct 2025 13:47:38 +0200 Subject: [PATCH 07/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index c12e193d..571073f8 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit c12e193dde22f860f0e47f04e91878e858308183 +Subproject commit 571073f8770c9618cae12b9dbf67417424c98ee9 From 217832c3ea6fab17ef08050b9352542690358747 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 24 Oct 2025 13:47:48 +0200 Subject: [PATCH 08/54] perf(alembic): update etl task table --- ...tl_task_table.py => 8a5e0469e9d0_adds_etl_task_table.py} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename alembic/versions/{b9ad3672dd8d_adds_etl_task_table.py => 8a5e0469e9d0_adds_etl_task_table.py} (97%) diff --git a/alembic/versions/b9ad3672dd8d_adds_etl_task_table.py b/alembic/versions/8a5e0469e9d0_adds_etl_task_table.py similarity index 97% rename from alembic/versions/b9ad3672dd8d_adds_etl_task_table.py rename to alembic/versions/8a5e0469e9d0_adds_etl_task_table.py index 9aa7ee72..a4b90bac 100644 --- a/alembic/versions/b9ad3672dd8d_adds_etl_task_table.py +++ b/alembic/versions/8a5e0469e9d0_adds_etl_task_table.py @@ -1,8 +1,8 @@ """adds etl task table -Revision ID: b9ad3672dd8d +Revision ID: 8a5e0469e9d0 Revises: 24ca8432bd8b -Create Date: 2025-10-24 11:11:47.341059 +Create Date: 2025-10-24 11:47:15.649814 """ @@ -11,7 +11,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by 
Alembic. -revision = "b9ad3672dd8d" +revision = "8a5e0469e9d0" down_revision = "24ca8432bd8b" branch_labels = None depends_on = None From c21859456d427b2044363c7bc182dbc40934d362 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Sun, 26 Oct 2025 09:39:16 +0100 Subject: [PATCH 09/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 571073f8..2bea8576 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 571073f8770c9618cae12b9dbf67417424c98ee9 +Subproject commit 2bea8576426ce2a305138e56d45874e46cfdd0b4 From cb64bf8a4bf19a33b30bf8edbbb6db0923e78837 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Sun, 26 Oct 2025 09:39:26 +0100 Subject: [PATCH 10/54] perf: add file_size_bytes to etl_task --- ...task_table.py => e07dd53f5fcb_adds_etl_task_table.py} | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) rename alembic/versions/{8a5e0469e9d0_adds_etl_task_table.py => e07dd53f5fcb_adds_etl_task_table.py} (92%) diff --git a/alembic/versions/8a5e0469e9d0_adds_etl_task_table.py b/alembic/versions/e07dd53f5fcb_adds_etl_task_table.py similarity index 92% rename from alembic/versions/8a5e0469e9d0_adds_etl_task_table.py rename to alembic/versions/e07dd53f5fcb_adds_etl_task_table.py index a4b90bac..1f54096a 100644 --- a/alembic/versions/8a5e0469e9d0_adds_etl_task_table.py +++ b/alembic/versions/e07dd53f5fcb_adds_etl_task_table.py @@ -1,8 +1,8 @@ """adds etl task table -Revision ID: 8a5e0469e9d0 +Revision ID: e07dd53f5fcb Revises: 24ca8432bd8b -Create Date: 2025-10-24 11:47:15.649814 +Create Date: 2025-10-25 20:13:45.417677 """ @@ -11,7 +11,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. 
-revision = "8a5e0469e9d0" +revision = "e07dd53f5fcb" down_revision = "24ca8432bd8b" branch_labels = None depends_on = None @@ -27,6 +27,9 @@ def upgrade(): sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), sa.Column("markdown_file_id", postgresql.UUID(as_uuid=True), nullable=True), sa.Column("sharepoint_file_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("file_path", sa.String(), nullable=True), + sa.Column("file_size_bytes", sa.BigInteger(), nullable=True), + sa.Column("tokenizer", sa.String(), nullable=True), sa.Column("extract_config", sa.JSON(), nullable=True), sa.Column("transform_config", sa.JSON(), nullable=True), sa.Column("load_config", sa.JSON(), nullable=True), From bd1b6332456f00c2c6b59dd5153ce7195d80a680 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 27 Oct 2025 01:16:12 +0100 Subject: [PATCH 11/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 2bea8576..e6d425b1 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 2bea8576426ce2a305138e56d45874e46cfdd0b4 +Subproject commit e6d425b1d6a7da0eeaed016f19960763ba20ee98 From 92e89cd9129e77600f9c6f081283715f08494bf5 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 27 Oct 2025 01:16:22 +0100 Subject: [PATCH 12/54] perf: add split_config --- ...l_task_table.py => 5f8de6fcce30_adds_etl_task_table.py} | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) rename alembic/versions/{e07dd53f5fcb_adds_etl_task_table.py => 5f8de6fcce30_adds_etl_task_table.py} (95%) diff --git a/alembic/versions/e07dd53f5fcb_adds_etl_task_table.py b/alembic/versions/5f8de6fcce30_adds_etl_task_table.py similarity index 95% rename from alembic/versions/e07dd53f5fcb_adds_etl_task_table.py rename to alembic/versions/5f8de6fcce30_adds_etl_task_table.py index 1f54096a..2354f4c1 100644 --- a/alembic/versions/e07dd53f5fcb_adds_etl_task_table.py +++ 
b/alembic/versions/5f8de6fcce30_adds_etl_task_table.py @@ -1,8 +1,8 @@ """adds etl task table -Revision ID: e07dd53f5fcb +Revision ID: 5f8de6fcce30 Revises: 24ca8432bd8b -Create Date: 2025-10-25 20:13:45.417677 +Create Date: 2025-10-26 20:43:48.216324 """ @@ -11,7 +11,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = "e07dd53f5fcb" +revision = "5f8de6fcce30" down_revision = "24ca8432bd8b" branch_labels = None depends_on = None @@ -31,6 +31,7 @@ def upgrade(): sa.Column("file_size_bytes", sa.BigInteger(), nullable=True), sa.Column("tokenizer", sa.String(), nullable=True), sa.Column("extract_config", sa.JSON(), nullable=True), + sa.Column("split_config", sa.JSON(), nullable=True), sa.Column("transform_config", sa.JSON(), nullable=True), sa.Column("load_config", sa.JSON(), nullable=True), sa.Column("notify_config", sa.JSON(), nullable=True), From 5b18a3cfc440b74e667755793a8234b2229b1bd8 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 27 Oct 2025 21:53:18 +0100 Subject: [PATCH 13/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index e6d425b1..1e6dff4d 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit e6d425b1d6a7da0eeaed016f19960763ba20ee98 +Subproject commit 1e6dff4d80e135c44816918341cbb36071a58c73 From 97b01bc49dad820606932392eab0523619378d78 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 27 Oct 2025 21:53:29 +0100 Subject: [PATCH 14/54] perf(etl): fkey alignment --- .../5f8de6fcce30_adds_etl_task_table.py | 110 ------- .../60cb22681e6c_adds_etl_task_table.py | 295 ++++++++++++++++++ 2 files changed, 295 insertions(+), 110 deletions(-) delete mode 100644 alembic/versions/5f8de6fcce30_adds_etl_task_table.py create mode 100644 alembic/versions/60cb22681e6c_adds_etl_task_table.py diff --git a/alembic/versions/5f8de6fcce30_adds_etl_task_table.py 
b/alembic/versions/5f8de6fcce30_adds_etl_task_table.py deleted file mode 100644 index 2354f4c1..00000000 --- a/alembic/versions/5f8de6fcce30_adds_etl_task_table.py +++ /dev/null @@ -1,110 +0,0 @@ -"""adds etl task table - -Revision ID: 5f8de6fcce30 -Revises: 24ca8432bd8b -Create Date: 2025-10-26 20:43:48.216324 - -""" - -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql - -# revision identifiers, used by Alembic. -revision = "5f8de6fcce30" -down_revision = "24ca8432bd8b" -branch_labels = None -depends_on = None - - -def upgrade(): - # ### commands auto generated by Alembic - please adjust! ### - op.create_table( - "etl_task", - sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), - sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True), - sa.Column("created_at", sa.DateTime(), nullable=True), - sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), - sa.Column("markdown_file_id", postgresql.UUID(as_uuid=True), nullable=True), - sa.Column("sharepoint_file_id", postgresql.UUID(as_uuid=True), nullable=True), - sa.Column("file_path", sa.String(), nullable=True), - sa.Column("file_size_bytes", sa.BigInteger(), nullable=True), - sa.Column("tokenizer", sa.String(), nullable=True), - sa.Column("extract_config", sa.JSON(), nullable=True), - sa.Column("split_config", sa.JSON(), nullable=True), - sa.Column("transform_config", sa.JSON(), nullable=True), - sa.Column("load_config", sa.JSON(), nullable=True), - sa.Column("notify_config", sa.JSON(), nullable=True), - sa.Column("llm_config", sa.JSON(), nullable=True), - sa.Column("started_at", sa.DateTime(), nullable=True), - sa.Column("finished_at", sa.DateTime(), nullable=True), - sa.Column("state", sa.String(), nullable=True), - sa.Column("is_active", sa.Boolean(), nullable=True), - sa.Column("priority", sa.Integer(), nullable=True), - sa.Column("error_message", sa.String(), nullable=True), - sa.ForeignKeyConstraint(["created_by"], 
["user.id"], ondelete="SET NULL"), - sa.ForeignKeyConstraint( - ["markdown_file_id"], ["cognition.markdown_file.id"], ondelete="CASCADE" - ), - sa.ForeignKeyConstraint( - ["organization_id"], ["organization.id"], ondelete="CASCADE" - ), - sa.ForeignKeyConstraint( - ["sharepoint_file_id"], ["integration.sharepoint.id"], ondelete="CASCADE" - ), - sa.PrimaryKeyConstraint("id"), - schema="global", - ) - op.create_index( - op.f("ix_global_etl_task_created_by"), - "etl_task", - ["created_by"], - unique=False, - schema="global", - ) - op.create_index( - op.f("ix_global_etl_task_markdown_file_id"), - "etl_task", - ["markdown_file_id"], - unique=False, - schema="global", - ) - op.create_index( - op.f("ix_global_etl_task_organization_id"), - "etl_task", - ["organization_id"], - unique=False, - schema="global", - ) - op.create_index( - op.f("ix_global_etl_task_sharepoint_file_id"), - "etl_task", - ["sharepoint_file_id"], - unique=False, - schema="global", - ) - # ### end Alembic commands ### - - -def downgrade(): - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_index( - op.f("ix_global_etl_task_sharepoint_file_id"), - table_name="etl_task", - schema="global", - ) - op.drop_index( - op.f("ix_global_etl_task_organization_id"), - table_name="etl_task", - schema="global", - ) - op.drop_index( - op.f("ix_global_etl_task_markdown_file_id"), - table_name="etl_task", - schema="global", - ) - op.drop_index( - op.f("ix_global_etl_task_created_by"), table_name="etl_task", schema="global" - ) - op.drop_table("etl_task", schema="global") - # ### end Alembic commands ### diff --git a/alembic/versions/60cb22681e6c_adds_etl_task_table.py b/alembic/versions/60cb22681e6c_adds_etl_task_table.py new file mode 100644 index 00000000..adef8d80 --- /dev/null +++ b/alembic/versions/60cb22681e6c_adds_etl_task_table.py @@ -0,0 +1,295 @@ +"""adds etl task table + +Revision ID: 60cb22681e6c +Revises: 24ca8432bd8b +Create Date: 2025-10-27 20:49:11.247647 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "60cb22681e6c" +down_revision = "24ca8432bd8b" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "etl_task", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("created_at", sa.DateTime(), nullable=True), + sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("file_path", sa.String(), nullable=True), + sa.Column("file_size_bytes", sa.BigInteger(), nullable=True), + sa.Column("tokenizer", sa.String(), nullable=True), + sa.Column("extract_config", sa.JSON(), nullable=True), + sa.Column("split_config", sa.JSON(), nullable=True), + sa.Column("transform_config", sa.JSON(), nullable=True), + sa.Column("load_config", sa.JSON(), nullable=True), + sa.Column("notify_config", sa.JSON(), nullable=True), + sa.Column("llm_config", sa.JSON(), nullable=True), + sa.Column("started_at", sa.DateTime(), nullable=True), + sa.Column("finished_at", sa.DateTime(), nullable=True), + sa.Column("state", sa.String(), nullable=True), + sa.Column("is_active", sa.Boolean(), nullable=True), + sa.Column("priority", sa.Integer(), nullable=True), + sa.Column("error_message", sa.String(), nullable=True), + sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id"), + schema="global", + ) + op.create_index( + op.f("ix_global_etl_task_created_by"), + "etl_task", + ["created_by"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_etl_task_organization_id"), + "etl_task", + ["organization_id"], + unique=False, + schema="global", + ) + op.add_column( + "markdown_file", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="cognition", + ) + op.create_index( + op.f("ix_cognition_markdown_file_etl_task_id"), + "markdown_file", + ["etl_task_id"], + unique=False, + schema="cognition", + ) + op.create_foreign_key( + None, + "markdown_file", + "etl_task", 
+ ["etl_task_id"], + ["id"], + source_schema="cognition", + referent_schema="global", + ondelete="CASCADE", + ) + op.add_column( + "github_file", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="integration", + ) + op.drop_constraint( + "unique_github_file_source", "github_file", schema="integration", type_="unique" + ) + op.create_unique_constraint( + "unique_github_file_source", + "github_file", + ["integration_id", "running_id", "source", "etl_task_id"], + schema="integration", + ) + op.create_index( + op.f("ix_integration_github_file_etl_task_id"), + "github_file", + ["etl_task_id"], + unique=False, + schema="integration", + ) + op.create_foreign_key( + None, + "github_file", + "etl_task", + ["etl_task_id"], + ["id"], + source_schema="integration", + referent_schema="global", + ondelete="CASCADE", + ) + op.add_column( + "github_issue", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="integration", + ) + op.drop_constraint( + "unique_github_issue_source", + "github_issue", + schema="integration", + type_="unique", + ) + op.create_unique_constraint( + "unique_github_issue_source", + "github_issue", + ["integration_id", "running_id", "source", "etl_task_id"], + schema="integration", + ) + op.create_index( + op.f("ix_integration_github_issue_etl_task_id"), + "github_issue", + ["etl_task_id"], + unique=False, + schema="integration", + ) + op.create_foreign_key( + None, + "github_issue", + "etl_task", + ["etl_task_id"], + ["id"], + source_schema="integration", + referent_schema="global", + ondelete="CASCADE", + ) + op.add_column( + "pdf", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="integration", + ) + op.drop_constraint("unique_pdf_source", "pdf", schema="integration", type_="unique") + op.create_unique_constraint( + "unique_pdf_source", + "pdf", + ["integration_id", "running_id", "source", "etl_task_id"], + schema="integration", + ) + op.create_index( + 
op.f("ix_integration_pdf_etl_task_id"), + "pdf", + ["etl_task_id"], + unique=False, + schema="integration", + ) + op.create_foreign_key( + None, + "pdf", + "etl_task", + ["etl_task_id"], + ["id"], + source_schema="integration", + referent_schema="global", + ondelete="CASCADE", + ) + op.add_column( + "sharepoint", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="integration", + ) + op.drop_constraint( + "unique_sharepoint_source", "sharepoint", schema="integration", type_="unique" + ) + op.create_unique_constraint( + "unique_sharepoint_source", + "sharepoint", + ["integration_id", "running_id", "source", "etl_task_id"], + schema="integration", + ) + op.create_index( + op.f("ix_integration_sharepoint_etl_task_id"), + "sharepoint", + ["etl_task_id"], + unique=False, + schema="integration", + ) + op.create_foreign_key( + None, + "sharepoint", + "etl_task", + ["etl_task_id"], + ["id"], + source_schema="integration", + referent_schema="global", + ondelete="CASCADE", + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_constraint(None, "sharepoint", schema="integration", type_="foreignkey") + op.drop_index( + op.f("ix_integration_sharepoint_etl_task_id"), + table_name="sharepoint", + schema="integration", + ) + op.drop_constraint( + "unique_sharepoint_source", "sharepoint", schema="integration", type_="unique" + ) + op.create_unique_constraint( + "unique_sharepoint_source", + "sharepoint", + ["integration_id", "running_id", "source"], + schema="integration", + ) + op.drop_column("sharepoint", "etl_task_id", schema="integration") + op.drop_constraint(None, "pdf", schema="integration", type_="foreignkey") + op.drop_index( + op.f("ix_integration_pdf_etl_task_id"), table_name="pdf", schema="integration" + ) + op.drop_constraint("unique_pdf_source", "pdf", schema="integration", type_="unique") + op.create_unique_constraint( + "unique_pdf_source", + "pdf", + ["integration_id", "running_id", "source"], + schema="integration", + ) + op.drop_column("pdf", "etl_task_id", schema="integration") + op.drop_constraint(None, "github_issue", schema="integration", type_="foreignkey") + op.drop_index( + op.f("ix_integration_github_issue_etl_task_id"), + table_name="github_issue", + schema="integration", + ) + op.drop_constraint( + "unique_github_issue_source", + "github_issue", + schema="integration", + type_="unique", + ) + op.create_unique_constraint( + "unique_github_issue_source", + "github_issue", + ["integration_id", "running_id", "source"], + schema="integration", + ) + op.drop_column("github_issue", "etl_task_id", schema="integration") + op.drop_constraint(None, "github_file", schema="integration", type_="foreignkey") + op.drop_index( + op.f("ix_integration_github_file_etl_task_id"), + table_name="github_file", + schema="integration", + ) + op.drop_constraint( + "unique_github_file_source", "github_file", schema="integration", type_="unique" + ) + op.create_unique_constraint( + "unique_github_file_source", + "github_file", + ["integration_id", "running_id", "source"], + 
schema="integration", + ) + op.drop_column("github_file", "etl_task_id", schema="integration") + op.drop_constraint(None, "markdown_file", schema="cognition", type_="foreignkey") + op.drop_index( + op.f("ix_cognition_markdown_file_etl_task_id"), + table_name="markdown_file", + schema="cognition", + ) + op.drop_column("markdown_file", "etl_task_id", schema="cognition") + op.drop_index( + op.f("ix_global_etl_task_organization_id"), + table_name="etl_task", + schema="global", + ) + op.drop_index( + op.f("ix_global_etl_task_created_by"), table_name="etl_task", schema="global" + ) + op.drop_table("etl_task", schema="global") + # ### end Alembic commands ### From 7a40c8106cd0cd44f7b5a9d5330e6761c8d5e81f Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 28 Oct 2025 09:36:16 +0100 Subject: [PATCH 15/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 1e6dff4d..2ddce6ed 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 1e6dff4d80e135c44816918341cbb36071a58c73 +Subproject commit 2ddce6edc29e7e8525f5f53d4d30d13b39ee02ef From 79b05b1d837a51c75d7886647adaf4f4ea78c72f Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 28 Oct 2025 09:36:21 +0100 Subject: [PATCH 16/54] perf: task cancellation --- controller/monitor/manager.py | 11 +++++++++++ fast_api/routes/misc.py | 2 ++ 2 files changed, 13 insertions(+) diff --git a/controller/monitor/manager.py b/controller/monitor/manager.py index f4bdd43b..907b5bf6 100644 --- a/controller/monitor/manager.py +++ b/controller/monitor/manager.py @@ -126,3 +126,14 @@ def cancel_integration_task( task_monitor.set_integration_task_to_failed( integration_id, error_message="Cancelled by task manager" ) + + +def cancel_etl_task( + task_info: Dict[str, Any], +) -> None: + + etl_task_id = task_info.get("etlTaskId") + + task_monitor.set_etl_task_to_failed( + etl_task_id, error_message="Cancelled by task 
manager" + ) diff --git a/fast_api/routes/misc.py b/fast_api/routes/misc.py index d3a62f2f..e98af18f 100644 --- a/fast_api/routes/misc.py +++ b/fast_api/routes/misc.py @@ -136,6 +136,8 @@ def cancel_task( ) elif task_type == enums.TaskType.EXECUTE_INTEGRATION.value: controller_manager.cancel_integration_task(task_info) + elif task_type == enums.TaskType.EXECUTE_ETL.value: + controller_manager.cancel_etl_task(task_info) else: raise ValueError(f"{task_type} is no valid task type") From 2b4ad4edb572d09c31cf8e414997dacb7ac6e58f Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 29 Oct 2025 19:09:33 +0100 Subject: [PATCH 17/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 2ddce6ed..99323bda 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 2ddce6edc29e7e8525f5f53d4d30d13b39ee02ef +Subproject commit 99323bda6dc969569a882d729f016b743dc12509 From 6264d2868662beee7459d58fa18f73ee0b0fe01a Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 01:02:01 +0100 Subject: [PATCH 18/54] fix: update submodules merge conflict --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 99323bda..5549a617 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 99323bda6dc969569a882d729f016b743dc12509 +Subproject commit 5549a617a856699c95e7a76f51ad058541f0c2ad From 1aff7a81af5e7ef1c5acb0cbbaa2c9b614762890 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 01:22:38 +0100 Subject: [PATCH 19/54] perf: add cache_config --- ...py => bf8e8646ebdc_adds_etl_task_table.py} | 50 ++++++++++++++++--- 1 file changed, 42 insertions(+), 8 deletions(-) rename alembic/versions/{60cb22681e6c_adds_etl_task_table.py => bf8e8646ebdc_adds_etl_task_table.py} (88%) diff --git a/alembic/versions/60cb22681e6c_adds_etl_task_table.py 
b/alembic/versions/bf8e8646ebdc_adds_etl_task_table.py similarity index 88% rename from alembic/versions/60cb22681e6c_adds_etl_task_table.py rename to alembic/versions/bf8e8646ebdc_adds_etl_task_table.py index adef8d80..fc475d48 100644 --- a/alembic/versions/60cb22681e6c_adds_etl_task_table.py +++ b/alembic/versions/bf8e8646ebdc_adds_etl_task_table.py @@ -1,8 +1,8 @@ """adds etl task table -Revision ID: 60cb22681e6c +Revision ID: bf8e8646ebdc Revises: 24ca8432bd8b -Create Date: 2025-10-27 20:49:11.247647 +Create Date: 2025-10-30 00:21:05.246324 """ @@ -11,7 +11,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = "60cb22681e6c" +revision = "bf8e8646ebdc" down_revision = "24ca8432bd8b" branch_labels = None depends_on = None @@ -28,6 +28,7 @@ def upgrade(): sa.Column("file_path", sa.String(), nullable=True), sa.Column("file_size_bytes", sa.BigInteger(), nullable=True), sa.Column("tokenizer", sa.String(), nullable=True), + sa.Column("cache_config", sa.JSON(), nullable=True), sa.Column("extract_config", sa.JSON(), nullable=True), sa.Column("split_config", sa.JSON(), nullable=True), sa.Column("transform_config", sa.JSON(), nullable=True), @@ -61,6 +62,11 @@ def upgrade(): unique=False, schema="global", ) + op.add_column( + "conversation", + sa.Column("incognito_mode", sa.Boolean(), nullable=True), + schema="cognition", + ) op.add_column( "markdown_file", sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), @@ -73,6 +79,12 @@ def upgrade(): unique=False, schema="cognition", ) + op.create_unique_constraint( + "unique_markdown_file_etl_task_id", + "markdown_file", + ["id", "etl_task_id"], + schema="cognition", + ) op.create_foreign_key( None, "markdown_file", @@ -213,7 +225,12 @@ def upgrade(): def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### - op.drop_constraint(None, "sharepoint", schema="integration", type_="foreignkey") + op.drop_constraint( + "sharepoint_etl_task_id_fkey", + "sharepoint", + schema="integration", + type_="foreignkey", + ) op.drop_index( op.f("ix_integration_sharepoint_etl_task_id"), table_name="sharepoint", @@ -229,7 +246,9 @@ def downgrade(): schema="integration", ) op.drop_column("sharepoint", "etl_task_id", schema="integration") - op.drop_constraint(None, "pdf", schema="integration", type_="foreignkey") + op.drop_constraint( + "pdf_etl_task_id_fkey", "pdf", schema="integration", type_="foreignkey" + ) op.drop_index( op.f("ix_integration_pdf_etl_task_id"), table_name="pdf", schema="integration" ) @@ -241,7 +260,12 @@ def downgrade(): schema="integration", ) op.drop_column("pdf", "etl_task_id", schema="integration") - op.drop_constraint(None, "github_issue", schema="integration", type_="foreignkey") + op.drop_constraint( + "github_issue_etl_task_id_fkey", + "github_issue", + schema="integration", + type_="foreignkey", + ) op.drop_index( op.f("ix_integration_github_issue_etl_task_id"), table_name="github_issue", @@ -260,7 +284,12 @@ def downgrade(): schema="integration", ) op.drop_column("github_issue", "etl_task_id", schema="integration") - op.drop_constraint(None, "github_file", schema="integration", type_="foreignkey") + op.drop_constraint( + "github_file_etl_task_id_fkey", + "github_file", + schema="integration", + type_="foreignkey", + ) op.drop_index( op.f("ix_integration_github_file_etl_task_id"), table_name="github_file", @@ -276,7 +305,12 @@ def downgrade(): schema="integration", ) op.drop_column("github_file", "etl_task_id", schema="integration") - op.drop_constraint(None, "markdown_file", schema="cognition", type_="foreignkey") + op.drop_constraint( + "markdown_file_etl_task_id_fkey", + "markdown_file", + schema="cognition", + type_="foreignkey", + ) op.drop_index( op.f("ix_cognition_markdown_file_etl_task_id"), table_name="markdown_file", From 
e3a19db0f31d3e16ad030eb27a82e596891e6243 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 01:22:53 +0100 Subject: [PATCH 20/54] perf: align /notify to etl provider --- controller/transfer/cognition/etl.py | 107 ++++++++++++++++++ controller/transfer/cognition/minio_upload.py | 84 ++++++++++---- 2 files changed, 172 insertions(+), 19 deletions(-) create mode 100644 controller/transfer/cognition/etl.py diff --git a/controller/transfer/cognition/etl.py b/controller/transfer/cognition/etl.py new file mode 100644 index 00000000..47e17a27 --- /dev/null +++ b/controller/transfer/cognition/etl.py @@ -0,0 +1,107 @@ +from typing import Optional + +from submodules.model import enums +from submodules.model.models import ( + EtlTask, + CognitionMarkdownFile, + CognitionMarkdownDataset, +) +from submodules.model.global_objects import etl_task as etl_task_bo +from submodules.model.cognition_objects import markdown_file as markdown_file_bo + +DEFAULT_FILE_TYPE = enums.ETLFileType.PDF +DEFAULT_EXTRACTORS = { + enums.ETLFileType.MD: enums.ETLExtractor.MD.FILESYSTEM, + enums.ETLFileType.PDF: enums.ETLExtractor.PDF.PDF2MD, +} + +DEFAULT_FALLBACK_EXTRACTORS = { + enums.ETLFileType.PDF: [ + enums.ETLExtractor.PDF.PDF2MD, + enums.ETLExtractor.PDF.VISION, + enums.ETLExtractor.PDF.AZURE_DI, + ], +} + + +def get_or_create_task( + markdown_file: CognitionMarkdownFile, + markdown_dataset: CognitionMarkdownDataset, + file_size_bytes: int, + minio_path: str, + original_file_name: str, + file_type: Optional[enums.ETLFileType] = None, + extractor: Optional[enums.ETLExtractor] = None, + fallback_extractors: Optional[list[enums.ETLExtractor]] = None, + split_strategy: Optional[enums.ETLSplitStrategy] = None, + chunk_size: Optional[int] = 1000, + priority: Optional[int] = -1, +) -> EtlTask: + if markdown_file.etl_task_id: + if etl_task := etl_task_bo.get_by_id(markdown_file.etl_task_id): + return etl_task + + file_type = file_type or DEFAULT_FILE_TYPE + split_strategy = 
split_strategy or enums.ETLSplitStrategy.CHUNK + extractor = extractor or DEFAULT_EXTRACTORS[file_type] + fallback_extractors = list( + filter( + lambda x: x != extractor, + (fallback_extractors or DEFAULT_FALLBACK_EXTRACTORS.get(file_type, [])), + ) + ) + + etl_task = etl_task_bo.create( + org_id=markdown_dataset.organization_id, + user_id=markdown_file.created_by, + file_size_bytes=file_size_bytes, + extract_config={ + "file_type": file_type.value, + "extractor": extractor.value, + "fallback_extractors": [fe.value for fe in fallback_extractors], + "minio_path": minio_path, + "original_file_name": original_file_name, + }, + split_config={ + "strategy": split_strategy.value, + "chunk_size": chunk_size, + }, + transform_config={ + "transformers": [ + { + "name": enums.ETLTransformer.CLEANSE.value, + "system_prompt": None, + "user_prompt": None, + }, + { + "name": enums.ETLTransformer.TEXT_TO_TABLE.value, + "system_prompt": None, + "user_prompt": None, + }, + ] + }, + load_config={ + "refinery_project": {"enabled": False, "id": None}, + "markdown_file": {"enabled": True, "id": str(markdown_file.id)}, + }, + notify_config={ + "http": { + "url": "http://cognition-gateway:80/etl/finished/{markdown_file_id}", + "format": { + "markdown_file_id": str(markdown_file.id), + }, + "method": "POST", + } + }, + llm_config=markdown_dataset.llm_config, + tokenizer=markdown_dataset.tokenizer, + priority=priority, + ) + + markdown_file_bo.update( + org_id=markdown_file.organization_id, + markdown_file_id=markdown_file.id, + etl_task_id=etl_task.id, + ) + + return etl_task diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 3ea51107..6f49c26a 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -1,7 +1,12 @@ from typing import List -from submodules.model.cognition_objects import file_reference as file_reference_db_bo -from submodules.model.enums import TaskType, 
FileCachingProcessingScope + +from controller.transfer.cognition import etl as etl_util from controller.task_master import manager as task_master_manager +from submodules.model.cognition_objects import ( + file_reference as file_reference_db_bo, + markdown_file as markdown_file_bo, + markdown_dataset as markdown_dataset_bo, +) from submodules.model import enums from submodules.model.business_objects import general @@ -21,7 +26,10 @@ def handle_cognition_file_upload(path_parts: List[str]): or file_reference.state == enums.FileCachingState.COMPLETED.value ): # file_reference is None or already processed in queue - print("File reference duplication error, file is already processed", flush=True) + print( + "File reference duplication error, file is already processed", + flush=True, + ) if file_reference: print(f"File reference id: {str(file_reference.id)}", flush=True) print(f"File name: {file_reference.original_file_name}", flush=True) @@ -29,26 +37,64 @@ def handle_cognition_file_upload(path_parts: List[str]): file_reference.state = enums.FileCachingState.COMPLETED.value general.commit() - prio = ( + priority = -1 + if ( file_reference.meta_data.get("transformation_initiator") == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value + ): + priority = 1 + # task_master_manager.queue_task( + # str(file_reference.organization_id), + # str(file_reference.created_by), + # TaskType.PARSE_COGNITION_FILE, + # { + # "parse_scope": FileCachingProcessingScope.EXTRACT_TRANSFORM.value, + # "file_reference_id": str(file_reference.id), + # "extraction_method": extraction_method, + # "meta_data": file_reference.meta_data, + # "extraction_key": file_reference.meta_data.get("extraction_key"), + # "transformation_key": file_reference.meta_data.get( + # "transformation_key" + # ), + # "file_name": file_reference.original_file_name, + # }, + # prio, # not sure if prio is right here as the prio tasks should only take < 1 min but waiting for the normal queue will take ages depending on the 
queue + # ) + + markdown_file = markdown_file_bo.get( + org_id, file_reference.meta_data.get("markdown_file_id") + ) + if not markdown_file: + print( + "ERROR: Markdown file not found for the given markdown_file_id", + flush=True, + ) + raise ValueError( + f"Markdown file not found for file reference {file_reference.id}" + ) + + markdown_dataset = markdown_dataset_bo.get( + org_id=org_id, id=markdown_file.dataset_id + ) + file_type = enums.ETLFileType.from_string(markdown_file.category_origin) + etl_task = etl_util.get_or_create_task( + markdown_file=markdown_file, + markdown_dataset=markdown_dataset, + minio_path=file_reference.minio_path, + original_file_name=file_reference.original_file_name, + file_size_bytes=file_reference.file_size_bytes, + file_type=file_type, + extractor=enums.ETLExtractorPDF.from_string( + markdown_file.meta_data.get("extractor") + ), + split_strategy=markdown_file.meta_data.get("split_strategy"), + chunk_size=markdown_file.meta_data.get("chunk_size"), + priority=priority, ) - extraction_method = file_reference.meta_data.get("extraction_method") task_master_manager.queue_task( - str(file_reference.organization_id), + org_id, str(file_reference.created_by), - TaskType.PARSE_COGNITION_FILE, - { - "parse_scope": FileCachingProcessingScope.EXTRACT_TRANSFORM.value, - "file_reference_id": str(file_reference.id), - "extraction_method": extraction_method, - "meta_data": file_reference.meta_data, - "extraction_key": file_reference.meta_data.get("extraction_key"), - "transformation_key": file_reference.meta_data.get( - "transformation_key" - ), - "file_name": file_reference.original_file_name, - }, - prio, # not sure if prio is right here as the prio tasks should only take < 1 min but waiting for the normal queue will take ages depending on the queue + enums.TaskType.EXECUTE_ETL, + {"etl_task_id": str(etl_task.id)}, ) From 94af4814e2beb0225697619965e959fcdbb54422 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 01:27:03 +0100 
Subject: [PATCH 21/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 5549a617..bf33020b 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 5549a617a856699c95e7a76f51ad058541f0c2ad +Subproject commit bf33020b6f5c18470d61efa9a2c04dddd954e8f4 From fb71eb2700c118e0f3ce817a709c81e037da2cba Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 10:15:49 +0100 Subject: [PATCH 22/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index bf33020b..f5681d71 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit bf33020b6f5c18470d61efa9a2c04dddd954e8f4 +Subproject commit f5681d719c62ad33259696274049239910707ed0 From 0b7c896e295259f1e7dca1528d075b36b4fbaf56 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 10:16:02 +0100 Subject: [PATCH 23/54] perf: update minio_upload for execute_etl --- controller/transfer/cognition/minio_upload.py | 71 +++++++++++++++---- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 6f49c26a..7371b4f6 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -1,14 +1,14 @@ from typing import List -from controller.transfer.cognition import etl as etl_util from controller.task_master import manager as task_master_manager +from submodules.model import enums +from submodules.model.business_objects import general +from submodules.model.global_objects import etl_task as etl_task_bo from submodules.model.cognition_objects import ( file_reference as file_reference_db_bo, markdown_file as markdown_file_bo, markdown_dataset as markdown_dataset_bo, ) -from submodules.model import enums -from 
submodules.model.business_objects import general def handle_cognition_file_upload(path_parts: List[str]): @@ -37,6 +37,7 @@ def handle_cognition_file_upload(path_parts: List[str]): file_reference.state = enums.FileCachingState.COMPLETED.value general.commit() + chunk_size = 1000 priority = -1 if ( file_reference.meta_data.get("transformation_initiator") @@ -76,19 +77,61 @@ def handle_cognition_file_upload(path_parts: List[str]): markdown_dataset = markdown_dataset_bo.get( org_id=org_id, id=markdown_file.dataset_id ) - file_type = enums.ETLFileType.from_string(markdown_file.category_origin) - etl_task = etl_util.get_or_create_task( + + etl_task = etl_task_bo.get_or_create_markdown_file_etl_task( + org_id=org_id, + file_reference=file_reference, markdown_file=markdown_file, markdown_dataset=markdown_dataset, - minio_path=file_reference.minio_path, - original_file_name=file_reference.original_file_name, - file_size_bytes=file_reference.file_size_bytes, - file_type=file_type, - extractor=enums.ETLExtractorPDF.from_string( - markdown_file.meta_data.get("extractor") - ), - split_strategy=markdown_file.meta_data.get("split_strategy"), - chunk_size=markdown_file.meta_data.get("chunk_size"), + extractor=markdown_file.meta_data.get("extractor"), + fallback_extractors=[ + enums.ETLExtractorPDF.PDF2MD, + enums.ETLExtractorPDF.VISION, + ], + cache_config={ + "use_file_cache": True, + "use_extraction_cache": False, + "use_transformation_cache": True, + }, + split_config={ + "strategy": enums.ETLSplitStrategy.CHUNK.value, + "chunk_size": chunk_size, + }, + transform_config={ + "transformers": [ + { # NOTE: __call_gpt_with_key only reads user_prompt + "enabled": True, + "name": enums.ETLTransformer.CLEANSE.value, + "system_prompt": None, + "user_prompt": None, + }, + { + "enabled": True, + "name": enums.ETLTransformer.TEXT_TO_TABLE.value, + "system_prompt": None, + "user_prompt": None, + }, + { + "enabled": False, + "name": enums.ETLTransformer.SUMMARIZE.value, + "system_prompt": 
None, + "user_prompt": None, + }, + ] + }, + load_config={ + "refinery_project": {"enabled": False, "id": None}, + "markdown_file": {"enabled": True, "id": str(markdown_file.id)}, + }, + notify_config={ + "http": { + "url": "http://cognition-gateway:80/etl/finished/{markdown_file_id}", + "format": { + "markdown_file_id": str(markdown_file.id), + }, + "method": "POST", + } + }, priority=priority, ) From 5e0aafee5fdee167ab00ae4ac54dbf530a65d70d Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 11:21:32 +0100 Subject: [PATCH 24/54] fix: markdown_file update after etl_task creation --- controller/transfer/cognition/minio_upload.py | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 7371b4f6..30fb75a9 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -44,23 +44,6 @@ def handle_cognition_file_upload(path_parts: List[str]): == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value ): priority = 1 - # task_master_manager.queue_task( - # str(file_reference.organization_id), - # str(file_reference.created_by), - # TaskType.PARSE_COGNITION_FILE, - # { - # "parse_scope": FileCachingProcessingScope.EXTRACT_TRANSFORM.value, - # "file_reference_id": str(file_reference.id), - # "extraction_method": extraction_method, - # "meta_data": file_reference.meta_data, - # "extraction_key": file_reference.meta_data.get("extraction_key"), - # "transformation_key": file_reference.meta_data.get( - # "transformation_key" - # ), - # "file_name": file_reference.original_file_name, - # }, - # prio, # not sure if prio is right here as the prio tasks should only take < 1 min but waiting for the normal queue will take ages depending on the queue - # ) markdown_file = markdown_file_bo.get( org_id, file_reference.meta_data.get("markdown_file_id") @@ -135,6 +118,10 @@ def 
handle_cognition_file_upload(path_parts: List[str]): priority=priority, ) + markdown_file_bo.update( + org_id=org_id, markdown_file_id=markdown_file.id, etl_task_id=etl_task.id + ) + task_master_manager.queue_task( org_id, str(file_reference.created_by), From 709d22d8902b64c28ecd7865b8706906ec639a22 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 11:46:09 +0100 Subject: [PATCH 25/54] chore: merge dev --- ...9_added_incognito_mode_to_conversations.py | 28 ++++++++++ .../versions/199a0d8aefbe_new_org_column.py | 56 +++++++++++++++++++ ...ccb41_dummy_alembic_to_update_incognito.py | 35 ++++++++++++ app.py | 1 + controller/auth/kratos.py | 20 ++++++- controller/auth/manager.py | 34 +++++++---- controller/user/manager.py | 39 +++++-------- fast_api/routes/organization.py | 5 +- submodules/model | 2 +- util/clean_up.py | 17 ++++++ 10 files changed, 195 insertions(+), 42 deletions(-) create mode 100644 alembic/versions/059f0d62a6b9_added_incognito_mode_to_conversations.py create mode 100644 alembic/versions/199a0d8aefbe_new_org_column.py create mode 100644 alembic/versions/c6d1cbcccb41_dummy_alembic_to_update_incognito.py diff --git a/alembic/versions/059f0d62a6b9_added_incognito_mode_to_conversations.py b/alembic/versions/059f0d62a6b9_added_incognito_mode_to_conversations.py new file mode 100644 index 00000000..633d8b0d --- /dev/null +++ b/alembic/versions/059f0d62a6b9_added_incognito_mode_to_conversations.py @@ -0,0 +1,28 @@ +"""Added incognito mode to conversations + +Revision ID: 059f0d62a6b9 +Revises: 24ca8432bd8b +Create Date: 2025-10-15 14:17:59.216693 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '059f0d62a6b9' +down_revision = '24ca8432bd8b' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column('conversation', sa.Column('incognito_mode', sa.Boolean(), nullable=True), schema='cognition') + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('conversation', 'incognito_mode', schema='cognition') + # ### end Alembic commands ### diff --git a/alembic/versions/199a0d8aefbe_new_org_column.py b/alembic/versions/199a0d8aefbe_new_org_column.py new file mode 100644 index 00000000..d08c2e8b --- /dev/null +++ b/alembic/versions/199a0d8aefbe_new_org_column.py @@ -0,0 +1,56 @@ +"""new org column + +Revision ID: 199a0d8aefbe +Revises: c6d1cbcccb41 +Create Date: 2025-10-20 07:36:33.488523 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "199a0d8aefbe" +down_revision = "c6d1cbcccb41" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "user", sa.Column("messages_created_this_month", sa.BigInteger(), nullable=True) + ) + + op.create_table( + "timed_executions", + sa.Column("time_key", sa.String(), nullable=False), + sa.Column("last_executed_at", sa.DateTime(), nullable=True), + sa.PrimaryKeyConstraint("time_key"), + sa.UniqueConstraint("time_key"), + schema="global", + ) + connection = op.get_bind() + + update_dataset_sql = """ + UPDATE public.user + SET messages_created_this_month = 0 + WHERE messages_created_this_month IS NULL + """ + connection.execute(update_dataset_sql) + op.drop_column("user", "oidc_identifier") + op.create_unique_constraint(None, "timed_executions", ["time_key"], schema="global") + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column("user", "messages_created_this_month") + op.drop_table("timed_executions", schema="global") + op.drop_constraint(None, "timed_executions", schema="global", type_="unique") + op.add_column( + "user", + sa.Column("oidc_identifier", sa.VARCHAR(), autoincrement=False, nullable=True), + ) + # ### end Alembic commands ### diff --git a/alembic/versions/c6d1cbcccb41_dummy_alembic_to_update_incognito.py b/alembic/versions/c6d1cbcccb41_dummy_alembic_to_update_incognito.py new file mode 100644 index 00000000..6caac863 --- /dev/null +++ b/alembic/versions/c6d1cbcccb41_dummy_alembic_to_update_incognito.py @@ -0,0 +1,35 @@ +"""Dummy alembic to update incognito + +Revision ID: c6d1cbcccb41 +Revises: 059f0d62a6b9 +Create Date: 2025-10-27 15:38:06.608300 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "c6d1cbcccb41" +down_revision = "059f0d62a6b9" +branch_labels = None +depends_on = None + + +def upgrade(): + connection = op.get_bind() + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### + update_dataset_sql = """ + UPDATE cognition.conversation + SET incognito_mode = FALSE + WHERE incognito_mode IS NULL; """ + connection.execute(update_dataset_sql) + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + pass + # ### end Alembic commands ### diff --git a/app.py b/app.py index a5998a7d..6c973e29 100644 --- a/app.py +++ b/app.py @@ -185,3 +185,4 @@ session.start_session_cleanup_thread() log_storage.start_persist_thread() sums_table_manager.start_sums_table_thread() +clean_up.start_timed_executions_thread() diff --git a/controller/auth/kratos.py b/controller/auth/kratos.py index a93c09ef..bd7b7b31 100644 --- a/controller/auth/kratos.py +++ b/controller/auth/kratos.py @@ -254,7 +254,7 @@ def email_with_link(to_email: str, recovery_link: str) -> None: f"{LANGUAGE_MESSAGES['de']}{recovery_link}\n\n{LANGUAGE_EXPIRATION_INFO['de']}\n\n\n------\n\n{LANGUAGE_MESSAGES['en']}{recovery_link}\n\n{LANGUAGE_EXPIRATION_INFO['en']}", ) msg["Subject"] = INVITATION_SUBJECT - msg["From"] = "no-reply@kern.ai" + msg["From"] = "signup@kern.ai" msg["To"] = to_email with smtplib.SMTP(SMTP_HOST, SMTP_PORT) as server: @@ -265,6 +265,24 @@ def email_with_link(to_email: str, recovery_link: str) -> None: server.send_message(msg) +def send_bulk_emails(emails: List[str], recovery_links: List[str]) -> None: + + with smtplib.SMTP(SMTP_HOST, SMTP_PORT) as server: + if SMTP_USER and SMTP_PASSWORD: + server.ehlo() + server.starttls() + server.login(SMTP_USER, SMTP_PASSWORD) + + for to_email, recovery_link in zip(emails, recovery_links): + msg = MIMEText( + f"{LANGUAGE_MESSAGES['de']}{recovery_link}\n\n{LANGUAGE_EXPIRATION_INFO['de']}\n\n\n------\n\n{LANGUAGE_MESSAGES['en']}{recovery_link}\n\n{LANGUAGE_EXPIRATION_INFO['en']}", + ) + msg["Subject"] = INVITATION_SUBJECT + msg["From"] = "signup@kern.ai" + msg["To"] = to_email + server.send_message(msg) + + def check_user_exists(email: str) -> bool: request = requests.get( f"{KRATOS_ADMIN_URL}/identities?preview_credentials_identifier_similar={quote(email)}" diff --git a/controller/auth/manager.py b/controller/auth/manager.py index b002d7a5..ea30f34c 100644 --- a/controller/auth/manager.py +++ b/controller/auth/manager.py @@ -12,7 +12,7 @@ from 
controller.user import manager as user_manager from controller.organization import manager as organization_manager from submodules.model import enums, exceptions -from submodules.model.business_objects import organization +from submodules.model.business_objects import general, organization from submodules.model.business_objects.user import check_email_in_full_admin from submodules.model.models import Organization, Project, User import sqlalchemy @@ -183,32 +183,44 @@ def invite_users( team_ids: Optional[List[str]] = None, ): user_ids = [] + recovery_links = [] + organization = organization_manager.get_organization_by_name(organization_name) + if organization is None: + raise exceptions.EntityNotFoundException("Organization not found") for email in emails: # Create accounts for the email user = kratos.create_user_kratos(email, provider) if not user: raise AuthManagerError("User creation failed") user_ids.append(user["id"]) - # Assign the account to the organization - user_manager.update_organization_of_user(organization_name, email) + user_database = user_manager.get_or_create_user(user["id"], with_commit=False) + if not user_database: + raise AuthManagerError("User creation in database failed") - # Assign the user role - user_manager.update_user_role(user["id"], user_role) + user_database.language_display = language - # Add the preferred language - user_manager.update_user_field(user["id"], "language_display", language) + try: + role = enums.UserRoles[user_role.upper()].value + except KeyError: + raise ValueError(f"Invalid role: {user_role}") + user_database.role = role + user_database.organization_id = organization.id # Add the user to the teams if team_ids: - user_manager.add_user_to_teams(creation_user_id, user["id"], team_ids) + user_manager.add_user_to_teams( + creation_user_id, user["id"], team_ids, with_commit=False + ) # Get the recovery link for the email recovery_link = kratos.get_recovery_link(user["id"]) if not recovery_link: raise AuthManagerError("Failed 
to get recovery link") - - # Send the recovery link to the email - kratos.email_with_link(email, recovery_link["recovery_link"]) + recovery_links.append(recovery_link["recovery_link"]) + general.commit() + kratos.send_bulk_emails(emails, recovery_links) + kratos.__refresh_identity_cache() + organization_manager.sync_organization_sharepoint_integrations(organization.id) return user_ids diff --git a/controller/user/manager.py b/controller/user/manager.py index ea760d2e..81df8e35 100644 --- a/controller/user/manager.py +++ b/controller/user/manager.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Any +from typing import Dict, List, Optional, Any from submodules.model import User, daemon, enums from submodules.model.business_objects import user, general from controller.auth import kratos @@ -17,12 +17,14 @@ def get_user(user_id: str) -> User: return user_item -def get_or_create_user(user_id: str) -> User: +def get_or_create_user(user_id: str, with_commit: bool = True) -> User: user_item = user.get(user_id) if not user_item: - user_item = user.create(user_id, with_commit=True) - kratos.__refresh_identity_cache() - update_last_interaction(user_item.id) + user_item = user.create(user_id, with_commit=with_commit) + if with_commit: + kratos.__refresh_identity_cache() + else: + update_last_interaction(user_item.id) return user_item @@ -89,10 +91,13 @@ def update_user_field(user_id: str, field: str, value: Any) -> User: return user_item -def add_user_to_teams(creation_user_id: str, user_id: str, team_ids: list) -> User: +def add_user_to_teams( + creation_user_id: str, user_id: str, team_ids: list, with_commit: bool = True +) -> User: for team_id in team_ids: team_member_db_co.create(team_id, user_id, creation_user_id, with_commit=False) - general.commit() + if with_commit: + general.commit() def remove_organization_from_user(user_mail: str) -> None: @@ -113,7 +118,7 @@ def get_active_users_filtered( sort_direction: Optional[str] = None, offset: Optional[int] = None, 
limit: Optional[int] = None, -) -> User: +) -> List[User]: now = datetime.now() last_interaction_range = (now - timedelta(minutes=minutes)) if minutes > 0 else None return user.get_active_users_after_filter( @@ -175,22 +180,4 @@ def __migrate_kratos_users(): if user_database.sso_provider != sso_provider: user_database.sso_provider = sso_provider - if user_database.oidc_identifier is None: - user_search = kratos.__search_kratos_for_user_mail( - user_identity["traits"]["email"] - ) - if user_search and user_search["credentials"]: - if user_search["credentials"].get("oidc", None): - oidc = ( - user_search["credentials"] - .get("oidc", {}) - .get("identifiers", None)[0] - ) - if oidc: - oidc = oidc.split(":") - if len(oidc) > 1: - user_database.oidc_identifier = oidc[1] - else: - user_database.oidc_identifier = None - general.commit() diff --git a/fast_api/routes/organization.py b/fast_api/routes/organization.py index e6cba633..bbd5fe22 100644 --- a/fast_api/routes/organization.py +++ b/fast_api/routes/organization.py @@ -1,5 +1,4 @@ import json -from controller.auth import kratos from fastapi import APIRouter, Request, Body from fast_api.models import ( AddUserToOrganizationBody, @@ -95,7 +94,6 @@ def get_user_info(request: Request): # in use cognition-ui & admin dashboard (07.01.25) @router.get("/get-user-info-extended") def get_user_info_extended(request: Request): - kratos.__refresh_identity_cache() user = auth_manager.get_user_by_info(request.state.info) name = resolve_user_name_by_id(user.id) user_dict = { @@ -282,6 +280,7 @@ def get_mapped_sorted_paginated_users( "created_at": user.created_at.isoformat() if user.created_at else None, "metadata_public": user.metadata_public, "sso_provider": user.sso_provider, + "messages_created_this_month": user.messages_created_this_month, } for user in active_users ] @@ -304,7 +303,7 @@ def delete_user(request: Request, body: DeleteUserBody = Body(...)): # in use admin-dashboard (08.01.25) 
-@router.post("/missing-users-interaction") +@router.post("/missing-users-interaction-and-message-count") def get_missing_users_interaction(request: Request, body: MissingUsersBody = Body(...)): auth_manager.check_admin_access(request.state.info) data = user.get_missing_users(body.user_ids) diff --git a/submodules/model b/submodules/model index f5681d71..7efb80fa 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit f5681d719c62ad33259696274049239910707ed0 +Subproject commit 7efb80fa7dfa0619d4a9549cf69478e4cb23cb5c diff --git a/util/clean_up.py b/util/clean_up.py index 6ef88b98..97d4a54d 100644 --- a/util/clean_up.py +++ b/util/clean_up.py @@ -1,6 +1,9 @@ from submodules.model.business_objects import upload_task import os import shutil +from submodules.model.daemon import run_without_db_token +from time import sleep +from submodules.model.global_objects import timed_executions def clean_up_database() -> None: @@ -21,3 +24,17 @@ def clean_up_disk() -> None: shutil.rmtree(file_path) except Exception as e: print("Failed to delete %s. 
Reason: %s" % (file_path, e)) + + +def start_timed_executions_thread() -> None: + run_without_db_token(__run_timed_executions) + + +def __run_timed_executions() -> None: + sleep(10) # wait a bit until app is started + while True: + try: + timed_executions.execute_time_key_update(with_commit=True) + except Exception as e: + print(f"Error during timed executions: {e}") + sleep(3600) # run every hour From b07fc7ddcb7775ce1740804c8bcc7b85ae4bc15b Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 11:46:31 +0100 Subject: [PATCH 26/54] perf: add etl task table --- ...ble.py => f428a22ecdb3_adds_etl_task_table.py} | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) rename alembic/versions/{bf8e8646ebdc_adds_etl_task_table.py => f428a22ecdb3_adds_etl_task_table.py} (97%) diff --git a/alembic/versions/bf8e8646ebdc_adds_etl_task_table.py b/alembic/versions/f428a22ecdb3_adds_etl_task_table.py similarity index 97% rename from alembic/versions/bf8e8646ebdc_adds_etl_task_table.py rename to alembic/versions/f428a22ecdb3_adds_etl_task_table.py index fc475d48..952f49cf 100644 --- a/alembic/versions/bf8e8646ebdc_adds_etl_task_table.py +++ b/alembic/versions/f428a22ecdb3_adds_etl_task_table.py @@ -1,8 +1,8 @@ """adds etl task table -Revision ID: bf8e8646ebdc -Revises: 24ca8432bd8b -Create Date: 2025-10-30 00:21:05.246324 +Revision ID: f428a22ecdb3 +Revises: 199a0d8aefbe +Create Date: 2025-10-30 10:45:20.843280 """ @@ -11,8 +11,8 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. 
-revision = "bf8e8646ebdc" -down_revision = "24ca8432bd8b" +revision = "f428a22ecdb3" +down_revision = "199a0d8aefbe" branch_labels = None depends_on = None @@ -62,11 +62,6 @@ def upgrade(): unique=False, schema="global", ) - op.add_column( - "conversation", - sa.Column("incognito_mode", sa.Boolean(), nullable=True), - schema="cognition", - ) op.add_column( "markdown_file", sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), From 41c9573f8596c799a41ee863ab22f1098648f1fd Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 12:25:22 +0100 Subject: [PATCH 27/54] perf: disable CLEANSE as default --- controller/transfer/cognition/etl.py | 107 ------------------ controller/transfer/cognition/minio_upload.py | 2 +- 2 files changed, 1 insertion(+), 108 deletions(-) delete mode 100644 controller/transfer/cognition/etl.py diff --git a/controller/transfer/cognition/etl.py b/controller/transfer/cognition/etl.py deleted file mode 100644 index 47e17a27..00000000 --- a/controller/transfer/cognition/etl.py +++ /dev/null @@ -1,107 +0,0 @@ -from typing import Optional - -from submodules.model import enums -from submodules.model.models import ( - EtlTask, - CognitionMarkdownFile, - CognitionMarkdownDataset, -) -from submodules.model.global_objects import etl_task as etl_task_bo -from submodules.model.cognition_objects import markdown_file as markdown_file_bo - -DEFAULT_FILE_TYPE = enums.ETLFileType.PDF -DEFAULT_EXTRACTORS = { - enums.ETLFileType.MD: enums.ETLExtractor.MD.FILESYSTEM, - enums.ETLFileType.PDF: enums.ETLExtractor.PDF.PDF2MD, -} - -DEFAULT_FALLBACK_EXTRACTORS = { - enums.ETLFileType.PDF: [ - enums.ETLExtractor.PDF.PDF2MD, - enums.ETLExtractor.PDF.VISION, - enums.ETLExtractor.PDF.AZURE_DI, - ], -} - - -def get_or_create_task( - markdown_file: CognitionMarkdownFile, - markdown_dataset: CognitionMarkdownDataset, - file_size_bytes: int, - minio_path: str, - original_file_name: str, - file_type: Optional[enums.ETLFileType] = None, - 
extractor: Optional[enums.ETLExtractor] = None, - fallback_extractors: Optional[list[enums.ETLExtractor]] = None, - split_strategy: Optional[enums.ETLSplitStrategy] = None, - chunk_size: Optional[int] = 1000, - priority: Optional[int] = -1, -) -> EtlTask: - if markdown_file.etl_task_id: - if etl_task := etl_task_bo.get_by_id(markdown_file.etl_task_id): - return etl_task - - file_type = file_type or DEFAULT_FILE_TYPE - split_strategy = split_strategy or enums.ETLSplitStrategy.CHUNK - extractor = extractor or DEFAULT_EXTRACTORS[file_type] - fallback_extractors = list( - filter( - lambda x: x != extractor, - (fallback_extractors or DEFAULT_FALLBACK_EXTRACTORS.get(file_type, [])), - ) - ) - - etl_task = etl_task_bo.create( - org_id=markdown_dataset.organization_id, - user_id=markdown_file.created_by, - file_size_bytes=file_size_bytes, - extract_config={ - "file_type": file_type.value, - "extractor": extractor.value, - "fallback_extractors": [fe.value for fe in fallback_extractors], - "minio_path": minio_path, - "original_file_name": original_file_name, - }, - split_config={ - "strategy": split_strategy.value, - "chunk_size": chunk_size, - }, - transform_config={ - "transformers": [ - { - "name": enums.ETLTransformer.CLEANSE.value, - "system_prompt": None, - "user_prompt": None, - }, - { - "name": enums.ETLTransformer.TEXT_TO_TABLE.value, - "system_prompt": None, - "user_prompt": None, - }, - ] - }, - load_config={ - "refinery_project": {"enabled": False, "id": None}, - "markdown_file": {"enabled": True, "id": str(markdown_file.id)}, - }, - notify_config={ - "http": { - "url": "http://cognition-gateway:80/etl/finished/{markdown_file_id}", - "format": { - "markdown_file_id": str(markdown_file.id), - }, - "method": "POST", - } - }, - llm_config=markdown_dataset.llm_config, - tokenizer=markdown_dataset.tokenizer, - priority=priority, - ) - - markdown_file_bo.update( - org_id=markdown_file.organization_id, - markdown_file_id=markdown_file.id, - etl_task_id=etl_task.id, - ) 
- - return etl_task diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 30fb75a9..c5a34b36 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -83,7 +83,7 @@ def handle_cognition_file_upload(path_parts: List[str]): transform_config={ "transformers": [ { # NOTE: __call_gpt_with_key only reads user_prompt - "enabled": True, + "enabled": False, # this transformer is disabled because it often hangs the ETL process "name": enums.ETLTransformer.CLEANSE.value, "system_prompt": None, "user_prompt": None, From 2554a43809814fb3b7282781d58df07d365e9c69 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 13:19:24 +0100 Subject: [PATCH 28/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 7efb80fa..e103f2f6 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 7efb80fa7dfa0619d4a9549cf69478e4cb23cb5c +Subproject commit e103f2f65f4764e2bc19c1e6d6b58d7390547f18 From 52f715e67ca50e4e15a1da75ef672657c4e1f212 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 30 Oct 2025 13:19:34 +0100 Subject: [PATCH 29/54] perf: standard cache config keys --- controller/transfer/cognition/minio_upload.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index c5a34b36..791f765e 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -72,9 +72,10 @@ def handle_cognition_file_upload(path_parts: List[str]): enums.ETLExtractorPDF.VISION, ], cache_config={ - "use_file_cache": True, - "use_extraction_cache": False, - "use_transformation_cache": True, + enums.ETLCacheKeys.FILE_CACHE.value: True, + enums.ETLCacheKeys.EXTRACTION.value: True, + 
enums.ETLCacheKeys.SPLITTING.value: True, + enums.ETLCacheKeys.TRANSFORMATION.value: True, }, split_config={ "strategy": enums.ETLSplitStrategy.CHUNK.value, From 356f38ee6d8c69158d2eb4090aa7f691cae40676 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Mon, 3 Nov 2025 14:30:23 +0100 Subject: [PATCH 30/54] Merge with dev --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index e103f2f6..293c1493 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit e103f2f65f4764e2bc19c1e6d6b58d7390547f18 +Subproject commit 293c14937f8f8d48ac582541cb083988170f98f9 From 2770d256429ff25c9e9030396986a77de15bb84f Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Mon, 3 Nov 2025 16:29:39 +0100 Subject: [PATCH 31/54] Alembic new table & submodule fix import --- alembic/versions/9d5fb67e29f7_config_sets.py | 51 ++++++++++++++++++++ submodules/model | 2 +- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 alembic/versions/9d5fb67e29f7_config_sets.py diff --git a/alembic/versions/9d5fb67e29f7_config_sets.py b/alembic/versions/9d5fb67e29f7_config_sets.py new file mode 100644 index 00000000..a26eb0ac --- /dev/null +++ b/alembic/versions/9d5fb67e29f7_config_sets.py @@ -0,0 +1,51 @@ +"""Config sets' + + +Revision ID: 9d5fb67e29f7 +Revises: f428a22ecdb3 +Create Date: 2025-11-03 15:28:47.686657 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '9d5fb67e29f7' +down_revision = 'f428a22ecdb3' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('etl_config_preset', + sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False), + sa.Column('organization_id', postgresql.UUID(as_uuid=True), nullable=True), + sa.Column('project_id', postgresql.UUID(as_uuid=True), nullable=True), + sa.Column('name', sa.String(), nullable=True), + sa.Column('description', sa.String(), nullable=True), + sa.Column('created_at', sa.DateTime(), nullable=True), + sa.Column('created_by', postgresql.UUID(as_uuid=True), nullable=True), + sa.Column('etl_config', sa.JSON(), nullable=True), + sa.Column('add_config', sa.JSON(), nullable=True), + sa.ForeignKeyConstraint(['created_by'], ['user.id'], ondelete='SET NULL'), + sa.ForeignKeyConstraint(['organization_id'], ['organization.id'], ondelete='CASCADE'), + sa.ForeignKeyConstraint(['project_id'], ['cognition.project.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('name'), + schema='cognition' + ) + op.create_index(op.f('ix_cognition_etl_config_preset_created_by'), 'etl_config_preset', ['created_by'], unique=False, schema='cognition') + op.create_index(op.f('ix_cognition_etl_config_preset_organization_id'), 'etl_config_preset', ['organization_id'], unique=False, schema='cognition') + op.create_index(op.f('ix_cognition_etl_config_preset_project_id'), 'etl_config_preset', ['project_id'], unique=False, schema='cognition') + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f('ix_cognition_etl_config_preset_project_id'), table_name='etl_config_preset', schema='cognition') + op.drop_index(op.f('ix_cognition_etl_config_preset_organization_id'), table_name='etl_config_preset', schema='cognition') + op.drop_index(op.f('ix_cognition_etl_config_preset_created_by'), table_name='etl_config_preset', schema='cognition') + op.drop_table('etl_config_preset', schema='cognition') + # ### end Alembic commands ### diff --git a/submodules/model b/submodules/model index 293c1493..2d919530 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 293c14937f8f8d48ac582541cb083988170f98f9 +Subproject commit 2d919530f6c3f09c6b9f6b79481c99d520d857dd From b70404ef949e9fbddc1403c4c511de02a63862e1 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Wed, 12 Nov 2025 15:09:04 +0100 Subject: [PATCH 32/54] tmp commit --- controller/transfer/cognition/minio_upload.py | 48 ++++++++++++------- submodules/model | 2 +- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 791f765e..3f42090c 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -1,7 +1,7 @@ from typing import List from controller.task_master import manager as task_master_manager -from submodules.model import enums +from submodules.model import enums, etl_utils from submodules.model.business_objects import general from submodules.model.global_objects import etl_task as etl_task_bo from submodules.model.cognition_objects import ( @@ -12,9 +12,12 @@ def handle_cognition_file_upload(path_parts: List[str]): - + print(path_parts, flush=True) + # raise NotImplementedError("This function is not yet implemented.") if path_parts[1] != "_cognition" or len(path_parts) < 5: return + ##tmp doc retrieval => need to understand how .info file is an indicator for cognition gateway to pick it up + if 
path_parts[2] == "files" and path_parts[4].startswith("file_original"): org_id = path_parts[0] file_hash, file_size = path_parts[3].split("_") @@ -34,33 +37,45 @@ def handle_cognition_file_upload(path_parts: List[str]): print(f"File reference id: {str(file_reference.id)}", flush=True) print(f"File name: {file_reference.original_file_name}", flush=True) return - file_reference.state = enums.FileCachingState.COMPLETED.value - general.commit() - + # file_reference.state = enums.FileCachingState.COMPLETED.value + # general.commit() + # {"project_id": "c155f6f9-7731-4c18-84f3-2e68b4747037", "file_caching_initiator": "TMP_DOC_RETRIEVAL", "conversation_id": "75448474-1836-4bec-88ec-0df586c8ea0b", "extraction_method": "pdf2markdown", "file_name": "Local Explorer - Final- page2.pdf", "extraction_key": "pdf2markdown", "transformation_key": "PRIVATEMODE_AI"} chunk_size = 1000 - priority = -1 if ( file_reference.meta_data.get("transformation_initiator") == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value ): priority = 1 + # llm_config_extraction = proj + else: + priority = -1 - markdown_file = markdown_file_bo.get( - org_id, file_reference.meta_data.get("markdown_file_id") - ) - if not markdown_file: - print( - "ERROR: Markdown file not found for the given markdown_file_id", - flush=True, - ) - raise ValueError( - f"Markdown file not found for file reference {file_reference.id}" + markdown_file = markdown_file_bo.get( + org_id, file_reference.meta_data.get("markdown_file_id") ) + # if not markdown_file: + # print( + # "ERROR: Markdown file not found for the given markdown_file_id", + # flush=True, + # ) + # raise ValueError( + # f"Markdown file not found for file reference {file_reference.id}" + # ) markdown_dataset = markdown_dataset_bo.get( org_id=org_id, id=markdown_file.dataset_id ) + task_config = etl_utils.create_etl_config_for_tmp_doc( + extract_config={ + "file_type": markdown_file.category_origin, + "extractor": markdown_file.meta_data.get("extractor"), + 
"fallback_extractors": [fe.value for fe in fallback_extractors], + "minio_path": file_reference.minio_path, + "original_file_name": file_reference.original_file_name, + } + ) + etl_task = etl_task_bo.get_or_create_markdown_file_etl_task( org_id=org_id, file_reference=file_reference, @@ -128,4 +143,5 @@ def handle_cognition_file_upload(path_parts: List[str]): str(file_reference.created_by), enums.TaskType.EXECUTE_ETL, {"etl_task_id": str(etl_task.id)}, + priority=priority != -1, ) diff --git a/submodules/model b/submodules/model index 4882dbc2..3a2d1b95 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 4882dbc20ea95afaefe996e5e5b12a9157213f49 +Subproject commit 3a2d1b95a21ba4ee53de51419ae4d3a4de03bc87 From a86ef308eecf677060455950f63771d5c8b64ea9 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Fri, 14 Nov 2025 16:09:07 +0100 Subject: [PATCH 33/54] Tmp doc almost --- controller/transfer/cognition/minio_upload.py | 191 +++++++++--------- submodules/model | 2 +- 2 files changed, 95 insertions(+), 98 deletions(-) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 3f42090c..4059bba7 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -12,12 +12,11 @@ def handle_cognition_file_upload(path_parts: List[str]): - print(path_parts, flush=True) # raise NotImplementedError("This function is not yet implemented.") if path_parts[1] != "_cognition" or len(path_parts) < 5: return ##tmp doc retrieval => need to understand how .info file is an indicator for cognition gateway to pick it up - + print(path_parts, flush=True) if path_parts[2] == "files" and path_parts[4].startswith("file_original"): org_id = path_parts[0] file_hash, file_size = path_parts[3].split("_") @@ -37,111 +36,109 @@ def handle_cognition_file_upload(path_parts: List[str]): print(f"File reference id: {str(file_reference.id)}", flush=True) print(f"File name: 
{file_reference.original_file_name}", flush=True) return - # file_reference.state = enums.FileCachingState.COMPLETED.value - # general.commit() - # {"project_id": "c155f6f9-7731-4c18-84f3-2e68b4747037", "file_caching_initiator": "TMP_DOC_RETRIEVAL", "conversation_id": "75448474-1836-4bec-88ec-0df586c8ea0b", "extraction_method": "pdf2markdown", "file_name": "Local Explorer - Final- page2.pdf", "extraction_key": "pdf2markdown", "transformation_key": "PRIVATEMODE_AI"} - chunk_size = 1000 if ( - file_reference.meta_data.get("transformation_initiator") + file_reference.meta_data.get("file_caching_initiator") == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value ): - priority = 1 - # llm_config_extraction = proj + task_config, tokenizer = ( + etl_utils.create_etl_task_config_from_file_reference_tmp_doc( + file_reference + ) + ) + etl_task = etl_task_bo.create( + org_id, + file_reference.created_by, + file_reference.file_size_bytes, + full_config=task_config, + tokenizer=tokenizer, + priority=1, + ) + task_master_manager.queue_task( + org_id, + str(file_reference.created_by), + enums.TaskType.EXECUTE_ETL, + {"etl_task_id": str(etl_task.id)}, + priority=True, + ) + else: priority = -1 markdown_file = markdown_file_bo.get( org_id, file_reference.meta_data.get("markdown_file_id") ) - # if not markdown_file: - # print( - # "ERROR: Markdown file not found for the given markdown_file_id", - # flush=True, - # ) - # raise ValueError( - # f"Markdown file not found for file reference {file_reference.id}" - # ) - - markdown_dataset = markdown_dataset_bo.get( - org_id=org_id, id=markdown_file.dataset_id - ) + raise NotImplementedError("Non-tmp doc upload not implemented yet.") - task_config = etl_utils.create_etl_config_for_tmp_doc( - extract_config={ - "file_type": markdown_file.category_origin, - "extractor": markdown_file.meta_data.get("extractor"), - "fallback_extractors": [fe.value for fe in fallback_extractors], - "minio_path": file_reference.minio_path, - "original_file_name": 
file_reference.original_file_name, - } - ) + # markdown_dataset = markdown_dataset_bo.get( + # org_id=org_id, id=markdown_file.dataset_id + # ) - etl_task = etl_task_bo.get_or_create_markdown_file_etl_task( - org_id=org_id, - file_reference=file_reference, - markdown_file=markdown_file, - markdown_dataset=markdown_dataset, - extractor=markdown_file.meta_data.get("extractor"), - fallback_extractors=[ - enums.ETLExtractorPDF.PDF2MD, - enums.ETLExtractorPDF.VISION, - ], - cache_config={ - enums.ETLCacheKeys.FILE_CACHE.value: True, - enums.ETLCacheKeys.EXTRACTION.value: True, - enums.ETLCacheKeys.SPLITTING.value: True, - enums.ETLCacheKeys.TRANSFORMATION.value: True, - }, - split_config={ - "strategy": enums.ETLSplitStrategy.CHUNK.value, - "chunk_size": chunk_size, - }, - transform_config={ - "transformers": [ - { # NOTE: __call_gpt_with_key only reads user_prompt - "enabled": False, # this transformer is disabled because it often hangs the ETL process - "name": enums.ETLTransformer.CLEANSE.value, - "system_prompt": None, - "user_prompt": None, - }, - { - "enabled": True, - "name": enums.ETLTransformer.TEXT_TO_TABLE.value, - "system_prompt": None, - "user_prompt": None, - }, - { - "enabled": False, - "name": enums.ETLTransformer.SUMMARIZE.value, - "system_prompt": None, - "user_prompt": None, - }, - ] - }, - load_config={ - "refinery_project": {"enabled": False, "id": None}, - "markdown_file": {"enabled": True, "id": str(markdown_file.id)}, - }, - notify_config={ - "http": { - "url": "http://cognition-gateway:80/etl/finished/{markdown_file_id}", - "format": { - "markdown_file_id": str(markdown_file.id), - }, - "method": "POST", - } - }, - priority=priority, - ) + # etl_task = etl_task_bo.get_or_create_markdown_file_etl_task( + # org_id=org_id, + # file_reference=file_reference, + # markdown_file=markdown_file, + # markdown_dataset=markdown_dataset, + # extractor=markdown_file.meta_data.get("extractor"), + # fallback_extractors=[ + # enums.ETLExtractorPDF.PDF2MD, + # 
enums.ETLExtractorPDF.VISION, + # ], + # cache_config={ + # enums.ETLCacheKeys.FILE_CACHE.value: True, + # enums.ETLCacheKeys.EXTRACTION.value: True, + # enums.ETLCacheKeys.SPLITTING.value: True, + # enums.ETLCacheKeys.TRANSFORMATION.value: True, + # }, + # split_config={ + # "strategy": enums.ETLSplitStrategy.CHUNK.value, + # "chunk_size": chunk_size, + # }, + # transform_config={ + # "transformers": [ + # { # NOTE: __call_gpt_with_key only reads user_prompt + # "enabled": False, # this transformer is disabled because it often hangs the ETL process + # "name": enums.ETLTransformer.CLEANSE.value, + # "system_prompt": None, + # "user_prompt": None, + # }, + # { + # "enabled": True, + # "name": enums.ETLTransformer.TEXT_TO_TABLE.value, + # "system_prompt": None, + # "user_prompt": None, + # }, + # { + # "enabled": False, + # "name": enums.ETLTransformer.SUMMARIZE.value, + # "system_prompt": None, + # "user_prompt": None, + # }, + # ] + # }, + # load_config={ + # "refinery_project": {"enabled": False, "id": None}, + # "markdown_file": {"enabled": True, "id": str(markdown_file.id)}, + # }, + # notify_config={ + # "http": { + # "url": "http://cognition-gateway:80/etl/finished/{markdown_file_id}", + # "format": { + # "markdown_file_id": str(markdown_file.id), + # }, + # "method": "POST", + # } + # }, + # priority=priority, + # ) - markdown_file_bo.update( - org_id=org_id, markdown_file_id=markdown_file.id, etl_task_id=etl_task.id - ) + # markdown_file_bo.update( + # org_id=org_id, markdown_file_id=markdown_file.id, etl_task_id=etl_task.id + # ) - task_master_manager.queue_task( - org_id, - str(file_reference.created_by), - enums.TaskType.EXECUTE_ETL, - {"etl_task_id": str(etl_task.id)}, - priority=priority != -1, - ) + # task_master_manager.queue_task( + # org_id, + # str(file_reference.created_by), + # enums.TaskType.EXECUTE_ETL, + # {"etl_task_id": str(etl_task.id)}, + # priority=priority != -1, + # ) diff --git a/submodules/model b/submodules/model index 
3a2d1b95..5ff5c028 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 3a2d1b95a21ba4ee53de51419ae4d3a4de03bc87 +Subproject commit 5ff5c028cd18e8b5f95c272192931004dbeb1d00 From 0e1f1d86925c7b472d966669233a63ad4bb6c658 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Mon, 17 Nov 2025 16:03:04 +0100 Subject: [PATCH 34/54] No cache tmp doc working --- controller/transfer/cognition/minio_upload.py | 6 +++++- submodules/model | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 4059bba7..9c17b9c0 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -57,7 +57,11 @@ def handle_cognition_file_upload(path_parts: List[str]): org_id, str(file_reference.created_by), enums.TaskType.EXECUTE_ETL, - {"etl_task_id": str(etl_task.id)}, + { + "etl_task_id": str(etl_task.id), + "project_id": file_reference.meta_data.get("project_id"), + "conversation_id": file_reference.meta_data.get("conversation_id"), + }, priority=True, ) diff --git a/submodules/model b/submodules/model index 5ff5c028..dfc0d9d9 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 5ff5c028cd18e8b5f95c272192931004dbeb1d00 +Subproject commit dfc0d9d91adab29c099a2c6569bbb52a0fbaa753 From 726a31fd76f9db77d3afd76b47f6937847cf0684 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Tue, 18 Nov 2025 11:44:43 +0100 Subject: [PATCH 35/54] Submodule update --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index dfc0d9d9..8429ac1f 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit dfc0d9d91adab29c099a2c6569bbb52a0fbaa753 +Subproject commit 8429ac1fb1bd5c29c5324768fdea1aa90f25e798 From e94eb895682b9b1b8fd981ddb39988e0fd305adc Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Wed, 19 Nov 2025 
17:26:35 +0100 Subject: [PATCH 36/54] Ensure complete --- controller/transfer/cognition/minio_upload.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 9c17b9c0..ba88ae21 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -36,6 +36,9 @@ def handle_cognition_file_upload(path_parts: List[str]): print(f"File reference id: {str(file_reference.id)}", flush=True) print(f"File name: {file_reference.original_file_name}", flush=True) return + + file_reference.state = enums.FileCachingState.COMPLETED.value + general.commit() if ( file_reference.meta_data.get("file_caching_initiator") == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value From 356d8aedb0b706309fd3698e8d4361506885629a Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Sat, 22 Nov 2025 22:44:19 +0100 Subject: [PATCH 37/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 8429ac1f..43ab1255 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 8429ac1fb1bd5c29c5324768fdea1aa90f25e798 +Subproject commit 43ab1255631e88c1651782d8e97fbf3fbc31469c From 2b237e0655ab742e9e5a22cd9fb83752a79b5810 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Sat, 22 Nov 2025 22:44:50 +0100 Subject: [PATCH 38/54] perf(alembic): etl original file name --- alembic/versions/9d5fb67e29f7_config_sets.py | 86 +++++++++++++------ ...ab353410347b_add_etl_original_file_name.py | 33 +++++++ 2 files changed, 94 insertions(+), 25 deletions(-) create mode 100644 alembic/versions/ab353410347b_add_etl_original_file_name.py diff --git a/alembic/versions/9d5fb67e29f7_config_sets.py b/alembic/versions/9d5fb67e29f7_config_sets.py index a26eb0ac..bfa8c8ab 100644 --- a/alembic/versions/9d5fb67e29f7_config_sets.py +++ 
b/alembic/versions/9d5fb67e29f7_config_sets.py @@ -6,46 +6,82 @@ Create Date: 2025-11-03 15:28:47.686657 """ + from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = '9d5fb67e29f7' -down_revision = 'f428a22ecdb3' +revision = "9d5fb67e29f7" +down_revision = "f428a22ecdb3" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.create_table('etl_config_preset', - sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False), - sa.Column('organization_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('project_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('name', sa.String(), nullable=True), - sa.Column('description', sa.String(), nullable=True), - sa.Column('created_at', sa.DateTime(), nullable=True), - sa.Column('created_by', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('etl_config', sa.JSON(), nullable=True), - sa.Column('add_config', sa.JSON(), nullable=True), - sa.ForeignKeyConstraint(['created_by'], ['user.id'], ondelete='SET NULL'), - sa.ForeignKeyConstraint(['organization_id'], ['organization.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['project_id'], ['cognition.project.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id'), - sa.UniqueConstraint('name'), - schema='cognition' + op.create_table( + "etl_config_preset", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("project_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("name", sa.String(), nullable=True), + sa.Column("description", sa.String(), nullable=True), + sa.Column("created_at", sa.DateTime(), nullable=True), + sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("etl_config", sa.JSON(), nullable=True), + sa.Column("add_config", sa.JSON(), 
nullable=True), + sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint( + ["project_id"], ["cognition.project.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("name"), + schema="cognition", + ) + op.create_index( + op.f("ix_cognition_etl_config_preset_created_by"), + "etl_config_preset", + ["created_by"], + unique=False, + schema="cognition", + ) + op.create_index( + op.f("ix_cognition_etl_config_preset_organization_id"), + "etl_config_preset", + ["organization_id"], + unique=False, + schema="cognition", + ) + op.create_index( + op.f("ix_cognition_etl_config_preset_project_id"), + "etl_config_preset", + ["project_id"], + unique=False, + schema="cognition", ) - op.create_index(op.f('ix_cognition_etl_config_preset_created_by'), 'etl_config_preset', ['created_by'], unique=False, schema='cognition') - op.create_index(op.f('ix_cognition_etl_config_preset_organization_id'), 'etl_config_preset', ['organization_id'], unique=False, schema='cognition') - op.create_index(op.f('ix_cognition_etl_config_preset_project_id'), 'etl_config_preset', ['project_id'], unique=False, schema='cognition') # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### - op.drop_index(op.f('ix_cognition_etl_config_preset_project_id'), table_name='etl_config_preset', schema='cognition') - op.drop_index(op.f('ix_cognition_etl_config_preset_organization_id'), table_name='etl_config_preset', schema='cognition') - op.drop_index(op.f('ix_cognition_etl_config_preset_created_by'), table_name='etl_config_preset', schema='cognition') - op.drop_table('etl_config_preset', schema='cognition') + op.drop_index( + op.f("ix_cognition_etl_config_preset_project_id"), + table_name="etl_config_preset", + schema="cognition", + ) + op.drop_index( + op.f("ix_cognition_etl_config_preset_organization_id"), + table_name="etl_config_preset", + schema="cognition", + ) + op.drop_index( + op.f("ix_cognition_etl_config_preset_created_by"), + table_name="etl_config_preset", + schema="cognition", + ) + op.drop_table("etl_config_preset", schema="cognition") # ### end Alembic commands ### diff --git a/alembic/versions/ab353410347b_add_etl_original_file_name.py b/alembic/versions/ab353410347b_add_etl_original_file_name.py new file mode 100644 index 00000000..793a3b6a --- /dev/null +++ b/alembic/versions/ab353410347b_add_etl_original_file_name.py @@ -0,0 +1,33 @@ +"""add etl original file name + +Revision ID: ab353410347b +Revises: 9d5fb67e29f7 +Create Date: 2025-11-22 21:43:41.539486 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "ab353410347b" +down_revision = "9d5fb67e29f7" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "etl_task", + sa.Column("original_file_name", sa.String(), nullable=True), + schema="global", + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column("etl_task", "original_file_name", schema="global") + # ### end Alembic commands ### From 30f54d68c88b4fbd7714d6ef55287667d7205038 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 24 Nov 2025 00:08:53 +0100 Subject: [PATCH 39/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 43ab1255..aaa3eef7 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 43ab1255631e88c1651782d8e97fbf3fbc31469c +Subproject commit aaa3eef752b6415b9f81f7c351dd04da0ca2ed6c From d210b571dc8c7d61b756a15ff9cb6a7e1bc7f77b Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 24 Nov 2025 00:09:12 +0100 Subject: [PATCH 40/54] perf(alembic): update etl content --- ...31c4968699ad_add_etl_content_to_records.py | 55 +++++++++++++++++++ ...ab353410347b_add_etl_original_file_name.py | 33 ----------- 2 files changed, 55 insertions(+), 33 deletions(-) create mode 100644 alembic/versions/31c4968699ad_add_etl_content_to_records.py delete mode 100644 alembic/versions/ab353410347b_add_etl_original_file_name.py diff --git a/alembic/versions/31c4968699ad_add_etl_content_to_records.py b/alembic/versions/31c4968699ad_add_etl_content_to_records.py new file mode 100644 index 00000000..0a3a820f --- /dev/null +++ b/alembic/versions/31c4968699ad_add_etl_content_to_records.py @@ -0,0 +1,55 @@ +"""add etl content to records + +Revision ID: 31c4968699ad +Revises: 9d5fb67e29f7 +Create Date: 2025-11-23 23:08:27.327070 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "31c4968699ad" +down_revision = "9d5fb67e29f7" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column( + "etl_task", + sa.Column("original_file_name", sa.String(), nullable=True), + schema="global", + ) + op.add_column( + "github_file", + sa.Column("content", sa.String(), nullable=True), + schema="integration", + ) + op.add_column( + "github_issue", + sa.Column("content", sa.String(), nullable=True), + schema="integration", + ) + op.add_column( + "pdf", sa.Column("content", sa.String(), nullable=True), schema="integration" + ) + op.add_column( + "sharepoint", + sa.Column("content", sa.String(), nullable=True), + schema="integration", + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("sharepoint", "content", schema="integration") + op.drop_column("pdf", "content", schema="integration") + op.drop_column("github_issue", "content", schema="integration") + op.drop_column("github_file", "content", schema="integration") + op.drop_column("etl_task", "original_file_name", schema="global") + # ### end Alembic commands ### diff --git a/alembic/versions/ab353410347b_add_etl_original_file_name.py b/alembic/versions/ab353410347b_add_etl_original_file_name.py deleted file mode 100644 index 793a3b6a..00000000 --- a/alembic/versions/ab353410347b_add_etl_original_file_name.py +++ /dev/null @@ -1,33 +0,0 @@ -"""add etl original file name - -Revision ID: ab353410347b -Revises: 9d5fb67e29f7 -Create Date: 2025-11-22 21:43:41.539486 - -""" - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision = "ab353410347b" -down_revision = "9d5fb67e29f7" -branch_labels = None -depends_on = None - - -def upgrade(): - # ### commands auto generated by Alembic - please adjust! ### - op.add_column( - "etl_task", - sa.Column("original_file_name", sa.String(), nullable=True), - schema="global", - ) - # ### end Alembic commands ### - - -def downgrade(): - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_column("etl_task", "original_file_name", schema="global") - # ### end Alembic commands ### From 1fe4c2b59dce8cb95db4bb6d558e1835d29e7de8 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 24 Nov 2025 00:23:56 +0100 Subject: [PATCH 41/54] perf(alembic): full_admin table --- .../c4218a7d06e0_added_full_admin_table.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 alembic/versions/c4218a7d06e0_added_full_admin_table.py diff --git a/alembic/versions/c4218a7d06e0_added_full_admin_table.py b/alembic/versions/c4218a7d06e0_added_full_admin_table.py new file mode 100644 index 00000000..74304ccd --- /dev/null +++ b/alembic/versions/c4218a7d06e0_added_full_admin_table.py @@ -0,0 +1,49 @@ +"""Added full admin table + +Revision ID: c4218a7d06e0 +Revises: 31c4968699ad +Create Date: 2025-04-24 09:12:33.200446 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "c4218a7d06e0" +down_revision = "31c4968699ad" +branch_labels = None +depends_on = None + + +def upgrade(): + op.execute( + """ + INSERT INTO global.full_admin_access (id, email, meta_info) + VALUES + (gen_random_uuid(), 'l.lumburovska@accompio.com','{}'), + (gen_random_uuid(), 'j.wittmeyer@accompio.com','{}'), + (gen_random_uuid(), 'le.schmidt@accompio.com','{}'), + (gen_random_uuid(), 'a.hrelja@accompio.com','{}'), + (gen_random_uuid(), 'l.puettmann@accompio.com','{}'), + (gen_random_uuid(), 'j.hoetter@accompio.com','{}'), + (gen_random_uuid(), 'h.wenck@accompio.com','{}'), + (gen_random_uuid(), 'j.wirth@accompio.com','{}') + """ + ) + + +def downgrade(): + op.execute( + """ + DELETE FROM global.full_admin_access WHERE email IN ( + 'l.lumburovska@accompio.com', + 'j.wittmeyer@accompio.com', + 'le.schmidt@accompio.com', + 'a.hrelja@accompio.com', + 'l.puettmann@accompio.com', + 'j.hoetter@accompio.com', + 'h.wenck@accompio.com', + 'j.wirth@accompio.com' + ) + """ + ) From 72ad3737dbd090309897599e5607cd1557029ec3 Mon Sep 17 00:00:00 2001 
From: andhreljaKern Date: Tue, 25 Nov 2025 14:48:42 +0100 Subject: [PATCH 42/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index aaa3eef7..b8f71687 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit aaa3eef752b6415b9f81f7c351dd04da0ca2ed6c +Subproject commit b8f71687f6aeeeeffdfd1b318f07d899806a977e From 777bc3945260a261ede8cb5afc45b7c1a9e37206 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 25 Nov 2025 14:49:25 +0100 Subject: [PATCH 43/54] perf: tmp doc etl task metadata --- controller/transfer/cognition/minio_upload.py | 219 ++++++++---------- 1 file changed, 95 insertions(+), 124 deletions(-) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index ba88ae21..e578a65e 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -8,6 +8,7 @@ file_reference as file_reference_db_bo, markdown_file as markdown_file_bo, markdown_dataset as markdown_dataset_bo, + project as cognition_project_bo, ) @@ -16,136 +17,106 @@ def handle_cognition_file_upload(path_parts: List[str]): if path_parts[1] != "_cognition" or len(path_parts) < 5: return ##tmp doc retrieval => need to understand how .info file is an indicator for cognition gateway to pick it up - print(path_parts, flush=True) - if path_parts[2] == "files" and path_parts[4].startswith("file_original"): - org_id = path_parts[0] - file_hash, file_size = path_parts[3].split("_") - file_reference = file_reference_db_bo.get(org_id, file_hash, int(file_size)) + if not (path_parts[2] == "files" and path_parts[4].startswith("file_original")): + return + + org_id = path_parts[0] + file_hash, file_size = path_parts[3].split("_") + file_reference = file_reference_db_bo.get(org_id, file_hash, int(file_size)) + + if ( + not file_reference + or file_reference.state == 
enums.FileCachingState.RUNNING.value + or file_reference.state == enums.FileCachingState.COMPLETED.value + ): + # file_reference is None or already processed in queue + print( + "File reference duplication error, file is already processed", + flush=True, + ) + if file_reference: + print(f"File reference id: {str(file_reference.id)}", flush=True) + print(f"File name: {file_reference.original_file_name}", flush=True) + return + + file_reference.state = enums.FileCachingState.COMPLETED.value + general.commit() + + if ( + file_reference.meta_data.get("file_caching_initiator") + == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value + ): + project_id = file_reference.meta_data.get("project_id") + conversation_id = file_reference.meta_data.get("conversation_id") + project_item = cognition_project_bo.get(project_id) - if ( - not file_reference - or file_reference.state == enums.FileCachingState.RUNNING.value - or file_reference.state == enums.FileCachingState.COMPLETED.value - ): - # file_reference is None or already processed in queue - print( - "File reference duplication error, file is already processed", - flush=True, - ) - if file_reference: - print(f"File reference id: {str(file_reference.id)}", flush=True) - print(f"File name: {file_reference.original_file_name}", flush=True) - return + etl_task = etl_task_bo.create( + org_id, + file_reference.created_by, + file_reference.original_file_name, + file_reference.file_size_bytes, + full_config=etl_utils.get_full_config_for_tmp_doc( + file_reference, + project_item, + conversation_id, + ), + tokenizer=project_item.tokenizer, + priority=1, + ) - file_reference.state = enums.FileCachingState.COMPLETED.value - general.commit() - if ( - file_reference.meta_data.get("file_caching_initiator") - == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value - ): - task_config, tokenizer = ( - etl_utils.create_etl_task_config_from_file_reference_tmp_doc( - file_reference - ) - ) - etl_task = etl_task_bo.create( - org_id, - 
file_reference.created_by, - file_reference.file_size_bytes, - full_config=task_config, - tokenizer=tokenizer, - priority=1, - ) - task_master_manager.queue_task( - org_id, - str(file_reference.created_by), - enums.TaskType.EXECUTE_ETL, - { - "etl_task_id": str(etl_task.id), - "project_id": file_reference.meta_data.get("project_id"), - "conversation_id": file_reference.meta_data.get("conversation_id"), + task_master_manager.queue_task( + org_id, + str(file_reference.created_by), + enums.TaskType.EXECUTE_ETL, + { + "etl_task_id": str(etl_task.id), + "file_reference_id": str(file_reference.id), + "tmp_doc_metadata": { + "project_id": project_id, + "conversation_id": conversation_id, }, - priority=True, - ) + }, + priority=True, + ) - else: - priority = -1 + else: + priority = -1 - markdown_file = markdown_file_bo.get( - org_id, file_reference.meta_data.get("markdown_file_id") - ) - raise NotImplementedError("Non-tmp doc upload not implemented yet.") + markdown_dataset = markdown_dataset_bo.get( + org_id, file_reference.meta_data.get("dataset_id") + ) - # markdown_dataset = markdown_dataset_bo.get( - # org_id=org_id, id=markdown_file.dataset_id - # ) + markdown_file = markdown_file_bo.get( + org_id, file_reference.meta_data.get("markdown_file_id") + ) - # etl_task = etl_task_bo.get_or_create_markdown_file_etl_task( - # org_id=org_id, - # file_reference=file_reference, - # markdown_file=markdown_file, - # markdown_dataset=markdown_dataset, - # extractor=markdown_file.meta_data.get("extractor"), - # fallback_extractors=[ - # enums.ETLExtractorPDF.PDF2MD, - # enums.ETLExtractorPDF.VISION, - # ], - # cache_config={ - # enums.ETLCacheKeys.FILE_CACHE.value: True, - # enums.ETLCacheKeys.EXTRACTION.value: True, - # enums.ETLCacheKeys.SPLITTING.value: True, - # enums.ETLCacheKeys.TRANSFORMATION.value: True, - # }, - # split_config={ - # "strategy": enums.ETLSplitStrategy.CHUNK.value, - # "chunk_size": chunk_size, - # }, - # transform_config={ - # "transformers": [ - # { # 
NOTE: __call_gpt_with_key only reads user_prompt - # "enabled": False, # this transformer is disabled because it often hangs the ETL process - # "name": enums.ETLTransformer.CLEANSE.value, - # "system_prompt": None, - # "user_prompt": None, - # }, - # { - # "enabled": True, - # "name": enums.ETLTransformer.TEXT_TO_TABLE.value, - # "system_prompt": None, - # "user_prompt": None, - # }, - # { - # "enabled": False, - # "name": enums.ETLTransformer.SUMMARIZE.value, - # "system_prompt": None, - # "user_prompt": None, - # }, - # ] - # }, - # load_config={ - # "refinery_project": {"enabled": False, "id": None}, - # "markdown_file": {"enabled": True, "id": str(markdown_file.id)}, - # }, - # notify_config={ - # "http": { - # "url": "http://cognition-gateway:80/etl/finished/{markdown_file_id}", - # "format": { - # "markdown_file_id": str(markdown_file.id), - # }, - # "method": "POST", - # } - # }, - # priority=priority, - # ) + etl_task = etl_task_bo.create( + org_id, + file_reference.created_by, + file_reference.original_file_name, + file_reference.file_size_bytes, + full_config=etl_utils.get_full_config_for_markdown_file( + file_reference, + markdown_dataset, + markdown_file, + ), + tokenizer=markdown_dataset.tokenizer, + priority=priority, + ) - # markdown_file_bo.update( - # org_id=org_id, markdown_file_id=markdown_file.id, etl_task_id=etl_task.id - # ) + markdown_file_bo.update( + org_id=org_id, + markdown_file_id=markdown_file.id, + etl_task_id=etl_task.id, + ) - # task_master_manager.queue_task( - # org_id, - # str(file_reference.created_by), - # enums.TaskType.EXECUTE_ETL, - # {"etl_task_id": str(etl_task.id)}, - # priority=priority != -1, - # ) + task_master_manager.queue_task( + org_id, + str(file_reference.created_by), + enums.TaskType.EXECUTE_ETL, + { + "etl_task_id": str(etl_task.id), + "file_reference_id": str(file_reference.id), + }, + priority=priority != -1, + ) From 8f546faddb0d290040be822536deb1c42320393d Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: 
Wed, 26 Nov 2025 10:16:40 +0100 Subject: [PATCH 44/54] Conversion method + alembic --- ...emove_cognition_project_fields_for_new_.py | 218 ++++++++++++++++++ submodules/model | 2 +- 2 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py diff --git a/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py b/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py new file mode 100644 index 00000000..5cb9d534 --- /dev/null +++ b/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py @@ -0,0 +1,218 @@ +"""remove cognition project fields for new etl + +Revision ID: 64874114490b +Revises: c4218a7d06e0 +Create Date: 2025-11-25 15:49:30.097610 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +import json +import uuid +from submodules.model.enums import LLMProvider + +# revision identifiers, used by Alembic. +revision = "64874114490b" +down_revision = "c4218a7d06e0" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column( + "project", + sa.Column("useable_etl_configurations", sa.JSON(), nullable=True), + schema="cognition", + ) + op.drop_constraint( + "etl_config_preset_name_key", + "etl_config_preset", + schema="cognition", + type_="unique", + ) + op.drop_index( + "ix_cognition_etl_config_preset_project_id", + table_name="etl_config_preset", + schema="cognition", + ) + op.drop_constraint( + "etl_config_preset_project_id_fkey", + "etl_config_preset", + schema="cognition", + type_="foreignkey", + ) + op.drop_column("etl_config_preset", "project_id", schema="cognition") + __conversion_helper() + op.drop_column("project", "tokenizer", schema="cognition") + op.drop_column("project", "llm_config", schema="cognition") + # ### end Alembic commands ### + + +def downgrade(): + op.drop_column("project", "useable_etl_configurations", schema="cognition") + op.add_column( + "etl_config_preset", + sa.Column("project_id", postgresql.UUID(), autoincrement=False, nullable=True), + schema="cognition", + ) + op.create_foreign_key( + "etl_config_preset_project_id_fkey", + "etl_config_preset", + "project", + ["project_id"], + ["id"], + source_schema="cognition", + referent_schema="cognition", + ondelete="CASCADE", + ) + op.create_index( + "ix_cognition_etl_config_preset_project_id", + "etl_config_preset", + ["project_id"], + unique=False, + schema="cognition", + ) + op.create_unique_constraint( + "etl_config_preset_name_key", "etl_config_preset", ["name"], schema="cognition" + ) + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column( + "project", + sa.Column( + "llm_config", + postgresql.JSON(astext_type=sa.Text()), + autoincrement=False, + nullable=True, + ), + schema="cognition", + ) + op.add_column( + "project", + sa.Column("tokenizer", sa.VARCHAR(), autoincrement=False, nullable=True), + schema="cognition", + ) + # ### end Alembic commands ### + + +def __conversion_helper(): + + connection = op.get_bind() + select_sql = """ + SELECT id,organization_id, llm_config, tokenizer, name, created_by, created_at + FROM cognition.project + WHERE (llm_config IS NOT NULL OR tokenizer IS NOT NULL) + AND allow_file_upload = true; + """ + result = connection.execute(select_sql) + + # Step 2: Process each row + for row in result: + + print(f"Converting project {row['name']} (ID: {row['id']})") + + converted_object = __convert_object(row) + insert_sql = sa.text( + """ + INSERT INTO cognition.etl_config_preset( + id, + organization_id, + name, + description, + created_at, + created_by, + etl_config, + add_config + ) + VALUES ( + :id, + :organization_id, + :name, + :description, + :created_at, + :created_by, + :etl_config, + :add_config + ) + """ + ) + connection.execute(insert_sql, converted_object) + + # Step 4: Update the project row with useable_etl_configurations + update_sql = sa.text( + """ + UPDATE cognition.project + SET useable_etl_configurations = :config_list + WHERE id = :project_id + """ + ) + connection.execute( + update_sql, + { + "config_list": json.dumps( + [{"id": converted_object["id"], "is_default": True}] + ), + "project_id": row["id"], + }, + ) + + +def __convert_object(row): + # --- Your Python conversion here --- + # Example placeholder (replace with real conversion) + # converted_object = {"llm_config": row["llm_config"], "tokenizer": row["tokenizer"]} + pdf_extraction = {} + extraction_data = row["llm_config"].get("extraction", {}) + if extraction_data.get("extractor") == "pdf2markdown": + pdf_extraction["extractor"] = "PDF2MD" + elif 
extraction_data.get("extractor").lower() == "azure_di": + pdf_extraction["azureDiApiBase"] = extraction_data.get("azureDiApiBase", "") + pdf_extraction["azureDiEnvVarId"] = extraction_data.get("azureDiEnvVarId", "") + pdf_extraction["extractor"] = "AZURE_DI" + elif ( + extraction_data.get("extractor").lower() == "gpt" + or extraction_data.get("extractor").lower() == "vision" + or extraction_data.get("extractor").lower() == "gpt-4" + ): + pdf_extraction["overwriteVisionPrompt"] = extraction_data.get( + "overwriteVisionPrompt", False + ) + pdf_extraction["llmIdentifier"] = LLMProvider.from_string( + extraction_data.get("llmIdentifier", "") + ).value + pdf_extraction["extractor"] = "VISION" + llm_config = extraction_data.copy() + llm_config.pop("extractor", None) + llm_config.pop("overwriteVisionPrompt", None) + llm_config.pop("llmIdentifier", None) + pdf_extraction["llmConfig"] = llm_config + transformation_data = row["llm_config"].get("transformation", {}) + transformation_config = {} + transformation_config["llmIdentifier"] = LLMProvider.from_string( + transformation_data.get("llmIdentifier", "") + ).value + transformation_config["type"] = "COMMON_ETL" + llm_config = transformation_data.copy() + llm_config.pop("llmIdentifier", None) + transformation_config["llmConfig"] = llm_config + # add pdf & add llm migration + converted_object = { + "extraction": {"default": {"extractor": "LANGCHAIN"}, "pdf": pdf_extraction}, + "tokenizer": row["tokenizer"], + "transformation": transformation_config, + } + final_object = { + "id": str(uuid.uuid4()), + "organization_id": str(row["organization_id"]), + "name": row["name"] + " - migrated etl config", + "description": "ETL configuration migrated from old project settings", + "created_at": row["created_at"].isoformat(), + "created_by": str(row["created_by"]), + "etl_config": json.dumps(converted_object), + "add_config": json.dumps({}), + } + + return final_object + # ----------------------------------- diff --git a/submodules/model 
b/submodules/model index 115b6320..1268efcf 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 115b6320bc1060cf3da26fe78d1510235cb83f52 +Subproject commit 1268efcf137999c99ab1348a314fdf552fe98d47 From b643bd0ebdeb4e2956200e5aa99bcaf58acb7e38 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Wed, 26 Nov 2025 10:37:42 +0100 Subject: [PATCH 45/54] Hotfix default value --- .../64874114490b_remove_cognition_project_fields_for_new_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py b/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py index 5cb9d534..11439542 100644 --- a/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py +++ b/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py @@ -152,7 +152,7 @@ def __conversion_helper(): update_sql, { "config_list": json.dumps( - [{"id": converted_object["id"], "is_default": True}] + [{"id": converted_object["id"], "isDefault": True}] ), "project_id": row["id"], }, From 8e4a0b20bb3cb386df7686f1e24922faac3c8157 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Wed, 26 Nov 2025 17:07:50 +0100 Subject: [PATCH 46/54] tmp doc changes --- controller/transfer/cognition/minio_upload.py | 14 +++++--------- submodules/model | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index e578a65e..e404f041 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -8,7 +8,6 @@ file_reference as file_reference_db_bo, markdown_file as markdown_file_bo, markdown_dataset as markdown_dataset_bo, - project as cognition_project_bo, ) @@ -48,19 +47,16 @@ def handle_cognition_file_upload(path_parts: List[str]): ): project_id = file_reference.meta_data.get("project_id") conversation_id = 
file_reference.meta_data.get("conversation_id") - project_item = cognition_project_bo.get(project_id) - + full_config, tokenizer = etl_utils.get_full_config_and_tokenizer_from_config_id( + file_reference, project_id=project_id, conversation_id=conversation_id + ) etl_task = etl_task_bo.create( org_id, file_reference.created_by, file_reference.original_file_name, file_reference.file_size_bytes, - full_config=etl_utils.get_full_config_for_tmp_doc( - file_reference, - project_item, - conversation_id, - ), - tokenizer=project_item.tokenizer, + full_config=full_config, + tokenizer=tokenizer, priority=1, ) diff --git a/submodules/model b/submodules/model index 1268efcf..0ff52f25 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 1268efcf137999c99ab1348a314fdf552fe98d47 +Subproject commit 0ff52f25e0e9d39200b601330b8e9344c9cce0bd From 8646787cd228e32413600757a7d8c5db8c61b0c5 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 26 Nov 2025 17:12:43 +0100 Subject: [PATCH 47/54] perf: minio upload minor enhancement --- controller/transfer/cognition/minio_upload.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index e404f041..37846add 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -30,12 +30,18 @@ def handle_cognition_file_upload(path_parts: List[str]): ): # file_reference is None or already processed in queue print( - "File reference duplication error, file is already processed", + f"WARNING: {__name__} - file reference duplication error, file is already processed", flush=True, ) if file_reference: - print(f"File reference id: {str(file_reference.id)}", flush=True) - print(f"File name: {file_reference.original_file_name}", flush=True) + print( + f"INFO: {__name__} - file reference id: {str(file_reference.id)}", + flush=True, + ) + print( + f"INFO: 
{__name__} - file name: {file_reference.original_file_name}", + flush=True, + ) return file_reference.state = enums.FileCachingState.COMPLETED.value From 725f6745c14fcfe4f01c55d7ac398c4cba792296 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 26 Nov 2025 17:44:31 +0100 Subject: [PATCH 48/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 0ff52f25..aae207ce 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 0ff52f25e0e9d39200b601330b8e9344c9cce0bd +Subproject commit aae207ce46842f1cc91c6631049b9a5159b3aec4 From 38b2375931a14618fc727a705e275617f08e04e6 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Thu, 27 Nov 2025 11:25:16 +0100 Subject: [PATCH 49/54] submodule change --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index aae207ce..7ab63294 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit aae207ce46842f1cc91c6631049b9a5159b3aec4 +Subproject commit 7ab6329494000afff7e5526595c921a9b4eca8f8 From 3e4f43efdd73cffebc6e26929d829d4cce4af0e3 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Mon, 1 Dec 2025 21:23:34 +0100 Subject: [PATCH 50/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index aae207ce..c47f9601 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit aae207ce46842f1cc91c6631049b9a5159b3aec4 +Subproject commit c47f960101932dec9a9784107f635c70d4facec2 From ab4559a596a62dd3f2dfd13ec9d70eba9b56b90e Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 2 Dec 2025 01:36:36 +0100 Subject: [PATCH 51/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index c47f9601..723a69ae 160000 
--- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit c47f960101932dec9a9784107f635c70d4facec2 +Subproject commit 723a69aef083139174d8232d426d4db381d3c89b From 48b228e0a296134bb66457224229d8fe5431e550 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 2 Dec 2025 01:36:57 +0100 Subject: [PATCH 52/54] perf(alembic): new fields --- ...remove_markdown_dataset_fields_for_new_.py | 179 ++++++++++++++++++ controller/transfer/cognition/minio_upload.py | 33 ++-- 2 files changed, 196 insertions(+), 16 deletions(-) create mode 100644 alembic/versions/04cd434ed6eb_remove_markdown_dataset_fields_for_new_.py diff --git a/alembic/versions/04cd434ed6eb_remove_markdown_dataset_fields_for_new_.py b/alembic/versions/04cd434ed6eb_remove_markdown_dataset_fields_for_new_.py new file mode 100644 index 00000000..0a519b88 --- /dev/null +++ b/alembic/versions/04cd434ed6eb_remove_markdown_dataset_fields_for_new_.py @@ -0,0 +1,179 @@ +"""remove markdown dataset fields for new etl fields + +Revision ID: 04cd434ed6eb +Revises: 64874114490b +Create Date: 2025-12-01 15:43:42.832265 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +import uuid +import json + +from submodules.model.enums import LLMProvider + +# revision identifiers, used by Alembic. +revision = "04cd434ed6eb" +down_revision = "64874114490b" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "etl_task", sa.Column("meta_data", sa.JSON(), nullable=True), schema="global" + ) + op.add_column( + "markdown_dataset", + sa.Column("useable_etl_configurations", sa.JSON(), nullable=True), + schema="cognition", + ) + __conversion_helper() + op.drop_column("markdown_dataset", "tokenizer", schema="cognition") + op.drop_column("markdown_dataset", "llm_config", schema="cognition") + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column("etl_task", "meta_data", schema="global") + op.drop_column("markdown_dataset", "useable_etl_configurations", schema="cognition") + op.add_column( + "markdown_dataset", + sa.Column("tokenizer", sa.VARCHAR(), autoincrement=False, nullable=True), + schema="cognition", + ) + op.add_column( + "markdown_dataset", + sa.Column( + "llm_config", + postgresql.JSON(astext_type=sa.Text()), + autoincrement=False, + nullable=True, + ), + schema="cognition", + ) + ### end Alembic commands ### + + +def __conversion_helper(): + + connection = op.get_bind() + select_sql = """ + SELECT id, organization_id, llm_config, tokenizer, name, created_by, created_at + FROM cognition.markdown_dataset + WHERE (llm_config IS NOT NULL OR tokenizer IS NOT NULL); + """ + result = connection.execute(select_sql) + + # Step 2: Process each row + for row in result: + + print(f"Converting dataset {row['name']} (ID: {row['id']})") + + converted_object = __convert_object(row) + insert_sql = sa.text( + """ + INSERT INTO cognition.etl_config_preset( + id, + organization_id, + name, + description, + created_at, + created_by, + etl_config, + add_config + ) + VALUES ( + :id, + :organization_id, + :name, + :description, + :created_at, + :created_by, + :etl_config, + :add_config + ) + """ + ) + connection.execute(insert_sql, converted_object) + + # Step 4: Update the project row with useable_etl_configurations + update_sql = sa.text( + """ + UPDATE cognition.markdown_dataset + SET useable_etl_configurations = :config_list + WHERE id = :dataset_id + """ + ) + connection.execute( + update_sql, + { + "config_list": json.dumps( + [{"id": converted_object["id"], "isDefault": True}] + ), + "dataset_id": row["id"], + }, + ) + + +def __convert_object(row): + # --- Your Python conversion here --- + # Example placeholder (replace with real conversion) + # converted_object = {"llm_config": row["llm_config"], "tokenizer": row["tokenizer"]} + pdf_extraction = {} + extraction_data = 
row["llm_config"].get("extraction", {}) + if extraction_data.get("extractor") == "pdf2markdown": + pdf_extraction["extractor"] = "PDF2MD" + elif extraction_data.get("extractor").lower() == "azure_di": + pdf_extraction["azureDiApiBase"] = extraction_data.get("azureDiApiBase", "") + pdf_extraction["azureDiEnvVarId"] = extraction_data.get("azureDiEnvVarId", "") + pdf_extraction["extractor"] = "AZURE_DI" + elif ( + extraction_data.get("extractor").lower() == "gpt" + or extraction_data.get("extractor").lower() == "vision" + or extraction_data.get("extractor").lower() == "gpt-4" + ): + pdf_extraction["overwriteVisionPrompt"] = extraction_data.get( + "overwriteVisionPrompt", False + ) + pdf_extraction["llmIdentifier"] = LLMProvider.from_string( + extraction_data.get("llmIdentifier", "") + ).value + pdf_extraction["extractor"] = "VISION" + llm_config = extraction_data.copy() + llm_config.pop("extractor", None) + llm_config.pop("overwriteVisionPrompt", None) + llm_config.pop("llmIdentifier", None) + pdf_extraction["llmConfig"] = llm_config + transformation_data = row["llm_config"].get("transformation", {}) + transformation_config = {} + transformation_config["llmIdentifier"] = LLMProvider.from_string( + transformation_data.get("llmIdentifier", "") + ).value + transformation_config["type"] = "COMMON_ETL" + llm_config = transformation_data.copy() + llm_config.pop("llmIdentifier", None) + transformation_config["llmConfig"] = llm_config + # add pdf & add llm migration + converted_object = { + "extraction": {"default": {"extractor": "LANGCHAIN"}, "pdf": pdf_extraction}, + "tokenizer": row["tokenizer"], + "transformation": transformation_config, + } + final_object = { + "id": str(uuid.uuid4()), + "organization_id": str(row["organization_id"]), + "name": row["name"] + " - migrated etl config", + "description": "ETL configuration migrated from old project settings", + "created_at": row["created_at"].isoformat(), + "created_by": str(row["created_by"]), + "etl_config": 
json.dumps(converted_object), + "add_config": json.dumps({}), + } + + return final_object + # ----------------------------------- diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 37846add..c2b17a6e 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -63,6 +63,13 @@ def handle_cognition_file_upload(path_parts: List[str]): file_reference.file_size_bytes, full_config=full_config, tokenizer=tokenizer, + meta_data={ + "file_reference_id": str(file_reference.id), + "tmp_doc_metadata": { + "project_id": project_id, + "conversation_id": conversation_id, + }, + }, priority=1, ) @@ -72,11 +79,7 @@ def handle_cognition_file_upload(path_parts: List[str]): enums.TaskType.EXECUTE_ETL, { "etl_task_id": str(etl_task.id), - "file_reference_id": str(file_reference.id), - "tmp_doc_metadata": { - "project_id": project_id, - "conversation_id": conversation_id, - }, + **etl_task.meta_data, }, priority=True, ) @@ -84,25 +87,23 @@ def handle_cognition_file_upload(path_parts: List[str]): else: priority = -1 - markdown_dataset = markdown_dataset_bo.get( - org_id, file_reference.meta_data.get("dataset_id") - ) - markdown_file = markdown_file_bo.get( org_id, file_reference.meta_data.get("markdown_file_id") ) + full_config, tokenizer = etl_utils.get_full_config_and_tokenizer_from_config_id( + file_reference + ) etl_task = etl_task_bo.create( org_id, file_reference.created_by, file_reference.original_file_name, file_reference.file_size_bytes, - full_config=etl_utils.get_full_config_for_markdown_file( - file_reference, - markdown_dataset, - markdown_file, - ), - tokenizer=markdown_dataset.tokenizer, + full_config=full_config, + tokenizer=tokenizer, + meta_data={ + "file_reference_id": str(file_reference.id), + }, priority=priority, ) @@ -118,7 +119,7 @@ def handle_cognition_file_upload(path_parts: List[str]): enums.TaskType.EXECUTE_ETL, { "etl_task_id": str(etl_task.id), 
- "file_reference_id": str(file_reference.id), + **etl_task.meta_data, }, priority=priority != -1, ) From 3bb060a1ca16e386be958ecc4f60270e43eea8b2 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 2 Dec 2025 01:44:02 +0100 Subject: [PATCH 53/54] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 723a69ae..e00df1d8 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 723a69aef083139174d8232d426d4db381d3c89b +Subproject commit e00df1d8b005b950b0188859e73ce237e2868a12 From 61766b7ae864436b5bc2d5c76f6998b4f7bff90c Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 2 Dec 2025 01:56:20 +0100 Subject: [PATCH 54/54] fix: alembic merge conflict --- .../versions/0ea9ab7cb60f_add_inbox_mail.py | 190 ++++++++++++------ .../f428a22ecdb3_adds_etl_task_table.py | 4 +- 2 files changed, 135 insertions(+), 59 deletions(-) diff --git a/alembic/versions/0ea9ab7cb60f_add_inbox_mail.py b/alembic/versions/0ea9ab7cb60f_add_inbox_mail.py index f93b999e..7135f891 100644 --- a/alembic/versions/0ea9ab7cb60f_add_inbox_mail.py +++ b/alembic/versions/0ea9ab7cb60f_add_inbox_mail.py @@ -5,77 +5,153 @@ Create Date: 2025-11-21 13:43:44.056253 """ + from alembic import op import sqlalchemy as sa from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = '0ea9ab7cb60f' -down_revision = '85bb3ebee137' +revision = "0ea9ab7cb60f" +down_revision = "85bb3ebee137" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('inbox_mail_thread', - sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False), - sa.Column('created_by', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('organization_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('created_at', sa.DateTime(), nullable=True), - sa.Column('subject', sa.String(), nullable=True), - sa.Column('meta_data', sa.JSON(), nullable=True), - sa.Column('is_important', sa.Boolean(), nullable=True), - sa.Column('progress_state', sa.String(), nullable=True), - sa.Column('support_owner_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('is_admin_support_thread', sa.Boolean(), nullable=True), - sa.ForeignKeyConstraint(['created_by'], ['user.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['organization_id'], ['organization.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['support_owner_id'], ['user.id'], ondelete='SET NULL'), - sa.PrimaryKeyConstraint('id'), - schema='global' - ) - op.create_index(op.f('ix_global_inbox_mail_thread_created_by'), 'inbox_mail_thread', ['created_by'], unique=False, schema='global') - op.create_index(op.f('ix_global_inbox_mail_thread_organization_id'), 'inbox_mail_thread', ['organization_id'], unique=False, schema='global') - op.create_index(op.f('ix_global_inbox_mail_thread_support_owner_id'), 'inbox_mail_thread', ['support_owner_id'], unique=False, schema='global') - op.create_table('inbox_mail', - sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False), - sa.Column('created_at', sa.DateTime(), nullable=True), - sa.Column('sender_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('thread_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('content', sa.String(), nullable=True), - sa.ForeignKeyConstraint(['sender_id'], ['user.id'], ondelete='SET NULL'), - sa.ForeignKeyConstraint(['thread_id'], ['global.inbox_mail_thread.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id'), - schema='global' - ) - 
op.create_index(op.f('ix_global_inbox_mail_sender_id'), 'inbox_mail', ['sender_id'], unique=False, schema='global') - op.create_index(op.f('ix_global_inbox_mail_thread_id'), 'inbox_mail', ['thread_id'], unique=False, schema='global') - op.create_table('inbox_mail_thread_association', - sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False), - sa.Column('thread_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('unread_mail_count', sa.Integer(), nullable=True), - sa.ForeignKeyConstraint(['thread_id'], ['global.inbox_mail_thread.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['user_id'], ['user.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id'), - schema='global' - ) - op.create_index(op.f('ix_global_inbox_mail_thread_association_thread_id'), 'inbox_mail_thread_association', ['thread_id'], unique=False, schema='global') - op.create_index(op.f('ix_global_inbox_mail_thread_association_user_id'), 'inbox_mail_thread_association', ['user_id'], unique=False, schema='global') + op.create_table( + "inbox_mail_thread", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("created_at", sa.DateTime(), nullable=True), + sa.Column("subject", sa.String(), nullable=True), + sa.Column("meta_data", sa.JSON(), nullable=True), + sa.Column("is_important", sa.Boolean(), nullable=True), + sa.Column("progress_state", sa.String(), nullable=True), + sa.Column("support_owner_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("is_admin_support_thread", sa.Boolean(), nullable=True), + sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint(["support_owner_id"], 
["user.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_created_by"), + "inbox_mail_thread", + ["created_by"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_organization_id"), + "inbox_mail_thread", + ["organization_id"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_support_owner_id"), + "inbox_mail_thread", + ["support_owner_id"], + unique=False, + schema="global", + ) + op.create_table( + "inbox_mail", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("created_at", sa.DateTime(), nullable=True), + sa.Column("sender_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("thread_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("content", sa.String(), nullable=True), + sa.ForeignKeyConstraint(["sender_id"], ["user.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint( + ["thread_id"], ["global.inbox_mail_thread.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id"), + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_sender_id"), + "inbox_mail", + ["sender_id"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_id"), + "inbox_mail", + ["thread_id"], + unique=False, + schema="global", + ) + op.create_table( + "inbox_mail_thread_association", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("thread_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("unread_mail_count", sa.Integer(), nullable=True), + sa.ForeignKeyConstraint( + ["thread_id"], ["global.inbox_mail_thread.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + schema="global", + ) + op.create_index( + 
op.f("ix_global_inbox_mail_thread_association_thread_id"), + "inbox_mail_thread_association", + ["thread_id"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_association_user_id"), + "inbox_mail_thread_association", + ["user_id"], + unique=False, + schema="global", + ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.drop_index(op.f('ix_global_inbox_mail_thread_association_user_id'), table_name='inbox_mail_thread_association', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_association_thread_id'), table_name='inbox_mail_thread_association', schema='global') - op.drop_table('inbox_mail_thread_association', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_id'), table_name='inbox_mail', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_sender_id'), table_name='inbox_mail', schema='global') - op.drop_table('inbox_mail', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_support_owner_id'), table_name='inbox_mail_thread', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_organization_id'), table_name='inbox_mail_thread', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_created_by'), table_name='inbox_mail_thread', schema='global') - op.drop_table('inbox_mail_thread', schema='global') + op.drop_index( + op.f("ix_global_inbox_mail_thread_association_user_id"), + table_name="inbox_mail_thread_association", + schema="global", + ) + op.drop_index( + op.f("ix_global_inbox_mail_thread_association_thread_id"), + table_name="inbox_mail_thread_association", + schema="global", + ) + op.drop_table("inbox_mail_thread_association", schema="global") + op.drop_index( + op.f("ix_global_inbox_mail_thread_id"), table_name="inbox_mail", schema="global" + ) + op.drop_index( + op.f("ix_global_inbox_mail_sender_id"), table_name="inbox_mail", schema="global" + ) + 
op.drop_table("inbox_mail", schema="global") + op.drop_index( + op.f("ix_global_inbox_mail_thread_support_owner_id"), + table_name="inbox_mail_thread", + schema="global", + ) + op.drop_index( + op.f("ix_global_inbox_mail_thread_organization_id"), + table_name="inbox_mail_thread", + schema="global", + ) + op.drop_index( + op.f("ix_global_inbox_mail_thread_created_by"), + table_name="inbox_mail_thread", + schema="global", + ) + op.drop_table("inbox_mail_thread", schema="global") # ### end Alembic commands ### diff --git a/alembic/versions/f428a22ecdb3_adds_etl_task_table.py b/alembic/versions/f428a22ecdb3_adds_etl_task_table.py index 7bc1b75b..d87fb129 100644 --- a/alembic/versions/f428a22ecdb3_adds_etl_task_table.py +++ b/alembic/versions/f428a22ecdb3_adds_etl_task_table.py @@ -1,7 +1,7 @@ """adds etl task table Revision ID: f428a22ecdb3 -Revises: 85bb3ebee137 +Revises: 0ea9ab7cb60f Create Date: 2025-10-30 10:45:20.843280 """ @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision = "f428a22ecdb3" -down_revision = "85bb3ebee137" +down_revision = "0ea9ab7cb60f" branch_labels = None depends_on = None