code-kern-ai · lumburovskalina · Mar 25, 2026 · Mar 23, 2026 · Mar 25, 2026 · Mar 25, 2026
diff --git a/business_objects/data_block.py b/business_objects/data_block.py
@@ -1,5 +1,6 @@
 from typing import Dict, List, Optional, Any
 
+from sqlalchemy import and_, func
 from sqlalchemy.types import Text
 from sqlalchemy.orm.attributes import flag_modified
 
@@ -26,15 +27,22 @@ def get_by_id(data_block_id: str) -> DataBlock:
     return session.query(DataBlock).filter(DataBlock.id == data_block_id).first()
 
 
-def get_all_by_project_id(org_id: str, project_id: str) -> List[DataBlock]:
-    return (
-        session.query(DataBlock)
-        .filter(
-            DataBlock.organization_id == org_id,
-            DataBlock.project_id == project_id,
-        )
-        .all()
+def get_all_by_project_id(
+    org_id: str, project_id: str, only_executed: bool = False
+) -> List[DataBlock]:
+    """Return the data blocks of a project within an organization.
+
+    If only_executed, keep just the blocks whose sql_config "query" is non-blank.
+    """
+    blocks = session.query(DataBlock).filter(
+        DataBlock.organization_id == org_id,
+        DataBlock.project_id == project_id,
     )
+    if not only_executed:
+        return blocks.all()
+    sql_text = DataBlock.sql_config.op("->>")("query")
+    has_query = and_(sql_text.isnot(None), func.trim(sql_text) != "")
+    return blocks.filter(has_query).all()
 
 
 def get_by_project_id_and_type(

diff --git a/cognition_objects/markdown_file.py b/cognition_objects/markdown_file.py
@@ -252,14 +252,17 @@ def update(
 
 
 def delete(org_id: str, md_file_id: str, with_commit: bool = True) -> None:
-    md_file = session.query(CognitionMarkdownFile).filter(
+    """Delete a markdown file and its linked ETL task, if the file exists."""
+    matches = session.query(CognitionMarkdownFile).filter(
         CognitionMarkdownFile.organization_id == org_id,
         CognitionMarkdownFile.id == md_file_id,
     )
-    session.query(EtlTask).filter(
-        EtlTask.organization_id == org_id, EtlTask.id == md_file.etl_task_id
-    ).delete()
-    md_file.delete()
+    md_file = matches.first()
+    if md_file:
+        session.query(EtlTask).filter(
+            EtlTask.organization_id == org_id, EtlTask.id == md_file.etl_task_id
+        ).delete()
+        matches.delete()
     general.flush_or_commit(with_commit)
 
 

diff --git a/enums.py b/enums.py
@@ -871,7 +871,6 @@ class MacroExecutionLinkAction(Enum):
     UPDATE = "UPDATE"
 
 
-
 class FileCachingInitiator(Enum):
     TMP_DOC_RETRIEVAL = "TMP_DOC_RETRIEVAL"
     DATASET_MARKDOWN_FILE = "DATASET_MARKDOWN_FILE"
@@ -1327,7 +1326,8 @@ def from_transformers(
             return cls.NO_TRANSFORMATION
         if (
             len(transformers) == 1
-            and transformers[0]["type"] == ETLTransformer.SUMMARIZE.value
+            and transformers[0].get("name", transformers[0].get("type"))
+            == ETLTransformer.SUMMARIZE.value
         ):
             return cls.SUMMARIZE
         return cls.COMMON_ETL

diff --git a/etl_utils.py b/etl_utils.py
@@ -245,15 +245,16 @@ def get_full_config_for_sharepoint_integration(
             "task_type": enums.CognitionMarkdownFileState.SPLITTING.value,
             "task_config": {
                 "use_cache": False,
-                "strategy": enums.ETLSplitStrategy.CHUNK.value,
-                "chunk_size": 1000,
-                "rows_per_section": rows_per_section,
-                # "keep_first_n": integration.config.get("split_kwargs", {}).get(
-                #     "keep_first_n", 5
-                # ),
-                # "keep_last_n": integration.config.get("split_kwargs", {}).get(
-                #     "keep_last_n", 1
-                # ),
+                "strategy": enums.ETLSplitStrategy.SHRINK.value,
+                "chunk_size": integration.config.get("split_kwargs", {}).get(
+                    "chunk_size", 16384
+                ),
+                "keep_first_n": integration.config.get("split_kwargs", {}).get(
+                    "keep_first_n", 5
+                ),
+                "keep_last_n": integration.config.get("split_kwargs", {}).get(
+                    "keep_last_n", 1
+                ),
             },
         },
         {
@@ -274,6 +275,25 @@ def get_full_config_for_sharepoint_integration(
                         "system_prompt": None,
                         "user_prompt": None,
                     },
+                    {
+                        "enabled": True,
+                        "name": enums.ETLTransformer.SUMMARIZE.value,
+                        "system_prompt": None,
+                        "user_prompt": " ".join(
+                            (
+                                "You are a helpful AI assistant that summarizes documents.",
+                                "Your task is to provide a concise summary of the provided text.",
+                                "You will be given a context, and you should summarize it in a clear and concise manner.",
+                                "The summary should capture the main points and key information from the context.",
+                                (
+                                    f"You are summarizing the list of file paths in folder `{record.parent_path}`."
+                                    if record.extension == "FOLDER"
+                                    else f"You are summarizing the file `{record.name}` in folder `{record.parent_path}`."
+                                ),
+                                f"IT IS CRUCIAL THAT YOU ONLY ANSWER IN ISO-639-1:{integration.tokenizer[:2]}",
+                            )
+                        ),
+                    },
                 ],
             },
         },
@@ -291,16 +311,6 @@ def get_full_config_for_sharepoint_integration(
                 },
             },
         },
-        # {
-        #     "task_type": enums.CognitionMarkdownFileState.NOTIFYING.value,
-        #     "task_config": {
-        #         "integration": [
-        #             {
-        #                 "integration_id": str(integration.id),
-        #             }
-        #         ]
-        #     },
-        # },
     ]
 
     return full_config