diff --git a/business_objects/data_block.py b/business_objects/data_block.py index 011af9c1..fadab6e7 100644 --- a/business_objects/data_block.py +++ b/business_objects/data_block.py @@ -1,5 +1,6 @@ from typing import Dict, List, Optional, Any +from sqlalchemy import and_, func from sqlalchemy.types import Text from sqlalchemy.orm.attributes import flag_modified @@ -26,15 +27,22 @@ def get_by_id(data_block_id: str) -> DataBlock: return session.query(DataBlock).filter(DataBlock.id == data_block_id).first() -def get_all_by_project_id(org_id: str, project_id: str) -> List[DataBlock]: - return ( - session.query(DataBlock) - .filter( - DataBlock.organization_id == org_id, - DataBlock.project_id == project_id, - ) - .all() +def get_all_by_project_id( + org_id: str, project_id: str, only_executed: bool = False +) -> List[DataBlock]: + stmt = session.query(DataBlock).filter( + DataBlock.organization_id == org_id, + DataBlock.project_id == project_id, ) + if only_executed: + query_text = DataBlock.sql_config.op("->>")("query") + stmt = stmt.filter( + and_( + query_text.isnot(None), + func.trim(query_text) != "", + ) + ) + return stmt.all() def get_by_project_id_and_type( diff --git a/cognition_objects/markdown_file.py b/cognition_objects/markdown_file.py index 3e86d932..e14b0635 100644 --- a/cognition_objects/markdown_file.py +++ b/cognition_objects/markdown_file.py @@ -252,14 +252,17 @@ def update( def delete(org_id: str, md_file_id: str, with_commit: bool = True) -> None: - md_file = session.query(CognitionMarkdownFile).filter( + md_file_query = session.query(CognitionMarkdownFile).filter( CognitionMarkdownFile.organization_id == org_id, CognitionMarkdownFile.id == md_file_id, ) - session.query(EtlTask).filter( - EtlTask.organization_id == org_id, EtlTask.id == md_file.etl_task_id - ).delete() - md_file.delete() + md_file = md_file_query.first() + if md_file: + session.query(EtlTask).filter( + EtlTask.organization_id == org_id, + EtlTask.id == md_file.etl_task_id, + ).delete() + md_file_query.delete() general.flush_or_commit(with_commit) diff --git a/enums.py b/enums.py index e00f8fe6..649c1bec 100644 --- a/enums.py +++ b/enums.py @@ -871,7 +871,6 @@ class MacroExecutionLinkAction(Enum): UPDATE = "UPDATE" - class FileCachingInitiator(Enum): TMP_DOC_RETRIEVAL = "TMP_DOC_RETRIEVAL" DATASET_MARKDOWN_FILE = "DATASET_MARKDOWN_FILE" @@ -1327,7 +1326,8 @@ def from_transformers( return cls.NO_TRANSFORMATION if ( len(transformers) == 1 - and transformers[0]["type"] == ETLTransformer.SUMMARIZE.value + and transformers[0].get("name", transformers[0].get("type")) + == ETLTransformer.SUMMARIZE.value ): return cls.SUMMARIZE return cls.COMMON_ETL diff --git a/etl_utils.py b/etl_utils.py index a5757a60..08fd2c4d 100644 --- a/etl_utils.py +++ b/etl_utils.py @@ -245,15 +245,16 @@ def get_full_config_for_sharepoint_integration( "task_type": enums.CognitionMarkdownFileState.SPLITTING.value, "task_config": { "use_cache": False, - "strategy": enums.ETLSplitStrategy.CHUNK.value, - "chunk_size": 1000, - "rows_per_section": rows_per_section, - # "keep_first_n": integration.config.get("split_kwargs", {}).get( - # "keep_first_n", 5 - # ), - # "keep_last_n": integration.config.get("split_kwargs", {}).get( - # "keep_last_n", 1 - # ), + "strategy": enums.ETLSplitStrategy.SHRINK.value, + "chunk_size": integration.config.get("split_kwargs", {}).get( + "chunk_size", 16384 + ), + "keep_first_n": integration.config.get("split_kwargs", {}).get( + "keep_first_n", 5 + ), + "keep_last_n": integration.config.get("split_kwargs", {}).get( + "keep_last_n", 1 + ), }, }, { @@ -274,6 +275,25 @@ def get_full_config_for_sharepoint_integration( "system_prompt": None, "user_prompt": None, }, + { + "enabled": True, + "name": enums.ETLTransformer.SUMMARIZE.value, + "system_prompt": None, + "user_prompt": " ".join( + ( + "You are a helpful AI assistant that summarizes documents.", + "Your task is to provide a concise summary of the provided text.", + "You will be given a context, and you should summarize it in a clear and concise manner.", + "The summary should capture the main points and key information from the context.", + ( + f"You are summarizing the list of file paths in folder `{record.parent_path}`." + if record.extension == "FOLDER" + else f"You are summarizing the file `{record.name}` in folder `{record.parent_path}`." + ), + f"IT IS CRUCIAL THAT YOU ONLY ANSWER IN ISO-639-1:{integration.tokenizer[:2]}", + ) + ), + }, ], }, }, @@ -291,16 +311,6 @@ def get_full_config_for_sharepoint_integration( }, }, }, - # { - # "task_type": enums.CognitionMarkdownFileState.NOTIFYING.value, - # "task_config": { - # "integration": [ - # { - # "integration_id": str(integration.id), - # } - # ] - # }, - # }, ] return full_config