diff --git a/alembic/versions/04cd434ed6eb_remove_markdown_dataset_fields_for_new_.py b/alembic/versions/04cd434ed6eb_remove_markdown_dataset_fields_for_new_.py
new file mode 100644
index 00000000..0a519b88
--- /dev/null
+++ b/alembic/versions/04cd434ed6eb_remove_markdown_dataset_fields_for_new_.py
@@ -0,0 +1,179 @@
+"""remove markdown dataset fields for new etl fields
+
+Revision ID: 04cd434ed6eb
+Revises: 64874114490b
+Create Date: 2025-12-01 15:43:42.832265
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+import uuid
+import json
+
+from submodules.model.enums import LLMProvider
+
+# revision identifiers, used by Alembic.
+revision = "04cd434ed6eb"
+down_revision = "64874114490b"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.add_column(
+        "etl_task", sa.Column("meta_data", sa.JSON(), nullable=True), schema="global"
+    )
+    op.add_column(
+        "markdown_dataset",
+        sa.Column("useable_etl_configurations", sa.JSON(), nullable=True),
+        schema="cognition",
+    )
+    __conversion_helper()
+    op.drop_column("markdown_dataset", "tokenizer", schema="cognition")
+    op.drop_column("markdown_dataset", "llm_config", schema="cognition")
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column("etl_task", "meta_data", schema="global")
+    op.drop_column("markdown_dataset", "useable_etl_configurations", schema="cognition")
+    op.add_column(
+        "markdown_dataset",
+        sa.Column("tokenizer", sa.VARCHAR(), autoincrement=False, nullable=True),
+        schema="cognition",
+    )
+    op.add_column(
+        "markdown_dataset",
+        sa.Column(
+            "llm_config",
+            postgresql.JSON(astext_type=sa.Text()),
+            autoincrement=False,
+            nullable=True,
+        ),
+        schema="cognition",
+    )
+    # ### end Alembic commands ###
+
+
+def __conversion_helper():
+    connection = op.get_bind()
+    select_sql = sa.text(
+        """
+        SELECT id, organization_id, llm_config, tokenizer, name, created_by, created_at
+        FROM cognition.markdown_dataset
+        WHERE (llm_config IS NOT NULL OR tokenizer IS NOT NULL);
+        """
+    )
+    result = connection.execute(select_sql)
+
+    # Convert every legacy dataset row into an etl_config_preset entry
+    for row in result:
+        print(f"Converting dataset {row['name']} (ID: {row['id']})")
+
+        converted_object = __convert_object(row)
+        insert_sql = sa.text(
+            """
+            INSERT INTO cognition.etl_config_preset(
+                id,
+                organization_id,
+                name,
+                description,
+                created_at,
+                created_by,
+                etl_config,
+                add_config
+            )
+            VALUES (
+                :id,
+                :organization_id,
+                :name,
+                :description,
+                :created_at,
+                :created_by,
+                :etl_config,
+                :add_config
+            )
+            """
+        )
+        connection.execute(insert_sql, converted_object)
+
+        # Point the dataset row at its newly created default configuration
+        update_sql = sa.text(
+            """
+            UPDATE cognition.markdown_dataset
+            SET useable_etl_configurations = :config_list
+            WHERE id = :dataset_id
+            """
+        )
+        connection.execute(
+            update_sql,
+            {
+                "config_list": json.dumps(
+                    [{"id": converted_object["id"], "isDefault": True}]
+                ),
+                "dataset_id": row["id"],
+            },
+        )
+
+
+def __convert_object(row):
+    # Map the legacy llm_config / tokenizer columns onto the new etl_config layout;
+    # guard against rows where only the tokenizer is set (llm_config may be NULL).
+    legacy_config = row["llm_config"] or {}
+    pdf_extraction = {}
+    extraction_data = legacy_config.get("extraction", {})
+    extractor = (extraction_data.get("extractor") or "").lower()
+    if extractor == "pdf2markdown":
+        pdf_extraction["extractor"] = "PDF2MD"
+    elif extractor == "azure_di":
+        pdf_extraction["azureDiApiBase"] = extraction_data.get("azureDiApiBase", "")
+        pdf_extraction["azureDiEnvVarId"] = extraction_data.get("azureDiEnvVarId", "")
+        pdf_extraction["extractor"] = "AZURE_DI"
+    elif extractor in ("gpt", "vision", "gpt-4"):
+        pdf_extraction["overwriteVisionPrompt"] = extraction_data.get(
+            "overwriteVisionPrompt", False
+        )
+        pdf_extraction["llmIdentifier"] = LLMProvider.from_string(
+            extraction_data.get("llmIdentifier", "")
+        ).value
+        pdf_extraction["extractor"] = "VISION"
+        llm_config = extraction_data.copy()
+        llm_config.pop("extractor", None)
+        llm_config.pop("overwriteVisionPrompt", None)
+        llm_config.pop("llmIdentifier", None)
+        pdf_extraction["llmConfig"] = llm_config
+    transformation_data = legacy_config.get("transformation", {})
+    transformation_config = {}
+    transformation_config["llmIdentifier"] = LLMProvider.from_string(
+        transformation_data.get("llmIdentifier", "")
+    ).value
+    transformation_config["type"] = "COMMON_ETL"
+    llm_config = transformation_data.copy()
+    llm_config.pop("llmIdentifier", None)
+    transformation_config["llmConfig"] = llm_config
+    # assemble pdf extraction & llm transformation parts of the migrated config
+    converted_object = {
+        "extraction": {"default": {"extractor": "LANGCHAIN"}, "pdf": pdf_extraction},
+        "tokenizer": row["tokenizer"],
+        "transformation": transformation_config,
+    }
+    final_object = {
+        "id": str(uuid.uuid4()),
+        "organization_id": str(row["organization_id"]),
+        "name": row["name"] + " - migrated etl config",
+        "description": "ETL configuration migrated from old dataset settings",
+        "created_at": row["created_at"].isoformat(),
+        "created_by": str(row["created_by"]),
+        "etl_config": json.dumps(converted_object),
+        "add_config": json.dumps({}),
+    }
+
+    return final_object
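For reference, this is roughly what `__convert_object` above yields for a legacy row — a minimal sketch with invented values; the exact `LLMProvider.from_string(...).value` strings depend on the enum in `submodules/model` and are assumed here:

```python
# Hypothetical legacy llm_config (all values invented for illustration):
legacy_llm_config = {
    "extraction": {"extractor": "azure_di", "azureDiApiBase": "https://example.api.base"},
    "transformation": {"llmIdentifier": "open_ai", "model": "gpt-4o"},
}

# The helper folds this into the new etl_config layout, roughly:
expected_etl_config = {
    "extraction": {
        "default": {"extractor": "LANGCHAIN"},
        "pdf": {
            "azureDiApiBase": "https://example.api.base",
            "azureDiEnvVarId": "",
            "extractor": "AZURE_DI",
        },
    },
    "tokenizer": "en_core_web_sm",  # carried over from the row's tokenizer column
    "transformation": {
        "llmIdentifier": "OPEN_AI",  # assumed enum value, see LLMProvider
        "type": "COMMON_ETL",
        "llmConfig": {"model": "gpt-4o"},
    },
}
```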
diff --git a/alembic/versions/0ea9ab7cb60f_add_inbox_mail.py b/alembic/versions/0ea9ab7cb60f_add_inbox_mail.py
index f93b999e..7135f891 100644
--- a/alembic/versions/0ea9ab7cb60f_add_inbox_mail.py
+++ b/alembic/versions/0ea9ab7cb60f_add_inbox_mail.py
@@ -5,77 +5,153 @@
 Create Date: 2025-11-21 13:43:44.056253
 
 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
 
 # revision identifiers, used by Alembic.
-revision = '0ea9ab7cb60f'
-down_revision = '85bb3ebee137'
+revision = "0ea9ab7cb60f"
+down_revision = "85bb3ebee137"
 branch_labels = None
 depends_on = None
 
 
 def upgrade():
     # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('inbox_mail_thread', - sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False), - sa.Column('created_by', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('organization_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('created_at', sa.DateTime(), nullable=True), - sa.Column('subject', sa.String(), nullable=True), - sa.Column('meta_data', sa.JSON(), nullable=True), - sa.Column('is_important', sa.Boolean(), nullable=True), - sa.Column('progress_state', sa.String(), nullable=True), - sa.Column('support_owner_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('is_admin_support_thread', sa.Boolean(), nullable=True), - sa.ForeignKeyConstraint(['created_by'], ['user.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['organization_id'], ['organization.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['support_owner_id'], ['user.id'], ondelete='SET NULL'), - sa.PrimaryKeyConstraint('id'), - schema='global' - ) - op.create_index(op.f('ix_global_inbox_mail_thread_created_by'), 'inbox_mail_thread', ['created_by'], unique=False, schema='global') - op.create_index(op.f('ix_global_inbox_mail_thread_organization_id'), 'inbox_mail_thread', ['organization_id'], unique=False, schema='global') - op.create_index(op.f('ix_global_inbox_mail_thread_support_owner_id'), 'inbox_mail_thread', ['support_owner_id'], unique=False, schema='global') - op.create_table('inbox_mail', - sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False), - sa.Column('created_at', sa.DateTime(), nullable=True), - sa.Column('sender_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('thread_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('content', sa.String(), nullable=True), - sa.ForeignKeyConstraint(['sender_id'], ['user.id'], ondelete='SET NULL'), - sa.ForeignKeyConstraint(['thread_id'], ['global.inbox_mail_thread.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id'), - schema='global' - ) - op.create_index(op.f('ix_global_inbox_mail_sender_id'), 'inbox_mail', ['sender_id'], unique=False, schema='global') - op.create_index(op.f('ix_global_inbox_mail_thread_id'), 'inbox_mail', ['thread_id'], unique=False, schema='global') - op.create_table('inbox_mail_thread_association', - sa.Column('id', postgresql.UUID(as_uuid=True), nullable=False), - sa.Column('thread_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('user_id', postgresql.UUID(as_uuid=True), nullable=True), - sa.Column('unread_mail_count', sa.Integer(), nullable=True), - sa.ForeignKeyConstraint(['thread_id'], ['global.inbox_mail_thread.id'], ondelete='CASCADE'), - sa.ForeignKeyConstraint(['user_id'], ['user.id'], ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id'), - schema='global' - ) - op.create_index(op.f('ix_global_inbox_mail_thread_association_thread_id'), 'inbox_mail_thread_association', ['thread_id'], unique=False, schema='global') - op.create_index(op.f('ix_global_inbox_mail_thread_association_user_id'), 'inbox_mail_thread_association', ['user_id'], unique=False, schema='global') + op.create_table( + "inbox_mail_thread", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("created_at", sa.DateTime(), nullable=True), + sa.Column("subject", sa.String(), nullable=True), + sa.Column("meta_data", sa.JSON(), nullable=True), + sa.Column("is_important", sa.Boolean(), 
nullable=True), + sa.Column("progress_state", sa.String(), nullable=True), + sa.Column("support_owner_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("is_admin_support_thread", sa.Boolean(), nullable=True), + sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint(["support_owner_id"], ["user.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_created_by"), + "inbox_mail_thread", + ["created_by"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_organization_id"), + "inbox_mail_thread", + ["organization_id"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_support_owner_id"), + "inbox_mail_thread", + ["support_owner_id"], + unique=False, + schema="global", + ) + op.create_table( + "inbox_mail", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("created_at", sa.DateTime(), nullable=True), + sa.Column("sender_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("thread_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("content", sa.String(), nullable=True), + sa.ForeignKeyConstraint(["sender_id"], ["user.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint( + ["thread_id"], ["global.inbox_mail_thread.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id"), + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_sender_id"), + "inbox_mail", + ["sender_id"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_id"), + "inbox_mail", + ["thread_id"], + unique=False, + schema="global", + ) + op.create_table( + "inbox_mail_thread_association", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("thread_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("unread_mail_count", sa.Integer(), nullable=True), + sa.ForeignKeyConstraint( + ["thread_id"], ["global.inbox_mail_thread.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_association_thread_id"), + "inbox_mail_thread_association", + ["thread_id"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_inbox_mail_thread_association_user_id"), + "inbox_mail_thread_association", + ["user_id"], + unique=False, + schema="global", + ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### - op.drop_index(op.f('ix_global_inbox_mail_thread_association_user_id'), table_name='inbox_mail_thread_association', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_association_thread_id'), table_name='inbox_mail_thread_association', schema='global') - op.drop_table('inbox_mail_thread_association', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_id'), table_name='inbox_mail', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_sender_id'), table_name='inbox_mail', schema='global') - op.drop_table('inbox_mail', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_support_owner_id'), table_name='inbox_mail_thread', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_organization_id'), table_name='inbox_mail_thread', schema='global') - op.drop_index(op.f('ix_global_inbox_mail_thread_created_by'), table_name='inbox_mail_thread', schema='global') - op.drop_table('inbox_mail_thread', schema='global') + op.drop_index( + op.f("ix_global_inbox_mail_thread_association_user_id"), + table_name="inbox_mail_thread_association", + schema="global", + ) + op.drop_index( + op.f("ix_global_inbox_mail_thread_association_thread_id"), + table_name="inbox_mail_thread_association", + schema="global", + ) + op.drop_table("inbox_mail_thread_association", schema="global") + op.drop_index( + op.f("ix_global_inbox_mail_thread_id"), table_name="inbox_mail", schema="global" + ) + op.drop_index( + op.f("ix_global_inbox_mail_sender_id"), table_name="inbox_mail", schema="global" + ) + op.drop_table("inbox_mail", schema="global") + op.drop_index( + op.f("ix_global_inbox_mail_thread_support_owner_id"), + table_name="inbox_mail_thread", + schema="global", + ) + op.drop_index( + op.f("ix_global_inbox_mail_thread_organization_id"), + table_name="inbox_mail_thread", + schema="global", + ) + op.drop_index( + op.f("ix_global_inbox_mail_thread_created_by"), + table_name="inbox_mail_thread", + schema="global", + ) + op.drop_table("inbox_mail_thread", schema="global") # ### end Alembic commands ### diff --git a/alembic/versions/31c4968699ad_add_etl_content_to_records.py b/alembic/versions/31c4968699ad_add_etl_content_to_records.py new file mode 100644 index 00000000..0a3a820f --- /dev/null +++ b/alembic/versions/31c4968699ad_add_etl_content_to_records.py @@ -0,0 +1,55 @@ +"""add etl content to records + +Revision ID: 31c4968699ad +Revises: 9d5fb67e29f7 +Create Date: 2025-11-23 23:08:27.327070 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "31c4968699ad" +down_revision = "9d5fb67e29f7" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "etl_task", + sa.Column("original_file_name", sa.String(), nullable=True), + schema="global", + ) + op.add_column( + "github_file", + sa.Column("content", sa.String(), nullable=True), + schema="integration", + ) + op.add_column( + "github_issue", + sa.Column("content", sa.String(), nullable=True), + schema="integration", + ) + op.add_column( + "pdf", sa.Column("content", sa.String(), nullable=True), schema="integration" + ) + op.add_column( + "sharepoint", + sa.Column("content", sa.String(), nullable=True), + schema="integration", + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
###
+    op.drop_column("sharepoint", "content", schema="integration")
+    op.drop_column("pdf", "content", schema="integration")
+    op.drop_column("github_issue", "content", schema="integration")
+    op.drop_column("github_file", "content", schema="integration")
+    op.drop_column("etl_task", "original_file_name", schema="global")
+    # ### end Alembic commands ###
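The `content` columns added above let integration records carry their extracted text directly. A minimal read-path sketch — only the `content` column comes from this migration; the `id` lookup on `integration.pdf` is an assumption:

```python
from typing import Optional

import sqlalchemy as sa


def fetch_pdf_content(connection: sa.engine.Connection, pdf_id: str) -> Optional[str]:
    # "content" is the column added by revision 31c4968699ad; the id filter is illustrative.
    row = connection.execute(
        sa.text("SELECT content FROM integration.pdf WHERE id = :id"),
        {"id": pdf_id},
    ).first()
    return row[0] if row else None
```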
diff --git a/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py b/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py
new file mode 100644
index 00000000..11439542
--- /dev/null
+++ b/alembic/versions/64874114490b_remove_cognition_project_fields_for_new_.py
@@ -0,0 +1,218 @@
+"""remove cognition project fields for new etl
+
+Revision ID: 64874114490b
+Revises: c4218a7d06e0
+Create Date: 2025-11-25 15:49:30.097610
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+import json
+import uuid
+from submodules.model.enums import LLMProvider
+
+# revision identifiers, used by Alembic.
+revision = "64874114490b"
+down_revision = "c4218a7d06e0"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "project",
+        sa.Column("useable_etl_configurations", sa.JSON(), nullable=True),
+        schema="cognition",
+    )
+    op.drop_constraint(
+        "etl_config_preset_name_key",
+        "etl_config_preset",
+        schema="cognition",
+        type_="unique",
+    )
+    op.drop_index(
+        "ix_cognition_etl_config_preset_project_id",
+        table_name="etl_config_preset",
+        schema="cognition",
+    )
+    op.drop_constraint(
+        "etl_config_preset_project_id_fkey",
+        "etl_config_preset",
+        schema="cognition",
+        type_="foreignkey",
+    )
+    op.drop_column("etl_config_preset", "project_id", schema="cognition")
+    __conversion_helper()
+    op.drop_column("project", "tokenizer", schema="cognition")
+    op.drop_column("project", "llm_config", schema="cognition")
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column("project", "useable_etl_configurations", schema="cognition")
+    op.add_column(
+        "etl_config_preset",
+        sa.Column("project_id", postgresql.UUID(), autoincrement=False, nullable=True),
+        schema="cognition",
+    )
+    op.create_foreign_key(
+        "etl_config_preset_project_id_fkey",
+        "etl_config_preset",
+        "project",
+        ["project_id"],
+        ["id"],
+        source_schema="cognition",
+        referent_schema="cognition",
+        ondelete="CASCADE",
+    )
+    op.create_index(
+        "ix_cognition_etl_config_preset_project_id",
+        "etl_config_preset",
+        ["project_id"],
+        unique=False,
+        schema="cognition",
+    )
+    op.create_unique_constraint(
+        "etl_config_preset_name_key", "etl_config_preset", ["name"], schema="cognition"
+    )
+    op.add_column(
+        "project",
+        sa.Column(
+            "llm_config",
+            postgresql.JSON(astext_type=sa.Text()),
+            autoincrement=False,
+            nullable=True,
+        ),
+        schema="cognition",
+    )
+    op.add_column(
+        "project",
+        sa.Column("tokenizer", sa.VARCHAR(), autoincrement=False, nullable=True),
+        schema="cognition",
+    )
+    # ### end Alembic commands ###
+
+
+def __conversion_helper():
+    connection = op.get_bind()
+    select_sql = sa.text(
+        """
+        SELECT id, organization_id, llm_config, tokenizer, name, created_by, created_at
+        FROM cognition.project
+        WHERE (llm_config IS NOT NULL OR tokenizer IS NOT NULL)
+        AND allow_file_upload = true;
+        """
+    )
+    result = connection.execute(select_sql)
+
+    # Convert every legacy project row into an etl_config_preset entry
+    for row in result:
+        print(f"Converting project {row['name']} (ID: {row['id']})")
+
+        converted_object = __convert_object(row)
+        insert_sql = sa.text(
+            """
+            INSERT INTO cognition.etl_config_preset(
+                id,
+                organization_id,
+                name,
+                description,
+                created_at,
+                created_by,
+                etl_config,
+                add_config
+            )
+            VALUES (
+                :id,
+                :organization_id,
+                :name,
+                :description,
+                :created_at,
+                :created_by,
+                :etl_config,
+                :add_config
+            )
+            """
+        )
+        connection.execute(insert_sql, converted_object)
+
+        # Point the project row at its newly created default configuration
+        update_sql = sa.text(
+            """
+            UPDATE cognition.project
+            SET useable_etl_configurations = :config_list
+            WHERE id = :project_id
+            """
+        )
+        connection.execute(
+            update_sql,
+            {
+                "config_list": json.dumps(
+                    [{"id": converted_object["id"], "isDefault": True}]
+                ),
+                "project_id": row["id"],
+            },
+        )
+
+
+def __convert_object(row):
+    # Map the legacy llm_config / tokenizer columns onto the new etl_config layout;
+    # guard against rows where only the tokenizer is set (llm_config may be NULL).
+    legacy_config = row["llm_config"] or {}
+    pdf_extraction = {}
+    extraction_data = legacy_config.get("extraction", {})
+    extractor = (extraction_data.get("extractor") or "").lower()
+    if extractor == "pdf2markdown":
+        pdf_extraction["extractor"] = "PDF2MD"
+    elif extractor == "azure_di":
+        pdf_extraction["azureDiApiBase"] = extraction_data.get("azureDiApiBase", "")
+        pdf_extraction["azureDiEnvVarId"] = extraction_data.get("azureDiEnvVarId", "")
+        pdf_extraction["extractor"] = "AZURE_DI"
+    elif extractor in ("gpt", "vision", "gpt-4"):
+        pdf_extraction["overwriteVisionPrompt"] = extraction_data.get(
+            "overwriteVisionPrompt", False
+        )
+        pdf_extraction["llmIdentifier"] = LLMProvider.from_string(
+            extraction_data.get("llmIdentifier", "")
+        ).value
+        pdf_extraction["extractor"] = "VISION"
+        llm_config = extraction_data.copy()
+        llm_config.pop("extractor", None)
+        llm_config.pop("overwriteVisionPrompt", None)
+        llm_config.pop("llmIdentifier", None)
+        pdf_extraction["llmConfig"] = llm_config
+    transformation_data = legacy_config.get("transformation", {})
+    transformation_config = {}
+    transformation_config["llmIdentifier"] = LLMProvider.from_string(
+        transformation_data.get("llmIdentifier", "")
+    ).value
+    transformation_config["type"] = "COMMON_ETL"
+    llm_config = transformation_data.copy()
+    llm_config.pop("llmIdentifier", None)
+    transformation_config["llmConfig"] = llm_config
+    # assemble pdf extraction & llm transformation parts of the migrated config
+    converted_object = {
+        "extraction": {"default": {"extractor": "LANGCHAIN"}, "pdf": pdf_extraction},
+        "tokenizer": row["tokenizer"],
+        "transformation": transformation_config,
+    }
+    final_object = {
+        "id": str(uuid.uuid4()),
+        "organization_id": str(row["organization_id"]),
+        "name": row["name"] + " - migrated etl config",
+        "description": "ETL configuration migrated from old project settings",
+        "created_at": row["created_at"].isoformat(),
+        "created_by": str(row["created_by"]),
+        "etl_config": json.dumps(converted_object),
+        "add_config": json.dumps({}),
+    }
+
+    return final_object
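Both conversion helpers above store `useable_etl_configurations` as a JSON list of `{"id": ..., "isDefault": ...}` entries. A sketch of how consuming code might resolve the effective preset — the helper name and the tolerance for both string and list inputs are assumptions, not part of this diff:

```python
import json
from typing import Any, Dict, List, Optional, Union


def resolve_default_preset_id(
    useable_etl_configurations: Union[str, List[Dict[str, Any]], None],
) -> Optional[str]:
    # Accept either the raw JSON string or an already-decoded list.
    if not useable_etl_configurations:
        return None
    configs = (
        json.loads(useable_etl_configurations)
        if isinstance(useable_etl_configurations, str)
        else useable_etl_configurations
    )
    for entry in configs:
        if entry.get("isDefault"):
            return entry.get("id")
    # Fall back to the first entry if nothing is flagged as default.
    return configs[0].get("id") if configs else None
```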
"organization_id": str(row["organization_id"]), + "name": row["name"] + " - migrated etl config", + "description": "ETL configuration migrated from old project settings", + "created_at": row["created_at"].isoformat(), + "created_by": str(row["created_by"]), + "etl_config": json.dumps(converted_object), + "add_config": json.dumps({}), + } + + return final_object + # ----------------------------------- diff --git a/alembic/versions/9d5fb67e29f7_config_sets.py b/alembic/versions/9d5fb67e29f7_config_sets.py new file mode 100644 index 00000000..bfa8c8ab --- /dev/null +++ b/alembic/versions/9d5fb67e29f7_config_sets.py @@ -0,0 +1,87 @@ +"""Config sets' + + +Revision ID: 9d5fb67e29f7 +Revises: f428a22ecdb3 +Create Date: 2025-11-03 15:28:47.686657 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "9d5fb67e29f7" +down_revision = "f428a22ecdb3" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "etl_config_preset", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("project_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("name", sa.String(), nullable=True), + sa.Column("description", sa.String(), nullable=True), + sa.Column("created_at", sa.DateTime(), nullable=True), + sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("etl_config", sa.JSON(), nullable=True), + sa.Column("add_config", sa.JSON(), nullable=True), + sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint( + ["project_id"], ["cognition.project.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("name"), + schema="cognition", + ) + op.create_index( + op.f("ix_cognition_etl_config_preset_created_by"), + "etl_config_preset", + ["created_by"], + unique=False, + schema="cognition", + ) + op.create_index( + op.f("ix_cognition_etl_config_preset_organization_id"), + "etl_config_preset", + ["organization_id"], + unique=False, + schema="cognition", + ) + op.create_index( + op.f("ix_cognition_etl_config_preset_project_id"), + "etl_config_preset", + ["project_id"], + unique=False, + schema="cognition", + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index( + op.f("ix_cognition_etl_config_preset_project_id"), + table_name="etl_config_preset", + schema="cognition", + ) + op.drop_index( + op.f("ix_cognition_etl_config_preset_organization_id"), + table_name="etl_config_preset", + schema="cognition", + ) + op.drop_index( + op.f("ix_cognition_etl_config_preset_created_by"), + table_name="etl_config_preset", + schema="cognition", + ) + op.drop_table("etl_config_preset", schema="cognition") + # ### end Alembic commands ### diff --git a/alembic/versions/c4218a7d06e0_added_full_admin_table.py b/alembic/versions/c4218a7d06e0_added_full_admin_table.py new file mode 100644 index 00000000..74304ccd --- /dev/null +++ b/alembic/versions/c4218a7d06e0_added_full_admin_table.py @@ -0,0 +1,49 @@ +"""Added full admin table + +Revision ID: c4218a7d06e0 +Revises: 31c4968699ad +Create Date: 2025-04-24 09:12:33.200446 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "c4218a7d06e0" +down_revision = "31c4968699ad" +branch_labels = None +depends_on = None + + +def upgrade(): + op.execute( + """ + INSERT INTO global.full_admin_access (id, email, meta_info) + VALUES + (gen_random_uuid(), 'l.lumburovska@accompio.com','{}'), + (gen_random_uuid(), 'j.wittmeyer@accompio.com','{}'), + (gen_random_uuid(), 'le.schmidt@accompio.com','{}'), + (gen_random_uuid(), 'a.hrelja@accompio.com','{}'), + (gen_random_uuid(), 'l.puettmann@accompio.com','{}'), + (gen_random_uuid(), 'j.hoetter@accompio.com','{}'), + (gen_random_uuid(), 'h.wenck@accompio.com','{}'), + (gen_random_uuid(), 'j.wirth@accompio.com','{}') + """ + ) + + +def downgrade(): + op.execute( + """ + DELETE FROM global.full_admin_access WHERE email IN ( + 'l.lumburovska@accompio.com', + 'j.wittmeyer@accompio.com', + 'le.schmidt@accompio.com', + 'a.hrelja@accompio.com', + 'l.puettmann@accompio.com', + 'j.hoetter@accompio.com', + 'h.wenck@accompio.com', + 'j.wirth@accompio.com' + ) + """ + ) diff --git a/alembic/versions/f428a22ecdb3_adds_etl_task_table.py b/alembic/versions/f428a22ecdb3_adds_etl_task_table.py new file mode 100644 index 00000000..d87fb129 --- /dev/null +++ b/alembic/versions/f428a22ecdb3_adds_etl_task_table.py @@ -0,0 +1,318 @@ +"""adds etl task table + +Revision ID: f428a22ecdb3 +Revises: 0ea9ab7cb60f +Create Date: 2025-10-30 10:45:20.843280 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "f428a22ecdb3" +down_revision = "0ea9ab7cb60f" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "etl_task", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("created_at", sa.DateTime(), nullable=True), + sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("file_path", sa.String(), nullable=True), + sa.Column("file_size_bytes", sa.BigInteger(), nullable=True), + sa.Column("tokenizer", sa.String(), nullable=True), + sa.Column("full_config", sa.JSON(), nullable=True), + sa.Column("started_at", sa.DateTime(), nullable=True), + sa.Column("finished_at", sa.DateTime(), nullable=True), + sa.Column("state", sa.String(), nullable=True), + sa.Column("is_active", sa.Boolean(), nullable=True), + sa.Column("priority", sa.Integer(), nullable=True), + sa.Column("error_message", sa.String(), nullable=True), + sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint( + ["organization_id"], ["organization.id"], ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id"), + schema="global", + ) + op.create_index( + op.f("ix_global_etl_task_created_by"), + "etl_task", + ["created_by"], + unique=False, + schema="global", + ) + op.create_index( + op.f("ix_global_etl_task_organization_id"), + "etl_task", + ["organization_id"], + unique=False, + schema="global", + ) + op.add_column( + "markdown_file", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="cognition", + ) + op.create_index( + op.f("ix_cognition_markdown_file_etl_task_id"), + "markdown_file", + ["etl_task_id"], + unique=False, + schema="cognition", + ) + op.create_unique_constraint( + "unique_markdown_file_etl_task_id", + "markdown_file", + ["id", "etl_task_id"], + schema="cognition", + ) + op.create_foreign_key( + None, + "markdown_file", + "etl_task", + ["etl_task_id"], + ["id"], + source_schema="cognition", + referent_schema="global", + ondelete="CASCADE", + ) + op.add_column( + "github_file", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="integration", + ) + op.drop_constraint( + "unique_github_file_source", "github_file", schema="integration", type_="unique" + ) + op.create_unique_constraint( + "unique_github_file_source", + "github_file", + ["integration_id", "running_id", "source", "etl_task_id"], + schema="integration", + ) + op.create_index( + op.f("ix_integration_github_file_etl_task_id"), + "github_file", + ["etl_task_id"], + unique=False, + schema="integration", + ) + op.create_foreign_key( + None, + "github_file", + "etl_task", + ["etl_task_id"], + ["id"], + source_schema="integration", + referent_schema="global", + ondelete="CASCADE", + ) + op.add_column( + "github_issue", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="integration", + ) + op.drop_constraint( + "unique_github_issue_source", + "github_issue", + schema="integration", + type_="unique", + ) + op.create_unique_constraint( + "unique_github_issue_source", + "github_issue", + ["integration_id", "running_id", "source", "etl_task_id"], + schema="integration", + ) + op.create_index( + op.f("ix_integration_github_issue_etl_task_id"), + "github_issue", + ["etl_task_id"], + unique=False, + schema="integration", + ) + op.create_foreign_key( + None, + "github_issue", + "etl_task", + ["etl_task_id"], + ["id"], + source_schema="integration", + referent_schema="global", + ondelete="CASCADE", + ) + op.add_column( + "pdf", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), 
nullable=True), + schema="integration", + ) + op.drop_constraint("unique_pdf_source", "pdf", schema="integration", type_="unique") + op.create_unique_constraint( + "unique_pdf_source", + "pdf", + ["integration_id", "running_id", "source", "etl_task_id"], + schema="integration", + ) + op.create_index( + op.f("ix_integration_pdf_etl_task_id"), + "pdf", + ["etl_task_id"], + unique=False, + schema="integration", + ) + op.create_foreign_key( + None, + "pdf", + "etl_task", + ["etl_task_id"], + ["id"], + source_schema="integration", + referent_schema="global", + ondelete="CASCADE", + ) + op.add_column( + "sharepoint", + sa.Column("etl_task_id", postgresql.UUID(as_uuid=True), nullable=True), + schema="integration", + ) + op.drop_constraint( + "unique_sharepoint_source", "sharepoint", schema="integration", type_="unique" + ) + op.create_unique_constraint( + "unique_sharepoint_source", + "sharepoint", + ["integration_id", "running_id", "source", "etl_task_id"], + schema="integration", + ) + op.create_index( + op.f("ix_integration_sharepoint_etl_task_id"), + "sharepoint", + ["etl_task_id"], + unique=False, + schema="integration", + ) + op.create_foreign_key( + None, + "sharepoint", + "etl_task", + ["etl_task_id"], + ["id"], + source_schema="integration", + referent_schema="global", + ondelete="CASCADE", + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint( + "sharepoint_etl_task_id_fkey", + "sharepoint", + schema="integration", + type_="foreignkey", + ) + op.drop_index( + op.f("ix_integration_sharepoint_etl_task_id"), + table_name="sharepoint", + schema="integration", + ) + op.drop_constraint( + "unique_sharepoint_source", "sharepoint", schema="integration", type_="unique" + ) + op.create_unique_constraint( + "unique_sharepoint_source", + "sharepoint", + ["integration_id", "running_id", "source"], + schema="integration", + ) + op.drop_column("sharepoint", "etl_task_id", schema="integration") + op.drop_constraint( + "pdf_etl_task_id_fkey", "pdf", schema="integration", type_="foreignkey" + ) + op.drop_index( + op.f("ix_integration_pdf_etl_task_id"), table_name="pdf", schema="integration" + ) + op.drop_constraint("unique_pdf_source", "pdf", schema="integration", type_="unique") + op.create_unique_constraint( + "unique_pdf_source", + "pdf", + ["integration_id", "running_id", "source"], + schema="integration", + ) + op.drop_column("pdf", "etl_task_id", schema="integration") + op.drop_constraint( + "github_issue_etl_task_id_fkey", + "github_issue", + schema="integration", + type_="foreignkey", + ) + op.drop_index( + op.f("ix_integration_github_issue_etl_task_id"), + table_name="github_issue", + schema="integration", + ) + op.drop_constraint( + "unique_github_issue_source", + "github_issue", + schema="integration", + type_="unique", + ) + op.create_unique_constraint( + "unique_github_issue_source", + "github_issue", + ["integration_id", "running_id", "source"], + schema="integration", + ) + op.drop_column("github_issue", "etl_task_id", schema="integration") + op.drop_constraint( + "github_file_etl_task_id_fkey", + "github_file", + schema="integration", + type_="foreignkey", + ) + op.drop_index( + op.f("ix_integration_github_file_etl_task_id"), + table_name="github_file", + schema="integration", + ) + op.drop_constraint( + "unique_github_file_source", "github_file", schema="integration", type_="unique" + ) + op.create_unique_constraint( + "unique_github_file_source", + "github_file", + ["integration_id", "running_id", 
"source"], + schema="integration", + ) + op.drop_column("github_file", "etl_task_id", schema="integration") + op.drop_constraint( + "markdown_file_etl_task_id_fkey", + "markdown_file", + schema="cognition", + type_="foreignkey", + ) + op.drop_index( + op.f("ix_cognition_markdown_file_etl_task_id"), + table_name="markdown_file", + schema="cognition", + ) + op.drop_column("markdown_file", "etl_task_id", schema="cognition") + op.drop_index( + op.f("ix_global_etl_task_organization_id"), + table_name="etl_task", + schema="global", + ) + op.drop_index( + op.f("ix_global_etl_task_created_by"), table_name="etl_task", schema="global" + ) + op.drop_table("etl_task", schema="global") + # ### end Alembic commands ### diff --git a/controller/monitor/manager.py b/controller/monitor/manager.py index f4bdd43b..907b5bf6 100644 --- a/controller/monitor/manager.py +++ b/controller/monitor/manager.py @@ -126,3 +126,14 @@ def cancel_integration_task( task_monitor.set_integration_task_to_failed( integration_id, error_message="Cancelled by task manager" ) + + +def cancel_etl_task( + task_info: Dict[str, Any], +) -> None: + + etl_task_id = task_info.get("etlTaskId") + + task_monitor.set_etl_task_to_failed( + etl_task_id, error_message="Cancelled by task manager" + ) diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py index 3ea51107..c2b17a6e 100644 --- a/controller/transfer/cognition/minio_upload.py +++ b/controller/transfer/cognition/minio_upload.py @@ -1,54 +1,125 @@ from typing import List -from submodules.model.cognition_objects import file_reference as file_reference_db_bo -from submodules.model.enums import TaskType, FileCachingProcessingScope + from controller.task_master import manager as task_master_manager -from submodules.model import enums +from submodules.model import enums, etl_utils from submodules.model.business_objects import general +from submodules.model.global_objects import etl_task as etl_task_bo +from submodules.model.cognition_objects import ( + file_reference as file_reference_db_bo, + markdown_file as markdown_file_bo, + markdown_dataset as markdown_dataset_bo, +) def handle_cognition_file_upload(path_parts: List[str]): - + # raise NotImplementedError("This function is not yet implemented.") if path_parts[1] != "_cognition" or len(path_parts) < 5: return - if path_parts[2] == "files" and path_parts[4].startswith("file_original"): - org_id = path_parts[0] - file_hash, file_size = path_parts[3].split("_") - file_reference = file_reference_db_bo.get(org_id, file_hash, int(file_size)) - - if ( - not file_reference - or file_reference.state == enums.FileCachingState.RUNNING.value - or file_reference.state == enums.FileCachingState.COMPLETED.value - ): - # file_reference is None or already processed in queue - print("File reference duplication error, file is already processed", flush=True) - if file_reference: - print(f"File reference id: {str(file_reference.id)}", flush=True) - print(f"File name: {file_reference.original_file_name}", flush=True) - return - file_reference.state = enums.FileCachingState.COMPLETED.value - general.commit() - - prio = ( - file_reference.meta_data.get("transformation_initiator") - == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value - ) - extraction_method = file_reference.meta_data.get("extraction_method") + ##tmp doc retrieval => need to understand how .info file is an indicator for cognition gateway to pick it up + if not (path_parts[2] == "files" and path_parts[4].startswith("file_original")): + return + + 
diff --git a/controller/transfer/cognition/minio_upload.py b/controller/transfer/cognition/minio_upload.py
index 3ea51107..c2b17a6e 100644
--- a/controller/transfer/cognition/minio_upload.py
+++ b/controller/transfer/cognition/minio_upload.py
@@ -1,54 +1,125 @@
 from typing import List
-from submodules.model.cognition_objects import file_reference as file_reference_db_bo
-from submodules.model.enums import TaskType, FileCachingProcessingScope
+
 from controller.task_master import manager as task_master_manager
-from submodules.model import enums
+from submodules.model import enums, etl_utils
 from submodules.model.business_objects import general
+from submodules.model.global_objects import etl_task as etl_task_bo
+from submodules.model.cognition_objects import (
+    file_reference as file_reference_db_bo,
+    markdown_file as markdown_file_bo,
+    markdown_dataset as markdown_dataset_bo,
+)
 
 
 def handle_cognition_file_upload(path_parts: List[str]):
-
     if path_parts[1] != "_cognition" or len(path_parts) < 5:
         return
 
-    if path_parts[2] == "files" and path_parts[4].startswith("file_original"):
-        org_id = path_parts[0]
-        file_hash, file_size = path_parts[3].split("_")
-        file_reference = file_reference_db_bo.get(org_id, file_hash, int(file_size))
-
-        if (
-            not file_reference
-            or file_reference.state == enums.FileCachingState.RUNNING.value
-            or file_reference.state == enums.FileCachingState.COMPLETED.value
-        ):
-            # file_reference is None or already processed in queue
-            print("File reference duplication error, file is already processed", flush=True)
-            if file_reference:
-                print(f"File reference id: {str(file_reference.id)}", flush=True)
-                print(f"File name: {file_reference.original_file_name}", flush=True)
-            return
-        file_reference.state = enums.FileCachingState.COMPLETED.value
-        general.commit()
-
-        prio = (
-            file_reference.meta_data.get("transformation_initiator")
-            == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value
-        )
-        extraction_method = file_reference.meta_data.get("extraction_method")
+    # TODO (tmp doc retrieval): clarify how the .info file signals the cognition gateway to pick this file up
+    if not (path_parts[2] == "files" and path_parts[4].startswith("file_original")):
+        return
+
+    org_id = path_parts[0]
+    file_hash, file_size = path_parts[3].split("_")
+    file_reference = file_reference_db_bo.get(org_id, file_hash, int(file_size))
+
+    if (
+        not file_reference
+        or file_reference.state == enums.FileCachingState.RUNNING.value
+        or file_reference.state == enums.FileCachingState.COMPLETED.value
+    ):
+        # file_reference is None or already processed in queue
+        print(
+            f"WARNING: {__name__} - file reference duplication error, file is already processed",
+            flush=True,
+        )
+        if file_reference:
+            print(
+                f"INFO: {__name__} - file reference id: {str(file_reference.id)}",
+                flush=True,
+            )
+            print(
+                f"INFO: {__name__} - file name: {file_reference.original_file_name}",
+                flush=True,
+            )
+        return
+
+    file_reference.state = enums.FileCachingState.COMPLETED.value
+    general.commit()
+
+    if (
+        file_reference.meta_data.get("file_caching_initiator")
+        == enums.FileCachingInitiator.TMP_DOC_RETRIEVAL.value
+    ):
+        project_id = file_reference.meta_data.get("project_id")
+        conversation_id = file_reference.meta_data.get("conversation_id")
+        full_config, tokenizer = etl_utils.get_full_config_and_tokenizer_from_config_id(
+            file_reference, project_id=project_id, conversation_id=conversation_id
+        )
+        etl_task = etl_task_bo.create(
+            org_id,
+            file_reference.created_by,
+            file_reference.original_file_name,
+            file_reference.file_size_bytes,
+            full_config=full_config,
+            tokenizer=tokenizer,
+            meta_data={
+                "file_reference_id": str(file_reference.id),
+                "tmp_doc_metadata": {
+                    "project_id": project_id,
+                    "conversation_id": conversation_id,
+                },
+            },
+            priority=1,
+        )
         task_master_manager.queue_task(
-            str(file_reference.organization_id),
+            org_id,
             str(file_reference.created_by),
-            TaskType.PARSE_COGNITION_FILE,
+            enums.TaskType.EXECUTE_ETL,
             {
-                "parse_scope": FileCachingProcessingScope.EXTRACT_TRANSFORM.value,
+                "etl_task_id": str(etl_task.id),
+                **etl_task.meta_data,
+            },
+            priority=True,
+        )
+
+    else:
+        priority = -1
+
+        markdown_file = markdown_file_bo.get(
+            org_id, file_reference.meta_data.get("markdown_file_id")
+        )
+
+        full_config, tokenizer = etl_utils.get_full_config_and_tokenizer_from_config_id(
+            file_reference
+        )
+        etl_task = etl_task_bo.create(
+            org_id,
+            file_reference.created_by,
+            file_reference.original_file_name,
+            file_reference.file_size_bytes,
+            full_config=full_config,
+            tokenizer=tokenizer,
+            meta_data={
                 "file_reference_id": str(file_reference.id),
-                "extraction_method": extraction_method,
-                "meta_data": file_reference.meta_data,
-                "extraction_key": file_reference.meta_data.get("extraction_key"),
-                "transformation_key": file_reference.meta_data.get(
-                    "transformation_key"
-                ),
-                "file_name": file_reference.original_file_name,
             },
-            prio,  # not sure if prio is right here as the prio tasks should only take < 1 min but waiting for the normal queue will take ages depending on the queue
+            priority=priority,
+        )
+
+        markdown_file_bo.update(
+            org_id=org_id,
+            markdown_file_id=markdown_file.id,
+            etl_task_id=etl_task.id,
+        )
+
+        task_master_manager.queue_task(
+            org_id,
+            str(file_reference.created_by),
+            enums.TaskType.EXECUTE_ETL,
+            {
+                "etl_task_id": str(etl_task.id),
+                **etl_task.meta_data,
+            },
+            priority=priority != -1,
         )
diff --git a/fast_api/routes/misc.py b/fast_api/routes/misc.py
index d3a62f2f..e98af18f 100644
--- a/fast_api/routes/misc.py
+++ b/fast_api/routes/misc.py
@@ -136,6 +136,8 @@ def cancel_task(
         )
     elif task_type == enums.TaskType.EXECUTE_INTEGRATION.value:
         controller_manager.cancel_integration_task(task_info)
+    elif task_type == 
enums.TaskType.EXECUTE_ETL.value: + controller_manager.cancel_etl_task(task_info) else: raise ValueError(f"{task_type} is no valid task type") diff --git a/submodules/model b/submodules/model index 4ed1d9f3..e00df1d8 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 4ed1d9f3e5b715731a81e0fca88b7406c817a77b +Subproject commit e00df1d8b005b950b0188859e73ce237e2868a12
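For orientation, `handle_cognition_file_upload` above assumes MinIO object paths of the form `<org_id>/_cognition/files/<file_hash>_<file_size>/file_original*`. A small sketch of that parsing contract — the concrete path is invented:

```python
# Hypothetical object path; split exactly as the handler does.
path = "9a1b2c3d-1111-2222-3333-444455556666/_cognition/files/abc123_2048/file_original.pdf"
path_parts = path.split("/")

assert path_parts[1] == "_cognition"              # namespace guard
assert path_parts[2] == "files"                   # only file uploads trigger ETL
assert path_parts[4].startswith("file_original")  # original upload, not a derivative

org_id = path_parts[0]                            # owning organization
file_hash, file_size = path_parts[3].split("_")   # "<hash>_<size in bytes>"
```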