
Commit c01ffb9

Merge branch 'release-1.10.0' into feat/extension-production-install
2 parents 7a346d4 + 5c823f6 commit c01ffb9

13 files changed

Lines changed: 2708 additions & 57 deletions


Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
+"""add_preprocessing_output
+
+Adds:
+- memory_base.preproc_kill_phrase (nullable String) to support LLM gating sentinel.
+- memory_base_preprocessing_output table — one row per preprocessing batch capturing
+  the LLM output, status (processed/ingested/skipped), and the source message-id list
+  so two-phase commit (LLM call → Chroma write) can resume after KB failures.
+
+Phase: EXPAND
+
+Revision ID: mb01b2c3d4e5
+Revises: kb1a2b3c4d5e
+Create Date: 2026-05-01 00:00:00.000000
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from alembic import op
+from langflow.utils import migration
+
+revision: str = "mb01b2c3d4e5"  # pragma: allowlist secret
+down_revision: str | None = "kb1a2b3c4d5e"  # pragma: allowlist secret
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+
+    # ------------------------------------------------------------------ #
+    # memory_base.preproc_kill_phrase                                     #
+    # ------------------------------------------------------------------ #
+    with op.batch_alter_table("memory_base", schema=None) as batch_op:
+        if not migration.column_exists("memory_base", "preproc_kill_phrase", conn):
+            batch_op.add_column(sa.Column("preproc_kill_phrase", sa.String(), nullable=True))
+
+    # ------------------------------------------------------------------ #
+    # memory_base_preprocessing_output                                    #
+    # ------------------------------------------------------------------ #
+    if not migration.table_exists("memory_base_preprocessing_output", conn):
+        op.create_table(
+            "memory_base_preprocessing_output",
+            sa.Column("id", sa.Uuid(), nullable=False),
+            sa.Column(
+                "memory_base_id",
+                sa.Uuid(),
+                sa.ForeignKey("memory_base.id", ondelete="CASCADE"),
+                nullable=False,
+            ),
+            sa.Column("session_id", sa.String(), nullable=False),
+            sa.Column(
+                "job_id",
+                sa.Uuid(),
+                sa.ForeignKey("job.job_id", ondelete="SET NULL"),
+                nullable=True,
+            ),
+            sa.Column("status", sa.String(), nullable=False),
+            sa.Column("output_text", sa.Text(), nullable=True),
+            sa.Column("source_message_ids", sa.JSON(), nullable=False),
+            sa.Column("model_used", sa.String(), nullable=False),
+            sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+            sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+            sa.PrimaryKeyConstraint("id"),
+        )
+        op.create_index(
+            "ix_mbpo_pending",
+            "memory_base_preprocessing_output",
+            ["memory_base_id", "session_id", "status", "created_at"],
+        )
+        op.create_index(
+            "ix_mbpo_listing",
+            "memory_base_preprocessing_output",
+            ["memory_base_id", "session_id", "created_at"],
+        )
+        op.create_index(
+            "ix_mbpo_job_id",
+            "memory_base_preprocessing_output",
+            ["job_id"],
+        )
+
+
+def downgrade() -> None:
+    conn = op.get_bind()
+
+    if migration.table_exists("memory_base_preprocessing_output", conn):
+        op.drop_index("ix_mbpo_job_id", table_name="memory_base_preprocessing_output")
+        op.drop_index("ix_mbpo_listing", table_name="memory_base_preprocessing_output")
+        op.drop_index("ix_mbpo_pending", table_name="memory_base_preprocessing_output")
+        op.drop_table("memory_base_preprocessing_output")
+
+    with op.batch_alter_table("memory_base", schema=None) as batch_op:
+        if migration.column_exists("memory_base", "preproc_kill_phrase", conn):
+            batch_op.drop_column("preproc_kill_phrase")
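A note on re-runnability: every DDL statement above is wrapped in an existence check (migration.column_exists / migration.table_exists), so the upgrade tolerates a partially applied schema, which is the point of an EXPAND-phase migration. A hedged sketch of driving the round trip from Alembic's command API, assuming a standard alembic.ini at the project root:

from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")  # assumption: default Alembic project layout

# Upgrade to this revision. Thanks to the existence guards, a schema where
# the table already exists but the column does not (or vice versa) still
# upgrades cleanly instead of failing on duplicate DDL.
command.upgrade(cfg, "mb01b2c3d4e5")

# Downgrading to the parent revision drops the indexes, the table, and the
# new column, mirroring upgrade() in reverse order.
command.downgrade(cfg, "kb1a2b3c4d5e")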

src/backend/base/langflow/api/utils/kb_helpers.py

Lines changed: 35 additions & 7 deletions
@@ -70,6 +70,35 @@ class IngestionCancelledError(Exception):
     """Custom error for when an ingestion job is cancelled."""


+def chunk_text_for_ingestion(
+    text: str,
+    *,
+    chunk_size: int = 1000,
+    chunk_overlap: int = 100,
+    separator: str | None = None,
+) -> list[str]:
+    r"""Split text into chunks using ``RecursiveCharacterTextSplitter``.
+
+    Single source of truth for chunking config used by every ingestion path —
+    KB file ingestion and Memory Base raw / preprocessed message ingestion.
+    Centralizing this keeps chunk-size / overlap behavior identical so a
+    chunk that fits in one path won't suddenly overflow in another.
+
+    ``separator``: when provided, escaped newlines (``"\\n"``) are unescaped
+    and the value is passed as a single-element ``separators`` list, matching
+    the behavior of ``KBIngestionHelper.perform_ingestion``.
+
+    Returns ``[]`` for empty / whitespace-only input.
+    """
+    if not text or not text.strip():
+        return []
+    splitter_kwargs: dict = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
+    if separator:
+        splitter_kwargs["separators"] = [separator.replace("\\n", "\n")]
+    splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
+    return splitter.split_text(text)
+
+
 class KBStorageHelper:
     """Helper class for Knowledge Base storage and path management."""
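A quick usage sketch of the new helper; the import path follows the file location above, and the sample strings are illustrative only:

from langflow.api.utils.kb_helpers import chunk_text_for_ingestion

# Whitespace-only input short-circuits to [] before any splitter is built.
assert chunk_text_for_ingestion("   ") == []

# An escaped-newline separator ("\\n") is unescaped and handed to
# RecursiveCharacterTextSplitter as the sole separator, so splits fall
# on real line breaks rather than the splitter's default hierarchy.
chunks = chunk_text_for_ingestion(
    "first line\nsecond line\nthird line",
    chunk_size=20,
    chunk_overlap=0,
    separator="\\n",
)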

@@ -664,12 +693,6 @@ async def perform_ingestion(
         encoded_metadata_tag = json.dumps(source_metadata) if source_metadata else ""
         source_extension_tags: set[str] = set()
         try:
-            splitter_kwargs: dict = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
-            if separator:
-                resolved_separator = separator.replace("\\n", "\n")
-                splitter_kwargs["separators"] = [resolved_separator]
-            text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
-
             embeddings = await KBIngestionHelper.build_embeddings(embedding_provider, embedding_model, current_user)
             backend_type_value = (
                 kb_record.backend_type if kb_record and kb_record.backend_type else BackendType.CHROMA.value

@@ -736,7 +759,12 @@ async def perform_ingestion(
                 combined_metadata.update(item.source_metadata)
                 item_metadata_tag = json.dumps(combined_metadata) if combined_metadata else encoded_metadata_tag

-                chunks = text_splitter.split_text(text)
+                chunks = chunk_text_for_ingestion(
+                    text,
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                    separator=separator,
+                )
                 docs = [
                     Document(
                         page_content=c,

src/backend/base/langflow/api/v1/memories.py

Lines changed: 38 additions & 26 deletions
@@ -28,7 +28,7 @@
 from fastapi_pagination import Page, Params
 from fastapi_pagination.ext.sqlmodel import apaginate
 from pydantic import BaseModel
-from sqlmodel import col, select
+from sqlmodel import select

 from langflow.api.utils import CurrentActiveUser
 from langflow.services.database.models.memory_base.model import (

@@ -38,9 +38,9 @@
     MemoryBaseSessionRead,
     MemoryBaseUpdate,
 )
-from langflow.services.database.models.message.model import MessageTable
 from langflow.services.deps import get_memory_base_service, session_scope
 from langflow.services.jobs import DuplicateJobError
+from langflow.services.memory_base.service import PreprocessingValidationError

 router = APIRouter(tags=["Memories"], prefix="/memories", include_in_schema=False)

@@ -105,6 +105,8 @@ async def create_memory_base(
     except PermissionError as exc:
         # Flow not found or belongs to another user — return 404 to avoid info-leak
         raise HTTPException(status_code=404, detail=str(exc)) from exc
+    except PreprocessingValidationError as exc:
+        raise HTTPException(status_code=422, detail=str(exc)) from exc
     except ValueError as exc:
         raise HTTPException(status_code=409, detail=str(exc)) from exc
     return MemoryBaseRead.model_validate(mb)

@@ -193,37 +195,44 @@ async def list_session_messages(

     Returns 404 if the Memory Base does not belong to the current user.
     """
-    from sqlalchemy import and_
-
-    from langflow.services.database.models.memory_base.model import MessageIngestionRecord
-
+    service = get_memory_base_service()
    async with session_scope() as db:
         mb_stmt = select(MemoryBase).where(MemoryBase.id == memory_base_id).where(MemoryBase.user_id == current_user.id)
         result = await db.exec(mb_stmt)
-        if result.first() is None:
+        mb = result.first()
+        if mb is None:
             raise HTTPException(status_code=404, detail="Memory base not found")

-        # INNER JOIN — only messages that were actually ingested into this MB/session pair.
-        # No extra WHERE filters needed:
-        #   - mir.session_id == session_id in the JOIN guarantees msg.session_id == session_id
-        #     (session_id is denormalized from the message at ingestion time — immutable).
-        #   - flow_id is implicitly correct: ingestion only ever touches messages from mb.flow_id,
-        #     and MB ownership is already verified above.
-        msg_stmt = (
-            select(MessageTable, MessageIngestionRecord)
-            .join(
-                MessageIngestionRecord,
-                and_(
-                    MessageIngestionRecord.message_id == MessageTable.id,
-                    MessageIngestionRecord.memory_base_id == memory_base_id,
-                    MessageIngestionRecord.session_id == session_id,
-                ),
+        if mb.preprocessing:
+            # Preprocessing MBs: the KB holds LLM-distilled output, so the
+            # surface for "what's in the KB" is MemoryBasePreprocessingOutput,
+            # not MessageTable. Project the row into the same response shape
+            # so the API contract is identical from the frontend's perspective.
+            stmt = service.session_preprocessed_outputs_stmt(memory_base_id, session_id)
+            return await apaginate(
+                db,
+                stmt,
+                params=params,
+                transformer=lambda rows: [
+                    MessageReadResponse(
+                        id=row.id,
+                        timestamp=row.created_at,
+                        sender="Machine",
+                        sender_name="Preprocessor",
+                        session_id=row.session_id,
+                        text=row.output_text or "",
+                        content_blocks=[],
+                        job_id=row.job_id,
+                        ingested_at=row.created_at,
+                    )
+                    for row in rows
+                ],
             )
-            .order_by(col(MessageTable.timestamp).asc())
-        )
+
+        stmt = service.session_raw_messages_stmt(memory_base_id, session_id)
         return await apaginate(
             db,
-            msg_stmt,
+            stmt,
             params=params,
             transformer=lambda rows: [
                 MessageReadResponse(

@@ -253,7 +262,10 @@ async def update_memory_base(
     Threshold changes only take effect at the next auto-capture trigger.
     Any already-running ingestion task continues with its original arguments.
     """
-    mb = await get_memory_base_service().update(memory_base_id, user_id=current_user.id, patch=patch)
+    try:
+        mb = await get_memory_base_service().update(memory_base_id, user_id=current_user.id, patch=patch)
+    except PreprocessingValidationError as exc:
+        raise HTTPException(status_code=422, detail=str(exc)) from exc
     if mb is None:
         raise HTTPException(status_code=404, detail="Memory base not found")
     return MemoryBaseRead.model_validate(mb)
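Both statement builders used above live in langflow/services/memory_base/service.py, which this commit view does not include. Judging from the fields the transformer reads (id, created_at, session_id, output_text, job_id) and the ix_mbpo_listing index added by the migration, the preprocessed-output statement plausibly reduces to the sketch below; treat it as an assumption, not the service's actual code:

from uuid import UUID

from sqlmodel import select

from langflow.services.database.models.memory_base.model import MemoryBasePreprocessingOutput


def session_preprocessed_outputs_stmt(memory_base_id: UUID, session_id: str):
    # Assumed shape: every batch row for this MB/session pair in creation
    # order, matching ix_mbpo_listing (memory_base_id, session_id, created_at).
    return (
        select(MemoryBasePreprocessingOutput)
        .where(MemoryBasePreprocessingOutput.memory_base_id == memory_base_id)
        .where(MemoryBasePreprocessingOutput.session_id == session_id)
        .order_by(MemoryBasePreprocessingOutput.created_at)
    )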

src/backend/base/langflow/services/database/models/memory_base/model.py

Lines changed: 74 additions & 3 deletions
@@ -3,7 +3,7 @@

 import sqlalchemy as sa
 from pydantic import model_validator
-from sqlalchemy import Column, DateTime, ForeignKey, Index, UniqueConstraint
+from sqlalchemy import JSON, Column, DateTime, ForeignKey, Index, Text, UniqueConstraint
 from sqlmodel import Field, Relationship, SQLModel


@@ -13,11 +13,11 @@ class MemoryBaseBase(SQLModel):
     user_id: UUID = Field(index=True)
     threshold: int = Field(default=50)
     auto_capture: bool = Field(default=True)
-    # Preprocessing config — accepted in payload but logic deferred to future scope
     embedding_model: str = Field(default="")
     preprocessing: bool = Field(default=False)
     preproc_model: str | None = Field(default=None)
     preproc_instructions: str | None = Field(default=None)
+    preproc_kill_phrase: str | None = Field(default=None)


 class MemoryBase(MemoryBaseBase, table=True):  # type: ignore[call-arg]

@@ -42,10 +42,17 @@ class MemoryBaseCreate(MemoryBaseBase):
     user_id: UUID | None = None  # Derived from auth token in the endpoint; not required in request body

     @model_validator(mode="after")
-    def preproc_model_required_when_preprocessing(self) -> "MemoryBaseCreate":
+    def preprocessing_defaults(self) -> "MemoryBaseCreate":
         if self.preprocessing and not self.preproc_model:
             msg = "preproc_model is required when preprocessing is enabled"
             raise ValueError(msg)
+        # Default the kill phrase so callers that enable preprocessing without
+        # supplying one still get the deterministic gate. Imported lazily so the
+        # model module stays free of service-layer deps.
+        if self.preprocessing and not self.preproc_kill_phrase:
+            from langflow.services.memory_base.preprocessing import DEFAULT_KILL_PHRASE
+
+            self.preproc_kill_phrase = DEFAULT_KILL_PHRASE
         return self

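A hedged sketch of the renamed validator in action. Fields hidden from this diff are omitted (the create model likely requires more, such as a name or flow reference), and the model string "gpt-4o-mini" is purely illustrative:

from langflow.services.database.models.memory_base.model import MemoryBaseCreate
from langflow.services.memory_base.preprocessing import DEFAULT_KILL_PHRASE

# Enabling preprocessing without a model is rejected (surfaced as HTTP 422
# via PreprocessingValidationError in the endpoints above); enabling it
# without a kill phrase silently picks up the default sentinel.
mb = MemoryBaseCreate(preprocessing=True, preproc_model="gpt-4o-mini")
assert mb.preproc_kill_phrase == DEFAULT_KILL_PHRASE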
@@ -56,6 +63,7 @@ class MemoryBaseUpdate(SQLModel):
     preprocessing: bool | None = None
     preproc_model: str | None = None
     preproc_instructions: str | None = None
+    preproc_kill_phrase: str | None = None


 class MemoryBaseRead(MemoryBaseBase):

@@ -197,3 +205,66 @@ class MessageIngestionRecord(SQLModel, table=True):  # type: ignore[call-arg]
     # Denormalized from MessageTable.session_id — immutable, avoids JOIN on the hot query path
     session_id: str = Field(sa_column=Column(sa.String(), nullable=False))
     ingested_at: datetime = Field(sa_column=Column(DateTime(timezone=True), nullable=False))
+
+
+class MemoryBasePreprocessingOutput(SQLModel, table=True):  # type: ignore[call-arg]
+    """One row per preprocessing batch — captures the LLM-distilled output before KB write.
+
+    Status flow:
+    - ``processed`` — LLM produced output; Chroma write pending. Cursor NOT advanced.
+                      The next ingestion job for this session reuses this row and
+                      retries only the Chroma write (no LLM re-invocation).
+    - ``ingested`` — Chroma write confirmed; cursor advanced; visible in get-messages view.
+    - ``skipped`` — LLM emitted the kill phrase; no Chroma write, no output_text,
+                    but cursor advances so the same batch is not re-evaluated.
+    """
+
+    __tablename__ = "memory_base_preprocessing_output"
+    __table_args__ = (
+        Index(
+            "ix_mbpo_pending",
+            "memory_base_id",
+            "session_id",
+            "status",
+            "created_at",
+        ),
+        Index(
+            "ix_mbpo_listing",
+            "memory_base_id",
+            "session_id",
+            "created_at",
+        ),
+        Index("ix_mbpo_job_id", "job_id"),
+    )
+
+    id: UUID = Field(default_factory=uuid4, primary_key=True)
+    memory_base_id: UUID = Field(
+        sa_column=Column(
+            sa.Uuid(),
+            ForeignKey("memory_base.id", ondelete="CASCADE"),
+            nullable=False,
+        )
+    )
+    # Denormalized — immutable for the row's lifetime
+    session_id: str = Field(sa_column=Column(sa.String(), nullable=False))
+    job_id: UUID | None = Field(
+        default=None,
+        sa_column=Column(
+            sa.Uuid(),
+            ForeignKey("job.job_id", ondelete="SET NULL"),
+            nullable=True,
+        ),
+    )
+    status: str = Field(sa_column=Column(sa.String(), nullable=False))
+    output_text: str | None = Field(default=None, sa_column=Column(Text(), nullable=True))
+    # Canonical batch identity — JSON list of message UUIDs as strings.
+    source_message_ids: list = Field(default_factory=list, sa_column=Column(JSON(), nullable=False))
+    model_used: str = Field(sa_column=Column(sa.String(), nullable=False))
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc),
+        sa_column=Column(DateTime(timezone=True), nullable=False),
+    )
+    updated_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc),
+        sa_column=Column(DateTime(timezone=True), nullable=False),
+    )
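The status flow above implies a small resumable state machine around the two external effects. A self-contained sketch of that control flow, where call_llm and write_kb stand in for the real service calls (which are not part of this diff):

from collections.abc import Callable


def ingest_batch(
    pending_output: str | None,
    batch_text: str,
    kill_phrase: str,
    *,
    call_llm: Callable[[str], str],
    write_kb: Callable[[str], None],
) -> str:
    """Return the resulting status: "skipped", "processed", or "ingested"."""
    if pending_output is None:
        output = call_llm(batch_text)  # phase 1: at most one LLM call per batch
        if kill_phrase in output:
            return "skipped"  # cursor advances; nothing is written to the KB
        pending_output = output  # persisted as status="processed" before phase 2
    try:
        write_kb(pending_output)  # phase 2: the only step retried on resume
    except Exception:  # any KB failure leaves the row pending
        return "processed"  # next job reuses the row and retries the write only
    return "ingested"  # cursor advances; row shows up in the get-messages view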
