feat: sync upstream PRs MODSetter#893, MODSetter#894, MODSetter#605 with bug fixes

deptrai · claude · deptrai · commit ae2bf184a15c · 2026-04-15T05:13:08.000+07:00
PR MODSetter#893 — fix: BookStack + Obsidian missing from periodic scheduler - Add index_bookstack_pages_task and index_obsidian_vault_task to schedule_checker_task.py imports and task_map PR MODSetter#894 — feat: BookStack shelf exclusion filter - bookstack_connector.py: get_all_shelves(), build_book_to_shelf_map() returning dict[int, set[int]] (fixes book-in-multiple-shelves edge case); get_all_pages/get_pages_by_date_range build the shelf map once per call (fixes N+1) - search_source_connectors_routes.py: POST /bookstack/shelves endpoint - bookstack_indexer.py: read BOOKSTACK_EXCLUDED_SHELF_IDS from the connector config and pass it to get_pages_by_date_range - bookstack-connect-form.tsx + bookstack-config.tsx: shelf picker UI with loading guard to prevent race condition on rapid clicks - connectors-api.service.ts: listBookStackShelves() API method PR MODSetter#605 — fix: OLLAMA_BASE_URL support for external Ollama embeddings - config/__init__.py: add OLLAMA_BASE_URL env var; only inject base_url into embedding_kwargs when EMBEDDING_MODEL starts with "ollama://" (fixes original PR's bug: was unconditionally injected for all providers) - .env.example: document OLLAMA_BASE_URL with usage notes Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example
@@ -202,3 +202,7 @@ LANGSMITH_TRACING=true
 LANGSMITH_ENDPOINT=https://api.smith.langchain.com
 LANGSMITH_API_KEY=lsv2_pt_.....
 LANGSMITH_PROJECT=surfsense
+
+# Ollama Configuration (only used when EMBEDDING_MODEL starts with "ollama://")
+# Use host.docker.internal on Docker Desktop (Mac/Windows), or host IP on Linux
+# OLLAMA_BASE_URL=http://host.docker.internal:11434
diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
@@ -417,13 +417,17 @@ def is_cloud(cls) -> bool:
     # Azure OpenAI credentials from environment variables
     AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
     AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
+    # Ollama base URL — only applied when EMBEDDING_MODEL starts with "ollama://"
+    OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL")
 
-    # Pass Azure credentials to embeddings when using Azure OpenAI
+    # Pass provider-specific credentials to embeddings
     embedding_kwargs = {}
     if AZURE_OPENAI_ENDPOINT:
         embedding_kwargs["azure_endpoint"] = AZURE_OPENAI_ENDPOINT
     if AZURE_OPENAI_API_KEY:
         embedding_kwargs["azure_api_key"] = AZURE_OPENAI_API_KEY
+    if OLLAMA_BASE_URL and EMBEDDING_MODEL and EMBEDDING_MODEL.startswith("ollama://"):
+        embedding_kwargs["base_url"] = OLLAMA_BASE_URL
 
     embedding_model_instance = AutoEmbeddings.get_embeddings(
         EMBEDDING_MODEL,
diff --git a/surfsense_backend/app/connectors/bookstack_connector.py b/surfsense_backend/app/connectors/bookstack_connector.py
@@ -155,12 +155,81 @@ def make_api_request(
         except requests.exceptions.RequestException as e:
             raise Exception(f"BookStack API request failed: {e!s}") from e
 
-    def get_all_pages(self, count: int = 500) -> list[dict[str, Any]]:
+    def get_all_shelves(self, count: int = 500) -> list[dict[str, Any]]:
+        """
+        Fetch all shelves from BookStack with pagination.
+
+        Args:
+            count: Number of records per request (max 500)
+
+        Returns:
+            List of shelf objects
+
+        Raises:
+            Exception: If the request fails or the response has no "data" entry
+        """
+        all_shelves = []
+        offset = 0
+        page_size = min(count, 500)
+
+        while True:
+            params = {"count": page_size, "offset": offset}
+            result = self.make_api_request("shelves", params)
+
+            if not isinstance(result, dict) or "data" not in result:
+                raise Exception("Invalid response from BookStack API")
+
+            shelves = result["data"]
+            all_shelves.extend(shelves)
+
+            logger.info(f"Fetched {len(shelves)} shelves (offset: {offset})")
+            # An empty page cannot advance the offset, so stop rather than
+            # loop forever if "total" overstates what the API will return.
+            total = result.get("total", 0)
+            if not shelves or offset + len(shelves) >= total:
+                break
+            offset += len(shelves)
+
+        logger.info(f"Total shelves fetched: {len(all_shelves)}")
+        return all_shelves
+
+    def build_book_to_shelf_map(self) -> dict[int, set[int]]:
+        """
+        Map each book ID to the IDs of every shelf that lists it.
+
+        Retrieves the shelf list, then requests each shelf's detail record
+        and reads its "books" entries (one extra API call per shelf).
+        A set is kept per book because a book may sit on several shelves.
+
+        Returns:
+            Dict mapping book_id -> set of shelf_ids
+        """
+        shelves_by_book: dict[int, set[int]] = {}
+
+        for entry in self.get_all_shelves():
+            current_shelf = entry["id"]
+            detail = self.make_api_request(f"shelves/{current_shelf}")
+            # Only a dict response can carry a book listing; skip anything else
+            if not isinstance(detail, dict):
+                continue
+
+            for member in detail.get("books", []):
+                owners = shelves_by_book.setdefault(member["id"], set())
+                owners.add(current_shelf)
+
+        return shelves_by_book
+
+    def get_all_pages(
+        self,
+        count: int = 500,
+        excluded_shelf_ids: list[int] | None = None,
+    ) -> list[dict[str, Any]]:
         """
         Fetch all pages from BookStack with pagination.
 
         Args:
             count: Number of records per request (max 500)
+            excluded_shelf_ids: Optional list of shelf IDs whose pages should be excluded
 
         Returns:
             List of page objects
@@ -195,6 +264,18 @@ def get_all_pages(self, count: int = 500) -> list[dict[str, Any]]:
 
             offset += len(pages)
 
+        # Filter by excluded shelves if specified — build map once, outside the loop
+        if excluded_shelf_ids:
+            book_to_shelves = self.build_book_to_shelf_map()
+            excluded = set(excluded_shelf_ids)
+            all_pages = [
+                p
+                for p in all_pages
+                if not any(
+                    s in excluded for s in book_to_shelves.get(p.get("book_id"), set())
+                )
+            ]
+
         logger.info(f"Total pages fetched: {len(all_pages)}")
         return all_pages
 
@@ -268,6 +349,7 @@ def get_pages_by_date_range(
         start_date: str,
         end_date: str,
         count: int = 500,
+        excluded_shelf_ids: list[int] | None = None,
     ) -> tuple[list[dict[str, Any]], str | None]:
         """
         Fetch pages updated within a specific date range.
@@ -278,6 +360,7 @@ def get_pages_by_date_range(
             start_date: Start date in YYYY-MM-DD format
             end_date: End date in YYYY-MM-DD format (currently unused, for future use)
             count: Number of records per request (max 500)
+            excluded_shelf_ids: Optional list of shelf IDs whose pages should be excluded
 
         Returns:
             Tuple of (list of page objects, error message or None)
@@ -316,6 +399,19 @@ def get_pages_by_date_range(
 
                 offset += len(pages)
 
+            # Filter by excluded shelves if specified — build map once, outside the loop
+            if excluded_shelf_ids and all_pages:
+                book_to_shelves = self.build_book_to_shelf_map()
+                excluded = set(excluded_shelf_ids)
+                all_pages = [
+                    p
+                    for p in all_pages
+                    if not any(
+                        s in excluded
+                        for s in book_to_shelves.get(p.get("book_id"), set())
+                    )
+                ]
+
             if not all_pages:
                 return [], f"No pages found updated after {start_date}"
 
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -133,6 +133,60 @@ async def list_github_repositories(
         ) from e
 
 
+class BookStackCredentialsRequest(BaseModel):
+    """BookStack base URL and API token pair posted to /bookstack/shelves."""
+
+    base_url: str = Field(..., description="BookStack instance base URL")
+    token_id: str = Field(..., description="BookStack API Token ID")
+    token_secret: str = Field(..., description="BookStack API Token Secret")
+
+
+@router.post("/bookstack/shelves", response_model=list[dict[str, Any]])
+async def list_bookstack_shelves(
+    creds: BookStackCredentialsRequest,
+    user: User = Depends(current_active_user),
+):
+    """
+    Fetch all shelves (and the books on each) from a BookStack instance.
+    Used by the frontend to let users select which shelves to exclude from indexing.
+    """
+    import asyncio
+
+    def _collect_shelves() -> list[dict[str, Any]]:
+        from app.connectors.bookstack_connector import BookStackConnector
+
+        client = BookStackConnector(
+            base_url=creds.base_url,
+            token_id=creds.token_id,
+            token_secret=creds.token_secret,
+        )
+        summaries = []
+        for shelf in client.get_all_shelves():
+            detail = client.make_api_request(f"shelves/{shelf['id']}")
+            books = detail.get("books", []) if isinstance(detail, dict) else []
+            summary = {"id": shelf["id"], "name": shelf["name"]}
+            summary["book_count"] = len(books)
+            summary["books"] = [{"id": b["id"], "name": b["name"]} for b in books]
+            summaries.append(summary)
+        return summaries
+
+    try:
+        # BookStackConnector uses blocking `requests`; keep it off the event loop
+        return await asyncio.to_thread(_collect_shelves)
+    except ValueError as e:
+        logger.error(
+            f"BookStack credential validation failed for user {user.id}: {e!s}"
+        )
+        raise HTTPException(
+            status_code=400, detail=f"Invalid BookStack credentials: {e!s}"
+        ) from e
+    except Exception as e:
+        logger.error(f"Failed to fetch BookStack shelves for user {user.id}: {e!s}")
+        raise HTTPException(
+            status_code=500, detail=f"Failed to fetch BookStack shelves: {e!s}"
+        ) from e
+
+
 @router.post("/search-source-connectors", response_model=SearchSourceConnectorRead)
 async def create_search_source_connector(
     connector: SearchSourceConnectorCreate,
@@ -1106,8 +1160,12 @@ async def index_connector_content(
             )
             response_message = "Luma indexing started in the background."
 
-        elif connector.connector_type == SearchSourceConnectorType.DEXSCREENER_CONNECTOR:
-            from app.tasks.celery_tasks.connector_tasks import index_dexscreener_pairs_task
+        elif (
+            connector.connector_type == SearchSourceConnectorType.DEXSCREENER_CONNECTOR
+        ):
+            from app.tasks.celery_tasks.connector_tasks import (
+                index_dexscreener_pairs_task,
+            )
 
             logger.info(
                 f"Triggering DexScreener indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
@@ -54,6 +54,7 @@ async def _check_and_trigger_schedules():
             # Import all indexing tasks
             from app.tasks.celery_tasks.connector_tasks import (
                 index_airtable_records_task,
+                index_bookstack_pages_task,
                 index_clickup_tasks_task,
                 index_confluence_pages_task,
                 index_crawled_urls_task,
@@ -68,6 +69,7 @@ async def _check_and_trigger_schedules():
                 index_linear_issues_task,
                 index_luma_events_task,
                 index_notion_pages_task,
+                index_obsidian_vault_task,
                 index_slack_messages_task,
             )
 
@@ -88,6 +90,8 @@ async def _check_and_trigger_schedules():
                 SearchSourceConnectorType.DEXSCREENER_CONNECTOR: index_dexscreener_pairs_task,
                 SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
                 SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
+                SearchSourceConnectorType.BOOKSTACK_CONNECTOR: index_bookstack_pages_task,
+                SearchSourceConnectorType.OBSIDIAN_CONNECTOR: index_obsidian_vault_task,
                 SearchSourceConnectorType.GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
                 # Composio connector types (unified with native Google tasks)
                 SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: index_google_drive_files_task,
diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py
@@ -104,6 +104,9 @@ async def index_bookstack_pages(
         bookstack_token_id = connector.config.get("BOOKSTACK_TOKEN_ID")
         bookstack_token_secret = connector.config.get("BOOKSTACK_TOKEN_SECRET")
 
+        # Optional: shelf IDs to exclude from indexing
+        excluded_shelf_ids = connector.config.get("BOOKSTACK_EXCLUDED_SHELF_IDS", [])
+
         if (
             not bookstack_base_url
             or not bookstack_token_id
@@ -148,7 +151,9 @@ async def index_bookstack_pages(
         # Get pages within date range
         try:
             pages, error = bookstack_client.get_pages_by_date_range(
-                start_date=start_date_str, end_date=end_date_str
+                start_date=start_date_str,
+                end_date=end_date_str,
+                excluded_shelf_ids=excluded_shelf_ids,
             )
 
             if error:
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/bookstack-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/bookstack-connect-form.tsx
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/bookstack-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/bookstack-config.tsx
diff --git a/surfsense_web/lib/apis/connectors-api.service.ts b/surfsense_web/lib/apis/connectors-api.service.ts