From d618716719374bffa27e1ad356a3b3ed3081454c Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Wed, 9 Oct 2024 18:09:40 +0100 Subject: [PATCH 01/48] Maintain the current v1 file --- .../processes/connectors/notion/__init__.py | 0 .../v2/processes/connectors/notion/client.py | 248 ++++++++ .../processes/connectors/notion/connector.py | 336 ++++++++++ .../v2/processes/connectors/notion/helpers.py | 584 ++++++++++++++++++ .../processes/connectors/notion/interfaces.py | 32 + .../connectors/notion/types/__init__.py | 0 .../connectors/notion/types/block.py | 95 +++ .../notion/types/blocks/__init__.py | 63 ++ .../notion/types/blocks/bookmark.py | 40 ++ .../notion/types/blocks/breadcrumb.py | 21 + .../notion/types/blocks/bulleted_list_item.py | 31 + .../connectors/notion/types/blocks/callout.py | 94 +++ .../notion/types/blocks/child_database.py | 23 + .../notion/types/blocks/child_page.py | 23 + .../connectors/notion/types/blocks/code.py | 43 ++ .../notion/types/blocks/column_list.py | 35 ++ .../connectors/notion/types/blocks/divider.py | 22 + .../connectors/notion/types/blocks/embed.py | 36 ++ .../notion/types/blocks/equation.py | 23 + .../connectors/notion/types/blocks/file.py | 49 ++ .../connectors/notion/types/blocks/heading.py | 37 ++ .../connectors/notion/types/blocks/image.py | 21 + .../notion/types/blocks/link_preview.py | 24 + .../notion/types/blocks/link_to_page.py | 29 + .../notion/types/blocks/numbered_list.py | 29 + .../notion/types/blocks/paragraph.py | 31 + .../connectors/notion/types/blocks/pdf.py | 49 ++ .../connectors/notion/types/blocks/quote.py | 37 ++ .../notion/types/blocks/synced_block.py | 57 ++ .../connectors/notion/types/blocks/table.py | 63 ++ .../notion/types/blocks/table_of_contents.py | 23 + .../notion/types/blocks/template.py | 30 + .../connectors/notion/types/blocks/todo.py | 42 ++ .../connectors/notion/types/blocks/toggle.py | 37 ++ .../notion/types/blocks/unsupported.py | 20 + .../connectors/notion/types/blocks/video.py | 22 + 
.../connectors/notion/types/database.py | 72 +++ .../types/database_properties/__init__.py | 106 ++++ .../types/database_properties/checkbox.py | 38 ++ .../types/database_properties/created_by.py | 35 ++ .../types/database_properties/created_time.py | 34 + .../notion/types/database_properties/date.py | 41 ++ .../notion/types/database_properties/email.py | 36 ++ .../notion/types/database_properties/files.py | 37 ++ .../types/database_properties/formula.py | 49 ++ .../database_properties/last_edited_by.py | 34 + .../database_properties/last_edited_time.py | 34 + .../types/database_properties/multiselect.py | 73 +++ .../types/database_properties/number.py | 49 ++ .../types/database_properties/people.py | 40 ++ .../types/database_properties/phone_number.py | 36 ++ .../types/database_properties/relation.py | 67 ++ .../types/database_properties/rich_text.py | 43 ++ .../types/database_properties/rollup.py | 56 ++ .../types/database_properties/select.py | 68 ++ .../types/database_properties/status.py | 80 +++ .../notion/types/database_properties/title.py | 37 ++ .../types/database_properties/unique_id.py | 50 ++ .../notion/types/database_properties/url.py | 37 ++ .../types/database_properties/verification.py | 78 +++ .../processes/connectors/notion/types/date.py | 26 + .../processes/connectors/notion/types/file.py | 51 ++ .../processes/connectors/notion/types/page.py | 44 ++ .../connectors/notion/types/parent.py | 66 ++ .../connectors/notion/types/rich_text.py | 189 ++++++ .../processes/connectors/notion/types/user.py | 76 +++ 66 files changed, 4031 insertions(+) create mode 100644 unstructured_ingest/v2/processes/connectors/notion/__init__.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/client.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/connector.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/helpers.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/interfaces.py create 
mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/__init__.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/block.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py create 
mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py 
create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/date.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/file.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/page.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/parent.py create mode 100644 unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py create mode 100644 
unstructured_ingest/v2/processes/connectors/notion/types/user.py diff --git a/unstructured_ingest/v2/processes/connectors/notion/__init__.py b/unstructured_ingest/v2/processes/connectors/notion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py new file mode 100644 index 000000000..b0bc22a8a --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -0,0 +1,248 @@ +from typing import Any, Generator, List, Optional, Tuple + +import notion_client.errors +from notion_client import Client as NotionClient +from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint +from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint +from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint +from notion_client.api_endpoints import Endpoint +from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint +from notion_client.errors import RequestTimeoutError + +from unstructured_ingest.connector.notion.types.block import Block +from unstructured_ingest.connector.notion.types.database import Database +from unstructured_ingest.connector.notion.types.database_properties import ( + map_cells, +) +from unstructured_ingest.connector.notion.types.page import Page +from unstructured_ingest.ingest_backoff import RetryHandler +from unstructured_ingest.interfaces import RetryStrategyConfig +from unstructured_ingest.utils.dep_check import requires_dependencies + + +@requires_dependencies(["httpx"], extras="notion") +def _get_retry_strategy( + endpoint: Endpoint, retry_strategy_config: RetryStrategyConfig +) -> RetryHandler: + import backoff + import httpx + + retryable_exceptions = ( + httpx.TimeoutException, + httpx.HTTPStatusError, + notion_client.errors.HTTPResponseError, + ) + + return RetryHandler( + backoff.expo, + 
retryable_exceptions, + max_time=retry_strategy_config.max_retry_time, + max_tries=retry_strategy_config.max_retries, + logger=endpoint.parent.logger, + start_log_level=endpoint.parent.logger.level, + backoff_log_level=endpoint.parent.logger.level, + ) + + +def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]: + if retry_strategy_config := getattr(endpoint, "retry_strategy_config"): + return _get_retry_strategy(endpoint=endpoint, retry_strategy_config=retry_strategy_config) + return None + + +class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint): + def __init__( + self, + *args, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.retry_strategy_config = retry_strategy_config + + @property + def retry_handler(self) -> Optional[RetryHandler]: + return get_retry_handler(self) + + def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]: + resp: dict = ( + self.retry_handler(super().list, block_id=block_id, **kwargs) + if self.retry_handler + else super().list(block_id=block_id, **kwargs) + ) # type: ignore + child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])] + return child_blocks, resp + + def iterate_list( + self, + block_id: str, + **kwargs: Any, + ) -> Generator[List[Block], None, None]: + while True: + response: dict = ( + self.retry_handler(super().list, block_id=block_id, **kwargs) + if self.retry_handler + else super().list(block_id=block_id, **kwargs) + ) # type: ignore + child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])] + yield child_blocks + + next_cursor = response.get("next_cursor") + if not response.get("has_more") or not next_cursor: + return + + +class DatabasesEndpoint(NotionDatabasesEndpoint): + def __init__( + self, + *args, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.retry_strategy_config = retry_strategy_config + + 
@property + def retry_handler(self) -> Optional[RetryHandler]: + return get_retry_handler(self) + + def retrieve(self, database_id: str, **kwargs: Any) -> Database: + resp: dict = ( + self.retry_handler(super().retrieve, database_id=database_id, **kwargs) + if (self.retry_handler) + else (super().retrieve(database_id=database_id, **kwargs)) + ) # type: ignore + return Database.from_dict(data=resp) + + @requires_dependencies(["httpx"], extras="notion") + def retrieve_status(self, database_id: str, **kwargs) -> int: + import httpx + + request = self.parent._build_request( + method="HEAD", + path=f"databases/{database_id}", + auth=kwargs.get("auth"), + ) + try: + response: httpx.Response = ( + self.retry_handler(self.parent.client.send, request) + if (self.retry_handler) + else (self.parent.client.send(request)) + ) # type: ignore + return response.status_code + except httpx.TimeoutException: + raise RequestTimeoutError() + + def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]: + """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database. 
+ + *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)* + """ # noqa: E501 + resp: dict = ( + self.retry_handler(super().query, database_id=database_id, **kwargs) + if (self.retry_handler) + else (super().query(database_id=database_id, **kwargs)) + ) # type: ignore + pages = [Page.from_dict(data=p) for p in resp.pop("results")] + for p in pages: + p.properties = map_cells(p.properties) + return pages, resp + + def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]: + while True: + response: dict = ( + self.retry_handler(super().query, database_id=database_id, **kwargs) + if (self.retry_handler) + else (super().query(database_id=database_id, **kwargs)) + ) # type: ignore + pages = [Page.from_dict(data=p) for p in response.pop("results", [])] + for p in pages: + p.properties = map_cells(p.properties) + yield pages + + next_cursor = response.get("next_cursor") + if not response.get("has_more") or not next_cursor: + return + + +class BlocksEndpoint(NotionBlocksEndpoint): + def __init__( + self, + *args: Any, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs: Any, + ) -> None: + super().__init__(*args, **kwargs) + self.retry_strategy_config = retry_strategy_config + self.children = BlocksChildrenEndpoint( + retry_strategy_config=retry_strategy_config, + *args, + **kwargs, + ) + + @property + def retry_handler(self) -> Optional[RetryHandler]: + return get_retry_handler(self) + + def retrieve(self, block_id: str, **kwargs: Any) -> Block: + resp: dict = ( + self.retry_handler(super().retrieve, block_id=block_id, **kwargs) + if (self.retry_handler) + else (super().retrieve(block_id=block_id, **kwargs)) + ) # type: ignore + return Block.from_dict(data=resp) + + +class PagesEndpoint(NotionPagesEndpoint): + def __init__( + self, + *args, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs, + ): + super().__init__(*args, **kwargs) + 
self.retry_strategy_config = retry_strategy_config + + @property + def retry_handler(self) -> Optional[RetryHandler]: + return get_retry_handler(self) + + def retrieve(self, page_id: str, **kwargs: Any) -> Page: + resp: dict = ( + self.retry_handler(super().retrieve, page_id=page_id, **kwargs) + if (self.retry_handler) + else (super().retrieve(page_id=page_id, **kwargs)) + ) # type: ignore + return Page.from_dict(data=resp) + + @requires_dependencies(["httpx"], extras="notion") + def retrieve_status(self, page_id: str, **kwargs) -> int: + import httpx + + request = self.parent._build_request( + method="HEAD", + path=f"pages/{page_id}", + auth=kwargs.get("auth"), + ) + try: + response: httpx.Response = ( + self.retry_handler(self.parent.client.send, request) + if (self.retry_handler) + else (self.parent.client.send(request)) + ) # type: ignore + return response.status_code + except httpx.TimeoutException: + raise RequestTimeoutError() + + +class Client(NotionClient): + def __init__( + self, + *args: Any, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs: Any, + ) -> None: + super().__init__(*args, **kwargs) + self.blocks = BlocksEndpoint(retry_strategy_config=retry_strategy_config, parent=self) + self.pages = PagesEndpoint(retry_strategy_config=retry_strategy_config, parent=self) + self.databases = DatabasesEndpoint(retry_strategy_config=retry_strategy_config, parent=self) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py new file mode 100644 index 000000000..6beea2f5e --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -0,0 +1,336 @@ +import json +import logging +from dataclasses import dataclass, field +from time import time +from typing import TYPE_CHECKING, Any, Generator, List, Optional, Set, Tuple + +from pydantic import BaseModel, Field, SecretStr + +from unstructured_ingest.error import SourceConnectionError 
+from unstructured_ingest.utils.dep_check import requires_dependencies +from unstructured_ingest.v2.interfaces import ( + ConnectionConfig, + Downloader, + DownloaderConfig, + DownloadResponse, + FileData, + FileDataSourceMetadata, + Indexer, + IndexerConfig, + SourceIdentifiers, +) +from unstructured_ingest.v2.logger import logger +from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry + +if TYPE_CHECKING: + from unstructured_ingest.connector.notion.client import Client as NotionClient + +NOTION_API_VERSION = "2022-06-28" +CONNECTOR_TYPE = "notion" + + +class NotionConnectionConfig(ConnectionConfig): + notion_api_key: SecretStr = Field(description="Notion API key") + + +class NotionIndexerConfig(IndexerConfig): + page_ids: Optional[List[str]] = Field( + default=None, description="List of Notion page IDs to process" + ) + + database_ids: Optional[List[str]] = Field( + default=None, description="List of Notion database IDs to process" + ) + recursive: bool = Field( + default=False, description="Recursively process child pages and databases" + ) + + +@dataclass +class NotionIndexer(Indexer): + connection_config: NotionConnectionConfig + indexer_config: NotionIndexerConfig + + @requires_dependencies(["notion_client"], extras="notion") + def get_client(self) -> "NotionClient": + from unstructured_ingest.connector.notion.client import Client as NotionClient + + return NotionClient( + notion_version=NOTION_API_VERSION, + auth=self.connection_config.notion_api_key.get_secret_value(), + logger=logger, + log_level=logger.level, + ) + + def precheck(self) -> None: + """Check the connection to the Notion API.""" + try: + client = self.get_client() + # Perform a simple request to verify connection + request = client._build_request("HEAD", "users") + response = client.client.send(request) + response.raise_for_status() + + except Exception as e: + logger.error(f"Failed to validate connection: {e}", exc_info=True) + raise 
SourceConnectionError(f"Failed to validate connection: {e}") + + def run(self, **kwargs: Any) -> Generator[FileData, None, None]: + client = self.get_client() + processed_pages: Set[str] = set() + processed_databases: Set[str] = set() + + pages_to_process: Set[str] = set(self.indexer_config.page_ids or []) + databases_to_process: Set[str] = set(self.indexer_config.database_ids or []) + + while pages_to_process or databases_to_process: + # Process pages + for page_id in list(pages_to_process): + if page_id in processed_pages: + continue + + processed_pages.add(page_id) + pages_to_process.remove(page_id) + file_data = self.get_page_file_data(page_id=page_id, client=client) + if file_data: + yield file_data + + if self.indexer_config.recursive: + child_pages, child_databases = self.get_child_pages_and_databases( + page_id=page_id, + client=client, + processed_pages=processed_pages, + processed_databases=processed_databases, + ) + pages_to_process.update(child_pages) + databases_to_process.update(child_databases) + + # Process databases + for database_id in list(databases_to_process): + if database_id in processed_databases: + continue + processed_databases.add(database_id) + databases_to_process.remove(database_id) + file_data = self.get_database_file_data( + database_id=database_id, client=client + ) + if file_data: + yield file_data + if self.indexer_config.recursive: + ( + child_pages, + child_databases, + ) = self.get_child_pages_and_databases_from_database( + database_id=database_id, + client=client, + processed_pages=processed_pages, + processed_databases=processed_databases, + ) + pages_to_process.update(child_pages) + databases_to_process.update(child_databases) + + @requires_dependencies(["notion_client"], extras="notion") + def get_page_file_data( + self, page_id: str, client: "NotionClient" + ) -> Optional[FileData]: + try: + page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore + date_created = page_metadata.get("created_time") + 
date_modified = page_metadata.get("last_edited_time") + identifier = page_id + source_identifiers = SourceIdentifiers( + filename=f"{page_id}.html", + fullpath=page_id, + rel_path=page_id, + ) + metadata = FileDataSourceMetadata( + date_created=date_created, + date_modified=date_modified, + record_locator={"page_id": page_id}, + date_processed=str(time()), + ) + additional_metadata = page_metadata + return FileData( + identifier=identifier, + connector_type=CONNECTOR_TYPE, + source_identifiers=source_identifiers, + metadata=metadata, + additional_metadata=additional_metadata, + ) + except Exception as e: + logger.error(f"Error retrieving page {page_id}: {e}") + return None + + @requires_dependencies(["notion_client"], extras="notion") + def get_database_file_data( + self, database_id: str, client: "NotionClient" + ) -> Optional[FileData]: + try: + database_metadata = client.databases.retrieve( + database_id=database_id + ) # type: ignore + date_created = database_metadata.get("created_time") + date_modified = database_metadata.get("last_edited_time") + identifier = database_id + source_identifiers = SourceIdentifiers( + filename=f"{database_id}.html", + fullpath=database_id, + rel_path=database_id, + ) + metadata = FileDataSourceMetadata( + date_created=date_created, + date_modified=date_modified, + record_locator={"database_id": database_id}, + date_processed=str(time()), + ) + additional_metadata = database_metadata + return FileData( + identifier=identifier, + connector_type=CONNECTOR_TYPE, + source_identifiers=source_identifiers, + metadata=metadata, + additional_metadata=additional_metadata, + ) + except Exception as e: + logger.error(f"Error retrieving database {database_id}: {e}") + return None + + def get_child_pages_and_databases( + self, + page_id: str, + client: "NotionClient", + processed_pages: Set[str], + processed_databases: Set[str], + ) -> Tuple[Set[str], Set[str]]: + from unstructured_ingest.connector.notion.helpers import ( + 
get_recursive_content_from_page, + ) + + child_content = get_recursive_content_from_page( + client=client, + page_id=page_id, + logger=logger, + ) + child_pages = set(child_content.child_pages) - processed_pages + child_databases = set(child_content.child_databases) - processed_databases + return child_pages, child_databases + + def get_child_pages_and_databases_from_database( + self, + database_id: str, + client: "NotionClient", + processed_pages: Set[str], + processed_databases: Set[str], + ) -> Tuple[Set[str], Set[str]]: + from unstructured_ingest.connector.notion.helpers import ( + get_recursive_content_from_database, + ) + + child_content = get_recursive_content_from_database( + client=client, + database_id=database_id, + logger=logger, + ) + child_pages = set(child_content.child_pages) - processed_pages + child_databases = set(child_content.child_databases) - processed_databases + return child_pages, child_databases + + +@dataclass +class NotionDownloaderConfig(DownloaderConfig): + pass + + +@dataclass +class NotionDownloader(Downloader): + connection_config: NotionConnectionConfig + download_config: NotionDownloaderConfig + connector_type: str = CONNECTOR_TYPE + + @requires_dependencies(["notion_client"], extras="notion") + def get_client(self) -> "NotionClient": + from unstructured_ingest.connector.notion.client import Client as NotionClient + + return NotionClient( + notion_version=NOTION_API_VERSION, + auth=self.connection_config.notion_api_key.get_secret_value(), + logger=logger, + log_level=logger.level, + ) + + def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: + client = self.get_client() + record_locator = file_data.metadata.record_locator + + if "page_id" in record_locator: + return self.download_page( + client=client, + page_id=record_locator["page_id"], + file_data=file_data, + ) + elif "database_id" in record_locator: + return self.download_database( + client=client, + database_id=record_locator["database_id"], + 
file_data=file_data, + ) + else: + raise ValueError("Invalid record_locator in file_data") + + def download_page(self, client: "NotionClient", page_id: str, file_data: FileData) -> DownloadResponse: + from unstructured_ingest.connector.notion.helpers import extract_page_html + + try: + text_extraction = extract_page_html( + client=client, + page_id=page_id, + logger=logger, + ) + if text_extraction.html: + download_path = self.get_download_path(file_data=file_data) + download_path.parent.mkdir(parents=True, exist_ok=True) + with download_path.open("w") as page_file: + page_file.write(text_extraction.html.render(pretty=True)) + return self.generate_download_response( + file_data=file_data, download_path=download_path + ) + else: + logger.error(f"No HTML content for page {page_id}") + return None + except Exception as e: + logger.error(f"Error downloading page {page_id}: {e}") + return None + + def download_database(self, client: "NotionClient", database_id: str, file_data: FileData) -> DownloadResponse: + from unstructured_ingest.connector.notion.helpers import extract_database_html + + try: + text_extraction = extract_database_html( + client=client, + database_id=database_id, + logger=logger, + ) + if text_extraction.html: + download_path = self.get_download_path(file_data=file_data) + download_path.parent.mkdir(parents=True, exist_ok=True) + with download_path.open("w") as database_file: + database_file.write(text_extraction.html.render(pretty=True)) + return self.generate_download_response( + file_data=file_data, download_path=download_path + ) + else: + logger.error(f"No HTML content for database {database_id}") + return None + except Exception as e: + logger.error(f"Error downloading database {database_id}: {e}") + return None + + +notion_source_entry = SourceRegistryEntry( + connector_type=CONNECTOR_TYPE, + connection_config=NotionConnectionConfig, + indexer_config=NotionIndexerConfig, + indexer=NotionIndexer, + downloader_config=NotionDownloaderConfig, + 
def extract_page_html(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> HtmlExtractionResponse:
    """Render a Notion page as an HTML document and collect its child references.

    The block retrieved for ``page_id`` is the root of a breadth-first walk.
    Child pages (other than the root itself) and child databases are recorded
    by id instead of being expanded. Tables, column lists and list items are
    rendered through their dedicated builders; any other block that can have
    children gets its children enqueued for processing.

    Args:
        client: Notion client used to retrieve blocks and their children.
        page_id: Id of the page to render; must parse as a UUID.
        logger: Logger receiving debug output about the traversal.

    Returns:
        An ``HtmlExtractionResponse`` holding the ``<html>`` tag plus the ids
        of the child pages and child databases found along the way.

    Raises:
        ValueError: If ``page_id`` is not a valid UUID string.
    """
    page_id_uuid = UUID(page_id)
    html_elements: List[Tuple[BlockBase, HtmlTag]] = []
    parent_block: Block = client.blocks.retrieve(block_id=page_id)  # type: ignore
    head = None
    if isinstance(parent_block.block, notion_blocks.ChildPage):
        head = Head([], Title([], parent_block.block.title))
    child_pages: List[str] = []
    child_databases: List[str] = []
    parents: List[Tuple[int, Block]] = [(0, parent_block)]
    # Ids of every block that has been enqueued. The original code stored the
    # Block objects themselves and compared them against ``child.id`` strings,
    # so the membership test could never match and nothing was de-duplicated.
    processed_block_ids = {parent_block.id}
    while parents:
        level, parent = parents.pop(0)
        parent_html = parent.get_html()
        if parent_html:
            html_elements.append((parent.block, parent_html))
        logger.debug(f"processing block: {parent}")
        if isinstance(parent.block, notion_blocks.ChildPage) and parent.id != str(page_id_uuid):
            child_pages.append(parent.id)
            continue
        if isinstance(parent.block, notion_blocks.ChildDatabase):
            child_databases.append(parent.id)
            continue
        if isinstance(parent.block, notion_blocks.Table):
            table_response = build_table(client=client, table=parent)
            html_elements.append((parent.block, table_response.table_html))
            child_pages.extend(table_response.child_pages)
            child_databases.extend(table_response.child_databases)
            continue
        if isinstance(parent.block, notion_blocks.ColumnList):
            column_html = build_columned_list(client=client, column_parent=parent)
            html_elements.append((parent.block, column_html))
            continue
        if isinstance(parent.block, notion_blocks.BulletedListItem):
            bullet_list_resp = build_bulleted_list_children(
                client=client,
                bulleted_list_item_parent=parent,
            )
            if bullet_list_children := bullet_list_resp.child_list:
                html_elements.append((parent.block, bullet_list_children))
            continue
        if isinstance(parent.block, notion_blocks.NumberedListItem):
            numbered_list_resp = build_numbered_list_children(
                client=client,
                numbered_list_item_parent=parent,
            )
            if numbered_list_children := numbered_list_resp.child_list:
                html_elements.append((parent.block, numbered_list_children))
            continue
        if parent.block.can_have_children() and parent.has_children:
            children = []
            for children_block in client.blocks.children.iterate_list(  # type: ignore
                block_id=parent.id,
            ):
                children.extend(children_block)
            if children:
                logger.debug(f"adding {len(children)} children from parent: {parent}")
            for child in children:
                if child.id not in processed_block_ids:
                    processed_block_ids.add(child.id)
                    parents.append((level + 1, child))

    # Join consecutive list items into a single <ul>/<ol>. A pending run is
    # flushed whenever the kind of element changes AND once more after the
    # loop, so a list that ends the page is no longer dropped and bulleted /
    # numbered runs keep their document order.
    joined_html_elements = []
    pending_items: List[HtmlTag] = []
    pending_tag = None
    for block, html in html_elements:
        if isinstance(block, notion_blocks.BulletedListItem):
            list_tag = Ul
        elif isinstance(block, notion_blocks.NumberedListItem):
            list_tag = Ol
        else:
            list_tag = None
        if pending_items and list_tag is not pending_tag:
            joined_html_elements.append(pending_tag([], pending_items))
            pending_items = []
        if list_tag is None:
            joined_html_elements.append(html)
        else:
            pending_tag = list_tag
            pending_items.append(html)
    if pending_items:
        joined_html_elements.append(pending_tag([], pending_items))

    body = Body([], joined_html_elements)
    all_elements = [body]
    if head:
        all_elements = [head] + all_elements
    full_html = Html([], all_elements)
    return HtmlExtractionResponse(
        full_html,
        child_pages=child_pages,
        child_databases=child_databases,
    )
def get_recursive_content_from_page(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Collect every page and database reachable from the page ``page_id``.

    Thin wrapper that seeds ``get_recursive_content`` with a PAGE queue entry.

    Raises:
        ValueError: If ``page_id`` is not a valid UUID string.
    """
    root_entry = QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id))
    return get_recursive_content(client=client, init_entry=root_entry, logger=logger)
child pages + child_pages_from_page = [ + c for c in page_children if isinstance(c.block, notion_blocks.ChildPage) + ] + if child_pages_from_page: + child_page_blocks: List[notion_blocks.ChildPage] = [ + p.block + for p in child_pages_from_page + if isinstance(p.block, notion_blocks.ChildPage) + ] + logger.debug( + "found child pages from parent page {}: {}".format( + parent.id, + ", ".join([block.title for block in child_page_blocks]), + ), + ) + new_pages = [p.id for p in child_pages_from_page if p.id not in processed] + new_pages = list(set(new_pages)) + child_pages.extend(new_pages) + parents.extend( + [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], + ) + + # Extract child databases + child_dbs_from_page = [ + c for c in page_children if isinstance(c.block, notion_blocks.ChildDatabase) + ] + if child_dbs_from_page: + child_db_blocks: List[notion_blocks.ChildDatabase] = [ + c.block + for c in page_children + if isinstance(c.block, notion_blocks.ChildDatabase) + ] + logger.debug( + "found child database from parent page {}: {}".format( + parent.id, + ", ".join([block.title for block in child_db_blocks]), + ), + ) + new_dbs = [db.id for db in child_dbs_from_page if db.id not in processed] + new_dbs = list(set(new_dbs)) + child_dbs.extend(new_dbs) + parents.extend( + [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], + ) + + linked_to_others: List[notion_blocks.LinkToPage] = [ + c.block for c in page_children if isinstance(c.block, notion_blocks.LinkToPage) + ] + for link in linked_to_others: + if (page_id := link.page_id) and ( + page_id not in processed and page_id not in child_pages + ): + child_pages.append(page_id) + parents.append(QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id))) + if (database_id := link.database_id) and ( + database_id not in processed and database_id not in child_dbs + ): + child_dbs.append(database_id) + parents.append( + QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id)), + ) 
def is_valid_uuid(uuid_str: str) -> bool:
    """Return True when ``uuid_str`` can be parsed by ``uuid.UUID``.

    Only the errors ``UUID()`` raises for unusable input are treated as
    "not a UUID": ValueError for malformed strings, TypeError for ``None``
    and AttributeError for non-string values. Anything else propagates
    instead of being silently swallowed.
    """
    try:
        UUID(uuid_str)
    except (ValueError, TypeError, AttributeError):
        return False
    return True
def is_database_url(client: Client, url: str):
    """Tell whether ``url`` addresses a Notion database visible to ``client``.

    The URL qualifies only when its host is ``www.notion.so``, the last path
    segment yields a UUID via ``get_uuid_from_url`` and the client's database
    status lookup for that UUID returns 200.
    """
    parsed = urlparse(url)
    last_segment = parsed.path.split("/")[-1]
    if parsed.netloc != "www.notion.so":
        return False
    database_uuid = get_uuid_from_url(path=last_segment)
    if not database_uuid:
        return False
    status = client.databases.retrieve_status(database_id=database_uuid)
    return status == 200
def build_columned_list(client: Client, column_parent: Block) -> HtmlTag:
    """Render a column-list block as side-by-side, left-floated ``<div>`` elements.

    Each chunk of children returned for a column becomes one ``<div>`` whose
    width is an equal share of 100% across the columns found.

    Raises:
        ValueError: If ``column_parent`` does not wrap a ``ColumnList`` block.
    """
    if not isinstance(column_parent.block, notion_blocks.ColumnList):
        raise ValueError(f"block type not column list: {type(column_parent.block)}")
    columns: List[Block] = [
        column
        for chunk in client.blocks.children.iterate_list(  # type: ignore
            block_id=column_parent.id,
        )
        for column in chunk
    ]
    num_columns = len(columns)
    # The width expression stays inside the comprehension so that it is never
    # evaluated when there are no columns.
    columns_content = [
        Div(
            [Style(f"width:{100/num_columns}%; float: left")],
            [content.block.get_html() for content in content_chunk],
        )
        for column in columns
        for content_chunk in client.blocks.children.iterate_list(  # type: ignore
            block_id=column.id,
        )
    ]
    return Div([], columns_content)
def build_numbered_list_children(
    client: Client,
    numbered_list_item_parent: Block,
    type_attr_ind=0,
) -> NumberedListResponse:
    """Render a numbered list item and, recursively, the list nested under it.

    Args:
        client: Notion client used to fetch the item's children.
        numbered_list_item_parent: Block wrapping a ``NumberedListItem``.
        type_attr_ind: Index into ``numbered_list_types`` selecting the
            ``type`` attribute of the nested ``<ol>``; advanced (with
            wrap-around) for each nesting level.

    Returns:
        A ``NumberedListResponse`` whose ``html`` is the item itself and whose
        ``child_list`` is the nested ``<ol>`` (``None`` when the item has no
        children).

    Raises:
        ValueError: If ``numbered_list_item_parent`` is not a numbered list item.
    """
    if not isinstance(numbered_list_item_parent.block, notion_blocks.NumberedListItem):
        raise ValueError(
            f"block type not numbered list item: {type(numbered_list_item_parent.block)}",
        )
    html = numbered_list_item_parent.get_html()
    if html:
        html.attributes = [Style("margin-left: 10px")]
    if not numbered_list_item_parent.has_children:
        return NumberedListResponse(
            html=html,
        )
    children = []
    for child_block in client.blocks.children.iterate_list(  # type: ignore
        block_id=numbered_list_item_parent.id,
    ):
        children.extend(child_block)
    if not children:
        # Return the already styled element; re-rendering here used to drop
        # the margin applied above.
        return NumberedListResponse(
            html=html,
        )
    child_html = []
    for child in children:
        if not isinstance(child.block, notion_blocks.NumberedListItem):
            # A list item may contain other kinds of blocks. Recursing into
            # them would raise ValueError and abort the whole page, so they
            # are rendered directly (their own children are not expanded).
            other_html = child.get_html()
            if other_html:
                child_html.append(other_html)
            continue
        child_resp = build_numbered_list_children(
            client=client,
            numbered_list_item_parent=child,
            type_attr_ind=(type_attr_ind + 1) % len(numbered_list_types),
        )
        child_html.append(child_resp.html)
        if child_children := child_resp.child_list:
            child_html.append(child_children)

    return NumberedListResponse(
        html=html,
        child_list=Ol([Type(numbered_list_types[type_attr_ind])], child_html),
    )
class FromJSONMixin(ABC):
    """Interface for types that can be built from a decoded JSON dictionary."""

    @classmethod
    @abstractmethod
    def from_dict(cls, data: dict):
        """Build an instance from ``data``; concrete subclasses must override this."""
@dataclass
class Block(FromJSONMixin, GetHTMLMixin):
    """Envelope of a Notion block: shared metadata plus the type-specific payload."""

    id: str
    # Discriminator; also the key under which the type-specific payload is
    # stored in the raw dictionary (see ``from_dict``).
    type: str
    created_time: str
    created_by: PartialUser
    last_edited_time: str
    last_edited_by: PartialUser
    archived: bool
    has_children: bool
    parent: Parent
    # Parsed type-specific payload, built through ``block_type_mapping``.
    block: BlockBase
    object: str = "block"
    request_id: Optional[str] = None

    def __repr__(self):
        return f"{self.__class__.__name__}(id={self.id}, type={self.type})"

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Block`` from a raw block dictionary.

        The nested ``created_by``, ``last_edited_by``, ``parent`` and
        type-specific entries are parsed into their own types; every remaining
        key is passed straight to the dataclass constructor.

        Raises:
            KeyError: If ``type`` (or one of the nested entries) is missing or
                the block type is not present in ``block_type_mapping``.
            TypeError: If the remaining keys do not match the dataclass fields.
        """
        # Work on a shallow copy so the pops below do not strip keys from the
        # dictionary owned by the caller.
        data = dict(data)
        t = data["type"]
        block_data = data.pop(t)
        created_by = data.pop("created_by")
        last_edited_by = data.pop("last_edited_by")
        parent = data.pop("parent")
        try:
            block = cls(
                created_by=PartialUser.from_dict(created_by),
                last_edited_by=PartialUser.from_dict(last_edited_by),
                parent=Parent.from_dict(parent),
                block=block_type_mapping[t].from_dict(block_data),  # type: ignore
                **data,
            )
        except KeyError as ke:
            raise KeyError(f"failed to map to associated block type -> {t}: {block_data}") from ke
        except TypeError as te:
            raise TypeError(f"failed to map to associated block type -> {t}: {block_data}") from te

        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Return the HTML of the wrapped payload, or ``None`` when there is none."""
        if self.block:
            return self.block.get_html()
        return None
b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py @@ -0,0 +1,63 @@ +from .bookmark import Bookmark +from .breadcrumb import Breadcrumb +from .bulleted_list_item import BulletedListItem +from .callout import Callout +from .child_database import ChildDatabase +from .child_page import ChildPage +from .code import Code +from .column_list import Column, ColumnList +from .divider import Divider +from .embed import Embed +from .equation import Equation +from .file import File +from .heading import Heading +from .image import Image +from .link_preview import LinkPreview +from .link_to_page import LinkToPage +from .numbered_list import NumberedListItem +from .paragraph import Paragraph +from .pdf import PDF +from .quote import Quote +from .synced_block import DuplicateSyncedBlock, OriginalSyncedBlock, SyncBlock +from .table import Table, TableRow +from .table_of_contents import TableOfContents +from .template import Template +from .todo import ToDo +from .toggle import Toggle +from .unsupported import Unsupported +from .video import Video + +__all__ = [ + "Bookmark", + "Breadcrumb", + "BulletedListItem", + "Callout", + "ChildDatabase", + "ChildPage", + "Code", + "Column", + "ColumnList", + "Divider", + "Embed", + "Equation", + "File", + "Heading", + "Image", + "LinkPreview", + "LinkToPage", + "NumberedListItem", + "Paragraph", + "PDF", + "Quote", + "SyncBlock", + "OriginalSyncedBlock", + "DuplicateSyncedBlock", + "Table", + "TableRow", + "TableOfContents", + "Template", + "ToDo", + "Toggle", + "Unsupported", + "Video", +] diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py new file mode 100644 index 000000000..6f9e66c2c --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py @@ -0,0 +1,40 @@ +# https://developers.notion.com/reference/block#bookmark +from dataclasses import dataclass, field 
@dataclass
class Bookmark(BlockBase):
    """Bookmark block: a URL with an optional rich-text caption."""

    url: str
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Bookmark``; the ``caption`` entry is popped from ``data``."""
        raw_caption = data.pop("caption", [])
        return cls(
            url=data["url"],
            caption=[RichText.from_dict(entry) for entry in raw_caption],
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``<div>`` with the link and the caption separated by ``<br>``.

        Returns ``None`` when there is neither a URL nor a caption.
        """
        pieces = []
        if self.url:
            pieces.append(A([Href(self.url)], self.url))
        if self.caption:
            pieces.append(Div([], [rt.get_html() for rt in self.caption]))
        if not pieces:
            return None
        # Interleave the pieces with line breaks (at most one is ever needed).
        interleaved = []
        for piece in pieces:
            if interleaved:
                interleaved.append(Br())
            interleaved.append(piece)
        return Div([], interleaved)
@dataclass
class BulletedListItem(BlockBase):
    """Bulleted list item block rendered as an ``<li>`` element."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the item; the ``rich_text`` entry is popped from ``data``."""
        raw_rich_text = data.pop("rich_text", [])
        color = data["color"]
        children = data.get("children", [])
        return cls(
            color=color,
            children=children,
            rich_text=[RichText.from_dict(entry) for entry in raw_rich_text],
        )

    @staticmethod
    def can_have_children() -> bool:
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Return an ``<li>`` holding the HTML of every rich-text fragment."""
        fragments = [rt.get_html() for rt in self.rich_text]
        return Li([], fragments)
@dataclass
class Callout(BlockBase):
    """Callout block: optional icon followed by rich text, tinted with ``color``."""

    color: str
    icon: Optional[Union[EmojiIcon, ExternalIcon]] = None
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Callout``; ``rich_text`` and ``icon`` are popped from ``data``.

        A missing or null ``icon`` entry yields ``icon=None`` (the field is
        declared optional) instead of failing inside ``Icon.from_dict``.

        Raises:
            KeyError: If ``color`` is missing.
            ValueError: If the icon has a type ``Icon.from_dict`` does not support.
        """
        rich_text = data.pop("rich_text", [])
        color = data["color"]
        icon_data = data.pop("icon", None)
        return cls(
            color=color,
            icon=Icon.from_dict(icon_data) if icon_data else None,
            rich_text=[RichText.from_dict(rt) for rt in rich_text],
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``<div>`` with the icon (if any) and the rich-text fragments."""
        elements = []
        # Render the icon once and reuse the result.
        icon_html = self.icon.get_html() if self.icon else None
        if icon_html:
            elements.append(icon_html)
        if self.rich_text:
            elements.extend([rt.get_html() for rt in self.rich_text])
        attributes = []
        if self.color:
            attributes.append(Style(f"color:{self.color}"))
        return Div(attributes, elements)
@dataclass
class ChildPage(BlockBase, GetHTMLMixin):
    """Child page block; only the page title is carried."""

    title: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block by passing every key of ``data`` to the constructor."""
        return cls(**data)

    @staticmethod
    def can_have_children() -> bool:
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Return the title wrapped in a ``<p>`` element."""
        title_paragraph = P([], self.title)
        return title_paragraph
@dataclass
class Code(BlockBase):
    """Code block: source text in a given language with an optional caption."""

    language: str
    rich_text: List[RichText] = field(default_factory=list)
    caption: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Code`` block; ``rich_text`` and ``caption`` are popped from ``data``."""
        raw_rich_text = data.pop("rich_text", [])
        raw_caption = data.pop("caption", [])
        return cls(
            language=data["language"],
            rich_text=[RichText.from_dict(entry) for entry in raw_rich_text],
            caption=[RichText.from_dict(entry) for entry in raw_caption],
        )

    @staticmethod
    def can_have_children() -> bool:
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``<div>`` with the ``<code>`` element and caption separated by ``<br>``.

        Returns ``None`` when the block has neither text nor caption.
        """
        sections = []
        if self.rich_text:
            sections.append(HtmlCode([], [rt.get_html() for rt in self.rich_text]))
        if self.caption:
            sections.append(Div([], [rt.get_html() for rt in self.caption]))
        if not sections:
            return None
        # Interleave the sections with line breaks (at most one is ever needed).
        interleaved = []
        for section in sections:
            if interleaved:
                interleaved.append(Br())
            interleaved.append(section)
        return Div([], interleaved)
@classmethod + def from_dict(cls, data: dict): + return cls() + + def get_html(self) -> Optional[HtmlTag]: + return None diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py new file mode 100644 index 000000000..4537829e9 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py @@ -0,0 +1,22 @@ +# https://developers.notion.com/reference/block#divider +from dataclasses import dataclass +from typing import Optional + +from htmlBuilder.attributes import Style +from htmlBuilder.tags import Hr, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase + + +@dataclass +class Divider(BlockBase): + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + return cls() + + def get_html(self) -> Optional[HtmlTag]: + return Hr([Style("border-top: 3px solid #bbb")]) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py new file mode 100644 index 000000000..8a6429108 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py @@ -0,0 +1,36 @@ +# https://developers.notion.com/reference/block#embed +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.attributes import Href +from htmlBuilder.tags import A, Br, Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class Embed(BlockBase): + url: str + caption: List[RichText] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + return cls(caption=[RichText.from_dict(d) for d in data.pop("caption", [])], 
**data) + + def get_html(self) -> Optional[HtmlTag]: + texts = [] + if self.url: + texts.append(A([Href(self.url)], self.url)) + if self.caption: + texts.append(Div([], [rt.get_html() for rt in self.caption])) + if not texts: + return None + joined = [Br()] * (len(texts) * 2 - 1) + joined[0::2] = texts + + return Div([], joined) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py new file mode 100644 index 000000000..cc6039ce5 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py @@ -0,0 +1,23 @@ +# https://developers.notion.com/reference/block#equation +from dataclasses import dataclass +from typing import Optional + +from htmlBuilder.tags import Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase + + +@dataclass +class Equation(BlockBase): + expression: str + + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return Div([], self.expression) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py new file mode 100644 index 000000000..81cefc205 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py @@ -0,0 +1,49 @@ +# https://developers.notion.com/reference/block#file +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.attributes import Href +from htmlBuilder.tags import A, Br, Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.file import External +from unstructured_ingest.connector.notion.types.file import File as FileContent +from 
unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class File(BlockBase): + type: str + external: Optional[External] = None + file: Optional[FileContent] = None + caption: List[RichText] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + caption = [RichText.from_dict(rt) for rt in data.pop("caption", [])] + t = data["type"] + file = cls(type=t, caption=caption) + if t == "external": + file.external = External.from_dict(data["external"]) + elif t == "file": + file.file = FileContent.from_dict(data["file"]) + return file + + def get_html(self) -> Optional[HtmlTag]: + texts = [] + if self.file: + texts.append(A([Href(self.file.url)], self.file.url)) + if self.external: + texts.append(A([Href(self.external.url)], self.external.url)) + if self.caption: + texts.append(Div([], [rt.get_html() for rt in self.caption])) + if not texts: + return None + joined = [Br()] * (len(texts) * 2 - 1) + joined[0::2] = texts + + return Div([], joined) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py new file mode 100644 index 000000000..685dd4c87 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py @@ -0,0 +1,37 @@ +# https://developers.notion.com/reference/block#headings +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.attributes import Style +from htmlBuilder.tags import Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class Heading(BlockBase): + color: str + is_toggleable: bool + rich_text: List[RichText] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def 
from_dict(cls, data: dict): + rich_text = data.pop("rich_text", []) + heading = cls(**data) + heading.rich_text = [RichText.from_dict(rt) for rt in rich_text] + return heading + + def get_html(self) -> Optional[HtmlTag]: + if not self.rich_text: + return None + + texts = [rt.get_html() for rt in self.rich_text] + attributes = [] + if self.color and self.color != "default": + attributes.append(Style(f"color: {self.color}")) + return Div(attributes, texts) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py new file mode 100644 index 000000000..36fb173e8 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py @@ -0,0 +1,21 @@ +# https://developers.notion.com/reference/block#image +from typing import Optional + +from htmlBuilder.attributes import Src +from htmlBuilder.tags import HtmlTag, Img + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.file import FileObject + + +class Image(BlockBase, FileObject): + @staticmethod + def can_have_children() -> bool: + return False + + def get_html(self) -> Optional[HtmlTag]: + if self.external: + return Img([Src(self.external.url)], []) + if self.file: + return Img([Src(self.file.url)], []) + return None diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py new file mode 100644 index 000000000..96f8cb382 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py @@ -0,0 +1,24 @@ +# https://developers.notion.com/reference/block#link-preview +from dataclasses import dataclass +from typing import Optional + +from htmlBuilder.attributes import Href +from htmlBuilder.tags import A, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase 
+ + +@dataclass +class LinkPreview(BlockBase): + url: str + + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return A([Href(self.url)], self.url) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py new file mode 100644 index 000000000..8d9d01810 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py @@ -0,0 +1,29 @@ +# https://developers.notion.com/reference/block#link-to-page +from dataclasses import dataclass +from typing import Optional + +from htmlBuilder.tags import Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase + + +@dataclass +class LinkToPage(BlockBase): + type: str + page_id: Optional[str] = None + database_id: Optional[str] = None + + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + if page_id := self.page_id: + return Div([], page_id) + if database_id := self.database_id: + return Div([], database_id) + return None diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py new file mode 100644 index 000000000..e9236fba2 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py @@ -0,0 +1,29 @@ +# https://developers.notion.com/reference/block#numbered-list-item +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.tags import HtmlTag, Li + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.rich_text import RichText + + 
+@dataclass +class NumberedListItem(BlockBase): + color: str + children: List[dict] = field(default_factory=list) + rich_text: List[RichText] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + rich_text = data.pop("rich_text", []) + numbered_list = cls(**data) + numbered_list.rich_text = [RichText.from_dict(rt) for rt in rich_text] + return numbered_list + + def get_html(self) -> Optional[HtmlTag]: + return Li([], [rt.get_html() for rt in self.rich_text]) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py new file mode 100644 index 000000000..02170ee8c --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py @@ -0,0 +1,31 @@ +# https://developers.notion.com/reference/block#paragraph +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.tags import Br, Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class Paragraph(BlockBase): + color: str + children: List[dict] = field(default_factory=list) + rich_text: List[RichText] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + rich_text = data.pop("rich_text", []) + paragraph = cls(**data) + paragraph.rich_text = [RichText.from_dict(rt) for rt in rich_text] + return paragraph + + def get_html(self) -> Optional[HtmlTag]: + if not self.rich_text: + return Br() + return Div([], [rt.get_html() for rt in self.rich_text]) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py new file mode 100644 index 
000000000..6ec6971d7 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py @@ -0,0 +1,49 @@ +# https://developers.notion.com/reference/block#pdf +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.attributes import Href +from htmlBuilder.tags import A, Br, Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.file import External, File +from unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class PDF(BlockBase): + type: str + caption: List[RichText] = field(default_factory=list) + external: Optional[External] = None + file: Optional[File] = None + + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + caption = data.pop("caption", []) + t = data["type"] + paragraph = cls(type=t) + paragraph.caption = [RichText.from_dict(c) for c in caption] + if t == "external": + paragraph.external = External.from_dict(data["external"]) + elif t == "file": + paragraph.file = File.from_dict(data["file"]) + return paragraph + + def get_html(self) -> Optional[HtmlTag]: + texts = [] + if self.external: + texts.append(A([Href(self.external.url)], self.external.url)) + if self.file: + texts.append(A([Href(self.file.url)], self.file.url)) + if self.caption: + texts.append(Div([], [rt.get_html() for rt in self.caption])) + if not texts: + return None + joined = [Br()] * (len(texts) * 2 - 1) + joined[0::2] = texts + + return Div([], joined) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py new file mode 100644 index 000000000..2c911c82d --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py @@ -0,0 +1,37 @@ +# https://developers.notion.com/reference/block#quote +from dataclasses import 
dataclass, field +from typing import List, Optional + +from htmlBuilder.attributes import Style +from htmlBuilder.tags import Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class Quote(BlockBase): + color: str + children: List[dict] = field(default_factory=list) + rich_text: List[RichText] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + rich_text = data.pop("rich_text", []) + quote = cls(**data) + quote.rich_text = [RichText.from_dict(rt) for rt in rich_text] + return quote + + def get_html(self) -> Optional[HtmlTag]: + if not self.rich_text: + return None + + texts = [rt.get_html() for rt in self.rich_text] + attributes = [] + if self.color and self.color != "default": + attributes.append(Style(f"color: {self.color}")) + return Div(attributes, texts) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py new file mode 100644 index 000000000..6c158e701 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py @@ -0,0 +1,57 @@ +# https://developers.notion.com/reference/block#synced-block +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.tags import HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase + + +@dataclass +class OriginalSyncedBlock(BlockBase): + synced_from: Optional[str] = None + children: List[dict] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + return cls(children=data["children"]) + + def get_html(self) -> Optional[HtmlTag]: + return None + + +@dataclass +class 
DuplicateSyncedBlock(BlockBase): + type: str + block_id: str + + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return None + + +class SyncBlock(BlockBase): + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + if "synced_from" in data: + return OriginalSyncedBlock.from_dict(data) + else: + return DuplicateSyncedBlock.from_dict(data) + + def get_html(self) -> Optional[HtmlTag]: + return None diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py new file mode 100644 index 000000000..32742a57d --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py @@ -0,0 +1,63 @@ +# https://developers.notion.com/reference/block#table +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.tags import HtmlTag, Td, Th, Tr + +from unstructured_ingest.connector.notion.interfaces import ( + BlockBase, + FromJSONMixin, +) +from unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class Table(BlockBase): + table_width: int + has_column_header: bool + has_row_header: bool + + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return None + + +@dataclass +class TableCell(FromJSONMixin): + rich_texts: List[RichText] + + @classmethod + def from_dict(cls, data: dict): + return cls(rich_texts=[RichText.from_dict(rt) for rt in data.pop("rich_texts", [])]) + + def get_html(self, is_header: bool) -> Optional[HtmlTag]: + if is_header: + return Th([], [rt.get_html() for rt in self.rich_texts]) + else: + return Td([], [rt.get_html() for rt in self.rich_texts]) + 
+ +# https://developers.notion.com/reference/block#table-rows +@dataclass +class TableRow(BlockBase): + is_header: bool = False + cells: List[TableCell] = field(default_factory=list) + + @classmethod + def from_dict(cls, data: dict): + cells = data.get("cells", []) + return cls(cells=[TableCell.from_dict({"rich_texts": c}) for c in cells]) + + @staticmethod + def can_have_children() -> bool: + return False + + def get_html(self) -> Optional[HtmlTag]: + return Tr([], [cell.get_html(is_header=self.is_header) for cell in self.cells]) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py new file mode 100644 index 000000000..86cedffd7 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py @@ -0,0 +1,23 @@ +# https://developers.notion.com/reference/block#table-of-contents +from dataclasses import dataclass +from typing import Optional + +from htmlBuilder.tags import HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase + + +@dataclass +class TableOfContents(BlockBase): + color: str + + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return None diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py new file mode 100644 index 000000000..edb88de61 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py @@ -0,0 +1,30 @@ +# https://developers.notion.com/reference/block#template +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.tags import Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from 
unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class Template(BlockBase): + children: List[dict] = field(default_factory=list) + rich_text: List[RichText] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + rich_text = data.pop("rich_text", []) + template = cls(**data) + template.rich_text = [RichText.from_dict(rt) for rt in rich_text] + return template + + def get_html(self) -> Optional[HtmlTag]: + if not self.rich_text: + return None + return Div([], [rt.get_html() for rt in self.rich_text]) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py new file mode 100644 index 000000000..64c8fb5bc --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py @@ -0,0 +1,42 @@ +# https://developers.notion.com/reference/block#to-do +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.attributes import Checked, Style, Type +from htmlBuilder.tags import Div, HtmlTag, Input + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class ToDo(BlockBase): + color: str + checked: bool = False + rich_text: List[RichText] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + rich_text = data.pop("rich_text", []) + todo = cls(**data) + todo.rich_text = [RichText.from_dict(rt) for rt in rich_text] + return todo + + def get_html(self) -> Optional[HtmlTag]: + if not self.rich_text: + return None + + elements = [] + check_input_attributes = [Type("checkbox")] + if self.checked: + check_input_attributes.append(Checked("")) + elements.append(Input(check_input_attributes)) + 
elements.extend([rt.get_html() for rt in self.rich_text]) + attributes = [] + if self.color and self.color != "default": + attributes.append(Style(f"color: {self.color}")) + return Div(attributes, elements) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py new file mode 100644 index 000000000..dd3493c25 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py @@ -0,0 +1,37 @@ +# https://developers.notion.com/reference/block#toggle-blocks +from dataclasses import dataclass, field +from typing import List, Optional + +from htmlBuilder.attributes import Style +from htmlBuilder.tags import Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.rich_text import RichText + + +@dataclass +class Toggle(BlockBase): + color: str + children: List[dict] = field(default_factory=list) + rich_text: List[RichText] = field(default_factory=list) + + @staticmethod + def can_have_children() -> bool: + return True + + @classmethod + def from_dict(cls, data: dict): + rich_text = data.pop("rich_text", []) + toggle = cls(**data) + toggle.rich_text = [RichText.from_dict(rt) for rt in rich_text] + return toggle + + def get_html(self) -> Optional[HtmlTag]: + if not self.rich_text: + return None + + texts = [rt.get_html() for rt in self.rich_text] + attributes = [] + if self.color and self.color != "default": + attributes.append(Style(f"color: {self.color}")) + return Div(attributes, texts) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py new file mode 100644 index 000000000..25b7c149f --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py @@ -0,0 +1,20 @@ +from dataclasses import dataclass +from 
typing import Optional + +from htmlBuilder.tags import HtmlTag + +from unstructured_ingest.connector.notion.interfaces import BlockBase + + +@dataclass +class Unsupported(BlockBase): + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + return cls() + + def get_html(self) -> Optional[HtmlTag]: + return None diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py new file mode 100644 index 000000000..54c5fe5a4 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py @@ -0,0 +1,22 @@ +# https://developers.notion.com/reference/block#image +from typing import Optional + +from htmlBuilder.attributes import Src +from htmlBuilder.tags import HtmlTag, Source +from htmlBuilder.tags import Video as VideoHtml + +from unstructured_ingest.connector.notion.interfaces import BlockBase +from unstructured_ingest.connector.notion.types.file import FileObject + + +class Video(BlockBase, FileObject): + @staticmethod + def can_have_children() -> bool: + return False + + def get_html(self) -> Optional[HtmlTag]: + if self.external: + return VideoHtml([], [Source([Src(self.external.url)], [self.external.url])]) + if self.file: + return VideoHtml([], [Source([Src(self.file.url)], [self.file.url])]) + return None diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database.py b/unstructured_ingest/v2/processes/connectors/notion/types/database.py new file mode 100644 index 000000000..1c1366830 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database.py @@ -0,0 +1,72 @@ +# https://developers.notion.com/reference/database +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +from htmlBuilder.tags import Div, HtmlTag, Span + +from unstructured_ingest.connector.notion.interfaces import ( + DBPropertyBase, + 
FromJSONMixin, + GetHTMLMixin, +) +from unstructured_ingest.connector.notion.types.database_properties import ( + map_properties, +) +from unstructured_ingest.connector.notion.types.file import FileObject +from unstructured_ingest.connector.notion.types.parent import Parent +from unstructured_ingest.connector.notion.types.rich_text import RichText +from unstructured_ingest.connector.notion.types.user import PartialUser + + +@dataclass +class Database(FromJSONMixin, GetHTMLMixin): + id: str + created_time: str + created_by: PartialUser + last_edited_time: str + last_edited_by: PartialUser + archived: bool + parent: Parent + url: str + is_inline: bool + public_url: str + request_id: Optional[str] = None + properties: Dict[str, DBPropertyBase] = field(default_factory=dict) + title: List[RichText] = field(default_factory=list) + description: List[RichText] = field(default_factory=list) + icon: Optional[FileObject] = None + cover: Optional[FileObject] = None + object: str = "database" + + @classmethod + def from_dict(cls, data: dict): + created_by = data.pop("created_by") + last_edited_by = data.pop("last_edited_by") + icon = data.pop("icon") + cover = data.pop("cover") + parent = data.pop("parent") + title = data.pop("title") + description = data.pop("description") + page = cls( + properties=map_properties(data.pop("properties", {})), + created_by=PartialUser.from_dict(created_by), + last_edited_by=PartialUser.from_dict(last_edited_by), + icon=FileObject.from_dict(icon) if icon else None, + cover=FileObject.from_dict(cover) if cover else None, + parent=Parent.from_dict(parent), + title=[RichText.from_dict(data=r) for r in title], + description=[RichText.from_dict(data=r) for r in description], + **data, + ) + + return page + + def get_html(self) -> Optional[HtmlTag]: + spans = [] + if title := self.title: + spans.append(Span([], [rt.get_html() for rt in title])) + if description := self.description: + spans.append(Span([], [rt.get_html() for rt in description])) + if 
spans: + return Div([], spans) + return None diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py new file mode 100644 index 000000000..95c548969 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py @@ -0,0 +1,106 @@ +from typing import Dict + +from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase + +from .checkbox import Checkbox, CheckboxCell +from .created_by import CreatedBy, CreatedByCell +from .created_time import CreatedTime, CreatedTimeCell +from .date import Date, DateCell +from .email import Email, EmailCell +from .files import Files, FilesCell +from .formula import Formula, FormulaCell +from .last_edited_by import LastEditedBy, LastEditedByCell +from .last_edited_time import LastEditedTime, LastEditedTimeCell +from .multiselect import MultiSelect, MultiSelectCell +from .number import Number, NumberCell +from .people import People, PeopleCell +from .phone_number import PhoneNumber, PhoneNumberCell +from .relation import Relation, RelationCell +from .rich_text import RichText, RichTextCell +from .rollup import Rollup, RollupCell +from .select import Select, SelectCell +from .status import Status, StatusCell +from .title import Title, TitleCell +from .unique_id import UniqueID, UniqueIDCell +from .url import URL, URLCell +from .verification import Verification, VerificationCell + +db_prop_type_mapping = { + "checkbox": Checkbox, + "created_by": CreatedBy, + "created_time": CreatedTime, + "date": Date, + "email": Email, + "files": Files, + "formula": Formula, + "last_edited_by": LastEditedBy, + "last_edited_time": LastEditedTime, + "multi_select": MultiSelect, + "number": Number, + "people": People, + "phone_number": PhoneNumber, + "relation": Relation, + "rich_text": RichText, + "rollup": Rollup, + "select": Select, + "status": 
Status, + "title": Title, + "unique_id": UniqueID, + "url": URL, + "verification": Verification, +} + + +def map_properties(props: Dict[str, dict]) -> Dict[str, DBPropertyBase]: + mapped_dict = {} + for k, v in props.items(): + try: + mapped_dict[k] = db_prop_type_mapping[v["type"]].from_dict(v) # type: ignore + except KeyError as ke: + raise KeyError(f"failed to map to associated database property -> {k}: {v}") from ke + + return mapped_dict + + +db_cell_type_mapping = { + "checkbox": CheckboxCell, + "created_by": CreatedByCell, + "created_time": CreatedTimeCell, + "date": DateCell, + "email": EmailCell, + "files": FilesCell, + "formula": FormulaCell, + "last_edited_by": LastEditedByCell, + "last_edited_time": LastEditedTimeCell, + "multi_select": MultiSelectCell, + "number": NumberCell, + "people": PeopleCell, + "phone_number": PhoneNumberCell, + "relation": RelationCell, + "rich_text": RichTextCell, + "rollup": RollupCell, + "select": SelectCell, + "status": StatusCell, + "title": TitleCell, + "unique_id": UniqueIDCell, + "url": URLCell, + "verification": VerificationCell, +} + + +def map_cells(props: Dict[str, dict]) -> Dict[str, DBCellBase]: + mapped_dict = {} + for k, v in props.items(): + try: + t = v["type"] + mapped_dict[k] = db_cell_type_mapping[t].from_dict(v) # type: ignore + except KeyError as ke: + raise KeyError(f"failed to map to associated database property -> {k}: {v}") from ke + + return mapped_dict + + +__all__ = [ + "map_properties", + "map_cells", +] diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py new file mode 100644 index 000000000..c4f50f2a3 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py @@ -0,0 +1,38 @@ +# https://developers.notion.com/reference/property-object#checkbox +from dataclasses import dataclass, field +from typing import 
Optional + +from htmlBuilder.attributes import Checked, Type +from htmlBuilder.tags import Div, HtmlTag, Input + +from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase + + +@dataclass +class Checkbox(DBPropertyBase): + id: str + name: str + type: str = "checkbox" + checkbox: dict = field(default_factory=dict) + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + +@dataclass +class CheckboxCell(DBCellBase): + id: str + checkbox: bool + name: Optional[str] = None + type: str = "checkbox" + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + check_input_attributes = [Type("checkbox")] + if self.checkbox: + check_input_attributes.append(Checked("")) + return Div([], Input(check_input_attributes)) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py new file mode 100644 index 000000000..4dda9a56e --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py @@ -0,0 +1,35 @@ +# https://developers.notion.com/reference/property-object#created-by +from dataclasses import dataclass, field +from typing import Optional + +from htmlBuilder.tags import HtmlTag + +from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase +from unstructured_ingest.connector.notion.types.user import People + + +@dataclass +class CreatedBy(DBPropertyBase): + id: str + name: str + type: str = "created_by" + created_by: dict = field(default_factory=dict) + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + +@dataclass +class CreatedByCell(DBCellBase): + id: str + created_by: People + type: str = "created_by" + name: Optional[str] = None + + @classmethod + def from_dict(cls, data: dict): + return 
cls(created_by=People.from_dict(data.pop("created_by")), **data) + + def get_html(self) -> Optional[HtmlTag]: + return self.created_by.get_html() diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py new file mode 100644 index 000000000..9ccf099dc --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py @@ -0,0 +1,34 @@ +# https://developers.notion.com/reference/property-object#created-time +from dataclasses import dataclass, field +from typing import Optional + +from htmlBuilder.tags import Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase + + +@dataclass +class CreatedTime(DBPropertyBase): + id: str + name: str + type: str = "created_time" + created_time: dict = field(default_factory=dict) + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + +@dataclass +class CreatedTimeCell(DBCellBase): + id: str + created_time: str + type: str = "created_time" + name: Optional[str] = None + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return Div([], self.created_time) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py new file mode 100644 index 000000000..79c4f5797 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py @@ -0,0 +1,41 @@ +# https://developers.notion.com/reference/property-object#date +from dataclasses import dataclass, field +from typing import Optional + +from htmlBuilder.tags import HtmlTag + +from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase +from unstructured_ingest.connector.notion.types.date 
@dataclass
class DateCell(DBCellBase):
    """Date value held by a single database cell; ``date`` is ``None`` when unset."""

    id: str
    date: Optional[DateType] = None
    name: Optional[str] = None
    type: str = "date"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell from ``data``.

        The ``date`` entry is removed from ``data`` and, when truthy, converted
        with ``DateType.from_dict``. A missing ``date`` key is treated like an
        empty value instead of raising ``KeyError``, which matches the field's
        ``None`` default. All remaining keys go to the constructor unchanged.
        """
        date_data = data.pop("date", None)
        date = DateType.from_dict(date_data) if date_data else None
        return cls(date=date, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return the HTML of the wrapped date, or ``None`` when no date is set."""
        if self.date:
            return self.date.get_html()
        return None
@dataclass
class FilesCell(DBCellBase):
    """List of file objects held by a single database cell."""

    id: str
    files: List[FileObject]
    type: str = "files"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell from ``data``.

        The ``files`` entry (``[]`` when absent) is removed from ``data`` and each
        item is converted with ``FileObject.from_dict``; the remaining keys are
        forwarded to the constructor.
        """
        raw_files = data.pop("files", [])
        parsed_files = [FileObject.from_dict(raw) for raw in raw_files]
        return cls(files=parsed_files, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` holding the HTML of every file, or ``None`` if there are none."""
        if self.files:
            rendered = [file_object.get_html() for file_object in self.files]
            return Div([], rendered)
        return None
@dataclass
class FormulaCell(DBCellBase):
    """Formula result held by a single database cell, kept as the raw dict."""

    id: str
    formula: dict
    type: str = "formula"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Create the cell by forwarding every key of ``data`` to the constructor."""
        cell = cls(**data)
        return cell

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` containing the stringified formula result.

        The ``type`` entry of the formula dict names the key under which the
        result itself is stored.
        """
        result_key = self.formula.get("type")
        rendered = str(self.formula[result_key])
        return Div([], rendered)
@dataclass
class LastEditedTimeCell(DBCellBase):
    """Last-edited timestamp held by a single database cell."""

    id: str
    last_edited_time: str
    type: str = "last_edited_time"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Create the cell by forwarding every key of ``data`` to the constructor."""
        cell = cls(**data)
        return cell

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` whose content is the stored ``last_edited_time`` value."""
        timestamp = self.last_edited_time
        return Div([], timestamp)
@dataclass
class MultiSelect(DBPropertyBase):
    """Multi-select property definition of a database."""

    id: str
    name: str
    multi_select: MultiSelectProp
    type: str = "multi_select"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the property from ``data``.

        The ``multi_select`` entry (``{}`` when absent) is removed from ``data``
        and parsed with ``MultiSelectProp.from_dict``, so the attribute holds the
        ``MultiSelectProp`` its annotation declares rather than the raw dict.
        All remaining keys are forwarded to the constructor unchanged.
        """
        return cls(
            multi_select=MultiSelectProp.from_dict(data.pop("multi_select", {})),
            **data,
        )
@dataclass
class NumberCell(DBCellBase):
    """Numeric value held by a single database cell; ``None`` when the cell is empty."""

    id: str
    number: Optional[int] = None
    type: str = "number"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Create the cell by forwarding every key of ``data`` to the constructor."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` with the stringified number, or ``None`` for an empty cell."""
        # Compare against None explicitly: 0 is falsy but is a real value that
        # must still be rendered.
        if self.number is None:
            return None
        return Div([], str(self.number))
@dataclass
class PhoneNumberCell(DBCellBase):
    """Phone number held by a single database cell."""

    id: str
    phone_number: Optional[str]
    name: Optional[str] = None
    type: str = "phone_number"

    @classmethod
    def from_dict(cls, data: dict):
        """Create the cell by forwarding every key of ``data`` to the constructor."""
        cell = cls(**data)
        return cell

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` with the phone number, or ``None`` when it is empty."""
        if not self.phone_number:
            return None
        return Div([], self.phone_number)
@dataclass
class RelationCell(DBCellBase):
    """Relation value held by a single database cell."""

    id: str
    has_more: bool
    relation: list
    type: str = "relation"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Create the cell by forwarding every key of ``data`` to the constructor."""
        cell = cls(**data)
        return cell

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` containing the percent-decoded ``id`` of the cell."""
        decoded_id = unquote(self.id)
        return Div([], decoded_id)
@dataclass
class RollupCell(DBCellBase):
    """Rollup result held by a single database cell, kept as the raw dict."""

    id: str
    rollup: dict
    type: str = "rollup"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Create the cell by forwarding every key of ``data`` to the constructor."""
        cell = cls(**data)
        return cell

    def get_html(self) -> Optional[HtmlTag]:
        """Render the rollup result.

        The ``type`` entry of the rollup dict names the key holding the result.
        A list result becomes one ``Span`` per item inside a ``Div``; any other
        result is stringified directly into the ``Div``.
        """
        result = self.rollup[self.rollup.get("type")]
        if not isinstance(result, list):
            return Div([], str(result))
        return Div([], [Span([], str(item)) for item in result])
@dataclass
class SelectCell(DBCellBase):
    """Selected option held by a single database cell; ``None`` when nothing is selected."""

    id: str
    select: Optional[SelectOption]
    type: str = "select"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell from ``data``.

        The ``select`` entry is removed from ``data`` and parsed with
        ``SelectOption.from_dict`` when it is truthy; otherwise ``None`` is stored.
        """
        raw_option = data.pop("select")
        option = SelectOption.from_dict(raw_option) if raw_option else None
        return cls(select=option, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` with the option name, colored unless the color is ``default``."""
        option = self.select
        if not option:
            return None
        attributes = []
        if option.color and option.color != "default":
            attributes.append(Style(f"color: {option.color}"))
        return Div(attributes, option.name)
@dataclass
class StatusGroup(FromJSONMixin):
    """A named, colored group of status options, referenced by their ids."""

    color: str
    id: str
    name: str
    # ``typing.List[str]`` cannot be instantiated, so using it as the factory
    # raised ``TypeError`` whenever ``option_ids`` was omitted; ``list`` is the
    # callable that actually produces the empty default.
    option_ids: List[str] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Create the group by forwarding every key of ``data`` to the constructor."""
        return cls(**data)
@dataclass
class TitleCell(DBCellBase):
    """Title held by a single database cell, as a list of rich-text fragments."""

    id: str
    title: List[RichText]
    type: str = "title"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell from ``data``.

        The ``title`` entry (``[]`` when absent) is removed from ``data`` and each
        item is converted with ``RichText.from_dict``; the remaining keys are
        forwarded to the constructor.
        """
        raw_fragments = data.pop("title", [])
        fragments = [RichText.from_dict(raw) for raw in raw_fragments]
        return cls(title=fragments, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` holding the HTML of every fragment, or ``None`` if there are none."""
        if self.title:
            rendered = [fragment.get_html() for fragment in self.title]
            return Div([], rendered)
        return None
@dataclass
class UniqueIDCell(DBCellBase):
    """Unique-id value held by a single database cell; ``None`` when unset."""

    id: str
    unique_id: Optional[UniqueIDCellData]
    # The default previously read "title", copied from the title cell.
    type: str = "unique_id"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell from ``data``.

        The ``unique_id`` entry is removed from ``data`` and parsed with
        ``UniqueIDCellData.from_dict`` only when it is truthy. An absent or
        ``None`` payload yields ``unique_id=None`` instead of raising, matching
        the ``Optional`` annotation. Remaining keys go to the constructor.
        """
        unique_id_data = data.pop("unique_id", None)
        unique_id = UniqueIDCellData.from_dict(unique_id_data) if unique_id_data else None
        return cls(unique_id=unique_id, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` with ``<prefix>-<number>``, or ``None`` when no id is set."""
        unique_id = self.unique_id
        if not unique_id:
            return None
        # Without this guard a missing prefix would be formatted literally
        # (e.g. "None-7"); render the bare number instead.
        if unique_id.prefix:
            return Div([], f"{unique_id.prefix}-{unique_id.number}")
        return Div([], str(unique_id.number))
@dataclass
class Verification(DBPropertyBase):
    """Verification property definition of a database; the payload is kept as a raw dict."""

    id: str
    name: str
    type: str = "verification"
    verification: dict = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict):
        """Create the property by forwarding every key of ``data`` to the constructor."""
        prop = cls(**data)
        return prop
@dataclass
class VerificationCell(DBCellBase):
    """Verification value held by a single database cell; ``None`` when unset."""

    id: str
    verification: Optional[VerificationData]
    name: Optional[str] = None
    type: str = "verification"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell from ``data``.

        The ``verification`` entry is removed from ``data`` and parsed with
        ``VerificationData.from_dict`` only when it is truthy. An absent or
        ``None`` payload yields ``verification=None`` instead of failing inside
        the parser, matching the ``Optional`` annotation. Remaining keys go to
        the constructor unchanged.
        """
        verification_data = data.pop("verification", None)
        verification = (
            VerificationData.from_dict(verification_data) if verification_data else None
        )
        return cls(verification=verification, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` with the cell name and verification HTML.

        Each part is included only when it is present; ``None`` is returned when
        neither produces any output.
        """
        elements = []
        if name := self.name:
            elements.append(Span([], name))
        if (verification := self.verification) and (verification_html := verification.get_html()):
            elements.append(verification_html)

        if elements:
            return Div([], elements)
        return None
@dataclass
class FileObject(FromJSONMixin, GetHTMLMixin):
    """A file reference that is either externally hosted or a ``file`` entry with an expiry."""

    type: str
    external: Optional[External] = None
    file: Optional[File] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the object from ``data``.

        ``data["type"]`` selects which nested payload is parsed: ``external`` or
        ``file``. For any other type both attributes stay ``None``.
        """
        kind = data["type"]
        external = External.from_dict(data["external"]) if kind == "external" else None
        hosted = File.from_dict(data["file"]) if kind == "file" else None
        return cls(type=kind, external=external, file=hosted)

    def get_html(self) -> Optional[HtmlTag]:
        """Return a link to the file URL, preferring ``file`` over ``external``.

        ``None`` is returned when neither payload is set.
        """
        for source in (self.file, self.external):
            if source:
                return A([Href(source.url)], source.url)
        return None
@dataclass
class DatabaseParent(FromJSONMixin):
    """Parent reference that points at a database by id."""

    database_id: str
    type: str = "database_id"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the parent from ``data["database_id"]``; every other key is ignored."""
        parent_id = data["database_id"]
        return cls(database_id=parent_id)
return cls() + + +# https://developers.notion.com/reference/parent-object#block-parent +@dataclass +class BlockParent(FromJSONMixin): + block_id: str + type: str = "block_id" + + @classmethod + def from_dict(cls, data: dict): + return cls(block_id=data["block_id"]) + + +@dataclass +class Parent(FromJSONMixin): + block_id: str + type: str = "block_id" + + @classmethod + def from_dict(cls, data: dict): + t = data["type"] + if t == "database_id": + return DatabaseParent.from_dict(data) + elif t == "page_id": + return PageParent.from_dict(data) + elif t == "workspace": + return WorkspaceParent.from_dict(data) + elif t == "block_id": + return BlockParent.from_dict(data) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py b/unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py new file mode 100644 index 000000000..30ea3793d --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py @@ -0,0 +1,189 @@ +# https://developers.notion.com/reference/rich-text +from dataclasses import dataclass +from typing import Optional + +from htmlBuilder.attributes import Href, Style +from htmlBuilder.tags import A, B, Code, Div, HtmlTag, I, S, Span, U +from htmlBuilder.tags import Text as HtmlText + +from unstructured_ingest.connector.notion.interfaces import ( + FromJSONMixin, + GetHTMLMixin, +) +from unstructured_ingest.connector.notion.types.date import Date +from unstructured_ingest.connector.notion.types.user import People + + +@dataclass +class Annotations(FromJSONMixin): + bold: bool + code: bool + italic: bool + strikethrough: bool + underline: bool + color: str + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + +@dataclass +class Equation(FromJSONMixin, GetHTMLMixin): + expression: str + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return Code([], self.expression) if self.expression else None + + +@dataclass 
+class MentionDatabase(FromJSONMixin, GetHTMLMixin): + id: str + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return Div([], self.id) if self.id else None + + +@dataclass +class MentionLinkPreview(FromJSONMixin, GetHTMLMixin): + url: str + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return A([Href(self.url)], self.url) if self.url else None + + +@dataclass +class MentionPage(FromJSONMixin, GetHTMLMixin): + id: str + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + return Div([], self.id) if self.id else None + + +@dataclass +class MentionTemplate(FromJSONMixin): + template_mention_date: Optional[str] + template_mention_user: Optional[str] + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + +@dataclass +class Mention(FromJSONMixin, GetHTMLMixin): + type: str + database: Optional[MentionDatabase] = None + date: Optional[Date] = None + link_preview: Optional[MentionLinkPreview] = None + page: Optional[MentionPage] = None + template_mention: Optional[MentionTemplate] = None + user: Optional[People] = None + + @classmethod + def from_dict(cls, data: dict): + t = data["type"] + mention = cls(type=t) + if t == "date": + mention.date = Date.from_dict(data["date"]) + elif t == "database": + mention.database = MentionDatabase.from_dict(data["database"]) + elif t == "link_preview": + mention.link_preview = MentionLinkPreview.from_dict(data["link_preview"]) + elif t == "page": + mention.page = MentionPage.from_dict(data["page"]) + elif t == "template_mention": + mention.template_mention = MentionTemplate.from_dict(data["template_mention"]) + elif t == "user": + mention.user = People.from_dict(data["user"]) + + return mention + + def get_html(self) -> Optional[HtmlTag]: + t = self.type + if t == "date": + return self.date.get_html() if self.date 
else None + elif t == "database": + return self.database.get_html() if self.database else None + elif t == "link_preview": + return self.link_preview.get_html() if self.link_preview else None + elif t == "page": + return self.page.get_html() if self.page else None + elif t == "user": + return self.user.get_html() if self.user else None + return None + + +@dataclass +class Text(FromJSONMixin): + content: str + link: Optional[dict] + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + +@dataclass +class RichText(FromJSONMixin, GetHTMLMixin): + type: str + plain_text: str + annotations: Optional[Annotations] = None + href: Optional[str] = None + text: Optional[Text] = None + mention: Optional[Mention] = None + equation: Optional[Equation] = None + + def get_html(self) -> Optional[HtmlTag]: + text = HtmlText(self.plain_text) + if self.href: + text = A([Href(self.href)], text) + if self.annotations: + annotations = self.annotations + if annotations.bold: + text = B([], text) + if annotations.code: + text = Code([], text) + if annotations.italic: + text = I([], text) + if annotations.strikethrough: + text = S([], text) + if annotations.underline: + text = U([], text) + if annotations.color and annotations.color != "default": + if isinstance(text, HtmlText): + text = Span([], text) + text.attributes.append(Style(f"color:{annotations.color}")) + return text + + @classmethod + def from_dict(cls, data: dict): + t = data["type"] + rich_text = cls( + annotations=Annotations.from_dict(data.pop("annotations")), + **data, + ) + if t == "text": + rich_text.text = Text.from_dict(data["text"]) + elif t == "mention": + rich_text.mention = Mention.from_dict(data["mention"]) + elif t == "equation": + rich_text.equation = Equation.from_dict(data["equation"]) + + return rich_text diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/user.py b/unstructured_ingest/v2/processes/connectors/notion/types/user.py new file mode 100644 index 
000000000..212f5dc41 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/user.py @@ -0,0 +1,76 @@ +# https://developers.notion.com/reference/user +from dataclasses import dataclass, field +from typing import Optional + +from htmlBuilder.attributes import Href +from htmlBuilder.tags import A, Div, HtmlTag + +from unstructured_ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin + + +@dataclass +class PartialUser(FromJSONMixin): + id: str + object: str = "user" + + @classmethod + def from_dict(cls, data: dict): + return cls(id=data["id"]) + + +@dataclass +class User(FromJSONMixin, GetHTMLMixin): + object: dict + id: str + type: Optional[str] = None + name: Optional[str] = None + avatar_url: Optional[str] = None + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_text(self) -> Optional[str]: + text = self.name + if self.avatar_url: + text = f"[{text}]({self.avatar_url}" + return text + + def get_html(self) -> Optional[HtmlTag]: + if self.avatar_url: + return A([Href(self.avatar_url)], self.name) + else: + return Div([], self.name) + + +@dataclass +class People(User): + person: dict = field(default_factory=dict) + + +@dataclass +class Bots(FromJSONMixin, GetHTMLMixin): + object: dict + id: str + bot: dict + owner: dict + type: str + workspace_name: str + name: Optional[str] = None + avatar_url: Optional[str] = None + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_text(self) -> Optional[str]: + text = self.name + if self.avatar_url: + text = f"[{text}]({self.avatar_url}" + return text + + def get_html(self) -> Optional[HtmlTag]: + if self.avatar_url: + return A([Href(self.avatar_url)], self.name) + else: + return Div([], self.name) From 5171d9bfaaaad86b1f3ac13c431f737fe3c46316 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Wed, 9 Oct 2024 20:41:52 +0100 Subject: [PATCH 02/48] finished --- test_e2e/src/notion.sh | 4 +- .../v2/processes/connectors/__init__.py 
| 4 ++ .../v2/processes/connectors/notion/client.py | 8 +-- .../processes/connectors/notion/connector.py | 72 ++++++++----------- .../v2/processes/connectors/notion/helpers.py | 10 +-- .../connectors/notion/types/block.py | 10 +-- .../connectors/notion/types/database.py | 12 ++-- .../processes/connectors/notion/types/date.py | 2 +- .../processes/connectors/notion/types/file.py | 2 +- .../processes/connectors/notion/types/page.py | 9 +-- .../connectors/notion/types/parent.py | 2 +- .../connectors/notion/types/rich_text.py | 6 +- .../processes/connectors/notion/types/user.py | 2 +- 13 files changed, 67 insertions(+), 76 deletions(-) diff --git a/test_e2e/src/notion.sh b/test_e2e/src/notion.sh index ce96b058a..fbf2ef067 100755 --- a/test_e2e/src/notion.sh +++ b/test_e2e/src/notion.sh @@ -43,7 +43,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --recursive \ --verbose \ - --work-dir "$WORK_DIR" \ - --max-retry-time 30 + --work-dir "$WORK_DIR" + #--max-retry-time 30 "$SCRIPT_DIR"/check-diff-expected-output.py --output-folder-name $OUTPUT_FOLDER_NAME diff --git a/unstructured_ingest/v2/processes/connectors/__init__.py b/unstructured_ingest/v2/processes/connectors/__init__.py index 677da3d29..a359c48ec 100644 --- a/unstructured_ingest/v2/processes/connectors/__init__.py +++ b/unstructured_ingest/v2/processes/connectors/__init__.py @@ -46,6 +46,8 @@ from .sql import sql_destination_entry from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE from .weaviate import weaviate_destination_entry +from .notion.connector import CONNECTOR_TYPE as NOTION_CONNECTOR_TYPE +from .notion.connector import notion_source_entry add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry) @@ -95,3 +97,5 @@ add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry) add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry) 
+add_source_entry(source_type=NOTION_CONNECTOR_TYPE, entry=notion_source_entry) + diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index b0bc22a8a..862abd884 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -9,12 +9,12 @@ from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint from notion_client.errors import RequestTimeoutError -from unstructured_ingest.connector.notion.types.block import Block -from unstructured_ingest.connector.notion.types.database import Database -from unstructured_ingest.connector.notion.types.database_properties import ( +from unstructured_ingest.v2.processes.connectors.notion.types.block import Block +from unstructured_ingest.v2.processes.connectors.notion.types.database import Database +from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import ( map_cells, ) -from unstructured_ingest.connector.notion.types.page import Page +from unstructured_ingest.v2.processes.connectors.notion.types.page import Page from unstructured_ingest.ingest_backoff import RetryHandler from unstructured_ingest.interfaces import RetryStrategyConfig from unstructured_ingest.utils.dep_check import requires_dependencies diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 6beea2f5e..dd798a1f2 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -1,15 +1,13 @@ -import json -import logging -from dataclasses import dataclass, field +from dataclasses import dataclass from time import time -from typing import TYPE_CHECKING, Any, Generator, List, Optional, Set, Tuple +from typing import Any, Generator, List, Optional, Set, Tuple -from pydantic import 
BaseModel, Field, SecretStr +from pydantic import Field, SecretStr from unstructured_ingest.error import SourceConnectionError from unstructured_ingest.utils.dep_check import requires_dependencies from unstructured_ingest.v2.interfaces import ( - ConnectionConfig, + AccessConfig, Downloader, DownloaderConfig, DownloadResponse, @@ -21,15 +19,17 @@ ) from unstructured_ingest.v2.logger import logger from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry - -if TYPE_CHECKING: - from unstructured_ingest.connector.notion.client import Client as NotionClient +from unstructured_ingest.v2.processes.connectors.notion.client import Client as NotionClient +from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html +from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_page_html +from unstructured_ingest.v2.processes.connectors.notion.helpers import get_recursive_content_from_database +from unstructured_ingest.v2.processes.connectors.notion.helpers import get_recursive_content_from_page NOTION_API_VERSION = "2022-06-28" CONNECTOR_TYPE = "notion" -class NotionConnectionConfig(ConnectionConfig): +class NotionConnectionConfig(AccessConfig): notion_api_key: SecretStr = Field(description="Notion API key") @@ -49,12 +49,10 @@ class NotionIndexerConfig(IndexerConfig): @dataclass class NotionIndexer(Indexer): connection_config: NotionConnectionConfig - indexer_config: NotionIndexerConfig + index_config: NotionIndexerConfig @requires_dependencies(["notion_client"], extras="notion") def get_client(self) -> "NotionClient": - from unstructured_ingest.connector.notion.client import Client as NotionClient - return NotionClient( notion_version=NOTION_API_VERSION, auth=self.connection_config.notion_api_key.get_secret_value(), @@ -80,8 +78,8 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: processed_pages: Set[str] = set() processed_databases: Set[str] = set() - pages_to_process: Set[str] = 
set(self.indexer_config.page_ids or []) - databases_to_process: Set[str] = set(self.indexer_config.database_ids or []) + pages_to_process: Set[str] = set(self.index_config.page_ids or []) + databases_to_process: Set[str] = set(self.index_config.database_ids or []) while pages_to_process or databases_to_process: # Process pages @@ -95,7 +93,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: if file_data: yield file_data - if self.indexer_config.recursive: + if self.index_config.recursive: child_pages, child_databases = self.get_child_pages_and_databases( page_id=page_id, client=client, @@ -116,7 +114,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: ) if file_data: yield file_data - if self.indexer_config.recursive: + if self.index_config.recursive: ( child_pages, child_databases, @@ -131,7 +129,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: @requires_dependencies(["notion_client"], extras="notion") def get_page_file_data( - self, page_id: str, client: "NotionClient" + self, page_id: str, client: "NotionClient" ) -> Optional[FileData]: try: page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore @@ -163,7 +161,7 @@ def get_page_file_data( @requires_dependencies(["notion_client"], extras="notion") def get_database_file_data( - self, database_id: str, client: "NotionClient" + self, database_id: str, client: "NotionClient" ) -> Optional[FileData]: try: database_metadata = client.databases.retrieve( @@ -196,16 +194,12 @@ def get_database_file_data( return None def get_child_pages_and_databases( - self, - page_id: str, - client: "NotionClient", - processed_pages: Set[str], - processed_databases: Set[str], + self, + page_id: str, + client: "NotionClient", + processed_pages: Set[str], + processed_databases: Set[str], ) -> Tuple[Set[str], Set[str]]: - from unstructured_ingest.connector.notion.helpers import ( - get_recursive_content_from_page, - ) - child_content = get_recursive_content_from_page( 
client=client, page_id=page_id, @@ -216,16 +210,12 @@ def get_child_pages_and_databases( return child_pages, child_databases def get_child_pages_and_databases_from_database( - self, - database_id: str, - client: "NotionClient", - processed_pages: Set[str], - processed_databases: Set[str], + self, + database_id: str, + client: "NotionClient", + processed_pages: Set[str], + processed_databases: Set[str], ) -> Tuple[Set[str], Set[str]]: - from unstructured_ingest.connector.notion.helpers import ( - get_recursive_content_from_database, - ) - child_content = get_recursive_content_from_database( client=client, database_id=database_id, @@ -236,7 +226,7 @@ def get_child_pages_and_databases_from_database( return child_pages, child_databases -@dataclass +# @dataclass class NotionDownloaderConfig(DownloaderConfig): pass @@ -249,8 +239,6 @@ class NotionDownloader(Downloader): @requires_dependencies(["notion_client"], extras="notion") def get_client(self) -> "NotionClient": - from unstructured_ingest.connector.notion.client import Client as NotionClient - return NotionClient( notion_version=NOTION_API_VERSION, auth=self.connection_config.notion_api_key.get_secret_value(), @@ -278,7 +266,6 @@ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: raise ValueError("Invalid record_locator in file_data") def download_page(self, client: "NotionClient", page_id: str, file_data: FileData) -> DownloadResponse: - from unstructured_ingest.connector.notion.helpers import extract_page_html try: text_extraction = extract_page_html( @@ -302,8 +289,6 @@ def download_page(self, client: "NotionClient", page_id: str, file_data: FileDat return None def download_database(self, client: "NotionClient", database_id: str, file_data: FileData) -> DownloadResponse: - from unstructured_ingest.connector.notion.helpers import extract_database_html - try: text_extraction = extract_database_html( client=client, @@ -327,7 +312,6 @@ def download_database(self, client: "NotionClient", 
database_id: str, file_data: notion_source_entry = SourceRegistryEntry( - connector_type=CONNECTOR_TYPE, connection_config=NotionConnectionConfig, indexer_config=NotionIndexerConfig, indexer=NotionIndexer, diff --git a/unstructured_ingest/v2/processes/connectors/notion/helpers.py b/unstructured_ingest/v2/processes/connectors/notion/helpers.py index b12a60fc6..f1b78ee31 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/helpers.py +++ b/unstructured_ingest/v2/processes/connectors/notion/helpers.py @@ -22,11 +22,11 @@ ) from notion_client.errors import APIResponseError -import unstructured_ingest.connector.notion.types.blocks as notion_blocks -from unstructured_ingest.connector.notion.client import Client -from unstructured_ingest.connector.notion.interfaces import BlockBase -from unstructured_ingest.connector.notion.types.block import Block -from unstructured_ingest.connector.notion.types.database import Database +import unstructured_ingest.v2.processes.connectors.notion.types.blocks as notion_blocks +from unstructured_ingest.v2.processes.connectors.notion.client import Client +from unstructured_ingest.v2.processes.connectors.notion.interfaces import BlockBase +from unstructured_ingest.v2.processes.connectors.notion.types.block import Block +from unstructured_ingest.v2.processes.connectors.notion.types.database import Database @dataclass diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/block.py b/unstructured_ingest/v2/processes/connectors/notion/types/block.py index 1661763ce..f8b237ee3 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/block.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/block.py @@ -4,14 +4,14 @@ from htmlBuilder.tags import HtmlTag -from unstructured_ingest.connector.notion.interfaces import ( +from unstructured_ingest.v2.processes.connectors.notion.interfaces import ( BlockBase, FromJSONMixin, GetHTMLMixin, ) -from unstructured_ingest.connector.notion.types import blocks 
-from unstructured_ingest.connector.notion.types.parent import Parent -from unstructured_ingest.connector.notion.types.user import PartialUser +from unstructured_ingest.v2.processes.connectors.notion.types import blocks +from unstructured_ingest.v2.processes.connectors.notion.types.parent import Parent +from unstructured_ingest.v2.processes.connectors.notion.types.user import PartialUser block_type_mapping = { "bookmark": blocks.Bookmark, @@ -63,6 +63,7 @@ class Block(FromJSONMixin, GetHTMLMixin): block: BlockBase object: str = "block" request_id: Optional[str] = None + #in_trash: bool def __repr__(self): return f"{self.__class__.__name__}(id={self.id}, type={self.type})" @@ -74,6 +75,7 @@ def from_dict(cls, data: dict): created_by = data.pop("created_by") last_edited_by = data.pop("last_edited_by") parent = data.pop("parent") + in_trash = data.pop("in_trash") try: block = cls( created_by=PartialUser.from_dict(created_by), diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database.py b/unstructured_ingest/v2/processes/connectors/notion/types/database.py index 1c1366830..df4bea36c 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/database.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database.py @@ -4,18 +4,18 @@ from htmlBuilder.tags import Div, HtmlTag, Span -from unstructured_ingest.connector.notion.interfaces import ( +from unstructured_ingest.v2.processes.connectors.notion.interfaces import ( DBPropertyBase, FromJSONMixin, GetHTMLMixin, ) -from unstructured_ingest.connector.notion.types.database_properties import ( +from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import ( map_properties, ) -from unstructured_ingest.connector.notion.types.file import FileObject -from unstructured_ingest.connector.notion.types.parent import Parent -from unstructured_ingest.connector.notion.types.rich_text import RichText -from unstructured_ingest.connector.notion.types.user import 
PartialUser +from unstructured_ingest.v2.processes.connectors.notion.types.file import FileObject +from unstructured_ingest.v2.processes.connectors.notion.types.parent import Parent +from unstructured_ingest.v2.processes.connectors.notion.types.rich_text import RichText +from unstructured_ingest.v2.processes.connectors.notion.types.user import PartialUser @dataclass diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/date.py b/unstructured_ingest/v2/processes/connectors/notion/types/date.py index 17564e2b5..de1408a3b 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/date.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/date.py @@ -4,7 +4,7 @@ from htmlBuilder.tags import Div, HtmlTag -from unstructured_ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin +from unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin, GetHTMLMixin @dataclass diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/file.py b/unstructured_ingest/v2/processes/connectors/notion/types/file.py index a7cd51ea8..23c611e7d 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/file.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/file.py @@ -5,7 +5,7 @@ from htmlBuilder.attributes import Href from htmlBuilder.tags import A, HtmlTag -from unstructured_ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin +from unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin, GetHTMLMixin @dataclass diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/page.py b/unstructured_ingest/v2/processes/connectors/notion/types/page.py index d51ccfb50..497890dbb 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/page.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/page.py @@ -2,10 +2,10 @@ from dataclasses import dataclass from typing import Optional -from 
unstructured_ingest.connector.notion.interfaces import FromJSONMixin -from unstructured_ingest.connector.notion.types.file import FileObject -from unstructured_ingest.connector.notion.types.parent import Parent -from unstructured_ingest.connector.notion.types.user import PartialUser +from unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin +from unstructured_ingest.v2.processes.connectors.notion.types.file import FileObject +from unstructured_ingest.v2.processes.connectors.notion.types.parent import Parent +from unstructured_ingest.v2.processes.connectors.notion.types.user import PartialUser @dataclass @@ -16,6 +16,7 @@ class Page(FromJSONMixin): last_edited_time: str last_edited_by: PartialUser archived: bool + in_trash: bool properties: dict parent: Parent url: str diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/parent.py b/unstructured_ingest/v2/processes/connectors/notion/types/parent.py index ea2674643..8adf6a39a 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/parent.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/parent.py @@ -1,7 +1,7 @@ # https://developers.notion.com/reference/parent-object from dataclasses import dataclass -from unstructured_ingest.connector.notion.interfaces import FromJSONMixin +from unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin # https://developers.notion.com/reference/parent-object#database-parent diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py b/unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py index 30ea3793d..3764b177c 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py @@ -6,12 +6,12 @@ from htmlBuilder.tags import A, B, Code, Div, HtmlTag, I, S, Span, U from htmlBuilder.tags import Text as HtmlText -from 
unstructured_ingest.connector.notion.interfaces import ( +from unstructured_ingest.v2.processes.connectors.notion.interfaces import ( FromJSONMixin, GetHTMLMixin, ) -from unstructured_ingest.connector.notion.types.date import Date -from unstructured_ingest.connector.notion.types.user import People +from unstructured_ingest.v2.processes.connectors.notion.types.date import Date +from unstructured_ingest.v2.processes.connectors.notion.types.user import People @dataclass diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/user.py b/unstructured_ingest/v2/processes/connectors/notion/types/user.py index 212f5dc41..7d477e120 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/user.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/user.py @@ -5,7 +5,7 @@ from htmlBuilder.attributes import Href from htmlBuilder.tags import A, Div, HtmlTag -from unstructured_ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin +from unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin, GetHTMLMixin @dataclass From 7b93cb6bba002ea74df7f96438a814ea036a3c6e Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 17:15:53 +0100 Subject: [PATCH 03/48] Black format and changelog add --- CHANGELOG.md | 6 + requirements/common/base.txt | 62 --- requirements/connectors/airtable.txt | 30 -- requirements/connectors/astradb.txt | 66 --- .../connectors/azure-cognitive-search.txt | 28 -- requirements/connectors/azure.txt | 89 ---- requirements/connectors/biomed.txt | 18 - requirements/connectors/box.txt | 42 -- requirements/connectors/chroma.txt | 230 ---------- requirements/connectors/clarifai.txt | 68 --- requirements/connectors/confluence.txt | 39 -- requirements/connectors/couchbase.txt | 4 - .../connectors/databricks-volumes.txt | 28 -- requirements/connectors/delta-table.txt | 14 - requirements/connectors/discord.txt | 28 -- requirements/connectors/dropbox.txt | 33 -- 
requirements/connectors/elasticsearch.txt | 36 -- requirements/connectors/gcs.txt | 104 ----- requirements/connectors/github.txt | 39 -- requirements/connectors/gitlab.txt | 20 - requirements/connectors/google-drive.txt | 53 --- requirements/connectors/hubspot.txt | 17 - requirements/connectors/jira.txt | 38 -- requirements/connectors/kafka.txt | 4 - requirements/connectors/kdbai.txt | 39 -- requirements/connectors/milvus.txt | 44 -- requirements/connectors/mongodb.txt | 6 - requirements/connectors/notion.txt | 34 -- requirements/connectors/onedrive.txt | 42 -- requirements/connectors/opensearch.txt | 25 -- requirements/connectors/outlook.txt | 36 -- requirements/connectors/pinecone.txt | 20 - requirements/connectors/postgres.txt | 4 - requirements/connectors/qdrant.txt | 66 --- requirements/connectors/reddit.txt | 24 -- requirements/connectors/s3.txt | 59 --- requirements/connectors/salesforce.txt | 50 --- requirements/connectors/sftp.txt | 20 - requirements/connectors/sharepoint.txt | 36 -- requirements/connectors/singlestore.txt | 40 -- requirements/connectors/slack.txt | 4 - requirements/connectors/vectara.txt | 14 - requirements/connectors/weaviate.txt | 74 ---- requirements/connectors/wikipedia.txt | 20 - requirements/embed/aws-bedrock.txt | 23 - requirements/embed/huggingface.txt | 91 ---- requirements/embed/mixedbreadai.txt | 38 -- requirements/embed/octoai.txt | 61 --- requirements/embed/openai.txt | 61 --- requirements/embed/vertexai.txt | 122 ------ requirements/embed/voyageai.txt | 48 --- requirements/lint.txt | 49 --- requirements/local_partition/doc.txt | 161 ------- requirements/local_partition/docx.txt | 161 ------- requirements/local_partition/epub.txt | 158 ------- requirements/local_partition/image.txt | 395 ------------------ requirements/local_partition/md.txt | 158 ------- requirements/local_partition/msg.txt | 156 ------- requirements/local_partition/odt.txt | 163 -------- requirements/local_partition/org.txt | 158 ------- 
requirements/local_partition/pdf.txt | 395 ------------------ requirements/local_partition/ppt.txt | 165 -------- requirements/local_partition/pptx.txt | 165 -------- requirements/local_partition/rst.txt | 158 ------- requirements/local_partition/rtf.txt | 158 ------- requirements/local_partition/tsv.txt | 165 -------- requirements/local_partition/xlsx.txt | 173 -------- requirements/release.txt | 42 -- requirements/remote/client.txt | 94 ----- requirements/test.txt | 282 ------------- test_e2e/src/notion.sh | 1 - .../v2/processes/connectors/__init__.py | 1 - .../processes/connectors/notion/connector.py | 50 +-- .../connectors/notion/types/block.py | 2 +- .../processes/connectors/notion/types/date.py | 5 +- .../processes/connectors/notion/types/file.py | 5 +- .../processes/connectors/notion/types/user.py | 5 +- 77 files changed, 45 insertions(+), 5577 deletions(-) delete mode 100644 requirements/common/base.txt delete mode 100644 requirements/connectors/airtable.txt delete mode 100644 requirements/connectors/astradb.txt delete mode 100644 requirements/connectors/azure-cognitive-search.txt delete mode 100644 requirements/connectors/azure.txt delete mode 100644 requirements/connectors/biomed.txt delete mode 100644 requirements/connectors/box.txt delete mode 100644 requirements/connectors/chroma.txt delete mode 100644 requirements/connectors/clarifai.txt delete mode 100644 requirements/connectors/confluence.txt delete mode 100644 requirements/connectors/couchbase.txt delete mode 100644 requirements/connectors/databricks-volumes.txt delete mode 100644 requirements/connectors/delta-table.txt delete mode 100644 requirements/connectors/discord.txt delete mode 100644 requirements/connectors/dropbox.txt delete mode 100644 requirements/connectors/elasticsearch.txt delete mode 100644 requirements/connectors/gcs.txt delete mode 100644 requirements/connectors/github.txt delete mode 100644 requirements/connectors/gitlab.txt delete mode 100644 
requirements/connectors/google-drive.txt delete mode 100644 requirements/connectors/hubspot.txt delete mode 100644 requirements/connectors/jira.txt delete mode 100644 requirements/connectors/kafka.txt delete mode 100644 requirements/connectors/kdbai.txt delete mode 100644 requirements/connectors/milvus.txt delete mode 100644 requirements/connectors/mongodb.txt delete mode 100644 requirements/connectors/notion.txt delete mode 100644 requirements/connectors/onedrive.txt delete mode 100644 requirements/connectors/opensearch.txt delete mode 100644 requirements/connectors/outlook.txt delete mode 100644 requirements/connectors/pinecone.txt delete mode 100644 requirements/connectors/postgres.txt delete mode 100644 requirements/connectors/qdrant.txt delete mode 100644 requirements/connectors/reddit.txt delete mode 100644 requirements/connectors/s3.txt delete mode 100644 requirements/connectors/salesforce.txt delete mode 100644 requirements/connectors/sftp.txt delete mode 100644 requirements/connectors/sharepoint.txt delete mode 100644 requirements/connectors/singlestore.txt delete mode 100644 requirements/connectors/slack.txt delete mode 100644 requirements/connectors/vectara.txt delete mode 100644 requirements/connectors/weaviate.txt delete mode 100644 requirements/connectors/wikipedia.txt delete mode 100644 requirements/embed/aws-bedrock.txt delete mode 100644 requirements/embed/huggingface.txt delete mode 100644 requirements/embed/mixedbreadai.txt delete mode 100644 requirements/embed/octoai.txt delete mode 100644 requirements/embed/openai.txt delete mode 100644 requirements/embed/vertexai.txt delete mode 100644 requirements/embed/voyageai.txt delete mode 100644 requirements/lint.txt delete mode 100644 requirements/local_partition/doc.txt delete mode 100644 requirements/local_partition/docx.txt delete mode 100644 requirements/local_partition/epub.txt delete mode 100644 requirements/local_partition/image.txt delete mode 100644 requirements/local_partition/md.txt delete 
mode 100644 requirements/local_partition/msg.txt delete mode 100644 requirements/local_partition/odt.txt delete mode 100644 requirements/local_partition/org.txt delete mode 100644 requirements/local_partition/pdf.txt delete mode 100644 requirements/local_partition/ppt.txt delete mode 100644 requirements/local_partition/pptx.txt delete mode 100644 requirements/local_partition/rst.txt delete mode 100644 requirements/local_partition/rtf.txt delete mode 100644 requirements/local_partition/tsv.txt delete mode 100644 requirements/local_partition/xlsx.txt delete mode 100644 requirements/release.txt delete mode 100644 requirements/remote/client.txt delete mode 100644 requirements/test.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index e90454685..cadbc44e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.0.25 + +### Enhancements + +* **Migration Notion to v2** + ## 0.0.24 ### Enhancements diff --git a/requirements/common/base.txt b/requirements/common/base.txt deleted file mode 100644 index cdd9f97a2..000000000 --- a/requirements/common/base.txt +++ /dev/null @@ -1,62 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./common/base.in --output-file ./common/base.txt --no-strip-extras --python-version 3.9 -annotated-types==0.7.0 - # via pydantic -click==8.1.7 - # via -r ./common/base.in -dataclasses-json==0.6.7 - # via -r ./common/base.in -deprecated==1.2.14 - # via opentelemetry-api -marshmallow==3.22.0 - # via dataclasses-json -mypy-extensions==1.0.0 - # via typing-inspect -numpy==1.26.4 - # via - # -c ./common/constraints.txt - # pandas -opentelemetry-api==1.16.0 - # via opentelemetry-sdk -opentelemetry-sdk==1.16.0 - # via -r ./common/base.in -opentelemetry-semantic-conventions==0.37b0 - # via opentelemetry-sdk -packaging==23.2 - # via - # -c ./common/constraints.txt - # marshmallow -pandas==2.2.3 - # via -r ./common/base.in -pydantic==2.9.2 - # via -r ./common/base.in -pydantic-core==2.23.4 - # via pydantic 
-python-dateutil==2.9.0.post0 - # via - # -r ./common/base.in - # pandas -pytz==2024.2 - # via pandas -setuptools==75.1.0 - # via - # opentelemetry-api - # opentelemetry-sdk -six==1.16.0 - # via python-dateutil -tqdm==4.66.5 - # via -r ./common/base.in -typing-extensions==4.12.2 - # via - # opentelemetry-sdk - # pydantic - # pydantic-core - # typing-inspect -typing-inspect==0.9.0 - # via dataclasses-json -tzdata==2024.2 - # via pandas -wrapt==1.16.0 - # via - # -c ./common/constraints.txt - # deprecated diff --git a/requirements/connectors/airtable.txt b/requirements/connectors/airtable.txt deleted file mode 100644 index 421e1ee85..000000000 --- a/requirements/connectors/airtable.txt +++ /dev/null @@ -1,30 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/airtable.in --output-file ./connectors/airtable.txt --no-strip-extras --python-version 3.9 -annotated-types==0.7.0 - # via pydantic -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.10 - # via requests -inflection==0.5.1 - # via pyairtable -pyairtable==2.3.3 - # via -r ./connectors/airtable.in -pydantic==2.9.2 - # via pyairtable -pydantic-core==2.23.4 - # via pydantic -requests==2.32.3 - # via pyairtable -typing-extensions==4.12.2 - # via - # pyairtable - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # pyairtable - # requests diff --git a/requirements/connectors/astradb.txt b/requirements/connectors/astradb.txt deleted file mode 100644 index e1b92486d..000000000 --- a/requirements/connectors/astradb.txt +++ /dev/null @@ -1,66 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/astradb.in --output-file ./connectors/astradb.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -astrapy==1.5.0 - # via -r ./connectors/astradb.in -cassandra-driver==3.29.2 - # via cassio -cassio==0.1.9 - # via astrapy 
-certifi==2024.8.30 - # via - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via requests -click==8.1.7 - # via geomet -deprecation==2.1.0 - # via astrapy -dnspython==2.6.1 - # via pymongo -exceptiongroup==1.2.2 - # via anyio -geomet==0.2.1.post1 - # via cassandra-driver -h11==0.14.0 - # via httpcore -h2==4.1.0 - # via httpx -hpack==4.0.0 - # via h2 -httpcore==1.0.6 - # via httpx -httpx[http2]==0.27.2 - # via astrapy -hyperframe==6.0.1 - # via h2 -idna==3.10 - # via - # anyio - # httpx - # requests -numpy==2.0.2 - # via cassio -packaging==24.1 - # via deprecation -pymongo==4.10.1 - # via astrapy -requests==2.32.3 - # via cassio -six==1.16.0 - # via geomet -sniffio==1.3.1 - # via - # anyio - # httpx -toml==0.10.2 - # via astrapy -typing-extensions==4.12.2 - # via anyio -urllib3==2.2.3 - # via requests -uuid6==2024.7.10 - # via astrapy diff --git a/requirements/connectors/azure-cognitive-search.txt b/requirements/connectors/azure-cognitive-search.txt deleted file mode 100644 index fd7907f55..000000000 --- a/requirements/connectors/azure-cognitive-search.txt +++ /dev/null @@ -1,28 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/azure-cognitive-search.in --output-file ./connectors/azure-cognitive-search.txt --no-strip-extras --python-version 3.9 -azure-common==1.1.28 - # via azure-search-documents -azure-core==1.31.0 - # via azure-search-documents -azure-search-documents==11.5.1 - # via -r ./connectors/azure-cognitive-search.in -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.10 - # via requests -isodate==0.6.1 - # via azure-search-documents -requests==2.32.3 - # via azure-core -six==1.16.0 - # via - # azure-core - # isodate -typing-extensions==4.12.2 - # via - # azure-core - # azure-search-documents -urllib3==2.2.3 - # via requests diff --git a/requirements/connectors/azure.txt b/requirements/connectors/azure.txt deleted file mode 100644 index 
bc1b4c7af..000000000 --- a/requirements/connectors/azure.txt +++ /dev/null @@ -1,89 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/azure.in --output-file ./connectors/azure.txt --no-strip-extras --python-version 3.9 -adlfs==2024.7.0 - # via -r ./connectors/azure.in -aiohappyeyeballs==2.4.3 - # via aiohttp -aiohttp==3.10.8 - # via adlfs -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -azure-core==1.31.0 - # via - # adlfs - # azure-identity - # azure-storage-blob -azure-datalake-store==0.0.53 - # via adlfs -azure-identity==1.18.0 - # via adlfs -azure-storage-blob==12.23.1 - # via adlfs -certifi==2024.8.30 - # via requests -cffi==1.17.1 - # via - # azure-datalake-store - # cryptography -charset-normalizer==3.3.2 - # via requests -cryptography==43.0.1 - # via - # azure-identity - # azure-storage-blob - # msal - # pyjwt -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -fsspec==2024.9.0 - # via - # -r ./connectors/azure.in - # adlfs -idna==3.10 - # via - # requests - # yarl -isodate==0.6.1 - # via azure-storage-blob -msal==1.31.0 - # via - # azure-datalake-store - # azure-identity - # msal-extensions -msal-extensions==1.2.0 - # via azure-identity -multidict==6.1.0 - # via - # aiohttp - # yarl -portalocker==2.10.1 - # via msal-extensions -pycparser==2.22 - # via cffi -pyjwt[crypto]==2.9.0 - # via msal -requests==2.32.3 - # via - # azure-core - # azure-datalake-store - # msal -six==1.16.0 - # via - # azure-core - # isodate -typing-extensions==4.12.2 - # via - # azure-core - # azure-identity - # azure-storage-blob - # multidict -urllib3==2.2.3 - # via requests -yarl==1.13.1 - # via aiohttp diff --git a/requirements/connectors/biomed.txt b/requirements/connectors/biomed.txt deleted file mode 100644 index 5a86a9eda..000000000 --- a/requirements/connectors/biomed.txt +++ /dev/null @@ -1,18 +0,0 @@ -# This file was autogenerated by uv via the following command: -# 
uv pip compile ./connectors/biomed.in --output-file ./connectors/biomed.txt --no-strip-extras --python-version 3.9 -beautifulsoup4==4.12.3 - # via bs4 -bs4==0.0.2 - # via -r ./connectors/biomed.in -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.10 - # via requests -requests==2.32.3 - # via -r ./connectors/biomed.in -soupsieve==2.6 - # via beautifulsoup4 -urllib3==2.2.3 - # via requests diff --git a/requirements/connectors/box.txt b/requirements/connectors/box.txt deleted file mode 100644 index 200c94bc2..000000000 --- a/requirements/connectors/box.txt +++ /dev/null @@ -1,42 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/box.in --output-file ./connectors/box.txt --no-strip-extras --python-version 3.9 -attrs==24.2.0 - # via boxsdk -boxfs==0.3.0 - # via -r ./connectors/box.in -boxsdk[jwt]==3.13.0 - # via boxfs -certifi==2024.8.30 - # via requests -cffi==1.17.1 - # via cryptography -charset-normalizer==3.3.2 - # via requests -cryptography==43.0.1 - # via boxsdk -fsspec==2024.5.0 - # via - # -c ./connectors/../common/constraints.txt - # -r ./connectors/box.in - # boxfs -idna==3.10 - # via requests -pycparser==2.22 - # via cffi -pyjwt==2.9.0 - # via boxsdk -python-dateutil==2.9.0.post0 - # via boxsdk -requests==2.32.3 - # via - # boxsdk - # requests-toolbelt -requests-toolbelt==1.0.0 - # via boxsdk -six==1.16.0 - # via python-dateutil -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # boxsdk - # requests diff --git a/requirements/connectors/chroma.txt b/requirements/connectors/chroma.txt deleted file mode 100644 index 38effc76e..000000000 --- a/requirements/connectors/chroma.txt +++ /dev/null @@ -1,230 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/chroma.in --output-file ./connectors/chroma.txt --no-strip-extras --python-version 3.9 -annotated-types==0.7.0 - # via pydantic -anyio==4.6.0 - # via 
- # starlette - # watchfiles -backoff==2.2.1 - # via - # opentelemetry-exporter-otlp-proto-grpc - # posthog -bcrypt==4.2.0 - # via chromadb -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via - # kubernetes - # pulsar-client - # requests -charset-normalizer==3.3.2 - # via requests -chroma-hnswlib==0.7.3 - # via chromadb -chromadb==0.4.17 - # via -r ./connectors/chroma.in -click==8.1.7 - # via - # typer - # uvicorn -coloredlogs==15.0.1 - # via onnxruntime -deprecated==1.2.14 - # via opentelemetry-api -durationpy==0.8 - # via kubernetes -exceptiongroup==1.2.2 - # via anyio -fastapi==0.115.0 - # via chromadb -filelock==3.16.1 - # via huggingface-hub -flatbuffers==24.3.25 - # via onnxruntime -fsspec==2024.5.0 - # via - # -c ./connectors/../common/constraints.txt - # huggingface-hub -google-auth==2.35.0 - # via kubernetes -googleapis-common-protos==1.65.0 - # via opentelemetry-exporter-otlp-proto-grpc -grpcio==1.66.2 - # via - # -c ./connectors/../common/constraints.txt - # chromadb - # opentelemetry-exporter-otlp-proto-grpc -h11==0.14.0 - # via uvicorn -httptools==0.6.1 - # via uvicorn -huggingface-hub==0.25.1 - # via tokenizers -humanfriendly==10.0 - # via coloredlogs -idna==3.10 - # via - # anyio - # requests -importlib-resources==6.4.5 - # via chromadb -kubernetes==31.0.0 - # via chromadb -markdown-it-py==3.0.0 - # via rich -mdurl==0.1.2 - # via markdown-it-py -monotonic==1.6 - # via posthog -mpmath==1.3.0 - # via sympy -numpy==1.26.4 - # via - # -c ./connectors/../common/constraints.txt - # chroma-hnswlib - # chromadb - # onnxruntime -oauthlib==3.2.2 - # via - # kubernetes - # requests-oauthlib -onnxruntime==1.19.2 - # via chromadb -opentelemetry-api==1.16.0 - # via - # chromadb - # opentelemetry-exporter-otlp-proto-grpc - # opentelemetry-sdk -opentelemetry-exporter-otlp-proto-grpc==1.16.0 - # via chromadb -opentelemetry-proto==1.16.0 - # via opentelemetry-exporter-otlp-proto-grpc -opentelemetry-sdk==1.16.0 - # via - # chromadb - # 
opentelemetry-exporter-otlp-proto-grpc -opentelemetry-semantic-conventions==0.37b0 - # via opentelemetry-sdk -overrides==7.7.0 - # via chromadb -packaging==23.2 - # via - # -c ./connectors/../common/constraints.txt - # huggingface-hub - # onnxruntime -posthog==3.6.6 - # via chromadb -protobuf==4.23.4 - # via - # -c ./connectors/../common/constraints.txt - # googleapis-common-protos - # onnxruntime - # opentelemetry-proto -pulsar-client==3.5.0 - # via chromadb -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pydantic==2.9.2 - # via - # chromadb - # fastapi -pydantic-core==2.23.4 - # via pydantic -pygments==2.18.0 - # via rich -pypika==0.48.9 - # via chromadb -python-dateutil==2.9.0.post0 - # via - # kubernetes - # posthog -python-dotenv==1.0.1 - # via uvicorn -pyyaml==6.0.2 - # via - # chromadb - # huggingface-hub - # kubernetes - # uvicorn -requests==2.32.3 - # via - # chromadb - # huggingface-hub - # kubernetes - # posthog - # requests-oauthlib -requests-oauthlib==2.0.0 - # via kubernetes -rich==13.9.1 - # via typer -rsa==4.9 - # via google-auth -setuptools==75.1.0 - # via - # opentelemetry-api - # opentelemetry-sdk -shellingham==1.5.4 - # via typer -six==1.16.0 - # via - # kubernetes - # posthog - # python-dateutil -sniffio==1.3.1 - # via anyio -starlette==0.38.6 - # via fastapi -sympy==1.13.3 - # via onnxruntime -tenacity==9.0.0 - # via chromadb -tokenizers==0.19.1 - # via - # -c ./connectors/../common/constraints.txt - # chromadb -tqdm==4.66.5 - # via - # chromadb - # huggingface-hub -typer==0.12.5 - # via chromadb -typing-extensions==4.12.2 - # via - # anyio - # chromadb - # fastapi - # huggingface-hub - # opentelemetry-sdk - # pydantic - # pydantic-core - # rich - # starlette - # typer - # uvicorn -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # kubernetes - # requests -uvicorn[standard]==0.31.0 - # via chromadb -uvloop==0.20.0 - # via uvicorn -watchfiles==0.24.0 - # via uvicorn 
-websocket-client==1.8.0 - # via kubernetes -websockets==13.1 - # via uvicorn -wrapt==1.16.0 - # via - # -c ./connectors/../common/constraints.txt - # deprecated -zipp==3.20.2 - # via importlib-resources diff --git a/requirements/connectors/clarifai.txt b/requirements/connectors/clarifai.txt deleted file mode 100644 index f11740c32..000000000 --- a/requirements/connectors/clarifai.txt +++ /dev/null @@ -1,68 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/clarifai.in --output-file ./connectors/clarifai.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -clarifai==10.7.0 - # via -r ./connectors/clarifai.in -clarifai-grpc==10.7.1 - # via clarifai -contextlib2==21.6.0 - # via schema -googleapis-common-protos==1.65.0 - # via clarifai-grpc -grpcio==1.66.2 - # via - # -c ./connectors/../common/constraints.txt - # clarifai-grpc -idna==3.10 - # via requests -inquirerpy==0.3.4 - # via clarifai -markdown-it-py==3.0.0 - # via rich -mdurl==0.1.2 - # via markdown-it-py -numpy==1.26.4 - # via - # -c ./connectors/../common/constraints.txt - # clarifai - # tritonclient -pfzy==0.3.4 - # via inquirerpy -pillow==10.4.0 - # via clarifai -prompt-toolkit==3.0.48 - # via inquirerpy -protobuf==4.23.4 - # via - # -c ./connectors/../common/constraints.txt - # clarifai-grpc - # googleapis-common-protos -pygments==2.18.0 - # via rich -python-rapidjson==1.20 - # via tritonclient -pyyaml==6.0.2 - # via clarifai -requests==2.32.3 - # via clarifai-grpc -rich==13.9.1 - # via clarifai -schema==0.7.5 - # via clarifai -tabulate==0.9.0 - # via clarifai -tqdm==4.66.5 - # via clarifai -tritonclient==2.41.1 - # via clarifai -typing-extensions==4.12.2 - # via rich -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests -wcwidth==0.2.13 - # via prompt-toolkit diff --git a/requirements/connectors/confluence.txt b/requirements/connectors/confluence.txt 
deleted file mode 100644 index 3a97ca212..000000000 --- a/requirements/connectors/confluence.txt +++ /dev/null @@ -1,39 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/confluence.in --output-file ./connectors/confluence.txt --no-strip-extras --python-version 3.9 -atlassian-python-api==3.41.16 - # via -r ./connectors/confluence.in -beautifulsoup4==4.12.3 - # via atlassian-python-api -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -deprecated==1.2.14 - # via atlassian-python-api -idna==3.10 - # via requests -jmespath==1.0.1 - # via atlassian-python-api -oauthlib==3.2.2 - # via - # atlassian-python-api - # requests-oauthlib -requests==2.32.3 - # via - # -r ./connectors/confluence.in - # atlassian-python-api - # requests-oauthlib -requests-oauthlib==2.0.0 - # via atlassian-python-api -six==1.16.0 - # via atlassian-python-api -soupsieve==2.6 - # via beautifulsoup4 -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests -wrapt==1.16.0 - # via - # -c ./connectors/../common/constraints.txt - # deprecated diff --git a/requirements/connectors/couchbase.txt b/requirements/connectors/couchbase.txt deleted file mode 100644 index 4ceb6998b..000000000 --- a/requirements/connectors/couchbase.txt +++ /dev/null @@ -1,4 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/couchbase.in --output-file ./connectors/couchbase.txt --no-strip-extras --python-version 3.9 -couchbase==4.3.2 - # via -r ./connectors/couchbase.in diff --git a/requirements/connectors/databricks-volumes.txt b/requirements/connectors/databricks-volumes.txt deleted file mode 100644 index c01a1b236..000000000 --- a/requirements/connectors/databricks-volumes.txt +++ /dev/null @@ -1,28 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/databricks-volumes.in --output-file 
./connectors/databricks-volumes.txt --no-strip-extras --python-version 3.9 -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -databricks-sdk==0.33.0 - # via -r ./connectors/databricks-volumes.in -google-auth==2.35.0 - # via databricks-sdk -idna==3.10 - # via requests -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -requests==2.32.3 - # via databricks-sdk -rsa==4.9 - # via google-auth -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests diff --git a/requirements/connectors/delta-table.txt b/requirements/connectors/delta-table.txt deleted file mode 100644 index 05d144877..000000000 --- a/requirements/connectors/delta-table.txt +++ /dev/null @@ -1,14 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/delta-table.in --output-file ./connectors/delta-table.txt --no-strip-extras --python-version 3.9 -deltalake==0.20.1 - # via -r ./connectors/delta-table.in -fsspec==2024.5.0 - # via - # -c ./connectors/../common/constraints.txt - # -r ./connectors/delta-table.in -numpy==1.26.4 - # via - # -c ./connectors/../common/constraints.txt - # pyarrow -pyarrow==17.0.0 - # via deltalake diff --git a/requirements/connectors/discord.txt b/requirements/connectors/discord.txt deleted file mode 100644 index 358812feb..000000000 --- a/requirements/connectors/discord.txt +++ /dev/null @@ -1,28 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/discord.in --output-file ./connectors/discord.txt --no-strip-extras --python-version 3.9 -aiohappyeyeballs==2.4.3 - # via aiohttp -aiohttp==3.10.8 - # via discord-py -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -discord-py==2.4.0 - # via -r ./connectors/discord.in -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -idna==3.10 - # via yarl 
-multidict==6.1.0 - # via - # aiohttp - # yarl -typing-extensions==4.12.2 - # via multidict -yarl==1.13.1 - # via aiohttp diff --git a/requirements/connectors/dropbox.txt b/requirements/connectors/dropbox.txt deleted file mode 100644 index 41f70edd6..000000000 --- a/requirements/connectors/dropbox.txt +++ /dev/null @@ -1,33 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/dropbox.in --output-file ./connectors/dropbox.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -dropbox==12.0.2 - # via dropboxdrivefs -dropboxdrivefs==1.4.1 - # via -r ./connectors/dropbox.in -fsspec==2024.5.0 - # via - # -c ./connectors/../common/constraints.txt - # -r ./connectors/dropbox.in - # dropboxdrivefs -idna==3.10 - # via requests -ply==3.11 - # via stone -requests==2.32.3 - # via - # dropbox - # dropboxdrivefs -six==1.16.0 - # via - # dropbox - # stone -stone==3.3.1 - # via dropbox -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests diff --git a/requirements/connectors/elasticsearch.txt b/requirements/connectors/elasticsearch.txt deleted file mode 100644 index 223f74b5c..000000000 --- a/requirements/connectors/elasticsearch.txt +++ /dev/null @@ -1,36 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/elasticsearch.in --output-file ./connectors/elasticsearch.txt --no-strip-extras --python-version 3.9 -aiohappyeyeballs==2.4.3 - # via aiohttp -aiohttp==3.10.8 - # via elasticsearch -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -certifi==2024.8.30 - # via elastic-transport -elastic-transport==8.15.0 - # via elasticsearch -elasticsearch[async]==8.15.1 - # via -r ./connectors/elasticsearch.in -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -idna==3.10 - # via yarl -multidict==6.1.0 - # via - # aiohttp - # yarl 
-typing-extensions==4.12.2 - # via multidict -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # elastic-transport -yarl==1.13.1 - # via aiohttp diff --git a/requirements/connectors/gcs.txt b/requirements/connectors/gcs.txt deleted file mode 100644 index 29db8779d..000000000 --- a/requirements/connectors/gcs.txt +++ /dev/null @@ -1,104 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/gcs.in --output-file ./connectors/gcs.txt --no-strip-extras --python-version 3.9 -aiohappyeyeballs==2.4.3 - # via aiohttp -aiohttp==3.10.8 - # via gcsfs -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -beautifulsoup4==4.12.3 - # via bs4 -bs4==0.0.2 - # via -r ./connectors/gcs.in -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -decorator==5.1.1 - # via gcsfs -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -fsspec==2024.5.0 - # via - # -c ./connectors/../common/constraints.txt - # -r ./connectors/gcs.in - # gcsfs -gcsfs==2024.5.0 - # via -r ./connectors/gcs.in -google-api-core==2.20.0 - # via - # google-cloud-core - # google-cloud-storage -google-auth==2.35.0 - # via - # gcsfs - # google-api-core - # google-auth-oauthlib - # google-cloud-core - # google-cloud-storage -google-auth-oauthlib==1.2.1 - # via gcsfs -google-cloud-core==2.4.1 - # via google-cloud-storage -google-cloud-storage==2.18.2 - # via gcsfs -google-crc32c==1.6.0 - # via - # google-cloud-storage - # google-resumable-media -google-resumable-media==2.7.2 - # via google-cloud-storage -googleapis-common-protos==1.65.0 - # via google-api-core -idna==3.10 - # via - # requests - # yarl -multidict==6.1.0 - # via - # aiohttp - # yarl -oauthlib==3.2.2 - # via requests-oauthlib -proto-plus==1.24.0 - # via google-api-core -protobuf==4.23.4 - # via - # -c ./connectors/../common/constraints.txt - # google-api-core - # 
googleapis-common-protos - # proto-plus -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -requests==2.32.3 - # via - # gcsfs - # google-api-core - # google-cloud-storage - # requests-oauthlib -requests-oauthlib==2.0.0 - # via google-auth-oauthlib -rsa==4.9 - # via google-auth -soupsieve==2.6 - # via beautifulsoup4 -typing-extensions==4.12.2 - # via multidict -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests -yarl==1.13.1 - # via aiohttp diff --git a/requirements/connectors/github.txt b/requirements/connectors/github.txt deleted file mode 100644 index d9795f8dd..000000000 --- a/requirements/connectors/github.txt +++ /dev/null @@ -1,39 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/github.in --output-file ./connectors/github.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via requests -cffi==1.17.1 - # via - # cryptography - # pynacl -charset-normalizer==3.3.2 - # via requests -cryptography==43.0.1 - # via pyjwt -deprecated==1.2.14 - # via pygithub -idna==3.10 - # via requests -pycparser==2.22 - # via cffi -pygithub==2.4.0 - # via -r ./connectors/github.in -pyjwt[crypto]==2.9.0 - # via pygithub -pynacl==1.5.0 - # via pygithub -requests==2.32.3 - # via - # -r ./connectors/github.in - # pygithub -typing-extensions==4.12.2 - # via pygithub -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # pygithub - # requests -wrapt==1.16.0 - # via - # -c ./connectors/../common/constraints.txt - # deprecated diff --git a/requirements/connectors/gitlab.txt b/requirements/connectors/gitlab.txt deleted file mode 100644 index 6d12e636c..000000000 --- a/requirements/connectors/gitlab.txt +++ /dev/null @@ -1,20 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/gitlab.in --output-file ./connectors/gitlab.txt --no-strip-extras --python-version 3.9 
-certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.10 - # via requests -python-gitlab==4.12.2 - # via -r ./connectors/gitlab.in -requests==2.32.3 - # via - # python-gitlab - # requests-toolbelt -requests-toolbelt==1.0.0 - # via python-gitlab -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests diff --git a/requirements/connectors/google-drive.txt b/requirements/connectors/google-drive.txt deleted file mode 100644 index d78d467fc..000000000 --- a/requirements/connectors/google-drive.txt +++ /dev/null @@ -1,53 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/google-drive.in --output-file ./connectors/google-drive.txt --no-strip-extras --python-version 3.9 -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -google-api-core==2.20.0 - # via google-api-python-client -google-api-python-client==2.147.0 - # via -r ./connectors/google-drive.in -google-auth==2.35.0 - # via - # google-api-core - # google-api-python-client - # google-auth-httplib2 -google-auth-httplib2==0.2.0 - # via google-api-python-client -googleapis-common-protos==1.65.0 - # via google-api-core -httplib2==0.22.0 - # via - # google-api-python-client - # google-auth-httplib2 -idna==3.10 - # via requests -proto-plus==1.24.0 - # via google-api-core -protobuf==4.23.4 - # via - # -c ./connectors/../common/constraints.txt - # google-api-core - # googleapis-common-protos - # proto-plus -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pyparsing==3.1.4 - # via httplib2 -requests==2.32.3 - # via google-api-core -rsa==4.9 - # via google-auth -uritemplate==4.1.1 - # via google-api-python-client -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests diff --git a/requirements/connectors/hubspot.txt b/requirements/connectors/hubspot.txt deleted file mode 
100644 index 3a6a198ce..000000000 --- a/requirements/connectors/hubspot.txt +++ /dev/null @@ -1,17 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/hubspot.in --output-file ./connectors/hubspot.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via hubspot-api-client -hubspot-api-client==9.0.0 - # via -r ./connectors/hubspot.in -python-dateutil==2.9.0.post0 - # via hubspot-api-client -six==1.16.0 - # via - # hubspot-api-client - # python-dateutil -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # -r ./connectors/hubspot.in - # hubspot-api-client diff --git a/requirements/connectors/jira.txt b/requirements/connectors/jira.txt deleted file mode 100644 index e32aebe77..000000000 --- a/requirements/connectors/jira.txt +++ /dev/null @@ -1,38 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/jira.in --output-file ./connectors/jira.txt --no-strip-extras --python-version 3.9 -atlassian-python-api==3.41.16 - # via -r ./connectors/jira.in -beautifulsoup4==4.12.3 - # via atlassian-python-api -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -deprecated==1.2.14 - # via atlassian-python-api -idna==3.10 - # via requests -jmespath==1.0.1 - # via atlassian-python-api -oauthlib==3.2.2 - # via - # atlassian-python-api - # requests-oauthlib -requests==2.32.3 - # via - # atlassian-python-api - # requests-oauthlib -requests-oauthlib==2.0.0 - # via atlassian-python-api -six==1.16.0 - # via atlassian-python-api -soupsieve==2.6 - # via beautifulsoup4 -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests -wrapt==1.16.0 - # via - # -c ./connectors/../common/constraints.txt - # deprecated diff --git a/requirements/connectors/kafka.txt b/requirements/connectors/kafka.txt deleted file mode 100644 index c2a42a192..000000000 --- a/requirements/connectors/kafka.txt +++ /dev/null @@ -1,4 
+0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/kafka.in --output-file ./connectors/kafka.txt --no-strip-extras --python-version 3.9 -confluent-kafka==2.5.3 - # via -r ./connectors/kafka.in diff --git a/requirements/connectors/kdbai.txt b/requirements/connectors/kdbai.txt deleted file mode 100644 index 479948843..000000000 --- a/requirements/connectors/kdbai.txt +++ /dev/null @@ -1,39 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/kdbai.in --output-file ./connectors/kdbai.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.10 - # via requests -kdbai-client==1.3.0 - # via -r ./connectors/kdbai.in -numpy==1.26.4 - # via - # -c ./connectors/../common/constraints.txt - # pandas - # pykx -pandas==2.2.3 - # via - # kdbai-client - # pykx -pykx==2.3.0 - # via kdbai-client -python-dateutil==2.9.0.post0 - # via pandas -pytz==2024.2 - # via - # pandas - # pykx -requests==2.32.3 - # via kdbai-client -six==1.16.0 - # via python-dateutil -toml==0.10.2 - # via pykx -tzdata==2024.2 - # via pandas -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests diff --git a/requirements/connectors/milvus.txt b/requirements/connectors/milvus.txt deleted file mode 100644 index bbad21125..000000000 --- a/requirements/connectors/milvus.txt +++ /dev/null @@ -1,44 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/milvus.in --output-file ./connectors/milvus.txt --no-strip-extras --python-version 3.9 -environs==9.5.0 - # via pymilvus -grpcio==1.66.2 - # via - # -c ./connectors/../common/constraints.txt - # pymilvus -marshmallow==3.22.0 - # via environs -milvus-lite==2.4.10 - # via pymilvus -numpy==1.26.4 - # via - # -c ./connectors/../common/constraints.txt - # pandas -packaging==23.2 - # via - # -c 
./connectors/../common/constraints.txt - # marshmallow -pandas==2.2.3 - # via pymilvus -protobuf==4.23.4 - # via - # -c ./connectors/../common/constraints.txt - # pymilvus -pymilvus==2.4.7 - # via -r ./connectors/milvus.in -python-dateutil==2.9.0.post0 - # via pandas -python-dotenv==1.0.1 - # via environs -pytz==2024.2 - # via pandas -setuptools==75.1.0 - # via pymilvus -six==1.16.0 - # via python-dateutil -tqdm==4.66.5 - # via milvus-lite -tzdata==2024.2 - # via pandas -ujson==5.10.0 - # via pymilvus diff --git a/requirements/connectors/mongodb.txt b/requirements/connectors/mongodb.txt deleted file mode 100644 index 0a0053a93..000000000 --- a/requirements/connectors/mongodb.txt +++ /dev/null @@ -1,6 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/mongodb.in --output-file ./connectors/mongodb.txt --no-strip-extras --python-version 3.9 -dnspython==2.6.1 - # via pymongo -pymongo==4.10.1 - # via -r ./connectors/mongodb.in diff --git a/requirements/connectors/notion.txt b/requirements/connectors/notion.txt deleted file mode 100644 index 829812e6c..000000000 --- a/requirements/connectors/notion.txt +++ /dev/null @@ -1,34 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/notion.in --output-file ./connectors/notion.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via -r ./connectors/notion.in -certifi==2024.8.30 - # via - # httpcore - # httpx -exceptiongroup==1.2.2 - # via anyio -h11==0.14.0 - # via httpcore -htmlbuilder==1.0.0 - # via -r ./connectors/notion.in -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via - # -r ./connectors/notion.in - # notion-client -idna==3.10 - # via - # anyio - # httpx -notion-client==2.2.1 - # via -r ./connectors/notion.in -sniffio==1.3.1 - # via - # anyio - # httpx -typing-extensions==4.12.2 - # via anyio diff --git a/requirements/connectors/onedrive.txt 
b/requirements/connectors/onedrive.txt deleted file mode 100644 index fba01233a..000000000 --- a/requirements/connectors/onedrive.txt +++ /dev/null @@ -1,42 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/onedrive.in --output-file ./connectors/onedrive.txt --no-strip-extras --python-version 3.9 -beautifulsoup4==4.12.3 - # via bs4 -bs4==0.0.2 - # via -r ./connectors/onedrive.in -certifi==2024.8.30 - # via requests -cffi==1.17.1 - # via cryptography -charset-normalizer==3.3.2 - # via requests -cryptography==43.0.1 - # via - # msal - # pyjwt -idna==3.10 - # via requests -msal==1.31.0 - # via - # -r ./connectors/onedrive.in - # office365-rest-python-client -office365-rest-python-client==2.5.13 - # via -r ./connectors/onedrive.in -pycparser==2.22 - # via cffi -pyjwt[crypto]==2.9.0 - # via msal -pytz==2024.2 - # via office365-rest-python-client -requests==2.32.3 - # via - # msal - # office365-rest-python-client -soupsieve==2.6 - # via beautifulsoup4 -typing-extensions==4.12.2 - # via office365-rest-python-client -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests diff --git a/requirements/connectors/opensearch.txt b/requirements/connectors/opensearch.txt deleted file mode 100644 index a3ab23f00..000000000 --- a/requirements/connectors/opensearch.txt +++ /dev/null @@ -1,25 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/opensearch.in --output-file ./connectors/opensearch.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via - # opensearch-py - # requests -charset-normalizer==3.3.2 - # via requests -events==0.5 - # via opensearch-py -idna==3.10 - # via requests -opensearch-py==2.7.1 - # via -r ./connectors/opensearch.in -python-dateutil==2.9.0.post0 - # via opensearch-py -requests==2.32.3 - # via opensearch-py -six==1.16.0 - # via python-dateutil -urllib3==1.26.20 - # via - # -c 
./connectors/../common/constraints.txt - # opensearch-py - # requests diff --git a/requirements/connectors/outlook.txt b/requirements/connectors/outlook.txt deleted file mode 100644 index b94083ed6..000000000 --- a/requirements/connectors/outlook.txt +++ /dev/null @@ -1,36 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/outlook.in --output-file ./connectors/outlook.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via requests -cffi==1.17.1 - # via cryptography -charset-normalizer==3.3.2 - # via requests -cryptography==43.0.1 - # via - # msal - # pyjwt -idna==3.10 - # via requests -msal==1.31.0 - # via - # -r ./connectors/outlook.in - # office365-rest-python-client -office365-rest-python-client==2.5.13 - # via -r ./connectors/outlook.in -pycparser==2.22 - # via cffi -pyjwt[crypto]==2.9.0 - # via msal -pytz==2024.2 - # via office365-rest-python-client -requests==2.32.3 - # via - # msal - # office365-rest-python-client -typing-extensions==4.12.2 - # via office365-rest-python-client -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests diff --git a/requirements/connectors/pinecone.txt b/requirements/connectors/pinecone.txt deleted file mode 100644 index bf00d8245..000000000 --- a/requirements/connectors/pinecone.txt +++ /dev/null @@ -1,20 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/pinecone.in --output-file ./connectors/pinecone.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via pinecone-client -pinecone-client==5.0.1 - # via -r ./connectors/pinecone.in -pinecone-plugin-inference==1.1.0 - # via pinecone-client -pinecone-plugin-interface==0.0.7 - # via - # pinecone-client - # pinecone-plugin-inference -tqdm==4.66.5 - # via pinecone-client -typing-extensions==4.12.2 - # via pinecone-client -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # pinecone-client diff --git 
a/requirements/connectors/postgres.txt b/requirements/connectors/postgres.txt deleted file mode 100644 index 683bddbcb..000000000 --- a/requirements/connectors/postgres.txt +++ /dev/null @@ -1,4 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/postgres.in --output-file ./connectors/postgres.txt --no-strip-extras --python-version 3.9 -psycopg2-binary==2.9.9 - # via -r ./connectors/postgres.in diff --git a/requirements/connectors/qdrant.txt b/requirements/connectors/qdrant.txt deleted file mode 100644 index 2b41f1ec8..000000000 --- a/requirements/connectors/qdrant.txt +++ /dev/null @@ -1,66 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/qdrant.in --output-file ./connectors/qdrant.txt --no-strip-extras --python-version 3.9 -annotated-types==0.7.0 - # via pydantic -anyio==4.6.0 - # via httpx -certifi==2024.8.30 - # via - # httpcore - # httpx -exceptiongroup==1.2.2 - # via anyio -grpcio==1.66.2 - # via - # -c ./connectors/../common/constraints.txt - # grpcio-tools - # qdrant-client -grpcio-tools==1.62.3 - # via qdrant-client -h11==0.14.0 - # via httpcore -h2==4.1.0 - # via httpx -hpack==4.0.0 - # via h2 -httpcore==1.0.6 - # via httpx -httpx[http2]==0.27.2 - # via qdrant-client -hyperframe==6.0.1 - # via h2 -idna==3.10 - # via - # anyio - # httpx -numpy==1.26.4 - # via - # -c ./connectors/../common/constraints.txt - # qdrant-client -portalocker==2.10.1 - # via qdrant-client -protobuf==4.23.4 - # via - # -c ./connectors/../common/constraints.txt - # grpcio-tools -pydantic==2.9.2 - # via qdrant-client -pydantic-core==2.23.4 - # via pydantic -qdrant-client==1.11.3 - # via -r ./connectors/qdrant.in -setuptools==75.1.0 - # via grpcio-tools -sniffio==1.3.1 - # via - # anyio - # httpx -typing-extensions==4.12.2 - # via - # anyio - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # qdrant-client diff --git 
a/requirements/connectors/reddit.txt b/requirements/connectors/reddit.txt deleted file mode 100644 index 674cdd911..000000000 --- a/requirements/connectors/reddit.txt +++ /dev/null @@ -1,24 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/reddit.in --output-file ./connectors/reddit.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.10 - # via requests -praw==7.7.1 - # via -r ./connectors/reddit.in -prawcore==2.4.0 - # via praw -requests==2.32.3 - # via - # prawcore - # update-checker -update-checker==0.18.0 - # via praw -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests -websocket-client==1.8.0 - # via praw diff --git a/requirements/connectors/s3.txt b/requirements/connectors/s3.txt deleted file mode 100644 index 4cf9363b7..000000000 --- a/requirements/connectors/s3.txt +++ /dev/null @@ -1,59 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/s3.in --output-file ./connectors/s3.txt --no-strip-extras --python-version 3.9 -aiobotocore==2.13.3 - # via s3fs -aiohappyeyeballs==2.4.3 - # via aiohttp -aiohttp==3.10.8 - # via - # aiobotocore - # s3fs -aioitertools==0.12.0 - # via aiobotocore -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -botocore==1.34.131 - # via - # -c ./connectors/../common/constraints.txt - # aiobotocore -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -fsspec==2024.5.0 - # via - # -c ./connectors/../common/constraints.txt - # -r ./connectors/s3.in - # s3fs -idna==3.10 - # via yarl -jmespath==1.0.1 - # via botocore -multidict==6.1.0 - # via - # aiohttp - # yarl -python-dateutil==2.9.0.post0 - # via botocore -s3fs==2024.5.0 - # via -r ./connectors/s3.in -six==1.16.0 - # via python-dateutil -typing-extensions==4.12.2 - # via - # aioitertools - # multidict -urllib3==1.26.20 - # 
via - # -c ./connectors/../common/constraints.txt - # botocore -wrapt==1.16.0 - # via - # -c ./connectors/../common/constraints.txt - # aiobotocore -yarl==1.13.1 - # via aiohttp diff --git a/requirements/connectors/salesforce.txt b/requirements/connectors/salesforce.txt deleted file mode 100644 index d89cdb02c..000000000 --- a/requirements/connectors/salesforce.txt +++ /dev/null @@ -1,50 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/salesforce.in --output-file ./connectors/salesforce.txt --no-strip-extras --python-version 3.9 -attrs==24.2.0 - # via zeep -certifi==2024.8.30 - # via requests -cffi==1.17.1 - # via cryptography -charset-normalizer==3.3.2 - # via requests -cryptography==43.0.1 - # via pyjwt -idna==3.10 - # via requests -isodate==0.6.1 - # via zeep -lxml==5.3.0 - # via zeep -more-itertools==10.5.0 - # via simple-salesforce -platformdirs==4.3.6 - # via zeep -pycparser==2.22 - # via cffi -pyjwt[crypto]==2.9.0 - # via simple-salesforce -pytz==2024.2 - # via zeep -requests==2.32.3 - # via - # requests-file - # requests-toolbelt - # simple-salesforce - # zeep -requests-file==2.1.0 - # via zeep -requests-toolbelt==1.0.0 - # via zeep -simple-salesforce==1.12.6 - # via -r ./connectors/salesforce.in -six==1.16.0 - # via isodate -typing-extensions==4.12.2 - # via simple-salesforce -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests -zeep==4.2.1 - # via simple-salesforce diff --git a/requirements/connectors/sftp.txt b/requirements/connectors/sftp.txt deleted file mode 100644 index 05580ae74..000000000 --- a/requirements/connectors/sftp.txt +++ /dev/null @@ -1,20 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/sftp.in --output-file ./connectors/sftp.txt --no-strip-extras --python-version 3.9 -bcrypt==4.2.0 - # via paramiko -cffi==1.17.1 - # via - # cryptography - # pynacl -cryptography==43.0.1 - # via paramiko 
-fsspec==2024.5.0 - # via - # -c ./connectors/../common/constraints.txt - # -r ./connectors/sftp.in -paramiko==3.5.0 - # via -r ./connectors/sftp.in -pycparser==2.22 - # via cffi -pynacl==1.5.0 - # via paramiko diff --git a/requirements/connectors/sharepoint.txt b/requirements/connectors/sharepoint.txt deleted file mode 100644 index b97f4e733..000000000 --- a/requirements/connectors/sharepoint.txt +++ /dev/null @@ -1,36 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/sharepoint.in --output-file ./connectors/sharepoint.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via requests -cffi==1.17.1 - # via cryptography -charset-normalizer==3.3.2 - # via requests -cryptography==43.0.1 - # via - # msal - # pyjwt -idna==3.10 - # via requests -msal==1.31.0 - # via - # -r ./connectors/sharepoint.in - # office365-rest-python-client -office365-rest-python-client==2.5.13 - # via -r ./connectors/sharepoint.in -pycparser==2.22 - # via cffi -pyjwt[crypto]==2.9.0 - # via msal -pytz==2024.2 - # via office365-rest-python-client -requests==2.32.3 - # via - # msal - # office365-rest-python-client -typing-extensions==4.12.2 - # via office365-rest-python-client -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests diff --git a/requirements/connectors/singlestore.txt b/requirements/connectors/singlestore.txt deleted file mode 100644 index d60774865..000000000 --- a/requirements/connectors/singlestore.txt +++ /dev/null @@ -1,40 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/singlestore.in --output-file ./connectors/singlestore.txt --no-strip-extras --python-version 3.9 -build==0.10.0 - # via singlestoredb -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.10 - # via requests -packaging==23.2 - # via - # -c ./connectors/../common/constraints.txt - # build -parsimonious==0.10.0 - # via 
singlestoredb -pyjwt==2.9.0 - # via singlestoredb -pyproject-hooks==1.2.0 - # via build -regex==2024.9.11 - # via parsimonious -requests==2.32.3 - # via singlestoredb -setuptools==75.1.0 - # via singlestoredb -singlestoredb==1.7.1 - # via -r ./connectors/singlestore.in -sqlparams==6.1.0 - # via singlestoredb -tomli==2.0.2 - # via - # build - # singlestoredb -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests -wheel==0.44.0 - # via singlestoredb diff --git a/requirements/connectors/slack.txt b/requirements/connectors/slack.txt deleted file mode 100644 index 364412fce..000000000 --- a/requirements/connectors/slack.txt +++ /dev/null @@ -1,4 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/slack.in --output-file ./connectors/slack.txt --no-strip-extras --python-version 3.9 -slack-sdk==3.33.1 - # via -r ./connectors/slack.in diff --git a/requirements/connectors/vectara.txt b/requirements/connectors/vectara.txt deleted file mode 100644 index 917d5e7bc..000000000 --- a/requirements/connectors/vectara.txt +++ /dev/null @@ -1,14 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/vectara.in --output-file ./connectors/vectara.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.10 - # via requests -requests==2.32.3 - # via -r ./connectors/vectara.in -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests diff --git a/requirements/connectors/weaviate.txt b/requirements/connectors/weaviate.txt deleted file mode 100644 index 2fb7c4cef..000000000 --- a/requirements/connectors/weaviate.txt +++ /dev/null @@ -1,74 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/weaviate.in --output-file ./connectors/weaviate.txt --no-strip-extras --python-version 3.9 -annotated-types==0.7.0 - # via 
pydantic -anyio==4.6.0 - # via httpx -authlib==1.3.1 - # via weaviate-client -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests -cffi==1.17.1 - # via cryptography -charset-normalizer==3.3.2 - # via requests -cryptography==43.0.1 - # via authlib -exceptiongroup==1.2.2 - # via anyio -grpcio==1.66.2 - # via - # -c ./connectors/../common/constraints.txt - # grpcio-health-checking - # grpcio-tools - # weaviate-client -grpcio-health-checking==1.62.3 - # via weaviate-client -grpcio-tools==1.62.3 - # via weaviate-client -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.0 - # via weaviate-client -idna==3.10 - # via - # anyio - # httpx - # requests -protobuf==4.23.4 - # via - # -c ./connectors/../common/constraints.txt - # grpcio-health-checking - # grpcio-tools -pycparser==2.22 - # via cffi -pydantic==2.9.2 - # via weaviate-client -pydantic-core==2.23.4 - # via pydantic -requests==2.32.3 - # via weaviate-client -setuptools==75.1.0 - # via grpcio-tools -sniffio==1.3.1 - # via - # anyio - # httpx -typing-extensions==4.12.2 - # via - # anyio - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ./connectors/../common/constraints.txt - # requests -validators==0.34.0 - # via weaviate-client -weaviate-client==4.8.1 - # via -r ./connectors/weaviate.in diff --git a/requirements/connectors/wikipedia.txt b/requirements/connectors/wikipedia.txt deleted file mode 100644 index 48affe627..000000000 --- a/requirements/connectors/wikipedia.txt +++ /dev/null @@ -1,20 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./connectors/wikipedia.in --output-file ./connectors/wikipedia.txt --no-strip-extras --python-version 3.9 -beautifulsoup4==4.12.3 - # via wikipedia -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -idna==3.10 - # via requests -requests==2.32.3 - # via wikipedia -soupsieve==2.6 - # via beautifulsoup4 -urllib3==1.26.20 - # via - # -c 
./connectors/../common/constraints.txt - # requests -wikipedia==1.4.0 - # via -r ./connectors/wikipedia.in diff --git a/requirements/embed/aws-bedrock.txt b/requirements/embed/aws-bedrock.txt deleted file mode 100644 index ed8dac05d..000000000 --- a/requirements/embed/aws-bedrock.txt +++ /dev/null @@ -1,23 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile aws-bedrock.in --output-file aws-bedrock.txt --no-strip-extras -boto3==1.34.131 - # via -r aws-bedrock.in -botocore==1.34.131 - # via - # -c ../common/constraints.txt - # boto3 - # s3transfer -jmespath==1.0.1 - # via - # boto3 - # botocore -python-dateutil==2.9.0.post0 - # via botocore -s3transfer==0.10.2 - # via boto3 -six==1.16.0 - # via python-dateutil -urllib3==1.26.20 - # via - # -c ../common/constraints.txt - # botocore diff --git a/requirements/embed/huggingface.txt b/requirements/embed/huggingface.txt deleted file mode 100644 index 510d91949..000000000 --- a/requirements/embed/huggingface.txt +++ /dev/null @@ -1,91 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile huggingface.in --output-file huggingface.txt --no-strip-extras -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -filelock==3.16.1 - # via - # huggingface-hub - # torch - # transformers -fsspec==2024.5.0 - # via - # -c ../common/constraints.txt - # huggingface-hub - # torch -huggingface-hub==0.25.1 - # via - # sentence-transformers - # tokenizers - # transformers -idna==3.10 - # via requests -jinja2==3.1.4 - # via torch -joblib==1.4.2 - # via scikit-learn -markupsafe==2.1.5 - # via jinja2 -mpmath==1.3.0 - # via sympy -networkx==3.2.1 - # via torch -numpy==1.26.4 - # via - # -c ../common/constraints.txt - # scikit-learn - # scipy - # transformers -packaging==23.2 - # via - # -c ../common/constraints.txt - # huggingface-hub - # transformers -pillow==10.4.0 - # via sentence-transformers -pyyaml==6.0.2 - # via - # huggingface-hub - # 
transformers -regex==2024.9.11 - # via transformers -requests==2.32.3 - # via - # huggingface-hub - # transformers -safetensors==0.4.5 - # via transformers -scikit-learn==1.5.2 - # via sentence-transformers -scipy==1.13.1 - # via - # scikit-learn - # sentence-transformers -sentence-transformers==3.1.1 - # via -r huggingface.in -sympy==1.13.3 - # via torch -threadpoolctl==3.5.0 - # via scikit-learn -tokenizers==0.19.1 - # via - # -c ../common/constraints.txt - # transformers -torch==2.4.1 - # via sentence-transformers -tqdm==4.66.5 - # via - # huggingface-hub - # sentence-transformers - # transformers -transformers==4.44.2 - # via sentence-transformers -typing-extensions==4.12.2 - # via - # huggingface-hub - # torch -urllib3==1.26.20 - # via - # -c ../common/constraints.txt - # requests diff --git a/requirements/embed/mixedbreadai.txt b/requirements/embed/mixedbreadai.txt deleted file mode 100644 index 225b9d69b..000000000 --- a/requirements/embed/mixedbreadai.txt +++ /dev/null @@ -1,38 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile mixedbreadai.in --output-file mixedbreadai.txt --no-strip-extras -annotated-types==0.7.0 - # via pydantic -anyio==4.6.0 - # via httpx -certifi==2024.8.30 - # via - # httpcore - # httpx -exceptiongroup==1.2.2 - # via anyio -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via mixedbread-ai -idna==3.10 - # via - # anyio - # httpx -mixedbread-ai==2.2.6 - # via -r mixedbreadai.in -pydantic==2.9.2 - # via mixedbread-ai -pydantic-core==2.23.4 - # via pydantic -sniffio==1.3.1 - # via - # anyio - # httpx -typing-extensions==4.12.2 - # via - # anyio - # mixedbread-ai - # pydantic - # pydantic-core diff --git a/requirements/embed/octoai.txt b/requirements/embed/octoai.txt deleted file mode 100644 index 5a858859c..000000000 --- a/requirements/embed/octoai.txt +++ /dev/null @@ -1,61 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile 
octoai.in --output-file octoai.txt --no-strip-extras -annotated-types==0.7.0 - # via pydantic -anyio==4.6.0 - # via - # httpx - # openai -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via requests -distro==1.9.0 - # via openai -exceptiongroup==1.2.2 - # via anyio -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via openai -idna==3.10 - # via - # anyio - # httpx - # requests -jiter==0.5.0 - # via openai -openai==1.51.0 - # via -r octoai.in -pydantic==2.9.2 - # via openai -pydantic-core==2.23.4 - # via pydantic -regex==2024.9.11 - # via tiktoken -requests==2.32.3 - # via tiktoken -sniffio==1.3.1 - # via - # anyio - # httpx - # openai -tiktoken==0.7.0 - # via -r octoai.in -tqdm==4.66.5 - # via openai -typing-extensions==4.12.2 - # via - # anyio - # openai - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ../common/constraints.txt - # requests diff --git a/requirements/embed/openai.txt b/requirements/embed/openai.txt deleted file mode 100644 index 822884229..000000000 --- a/requirements/embed/openai.txt +++ /dev/null @@ -1,61 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile openai.in --output-file openai.txt --no-strip-extras -annotated-types==0.7.0 - # via pydantic -anyio==4.6.0 - # via - # httpx - # openai -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via requests -distro==1.9.0 - # via openai -exceptiongroup==1.2.2 - # via anyio -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via openai -idna==3.10 - # via - # anyio - # httpx - # requests -jiter==0.5.0 - # via openai -openai==1.51.0 - # via -r openai.in -pydantic==2.9.2 - # via openai -pydantic-core==2.23.4 - # via pydantic -regex==2024.9.11 - # via tiktoken -requests==2.32.3 - # via tiktoken -sniffio==1.3.1 - # via - # anyio - # httpx - # openai -tiktoken==0.7.0 - # via -r openai.in -tqdm==4.66.5 - # via 
openai -typing-extensions==4.12.2 - # via - # anyio - # openai - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ../common/constraints.txt - # requests diff --git a/requirements/embed/vertexai.txt b/requirements/embed/vertexai.txt deleted file mode 100644 index 80d3ff645..000000000 --- a/requirements/embed/vertexai.txt +++ /dev/null @@ -1,122 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile vertexai.in --output-file vertexai.txt --no-strip-extras -annotated-types==0.7.0 - # via pydantic -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -docstring-parser==0.16 - # via google-cloud-aiplatform -google-api-core[grpc]==2.20.0 - # via - # google-cloud-aiplatform - # google-cloud-bigquery - # google-cloud-core - # google-cloud-resource-manager - # google-cloud-storage -google-auth==2.35.0 - # via - # google-api-core - # google-cloud-aiplatform - # google-cloud-bigquery - # google-cloud-core - # google-cloud-resource-manager - # google-cloud-storage -google-cloud-aiplatform[all]==1.69.0 - # via vertexai -google-cloud-bigquery==3.26.0 - # via google-cloud-aiplatform -google-cloud-core==2.4.1 - # via - # google-cloud-bigquery - # google-cloud-storage -google-cloud-resource-manager==1.12.5 - # via google-cloud-aiplatform -google-cloud-storage==2.18.2 - # via google-cloud-aiplatform -google-crc32c==1.6.0 - # via - # google-cloud-storage - # google-resumable-media -google-resumable-media==2.7.2 - # via - # google-cloud-bigquery - # google-cloud-storage -googleapis-common-protos[grpc]==1.65.0 - # via - # google-api-core - # grpc-google-iam-v1 - # grpcio-status -grpc-google-iam-v1==0.13.1 - # via google-cloud-resource-manager -grpcio==1.66.2 - # via - # -c ../common/constraints.txt - # google-api-core - # googleapis-common-protos - # grpc-google-iam-v1 - # grpcio-status -grpcio-status==1.62.3 - # via google-api-core -idna==3.10 - # via requests 
-numpy==1.26.4 - # via - # -c ../common/constraints.txt - # shapely -packaging==23.2 - # via - # -c ../common/constraints.txt - # google-cloud-aiplatform - # google-cloud-bigquery -proto-plus==1.24.0 - # via - # google-api-core - # google-cloud-aiplatform - # google-cloud-resource-manager -protobuf==4.23.4 - # via - # -c ../common/constraints.txt - # google-api-core - # google-cloud-aiplatform - # google-cloud-resource-manager - # googleapis-common-protos - # grpc-google-iam-v1 - # grpcio-status - # proto-plus -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pydantic==2.9.2 - # via google-cloud-aiplatform -pydantic-core==2.23.4 - # via pydantic -python-dateutil==2.9.0.post0 - # via google-cloud-bigquery -requests==2.32.3 - # via - # google-api-core - # google-cloud-bigquery - # google-cloud-storage -rsa==4.9 - # via google-auth -shapely==2.0.6 - # via google-cloud-aiplatform -six==1.16.0 - # via python-dateutil -typing-extensions==4.12.2 - # via - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ../common/constraints.txt - # requests -vertexai==1.69.0 - # via -r vertexai.in diff --git a/requirements/embed/voyageai.txt b/requirements/embed/voyageai.txt deleted file mode 100644 index ab06aa637..000000000 --- a/requirements/embed/voyageai.txt +++ /dev/null @@ -1,48 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile voyageai.in --output-file voyageai.txt --no-strip-extras -aiohappyeyeballs==2.4.3 - # via aiohttp -aiohttp==3.10.8 - # via voyageai -aiolimiter==1.1.0 - # via voyageai -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -idna==3.10 - # via - # requests - # yarl -multidict==6.1.0 - # via - # aiohttp - # yarl -numpy==1.26.4 - # via - # -c ../common/constraints.txt - # voyageai 
-requests==2.32.3 - # via voyageai -tenacity==9.0.0 - # via voyageai -typing-extensions==4.12.2 - # via multidict -urllib3==1.26.20 - # via - # -c ../common/constraints.txt - # requests -voyageai==0.2.3 - # via -r voyageai.in -yarl==1.13.1 - # via aiohttp diff --git a/requirements/lint.txt b/requirements/lint.txt deleted file mode 100644 index c8aa6afb4..000000000 --- a/requirements/lint.txt +++ /dev/null @@ -1,49 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./lint.in --output-file ./lint.txt --no-strip-extras --python-version 3.9 -autoflake==2.3.1 - # via -r ./lint.in -black==24.8.0 - # via -r ./lint.in -click==8.1.7 - # via black -flake8==7.1.1 - # via - # -r ./lint.in - # flake8-print -flake8-print==5.0.0 - # via -r ./lint.in -mccabe==0.7.0 - # via flake8 -mypy==1.11.2 - # via -r ./lint.in -mypy-extensions==1.0.0 - # via - # black - # mypy -packaging==23.2 - # via - # -c ././common/constraints.txt - # black -pathspec==0.12.1 - # via black -platformdirs==4.3.6 - # via black -pycodestyle==2.12.1 - # via - # flake8 - # flake8-print -pyflakes==3.2.0 - # via - # autoflake - # flake8 -ruff==0.6.8 - # via -r ./lint.in -tomli==2.0.2 - # via - # autoflake - # black - # mypy -typing-extensions==4.12.2 - # via - # black - # mypy diff --git a/requirements/local_partition/doc.txt b/requirements/local_partition/doc.txt deleted file mode 100644 index 362e65463..000000000 --- a/requirements/local_partition/doc.txt +++ /dev/null @@ -1,161 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/doc.in --output-file ./local_partition/doc.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # 
requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via - # python-docx - # unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-docx==1.1.2 - # via unstructured -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # 
via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-docx - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[doc]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/doc.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/docx.txt b/requirements/local_partition/docx.txt deleted file mode 100644 index 8807fc2bc..000000000 --- a/requirements/local_partition/docx.txt +++ /dev/null @@ -1,161 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/docx.in --output-file ./local_partition/docx.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # 
via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via - # python-docx - # unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-docx==1.1.2 - # via unstructured -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-docx - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[docx]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/docx.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client 
-wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/epub.txt b/requirements/local_partition/epub.txt deleted file mode 100644 index 1866c99fb..000000000 --- a/requirements/local_partition/epub.txt +++ /dev/null @@ -1,158 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/epub.in --output-file ./local_partition/epub.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # 
unstructured-client -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypandoc==1.13 - # via unstructured -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[epub]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/epub.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/image.txt b/requirements/local_partition/image.txt deleted file mode 100644 index 197a7cc36..000000000 --- a/requirements/local_partition/image.txt +++ /dev/null @@ -1,395 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/image.in --output-file ./local_partition/image.txt --no-strip-extras --python-version 3.9 -antlr4-python3-runtime==4.9.3 - # via omegaconf -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured 
-beautifulsoup4==4.12.3 - # via unstructured -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via - # pdfplumber - # unstructured -charset-normalizer==3.3.2 - # via - # pdfminer-six - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -coloredlogs==15.0.1 - # via onnxruntime -contourpy==1.3.0 - # via matplotlib -cryptography==43.0.1 - # via - # pdfminer-six - # unstructured-client -cycler==0.12.1 - # via matplotlib -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -deprecated==1.2.14 - # via pikepdf -effdet==0.4.1 - # via unstructured -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filelock==3.16.1 - # via - # huggingface-hub - # torch - # transformers -filetype==1.2.0 - # via unstructured -flatbuffers==24.3.25 - # via onnxruntime -fonttools==4.54.1 - # via matplotlib -fsspec==2024.5.0 - # via - # -c ./local_partition/../common/constraints.txt - # huggingface-hub - # torch -google-api-core[grpc]==2.20.0 - # via google-cloud-vision -google-auth==2.35.0 - # via - # google-api-core - # google-cloud-vision -google-cloud-vision==3.7.4 - # via unstructured -googleapis-common-protos==1.65.0 - # via - # google-api-core - # grpcio-status -grpcio==1.66.2 - # via - # -c ./local_partition/../common/constraints.txt - # google-api-core - # grpcio-status -grpcio-status==1.62.3 - # via google-api-core -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -huggingface-hub==0.25.1 - # via - # timm - # tokenizers - # transformers - # unstructured-inference -humanfriendly==10.0 - # via coloredlogs -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -importlib-resources==6.4.5 - # via matplotlib -iopath==0.1.10 - # via layoutparser -jinja2==3.1.4 - # via 
torch -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -kiwisolver==1.4.7 - # via matplotlib -langdetect==1.0.9 - # via unstructured -layoutparser==0.3.4 - # via unstructured-inference -lxml==5.3.0 - # via - # pikepdf - # unstructured -markupsafe==2.1.5 - # via jinja2 -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -matplotlib==3.9.2 - # via - # pycocotools - # unstructured-inference -mpmath==1.3.0 - # via sympy -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -networkx==3.2.1 - # via torch -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # contourpy - # layoutparser - # matplotlib - # onnx - # onnxruntime - # opencv-python - # pandas - # pycocotools - # scipy - # torchvision - # transformers - # unstructured -olefile==0.47 - # via python-oxmsg -omegaconf==2.3.0 - # via effdet -onnx==1.17.0 - # via - # unstructured - # unstructured-inference -onnxruntime==1.19.2 - # via unstructured-inference -opencv-python==4.10.0.84 - # via - # layoutparser - # unstructured-inference -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # huggingface-hub - # marshmallow - # matplotlib - # onnxruntime - # pikepdf - # transformers - # unstructured-client - # unstructured-pytesseract -pandas==2.2.3 - # via layoutparser -pdf2image==1.17.0 - # via - # layoutparser - # unstructured -pdfminer-six==20240706 - # via - # pdfplumber - # unstructured -pdfplumber==0.5.3 - # via layoutparser -pi-heif==0.18.0 - # via unstructured -pikepdf==9.3.0 - # via unstructured -pillow==10.4.0 - # via - # layoutparser - # matplotlib - # pdf2image - # pdfplumber - # pi-heif - # pikepdf - # torchvision - # unstructured-pytesseract -portalocker==2.10.1 - # via iopath -proto-plus==1.24.0 - # via - # google-api-core - # google-cloud-vision -protobuf==4.23.4 - # 
via - # -c ./local_partition/../common/constraints.txt - # google-api-core - # google-cloud-vision - # googleapis-common-protos - # grpcio-status - # onnx - # onnxruntime - # proto-plus -psutil==6.0.0 - # via unstructured -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pycocotools==2.0.8 - # via effdet -pycparser==2.22 - # via cffi -pycrypto==2.6.1 - # via pdfplumber -pyparsing==3.1.4 - # via matplotlib -pypdf==5.0.1 - # via - # unstructured - # unstructured-client -python-dateutil==2.9.0.post0 - # via - # matplotlib - # pandas - # unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-multipart==0.0.12 - # via unstructured-inference -python-oxmsg==0.0.1 - # via unstructured -pytz==2024.2 - # via pandas -pyyaml==6.0.2 - # via - # huggingface-hub - # layoutparser - # omegaconf - # timm - # transformers -rapidfuzz==3.10.0 - # via - # unstructured - # unstructured-inference -regex==2024.9.11 - # via - # nltk - # transformers -requests==2.32.3 - # via - # google-api-core - # huggingface-hub - # requests-toolbelt - # transformers - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -rsa==4.9 - # via google-auth -safetensors==0.4.5 - # via - # timm - # transformers -scipy==1.13.1 - # via layoutparser -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -sympy==1.13.3 - # via - # onnxruntime - # torch -tabulate==0.9.0 - # via unstructured -timm==1.0.9 - # via - # effdet - # unstructured-inference -tokenizers==0.19.1 - # via - # -c ./local_partition/../common/constraints.txt - # transformers -torch==2.4.1 - # via - # effdet - # timm - # torchvision - # unstructured-inference -torchvision==0.19.1 - # via - # effdet - # timm -tqdm==4.66.5 - # via - # huggingface-hub - # iopath - # nltk - # transformers - # unstructured 
-transformers==4.44.2 - # via unstructured-inference -typing-extensions==4.12.2 - # via - # anyio - # huggingface-hub - # iopath - # pypdf - # python-oxmsg - # torch - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -tzdata==2024.2 - # via pandas -unicodecsv==0.14.1 - # via pdfplumber -unstructured[image]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/image.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -unstructured-inference==0.7.36 - # via unstructured -unstructured-pytesseract==0.3.13 - # via unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wand==0.6.13 - # via pdfplumber -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # deprecated - # unstructured -zipp==3.20.2 - # via importlib-resources diff --git a/requirements/local_partition/md.txt b/requirements/local_partition/md.txt deleted file mode 100644 index 0725988eb..000000000 --- a/requirements/local_partition/md.txt +++ /dev/null @@ -1,158 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/md.in --output-file ./local_partition/md.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured 
-exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via unstructured -markdown==3.3.4 - # via unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[md]==0.15.10 - # via - # -c 
./local_partition/../common/constraints.txt - # -r ./local_partition/md.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/msg.txt b/requirements/local_partition/msg.txt deleted file mode 100644 index 9142dacf3..000000000 --- a/requirements/local_partition/msg.txt +++ /dev/null @@ -1,156 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/msg.in --output-file ./local_partition/msg.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client 
-nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[msg]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/msg.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/odt.txt b/requirements/local_partition/odt.txt deleted file mode 100644 index 42cae6865..000000000 --- a/requirements/local_partition/odt.txt +++ /dev/null @@ -1,163 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv 
pip compile ./local_partition/odt.in --output-file ./local_partition/odt.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via - # python-docx - # unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypandoc==1.13 - # via unstructured -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-docx==1.1.2 - # via unstructured -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured 
-python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-docx - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[odt]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/odt.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/org.txt b/requirements/local_partition/org.txt deleted file mode 100644 index 1c655d6ec..000000000 --- a/requirements/local_partition/org.txt +++ /dev/null @@ -1,158 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/org.in --output-file ./local_partition/org.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client 
-dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypandoc==1.13 - # via unstructured -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # typing-inspect - # unstructured - # 
unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[org]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/org.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/pdf.txt b/requirements/local_partition/pdf.txt deleted file mode 100644 index 80ad35bc6..000000000 --- a/requirements/local_partition/pdf.txt +++ /dev/null @@ -1,395 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/pdf.in --output-file ./local_partition/pdf.txt --no-strip-extras --python-version 3.9 -antlr4-python3-runtime==4.9.3 - # via omegaconf -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via - # pdfplumber - # unstructured -charset-normalizer==3.3.2 - # via - # pdfminer-six - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -coloredlogs==15.0.1 - # via onnxruntime -contourpy==1.3.0 - # via matplotlib -cryptography==43.0.1 - # via - # pdfminer-six - # unstructured-client -cycler==0.12.1 - # via matplotlib -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -deprecated==1.2.14 - # via pikepdf -effdet==0.4.1 - # via unstructured -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filelock==3.16.1 - # via - # huggingface-hub - # torch - # transformers -filetype==1.2.0 - # via unstructured 
-flatbuffers==24.3.25 - # via onnxruntime -fonttools==4.54.1 - # via matplotlib -fsspec==2024.5.0 - # via - # -c ./local_partition/../common/constraints.txt - # huggingface-hub - # torch -google-api-core[grpc]==2.20.0 - # via google-cloud-vision -google-auth==2.35.0 - # via - # google-api-core - # google-cloud-vision -google-cloud-vision==3.7.4 - # via unstructured -googleapis-common-protos==1.65.0 - # via - # google-api-core - # grpcio-status -grpcio==1.66.2 - # via - # -c ./local_partition/../common/constraints.txt - # google-api-core - # grpcio-status -grpcio-status==1.62.3 - # via google-api-core -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -huggingface-hub==0.25.1 - # via - # timm - # tokenizers - # transformers - # unstructured-inference -humanfriendly==10.0 - # via coloredlogs -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -importlib-resources==6.4.5 - # via matplotlib -iopath==0.1.10 - # via layoutparser -jinja2==3.1.4 - # via torch -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -kiwisolver==1.4.7 - # via matplotlib -langdetect==1.0.9 - # via unstructured -layoutparser==0.3.4 - # via unstructured-inference -lxml==5.3.0 - # via - # pikepdf - # unstructured -markupsafe==2.1.5 - # via jinja2 -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -matplotlib==3.9.2 - # via - # pycocotools - # unstructured-inference -mpmath==1.3.0 - # via sympy -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -networkx==3.2.1 - # via torch -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # contourpy - # layoutparser - # matplotlib - # onnx - # onnxruntime - # opencv-python - # pandas - # pycocotools - # scipy - # torchvision - # transformers - # unstructured -olefile==0.47 - # via python-oxmsg 
-omegaconf==2.3.0 - # via effdet -onnx==1.17.0 - # via - # unstructured - # unstructured-inference -onnxruntime==1.19.2 - # via unstructured-inference -opencv-python==4.10.0.84 - # via - # layoutparser - # unstructured-inference -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # huggingface-hub - # marshmallow - # matplotlib - # onnxruntime - # pikepdf - # transformers - # unstructured-client - # unstructured-pytesseract -pandas==2.2.3 - # via layoutparser -pdf2image==1.17.0 - # via - # layoutparser - # unstructured -pdfminer-six==20240706 - # via - # pdfplumber - # unstructured -pdfplumber==0.5.3 - # via layoutparser -pi-heif==0.18.0 - # via unstructured -pikepdf==9.3.0 - # via unstructured -pillow==10.4.0 - # via - # layoutparser - # matplotlib - # pdf2image - # pdfplumber - # pi-heif - # pikepdf - # torchvision - # unstructured-pytesseract -portalocker==2.10.1 - # via iopath -proto-plus==1.24.0 - # via - # google-api-core - # google-cloud-vision -protobuf==4.23.4 - # via - # -c ./local_partition/../common/constraints.txt - # google-api-core - # google-cloud-vision - # googleapis-common-protos - # grpcio-status - # onnx - # onnxruntime - # proto-plus -psutil==6.0.0 - # via unstructured -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pycocotools==2.0.8 - # via effdet -pycparser==2.22 - # via cffi -pycrypto==2.6.1 - # via pdfplumber -pyparsing==3.1.4 - # via matplotlib -pypdf==5.0.1 - # via - # unstructured - # unstructured-client -python-dateutil==2.9.0.post0 - # via - # matplotlib - # pandas - # unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-multipart==0.0.12 - # via unstructured-inference -python-oxmsg==0.0.1 - # via unstructured -pytz==2024.2 - # via pandas -pyyaml==6.0.2 - # via - # huggingface-hub - # layoutparser - # omegaconf - # timm - # transformers -rapidfuzz==3.10.0 - # via 
- # unstructured - # unstructured-inference -regex==2024.9.11 - # via - # nltk - # transformers -requests==2.32.3 - # via - # google-api-core - # huggingface-hub - # requests-toolbelt - # transformers - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -rsa==4.9 - # via google-auth -safetensors==0.4.5 - # via - # timm - # transformers -scipy==1.13.1 - # via layoutparser -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -sympy==1.13.3 - # via - # onnxruntime - # torch -tabulate==0.9.0 - # via unstructured -timm==1.0.9 - # via - # effdet - # unstructured-inference -tokenizers==0.19.1 - # via - # -c ./local_partition/../common/constraints.txt - # transformers -torch==2.4.1 - # via - # effdet - # timm - # torchvision - # unstructured-inference -torchvision==0.19.1 - # via - # effdet - # timm -tqdm==4.66.5 - # via - # huggingface-hub - # iopath - # nltk - # transformers - # unstructured -transformers==4.44.2 - # via unstructured-inference -typing-extensions==4.12.2 - # via - # anyio - # huggingface-hub - # iopath - # pypdf - # python-oxmsg - # torch - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -tzdata==2024.2 - # via pandas -unicodecsv==0.14.1 - # via pdfplumber -unstructured[pdf]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/pdf.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -unstructured-inference==0.7.36 - # via unstructured -unstructured-pytesseract==0.3.13 - # via unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wand==0.6.13 - # via pdfplumber -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # deprecated - # 
unstructured -zipp==3.20.2 - # via importlib-resources diff --git a/requirements/local_partition/ppt.txt b/requirements/local_partition/ppt.txt deleted file mode 100644 index 79eab0caa..000000000 --- a/requirements/local_partition/ppt.txt +++ /dev/null @@ -1,165 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/ppt.in --output-file ./local_partition/ppt.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via - # python-pptx - # unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client 
-pillow==10.4.0 - # via python-pptx -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -python-pptx==1.0.2 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # python-pptx - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[ppt]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/ppt.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -xlsxwriter==3.2.0 - # via python-pptx diff --git a/requirements/local_partition/pptx.txt b/requirements/local_partition/pptx.txt deleted file mode 100644 index c60fdb9b2..000000000 --- a/requirements/local_partition/pptx.txt +++ /dev/null @@ -1,165 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/pptx.in --output-file ./local_partition/pptx.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via 
unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via - # python-pptx - # unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -pillow==10.4.0 - # via python-pptx -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -python-pptx==1.0.2 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # 
unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # python-pptx - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[pptx]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/pptx.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -xlsxwriter==3.2.0 - # via python-pptx diff --git a/requirements/local_partition/rst.txt b/requirements/local_partition/rst.txt deleted file mode 100644 index 1cc6eee06..000000000 --- a/requirements/local_partition/rst.txt +++ /dev/null @@ -1,158 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/rst.in --output-file ./local_partition/rst.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # 
via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypandoc==1.13 - # via unstructured -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[rst]==0.15.10 - # via - # -c 
./local_partition/../common/constraints.txt - # -r ./local_partition/rst.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/rtf.txt b/requirements/local_partition/rtf.txt deleted file mode 100644 index afbbb61e6..000000000 --- a/requirements/local_partition/rtf.txt +++ /dev/null @@ -1,158 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/rtf.in --output-file ./local_partition/rtf.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client 
-nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypandoc==1.13 - # via unstructured -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured[rtf]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/rtf.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/tsv.txt b/requirements/local_partition/tsv.txt deleted file mode 100644 index 363434a2d..000000000 --- a/requirements/local_partition/tsv.txt +++ /dev/null @@ -1,165 +0,0 @@ -# This file was autogenerated by 
uv via the following command: -# uv pip compile ./local_partition/tsv.in --output-file ./local_partition/tsv.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # pandas - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -pandas==2.2.3 - # via unstructured -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via - # pandas - # unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured 
-python-oxmsg==0.0.1 - # via unstructured -pytz==2024.2 - # via pandas -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -tzdata==2024.2 - # via pandas -unstructured[tsv]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/tsv.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured diff --git a/requirements/local_partition/xlsx.txt b/requirements/local_partition/xlsx.txt deleted file mode 100644 index fbd34079c..000000000 --- a/requirements/local_partition/xlsx.txt +++ /dev/null @@ -1,173 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./local_partition/xlsx.in --output-file ./local_partition/xlsx.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg 
-cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -emoji==2.13.2 - # via unstructured -et-xmlfile==1.1.0 - # via openpyxl -exceptiongroup==1.2.2 - # via anyio -filetype==1.2.0 - # via unstructured -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -networkx==3.2.1 - # via unstructured -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./local_partition/../common/constraints.txt - # pandas - # unstructured -olefile==0.47 - # via python-oxmsg -openpyxl==3.1.5 - # via unstructured -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./local_partition/../common/constraints.txt - # marshmallow - # unstructured-client -pandas==2.2.3 - # via unstructured -psutil==6.0.0 - # via unstructured -pycparser==2.22 - # via cffi -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via - # pandas - # unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -pytz==2024.2 - # via pandas -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via 
beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -tzdata==2024.2 - # via pandas -unstructured[xlsx]==0.15.10 - # via - # -c ./local_partition/../common/constraints.txt - # -r ./local_partition/xlsx.in -unstructured-client==0.25.9 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./local_partition/../common/constraints.txt - # requests - # unstructured-client -wrapt==1.16.0 - # via - # -c ./local_partition/../common/constraints.txt - # unstructured -xlrd==2.0.1 - # via unstructured diff --git a/requirements/release.txt b/requirements/release.txt deleted file mode 100644 index 5d3f1c65d..000000000 --- a/requirements/release.txt +++ /dev/null @@ -1,42 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./release.in --output-file ./release.txt --no-strip-extras --python-version 3.9 -certifi==2024.8.30 - # via requests -charset-normalizer==3.3.2 - # via requests -colorama==0.4.6 - # via twine -docutils==0.21.2 - # via readme-renderer -idna==3.10 - # via requests -keyring==22.3.0 - # via twine -nh3==0.2.18 - # via readme-renderer -pkginfo==1.11.1 - # via twine -pygments==2.18.0 - # via readme-renderer -readme-renderer==44.0 - # via twine -requests==2.32.3 - # via - # requests-toolbelt - # twine -requests-toolbelt==1.0.0 - # via twine -rfc3986==2.0.0 - # via twine -setuptools==75.1.0 - # via twine -tqdm==4.66.5 - # via twine -twine==3.3.0 - # via -r ./release.in -urllib3==1.26.20 - # via - # -c ././common/constraints.txt - # requests -wheel==0.44.0 - # via -r ./release.in diff --git a/requirements/remote/client.txt b/requirements/remote/client.txt deleted file mode 100644 index 8163b3e61..000000000 --- 
a/requirements/remote/client.txt +++ /dev/null @@ -1,94 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile ./remote/client.in --output-file ./remote/client.txt --no-strip-extras --python-version 3.9 -anyio==4.6.0 - # via httpx -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -cryptography==43.0.1 - # via unstructured-client -dataclasses-json==0.6.7 - # via unstructured-client -deepdiff==8.0.1 - # via unstructured-client -exceptiongroup==1.2.2 - # via anyio -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -jsonpath-python==1.0.6 - # via unstructured-client -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./remote/../common/constraints.txt - # marshmallow - # unstructured-client -pycparser==2.22 - # via cffi -pypdf==5.0.1 - # via unstructured-client -python-dateutil==2.9.0.post0 - # via unstructured-client -requests==2.32.3 - # via - # requests-toolbelt - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -six==1.16.0 - # via - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -typing-extensions==4.12.2 - # via - # anyio - # pypdf - # typing-inspect - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured-client==0.25.9 - # via - # -c ./remote/../common/constraints.txt - # -r ./remote/client.in -urllib3==1.26.20 - # via - # -c ./remote/../common/constraints.txt - # requests - # unstructured-client diff --git 
a/requirements/test.txt b/requirements/test.txt deleted file mode 100644 index 6b3c6072e..000000000 --- a/requirements/test.txt +++ /dev/null @@ -1,282 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile test.in --output-file test.txt --no-strip-extras -annotated-types==0.7.0 - # via pydantic -anyio==4.6.0 - # via httpx -backoff==2.2.1 - # via unstructured -beautifulsoup4==4.12.3 - # via unstructured -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via - # httpcore - # httpx - # requests - # unstructured-client -cffi==1.17.1 - # via cryptography -chardet==5.2.0 - # via unstructured -charset-normalizer==3.3.2 - # via - # requests - # unstructured-client -click==8.1.7 - # via - # nltk - # python-oxmsg -coverage[toml]==7.6.1 - # via pytest-cov -cryptography==43.0.1 - # via - # -r test.in - # unstructured-client -dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -docstring-parser==0.16 - # via google-cloud-aiplatform -emoji==2.13.2 - # via unstructured -exceptiongroup==1.2.2 - # via - # anyio - # pytest -filetype==1.2.0 - # via unstructured -fsspec==2024.5.0 - # via - # -c ./common/constraints.txt - # -r test.in -google-api-core[grpc]==2.20.0 - # via - # google-cloud-aiplatform - # google-cloud-bigquery - # google-cloud-core - # google-cloud-resource-manager - # google-cloud-storage -google-auth==2.35.0 - # via - # google-api-core - # google-cloud-aiplatform - # google-cloud-bigquery - # google-cloud-core - # google-cloud-resource-manager - # google-cloud-storage -google-cloud-aiplatform[all]==1.69.0 - # via vertexai -google-cloud-bigquery==3.26.0 - # via google-cloud-aiplatform -google-cloud-core==2.4.1 - # via - # google-cloud-bigquery - # google-cloud-storage -google-cloud-resource-manager==1.12.5 - # via google-cloud-aiplatform -google-cloud-storage==2.18.2 - # via google-cloud-aiplatform -google-crc32c==1.6.0 - # via - # google-cloud-storage - # 
google-resumable-media -google-resumable-media==2.7.2 - # via - # google-cloud-bigquery - # google-cloud-storage -googleapis-common-protos[grpc]==1.65.0 - # via - # google-api-core - # grpc-google-iam-v1 - # grpcio-status -grpc-google-iam-v1==0.13.1 - # via google-cloud-resource-manager -grpcio==1.66.2 - # via - # -c ./common/constraints.txt - # google-api-core - # googleapis-common-protos - # grpc-google-iam-v1 - # grpcio-status -grpcio-status==1.62.3 - # via google-api-core -h11==0.14.0 - # via httpcore -httpcore==1.0.6 - # via httpx -httpx==0.27.2 - # via unstructured-client -idna==3.10 - # via - # anyio - # httpx - # requests - # unstructured-client -iniconfig==2.0.0 - # via pytest -joblib==1.4.2 - # via nltk -jsonpath-python==1.0.6 - # via unstructured-client -langdetect==1.0.9 - # via unstructured -lxml==5.3.0 - # via unstructured -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client -nest-asyncio==1.6.0 - # via unstructured-client -nltk==3.9.1 - # via unstructured -numpy==1.26.4 - # via - # -c ./common/constraints.txt - # shapely - # unstructured -olefile==0.47 - # via python-oxmsg -orderly-set==5.2.2 - # via deepdiff -packaging==23.2 - # via - # -c ./common/constraints.txt - # google-cloud-aiplatform - # google-cloud-bigquery - # marshmallow - # pytest - # unstructured-client -pluggy==1.5.0 - # via pytest -proto-plus==1.24.0 - # via - # google-api-core - # google-cloud-aiplatform - # google-cloud-resource-manager -protobuf==4.23.4 - # via - # -c ./common/constraints.txt - # google-api-core - # google-cloud-aiplatform - # google-cloud-resource-manager - # googleapis-common-protos - # grpc-google-iam-v1 - # grpcio-status - # proto-plus -psutil==6.0.0 - # via unstructured -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pycparser==2.22 - # via cffi -pydantic==2.9.2 - # via google-cloud-aiplatform -pydantic-core==2.23.4 - # 
via pydantic -pypdf==5.0.1 - # via unstructured-client -pytest==8.3.3 - # via - # -r test.in - # pytest-cov - # pytest-mock -pytest-cov==5.0.0 - # via -r test.in -pytest-mock==3.14.0 - # via -r test.in -python-dateutil==2.9.0.post0 - # via - # google-cloud-bigquery - # unstructured-client -python-iso639==2024.4.27 - # via unstructured -python-magic==0.4.27 - # via unstructured -python-oxmsg==0.0.1 - # via unstructured -rapidfuzz==3.10.0 - # via unstructured -regex==2024.9.11 - # via nltk -requests==2.32.3 - # via - # google-api-core - # google-cloud-bigquery - # google-cloud-storage - # requests-toolbelt - # unstructured - # unstructured-client -requests-toolbelt==1.0.0 - # via unstructured-client -rsa==4.9 - # via google-auth -shapely==2.0.6 - # via google-cloud-aiplatform -six==1.16.0 - # via - # langdetect - # python-dateutil - # unstructured-client -sniffio==1.3.1 - # via - # anyio - # httpx -soupsieve==2.6 - # via beautifulsoup4 -tabulate==0.9.0 - # via unstructured -tomli==2.0.2 - # via - # coverage - # pytest -tqdm==4.66.5 - # via - # nltk - # unstructured -typing-extensions==4.12.2 - # via - # anyio - # pydantic - # pydantic-core - # pypdf - # python-oxmsg - # typing-inspect - # unstructured - # unstructured-client -typing-inspect==0.9.0 - # via - # dataclasses-json - # unstructured-client -unstructured==0.15.10 - # via - # -c ./common/constraints.txt - # -r test.in -unstructured-client==0.25.9 - # via - # -c ./common/constraints.txt - # unstructured -urllib3==1.26.20 - # via - # -c ./common/constraints.txt - # requests - # unstructured-client -vertexai==1.69.0 - # via -r test.in -wrapt==1.16.0 - # via - # -c ./common/constraints.txt - # unstructured diff --git a/test_e2e/src/notion.sh b/test_e2e/src/notion.sh index fbf2ef067..2c41ba837 100755 --- a/test_e2e/src/notion.sh +++ b/test_e2e/src/notion.sh @@ -44,6 +44,5 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --recursive \ --verbose \ --work-dir "$WORK_DIR" - #--max-retry-time 30 
"$SCRIPT_DIR"/check-diff-expected-output.py --output-folder-name $OUTPUT_FOLDER_NAME diff --git a/unstructured_ingest/v2/processes/connectors/__init__.py b/unstructured_ingest/v2/processes/connectors/__init__.py index a359c48ec..f73be8a53 100644 --- a/unstructured_ingest/v2/processes/connectors/__init__.py +++ b/unstructured_ingest/v2/processes/connectors/__init__.py @@ -98,4 +98,3 @@ add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry) add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry) add_source_entry(source_type=NOTION_CONNECTOR_TYPE, entry=notion_source_entry) - diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index dd798a1f2..f0185c321 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -22,8 +22,12 @@ from unstructured_ingest.v2.processes.connectors.notion.client import Client as NotionClient from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_page_html -from unstructured_ingest.v2.processes.connectors.notion.helpers import get_recursive_content_from_database -from unstructured_ingest.v2.processes.connectors.notion.helpers import get_recursive_content_from_page +from unstructured_ingest.v2.processes.connectors.notion.helpers import ( + get_recursive_content_from_database, +) +from unstructured_ingest.v2.processes.connectors.notion.helpers import ( + get_recursive_content_from_page, +) NOTION_API_VERSION = "2022-06-28" CONNECTOR_TYPE = "notion" @@ -109,9 +113,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: continue processed_databases.add(database_id) databases_to_process.remove(database_id) - file_data = self.get_database_file_data( - database_id=database_id, 
client=client - ) + file_data = self.get_database_file_data(database_id=database_id, client=client) if file_data: yield file_data if self.index_config.recursive: @@ -128,9 +130,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: databases_to_process.update(child_databases) @requires_dependencies(["notion_client"], extras="notion") - def get_page_file_data( - self, page_id: str, client: "NotionClient" - ) -> Optional[FileData]: + def get_page_file_data(self, page_id: str, client: "NotionClient") -> Optional[FileData]: try: page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore date_created = page_metadata.get("created_time") @@ -161,12 +161,10 @@ def get_page_file_data( @requires_dependencies(["notion_client"], extras="notion") def get_database_file_data( - self, database_id: str, client: "NotionClient" + self, database_id: str, client: "NotionClient" ) -> Optional[FileData]: try: - database_metadata = client.databases.retrieve( - database_id=database_id - ) # type: ignore + database_metadata = client.databases.retrieve(database_id=database_id) # type: ignore date_created = database_metadata.get("created_time") date_modified = database_metadata.get("last_edited_time") identifier = database_id @@ -194,11 +192,11 @@ def get_database_file_data( return None def get_child_pages_and_databases( - self, - page_id: str, - client: "NotionClient", - processed_pages: Set[str], - processed_databases: Set[str], + self, + page_id: str, + client: "NotionClient", + processed_pages: Set[str], + processed_databases: Set[str], ) -> Tuple[Set[str], Set[str]]: child_content = get_recursive_content_from_page( client=client, @@ -210,11 +208,11 @@ def get_child_pages_and_databases( return child_pages, child_databases def get_child_pages_and_databases_from_database( - self, - database_id: str, - client: "NotionClient", - processed_pages: Set[str], - processed_databases: Set[str], + self, + database_id: str, + client: "NotionClient", + processed_pages: 
Set[str], + processed_databases: Set[str], ) -> Tuple[Set[str], Set[str]]: child_content = get_recursive_content_from_database( client=client, @@ -265,7 +263,9 @@ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: else: raise ValueError("Invalid record_locator in file_data") - def download_page(self, client: "NotionClient", page_id: str, file_data: FileData) -> DownloadResponse: + def download_page( + self, client: "NotionClient", page_id: str, file_data: FileData + ) -> DownloadResponse: try: text_extraction = extract_page_html( @@ -288,7 +288,9 @@ def download_page(self, client: "NotionClient", page_id: str, file_data: FileDat logger.error(f"Error downloading page {page_id}: {e}") return None - def download_database(self, client: "NotionClient", database_id: str, file_data: FileData) -> DownloadResponse: + def download_database( + self, client: "NotionClient", database_id: str, file_data: FileData + ) -> DownloadResponse: try: text_extraction = extract_database_html( client=client, diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/block.py b/unstructured_ingest/v2/processes/connectors/notion/types/block.py index f8b237ee3..68e479a02 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/block.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/block.py @@ -63,7 +63,7 @@ class Block(FromJSONMixin, GetHTMLMixin): block: BlockBase object: str = "block" request_id: Optional[str] = None - #in_trash: bool + # in_trash: bool def __repr__(self): return f"{self.__class__.__name__}(id={self.id}, type={self.type})" diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/date.py b/unstructured_ingest/v2/processes/connectors/notion/types/date.py index de1408a3b..451c03a75 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/date.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/date.py @@ -4,7 +4,10 @@ from htmlBuilder.tags import Div, HtmlTag -from 
unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin, GetHTMLMixin +from unstructured_ingest.v2.processes.connectors.notion.interfaces import ( + FromJSONMixin, + GetHTMLMixin, +) @dataclass diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/file.py b/unstructured_ingest/v2/processes/connectors/notion/types/file.py index 23c611e7d..c785d0e62 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/file.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/file.py @@ -5,7 +5,10 @@ from htmlBuilder.attributes import Href from htmlBuilder.tags import A, HtmlTag -from unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin, GetHTMLMixin +from unstructured_ingest.v2.processes.connectors.notion.interfaces import ( + FromJSONMixin, + GetHTMLMixin, +) @dataclass diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/user.py b/unstructured_ingest/v2/processes/connectors/notion/types/user.py index 7d477e120..38417a9b3 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/user.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/user.py @@ -5,7 +5,10 @@ from htmlBuilder.attributes import Href from htmlBuilder.tags import A, Div, HtmlTag -from unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin, GetHTMLMixin +from unstructured_ingest.v2.processes.connectors.notion.interfaces import ( + FromJSONMixin, + GetHTMLMixin, +) @dataclass From 2946f52c7a12423c35b92b3f9c10ec5707217688 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 17:18:37 +0100 Subject: [PATCH 04/48] Fix Makefile issue with lint --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 27e1f724a..4c1fd4466 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ pip-compile: .PHONY: install-lint install-lint: - pip install -r requirements/lint.txt + pip install -r 
requirements/lint.in .PHONY: install-client From 3fc17e884ec80fb267de4ba047ef8894fd8edca1 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 17:25:38 +0100 Subject: [PATCH 05/48] more fixes --- Makefile | 2 +- .../v2/processes/connectors/notion/types/block.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4c1fd4466..be9f96d38 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ install-release: .PHONY: install-base install-base: - pip install -r requirements/common/base.txt + pip install -r requirements/common/base.in .PHONY: install-all-connectors install-all-connectors: diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/block.py b/unstructured_ingest/v2/processes/connectors/notion/types/block.py index 68e479a02..4df97756b 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/block.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/block.py @@ -75,7 +75,8 @@ def from_dict(cls, data: dict): created_by = data.pop("created_by") last_edited_by = data.pop("last_edited_by") parent = data.pop("parent") - in_trash = data.pop("in_trash") + if "in_trash" in data: + in_trash = data.pop("in_trash") try: block = cls( created_by=PartialUser.from_dict(created_by), From 5d83d7c67c1c5add267dac3c50485c6b502de3fc Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 17:41:27 +0100 Subject: [PATCH 06/48] restore main files --- Makefile | 4 +- requirements/common/base.txt | 62 +++ requirements/connectors/airtable.txt | 30 ++ requirements/connectors/astradb.txt | 66 +++ .../connectors/azure-cognitive-search.txt | 28 ++ requirements/connectors/azure.txt | 89 ++++ requirements/connectors/biomed.txt | 18 + requirements/connectors/box.txt | 42 ++ requirements/connectors/chroma.txt | 230 ++++++++++ requirements/connectors/clarifai.txt | 68 +++ requirements/connectors/confluence.txt | 39 ++ requirements/connectors/couchbase.txt | 4 + 
.../connectors/databricks-volumes.txt | 28 ++ requirements/connectors/delta-table.txt | 14 + requirements/connectors/discord.txt | 28 ++ requirements/connectors/dropbox.txt | 33 ++ requirements/connectors/elasticsearch.txt | 36 ++ requirements/connectors/gcs.txt | 104 +++++ requirements/connectors/github.txt | 39 ++ requirements/connectors/gitlab.txt | 20 + requirements/connectors/google-drive.txt | 53 +++ requirements/connectors/hubspot.txt | 17 + requirements/connectors/jira.txt | 38 ++ requirements/connectors/kafka.txt | 4 + requirements/connectors/kdbai.txt | 39 ++ requirements/connectors/milvus.txt | 44 ++ requirements/connectors/mongodb.txt | 6 + requirements/connectors/notion.txt | 34 ++ requirements/connectors/onedrive.txt | 42 ++ requirements/connectors/opensearch.txt | 25 ++ requirements/connectors/outlook.txt | 36 ++ requirements/connectors/pinecone.txt | 20 + requirements/connectors/postgres.txt | 4 + requirements/connectors/qdrant.txt | 66 +++ requirements/connectors/reddit.txt | 24 ++ requirements/connectors/s3.txt | 59 +++ requirements/connectors/salesforce.txt | 50 +++ requirements/connectors/sftp.txt | 20 + requirements/connectors/sharepoint.txt | 36 ++ requirements/connectors/singlestore.txt | 40 ++ requirements/connectors/slack.txt | 4 + requirements/connectors/vectara.txt | 14 + requirements/connectors/weaviate.txt | 74 ++++ requirements/connectors/wikipedia.txt | 20 + requirements/embed/aws-bedrock.txt | 23 + requirements/embed/huggingface.txt | 91 ++++ requirements/embed/mixedbreadai.txt | 38 ++ requirements/embed/octoai.txt | 61 +++ requirements/embed/openai.txt | 61 +++ requirements/embed/vertexai.txt | 122 ++++++ requirements/embed/voyageai.txt | 48 +++ requirements/local_partition/doc.txt | 161 +++++++ requirements/local_partition/docx.txt | 161 +++++++ requirements/local_partition/epub.txt | 158 +++++++ requirements/local_partition/image.txt | 395 ++++++++++++++++++ requirements/local_partition/md.txt | 158 +++++++ 
requirements/local_partition/msg.txt | 156 +++++++ requirements/local_partition/odt.txt | 163 ++++++++ requirements/local_partition/org.txt | 158 +++++++ requirements/local_partition/pdf.txt | 395 ++++++++++++++++++ requirements/local_partition/ppt.txt | 165 ++++++++ requirements/local_partition/pptx.txt | 165 ++++++++ requirements/local_partition/rst.txt | 158 +++++++ requirements/local_partition/rtf.txt | 158 +++++++ requirements/local_partition/tsv.txt | 165 ++++++++ requirements/local_partition/xlsx.txt | 173 ++++++++ requirements/remote/client.txt | 94 +++++ 67 files changed, 5176 insertions(+), 2 deletions(-) create mode 100644 requirements/common/base.txt create mode 100644 requirements/connectors/airtable.txt create mode 100644 requirements/connectors/astradb.txt create mode 100644 requirements/connectors/azure-cognitive-search.txt create mode 100644 requirements/connectors/azure.txt create mode 100644 requirements/connectors/biomed.txt create mode 100644 requirements/connectors/box.txt create mode 100644 requirements/connectors/chroma.txt create mode 100644 requirements/connectors/clarifai.txt create mode 100644 requirements/connectors/confluence.txt create mode 100644 requirements/connectors/couchbase.txt create mode 100644 requirements/connectors/databricks-volumes.txt create mode 100644 requirements/connectors/delta-table.txt create mode 100644 requirements/connectors/discord.txt create mode 100644 requirements/connectors/dropbox.txt create mode 100644 requirements/connectors/elasticsearch.txt create mode 100644 requirements/connectors/gcs.txt create mode 100644 requirements/connectors/github.txt create mode 100644 requirements/connectors/gitlab.txt create mode 100644 requirements/connectors/google-drive.txt create mode 100644 requirements/connectors/hubspot.txt create mode 100644 requirements/connectors/jira.txt create mode 100644 requirements/connectors/kafka.txt create mode 100644 requirements/connectors/kdbai.txt create mode 100644 
requirements/connectors/milvus.txt create mode 100644 requirements/connectors/mongodb.txt create mode 100644 requirements/connectors/notion.txt create mode 100644 requirements/connectors/onedrive.txt create mode 100644 requirements/connectors/opensearch.txt create mode 100644 requirements/connectors/outlook.txt create mode 100644 requirements/connectors/pinecone.txt create mode 100644 requirements/connectors/postgres.txt create mode 100644 requirements/connectors/qdrant.txt create mode 100644 requirements/connectors/reddit.txt create mode 100644 requirements/connectors/s3.txt create mode 100644 requirements/connectors/salesforce.txt create mode 100644 requirements/connectors/sftp.txt create mode 100644 requirements/connectors/sharepoint.txt create mode 100644 requirements/connectors/singlestore.txt create mode 100644 requirements/connectors/slack.txt create mode 100644 requirements/connectors/vectara.txt create mode 100644 requirements/connectors/weaviate.txt create mode 100644 requirements/connectors/wikipedia.txt create mode 100644 requirements/embed/aws-bedrock.txt create mode 100644 requirements/embed/huggingface.txt create mode 100644 requirements/embed/mixedbreadai.txt create mode 100644 requirements/embed/octoai.txt create mode 100644 requirements/embed/openai.txt create mode 100644 requirements/embed/vertexai.txt create mode 100644 requirements/embed/voyageai.txt create mode 100644 requirements/local_partition/doc.txt create mode 100644 requirements/local_partition/docx.txt create mode 100644 requirements/local_partition/epub.txt create mode 100644 requirements/local_partition/image.txt create mode 100644 requirements/local_partition/md.txt create mode 100644 requirements/local_partition/msg.txt create mode 100644 requirements/local_partition/odt.txt create mode 100644 requirements/local_partition/org.txt create mode 100644 requirements/local_partition/pdf.txt create mode 100644 requirements/local_partition/ppt.txt create mode 100644 
requirements/local_partition/pptx.txt create mode 100644 requirements/local_partition/rst.txt create mode 100644 requirements/local_partition/rtf.txt create mode 100644 requirements/local_partition/tsv.txt create mode 100644 requirements/local_partition/xlsx.txt create mode 100644 requirements/remote/client.txt diff --git a/Makefile b/Makefile index be9f96d38..27e1f724a 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ pip-compile: .PHONY: install-lint install-lint: - pip install -r requirements/lint.in + pip install -r requirements/lint.txt .PHONY: install-client @@ -29,7 +29,7 @@ install-release: .PHONY: install-base install-base: - pip install -r requirements/common/base.in + pip install -r requirements/common/base.txt .PHONY: install-all-connectors install-all-connectors: diff --git a/requirements/common/base.txt b/requirements/common/base.txt new file mode 100644 index 000000000..cdd9f97a2 --- /dev/null +++ b/requirements/common/base.txt @@ -0,0 +1,62 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./common/base.in --output-file ./common/base.txt --no-strip-extras --python-version 3.9 +annotated-types==0.7.0 + # via pydantic +click==8.1.7 + # via -r ./common/base.in +dataclasses-json==0.6.7 + # via -r ./common/base.in +deprecated==1.2.14 + # via opentelemetry-api +marshmallow==3.22.0 + # via dataclasses-json +mypy-extensions==1.0.0 + # via typing-inspect +numpy==1.26.4 + # via + # -c ./common/constraints.txt + # pandas +opentelemetry-api==1.16.0 + # via opentelemetry-sdk +opentelemetry-sdk==1.16.0 + # via -r ./common/base.in +opentelemetry-semantic-conventions==0.37b0 + # via opentelemetry-sdk +packaging==23.2 + # via + # -c ./common/constraints.txt + # marshmallow +pandas==2.2.3 + # via -r ./common/base.in +pydantic==2.9.2 + # via -r ./common/base.in +pydantic-core==2.23.4 + # via pydantic +python-dateutil==2.9.0.post0 + # via + # -r ./common/base.in + # pandas +pytz==2024.2 + # via pandas +setuptools==75.1.0 + # via 
+ # opentelemetry-api + # opentelemetry-sdk +six==1.16.0 + # via python-dateutil +tqdm==4.66.5 + # via -r ./common/base.in +typing-extensions==4.12.2 + # via + # opentelemetry-sdk + # pydantic + # pydantic-core + # typing-inspect +typing-inspect==0.9.0 + # via dataclasses-json +tzdata==2024.2 + # via pandas +wrapt==1.16.0 + # via + # -c ./common/constraints.txt + # deprecated diff --git a/requirements/connectors/airtable.txt b/requirements/connectors/airtable.txt new file mode 100644 index 000000000..421e1ee85 --- /dev/null +++ b/requirements/connectors/airtable.txt @@ -0,0 +1,30 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/airtable.in --output-file ./connectors/airtable.txt --no-strip-extras --python-version 3.9 +annotated-types==0.7.0 + # via pydantic +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +idna==3.10 + # via requests +inflection==0.5.1 + # via pyairtable +pyairtable==2.3.3 + # via -r ./connectors/airtable.in +pydantic==2.9.2 + # via pyairtable +pydantic-core==2.23.4 + # via pydantic +requests==2.32.3 + # via pyairtable +typing-extensions==4.12.2 + # via + # pyairtable + # pydantic + # pydantic-core +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # pyairtable + # requests diff --git a/requirements/connectors/astradb.txt b/requirements/connectors/astradb.txt new file mode 100644 index 000000000..e1b92486d --- /dev/null +++ b/requirements/connectors/astradb.txt @@ -0,0 +1,66 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/astradb.in --output-file ./connectors/astradb.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +astrapy==1.5.0 + # via -r ./connectors/astradb.in +cassandra-driver==3.29.2 + # via cassio +cassio==0.1.9 + # via astrapy +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via geomet 
+deprecation==2.1.0 + # via astrapy +dnspython==2.6.1 + # via pymongo +exceptiongroup==1.2.2 + # via anyio +geomet==0.2.1.post1 + # via cassandra-driver +h11==0.14.0 + # via httpcore +h2==4.1.0 + # via httpx +hpack==4.0.0 + # via h2 +httpcore==1.0.6 + # via httpx +httpx[http2]==0.27.2 + # via astrapy +hyperframe==6.0.1 + # via h2 +idna==3.10 + # via + # anyio + # httpx + # requests +numpy==2.0.2 + # via cassio +packaging==24.1 + # via deprecation +pymongo==4.10.1 + # via astrapy +requests==2.32.3 + # via cassio +six==1.16.0 + # via geomet +sniffio==1.3.1 + # via + # anyio + # httpx +toml==0.10.2 + # via astrapy +typing-extensions==4.12.2 + # via anyio +urllib3==2.2.3 + # via requests +uuid6==2024.7.10 + # via astrapy diff --git a/requirements/connectors/azure-cognitive-search.txt b/requirements/connectors/azure-cognitive-search.txt new file mode 100644 index 000000000..fd7907f55 --- /dev/null +++ b/requirements/connectors/azure-cognitive-search.txt @@ -0,0 +1,28 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/azure-cognitive-search.in --output-file ./connectors/azure-cognitive-search.txt --no-strip-extras --python-version 3.9 +azure-common==1.1.28 + # via azure-search-documents +azure-core==1.31.0 + # via azure-search-documents +azure-search-documents==11.5.1 + # via -r ./connectors/azure-cognitive-search.in +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +idna==3.10 + # via requests +isodate==0.6.1 + # via azure-search-documents +requests==2.32.3 + # via azure-core +six==1.16.0 + # via + # azure-core + # isodate +typing-extensions==4.12.2 + # via + # azure-core + # azure-search-documents +urllib3==2.2.3 + # via requests diff --git a/requirements/connectors/azure.txt b/requirements/connectors/azure.txt new file mode 100644 index 000000000..bc1b4c7af --- /dev/null +++ b/requirements/connectors/azure.txt @@ -0,0 +1,89 @@ +# This file was autogenerated by uv via the following command: 
+# uv pip compile ./connectors/azure.in --output-file ./connectors/azure.txt --no-strip-extras --python-version 3.9 +adlfs==2024.7.0 + # via -r ./connectors/azure.in +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.8 + # via adlfs +aiosignal==1.3.1 + # via aiohttp +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +azure-core==1.31.0 + # via + # adlfs + # azure-identity + # azure-storage-blob +azure-datalake-store==0.0.53 + # via adlfs +azure-identity==1.18.0 + # via adlfs +azure-storage-blob==12.23.1 + # via adlfs +certifi==2024.8.30 + # via requests +cffi==1.17.1 + # via + # azure-datalake-store + # cryptography +charset-normalizer==3.3.2 + # via requests +cryptography==43.0.1 + # via + # azure-identity + # azure-storage-blob + # msal + # pyjwt +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec==2024.9.0 + # via + # -r ./connectors/azure.in + # adlfs +idna==3.10 + # via + # requests + # yarl +isodate==0.6.1 + # via azure-storage-blob +msal==1.31.0 + # via + # azure-datalake-store + # azure-identity + # msal-extensions +msal-extensions==1.2.0 + # via azure-identity +multidict==6.1.0 + # via + # aiohttp + # yarl +portalocker==2.10.1 + # via msal-extensions +pycparser==2.22 + # via cffi +pyjwt[crypto]==2.9.0 + # via msal +requests==2.32.3 + # via + # azure-core + # azure-datalake-store + # msal +six==1.16.0 + # via + # azure-core + # isodate +typing-extensions==4.12.2 + # via + # azure-core + # azure-identity + # azure-storage-blob + # multidict +urllib3==2.2.3 + # via requests +yarl==1.13.1 + # via aiohttp diff --git a/requirements/connectors/biomed.txt b/requirements/connectors/biomed.txt new file mode 100644 index 000000000..5a86a9eda --- /dev/null +++ b/requirements/connectors/biomed.txt @@ -0,0 +1,18 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/biomed.in --output-file ./connectors/biomed.txt --no-strip-extras --python-version 3.9 +beautifulsoup4==4.12.3 + # via bs4 
+bs4==0.0.2 + # via -r ./connectors/biomed.in +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +idna==3.10 + # via requests +requests==2.32.3 + # via -r ./connectors/biomed.in +soupsieve==2.6 + # via beautifulsoup4 +urllib3==2.2.3 + # via requests diff --git a/requirements/connectors/box.txt b/requirements/connectors/box.txt new file mode 100644 index 000000000..200c94bc2 --- /dev/null +++ b/requirements/connectors/box.txt @@ -0,0 +1,42 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/box.in --output-file ./connectors/box.txt --no-strip-extras --python-version 3.9 +attrs==24.2.0 + # via boxsdk +boxfs==0.3.0 + # via -r ./connectors/box.in +boxsdk[jwt]==3.13.0 + # via boxfs +certifi==2024.8.30 + # via requests +cffi==1.17.1 + # via cryptography +charset-normalizer==3.3.2 + # via requests +cryptography==43.0.1 + # via boxsdk +fsspec==2024.5.0 + # via + # -c ./connectors/../common/constraints.txt + # -r ./connectors/box.in + # boxfs +idna==3.10 + # via requests +pycparser==2.22 + # via cffi +pyjwt==2.9.0 + # via boxsdk +python-dateutil==2.9.0.post0 + # via boxsdk +requests==2.32.3 + # via + # boxsdk + # requests-toolbelt +requests-toolbelt==1.0.0 + # via boxsdk +six==1.16.0 + # via python-dateutil +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # boxsdk + # requests diff --git a/requirements/connectors/chroma.txt b/requirements/connectors/chroma.txt new file mode 100644 index 000000000..38effc76e --- /dev/null +++ b/requirements/connectors/chroma.txt @@ -0,0 +1,230 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/chroma.in --output-file ./connectors/chroma.txt --no-strip-extras --python-version 3.9 +annotated-types==0.7.0 + # via pydantic +anyio==4.6.0 + # via + # starlette + # watchfiles +backoff==2.2.1 + # via + # opentelemetry-exporter-otlp-proto-grpc + # posthog +bcrypt==4.2.0 + # via chromadb +cachetools==5.5.0 
+ # via google-auth +certifi==2024.8.30 + # via + # kubernetes + # pulsar-client + # requests +charset-normalizer==3.3.2 + # via requests +chroma-hnswlib==0.7.3 + # via chromadb +chromadb==0.4.17 + # via -r ./connectors/chroma.in +click==8.1.7 + # via + # typer + # uvicorn +coloredlogs==15.0.1 + # via onnxruntime +deprecated==1.2.14 + # via opentelemetry-api +durationpy==0.8 + # via kubernetes +exceptiongroup==1.2.2 + # via anyio +fastapi==0.115.0 + # via chromadb +filelock==3.16.1 + # via huggingface-hub +flatbuffers==24.3.25 + # via onnxruntime +fsspec==2024.5.0 + # via + # -c ./connectors/../common/constraints.txt + # huggingface-hub +google-auth==2.35.0 + # via kubernetes +googleapis-common-protos==1.65.0 + # via opentelemetry-exporter-otlp-proto-grpc +grpcio==1.66.2 + # via + # -c ./connectors/../common/constraints.txt + # chromadb + # opentelemetry-exporter-otlp-proto-grpc +h11==0.14.0 + # via uvicorn +httptools==0.6.1 + # via uvicorn +huggingface-hub==0.25.1 + # via tokenizers +humanfriendly==10.0 + # via coloredlogs +idna==3.10 + # via + # anyio + # requests +importlib-resources==6.4.5 + # via chromadb +kubernetes==31.0.0 + # via chromadb +markdown-it-py==3.0.0 + # via rich +mdurl==0.1.2 + # via markdown-it-py +monotonic==1.6 + # via posthog +mpmath==1.3.0 + # via sympy +numpy==1.26.4 + # via + # -c ./connectors/../common/constraints.txt + # chroma-hnswlib + # chromadb + # onnxruntime +oauthlib==3.2.2 + # via + # kubernetes + # requests-oauthlib +onnxruntime==1.19.2 + # via chromadb +opentelemetry-api==1.16.0 + # via + # chromadb + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-sdk +opentelemetry-exporter-otlp-proto-grpc==1.16.0 + # via chromadb +opentelemetry-proto==1.16.0 + # via opentelemetry-exporter-otlp-proto-grpc +opentelemetry-sdk==1.16.0 + # via + # chromadb + # opentelemetry-exporter-otlp-proto-grpc +opentelemetry-semantic-conventions==0.37b0 + # via opentelemetry-sdk +overrides==7.7.0 + # via chromadb +packaging==23.2 + # via + # -c 
./connectors/../common/constraints.txt + # huggingface-hub + # onnxruntime +posthog==3.6.6 + # via chromadb +protobuf==4.23.4 + # via + # -c ./connectors/../common/constraints.txt + # googleapis-common-protos + # onnxruntime + # opentelemetry-proto +pulsar-client==3.5.0 + # via chromadb +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.1 + # via google-auth +pydantic==2.9.2 + # via + # chromadb + # fastapi +pydantic-core==2.23.4 + # via pydantic +pygments==2.18.0 + # via rich +pypika==0.48.9 + # via chromadb +python-dateutil==2.9.0.post0 + # via + # kubernetes + # posthog +python-dotenv==1.0.1 + # via uvicorn +pyyaml==6.0.2 + # via + # chromadb + # huggingface-hub + # kubernetes + # uvicorn +requests==2.32.3 + # via + # chromadb + # huggingface-hub + # kubernetes + # posthog + # requests-oauthlib +requests-oauthlib==2.0.0 + # via kubernetes +rich==13.9.1 + # via typer +rsa==4.9 + # via google-auth +setuptools==75.1.0 + # via + # opentelemetry-api + # opentelemetry-sdk +shellingham==1.5.4 + # via typer +six==1.16.0 + # via + # kubernetes + # posthog + # python-dateutil +sniffio==1.3.1 + # via anyio +starlette==0.38.6 + # via fastapi +sympy==1.13.3 + # via onnxruntime +tenacity==9.0.0 + # via chromadb +tokenizers==0.19.1 + # via + # -c ./connectors/../common/constraints.txt + # chromadb +tqdm==4.66.5 + # via + # chromadb + # huggingface-hub +typer==0.12.5 + # via chromadb +typing-extensions==4.12.2 + # via + # anyio + # chromadb + # fastapi + # huggingface-hub + # opentelemetry-sdk + # pydantic + # pydantic-core + # rich + # starlette + # typer + # uvicorn +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # kubernetes + # requests +uvicorn[standard]==0.31.0 + # via chromadb +uvloop==0.20.0 + # via uvicorn +watchfiles==0.24.0 + # via uvicorn +websocket-client==1.8.0 + # via kubernetes +websockets==13.1 + # via uvicorn +wrapt==1.16.0 + # via + # -c ./connectors/../common/constraints.txt + # deprecated +zipp==3.20.2 + # via 
importlib-resources diff --git a/requirements/connectors/clarifai.txt b/requirements/connectors/clarifai.txt new file mode 100644 index 000000000..f11740c32 --- /dev/null +++ b/requirements/connectors/clarifai.txt @@ -0,0 +1,68 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/clarifai.in --output-file ./connectors/clarifai.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +clarifai==10.7.0 + # via -r ./connectors/clarifai.in +clarifai-grpc==10.7.1 + # via clarifai +contextlib2==21.6.0 + # via schema +googleapis-common-protos==1.65.0 + # via clarifai-grpc +grpcio==1.66.2 + # via + # -c ./connectors/../common/constraints.txt + # clarifai-grpc +idna==3.10 + # via requests +inquirerpy==0.3.4 + # via clarifai +markdown-it-py==3.0.0 + # via rich +mdurl==0.1.2 + # via markdown-it-py +numpy==1.26.4 + # via + # -c ./connectors/../common/constraints.txt + # clarifai + # tritonclient +pfzy==0.3.4 + # via inquirerpy +pillow==10.4.0 + # via clarifai +prompt-toolkit==3.0.48 + # via inquirerpy +protobuf==4.23.4 + # via + # -c ./connectors/../common/constraints.txt + # clarifai-grpc + # googleapis-common-protos +pygments==2.18.0 + # via rich +python-rapidjson==1.20 + # via tritonclient +pyyaml==6.0.2 + # via clarifai +requests==2.32.3 + # via clarifai-grpc +rich==13.9.1 + # via clarifai +schema==0.7.5 + # via clarifai +tabulate==0.9.0 + # via clarifai +tqdm==4.66.5 + # via clarifai +tritonclient==2.41.1 + # via clarifai +typing-extensions==4.12.2 + # via rich +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests +wcwidth==0.2.13 + # via prompt-toolkit diff --git a/requirements/connectors/confluence.txt b/requirements/connectors/confluence.txt new file mode 100644 index 000000000..3a97ca212 --- /dev/null +++ b/requirements/connectors/confluence.txt @@ -0,0 +1,39 @@ +# This file was autogenerated by uv via the following command: +# 
uv pip compile ./connectors/confluence.in --output-file ./connectors/confluence.txt --no-strip-extras --python-version 3.9 +atlassian-python-api==3.41.16 + # via -r ./connectors/confluence.in +beautifulsoup4==4.12.3 + # via atlassian-python-api +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +deprecated==1.2.14 + # via atlassian-python-api +idna==3.10 + # via requests +jmespath==1.0.1 + # via atlassian-python-api +oauthlib==3.2.2 + # via + # atlassian-python-api + # requests-oauthlib +requests==2.32.3 + # via + # -r ./connectors/confluence.in + # atlassian-python-api + # requests-oauthlib +requests-oauthlib==2.0.0 + # via atlassian-python-api +six==1.16.0 + # via atlassian-python-api +soupsieve==2.6 + # via beautifulsoup4 +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests +wrapt==1.16.0 + # via + # -c ./connectors/../common/constraints.txt + # deprecated diff --git a/requirements/connectors/couchbase.txt b/requirements/connectors/couchbase.txt new file mode 100644 index 000000000..4ceb6998b --- /dev/null +++ b/requirements/connectors/couchbase.txt @@ -0,0 +1,4 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/couchbase.in --output-file ./connectors/couchbase.txt --no-strip-extras --python-version 3.9 +couchbase==4.3.2 + # via -r ./connectors/couchbase.in diff --git a/requirements/connectors/databricks-volumes.txt b/requirements/connectors/databricks-volumes.txt new file mode 100644 index 000000000..c01a1b236 --- /dev/null +++ b/requirements/connectors/databricks-volumes.txt @@ -0,0 +1,28 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/databricks-volumes.in --output-file ./connectors/databricks-volumes.txt --no-strip-extras --python-version 3.9 +cachetools==5.5.0 + # via google-auth +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +databricks-sdk==0.33.0 + # via -r 
./connectors/databricks-volumes.in +google-auth==2.35.0 + # via databricks-sdk +idna==3.10 + # via requests +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.1 + # via google-auth +requests==2.32.3 + # via databricks-sdk +rsa==4.9 + # via google-auth +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests diff --git a/requirements/connectors/delta-table.txt b/requirements/connectors/delta-table.txt new file mode 100644 index 000000000..05d144877 --- /dev/null +++ b/requirements/connectors/delta-table.txt @@ -0,0 +1,14 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/delta-table.in --output-file ./connectors/delta-table.txt --no-strip-extras --python-version 3.9 +deltalake==0.20.1 + # via -r ./connectors/delta-table.in +fsspec==2024.5.0 + # via + # -c ./connectors/../common/constraints.txt + # -r ./connectors/delta-table.in +numpy==1.26.4 + # via + # -c ./connectors/../common/constraints.txt + # pyarrow +pyarrow==17.0.0 + # via deltalake diff --git a/requirements/connectors/discord.txt b/requirements/connectors/discord.txt new file mode 100644 index 000000000..358812feb --- /dev/null +++ b/requirements/connectors/discord.txt @@ -0,0 +1,28 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/discord.in --output-file ./connectors/discord.txt --no-strip-extras --python-version 3.9 +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.8 + # via discord-py +aiosignal==1.3.1 + # via aiohttp +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +discord-py==2.4.0 + # via -r ./connectors/discord.in +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +idna==3.10 + # via yarl +multidict==6.1.0 + # via + # aiohttp + # yarl +typing-extensions==4.12.2 + # via multidict +yarl==1.13.1 + # via aiohttp diff --git a/requirements/connectors/dropbox.txt b/requirements/connectors/dropbox.txt new file mode 100644 index 
000000000..41f70edd6 --- /dev/null +++ b/requirements/connectors/dropbox.txt @@ -0,0 +1,33 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/dropbox.in --output-file ./connectors/dropbox.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +dropbox==12.0.2 + # via dropboxdrivefs +dropboxdrivefs==1.4.1 + # via -r ./connectors/dropbox.in +fsspec==2024.5.0 + # via + # -c ./connectors/../common/constraints.txt + # -r ./connectors/dropbox.in + # dropboxdrivefs +idna==3.10 + # via requests +ply==3.11 + # via stone +requests==2.32.3 + # via + # dropbox + # dropboxdrivefs +six==1.16.0 + # via + # dropbox + # stone +stone==3.3.1 + # via dropbox +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests diff --git a/requirements/connectors/elasticsearch.txt b/requirements/connectors/elasticsearch.txt new file mode 100644 index 000000000..223f74b5c --- /dev/null +++ b/requirements/connectors/elasticsearch.txt @@ -0,0 +1,36 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/elasticsearch.in --output-file ./connectors/elasticsearch.txt --no-strip-extras --python-version 3.9 +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.8 + # via elasticsearch +aiosignal==1.3.1 + # via aiohttp +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +certifi==2024.8.30 + # via elastic-transport +elastic-transport==8.15.0 + # via elasticsearch +elasticsearch[async]==8.15.1 + # via -r ./connectors/elasticsearch.in +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +idna==3.10 + # via yarl +multidict==6.1.0 + # via + # aiohttp + # yarl +typing-extensions==4.12.2 + # via multidict +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # elastic-transport +yarl==1.13.1 + # via aiohttp diff --git a/requirements/connectors/gcs.txt b/requirements/connectors/gcs.txt new 
file mode 100644 index 000000000..29db8779d --- /dev/null +++ b/requirements/connectors/gcs.txt @@ -0,0 +1,104 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/gcs.in --output-file ./connectors/gcs.txt --no-strip-extras --python-version 3.9 +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.8 + # via gcsfs +aiosignal==1.3.1 + # via aiohttp +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +beautifulsoup4==4.12.3 + # via bs4 +bs4==0.0.2 + # via -r ./connectors/gcs.in +cachetools==5.5.0 + # via google-auth +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +decorator==5.1.1 + # via gcsfs +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec==2024.5.0 + # via + # -c ./connectors/../common/constraints.txt + # -r ./connectors/gcs.in + # gcsfs +gcsfs==2024.5.0 + # via -r ./connectors/gcs.in +google-api-core==2.20.0 + # via + # google-cloud-core + # google-cloud-storage +google-auth==2.35.0 + # via + # gcsfs + # google-api-core + # google-auth-oauthlib + # google-cloud-core + # google-cloud-storage +google-auth-oauthlib==1.2.1 + # via gcsfs +google-cloud-core==2.4.1 + # via google-cloud-storage +google-cloud-storage==2.18.2 + # via gcsfs +google-crc32c==1.6.0 + # via + # google-cloud-storage + # google-resumable-media +google-resumable-media==2.7.2 + # via google-cloud-storage +googleapis-common-protos==1.65.0 + # via google-api-core +idna==3.10 + # via + # requests + # yarl +multidict==6.1.0 + # via + # aiohttp + # yarl +oauthlib==3.2.2 + # via requests-oauthlib +proto-plus==1.24.0 + # via google-api-core +protobuf==4.23.4 + # via + # -c ./connectors/../common/constraints.txt + # google-api-core + # googleapis-common-protos + # proto-plus +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.1 + # via google-auth +requests==2.32.3 + # via + # gcsfs + # google-api-core + # google-cloud-storage + # requests-oauthlib +requests-oauthlib==2.0.0 + # 
via google-auth-oauthlib +rsa==4.9 + # via google-auth +soupsieve==2.6 + # via beautifulsoup4 +typing-extensions==4.12.2 + # via multidict +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests +yarl==1.13.1 + # via aiohttp diff --git a/requirements/connectors/github.txt b/requirements/connectors/github.txt new file mode 100644 index 000000000..d9795f8dd --- /dev/null +++ b/requirements/connectors/github.txt @@ -0,0 +1,39 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/github.in --output-file ./connectors/github.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via requests +cffi==1.17.1 + # via + # cryptography + # pynacl +charset-normalizer==3.3.2 + # via requests +cryptography==43.0.1 + # via pyjwt +deprecated==1.2.14 + # via pygithub +idna==3.10 + # via requests +pycparser==2.22 + # via cffi +pygithub==2.4.0 + # via -r ./connectors/github.in +pyjwt[crypto]==2.9.0 + # via pygithub +pynacl==1.5.0 + # via pygithub +requests==2.32.3 + # via + # -r ./connectors/github.in + # pygithub +typing-extensions==4.12.2 + # via pygithub +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # pygithub + # requests +wrapt==1.16.0 + # via + # -c ./connectors/../common/constraints.txt + # deprecated diff --git a/requirements/connectors/gitlab.txt b/requirements/connectors/gitlab.txt new file mode 100644 index 000000000..6d12e636c --- /dev/null +++ b/requirements/connectors/gitlab.txt @@ -0,0 +1,20 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/gitlab.in --output-file ./connectors/gitlab.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +idna==3.10 + # via requests +python-gitlab==4.12.2 + # via -r ./connectors/gitlab.in +requests==2.32.3 + # via + # python-gitlab + # requests-toolbelt +requests-toolbelt==1.0.0 + # via python-gitlab 
+urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests diff --git a/requirements/connectors/google-drive.txt b/requirements/connectors/google-drive.txt new file mode 100644 index 000000000..d78d467fc --- /dev/null +++ b/requirements/connectors/google-drive.txt @@ -0,0 +1,53 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/google-drive.in --output-file ./connectors/google-drive.txt --no-strip-extras --python-version 3.9 +cachetools==5.5.0 + # via google-auth +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +google-api-core==2.20.0 + # via google-api-python-client +google-api-python-client==2.147.0 + # via -r ./connectors/google-drive.in +google-auth==2.35.0 + # via + # google-api-core + # google-api-python-client + # google-auth-httplib2 +google-auth-httplib2==0.2.0 + # via google-api-python-client +googleapis-common-protos==1.65.0 + # via google-api-core +httplib2==0.22.0 + # via + # google-api-python-client + # google-auth-httplib2 +idna==3.10 + # via requests +proto-plus==1.24.0 + # via google-api-core +protobuf==4.23.4 + # via + # -c ./connectors/../common/constraints.txt + # google-api-core + # googleapis-common-protos + # proto-plus +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.1 + # via google-auth +pyparsing==3.1.4 + # via httplib2 +requests==2.32.3 + # via google-api-core +rsa==4.9 + # via google-auth +uritemplate==4.1.1 + # via google-api-python-client +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests diff --git a/requirements/connectors/hubspot.txt b/requirements/connectors/hubspot.txt new file mode 100644 index 000000000..3a6a198ce --- /dev/null +++ b/requirements/connectors/hubspot.txt @@ -0,0 +1,17 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/hubspot.in --output-file ./connectors/hubspot.txt --no-strip-extras --python-version 3.9 
+certifi==2024.8.30 + # via hubspot-api-client +hubspot-api-client==9.0.0 + # via -r ./connectors/hubspot.in +python-dateutil==2.9.0.post0 + # via hubspot-api-client +six==1.16.0 + # via + # hubspot-api-client + # python-dateutil +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # -r ./connectors/hubspot.in + # hubspot-api-client diff --git a/requirements/connectors/jira.txt b/requirements/connectors/jira.txt new file mode 100644 index 000000000..e32aebe77 --- /dev/null +++ b/requirements/connectors/jira.txt @@ -0,0 +1,38 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/jira.in --output-file ./connectors/jira.txt --no-strip-extras --python-version 3.9 +atlassian-python-api==3.41.16 + # via -r ./connectors/jira.in +beautifulsoup4==4.12.3 + # via atlassian-python-api +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +deprecated==1.2.14 + # via atlassian-python-api +idna==3.10 + # via requests +jmespath==1.0.1 + # via atlassian-python-api +oauthlib==3.2.2 + # via + # atlassian-python-api + # requests-oauthlib +requests==2.32.3 + # via + # atlassian-python-api + # requests-oauthlib +requests-oauthlib==2.0.0 + # via atlassian-python-api +six==1.16.0 + # via atlassian-python-api +soupsieve==2.6 + # via beautifulsoup4 +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests +wrapt==1.16.0 + # via + # -c ./connectors/../common/constraints.txt + # deprecated diff --git a/requirements/connectors/kafka.txt b/requirements/connectors/kafka.txt new file mode 100644 index 000000000..c2a42a192 --- /dev/null +++ b/requirements/connectors/kafka.txt @@ -0,0 +1,4 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/kafka.in --output-file ./connectors/kafka.txt --no-strip-extras --python-version 3.9 +confluent-kafka==2.5.3 + # via -r ./connectors/kafka.in diff --git a/requirements/connectors/kdbai.txt 
b/requirements/connectors/kdbai.txt new file mode 100644 index 000000000..479948843 --- /dev/null +++ b/requirements/connectors/kdbai.txt @@ -0,0 +1,39 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/kdbai.in --output-file ./connectors/kdbai.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +idna==3.10 + # via requests +kdbai-client==1.3.0 + # via -r ./connectors/kdbai.in +numpy==1.26.4 + # via + # -c ./connectors/../common/constraints.txt + # pandas + # pykx +pandas==2.2.3 + # via + # kdbai-client + # pykx +pykx==2.3.0 + # via kdbai-client +python-dateutil==2.9.0.post0 + # via pandas +pytz==2024.2 + # via + # pandas + # pykx +requests==2.32.3 + # via kdbai-client +six==1.16.0 + # via python-dateutil +toml==0.10.2 + # via pykx +tzdata==2024.2 + # via pandas +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests diff --git a/requirements/connectors/milvus.txt b/requirements/connectors/milvus.txt new file mode 100644 index 000000000..bbad21125 --- /dev/null +++ b/requirements/connectors/milvus.txt @@ -0,0 +1,44 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/milvus.in --output-file ./connectors/milvus.txt --no-strip-extras --python-version 3.9 +environs==9.5.0 + # via pymilvus +grpcio==1.66.2 + # via + # -c ./connectors/../common/constraints.txt + # pymilvus +marshmallow==3.22.0 + # via environs +milvus-lite==2.4.10 + # via pymilvus +numpy==1.26.4 + # via + # -c ./connectors/../common/constraints.txt + # pandas +packaging==23.2 + # via + # -c ./connectors/../common/constraints.txt + # marshmallow +pandas==2.2.3 + # via pymilvus +protobuf==4.23.4 + # via + # -c ./connectors/../common/constraints.txt + # pymilvus +pymilvus==2.4.7 + # via -r ./connectors/milvus.in +python-dateutil==2.9.0.post0 + # via pandas +python-dotenv==1.0.1 + # via environs +pytz==2024.2 + # via 
pandas +setuptools==75.1.0 + # via pymilvus +six==1.16.0 + # via python-dateutil +tqdm==4.66.5 + # via milvus-lite +tzdata==2024.2 + # via pandas +ujson==5.10.0 + # via pymilvus diff --git a/requirements/connectors/mongodb.txt b/requirements/connectors/mongodb.txt new file mode 100644 index 000000000..0a0053a93 --- /dev/null +++ b/requirements/connectors/mongodb.txt @@ -0,0 +1,6 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/mongodb.in --output-file ./connectors/mongodb.txt --no-strip-extras --python-version 3.9 +dnspython==2.6.1 + # via pymongo +pymongo==4.10.1 + # via -r ./connectors/mongodb.in diff --git a/requirements/connectors/notion.txt b/requirements/connectors/notion.txt new file mode 100644 index 000000000..829812e6c --- /dev/null +++ b/requirements/connectors/notion.txt @@ -0,0 +1,34 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/notion.in --output-file ./connectors/notion.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via -r ./connectors/notion.in +certifi==2024.8.30 + # via + # httpcore + # httpx +exceptiongroup==1.2.2 + # via anyio +h11==0.14.0 + # via httpcore +htmlbuilder==1.0.0 + # via -r ./connectors/notion.in +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via + # -r ./connectors/notion.in + # notion-client +idna==3.10 + # via + # anyio + # httpx +notion-client==2.2.1 + # via -r ./connectors/notion.in +sniffio==1.3.1 + # via + # anyio + # httpx +typing-extensions==4.12.2 + # via anyio diff --git a/requirements/connectors/onedrive.txt b/requirements/connectors/onedrive.txt new file mode 100644 index 000000000..fba01233a --- /dev/null +++ b/requirements/connectors/onedrive.txt @@ -0,0 +1,42 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/onedrive.in --output-file ./connectors/onedrive.txt --no-strip-extras --python-version 3.9 +beautifulsoup4==4.12.3 + # 
via bs4 +bs4==0.0.2 + # via -r ./connectors/onedrive.in +certifi==2024.8.30 + # via requests +cffi==1.17.1 + # via cryptography +charset-normalizer==3.3.2 + # via requests +cryptography==43.0.1 + # via + # msal + # pyjwt +idna==3.10 + # via requests +msal==1.31.0 + # via + # -r ./connectors/onedrive.in + # office365-rest-python-client +office365-rest-python-client==2.5.13 + # via -r ./connectors/onedrive.in +pycparser==2.22 + # via cffi +pyjwt[crypto]==2.9.0 + # via msal +pytz==2024.2 + # via office365-rest-python-client +requests==2.32.3 + # via + # msal + # office365-rest-python-client +soupsieve==2.6 + # via beautifulsoup4 +typing-extensions==4.12.2 + # via office365-rest-python-client +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests diff --git a/requirements/connectors/opensearch.txt b/requirements/connectors/opensearch.txt new file mode 100644 index 000000000..a3ab23f00 --- /dev/null +++ b/requirements/connectors/opensearch.txt @@ -0,0 +1,25 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/opensearch.in --output-file ./connectors/opensearch.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via + # opensearch-py + # requests +charset-normalizer==3.3.2 + # via requests +events==0.5 + # via opensearch-py +idna==3.10 + # via requests +opensearch-py==2.7.1 + # via -r ./connectors/opensearch.in +python-dateutil==2.9.0.post0 + # via opensearch-py +requests==2.32.3 + # via opensearch-py +six==1.16.0 + # via python-dateutil +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # opensearch-py + # requests diff --git a/requirements/connectors/outlook.txt b/requirements/connectors/outlook.txt new file mode 100644 index 000000000..b94083ed6 --- /dev/null +++ b/requirements/connectors/outlook.txt @@ -0,0 +1,36 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/outlook.in --output-file 
./connectors/outlook.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via requests +cffi==1.17.1 + # via cryptography +charset-normalizer==3.3.2 + # via requests +cryptography==43.0.1 + # via + # msal + # pyjwt +idna==3.10 + # via requests +msal==1.31.0 + # via + # -r ./connectors/outlook.in + # office365-rest-python-client +office365-rest-python-client==2.5.13 + # via -r ./connectors/outlook.in +pycparser==2.22 + # via cffi +pyjwt[crypto]==2.9.0 + # via msal +pytz==2024.2 + # via office365-rest-python-client +requests==2.32.3 + # via + # msal + # office365-rest-python-client +typing-extensions==4.12.2 + # via office365-rest-python-client +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests diff --git a/requirements/connectors/pinecone.txt b/requirements/connectors/pinecone.txt new file mode 100644 index 000000000..bf00d8245 --- /dev/null +++ b/requirements/connectors/pinecone.txt @@ -0,0 +1,20 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/pinecone.in --output-file ./connectors/pinecone.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via pinecone-client +pinecone-client==5.0.1 + # via -r ./connectors/pinecone.in +pinecone-plugin-inference==1.1.0 + # via pinecone-client +pinecone-plugin-interface==0.0.7 + # via + # pinecone-client + # pinecone-plugin-inference +tqdm==4.66.5 + # via pinecone-client +typing-extensions==4.12.2 + # via pinecone-client +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # pinecone-client diff --git a/requirements/connectors/postgres.txt b/requirements/connectors/postgres.txt new file mode 100644 index 000000000..683bddbcb --- /dev/null +++ b/requirements/connectors/postgres.txt @@ -0,0 +1,4 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/postgres.in --output-file ./connectors/postgres.txt --no-strip-extras --python-version 3.9 
+psycopg2-binary==2.9.9 + # via -r ./connectors/postgres.in diff --git a/requirements/connectors/qdrant.txt b/requirements/connectors/qdrant.txt new file mode 100644 index 000000000..2b41f1ec8 --- /dev/null +++ b/requirements/connectors/qdrant.txt @@ -0,0 +1,66 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/qdrant.in --output-file ./connectors/qdrant.txt --no-strip-extras --python-version 3.9 +annotated-types==0.7.0 + # via pydantic +anyio==4.6.0 + # via httpx +certifi==2024.8.30 + # via + # httpcore + # httpx +exceptiongroup==1.2.2 + # via anyio +grpcio==1.66.2 + # via + # -c ./connectors/../common/constraints.txt + # grpcio-tools + # qdrant-client +grpcio-tools==1.62.3 + # via qdrant-client +h11==0.14.0 + # via httpcore +h2==4.1.0 + # via httpx +hpack==4.0.0 + # via h2 +httpcore==1.0.6 + # via httpx +httpx[http2]==0.27.2 + # via qdrant-client +hyperframe==6.0.1 + # via h2 +idna==3.10 + # via + # anyio + # httpx +numpy==1.26.4 + # via + # -c ./connectors/../common/constraints.txt + # qdrant-client +portalocker==2.10.1 + # via qdrant-client +protobuf==4.23.4 + # via + # -c ./connectors/../common/constraints.txt + # grpcio-tools +pydantic==2.9.2 + # via qdrant-client +pydantic-core==2.23.4 + # via pydantic +qdrant-client==1.11.3 + # via -r ./connectors/qdrant.in +setuptools==75.1.0 + # via grpcio-tools +sniffio==1.3.1 + # via + # anyio + # httpx +typing-extensions==4.12.2 + # via + # anyio + # pydantic + # pydantic-core +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # qdrant-client diff --git a/requirements/connectors/reddit.txt b/requirements/connectors/reddit.txt new file mode 100644 index 000000000..674cdd911 --- /dev/null +++ b/requirements/connectors/reddit.txt @@ -0,0 +1,24 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/reddit.in --output-file ./connectors/reddit.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via 
requests +charset-normalizer==3.3.2 + # via requests +idna==3.10 + # via requests +praw==7.7.1 + # via -r ./connectors/reddit.in +prawcore==2.4.0 + # via praw +requests==2.32.3 + # via + # prawcore + # update-checker +update-checker==0.18.0 + # via praw +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests +websocket-client==1.8.0 + # via praw diff --git a/requirements/connectors/s3.txt b/requirements/connectors/s3.txt new file mode 100644 index 000000000..4cf9363b7 --- /dev/null +++ b/requirements/connectors/s3.txt @@ -0,0 +1,59 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/s3.in --output-file ./connectors/s3.txt --no-strip-extras --python-version 3.9 +aiobotocore==2.13.3 + # via s3fs +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.8 + # via + # aiobotocore + # s3fs +aioitertools==0.12.0 + # via aiobotocore +aiosignal==1.3.1 + # via aiohttp +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +botocore==1.34.131 + # via + # -c ./connectors/../common/constraints.txt + # aiobotocore +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec==2024.5.0 + # via + # -c ./connectors/../common/constraints.txt + # -r ./connectors/s3.in + # s3fs +idna==3.10 + # via yarl +jmespath==1.0.1 + # via botocore +multidict==6.1.0 + # via + # aiohttp + # yarl +python-dateutil==2.9.0.post0 + # via botocore +s3fs==2024.5.0 + # via -r ./connectors/s3.in +six==1.16.0 + # via python-dateutil +typing-extensions==4.12.2 + # via + # aioitertools + # multidict +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # botocore +wrapt==1.16.0 + # via + # -c ./connectors/../common/constraints.txt + # aiobotocore +yarl==1.13.1 + # via aiohttp diff --git a/requirements/connectors/salesforce.txt b/requirements/connectors/salesforce.txt new file mode 100644 index 000000000..d89cdb02c --- /dev/null +++ b/requirements/connectors/salesforce.txt @@ -0,0 +1,50 @@ +# This file was 
autogenerated by uv via the following command: +# uv pip compile ./connectors/salesforce.in --output-file ./connectors/salesforce.txt --no-strip-extras --python-version 3.9 +attrs==24.2.0 + # via zeep +certifi==2024.8.30 + # via requests +cffi==1.17.1 + # via cryptography +charset-normalizer==3.3.2 + # via requests +cryptography==43.0.1 + # via pyjwt +idna==3.10 + # via requests +isodate==0.6.1 + # via zeep +lxml==5.3.0 + # via zeep +more-itertools==10.5.0 + # via simple-salesforce +platformdirs==4.3.6 + # via zeep +pycparser==2.22 + # via cffi +pyjwt[crypto]==2.9.0 + # via simple-salesforce +pytz==2024.2 + # via zeep +requests==2.32.3 + # via + # requests-file + # requests-toolbelt + # simple-salesforce + # zeep +requests-file==2.1.0 + # via zeep +requests-toolbelt==1.0.0 + # via zeep +simple-salesforce==1.12.6 + # via -r ./connectors/salesforce.in +six==1.16.0 + # via isodate +typing-extensions==4.12.2 + # via simple-salesforce +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests +zeep==4.2.1 + # via simple-salesforce diff --git a/requirements/connectors/sftp.txt b/requirements/connectors/sftp.txt new file mode 100644 index 000000000..05580ae74 --- /dev/null +++ b/requirements/connectors/sftp.txt @@ -0,0 +1,20 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/sftp.in --output-file ./connectors/sftp.txt --no-strip-extras --python-version 3.9 +bcrypt==4.2.0 + # via paramiko +cffi==1.17.1 + # via + # cryptography + # pynacl +cryptography==43.0.1 + # via paramiko +fsspec==2024.5.0 + # via + # -c ./connectors/../common/constraints.txt + # -r ./connectors/sftp.in +paramiko==3.5.0 + # via -r ./connectors/sftp.in +pycparser==2.22 + # via cffi +pynacl==1.5.0 + # via paramiko diff --git a/requirements/connectors/sharepoint.txt b/requirements/connectors/sharepoint.txt new file mode 100644 index 000000000..b97f4e733 --- /dev/null +++ b/requirements/connectors/sharepoint.txt @@ -0,0 +1,36 @@ +# 
This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/sharepoint.in --output-file ./connectors/sharepoint.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via requests +cffi==1.17.1 + # via cryptography +charset-normalizer==3.3.2 + # via requests +cryptography==43.0.1 + # via + # msal + # pyjwt +idna==3.10 + # via requests +msal==1.31.0 + # via + # -r ./connectors/sharepoint.in + # office365-rest-python-client +office365-rest-python-client==2.5.13 + # via -r ./connectors/sharepoint.in +pycparser==2.22 + # via cffi +pyjwt[crypto]==2.9.0 + # via msal +pytz==2024.2 + # via office365-rest-python-client +requests==2.32.3 + # via + # msal + # office365-rest-python-client +typing-extensions==4.12.2 + # via office365-rest-python-client +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests diff --git a/requirements/connectors/singlestore.txt b/requirements/connectors/singlestore.txt new file mode 100644 index 000000000..d60774865 --- /dev/null +++ b/requirements/connectors/singlestore.txt @@ -0,0 +1,40 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/singlestore.in --output-file ./connectors/singlestore.txt --no-strip-extras --python-version 3.9 +build==0.10.0 + # via singlestoredb +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +idna==3.10 + # via requests +packaging==23.2 + # via + # -c ./connectors/../common/constraints.txt + # build +parsimonious==0.10.0 + # via singlestoredb +pyjwt==2.9.0 + # via singlestoredb +pyproject-hooks==1.2.0 + # via build +regex==2024.9.11 + # via parsimonious +requests==2.32.3 + # via singlestoredb +setuptools==75.1.0 + # via singlestoredb +singlestoredb==1.7.1 + # via -r ./connectors/singlestore.in +sqlparams==6.1.0 + # via singlestoredb +tomli==2.0.2 + # via + # build + # singlestoredb +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests 
+wheel==0.44.0 + # via singlestoredb diff --git a/requirements/connectors/slack.txt b/requirements/connectors/slack.txt new file mode 100644 index 000000000..364412fce --- /dev/null +++ b/requirements/connectors/slack.txt @@ -0,0 +1,4 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/slack.in --output-file ./connectors/slack.txt --no-strip-extras --python-version 3.9 +slack-sdk==3.33.1 + # via -r ./connectors/slack.in diff --git a/requirements/connectors/vectara.txt b/requirements/connectors/vectara.txt new file mode 100644 index 000000000..917d5e7bc --- /dev/null +++ b/requirements/connectors/vectara.txt @@ -0,0 +1,14 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/vectara.in --output-file ./connectors/vectara.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +idna==3.10 + # via requests +requests==2.32.3 + # via -r ./connectors/vectara.in +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests diff --git a/requirements/connectors/weaviate.txt b/requirements/connectors/weaviate.txt new file mode 100644 index 000000000..2fb7c4cef --- /dev/null +++ b/requirements/connectors/weaviate.txt @@ -0,0 +1,74 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/weaviate.in --output-file ./connectors/weaviate.txt --no-strip-extras --python-version 3.9 +annotated-types==0.7.0 + # via pydantic +anyio==4.6.0 + # via httpx +authlib==1.3.1 + # via weaviate-client +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +cffi==1.17.1 + # via cryptography +charset-normalizer==3.3.2 + # via requests +cryptography==43.0.1 + # via authlib +exceptiongroup==1.2.2 + # via anyio +grpcio==1.66.2 + # via + # -c ./connectors/../common/constraints.txt + # grpcio-health-checking + # grpcio-tools + # weaviate-client +grpcio-health-checking==1.62.3 
+ # via weaviate-client +grpcio-tools==1.62.3 + # via weaviate-client +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.0 + # via weaviate-client +idna==3.10 + # via + # anyio + # httpx + # requests +protobuf==4.23.4 + # via + # -c ./connectors/../common/constraints.txt + # grpcio-health-checking + # grpcio-tools +pycparser==2.22 + # via cffi +pydantic==2.9.2 + # via weaviate-client +pydantic-core==2.23.4 + # via pydantic +requests==2.32.3 + # via weaviate-client +setuptools==75.1.0 + # via grpcio-tools +sniffio==1.3.1 + # via + # anyio + # httpx +typing-extensions==4.12.2 + # via + # anyio + # pydantic + # pydantic-core +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests +validators==0.34.0 + # via weaviate-client +weaviate-client==4.8.1 + # via -r ./connectors/weaviate.in diff --git a/requirements/connectors/wikipedia.txt b/requirements/connectors/wikipedia.txt new file mode 100644 index 000000000..48affe627 --- /dev/null +++ b/requirements/connectors/wikipedia.txt @@ -0,0 +1,20 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./connectors/wikipedia.in --output-file ./connectors/wikipedia.txt --no-strip-extras --python-version 3.9 +beautifulsoup4==4.12.3 + # via wikipedia +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +idna==3.10 + # via requests +requests==2.32.3 + # via wikipedia +soupsieve==2.6 + # via beautifulsoup4 +urllib3==1.26.20 + # via + # -c ./connectors/../common/constraints.txt + # requests +wikipedia==1.4.0 + # via -r ./connectors/wikipedia.in diff --git a/requirements/embed/aws-bedrock.txt b/requirements/embed/aws-bedrock.txt new file mode 100644 index 000000000..ed8dac05d --- /dev/null +++ b/requirements/embed/aws-bedrock.txt @@ -0,0 +1,23 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile aws-bedrock.in --output-file aws-bedrock.txt --no-strip-extras +boto3==1.34.131 + # via -r 
aws-bedrock.in +botocore==1.34.131 + # via + # -c ../common/constraints.txt + # boto3 + # s3transfer +jmespath==1.0.1 + # via + # boto3 + # botocore +python-dateutil==2.9.0.post0 + # via botocore +s3transfer==0.10.2 + # via boto3 +six==1.16.0 + # via python-dateutil +urllib3==1.26.20 + # via + # -c ../common/constraints.txt + # botocore diff --git a/requirements/embed/huggingface.txt b/requirements/embed/huggingface.txt new file mode 100644 index 000000000..510d91949 --- /dev/null +++ b/requirements/embed/huggingface.txt @@ -0,0 +1,91 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile huggingface.in --output-file huggingface.txt --no-strip-extras +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +filelock==3.16.1 + # via + # huggingface-hub + # torch + # transformers +fsspec==2024.5.0 + # via + # -c ../common/constraints.txt + # huggingface-hub + # torch +huggingface-hub==0.25.1 + # via + # sentence-transformers + # tokenizers + # transformers +idna==3.10 + # via requests +jinja2==3.1.4 + # via torch +joblib==1.4.2 + # via scikit-learn +markupsafe==2.1.5 + # via jinja2 +mpmath==1.3.0 + # via sympy +networkx==3.2.1 + # via torch +numpy==1.26.4 + # via + # -c ../common/constraints.txt + # scikit-learn + # scipy + # transformers +packaging==23.2 + # via + # -c ../common/constraints.txt + # huggingface-hub + # transformers +pillow==10.4.0 + # via sentence-transformers +pyyaml==6.0.2 + # via + # huggingface-hub + # transformers +regex==2024.9.11 + # via transformers +requests==2.32.3 + # via + # huggingface-hub + # transformers +safetensors==0.4.5 + # via transformers +scikit-learn==1.5.2 + # via sentence-transformers +scipy==1.13.1 + # via + # scikit-learn + # sentence-transformers +sentence-transformers==3.1.1 + # via -r huggingface.in +sympy==1.13.3 + # via torch +threadpoolctl==3.5.0 + # via scikit-learn +tokenizers==0.19.1 + # via + # -c ../common/constraints.txt + # transformers +torch==2.4.1 + # via 
sentence-transformers +tqdm==4.66.5 + # via + # huggingface-hub + # sentence-transformers + # transformers +transformers==4.44.2 + # via sentence-transformers +typing-extensions==4.12.2 + # via + # huggingface-hub + # torch +urllib3==1.26.20 + # via + # -c ../common/constraints.txt + # requests diff --git a/requirements/embed/mixedbreadai.txt b/requirements/embed/mixedbreadai.txt new file mode 100644 index 000000000..225b9d69b --- /dev/null +++ b/requirements/embed/mixedbreadai.txt @@ -0,0 +1,38 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile mixedbreadai.in --output-file mixedbreadai.txt --no-strip-extras +annotated-types==0.7.0 + # via pydantic +anyio==4.6.0 + # via httpx +certifi==2024.8.30 + # via + # httpcore + # httpx +exceptiongroup==1.2.2 + # via anyio +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via mixedbread-ai +idna==3.10 + # via + # anyio + # httpx +mixedbread-ai==2.2.6 + # via -r mixedbreadai.in +pydantic==2.9.2 + # via mixedbread-ai +pydantic-core==2.23.4 + # via pydantic +sniffio==1.3.1 + # via + # anyio + # httpx +typing-extensions==4.12.2 + # via + # anyio + # mixedbread-ai + # pydantic + # pydantic-core diff --git a/requirements/embed/octoai.txt b/requirements/embed/octoai.txt new file mode 100644 index 000000000..5a858859c --- /dev/null +++ b/requirements/embed/octoai.txt @@ -0,0 +1,61 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile octoai.in --output-file octoai.txt --no-strip-extras +annotated-types==0.7.0 + # via pydantic +anyio==4.6.0 + # via + # httpx + # openai +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +charset-normalizer==3.3.2 + # via requests +distro==1.9.0 + # via openai +exceptiongroup==1.2.2 + # via anyio +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via openai +idna==3.10 + # via + # anyio + # httpx + # requests +jiter==0.5.0 + # via openai +openai==1.51.0 + # via -r 
octoai.in +pydantic==2.9.2 + # via openai +pydantic-core==2.23.4 + # via pydantic +regex==2024.9.11 + # via tiktoken +requests==2.32.3 + # via tiktoken +sniffio==1.3.1 + # via + # anyio + # httpx + # openai +tiktoken==0.7.0 + # via -r octoai.in +tqdm==4.66.5 + # via openai +typing-extensions==4.12.2 + # via + # anyio + # openai + # pydantic + # pydantic-core +urllib3==1.26.20 + # via + # -c ../common/constraints.txt + # requests diff --git a/requirements/embed/openai.txt b/requirements/embed/openai.txt new file mode 100644 index 000000000..822884229 --- /dev/null +++ b/requirements/embed/openai.txt @@ -0,0 +1,61 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile openai.in --output-file openai.txt --no-strip-extras +annotated-types==0.7.0 + # via pydantic +anyio==4.6.0 + # via + # httpx + # openai +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +charset-normalizer==3.3.2 + # via requests +distro==1.9.0 + # via openai +exceptiongroup==1.2.2 + # via anyio +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via openai +idna==3.10 + # via + # anyio + # httpx + # requests +jiter==0.5.0 + # via openai +openai==1.51.0 + # via -r openai.in +pydantic==2.9.2 + # via openai +pydantic-core==2.23.4 + # via pydantic +regex==2024.9.11 + # via tiktoken +requests==2.32.3 + # via tiktoken +sniffio==1.3.1 + # via + # anyio + # httpx + # openai +tiktoken==0.7.0 + # via -r openai.in +tqdm==4.66.5 + # via openai +typing-extensions==4.12.2 + # via + # anyio + # openai + # pydantic + # pydantic-core +urllib3==1.26.20 + # via + # -c ../common/constraints.txt + # requests diff --git a/requirements/embed/vertexai.txt b/requirements/embed/vertexai.txt new file mode 100644 index 000000000..80d3ff645 --- /dev/null +++ b/requirements/embed/vertexai.txt @@ -0,0 +1,122 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile vertexai.in --output-file vertexai.txt --no-strip-extras 
+annotated-types==0.7.0 + # via pydantic +cachetools==5.5.0 + # via google-auth +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +docstring-parser==0.16 + # via google-cloud-aiplatform +google-api-core[grpc]==2.20.0 + # via + # google-cloud-aiplatform + # google-cloud-bigquery + # google-cloud-core + # google-cloud-resource-manager + # google-cloud-storage +google-auth==2.35.0 + # via + # google-api-core + # google-cloud-aiplatform + # google-cloud-bigquery + # google-cloud-core + # google-cloud-resource-manager + # google-cloud-storage +google-cloud-aiplatform[all]==1.69.0 + # via vertexai +google-cloud-bigquery==3.26.0 + # via google-cloud-aiplatform +google-cloud-core==2.4.1 + # via + # google-cloud-bigquery + # google-cloud-storage +google-cloud-resource-manager==1.12.5 + # via google-cloud-aiplatform +google-cloud-storage==2.18.2 + # via google-cloud-aiplatform +google-crc32c==1.6.0 + # via + # google-cloud-storage + # google-resumable-media +google-resumable-media==2.7.2 + # via + # google-cloud-bigquery + # google-cloud-storage +googleapis-common-protos[grpc]==1.65.0 + # via + # google-api-core + # grpc-google-iam-v1 + # grpcio-status +grpc-google-iam-v1==0.13.1 + # via google-cloud-resource-manager +grpcio==1.66.2 + # via + # -c ../common/constraints.txt + # google-api-core + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status +grpcio-status==1.62.3 + # via google-api-core +idna==3.10 + # via requests +numpy==1.26.4 + # via + # -c ../common/constraints.txt + # shapely +packaging==23.2 + # via + # -c ../common/constraints.txt + # google-cloud-aiplatform + # google-cloud-bigquery +proto-plus==1.24.0 + # via + # google-api-core + # google-cloud-aiplatform + # google-cloud-resource-manager +protobuf==4.23.4 + # via + # -c ../common/constraints.txt + # google-api-core + # google-cloud-aiplatform + # google-cloud-resource-manager + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status + # proto-plus 
+pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.1 + # via google-auth +pydantic==2.9.2 + # via google-cloud-aiplatform +pydantic-core==2.23.4 + # via pydantic +python-dateutil==2.9.0.post0 + # via google-cloud-bigquery +requests==2.32.3 + # via + # google-api-core + # google-cloud-bigquery + # google-cloud-storage +rsa==4.9 + # via google-auth +shapely==2.0.6 + # via google-cloud-aiplatform +six==1.16.0 + # via python-dateutil +typing-extensions==4.12.2 + # via + # pydantic + # pydantic-core +urllib3==1.26.20 + # via + # -c ../common/constraints.txt + # requests +vertexai==1.69.0 + # via -r vertexai.in diff --git a/requirements/embed/voyageai.txt b/requirements/embed/voyageai.txt new file mode 100644 index 000000000..ab06aa637 --- /dev/null +++ b/requirements/embed/voyageai.txt @@ -0,0 +1,48 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile voyageai.in --output-file voyageai.txt --no-strip-extras +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.8 + # via voyageai +aiolimiter==1.1.0 + # via voyageai +aiosignal==1.3.1 + # via aiohttp +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +idna==3.10 + # via + # requests + # yarl +multidict==6.1.0 + # via + # aiohttp + # yarl +numpy==1.26.4 + # via + # -c ../common/constraints.txt + # voyageai +requests==2.32.3 + # via voyageai +tenacity==9.0.0 + # via voyageai +typing-extensions==4.12.2 + # via multidict +urllib3==1.26.20 + # via + # -c ../common/constraints.txt + # requests +voyageai==0.2.3 + # via -r voyageai.in +yarl==1.13.1 + # via aiohttp diff --git a/requirements/local_partition/doc.txt b/requirements/local_partition/doc.txt new file mode 100644 index 000000000..362e65463 --- /dev/null +++ b/requirements/local_partition/doc.txt @@ -0,0 +1,161 @@ +# This file was autogenerated by uv via the following 
command: +# uv pip compile ./local_partition/doc.in --output-file ./local_partition/doc.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via + # python-docx + # unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-docx==1.1.2 + # via unstructured +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via 
unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-docx + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[doc]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/doc.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/docx.txt b/requirements/local_partition/docx.txt new file mode 100644 index 000000000..8807fc2bc --- /dev/null +++ b/requirements/local_partition/docx.txt @@ -0,0 +1,161 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/docx.in --output-file ./local_partition/docx.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + 
# via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via + # python-docx + # unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-docx==1.1.2 + # via unstructured +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-docx + # python-oxmsg + # typing-inspect + # unstructured + # 
unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[docx]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/docx.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/epub.txt b/requirements/local_partition/epub.txt new file mode 100644 index 000000000..1866c99fb --- /dev/null +++ b/requirements/local_partition/epub.txt @@ -0,0 +1,158 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/epub.in --output-file ./local_partition/epub.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # 
unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypandoc==1.13 + # via unstructured +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[epub]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/epub.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/image.txt b/requirements/local_partition/image.txt new file 
mode 100644 index 000000000..197a7cc36 --- /dev/null +++ b/requirements/local_partition/image.txt @@ -0,0 +1,395 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/image.in --output-file ./local_partition/image.txt --no-strip-extras --python-version 3.9 +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +cachetools==5.5.0 + # via google-auth +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via + # pdfplumber + # unstructured +charset-normalizer==3.3.2 + # via + # pdfminer-six + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +coloredlogs==15.0.1 + # via onnxruntime +contourpy==1.3.0 + # via matplotlib +cryptography==43.0.1 + # via + # pdfminer-six + # unstructured-client +cycler==0.12.1 + # via matplotlib +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +deprecated==1.2.14 + # via pikepdf +effdet==0.4.1 + # via unstructured +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filelock==3.16.1 + # via + # huggingface-hub + # torch + # transformers +filetype==1.2.0 + # via unstructured +flatbuffers==24.3.25 + # via onnxruntime +fonttools==4.54.1 + # via matplotlib +fsspec==2024.5.0 + # via + # -c ./local_partition/../common/constraints.txt + # huggingface-hub + # torch +google-api-core[grpc]==2.20.0 + # via google-cloud-vision +google-auth==2.35.0 + # via + # google-api-core + # google-cloud-vision +google-cloud-vision==3.7.4 + # via unstructured +googleapis-common-protos==1.65.0 + # via + # google-api-core + # grpcio-status +grpcio==1.66.2 + # via + # -c ./local_partition/../common/constraints.txt + # google-api-core + # grpcio-status +grpcio-status==1.62.3 + # via google-api-core +h11==0.14.0 
+ # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +huggingface-hub==0.25.1 + # via + # timm + # tokenizers + # transformers + # unstructured-inference +humanfriendly==10.0 + # via coloredlogs +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +importlib-resources==6.4.5 + # via matplotlib +iopath==0.1.10 + # via layoutparser +jinja2==3.1.4 + # via torch +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +kiwisolver==1.4.7 + # via matplotlib +langdetect==1.0.9 + # via unstructured +layoutparser==0.3.4 + # via unstructured-inference +lxml==5.3.0 + # via + # pikepdf + # unstructured +markupsafe==2.1.5 + # via jinja2 +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +matplotlib==3.9.2 + # via + # pycocotools + # unstructured-inference +mpmath==1.3.0 + # via sympy +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +networkx==3.2.1 + # via torch +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # contourpy + # layoutparser + # matplotlib + # onnx + # onnxruntime + # opencv-python + # pandas + # pycocotools + # scipy + # torchvision + # transformers + # unstructured +olefile==0.47 + # via python-oxmsg +omegaconf==2.3.0 + # via effdet +onnx==1.17.0 + # via + # unstructured + # unstructured-inference +onnxruntime==1.19.2 + # via unstructured-inference +opencv-python==4.10.0.84 + # via + # layoutparser + # unstructured-inference +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # huggingface-hub + # marshmallow + # matplotlib + # onnxruntime + # pikepdf + # transformers + # unstructured-client + # unstructured-pytesseract +pandas==2.2.3 + # via layoutparser +pdf2image==1.17.0 + # via + # layoutparser + # unstructured +pdfminer-six==20240706 + # via + # pdfplumber 
+ # unstructured +pdfplumber==0.5.3 + # via layoutparser +pi-heif==0.18.0 + # via unstructured +pikepdf==9.3.0 + # via unstructured +pillow==10.4.0 + # via + # layoutparser + # matplotlib + # pdf2image + # pdfplumber + # pi-heif + # pikepdf + # torchvision + # unstructured-pytesseract +portalocker==2.10.1 + # via iopath +proto-plus==1.24.0 + # via + # google-api-core + # google-cloud-vision +protobuf==4.23.4 + # via + # -c ./local_partition/../common/constraints.txt + # google-api-core + # google-cloud-vision + # googleapis-common-protos + # grpcio-status + # onnx + # onnxruntime + # proto-plus +psutil==6.0.0 + # via unstructured +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.1 + # via google-auth +pycocotools==2.0.8 + # via effdet +pycparser==2.22 + # via cffi +pycrypto==2.6.1 + # via pdfplumber +pyparsing==3.1.4 + # via matplotlib +pypdf==5.0.1 + # via + # unstructured + # unstructured-client +python-dateutil==2.9.0.post0 + # via + # matplotlib + # pandas + # unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-multipart==0.0.12 + # via unstructured-inference +python-oxmsg==0.0.1 + # via unstructured +pytz==2024.2 + # via pandas +pyyaml==6.0.2 + # via + # huggingface-hub + # layoutparser + # omegaconf + # timm + # transformers +rapidfuzz==3.10.0 + # via + # unstructured + # unstructured-inference +regex==2024.9.11 + # via + # nltk + # transformers +requests==2.32.3 + # via + # google-api-core + # huggingface-hub + # requests-toolbelt + # transformers + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +rsa==4.9 + # via google-auth +safetensors==0.4.5 + # via + # timm + # transformers +scipy==1.13.1 + # via layoutparser +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +sympy==1.13.3 + # via + # onnxruntime + # torch 
+tabulate==0.9.0 + # via unstructured +timm==1.0.9 + # via + # effdet + # unstructured-inference +tokenizers==0.19.1 + # via + # -c ./local_partition/../common/constraints.txt + # transformers +torch==2.4.1 + # via + # effdet + # timm + # torchvision + # unstructured-inference +torchvision==0.19.1 + # via + # effdet + # timm +tqdm==4.66.5 + # via + # huggingface-hub + # iopath + # nltk + # transformers + # unstructured +transformers==4.44.2 + # via unstructured-inference +typing-extensions==4.12.2 + # via + # anyio + # huggingface-hub + # iopath + # pypdf + # python-oxmsg + # torch + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +tzdata==2024.2 + # via pandas +unicodecsv==0.14.1 + # via pdfplumber +unstructured[image]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/image.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +unstructured-inference==0.7.36 + # via unstructured +unstructured-pytesseract==0.3.13 + # via unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wand==0.6.13 + # via pdfplumber +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # deprecated + # unstructured +zipp==3.20.2 + # via importlib-resources diff --git a/requirements/local_partition/md.txt b/requirements/local_partition/md.txt new file mode 100644 index 000000000..0725988eb --- /dev/null +++ b/requirements/local_partition/md.txt @@ -0,0 +1,158 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/md.in --output-file ./local_partition/md.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # 
unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via unstructured +markdown==3.3.4 + # via unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # 
anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[md]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/md.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/msg.txt b/requirements/local_partition/msg.txt new file mode 100644 index 000000000..9142dacf3 --- /dev/null +++ b/requirements/local_partition/msg.txt @@ -0,0 +1,156 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/msg.in --output-file ./local_partition/msg.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # 
requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[msg]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/msg.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # 
unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/odt.txt b/requirements/local_partition/odt.txt new file mode 100644 index 000000000..42cae6865 --- /dev/null +++ b/requirements/local_partition/odt.txt @@ -0,0 +1,163 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/odt.in --output-file ./local_partition/odt.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via + # python-docx + # unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c 
./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypandoc==1.13 + # via unstructured +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-docx==1.1.2 + # via unstructured +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-docx + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[odt]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/odt.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/org.txt b/requirements/local_partition/org.txt new file mode 100644 index 000000000..1c655d6ec --- /dev/null +++ b/requirements/local_partition/org.txt @@ -0,0 +1,158 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/org.in --output-file ./local_partition/org.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via 
httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypandoc==1.13 + # via unstructured +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client 
+requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[org]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/org.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/pdf.txt b/requirements/local_partition/pdf.txt new file mode 100644 index 000000000..80ad35bc6 --- /dev/null +++ b/requirements/local_partition/pdf.txt @@ -0,0 +1,395 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/pdf.in --output-file ./local_partition/pdf.txt --no-strip-extras --python-version 3.9 +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +cachetools==5.5.0 + # via google-auth +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via + # pdfplumber + # unstructured +charset-normalizer==3.3.2 + # via + # pdfminer-six + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +coloredlogs==15.0.1 + # via onnxruntime +contourpy==1.3.0 + # via matplotlib +cryptography==43.0.1 + # via + # pdfminer-six + # 
unstructured-client +cycler==0.12.1 + # via matplotlib +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +deprecated==1.2.14 + # via pikepdf +effdet==0.4.1 + # via unstructured +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filelock==3.16.1 + # via + # huggingface-hub + # torch + # transformers +filetype==1.2.0 + # via unstructured +flatbuffers==24.3.25 + # via onnxruntime +fonttools==4.54.1 + # via matplotlib +fsspec==2024.5.0 + # via + # -c ./local_partition/../common/constraints.txt + # huggingface-hub + # torch +google-api-core[grpc]==2.20.0 + # via google-cloud-vision +google-auth==2.35.0 + # via + # google-api-core + # google-cloud-vision +google-cloud-vision==3.7.4 + # via unstructured +googleapis-common-protos==1.65.0 + # via + # google-api-core + # grpcio-status +grpcio==1.66.2 + # via + # -c ./local_partition/../common/constraints.txt + # google-api-core + # grpcio-status +grpcio-status==1.62.3 + # via google-api-core +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +huggingface-hub==0.25.1 + # via + # timm + # tokenizers + # transformers + # unstructured-inference +humanfriendly==10.0 + # via coloredlogs +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +importlib-resources==6.4.5 + # via matplotlib +iopath==0.1.10 + # via layoutparser +jinja2==3.1.4 + # via torch +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +kiwisolver==1.4.7 + # via matplotlib +langdetect==1.0.9 + # via unstructured +layoutparser==0.3.4 + # via unstructured-inference +lxml==5.3.0 + # via + # pikepdf + # unstructured +markupsafe==2.1.5 + # via jinja2 +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +matplotlib==3.9.2 + # via + # pycocotools + # unstructured-inference +mpmath==1.3.0 + # via sympy +mypy-extensions==1.0.0 + # via + # typing-inspect + # 
unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +networkx==3.2.1 + # via torch +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # contourpy + # layoutparser + # matplotlib + # onnx + # onnxruntime + # opencv-python + # pandas + # pycocotools + # scipy + # torchvision + # transformers + # unstructured +olefile==0.47 + # via python-oxmsg +omegaconf==2.3.0 + # via effdet +onnx==1.17.0 + # via + # unstructured + # unstructured-inference +onnxruntime==1.19.2 + # via unstructured-inference +opencv-python==4.10.0.84 + # via + # layoutparser + # unstructured-inference +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # huggingface-hub + # marshmallow + # matplotlib + # onnxruntime + # pikepdf + # transformers + # unstructured-client + # unstructured-pytesseract +pandas==2.2.3 + # via layoutparser +pdf2image==1.17.0 + # via + # layoutparser + # unstructured +pdfminer-six==20240706 + # via + # pdfplumber + # unstructured +pdfplumber==0.5.3 + # via layoutparser +pi-heif==0.18.0 + # via unstructured +pikepdf==9.3.0 + # via unstructured +pillow==10.4.0 + # via + # layoutparser + # matplotlib + # pdf2image + # pdfplumber + # pi-heif + # pikepdf + # torchvision + # unstructured-pytesseract +portalocker==2.10.1 + # via iopath +proto-plus==1.24.0 + # via + # google-api-core + # google-cloud-vision +protobuf==4.23.4 + # via + # -c ./local_partition/../common/constraints.txt + # google-api-core + # google-cloud-vision + # googleapis-common-protos + # grpcio-status + # onnx + # onnxruntime + # proto-plus +psutil==6.0.0 + # via unstructured +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.1 + # via google-auth +pycocotools==2.0.8 + # via effdet +pycparser==2.22 + # via cffi +pycrypto==2.6.1 + # via pdfplumber +pyparsing==3.1.4 + # via matplotlib +pypdf==5.0.1 + # via + # unstructured + # unstructured-client 
+python-dateutil==2.9.0.post0 + # via + # matplotlib + # pandas + # unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-multipart==0.0.12 + # via unstructured-inference +python-oxmsg==0.0.1 + # via unstructured +pytz==2024.2 + # via pandas +pyyaml==6.0.2 + # via + # huggingface-hub + # layoutparser + # omegaconf + # timm + # transformers +rapidfuzz==3.10.0 + # via + # unstructured + # unstructured-inference +regex==2024.9.11 + # via + # nltk + # transformers +requests==2.32.3 + # via + # google-api-core + # huggingface-hub + # requests-toolbelt + # transformers + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +rsa==4.9 + # via google-auth +safetensors==0.4.5 + # via + # timm + # transformers +scipy==1.13.1 + # via layoutparser +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +sympy==1.13.3 + # via + # onnxruntime + # torch +tabulate==0.9.0 + # via unstructured +timm==1.0.9 + # via + # effdet + # unstructured-inference +tokenizers==0.19.1 + # via + # -c ./local_partition/../common/constraints.txt + # transformers +torch==2.4.1 + # via + # effdet + # timm + # torchvision + # unstructured-inference +torchvision==0.19.1 + # via + # effdet + # timm +tqdm==4.66.5 + # via + # huggingface-hub + # iopath + # nltk + # transformers + # unstructured +transformers==4.44.2 + # via unstructured-inference +typing-extensions==4.12.2 + # via + # anyio + # huggingface-hub + # iopath + # pypdf + # python-oxmsg + # torch + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +tzdata==2024.2 + # via pandas +unicodecsv==0.14.1 + # via pdfplumber +unstructured[pdf]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/pdf.in 
+unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +unstructured-inference==0.7.36 + # via unstructured +unstructured-pytesseract==0.3.13 + # via unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wand==0.6.13 + # via pdfplumber +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # deprecated + # unstructured +zipp==3.20.2 + # via importlib-resources diff --git a/requirements/local_partition/ppt.txt b/requirements/local_partition/ppt.txt new file mode 100644 index 000000000..79eab0caa --- /dev/null +++ b/requirements/local_partition/ppt.txt @@ -0,0 +1,165 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/ppt.in --output-file ./local_partition/ppt.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via + # python-pptx + # unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # 
unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +pillow==10.4.0 + # via python-pptx +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +python-pptx==1.0.2 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # python-pptx + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[ppt]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/ppt.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +xlsxwriter==3.2.0 + # via python-pptx diff --git 
a/requirements/local_partition/pptx.txt b/requirements/local_partition/pptx.txt new file mode 100644 index 000000000..c60fdb9b2 --- /dev/null +++ b/requirements/local_partition/pptx.txt @@ -0,0 +1,165 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/pptx.in --output-file ./local_partition/pptx.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via + # python-pptx + # unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +pillow==10.4.0 + # via python-pptx +psutil==6.0.0 + # via 
unstructured +pycparser==2.22 + # via cffi +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +python-pptx==1.0.2 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # python-pptx + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[pptx]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/pptx.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +xlsxwriter==3.2.0 + # via python-pptx diff --git a/requirements/local_partition/rst.txt b/requirements/local_partition/rst.txt new file mode 100644 index 000000000..1cc6eee06 --- /dev/null +++ b/requirements/local_partition/rst.txt @@ -0,0 +1,158 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/rst.in --output-file ./local_partition/rst.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured 
+certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypandoc==1.13 + # via unstructured +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # 
python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[rst]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/rst.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/rtf.txt b/requirements/local_partition/rtf.txt new file mode 100644 index 000000000..afbbb61e6 --- /dev/null +++ b/requirements/local_partition/rtf.txt @@ -0,0 +1,158 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/rtf.in --output-file ./local_partition/rtf.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via 
unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypandoc==1.13 + # via unstructured +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured[rtf]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/rtf.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured 
+urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/tsv.txt b/requirements/local_partition/tsv.txt new file mode 100644 index 000000000..363434a2d --- /dev/null +++ b/requirements/local_partition/tsv.txt @@ -0,0 +1,165 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/tsv.in --output-file ./local_partition/tsv.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # pandas + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + 
# via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +pandas==2.2.3 + # via unstructured +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via + # pandas + # unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +pytz==2024.2 + # via pandas +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +tzdata==2024.2 + # via pandas +unstructured[tsv]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/tsv.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured diff --git a/requirements/local_partition/xlsx.txt b/requirements/local_partition/xlsx.txt new file mode 100644 index 000000000..fbd34079c --- /dev/null +++ b/requirements/local_partition/xlsx.txt @@ -0,0 +1,173 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./local_partition/xlsx.in --output-file 
./local_partition/xlsx.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +emoji==2.13.2 + # via unstructured +et-xmlfile==1.1.0 + # via openpyxl +exceptiongroup==1.2.2 + # via anyio +filetype==1.2.0 + # via unstructured +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +networkx==3.2.1 + # via unstructured +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./local_partition/../common/constraints.txt + # pandas + # unstructured +olefile==0.47 + # via python-oxmsg +openpyxl==3.1.5 + # via unstructured +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./local_partition/../common/constraints.txt + # marshmallow + # unstructured-client +pandas==2.2.3 + # via unstructured +psutil==6.0.0 + # via unstructured +pycparser==2.22 + # via cffi +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via + # pandas + # unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via 
unstructured +python-oxmsg==0.0.1 + # via unstructured +pytz==2024.2 + # via pandas +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +tzdata==2024.2 + # via pandas +unstructured[xlsx]==0.15.10 + # via + # -c ./local_partition/../common/constraints.txt + # -r ./local_partition/xlsx.in +unstructured-client==0.25.9 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./local_partition/../common/constraints.txt + # requests + # unstructured-client +wrapt==1.16.0 + # via + # -c ./local_partition/../common/constraints.txt + # unstructured +xlrd==2.0.1 + # via unstructured diff --git a/requirements/remote/client.txt b/requirements/remote/client.txt new file mode 100644 index 000000000..8163b3e61 --- /dev/null +++ b/requirements/remote/client.txt @@ -0,0 +1,94 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./remote/client.in --output-file ./remote/client.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +cryptography==43.0.1 + # via unstructured-client +dataclasses-json==0.6.7 + # via unstructured-client +deepdiff==8.0.1 + # via unstructured-client 
+exceptiongroup==1.2.2 + # via anyio +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +jsonpath-python==1.0.6 + # via unstructured-client +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./remote/../common/constraints.txt + # marshmallow + # unstructured-client +pycparser==2.22 + # via cffi +pypdf==5.0.1 + # via unstructured-client +python-dateutil==2.9.0.post0 + # via unstructured-client +requests==2.32.3 + # via + # requests-toolbelt + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +six==1.16.0 + # via + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +typing-extensions==4.12.2 + # via + # anyio + # pypdf + # typing-inspect + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured-client==0.25.9 + # via + # -c ./remote/../common/constraints.txt + # -r ./remote/client.in +urllib3==1.26.20 + # via + # -c ./remote/../common/constraints.txt + # requests + # unstructured-client From 40bc3ff770268a5434ea1f721dd752a80884b569 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 17:44:13 +0100 Subject: [PATCH 07/48] More restored files from main --- requirements/lint.txt | 49 +++++++ requirements/release.txt | 42 ++++++ requirements/test.txt | 282 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 373 insertions(+) create mode 100644 requirements/lint.txt create mode 100644 requirements/release.txt create mode 100644 requirements/test.txt diff --git a/requirements/lint.txt b/requirements/lint.txt new file mode 100644 index 000000000..c8aa6afb4 --- /dev/null +++ b/requirements/lint.txt 
@@ -0,0 +1,49 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./lint.in --output-file ./lint.txt --no-strip-extras --python-version 3.9 +autoflake==2.3.1 + # via -r ./lint.in +black==24.8.0 + # via -r ./lint.in +click==8.1.7 + # via black +flake8==7.1.1 + # via + # -r ./lint.in + # flake8-print +flake8-print==5.0.0 + # via -r ./lint.in +mccabe==0.7.0 + # via flake8 +mypy==1.11.2 + # via -r ./lint.in +mypy-extensions==1.0.0 + # via + # black + # mypy +packaging==23.2 + # via + # -c ././common/constraints.txt + # black +pathspec==0.12.1 + # via black +platformdirs==4.3.6 + # via black +pycodestyle==2.12.1 + # via + # flake8 + # flake8-print +pyflakes==3.2.0 + # via + # autoflake + # flake8 +ruff==0.6.8 + # via -r ./lint.in +tomli==2.0.2 + # via + # autoflake + # black + # mypy +typing-extensions==4.12.2 + # via + # black + # mypy diff --git a/requirements/release.txt b/requirements/release.txt new file mode 100644 index 000000000..5d3f1c65d --- /dev/null +++ b/requirements/release.txt @@ -0,0 +1,42 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./release.in --output-file ./release.txt --no-strip-extras --python-version 3.9 +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +colorama==0.4.6 + # via twine +docutils==0.21.2 + # via readme-renderer +idna==3.10 + # via requests +keyring==22.3.0 + # via twine +nh3==0.2.18 + # via readme-renderer +pkginfo==1.11.1 + # via twine +pygments==2.18.0 + # via readme-renderer +readme-renderer==44.0 + # via twine +requests==2.32.3 + # via + # requests-toolbelt + # twine +requests-toolbelt==1.0.0 + # via twine +rfc3986==2.0.0 + # via twine +setuptools==75.1.0 + # via twine +tqdm==4.66.5 + # via twine +twine==3.3.0 + # via -r ./release.in +urllib3==1.26.20 + # via + # -c ././common/constraints.txt + # requests +wheel==0.44.0 + # via -r ./release.in diff --git a/requirements/test.txt b/requirements/test.txt new file mode 100644 
index 000000000..6b3c6072e --- /dev/null +++ b/requirements/test.txt @@ -0,0 +1,282 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile test.in --output-file test.txt --no-strip-extras +annotated-types==0.7.0 + # via pydantic +anyio==4.6.0 + # via httpx +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via unstructured +cachetools==5.5.0 + # via google-auth +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests + # unstructured-client +cffi==1.17.1 + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via + # requests + # unstructured-client +click==8.1.7 + # via + # nltk + # python-oxmsg +coverage[toml]==7.6.1 + # via pytest-cov +cryptography==43.0.1 + # via + # -r test.in + # unstructured-client +dataclasses-json==0.6.7 + # via + # unstructured + # unstructured-client +deepdiff==8.0.1 + # via unstructured-client +docstring-parser==0.16 + # via google-cloud-aiplatform +emoji==2.13.2 + # via unstructured +exceptiongroup==1.2.2 + # via + # anyio + # pytest +filetype==1.2.0 + # via unstructured +fsspec==2024.5.0 + # via + # -c ./common/constraints.txt + # -r test.in +google-api-core[grpc]==2.20.0 + # via + # google-cloud-aiplatform + # google-cloud-bigquery + # google-cloud-core + # google-cloud-resource-manager + # google-cloud-storage +google-auth==2.35.0 + # via + # google-api-core + # google-cloud-aiplatform + # google-cloud-bigquery + # google-cloud-core + # google-cloud-resource-manager + # google-cloud-storage +google-cloud-aiplatform[all]==1.69.0 + # via vertexai +google-cloud-bigquery==3.26.0 + # via google-cloud-aiplatform +google-cloud-core==2.4.1 + # via + # google-cloud-bigquery + # google-cloud-storage +google-cloud-resource-manager==1.12.5 + # via google-cloud-aiplatform +google-cloud-storage==2.18.2 + # via google-cloud-aiplatform +google-crc32c==1.6.0 + # via + # google-cloud-storage + # google-resumable-media +google-resumable-media==2.7.2 + # via + # 
google-cloud-bigquery + # google-cloud-storage +googleapis-common-protos[grpc]==1.65.0 + # via + # google-api-core + # grpc-google-iam-v1 + # grpcio-status +grpc-google-iam-v1==0.13.1 + # via google-cloud-resource-manager +grpcio==1.66.2 + # via + # -c ./common/constraints.txt + # google-api-core + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status +grpcio-status==1.62.3 + # via google-api-core +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via unstructured-client +idna==3.10 + # via + # anyio + # httpx + # requests + # unstructured-client +iniconfig==2.0.0 + # via pytest +joblib==1.4.2 + # via nltk +jsonpath-python==1.0.6 + # via unstructured-client +langdetect==1.0.9 + # via unstructured +lxml==5.3.0 + # via unstructured +marshmallow==3.22.0 + # via + # dataclasses-json + # unstructured-client +mypy-extensions==1.0.0 + # via + # typing-inspect + # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client +nltk==3.9.1 + # via unstructured +numpy==1.26.4 + # via + # -c ./common/constraints.txt + # shapely + # unstructured +olefile==0.47 + # via python-oxmsg +orderly-set==5.2.2 + # via deepdiff +packaging==23.2 + # via + # -c ./common/constraints.txt + # google-cloud-aiplatform + # google-cloud-bigquery + # marshmallow + # pytest + # unstructured-client +pluggy==1.5.0 + # via pytest +proto-plus==1.24.0 + # via + # google-api-core + # google-cloud-aiplatform + # google-cloud-resource-manager +protobuf==4.23.4 + # via + # -c ./common/constraints.txt + # google-api-core + # google-cloud-aiplatform + # google-cloud-resource-manager + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status + # proto-plus +psutil==6.0.0 + # via unstructured +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.1 + # via google-auth +pycparser==2.22 + # via cffi +pydantic==2.9.2 + # via google-cloud-aiplatform +pydantic-core==2.23.4 + # via pydantic +pypdf==5.0.1 + # via unstructured-client 
+pytest==8.3.3 + # via + # -r test.in + # pytest-cov + # pytest-mock +pytest-cov==5.0.0 + # via -r test.in +pytest-mock==3.14.0 + # via -r test.in +python-dateutil==2.9.0.post0 + # via + # google-cloud-bigquery + # unstructured-client +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-oxmsg==0.0.1 + # via unstructured +rapidfuzz==3.10.0 + # via unstructured +regex==2024.9.11 + # via nltk +requests==2.32.3 + # via + # google-api-core + # google-cloud-bigquery + # google-cloud-storage + # requests-toolbelt + # unstructured + # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +rsa==4.9 + # via google-auth +shapely==2.0.6 + # via google-cloud-aiplatform +six==1.16.0 + # via + # langdetect + # python-dateutil + # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx +soupsieve==2.6 + # via beautifulsoup4 +tabulate==0.9.0 + # via unstructured +tomli==2.0.2 + # via + # coverage + # pytest +tqdm==4.66.5 + # via + # nltk + # unstructured +typing-extensions==4.12.2 + # via + # anyio + # pydantic + # pydantic-core + # pypdf + # python-oxmsg + # typing-inspect + # unstructured + # unstructured-client +typing-inspect==0.9.0 + # via + # dataclasses-json + # unstructured-client +unstructured==0.15.10 + # via + # -c ./common/constraints.txt + # -r test.in +unstructured-client==0.25.9 + # via + # -c ./common/constraints.txt + # unstructured +urllib3==1.26.20 + # via + # -c ./common/constraints.txt + # requests + # unstructured-client +vertexai==1.69.0 + # via -r test.in +wrapt==1.16.0 + # via + # -c ./common/constraints.txt + # unstructured From 16821816f11344200b5450ffe3778915baae5b13 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 17:51:16 +0100 Subject: [PATCH 08/48] removed useless variable --- .../v2/processes/connectors/notion/types/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/block.py 
b/unstructured_ingest/v2/processes/connectors/notion/types/block.py index 4df97756b..b9d5f0379 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/block.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/block.py @@ -76,7 +76,7 @@ def from_dict(cls, data: dict): last_edited_by = data.pop("last_edited_by") parent = data.pop("parent") if "in_trash" in data: - in_trash = data.pop("in_trash") + data.pop("in_trash") try: block = cls( created_by=PartialUser.from_dict(created_by), From cff69aae513bed6edf75cc2e5eb961712404b1bc Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 17:55:10 +0100 Subject: [PATCH 09/48] Optimized imports --- unstructured_ingest/v2/processes/connectors/__init__.py | 5 ++--- .../v2/processes/connectors/notion/client.py | 6 +++--- .../connectors/notion/types/database_properties/__init__.py | 1 - 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/__init__.py b/unstructured_ingest/v2/processes/connectors/__init__.py index f73be8a53..4b347518d 100644 --- a/unstructured_ingest/v2/processes/connectors/__init__.py +++ b/unstructured_ingest/v2/processes/connectors/__init__.py @@ -5,7 +5,6 @@ add_destination_entry, add_source_entry, ) - from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE from .airtable import airtable_source_entry from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE @@ -30,6 +29,8 @@ from .milvus import milvus_destination_entry from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE from .mongodb import mongodb_destination_entry +from .notion.connector import CONNECTOR_TYPE as NOTION_CONNECTOR_TYPE +from .notion.connector import notion_source_entry from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE from .onedrive import onedrive_source_entry from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE @@ -46,8 +47,6 @@ from .sql import sql_destination_entry from .weaviate import 
CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE from .weaviate import weaviate_destination_entry -from .notion.connector import CONNECTOR_TYPE as NOTION_CONNECTOR_TYPE -from .notion.connector import notion_source_entry add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry) diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index 862abd884..705438fd9 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -9,15 +9,15 @@ from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint from notion_client.errors import RequestTimeoutError +from unstructured_ingest.ingest_backoff import RetryHandler +from unstructured_ingest.interfaces import RetryStrategyConfig +from unstructured_ingest.utils.dep_check import requires_dependencies from unstructured_ingest.v2.processes.connectors.notion.types.block import Block from unstructured_ingest.v2.processes.connectors.notion.types.database import Database from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import ( map_cells, ) from unstructured_ingest.v2.processes.connectors.notion.types.page import Page -from unstructured_ingest.ingest_backoff import RetryHandler -from unstructured_ingest.interfaces import RetryStrategyConfig -from unstructured_ingest.utils.dep_check import requires_dependencies @requires_dependencies(["httpx"], extras="notion") diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py index 95c548969..d98d12cc9 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py @@ -1,7 +1,6 @@ from 
typing import Dict from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase - from .checkbox import Checkbox, CheckboxCell from .created_by import CreatedBy, CreatedByCell from .created_time import CreatedTime, CreatedTimeCell From 98e6398658644ebba31b523ebc3f931760f97b51 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 18:35:06 +0100 Subject: [PATCH 10/48] few ruff fixes --- .../v2/processes/connectors/notion/connector.py | 6 ++---- .../connectors/notion/types/database_properties/__init__.py | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index f0185c321..4f662492f 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -20,12 +20,10 @@ from unstructured_ingest.v2.logger import logger from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry from unstructured_ingest.v2.processes.connectors.notion.client import Client as NotionClient -from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html -from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_page_html from unstructured_ingest.v2.processes.connectors.notion.helpers import ( + extract_database_html, + extract_page_html, get_recursive_content_from_database, -) -from unstructured_ingest.v2.processes.connectors.notion.helpers import ( get_recursive_content_from_page, ) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py index d98d12cc9..95c548969 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +++ 
b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py @@ -1,6 +1,7 @@ from typing import Dict from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase + from .checkbox import Checkbox, CheckboxCell from .created_by import CreatedBy, CreatedByCell from .created_time import CreatedTime, CreatedTimeCell From 7269c66dae153181d8fc2ed08d6ff4b95196ed83 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 18:36:43 +0100 Subject: [PATCH 11/48] last ruff fix --- unstructured_ingest/v2/processes/connectors/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unstructured_ingest/v2/processes/connectors/__init__.py b/unstructured_ingest/v2/processes/connectors/__init__.py index 4b347518d..25a16f17f 100644 --- a/unstructured_ingest/v2/processes/connectors/__init__.py +++ b/unstructured_ingest/v2/processes/connectors/__init__.py @@ -5,6 +5,7 @@ add_destination_entry, add_source_entry, ) + from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE from .airtable import airtable_source_entry from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE From 8b844e0226e30776d231d4e4ee130e9d1bc846e3 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 10 Oct 2024 18:42:37 +0100 Subject: [PATCH 12/48] version updated --- unstructured_ingest/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index d406886ca..9cab04ae9 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.0.24" # pragma: no cover +__version__ = "0.0.25" # pragma: no cover From 698d715138c5d41c50abf2a41b9e94b28333d0c0 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 11 Oct 2024 15:31:50 +0100 Subject: [PATCH 13/48] added notion-client to base.in --- requirements/common/base.in | 1 + 1 file changed, 1 insertion(+) diff --git 
a/requirements/common/base.in b/requirements/common/base.in index 90f7f8b5c..2fd31d3ff 100644 --- a/requirements/common/base.in +++ b/requirements/common/base.in @@ -8,3 +8,4 @@ dataclasses_json tqdm click opentelemetry-sdk +notion-client From 16885f39b027b739fffbf4a9d98358e06b86365c Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 11 Oct 2024 16:10:09 +0100 Subject: [PATCH 14/48] remove unused error --- unstructured_ingest/v2/processes/connectors/notion/client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index 705438fd9..db0655c2c 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -1,6 +1,5 @@ from typing import Any, Generator, List, Optional, Tuple -import notion_client.errors from notion_client import Client as NotionClient from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint From 70ee2a504dc4e7a6da6a641f03797a5640aa4dc6 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 11 Oct 2024 16:37:37 +0100 Subject: [PATCH 15/48] changed reference --- unstructured_ingest/v2/processes/connectors/notion/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index db0655c2c..41a47e260 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -6,7 +6,7 @@ from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint from notion_client.api_endpoints import Endpoint from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint -from notion_client.errors 
import RequestTimeoutError +from notion_client.errors import RequestTimeoutError, HTTPResponseError from unstructured_ingest.ingest_backoff import RetryHandler from unstructured_ingest.interfaces import RetryStrategyConfig @@ -29,7 +29,7 @@ def _get_retry_strategy( retryable_exceptions = ( httpx.TimeoutException, httpx.HTTPStatusError, - notion_client.errors.HTTPResponseError, + HTTPResponseError, ) return RetryHandler( From 4bcde5672de68fa176dd8ae810ff47de2d6a77b8 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 11 Oct 2024 16:44:28 +0100 Subject: [PATCH 16/48] ruff fix --- unstructured_ingest/v2/processes/connectors/notion/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index 41a47e260..099488308 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -6,7 +6,7 @@ from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint from notion_client.api_endpoints import Endpoint from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint -from notion_client.errors import RequestTimeoutError, HTTPResponseError +from notion_client.errors import HTTPResponseError, RequestTimeoutError from unstructured_ingest.ingest_backoff import RetryHandler from unstructured_ingest.interfaces import RetryStrategyConfig From 4be07b0ad961cd5b5d0767fc87e16e03377fb96f Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Tue, 15 Oct 2024 22:24:34 +0100 Subject: [PATCH 17/48] Fixed and saving files now --- .../v2/processes/connectors/notion/client.py | 2 ++ .../v2/processes/connectors/notion/connector.py | 16 ++++++++++++---- .../processes/connectors/notion/types/block.py | 5 ++--- .../connectors/notion/types/database.py | 1 + 4 files changed, 17 insertions(+), 7 deletions(-) diff --git 
a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index 099488308..ecdf942a6 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -160,6 +160,8 @@ def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page] yield pages next_cursor = response.get("next_cursor") + print("""here >>>> """) + print(next_cursor) if not response.get("has_more") or not next_cursor: return diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 4f662492f..c65bb161a 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from time import time from typing import Any, Generator, List, Optional, Set, Tuple +from uuid import UUID from pydantic import Field, SecretStr @@ -47,6 +48,13 @@ class NotionIndexerConfig(IndexerConfig): default=False, description="Recursively process child pages and databases" ) + def __post_init__(self): + if self.page_ids: + self.page_ids = [str(UUID(p.strip())) for p in self.page_ids] + + if self.database_ids: + self.database_ids = [str(UUID(d.strip())) for d in self.database_ids] + @dataclass class NotionIndexer(Indexer): @@ -131,8 +139,8 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: def get_page_file_data(self, page_id: str, client: "NotionClient") -> Optional[FileData]: try: page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore - date_created = page_metadata.get("created_time") - date_modified = page_metadata.get("last_edited_time") + date_created = page_metadata.created_time + date_modified = page_metadata.last_edited_time identifier = page_id source_identifiers = SourceIdentifiers( filename=f"{page_id}.html", @@ 
-163,8 +171,8 @@ def get_database_file_data( ) -> Optional[FileData]: try: database_metadata = client.databases.retrieve(database_id=database_id) # type: ignore - date_created = database_metadata.get("created_time") - date_modified = database_metadata.get("last_edited_time") + date_created = database_metadata.created_time + date_modified = database_metadata.last_edited_time identifier = database_id source_identifiers = SourceIdentifiers( filename=f"{database_id}.html", diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/block.py b/unstructured_ingest/v2/processes/connectors/notion/types/block.py index b9d5f0379..7eb1c48b2 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/block.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/block.py @@ -57,13 +57,14 @@ class Block(FromJSONMixin, GetHTMLMixin): created_by: PartialUser last_edited_time: str last_edited_by: PartialUser + # not_in_trash: bool archived: bool + in_trash: bool has_children: bool parent: Parent block: BlockBase object: str = "block" request_id: Optional[str] = None - # in_trash: bool def __repr__(self): return f"{self.__class__.__name__}(id={self.id}, type={self.type})" @@ -75,8 +76,6 @@ def from_dict(cls, data: dict): created_by = data.pop("created_by") last_edited_by = data.pop("last_edited_by") parent = data.pop("parent") - if "in_trash" in data: - data.pop("in_trash") try: block = cls( created_by=PartialUser.from_dict(created_by), diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database.py b/unstructured_ingest/v2/processes/connectors/notion/types/database.py index df4bea36c..055571064 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/database.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database.py @@ -26,6 +26,7 @@ class Database(FromJSONMixin, GetHTMLMixin): last_edited_time: str last_edited_by: PartialUser archived: bool + in_trash: bool parent: Parent url: str is_inline: bool From 
9a4147e1fee4252dfdbaf0fa4b45ec872a5855bd Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Tue, 15 Oct 2024 22:31:19 +0100 Subject: [PATCH 18/48] addressed --- requirements/common/base.in | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/common/base.in b/requirements/common/base.in index 2fd31d3ff..90f7f8b5c 100644 --- a/requirements/common/base.in +++ b/requirements/common/base.in @@ -8,4 +8,3 @@ dataclasses_json tqdm click opentelemetry-sdk -notion-client From 7c5a5d9f768702894e5921448d96a7a29c3dd9fa Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Wed, 16 Oct 2024 13:06:23 +0100 Subject: [PATCH 19/48] Roman Access Config request addressed --- .../v2/processes/connectors/notion/connector.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index c65bb161a..fecb72234 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -3,12 +3,13 @@ from typing import Any, Generator, List, Optional, Set, Tuple from uuid import UUID -from pydantic import Field, SecretStr +from pydantic import Field, Secret from unstructured_ingest.error import SourceConnectionError from unstructured_ingest.utils.dep_check import requires_dependencies from unstructured_ingest.v2.interfaces import ( AccessConfig, + ConnectionConfig, Downloader, DownloaderConfig, DownloadResponse, @@ -32,8 +33,12 @@ CONNECTOR_TYPE = "notion" -class NotionConnectionConfig(AccessConfig): - notion_api_key: SecretStr = Field(description="Notion API key") +class NotionAccessConfig(AccessConfig): + notion_api_key: str = Field(description="Notion API key") + + +class NotionConnectionConfig(ConnectionConfig): + access_config: Secret[NotionAccessConfig] class NotionIndexerConfig(IndexerConfig): From 4ffe9c6f75ec6ab00137ef3af67d01f4d0747dbd Mon Sep 17 
00:00:00 2001 From: mr-unstructured Date: Wed, 16 Oct 2024 13:09:35 +0100 Subject: [PATCH 20/48] Library type_check done --- .../v2/processes/connectors/notion/connector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index fecb72234..eabe3970c 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -1,3 +1,4 @@ +import typing as t from dataclasses import dataclass from time import time from typing import Any, Generator, List, Optional, Set, Tuple @@ -21,13 +22,15 @@ ) from unstructured_ingest.v2.logger import logger from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry -from unstructured_ingest.v2.processes.connectors.notion.client import Client as NotionClient from unstructured_ingest.v2.processes.connectors.notion.helpers import ( extract_database_html, extract_page_html, get_recursive_content_from_database, get_recursive_content_from_page, ) +if t.TYPE_CHECKING: + from unstructured_ingest.v2.processes.connectors.notion.client import Client as NotionClient + NOTION_API_VERSION = "2022-06-28" CONNECTOR_TYPE = "notion" From 1fc45a443a3fa67cf887488a7da748f185c7573a Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Wed, 16 Oct 2024 13:10:16 +0100 Subject: [PATCH 21/48] black fix --- unstructured_ingest/v2/processes/connectors/notion/connector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index eabe3970c..2347dc9ec 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -28,6 +28,7 @@ get_recursive_content_from_database, get_recursive_content_from_page, ) + if 
t.TYPE_CHECKING: from unstructured_ingest.v2.processes.connectors.notion.client import Client as NotionClient From 04f518fdd5994937e3c65b276f3746ce8a168a90 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Wed, 16 Oct 2024 13:18:39 +0100 Subject: [PATCH 22/48] version file matching --- unstructured_ingest/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index caeae7b82..910600223 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.0.26-dev3" # pragma: no cover +__version__ = "0.0.27" # pragma: no cover From d43985e663e37ef12b8b1250fdd23e3b78f6f3dc Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Wed, 16 Oct 2024 13:29:57 +0100 Subject: [PATCH 23/48] More libraries that needed to be capsulated --- .../v2/processes/connectors/notion/connector.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 2347dc9ec..6d1c25db2 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -22,12 +22,6 @@ ) from unstructured_ingest.v2.logger import logger from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry -from unstructured_ingest.v2.processes.connectors.notion.helpers import ( - extract_database_html, - extract_page_html, - get_recursive_content_from_database, - get_recursive_content_from_page, -) if t.TYPE_CHECKING: from unstructured_ingest.v2.processes.connectors.notion.client import Client as NotionClient @@ -213,6 +207,10 @@ def get_child_pages_and_databases( processed_pages: Set[str], processed_databases: Set[str], ) -> Tuple[Set[str], Set[str]]: + from unstructured_ingest.v2.processes.connectors.notion.helpers 
import ( + get_recursive_content_from_page, + ) + child_content = get_recursive_content_from_page( client=client, page_id=page_id, @@ -229,6 +227,10 @@ def get_child_pages_and_databases_from_database( processed_pages: Set[str], processed_databases: Set[str], ) -> Tuple[Set[str], Set[str]]: + from unstructured_ingest.v2.processes.connectors.notion.helpers import ( + get_recursive_content_from_database, + ) + child_content = get_recursive_content_from_database( client=client, database_id=database_id, @@ -281,6 +283,7 @@ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: def download_page( self, client: "NotionClient", page_id: str, file_data: FileData ) -> DownloadResponse: + from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_page_html try: text_extraction = extract_page_html( @@ -306,6 +309,8 @@ def download_page( def download_database( self, client: "NotionClient", database_id: str, file_data: FileData ) -> DownloadResponse: + from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html + try: text_extraction = extract_database_html( client=client, From 622c05bb26af341d472e305116ba9e3142ae9d8e Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Thu, 17 Oct 2024 17:34:29 +0100 Subject: [PATCH 24/48] Remove leftover comment --- .../v2/processes/connectors/notion/types/block.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/block.py b/unstructured_ingest/v2/processes/connectors/notion/types/block.py index 7eb1c48b2..66168a09d 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/block.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/block.py @@ -57,7 +57,6 @@ class Block(FromJSONMixin, GetHTMLMixin): created_by: PartialUser last_edited_time: str last_edited_by: PartialUser - # not_in_trash: bool archived: bool in_trash: bool has_children: bool From 3cbaa50dca20fd905c1eb87cf15568cea7890c91 Mon Sep 
17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 15:36:30 +0100 Subject: [PATCH 25/48] Multiple PR changes assigned --- .../processes/connectors/notion/connector.py | 78 ++++++++----------- 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 6d1c25db2..d1d72ada4 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -1,10 +1,8 @@ -import typing as t from dataclasses import dataclass from time import time -from typing import Any, Generator, List, Optional, Set, Tuple -from uuid import UUID +from typing import Any, Generator, Optional -from pydantic import Field, Secret +from pydantic import UUID4, Field, Secret from unstructured_ingest.error import SourceConnectionError from unstructured_ingest.utils.dep_check import requires_dependencies @@ -23,10 +21,6 @@ from unstructured_ingest.v2.logger import logger from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry -if t.TYPE_CHECKING: - from unstructured_ingest.v2.processes.connectors.notion.client import Client as NotionClient - - NOTION_API_VERSION = "2022-06-28" CONNECTOR_TYPE = "notion" @@ -38,13 +32,23 @@ class NotionAccessConfig(AccessConfig): class NotionConnectionConfig(ConnectionConfig): access_config: Secret[NotionAccessConfig] + def get_client(self) -> "Client": + from unstructured_ingest.v2.processes.connectors.notion.client import Client + + return Client( + notion_version=NOTION_API_VERSION, + auth=self.connection_config.notion_api_key.get_secret_value(), + logger=logger, + log_level=logger.level, + ) + class NotionIndexerConfig(IndexerConfig): - page_ids: Optional[List[str]] = Field( + page_ids: Optional[list[str]] = Field( default=None, description="List of Notion page IDs to process" ) - database_ids: Optional[List[str]] = 
Field( + database_ids: Optional[list[str]] = Field( default=None, description="List of Notion database IDs to process" ) recursive: bool = Field( @@ -53,10 +57,10 @@ class NotionIndexerConfig(IndexerConfig): def __post_init__(self): if self.page_ids: - self.page_ids = [str(UUID(p.strip())) for p in self.page_ids] + self.page_ids: list[UUID4] = [UUID4(p.strip()) for p in self.page_ids] if self.database_ids: - self.database_ids = [str(UUID(d.strip())) for d in self.database_ids] + self.database_ids: list[UUID4] = [UUID4(p.strip()) for p in self.database_ids] @dataclass @@ -64,19 +68,10 @@ class NotionIndexer(Indexer): connection_config: NotionConnectionConfig index_config: NotionIndexerConfig - @requires_dependencies(["notion_client"], extras="notion") - def get_client(self) -> "NotionClient": - return NotionClient( - notion_version=NOTION_API_VERSION, - auth=self.connection_config.notion_api_key.get_secret_value(), - logger=logger, - log_level=logger.level, - ) - def precheck(self) -> None: """Check the connection to the Notion API.""" try: - client = self.get_client() + client = self.connection_config.get_client() # Perform a simple request to verify connection request = client._build_request("HEAD", "users") response = client.client.send(request) @@ -87,12 +82,12 @@ def precheck(self) -> None: raise SourceConnectionError(f"Failed to validate connection: {e}") def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - client = self.get_client() - processed_pages: Set[str] = set() - processed_databases: Set[str] = set() + client = self.connection_config.get_client() + processed_pages: set[str] = set() + processed_databases: set[str] = set() - pages_to_process: Set[str] = set(self.index_config.page_ids or []) - databases_to_process: Set[str] = set(self.index_config.database_ids or []) + pages_to_process: set[str] = set(self.index_config.page_ids or []) + databases_to_process: set[str] = set(self.index_config.database_ids or []) while pages_to_process or 
databases_to_process: # Process pages @@ -204,9 +199,9 @@ def get_child_pages_and_databases( self, page_id: str, client: "NotionClient", - processed_pages: Set[str], - processed_databases: Set[str], - ) -> Tuple[Set[str], Set[str]]: + processed_pages: set[str], + processed_databases: set[str], + ) -> tuple[set[str], set[str]]: from unstructured_ingest.v2.processes.connectors.notion.helpers import ( get_recursive_content_from_page, ) @@ -223,10 +218,10 @@ def get_child_pages_and_databases( def get_child_pages_and_databases_from_database( self, database_id: str, - client: "NotionClient", - processed_pages: Set[str], - processed_databases: Set[str], - ) -> Tuple[Set[str], Set[str]]: + client: "connection_config.get_client()", + processed_pages: set[str], + processed_databases: set[str], + ) -> tuple[set[str], set[str]]: from unstructured_ingest.v2.processes.connectors.notion.helpers import ( get_recursive_content_from_database, ) @@ -252,17 +247,8 @@ class NotionDownloader(Downloader): download_config: NotionDownloaderConfig connector_type: str = CONNECTOR_TYPE - @requires_dependencies(["notion_client"], extras="notion") - def get_client(self) -> "NotionClient": - return NotionClient( - notion_version=NOTION_API_VERSION, - auth=self.connection_config.notion_api_key.get_secret_value(), - logger=logger, - log_level=logger.level, - ) - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - client = self.get_client() + client = self.connection_config.get_client() record_locator = file_data.metadata.record_locator if "page_id" in record_locator: @@ -281,7 +267,7 @@ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: raise ValueError("Invalid record_locator in file_data") def download_page( - self, client: "NotionClient", page_id: str, file_data: FileData + self, client: "connection_config.get_client()", page_id: str, file_data: FileData ) -> DownloadResponse: from unstructured_ingest.v2.processes.connectors.notion.helpers import 
extract_page_html @@ -307,7 +293,7 @@ def download_page( return None def download_database( - self, client: "NotionClient", database_id: str, file_data: FileData + self, client: "connection_config.get_client()", database_id: str, file_data: FileData ) -> DownloadResponse: from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html From 7820f09e41b8676ba5ed0c248f71f84f4bcc13ba Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 15:54:04 +0100 Subject: [PATCH 26/48] fixes --- .../v2/processes/connectors/notion/connector.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index d1d72ada4..84d0d8083 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -32,7 +32,7 @@ class NotionAccessConfig(AccessConfig): class NotionConnectionConfig(ConnectionConfig): access_config: Secret[NotionAccessConfig] - def get_client(self) -> "Client": + def get_client(self): from unstructured_ingest.v2.processes.connectors.notion.client import Client return Client( @@ -134,7 +134,9 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: databases_to_process.update(child_databases) @requires_dependencies(["notion_client"], extras="notion") - def get_page_file_data(self, page_id: str, client: "NotionClient") -> Optional[FileData]: + def get_page_file_data( + self, page_id: str, client: "connection_config.get_client()" + ) -> Optional[FileData]: try: page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore date_created = page_metadata.created_time @@ -165,7 +167,7 @@ def get_page_file_data(self, page_id: str, client: "NotionClient") -> Optional[F @requires_dependencies(["notion_client"], extras="notion") def get_database_file_data( - self, database_id: str, client: 
"NotionClient" + self, database_id: str, client: "connection_config.get_client()" ) -> Optional[FileData]: try: database_metadata = client.databases.retrieve(database_id=database_id) # type: ignore @@ -198,7 +200,7 @@ def get_database_file_data( def get_child_pages_and_databases( self, page_id: str, - client: "NotionClient", + client: "connection_config.get_client()", processed_pages: set[str], processed_databases: set[str], ) -> tuple[set[str], set[str]]: From c0c7efe9f25a266e3b91f7402352c9f21bc9898f Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 16:12:41 +0100 Subject: [PATCH 27/48] tries --- .../processes/connectors/notion/connector.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 84d0d8083..246df2899 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -67,6 +67,7 @@ def __post_init__(self): class NotionIndexer(Indexer): connection_config: NotionConnectionConfig index_config: NotionIndexerConfig + from unstructured_ingest.v2.processes.connectors.notion.client import Client def precheck(self) -> None: """Check the connection to the Notion API.""" @@ -134,9 +135,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: databases_to_process.update(child_databases) @requires_dependencies(["notion_client"], extras="notion") - def get_page_file_data( - self, page_id: str, client: "connection_config.get_client()" - ) -> Optional[FileData]: + def get_page_file_data(self, page_id: str, client: Client) -> Optional[FileData]: try: page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore date_created = page_metadata.created_time @@ -165,10 +164,8 @@ def get_page_file_data( logger.error(f"Error retrieving page {page_id}: {e}") return None - 
@requires_dependencies(["notion_client"], extras="notion") - def get_database_file_data( - self, database_id: str, client: "connection_config.get_client()" - ) -> Optional[FileData]: + @requires_dependencies(["Client"], extras="notion") + def get_database_file_data(self, database_id: str, client: Client) -> Optional[FileData]: try: database_metadata = client.databases.retrieve(database_id=database_id) # type: ignore date_created = database_metadata.created_time @@ -238,7 +235,6 @@ def get_child_pages_and_databases_from_database( return child_pages, child_databases -# @dataclass class NotionDownloaderConfig(DownloaderConfig): pass @@ -248,6 +244,7 @@ class NotionDownloader(Downloader): connection_config: NotionConnectionConfig download_config: NotionDownloaderConfig connector_type: str = CONNECTOR_TYPE + from unstructured_ingest.v2.processes.connectors.notion.client import Client def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: client = self.connection_config.get_client() @@ -268,9 +265,7 @@ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: else: raise ValueError("Invalid record_locator in file_data") - def download_page( - self, client: "connection_config.get_client()", page_id: str, file_data: FileData - ) -> DownloadResponse: + def download_page(self, client, page_id: str, file_data: FileData) -> DownloadResponse: from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_page_html try: @@ -295,7 +290,7 @@ def download_page( return None def download_database( - self, client: "connection_config.get_client()", database_id: str, file_data: FileData + self, client: Client, database_id: str, file_data: FileData ) -> DownloadResponse: from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html From f58db9e13491cfac7075a29ab3b1c24a7596914f Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 16:14:50 +0100 Subject: [PATCH 28/48] More Client --- 
.../v2/processes/connectors/notion/connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 246df2899..3c9b2a822 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -197,7 +197,7 @@ def get_database_file_data(self, database_id: str, client: Client) -> Optional[F def get_child_pages_and_databases( self, page_id: str, - client: "connection_config.get_client()", + client: Client, processed_pages: set[str], processed_databases: set[str], ) -> tuple[set[str], set[str]]: @@ -217,7 +217,7 @@ def get_child_pages_and_databases( def get_child_pages_and_databases_from_database( self, database_id: str, - client: "connection_config.get_client()", + client: Client, processed_pages: set[str], processed_databases: set[str], ) -> tuple[set[str], set[str]]: From d5d33394a1948c6dd7572d6f11f75f8c0172c793 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 16:24:11 +0100 Subject: [PATCH 29/48] most done --- .../processes/connectors/notion/connector.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 3c9b2a822..917c016c3 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -32,16 +32,6 @@ class NotionAccessConfig(AccessConfig): class NotionConnectionConfig(ConnectionConfig): access_config: Secret[NotionAccessConfig] - def get_client(self): - from unstructured_ingest.v2.processes.connectors.notion.client import Client - - return Client( - notion_version=NOTION_API_VERSION, - auth=self.connection_config.notion_api_key.get_secret_value(), - 
logger=logger, - log_level=logger.level, - ) - class NotionIndexerConfig(IndexerConfig): page_ids: Optional[list[str]] = Field( @@ -67,12 +57,22 @@ def __post_init__(self): class NotionIndexer(Indexer): connection_config: NotionConnectionConfig index_config: NotionIndexerConfig - from unstructured_ingest.v2.processes.connectors.notion.client import Client + + @requires_dependencies(["notion_client"], extras="notion") + def get_client(self): + from unstructured_ingest.v2.processes.connectors.notion.client import Client + + return Client( + notion_version=NOTION_API_VERSION, + auth=self.connection_config.notion_api_key.get_secret_value(), + logger=logger, + log_level=logger.level, + ) def precheck(self) -> None: """Check the connection to the Notion API.""" try: - client = self.connection_config.get_client() + client = self.get_client() # Perform a simple request to verify connection request = client._build_request("HEAD", "users") response = client.client.send(request) @@ -83,7 +83,7 @@ def precheck(self) -> None: raise SourceConnectionError(f"Failed to validate connection: {e}") def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - client = self.connection_config.get_client() + client = self.get_client() processed_pages: set[str] = set() processed_databases: set[str] = set() @@ -135,7 +135,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: databases_to_process.update(child_databases) @requires_dependencies(["notion_client"], extras="notion") - def get_page_file_data(self, page_id: str, client: Client) -> Optional[FileData]: + def get_page_file_data(self, page_id: str, client: get_client) -> Optional[FileData]: try: page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore date_created = page_metadata.created_time @@ -165,7 +165,7 @@ def get_page_file_data(self, page_id: str, client: Client) -> Optional[FileData] return None @requires_dependencies(["Client"], extras="notion") - def get_database_file_data(self, database_id: 
str, client: Client) -> Optional[FileData]: + def get_database_file_data(self, database_id: str, client: get_client) -> Optional[FileData]: try: database_metadata = client.databases.retrieve(database_id=database_id) # type: ignore date_created = database_metadata.created_time @@ -197,7 +197,7 @@ def get_database_file_data(self, database_id: str, client: Client) -> Optional[F def get_child_pages_and_databases( self, page_id: str, - client: Client, + client: get_client, processed_pages: set[str], processed_databases: set[str], ) -> tuple[set[str], set[str]]: @@ -217,7 +217,7 @@ def get_child_pages_and_databases( def get_child_pages_and_databases_from_database( self, database_id: str, - client: Client, + client: get_client, processed_pages: set[str], processed_databases: set[str], ) -> tuple[set[str], set[str]]: From 0e000b1531d09c6b70c42b5c4a1e05112fc93f04 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 16:34:27 +0100 Subject: [PATCH 30/48] missed this --- unstructured_ingest/v2/processes/connectors/notion/connector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 917c016c3..6b212d94c 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -244,7 +244,6 @@ class NotionDownloader(Downloader): connection_config: NotionConnectionConfig download_config: NotionDownloaderConfig connector_type: str = CONNECTOR_TYPE - from unstructured_ingest.v2.processes.connectors.notion.client import Client def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: client = self.connection_config.get_client() From 3cc4086b4c088470a7fb9bb48150f6cf60a266a3 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 16:36:41 +0100 Subject: [PATCH 31/48] trying --- 
unstructured_ingest/v2/processes/connectors/notion/connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 6b212d94c..2c73c9616 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -289,7 +289,7 @@ def download_page(self, client, page_id: str, file_data: FileData) -> DownloadRe return None def download_database( - self, client: Client, database_id: str, file_data: FileData + self, client, database_id: str, file_data: FileData ) -> DownloadResponse: from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html From f82c0b0231169f5e8040e75b910bcbf4ded86bba Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 16:38:31 +0100 Subject: [PATCH 32/48] black --- .../v2/processes/connectors/notion/connector.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 2c73c9616..73a63a06c 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -288,9 +288,7 @@ def download_page(self, client, page_id: str, file_data: FileData) -> DownloadRe logger.error(f"Error downloading page {page_id}: {e}") return None - def download_database( - self, client, database_id: str, file_data: FileData - ) -> DownloadResponse: + def download_database(self, client, database_id: str, file_data: FileData) -> DownloadResponse: from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html try: From 9f02b745f6f81929908c0638fe0cf549bb7223dd Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 20:34:12 +0100 Subject: 
[PATCH 33/48] version change --- CHANGELOG.md | 2 +- unstructured_ingest/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 511236c64..857930668 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.1.0-dev6 +## 0.1.0-dev0 ### Enhancements diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index bb7c917f4..b1d803e05 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.1.0-dev6" # pragma: no cover +__version__ = "0.1.0-dev0" # pragma: no cover From e4d81182c715dfa4c1f64042c7b24d7426eb3b96 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Fri, 18 Oct 2024 22:10:12 +0100 Subject: [PATCH 34/48] async client --- .../v2/processes/connectors/notion/client.py | 285 +++++------------- .../processes/connectors/notion/connector.py | 2 +- 2 files changed, 73 insertions(+), 214 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index ecdf942a6..40272c010 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -1,249 +1,108 @@ -from typing import Any, Generator, List, Optional, Tuple +from typing import Any, Generator +import httpx from notion_client import Client as NotionClient from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint -from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint -from notion_client.api_endpoints import Endpoint -from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint from notion_client.errors import HTTPResponseError, RequestTimeoutError -from unstructured_ingest.ingest_backoff import RetryHandler -from 
unstructured_ingest.interfaces import RetryStrategyConfig -from unstructured_ingest.utils.dep_check import requires_dependencies from unstructured_ingest.v2.processes.connectors.notion.types.block import Block from unstructured_ingest.v2.processes.connectors.notion.types.database import Database -from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import ( - map_cells, -) +from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import map_cells from unstructured_ingest.v2.processes.connectors.notion.types.page import Page -@requires_dependencies(["httpx"], extras="notion") -def _get_retry_strategy( - endpoint: Endpoint, retry_strategy_config: RetryStrategyConfig -) -> RetryHandler: - import backoff - import httpx - - retryable_exceptions = ( - httpx.TimeoutException, - httpx.HTTPStatusError, - HTTPResponseError, - ) - - return RetryHandler( - backoff.expo, - retryable_exceptions, - max_time=retry_strategy_config.max_retry_time, - max_tries=retry_strategy_config.max_retries, - logger=endpoint.parent.logger, - start_log_level=endpoint.parent.logger.level, - backoff_log_level=endpoint.parent.logger.level, - ) - - -def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]: - if retry_strategy_config := getattr(endpoint, "retry_strategy_config"): - return _get_retry_strategy(endpoint=endpoint, retry_strategy_config=retry_strategy_config) - return None - - -class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint): - def __init__( - self, - *args, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs, - ): +class AsyncBlocksChildrenEndpoint(NotionBlocksChildrenEndpoint): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.retry_strategy_config = retry_strategy_config - - @property - def retry_handler(self) -> Optional[RetryHandler]: - return get_retry_handler(self) - - def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]: - resp: dict = ( 
- self.retry_handler(super().list, block_id=block_id, **kwargs) - if self.retry_handler - else super().list(block_id=block_id, **kwargs) - ) # type: ignore + self._http_client = httpx.AsyncClient() + + async def list(self, block_id: str, **kwargs: Any) -> tuple[list[Block], dict]: + """Fetch the list of child blocks asynchronously.""" + try: + response = await self._http_client.get( + f"{self.parent._api_base}/blocks/{block_id}/children", **kwargs + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + raise HTTPResponseError(f"Failed to list blocks: {str(e)}") + except httpx.TimeoutException: + raise RequestTimeoutError() + + resp = response.json() child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])] return child_blocks, resp - def iterate_list( - self, - block_id: str, - **kwargs: Any, - ) -> Generator[List[Block], None, None]: + async def iterate_list( + self, block_id: str, **kwargs: Any + ) -> Generator[list[Block], None, None]: + """Fetch the list of child blocks in pages asynchronously.""" + next_cursor = None while True: - response: dict = ( - self.retry_handler(super().list, block_id=block_id, **kwargs) - if self.retry_handler - else super().list(block_id=block_id, **kwargs) - ) # type: ignore - child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])] + params = {"start_cursor": next_cursor} if next_cursor else {} + params.update(kwargs) + child_blocks, response = await self.list(block_id, **params) yield child_blocks next_cursor = response.get("next_cursor") if not response.get("has_more") or not next_cursor: return + async def close(self): + """Close the HTTP client.""" + await self._http_client.aclose() + -class DatabasesEndpoint(NotionDatabasesEndpoint): - def __init__( - self, - *args, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs, - ): +class AsyncDatabasesEndpoint(NotionDatabasesEndpoint): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - 
self.retry_strategy_config = retry_strategy_config - - @property - def retry_handler(self) -> Optional[RetryHandler]: - return get_retry_handler(self) - - def retrieve(self, database_id: str, **kwargs: Any) -> Database: - resp: dict = ( - self.retry_handler(super().retrieve, database_id=database_id, **kwargs) - if (self.retry_handler) - else (super().retrieve(database_id=database_id, **kwargs)) - ) # type: ignore - return Database.from_dict(data=resp) - - @requires_dependencies(["httpx"], extras="notion") - def retrieve_status(self, database_id: str, **kwargs) -> int: - import httpx - - request = self.parent._build_request( - method="HEAD", - path=f"databases/{database_id}", - auth=kwargs.get("auth"), - ) + self._http_client = httpx.AsyncClient() + + async def retrieve(self, database_id: str, **kwargs: Any) -> Database: + """Fetch a database by its ID asynchronously.""" + try: + response = await self._http_client.get( + f"{self.parent._api_base}/databases/{database_id}", **kwargs + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + raise HTTPResponseError(f"Failed to retrieve database: {str(e)}") + except httpx.TimeoutException: + raise RequestTimeoutError() + + return Database.from_dict(data=response.json()) + + async def query(self, database_id: str, **kwargs: Any) -> tuple[list[Page], dict]: + """Query a database asynchronously.""" try: - response: httpx.Response = ( - self.retry_handler(self.parent.client.send, request) - if (self.retry_handler) - else (self.parent.client.send(request)) - ) # type: ignore - return response.status_code + response = await self._http_client.post( + f"{self.parent._api_base}/databases/{database_id}/query", + json=kwargs.get("json", {}), + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + raise HTTPResponseError(f"Failed to query database: {str(e)}") except httpx.TimeoutException: raise RequestTimeoutError() - def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]: - 
"""Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database. - - *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)* - """ # noqa: E501 - resp: dict = ( - self.retry_handler(super().query, database_id=database_id, **kwargs) - if (self.retry_handler) - else (super().query(database_id=database_id, **kwargs)) - ) # type: ignore - pages = [Page.from_dict(data=p) for p in resp.pop("results")] + resp = response.json() + pages = [Page.from_dict(data=p) for p in resp.pop("results", [])] for p in pages: p.properties = map_cells(p.properties) return pages, resp - def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]: - while True: - response: dict = ( - self.retry_handler(super().query, database_id=database_id, **kwargs) - if (self.retry_handler) - else (super().query(database_id=database_id, **kwargs)) - ) # type: ignore - pages = [Page.from_dict(data=p) for p in response.pop("results", [])] - for p in pages: - p.properties = map_cells(p.properties) - yield pages - - next_cursor = response.get("next_cursor") - print("""here >>>> """) - print(next_cursor) - if not response.get("has_more") or not next_cursor: - return + async def close(self): + """Close the HTTP client.""" + await self._http_client.aclose() -class BlocksEndpoint(NotionBlocksEndpoint): - def __init__( - self, - *args: Any, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs: Any, - ) -> None: - super().__init__(*args, **kwargs) - self.retry_strategy_config = retry_strategy_config - self.children = BlocksChildrenEndpoint( - retry_strategy_config=retry_strategy_config, - *args, - **kwargs, - ) - - @property - def retry_handler(self) -> Optional[RetryHandler]: - return get_retry_handler(self) - - def retrieve(self, block_id: str, **kwargs: Any) -> Block: - resp: dict = ( - self.retry_handler(super().retrieve, block_id=block_id, **kwargs) - if (self.retry_handler) - else 
(super().retrieve(block_id=block_id, **kwargs)) - ) # type: ignore - return Block.from_dict(data=resp) - - -class PagesEndpoint(NotionPagesEndpoint): - def __init__( - self, - *args, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs, - ): +class AsyncClient(NotionClient): + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) - self.retry_strategy_config = retry_strategy_config - - @property - def retry_handler(self) -> Optional[RetryHandler]: - return get_retry_handler(self) - - def retrieve(self, page_id: str, **kwargs: Any) -> Page: - resp: dict = ( - self.retry_handler(super().retrieve, page_id=page_id, **kwargs) - if (self.retry_handler) - else (super().retrieve(page_id=page_id, **kwargs)) - ) # type: ignore - return Page.from_dict(data=resp) - - @requires_dependencies(["httpx"], extras="notion") - def retrieve_status(self, page_id: str, **kwargs) -> int: - import httpx - - request = self.parent._build_request( - method="HEAD", - path=f"pages/{page_id}", - auth=kwargs.get("auth"), - ) - try: - response: httpx.Response = ( - self.retry_handler(self.parent.client.send, request) - if (self.retry_handler) - else (self.parent.client.send(request)) - ) # type: ignore - return response.status_code - except httpx.TimeoutException: - raise RequestTimeoutError() + self.blocks = AsyncBlocksChildrenEndpoint(parent=self) + self.databases = AsyncDatabasesEndpoint(parent=self) - -class Client(NotionClient): - def __init__( - self, - *args: Any, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs: Any, - ) -> None: - super().__init__(*args, **kwargs) - self.blocks = BlocksEndpoint(retry_strategy_config=retry_strategy_config, parent=self) - self.pages = PagesEndpoint(retry_strategy_config=retry_strategy_config, parent=self) - self.databases = DatabasesEndpoint(retry_strategy_config=retry_strategy_config, parent=self) + async def close(self): + """Close all async endpoints.""" + await 
self.blocks.close() + await self.databases.close() diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 73a63a06c..1099e0eae 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -60,7 +60,7 @@ class NotionIndexer(Indexer): @requires_dependencies(["notion_client"], extras="notion") def get_client(self): - from unstructured_ingest.v2.processes.connectors.notion.client import Client + from unstructured_ingest.v2.processes.connectors.notion.client import AsyncClient as Client return Client( notion_version=NOTION_API_VERSION, From 2579acee23fa365fa8d2d5ea1ef7e803f974ce74 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Mon, 21 Oct 2024 14:52:57 +0100 Subject: [PATCH 35/48] connector.py updates --- .../processes/connectors/notion/connector.py | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 1099e0eae..935a1d64c 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -59,7 +59,7 @@ class NotionIndexer(Indexer): index_config: NotionIndexerConfig @requires_dependencies(["notion_client"], extras="notion") - def get_client(self): + async def get_client(self) -> "get_client": from unstructured_ingest.v2.processes.connectors.notion.client import AsyncClient as Client return Client( @@ -69,21 +69,21 @@ def get_client(self): log_level=logger.level, ) - def precheck(self) -> None: + async def precheck(self) -> None: """Check the connection to the Notion API.""" try: - client = self.get_client() + client = await self.get_client() # Perform a simple request to verify connection request = client._build_request("HEAD", "users") 
- response = client.client.send(request) + response = await client.client.send(request) response.raise_for_status() except Exception as e: logger.error(f"Failed to validate connection: {e}", exc_info=True) raise SourceConnectionError(f"Failed to validate connection: {e}") - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - client = self.get_client() + async def run(self, **kwargs: Any) -> Generator[FileData, None, None]: + client = await self.get_client() processed_pages: set[str] = set() processed_databases: set[str] = set() @@ -98,12 +98,12 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: processed_pages.add(page_id) pages_to_process.remove(page_id) - file_data = self.get_page_file_data(page_id=page_id, client=client) + file_data = await self.get_page_file_data(page_id=page_id, client=client) if file_data: yield file_data if self.index_config.recursive: - child_pages, child_databases = self.get_child_pages_and_databases( + (child_pages, child_databases) = await self.get_child_pages_and_databases( page_id=page_id, client=client, processed_pages=processed_pages, @@ -118,14 +118,16 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: continue processed_databases.add(database_id) databases_to_process.remove(database_id) - file_data = self.get_database_file_data(database_id=database_id, client=client) + file_data = await self.get_database_file_data( + database_id=database_id, client=client + ) if file_data: yield file_data if self.index_config.recursive: ( child_pages, child_databases, - ) = self.get_child_pages_and_databases_from_database( + ) = await self.get_child_pages_and_databases_from_database( database_id=database_id, client=client, processed_pages=processed_pages, @@ -135,9 +137,9 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: databases_to_process.update(child_databases) @requires_dependencies(["notion_client"], extras="notion") - def get_page_file_data(self, page_id: str, client: 
get_client) -> Optional[FileData]: + async def get_page_file_data(self, page_id: str, client: "get_client") -> Optional[FileData]: try: - page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore + page_metadata = await client.pages.retrieve(page_id=page_id) # type: ignore date_created = page_metadata.created_time date_modified = page_metadata.last_edited_time identifier = page_id @@ -165,9 +167,11 @@ def get_page_file_data(self, page_id: str, client: get_client) -> Optional[FileD return None @requires_dependencies(["Client"], extras="notion") - def get_database_file_data(self, database_id: str, client: get_client) -> Optional[FileData]: + async def get_database_file_data( + self, database_id: str, client: "get_client" + ) -> Optional[FileData]: try: - database_metadata = client.databases.retrieve(database_id=database_id) # type: ignore + database_metadata = await client.databases.retrieve(database_id=database_id) # type: ignore date_created = database_metadata.created_time date_modified = database_metadata.last_edited_time identifier = database_id @@ -194,10 +198,10 @@ def get_database_file_data(self, database_id: str, client: get_client) -> Option logger.error(f"Error retrieving database {database_id}: {e}") return None - def get_child_pages_and_databases( + async def get_child_pages_and_databases( self, page_id: str, - client: get_client, + client: "get_client", processed_pages: set[str], processed_databases: set[str], ) -> tuple[set[str], set[str]]: @@ -205,7 +209,7 @@ def get_child_pages_and_databases( get_recursive_content_from_page, ) - child_content = get_recursive_content_from_page( + child_content = await get_recursive_content_from_page( client=client, page_id=page_id, logger=logger, @@ -214,7 +218,7 @@ def get_child_pages_and_databases( child_databases = set(child_content.child_databases) - processed_databases return child_pages, child_databases - def get_child_pages_and_databases_from_database( + async def 
get_child_pages_and_databases_from_database( self, database_id: str, client: get_client, @@ -225,7 +229,7 @@ def get_child_pages_and_databases_from_database( get_recursive_content_from_database, ) - child_content = get_recursive_content_from_database( + child_content = await get_recursive_content_from_database( client=client, database_id=database_id, logger=logger, From 2d4a1d7157ecabbd6ba0adb276c7f7d9a2db5902 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Mon, 21 Oct 2024 15:16:35 +0100 Subject: [PATCH 36/48] autopep8 updates --- .../v2/processes/connectors/notion/connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 935a1d64c..58b52517f 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -171,7 +171,8 @@ async def get_database_file_data( self, database_id: str, client: "get_client" ) -> Optional[FileData]: try: - database_metadata = await client.databases.retrieve(database_id=database_id) # type: ignore + # type: ignore + database_metadata = await client.databases.retrieve(database_id=database_id) date_created = database_metadata.created_time date_modified = database_metadata.last_edited_time identifier = database_id From 502aa1aa6f4de2b6d618232894c6bd27f7ff6a95 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Tue, 22 Oct 2024 17:00:09 +0100 Subject: [PATCH 37/48] Roman comments addressed --- .../v2/processes/connectors/notion/connector.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 58b52517f..165694dc0 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ 
b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -58,13 +58,16 @@ class NotionIndexer(Indexer): connection_config: NotionConnectionConfig index_config: NotionIndexerConfig + def is_async(self) -> bool: + return True + @requires_dependencies(["notion_client"], extras="notion") async def get_client(self) -> "get_client": from unstructured_ingest.v2.processes.connectors.notion.client import AsyncClient as Client return Client( notion_version=NOTION_API_VERSION, - auth=self.connection_config.notion_api_key.get_secret_value(), + auth=self.connection_config.notion_api_key.get_secret_value().notion_api_key, logger=logger, log_level=logger.level, ) @@ -82,7 +85,11 @@ async def precheck(self) -> None: logger.error(f"Failed to validate connection: {e}", exc_info=True) raise SourceConnectionError(f"Failed to validate connection: {e}") - async def run(self, **kwargs: Any) -> Generator[FileData, None, None]: + def run(self, file_data: FileData, **kwargs: Any) -> Generator[FileData, None, None]: + # Synchronous run is not implemented + raise NotImplementedError() + + async def run_async(self, **kwargs: Any) -> Generator[FileData, None, None]: client = await self.get_client() processed_pages: set[str] = set() processed_databases: set[str] = set() From d2263eadac6cc75fc20629ae07d6fe9301eb2420 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Tue, 22 Oct 2024 17:12:31 +0100 Subject: [PATCH 38/48] params issue --- unstructured_ingest/v2/processes/connectors/notion/connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 165694dc0..db9e82bf8 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -85,7 +85,7 @@ async def precheck(self) -> None: logger.error(f"Failed to validate connection: {e}", exc_info=True) 
raise SourceConnectionError(f"Failed to validate connection: {e}") - def run(self, file_data: FileData, **kwargs: Any) -> Generator[FileData, None, None]: + def run(self, **kwargs: Any) -> Generator[FileData, None, None]: # Synchronous run is not implemented raise NotImplementedError() From 450aff6299e340df6fed943fcc307a81bd0c7341 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Wed, 23 Oct 2024 14:21:25 +0100 Subject: [PATCH 39/48] stop ignoring Notion --- test_e2e/test-dest.sh | 1 - test_e2e/test-src.sh | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/test_e2e/test-dest.sh b/test_e2e/test-dest.sh index 49cbc2483..a3a2f0213 100755 --- a/test_e2e/test-dest.sh +++ b/test_e2e/test-dest.sh @@ -62,7 +62,6 @@ trap print_last_run EXIT python_version=$(python --version 2>&1) tests_to_ignore=( - 'notion.sh' 'dropbox.sh' 'sharepoint.sh' ) diff --git a/test_e2e/test-src.sh b/test_e2e/test-src.sh index 86c991b4a..f70c438b3 100755 --- a/test_e2e/test-src.sh +++ b/test_e2e/test-src.sh @@ -80,9 +80,7 @@ trap print_last_run EXIT python_version=$(python --version 2>&1) -tests_to_ignore=( - 'notion.sh' -) +tests_to_ignore=() for test in "${all_tests[@]}"; do CURRENT_TEST="$test" From 4b1e612dd79b0ade0b3f5c57eefe52553ccc8365 Mon Sep 17 00:00:00 2001 From: mr-unstructured Date: Wed, 23 Oct 2024 14:48:27 +0100 Subject: [PATCH 40/48] my bad, versions dont match --- unstructured_ingest/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index 66f0b2059..55a08acab 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.1.2-dev1" # pragma: no cover +__version__ = "0.1.2-dev2" # pragma: no cover From 7550c53fa63582cd33800a00652c9fa9fc80bc89 Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Tue, 24 Dec 2024 19:04:58 -0500 Subject: [PATCH 41/48] migrate notion source connector to V2 --- 
.../notion/directory_structure.json | 5 + .../1572c3765a0a806299f0dd6999f9e4c7.html | 143 +++++++++++ .../1572c3765a0a806299f0dd6999f9e4c7.json | 39 +++ test/integration/connectors/test_notion.py | 110 ++++++++ .../v2/processes/connectors/__init__.py | 4 +- .../v2/processes/connectors/notion/client.py | 243 +++++++++++++++++- .../processes/connectors/notion/connector.py | 88 ++++--- 7 files changed, 584 insertions(+), 48 deletions(-) create mode 100644 test/integration/connectors/expected_results/notion/directory_structure.json create mode 100644 test/integration/connectors/expected_results/notion/downloads/1572c3765a0a806299f0dd6999f9e4c7.html create mode 100644 test/integration/connectors/expected_results/notion/file_data/1572c3765a0a806299f0dd6999f9e4c7.json create mode 100644 test/integration/connectors/test_notion.py diff --git a/test/integration/connectors/expected_results/notion/directory_structure.json b/test/integration/connectors/expected_results/notion/directory_structure.json new file mode 100644 index 000000000..f96f3ca8f --- /dev/null +++ b/test/integration/connectors/expected_results/notion/directory_structure.json @@ -0,0 +1,5 @@ +{ + "directory_structure": [ + "1572c3765a0a806299f0dd6999f9e4c7.html" + ] + } \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion/downloads/1572c3765a0a806299f0dd6999f9e4c7.html b/test/integration/connectors/expected_results/notion/downloads/1572c3765a0a806299f0dd6999f9e4c7.html new file mode 100644 index 000000000..b6797d7b0 --- /dev/null +++ b/test/integration/connectors/expected_results/notion/downloads/1572c3765a0a806299f0dd6999f9e4c7.html @@ -0,0 +1,143 @@ + + + + test-doc1 + + + +

+ test-doc1 +

+
+
+ testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 +
+
+ testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 +
+ +
    +
  1. + Testdoc2 List Item 1 +
  2. +
      +
    1. + Testdoc2 List Item 1 Nested Item A +
    2. +
    3. + Testdoc2 List Item 1 Nested Item B +
    4. +
    +
  3. + Testdoc2 List Item 2 +
  4. +
  5. + Testdoc2 List Item 3 +
  6. +
+
+
+ + +
+
+ Testdoc2 Checklist Item 1 +
+
+ + +
+
+ Testdoc2 Checklist Item 2 (checked) +
+
+ +
+ + +
+ + Testdoc2 bold text + +
+
+ + Testdoc2 italic text + +
+
+ + Testdoc2 Heading 1 Sized Text + +
+
+ + Testdoc2 Heading 2 Sized Text + +
+
+ Testdoc2 Heading 3 Sized Text +
+
+ Testdoc2 Heading 4 Sized Text +
+
+ Testdoc2 Heading 5 Sized Text +
+ + + + + + + + + + + + + + + + +
+ + Testdoc2 Table: Column 1 Row 0 + + + + Testdoc2 Table: Column 2 Row 0 + + + + Testdoc2 Table: Column 3 Row 0 + +
+ + Testdoc2 Table: Column 1 Row 1 + + + + Testdoc2 Table: Column 2 Row 1 + + + + Testdoc2 Table: Column 3 Row 1 + +
+ + Testdoc2 Table: Column 1 Row 2 + + + + Testdoc2 Table: Column 2 Row 2 + + + + Testdoc2 Table: Column 3 Row 2 + +
+ +
+ + diff --git a/test/integration/connectors/expected_results/notion/file_data/1572c3765a0a806299f0dd6999f9e4c7.json b/test/integration/connectors/expected_results/notion/file_data/1572c3765a0a806299f0dd6999f9e4c7.json new file mode 100644 index 000000000..e58937d7e --- /dev/null +++ b/test/integration/connectors/expected_results/notion/file_data/1572c3765a0a806299f0dd6999f9e4c7.json @@ -0,0 +1,39 @@ +{ + "identifier": "1572c3765a0a806299f0dd6999f9e4c7", + "connector_type": "notion", + "source_identifiers": { + "filename": "1572c3765a0a806299f0dd6999f9e4c7.html", + "fullpath": "1572c3765a0a806299f0dd6999f9e4c7.html", + "rel_path": "1572c3765a0a806299f0dd6999f9e4c7.html" + }, + "metadata": { + "url": null, + "version": null, + "record_locator": { + "page_id": "1572c3765a0a806299f0dd6999f9e4c7" + }, + "date_created": "2024-12-09T18: 13: 00.000Z", + "date_modified": "2024-12-24T18: 58: 00.000Z", + "date_processed": "1735078568.778562", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "created_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "last_edited_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "parent": { + "page_id": "1182c376-5a0a-8042-9a2a-fb003e00d57b", + "type": "page_id" + }, + "url": "https://www.notion.so/test-doc1-1572c3765a0a806299f0dd6999f9e4c7" + }, + "reprocess": false, + "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmpw56i_s_f/1572c3765a0a806299f0dd6999f9e4c7", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/test_notion.py b/test/integration/connectors/test_notion.py new file mode 100644 index 000000000..500a9c1e8 --- /dev/null +++ b/test/integration/connectors/test_notion.py @@ -0,0 +1,110 @@ +import os + +from test.integration.connectors.utils.validation.source import ( + SourceValidationConfigs, + run_all_validations, + update_fixtures, + get_all_file_data + +) 
+from unstructured_ingest.v2.interfaces import Downloader, Indexer +from unstructured_ingest.v2.processes.connectors.notion.connector import ( + NotionAccessConfig, + NotionConnectionConfig, + NotionDownloader, + NotionDownloaderConfig, + NotionIndexer, + NotionIndexerConfig, +) + + +def test_notion_source(temp_dir): + # Retrieve environment variables + notion_api_key = os.environ["NOTION_API_KEY"] + + # Create connection and indexer configurations + access_config = NotionAccessConfig(notion_api_key=notion_api_key) + connection_config = NotionConnectionConfig( + access_config=access_config, + ) + index_config = NotionIndexerConfig( + page_ids=["1572c3765a0a806299f0dd6999f9e4c7"], recursive=False + ) + + download_config = NotionDownloaderConfig(download_dir=temp_dir) + + # Instantiate indexer and downloader + indexer = NotionIndexer( + connection_config=connection_config, + index_config=index_config, + ) + downloader = NotionDownloader( + connection_config=connection_config, + download_config=download_config, + ) + + # Run the source connector validation + source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="notion", + expected_num_files=1, + validate_downloaded_files=True, + exclude_fields_extend=["metadata.date_created", "metadata.date_modified"] + ), + ) + + +def source_connector_validation( + indexer: Indexer, + downloader: Downloader, + configs: SourceValidationConfigs, + overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true", +) -> None: + # Run common validations on the process of running a source connector, supporting dynamic + # validators that get passed in along with comparisons on the saved expected values. + # If overwrite_fixtures is set to True, will ignore all validators but instead overwrite the + # expected values with what gets generated by this test. 
+ all_predownload_file_data = [] + all_postdownload_file_data = [] + indexer.precheck() + download_dir = downloader.download_config.download_dir + test_output_dir = configs.test_output_dir() + + for file_data in indexer.run(): + assert file_data + predownload_file_data = file_data.model_copy(deep=True) + all_predownload_file_data.append(predownload_file_data) + resp = downloader.run(file_data=file_data) + if isinstance(resp, list): + for r in resp: + postdownload_file_data = r["file_data"].model_copy(deep=True) + all_postdownload_file_data.append(postdownload_file_data) + else: + postdownload_file_data = resp["file_data"].model_copy(deep=True) + all_postdownload_file_data.append(postdownload_file_data) + if not overwrite_fixtures: + print("Running validation") + run_all_validations( + configs=configs, + predownload_file_data=all_predownload_file_data, + postdownload_file_data=all_postdownload_file_data, + download_dir=download_dir, + test_output_dir=test_output_dir, + ) + else: + print("Running fixtures update") + update_fixtures( + output_dir=test_output_dir, + download_dir=download_dir, + all_file_data=get_all_file_data( + all_predownload_file_data=all_predownload_file_data, + all_postdownload_file_data=all_postdownload_file_data, + ), + save_downloads=configs.validate_downloaded_files, + save_filedata=configs.validate_file_data, + ) + + + diff --git a/unstructured_ingest/v2/processes/connectors/__init__.py b/unstructured_ingest/v2/processes/connectors/__init__.py index 4a89077a9..d9d20cd92 100644 --- a/unstructured_ingest/v2/processes/connectors/__init__.py +++ b/unstructured_ingest/v2/processes/connectors/__init__.py @@ -42,6 +42,8 @@ from .mongodb import mongodb_destination_entry, mongodb_source_entry from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE from .neo4j import neo4j_destination_entry +from .notion.connector import CONNECTOR_TYPE as NOTION_CONNECTOR_TYPE +from .notion.connector import notion_source_entry from .onedrive import CONNECTOR_TYPE as 
ONEDRIVE_CONNECTOR_TYPE from .onedrive import onedrive_destination_entry, onedrive_source_entry from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE @@ -58,8 +60,6 @@ from .slack import slack_source_entry from .vectara import CONNECTOR_TYPE as VECTARA_CONNECTOR_TYPE from .vectara import vectara_destination_entry -from .notion.connector import CONNECTOR_TYPE as NOTION_CONNECTOR_TYPE -from .notion.connector import notion_source_entry add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry) add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry) diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index 40272c010..80a39d3a6 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -1,15 +1,250 @@ -from typing import Any, Generator +from typing import Any, Generator, Optional, Tuple, List import httpx from notion_client import Client as NotionClient from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint +from notion_client.api_endpoints import Endpoint from notion_client.errors import HTTPResponseError, RequestTimeoutError +from unstructured_ingest.ingest_backoff import RetryHandler +from unstructured_ingest.interfaces import RetryStrategyConfig +from unstructured_ingest.utils.dep_check import requires_dependencies from unstructured_ingest.v2.processes.connectors.notion.types.block import Block from unstructured_ingest.v2.processes.connectors.notion.types.database import Database from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import map_cells from unstructured_ingest.v2.processes.connectors.notion.types.page import Page +import notion_client.errors +from 
notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint +from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint + + +@requires_dependencies(["httpx"], extras="notion") +def _get_retry_strategy( + endpoint: Endpoint, retry_strategy_config: RetryStrategyConfig +) -> RetryHandler: + import backoff + import httpx + + retryable_exceptions = ( + httpx.TimeoutException, + httpx.HTTPStatusError, + notion_client.errors.HTTPResponseError, + ) + + return RetryHandler( + backoff.expo, + retryable_exceptions, + max_time=retry_strategy_config.max_retry_time, + max_tries=retry_strategy_config.max_retries, + logger=endpoint.parent.logger, + start_log_level=endpoint.parent.logger.level, + backoff_log_level=endpoint.parent.logger.level, + ) + + +def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]: + if retry_strategy_config := getattr(endpoint, "retry_strategy_config"): + return _get_retry_strategy(endpoint=endpoint, retry_strategy_config=retry_strategy_config) + return None + + +class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint): + def __init__( + self, + *args, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.retry_strategy_config = retry_strategy_config + + @property + def retry_handler(self) -> Optional[RetryHandler]: + return get_retry_handler(self) + + def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]: + resp: dict = ( + self.retry_handler(super().list, block_id=block_id, **kwargs) + if self.retry_handler + else super().list(block_id=block_id, **kwargs) + ) # type: ignore + child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])] + return child_blocks, resp + + def iterate_list( + self, + block_id: str, + **kwargs: Any, + ) -> Generator[List[Block], None, None]: + while True: + response: dict = ( + self.retry_handler(super().list, block_id=block_id, **kwargs) + if self.retry_handler + else 
super().list(block_id=block_id, **kwargs) + ) # type: ignore + child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])] + yield child_blocks + + next_cursor = response.get("next_cursor") + if not response.get("has_more") or not next_cursor: + return + + +class DatabasesEndpoint(NotionDatabasesEndpoint): + def __init__( + self, + *args, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.retry_strategy_config = retry_strategy_config + + @property + def retry_handler(self) -> Optional[RetryHandler]: + return get_retry_handler(self) + + def retrieve(self, database_id: str, **kwargs: Any) -> Database: + resp: dict = ( + self.retry_handler(super().retrieve, database_id=database_id, **kwargs) + if (self.retry_handler) + else (super().retrieve(database_id=database_id, **kwargs)) + ) # type: ignore + return Database.from_dict(data=resp) + + @requires_dependencies(["httpx"], extras="notion") + def retrieve_status(self, database_id: str, **kwargs) -> int: + import httpx + + request = self.parent._build_request( + method="HEAD", + path=f"databases/{database_id}", + auth=kwargs.get("auth"), + ) + try: + response: httpx.Response = ( + self.retry_handler(self.parent.client.send, request) + if (self.retry_handler) + else (self.parent.client.send(request)) + ) # type: ignore + return response.status_code + except httpx.TimeoutException: + raise RequestTimeoutError() + + def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]: + """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database. 
+ + *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)* + """ # noqa: E501 + resp: dict = ( + self.retry_handler(super().query, database_id=database_id, **kwargs) + if (self.retry_handler) + else (super().query(database_id=database_id, **kwargs)) + ) # type: ignore + pages = [Page.from_dict(data=p) for p in resp.pop("results")] + for p in pages: + p.properties = map_cells(p.properties) + return pages, resp + + def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]: + while True: + response: dict = ( + self.retry_handler(super().query, database_id=database_id, **kwargs) + if (self.retry_handler) + else (super().query(database_id=database_id, **kwargs)) + ) # type: ignore + pages = [Page.from_dict(data=p) for p in response.pop("results", [])] + for p in pages: + p.properties = map_cells(p.properties) + yield pages + + next_cursor = response.get("next_cursor") + if not response.get("has_more") or not next_cursor: + return + + +class BlocksEndpoint(NotionBlocksEndpoint): + def __init__( + self, + *args: Any, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs: Any, + ) -> None: + super().__init__(*args, **kwargs) + self.retry_strategy_config = retry_strategy_config + self.children = BlocksChildrenEndpoint( + retry_strategy_config=retry_strategy_config, + *args, + **kwargs, + ) + + @property + def retry_handler(self) -> Optional[RetryHandler]: + return get_retry_handler(self) + + def retrieve(self, block_id: str, **kwargs: Any) -> Block: + resp: dict = ( + self.retry_handler(super().retrieve, block_id=block_id, **kwargs) + if (self.retry_handler) + else (super().retrieve(block_id=block_id, **kwargs)) + ) # type: ignore + return Block.from_dict(data=resp) + + +class PagesEndpoint(NotionPagesEndpoint): + def __init__( + self, + *args, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs, + ): + super().__init__(*args, **kwargs) + 
self.retry_strategy_config = retry_strategy_config + + @property + def retry_handler(self) -> Optional[RetryHandler]: + return get_retry_handler(self) + + def retrieve(self, page_id: str, **kwargs: Any) -> Page: + resp: dict = ( + self.retry_handler(super().retrieve, page_id=page_id, **kwargs) + if (self.retry_handler) + else (super().retrieve(page_id=page_id, **kwargs)) + ) # type: ignore + return Page.from_dict(data=resp) + + @requires_dependencies(["httpx"], extras="notion") + def retrieve_status(self, page_id: str, **kwargs) -> int: + import httpx + + request = self.parent._build_request( + method="HEAD", + path=f"pages/{page_id}", + auth=kwargs.get("auth"), + ) + try: + response: httpx.Response = ( + self.retry_handler(self.parent.client.send, request) + if (self.retry_handler) + else (self.parent.client.send(request)) + ) # type: ignore + return response.status_code + except httpx.TimeoutException: + raise RequestTimeoutError() + + +class Client(NotionClient): + def __init__( + self, + *args: Any, + retry_strategy_config: Optional[RetryStrategyConfig] = None, + **kwargs: Any, + ) -> None: + super().__init__(*args, **kwargs) + self.blocks = BlocksEndpoint(retry_strategy_config=retry_strategy_config, parent=self) + self.pages = PagesEndpoint(retry_strategy_config=retry_strategy_config, parent=self) + self.databases = DatabasesEndpoint(retry_strategy_config=retry_strategy_config, parent=self) class AsyncBlocksChildrenEndpoint(NotionBlocksChildrenEndpoint): @@ -17,7 +252,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._http_client = httpx.AsyncClient() - async def list(self, block_id: str, **kwargs: Any) -> tuple[list[Block], dict]: + async def list(self, block_id: str, **kwargs: Any) -> tuple[List[Block], dict]: """Fetch the list of child blocks asynchronously.""" try: response = await self._http_client.get( @@ -35,7 +270,7 @@ async def list(self, block_id: str, **kwargs: Any) -> tuple[list[Block], dict]: async def iterate_list( 
self, block_id: str, **kwargs: Any - ) -> Generator[list[Block], None, None]: + ) -> Generator[List[Block], None, None]: """Fetch the list of child blocks in pages asynchronously.""" next_cursor = None while True: @@ -72,7 +307,7 @@ async def retrieve(self, database_id: str, **kwargs: Any) -> Database: return Database.from_dict(data=response.json()) - async def query(self, database_id: str, **kwargs: Any) -> tuple[list[Page], dict]: + async def query(self, database_id: str, **kwargs: Any) -> tuple[List[Page], dict]: """Query a database asynchronously.""" try: response = await self._http_client.post( diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 906ac04b8..68da3372c 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -32,6 +32,17 @@ class NotionAccessConfig(AccessConfig): class NotionConnectionConfig(ConnectionConfig): access_config: Secret[NotionAccessConfig] + @requires_dependencies(["notion_client"], extras="notion") + def get_client(self) -> "get_client": + from unstructured_ingest.v2.processes.connectors.notion.client import Client + + return Client( + notion_version=NOTION_API_VERSION, + auth=self.access_config.get_secret_value().notion_api_key, + logger=logger, + log_level=logger.level, + ) + class NotionIndexerConfig(IndexerConfig): page_ids: Optional[list[str]] = Field( @@ -59,26 +70,15 @@ class NotionIndexer(Indexer): index_config: NotionIndexerConfig def is_async(self) -> bool: - return True - - @requires_dependencies(["notion_client"], extras="notion") - async def get_client(self) -> "get_client": - from unstructured_ingest.v2.processes.connectors.notion.client import AsyncClient as Client + return False - return Client( - notion_version=NOTION_API_VERSION, - auth=self.connection_config.notion_api_key.get_secret_value().notion_api_key, - logger=logger, - 
log_level=logger.level, - ) - - async def precheck(self) -> None: + def precheck(self) -> None: """Check the connection to the Notion API.""" try: - client = await self.get_client() + client = self.connection_config.get_client() # Perform a simple request to verify connection request = client._build_request("HEAD", "users") - response = await client.client.send(request) + response = client.client.send(request) response.raise_for_status() except Exception as e: @@ -86,11 +86,7 @@ async def precheck(self) -> None: raise SourceConnectionError(f"Failed to validate connection: {e}") def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - # Synchronous run is not implemented - raise NotImplementedError() - - async def run_async(self, **kwargs: Any) -> AsyncGenerator[None, None]: - client = await self.get_client() + client = self.connection_config.get_client() processed_pages: set[str] = set() processed_databases: set[str] = set() @@ -105,12 +101,12 @@ async def run_async(self, **kwargs: Any) -> AsyncGenerator[None, None]: processed_pages.add(page_id) pages_to_process.remove(page_id) - file_data = await self.get_page_file_data(page_id=page_id, client=client) + file_data = self.get_page_file_data(page_id=page_id, client=client) if file_data: yield file_data if self.index_config.recursive: - (child_pages, child_databases) = await self.get_child_pages_and_databases( + (child_pages, child_databases) = self.get_child_pages_and_databases( page_id=page_id, client=client, processed_pages=processed_pages, @@ -125,16 +121,14 @@ async def run_async(self, **kwargs: Any) -> AsyncGenerator[None, None]: continue processed_databases.add(database_id) databases_to_process.remove(database_id) - file_data = await self.get_database_file_data( - database_id=database_id, client=client - ) + file_data = self.get_database_file_data(database_id=database_id, client=client) if file_data: yield file_data if self.index_config.recursive: ( child_pages, child_databases, - ) = await 
self.get_child_pages_and_databases_from_database( + ) = self.get_child_pages_and_databases_from_database( database_id=database_id, client=client, processed_pages=processed_pages, @@ -144,16 +138,16 @@ async def run_async(self, **kwargs: Any) -> AsyncGenerator[None, None]: databases_to_process.update(child_databases) @requires_dependencies(["notion_client"], extras="notion") - async def get_page_file_data(self, page_id: str, client: "get_client") -> Optional[FileData]: + def get_page_file_data(self, page_id: str, client: "get_client") -> Optional[FileData]: try: - page_metadata = await client.pages.retrieve(page_id=page_id) # type: ignore + page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore date_created = page_metadata.created_time date_modified = page_metadata.last_edited_time identifier = page_id source_identifiers = SourceIdentifiers( filename=f"{page_id}.html", - fullpath=page_id, - rel_path=page_id, + fullpath=f"{page_id}.html", + rel_path=f"{page_id}.html", ) metadata = FileDataSourceMetadata( date_created=date_created, @@ -161,7 +155,14 @@ async def get_page_file_data(self, page_id: str, client: "get_client") -> Option record_locator={"page_id": page_id}, date_processed=str(time()), ) - additional_metadata = page_metadata + # additional_metadata = page_metadata + additional_metadata = { + 'created_by': page_metadata.created_by, + 'last_edited_by': page_metadata.last_edited_by, + 'parent': page_metadata.parent, + 'url': page_metadata.url + } + return FileData( identifier=identifier, connector_type=CONNECTOR_TYPE, @@ -174,19 +175,17 @@ async def get_page_file_data(self, page_id: str, client: "get_client") -> Option return None @requires_dependencies(["Client"], extras="notion") - async def get_database_file_data( - self, database_id: str, client: "get_client" - ) -> Optional[FileData]: + def get_database_file_data(self, database_id: str, client: "get_client") -> Optional[FileData]: try: # type: ignore - database_metadata = await 
client.databases.retrieve(database_id=database_id) + database_metadata = client.databases.retrieve(database_id=database_id) date_created = database_metadata.created_time date_modified = database_metadata.last_edited_time identifier = database_id source_identifiers = SourceIdentifiers( filename=f"{database_id}.html", - fullpath=database_id, - rel_path=database_id, + fullpath=f"{database_id}.html", + rel_path=f"{database_id}.html", ) metadata = FileDataSourceMetadata( date_created=date_created, @@ -206,7 +205,7 @@ async def get_database_file_data( logger.error(f"Error retrieving database {database_id}: {e}") return None - async def get_child_pages_and_databases( + def get_child_pages_and_databases( self, page_id: str, client: "get_client", @@ -217,7 +216,7 @@ async def get_child_pages_and_databases( get_recursive_content_from_page, ) - child_content = await get_recursive_content_from_page( + child_content = get_recursive_content_from_page( client=client, page_id=page_id, logger=logger, @@ -226,10 +225,10 @@ async def get_child_pages_and_databases( child_databases = set(child_content.child_databases) - processed_databases return child_pages, child_databases - async def get_child_pages_and_databases_from_database( + def get_child_pages_and_databases_from_database( self, database_id: str, - client: get_client, + client: "get_client", processed_pages: set[str], processed_databases: set[str], ) -> tuple[set[str], set[str]]: @@ -237,7 +236,7 @@ async def get_child_pages_and_databases_from_database( get_recursive_content_from_database, ) - child_content = await get_recursive_content_from_database( + child_content = get_recursive_content_from_database( client=client, database_id=database_id, logger=logger, @@ -246,6 +245,10 @@ async def get_child_pages_and_databases_from_database( child_databases = set(child_content.child_databases) - processed_databases return child_pages, child_databases + async def run_async(self, **kwargs: Any) -> AsyncGenerator[None, None]: + # 
Asynchronous run is not implemented + raise NotImplementedError() + class NotionDownloaderConfig(DownloaderConfig): pass @@ -285,6 +288,7 @@ def download_page(self, client, page_id: str, file_data: FileData) -> DownloadRe page_id=page_id, logger=logger, ) + if text_extraction.html: download_path = self.get_download_path(file_data=file_data) download_path.parent.mkdir(parents=True, exist_ok=True) From 7df67c3eed0fadee82b836f00d33a8373d37d4d4 Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Mon, 30 Dec 2024 11:59:12 -0500 Subject: [PATCH 42/48] add integration tests for downloading notion database --- .../notion_database/directory_structure.json | 5 ++ .../1572c3765a0a80d3a34ac5c0eecd1e88.html | 24 +++++++++ .../1572c3765a0a80d3a34ac5c0eecd1e88.json | 39 ++++++++++++++ .../directory_structure.json | 0 .../1572c3765a0a806299f0dd6999f9e4c7.html | 0 .../1572c3765a0a806299f0dd6999f9e4c7.json | 0 test/integration/connectors/test_notion.py | 51 ++++++++++++++++--- .../v2/processes/connectors/notion/client.py | 8 +-- .../processes/connectors/notion/connector.py | 32 +++++++----- 9 files changed, 135 insertions(+), 24 deletions(-) create mode 100644 test/integration/connectors/expected_results/notion_database/directory_structure.json create mode 100644 test/integration/connectors/expected_results/notion_database/downloads/1572c3765a0a80d3a34ac5c0eecd1e88.html create mode 100644 test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json rename test/integration/connectors/expected_results/{notion => notion_page}/directory_structure.json (100%) rename test/integration/connectors/expected_results/{notion => notion_page}/downloads/1572c3765a0a806299f0dd6999f9e4c7.html (100%) rename test/integration/connectors/expected_results/{notion => notion_page}/file_data/1572c3765a0a806299f0dd6999f9e4c7.json (100%) diff --git a/test/integration/connectors/expected_results/notion_database/directory_structure.json 
b/test/integration/connectors/expected_results/notion_database/directory_structure.json new file mode 100644 index 000000000..d9b196898 --- /dev/null +++ b/test/integration/connectors/expected_results/notion_database/directory_structure.json @@ -0,0 +1,5 @@ +{ + "directory_structure": [ + "1572c3765a0a80d3a34ac5c0eecd1e88.html" + ] + } \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_database/downloads/1572c3765a0a80d3a34ac5c0eecd1e88.html b/test/integration/connectors/expected_results/notion_database/downloads/1572c3765a0a80d3a34ac5c0eecd1e88.html new file mode 100644 index 000000000..7851bbe0b --- /dev/null +++ b/test/integration/connectors/expected_results/notion_database/downloads/1572c3765a0a80d3a34ac5c0eecd1e88.html @@ -0,0 +1,24 @@ + + + + + + + + + +
+ Author + + Item +
+
+ + test-author + +
+
+
+ test-page-in-database +
+
diff --git a/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json b/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json new file mode 100644 index 000000000..0b98bf07b --- /dev/null +++ b/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json @@ -0,0 +1,39 @@ +{ + "identifier": "1572c3765a0a80d3a34ac5c0eecd1e88", + "connector_type": "notion", + "source_identifiers": { + "filename": "1572c3765a0a80d3a34ac5c0eecd1e88.html", + "fullpath": "1572c3765a0a80d3a34ac5c0eecd1e88.html", + "rel_path": "1572c3765a0a80d3a34ac5c0eecd1e88.html" + }, + "metadata": { + "url": null, + "version": null, + "record_locator": { + "database_id": "1572c3765a0a80d3a34ac5c0eecd1e88" + }, + "date_created": "2024-12-09T11:54:00.000Z", + "date_modified": "2024-12-30T15:19:00.000Z", + "date_processed": "1735577045.091182", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "created_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "last_edited_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "parent": { + "page_id": "1572c376-5a0a-80d8-9619-cb35a622b8cc", + "type": "page_id" + }, + "url": "https://www.notion.so/1572c3765a0a80d3a34ac5c0eecd1e88" + }, + "reprocess": false, + "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmpnqb7824d/1572c3765a0a80d3a34ac5c0eecd1e88.html", + "display_name": null + } \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion/directory_structure.json b/test/integration/connectors/expected_results/notion_page/directory_structure.json similarity index 100% rename from test/integration/connectors/expected_results/notion/directory_structure.json rename to test/integration/connectors/expected_results/notion_page/directory_structure.json diff 
--git a/test/integration/connectors/expected_results/notion/downloads/1572c3765a0a806299f0dd6999f9e4c7.html b/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html similarity index 100% rename from test/integration/connectors/expected_results/notion/downloads/1572c3765a0a806299f0dd6999f9e4c7.html rename to test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html diff --git a/test/integration/connectors/expected_results/notion/file_data/1572c3765a0a806299f0dd6999f9e4c7.json b/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json similarity index 100% rename from test/integration/connectors/expected_results/notion/file_data/1572c3765a0a806299f0dd6999f9e4c7.json rename to test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json diff --git a/test/integration/connectors/test_notion.py b/test/integration/connectors/test_notion.py index 500a9c1e8..d49231034 100644 --- a/test/integration/connectors/test_notion.py +++ b/test/integration/connectors/test_notion.py @@ -2,10 +2,9 @@ from test.integration.connectors.utils.validation.source import ( SourceValidationConfigs, + get_all_file_data, run_all_validations, update_fixtures, - get_all_file_data - ) from unstructured_ingest.v2.interfaces import Downloader, Indexer from unstructured_ingest.v2.processes.connectors.notion.connector import ( @@ -18,7 +17,45 @@ ) -def test_notion_source(temp_dir): +def test_notion_source_database(temp_dir): + # Retrieve environment variables + notion_api_key = os.environ["NOTION_API_KEY"] + + # Create connection and indexer configurations + access_config = NotionAccessConfig(notion_api_key=notion_api_key) + connection_config = NotionConnectionConfig( + access_config=access_config, + ) + index_config = NotionIndexerConfig( + database_ids=["1572c3765a0a80d3a34ac5c0eecd1e88"], recursive=False + ) + + 
download_config = NotionDownloaderConfig(download_dir=temp_dir) + + # Instantiate indexer and downloader + indexer = NotionIndexer( + connection_config=connection_config, + index_config=index_config, + ) + downloader = NotionDownloader( + connection_config=connection_config, + download_config=download_config, + ) + + # Run the source connector validation + source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="notion_database", + expected_num_files=1, + validate_downloaded_files=True, + exclude_fields_extend=["metadata.date_created", "metadata.date_modified"], + ), + ) + + +def test_notion_source_page(temp_dir): # Retrieve environment variables notion_api_key = os.environ["NOTION_API_KEY"] @@ -48,10 +85,10 @@ def test_notion_source(temp_dir): indexer=indexer, downloader=downloader, configs=SourceValidationConfigs( - test_id="notion", + test_id="notion_page", expected_num_files=1, validate_downloaded_files=True, - exclude_fields_extend=["metadata.date_created", "metadata.date_modified"] + exclude_fields_extend=["metadata.date_created", "metadata.date_modified"], ), ) @@ -84,6 +121,7 @@ def source_connector_validation( else: postdownload_file_data = resp["file_data"].model_copy(deep=True) all_postdownload_file_data.append(postdownload_file_data) + if not overwrite_fixtures: print("Running validation") run_all_validations( @@ -105,6 +143,3 @@ def source_connector_validation( save_downloads=configs.validate_downloaded_files, save_filedata=configs.validate_file_data, ) - - - diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index 80a39d3a6..8fb769e90 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -1,10 +1,13 @@ -from typing import Any, Generator, Optional, Tuple, List +from typing import Any, Generator, List, Optional, Tuple 
import httpx +import notion_client.errors from notion_client import Client as NotionClient from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint +from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint from notion_client.api_endpoints import Endpoint +from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint from notion_client.errors import HTTPResponseError, RequestTimeoutError from unstructured_ingest.ingest_backoff import RetryHandler @@ -14,9 +17,6 @@ from unstructured_ingest.v2.processes.connectors.notion.types.database import Database from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import map_cells from unstructured_ingest.v2.processes.connectors.notion.types.page import Page -import notion_client.errors -from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint -from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint @requires_dependencies(["httpx"], extras="notion") diff --git a/unstructured_ingest/v2/processes/connectors/notion/connector.py b/unstructured_ingest/v2/processes/connectors/notion/connector.py index 68da3372c..0b747f9ad 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/connector.py +++ b/unstructured_ingest/v2/processes/connectors/notion/connector.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from time import time -from typing import Any, AsyncGenerator, Generator, Optional +from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional from pydantic import UUID4, Field, Secret @@ -21,6 +21,9 @@ from unstructured_ingest.v2.logger import logger from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry +if TYPE_CHECKING: + from unstructured_ingest.v2.processes.connectors.notion.client import Client + NOTION_API_VERSION = "2022-06-28" CONNECTOR_TYPE 
= "notion" @@ -33,7 +36,7 @@ class NotionConnectionConfig(ConnectionConfig): access_config: Secret[NotionAccessConfig] @requires_dependencies(["notion_client"], extras="notion") - def get_client(self) -> "get_client": + def get_client(self) -> "Client": from unstructured_ingest.v2.processes.connectors.notion.client import Client return Client( @@ -138,7 +141,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: databases_to_process.update(child_databases) @requires_dependencies(["notion_client"], extras="notion") - def get_page_file_data(self, page_id: str, client: "get_client") -> Optional[FileData]: + def get_page_file_data(self, page_id: str, client: "Client") -> Optional[FileData]: try: page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore date_created = page_metadata.created_time @@ -157,10 +160,10 @@ def get_page_file_data(self, page_id: str, client: "get_client") -> Optional[Fil ) # additional_metadata = page_metadata additional_metadata = { - 'created_by': page_metadata.created_by, - 'last_edited_by': page_metadata.last_edited_by, - 'parent': page_metadata.parent, - 'url': page_metadata.url + "created_by": page_metadata.created_by, + "last_edited_by": page_metadata.last_edited_by, + "parent": page_metadata.parent, + "url": page_metadata.url, } return FileData( @@ -174,8 +177,8 @@ def get_page_file_data(self, page_id: str, client: "get_client") -> Optional[Fil logger.error(f"Error retrieving page {page_id}: {e}") return None - @requires_dependencies(["Client"], extras="notion") - def get_database_file_data(self, database_id: str, client: "get_client") -> Optional[FileData]: + @requires_dependencies(["notion_client"], extras="notion") + def get_database_file_data(self, database_id: str, client: "Client") -> Optional[FileData]: try: # type: ignore database_metadata = client.databases.retrieve(database_id=database_id) @@ -193,7 +196,12 @@ def get_database_file_data(self, database_id: str, client: "get_client") -> Opti 
record_locator={"database_id": database_id}, date_processed=str(time()), ) - additional_metadata = database_metadata + additional_metadata = { + "created_by": database_metadata.created_by, + "last_edited_by": database_metadata.last_edited_by, + "parent": database_metadata.parent, + "url": database_metadata.url, + } return FileData( identifier=identifier, connector_type=CONNECTOR_TYPE, @@ -208,7 +216,7 @@ def get_database_file_data(self, database_id: str, client: "get_client") -> Opti def get_child_pages_and_databases( self, page_id: str, - client: "get_client", + client: "Client", processed_pages: set[str], processed_databases: set[str], ) -> tuple[set[str], set[str]]: @@ -228,7 +236,7 @@ def get_child_pages_and_databases( def get_child_pages_and_databases_from_database( self, database_id: str, - client: "get_client", + client: "Client", processed_pages: set[str], processed_databases: set[str], ) -> tuple[set[str], set[str]]: From 6bfe4ebad00efe92bfa3bf70843cd0ef64513472 Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Mon, 30 Dec 2024 14:42:04 -0500 Subject: [PATCH 43/48] fix expected output files in notion e2e test --- ...22b2c22996b435b9de2ee0e9d2b04bc.html.json} | 23 +- ...bc49-2e06-4f01-8031-bf283be58a60.html.json | 46 + .../438dbc49-2e06-4f01-8031-bf283be58a60.json | 24 - ...ea53-f2b3-45b4-8638-2212fd054d73.html.json | 46 + .../4695ea53-f2b3-45b4-8638-2212fd054d73.json | 24 - ...f29c-799a-4d7b-93ce-b11bcaede531.html.json | 68 ++ .../5481f29c-799a-4d7b-93ce-b11bcaede531.json | 35 - ...7009-e6b2-47f3-a8ff-f159fd8b69f5.html.json | 46 + .../60377009-e6b2-47f3-a8ff-f159fd8b69f5.json | 24 - ...38f2-26e1-4de7-81e6-354045d4d007.html.json | 46 + .../898538f2-26e1-4de7-81e6-354045d4d007.json | 24 - ...ee42-2167-441c-af6c-7b2cff268809.html.json | 30 + .../8d8bee42-2167-441c-af6c-7b2cff268809.json | 19 - ...ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json | 46 + .../8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json | 24 - ...be3d-cbe0-4e28-ad46-2170d40a8d37.html.json | 52 ++ 
.../9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json | 30 - ...2157-721e-4207-b3b7-527762b782c2.html.json | 510 ++++++++++ .../b2a12157-721e-4207-b3b7-527762b782c2.json | 268 ------ ...4566-4c7a-488b-ac2a-1292ee507fcb.html.json | 881 ++++++++++++++++++ .../c47a4566-4c7a-488b-ac2a-1292ee507fcb.json | 463 --------- ...e584-30b1-4551-b533-e6a5759af842.html.json | 46 + .../dfcbe584-30b1-4551-b533-e6a5759af842.json | 24 - ...fcd4-8ca0-4638-8212-1a5726461029.html.json | 46 + .../feccfcd4-8ca0-4638-8212-1a5726461029.json | 24 - ...149e-6240-4431-8e98-a04a2e460a66.html.json | 52 ++ .../fee2149e-6240-4431-8e98-a04a2e460a66.json | 30 - 27 files changed, 1932 insertions(+), 1019 deletions(-) rename test_e2e/expected-structured-output/notion/{122b2c22-996b-435b-9de2-ee0e9d2b04bc.json => 122b2c22996b435b9de2ee0e9d2b04bc.html.json} (88%) create mode 100644 test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.html.json delete mode 100644 test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json create mode 100644 test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.html.json delete mode 100644 test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json create mode 100644 test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.html.json delete mode 100644 test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json create mode 100644 test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.html.json delete mode 100644 test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json create mode 100644 test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.html.json delete mode 100644 test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json create mode 100644 
test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.html.json delete mode 100644 test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json create mode 100644 test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json delete mode 100644 test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json create mode 100644 test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.html.json delete mode 100644 test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json create mode 100644 test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.html.json delete mode 100644 test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json create mode 100644 test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html.json delete mode 100644 test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json create mode 100644 test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.html.json delete mode 100644 test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json create mode 100644 test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.html.json delete mode 100644 test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json create mode 100644 test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.html.json delete mode 100644 test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json diff --git a/test_e2e/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json b/test_e2e/expected-structured-output/notion/122b2c22996b435b9de2ee0e9d2b04bc.html.json similarity index 88% rename from test_e2e/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json rename to 
test_e2e/expected-structured-output/notion/122b2c22996b435b9de2ee0e9d2b04bc.html.json index a53e41bfc..321f96a0f 100644 --- a/test_e2e/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json +++ b/test_e2e/expected-structured-output/notion/122b2c22996b435b9de2ee0e9d2b04bc.html.json @@ -1,14 +1,25 @@ [ { - "element_id": "59a715faf8dcf15a6855a2c070f5d4cd", + "type": "Table", + "element_id": "81a89d50e24f9f1a276c15641adff90f", + "text": "Created time Last edited time Owner Page Tags Verification 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke New Page unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Morale Events Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T19:02:00.000Z Roman Isecke New Page With Verification expired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Vacation Policy Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Mission, Vision, Values Vision Company Updates unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Recent Press Company Updates unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Getting Started unverified 2023-08-04T18:31:00.000Z 2023-08-17T18:48:00.000Z Roman Isecke Page with every block Company Updates Policies expired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Corporate Travel Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Benefits Policies Policies unverified", "metadata": { - "filetype": "text/html", + "text_as_html": "
Created timeLast edited timeOwnerPageTagsVerification
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeNew Pageunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeMorale EventsPoliciesunverified
2023-08-04T18:31:00.000Z2023-08-04T19:02:00.000ZRoman IseckeNew Page With Verificationexpired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeVacation PolicyPoliciesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeMission, Vision, ValuesVision Company Updatesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeRecent PressCompany Updatesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeGetting Startedunverified
2023-08-04T18:31:00.000Z2023-08-17T18:48:00.000ZRoman IseckePage with every blockCompany Updates Policiesexpired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeCorporate TravelPoliciesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeBenefits PoliciesPoliciesunverified
", "languages": [ "eng" ], - "text_as_html": "
Created timeLast edited timeOwnerPageTagsVerification
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeNew Pageunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeMorale EventsPoliciesunverified
2023-08-04T18:31:00.000Z2023-08-04T19:02:00.000ZRoman IseckeNew Page With Verificationexpired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeVacation PolicyPoliciesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeMission, Vision, ValuesVision Company Updatesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeRecent PressCompany Updatesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeGetting Startedunverified
2023-08-04T18:31:00.000Z2023-08-17T18:48:00.000ZRoman IseckePage with every blockCompany Updates Policiesexpired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeCorporate TravelPoliciesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeBenefits PoliciesPoliciesunverified
" - }, - "text": "Created time Last edited time Owner Page Tags Verification 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke New Page unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Morale Events Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T19:02:00.000Z Roman Isecke New Page With Verification expired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Vacation Policy Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Mission, Vision, Values Vision Company Updates unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Recent Press Company Updates unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Getting Started unverified 2023-08-04T18:31:00.000Z 2023-08-17T18:48:00.000Z Roman Isecke Page with every block Company Updates Policies expired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Corporate Travel Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Benefits Policies Policies unverified", - "type": "Table" + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "database_id": "122b2c22996b435b9de2ee0e9d2b04bc" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z", + "permissions_data": null, + "filesize_bytes": 7710 + } + } } ] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.html.json b/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.html.json new file mode 100644 index 000000000..511deafef --- /dev/null +++ b/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.html.json @@ -0,0 +1,46 @@ +[ + { + "type": "Title", + "element_id": 
"b21b7e1a9374c90fad7b4ca0571a9a35", + "text": "New Page", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "438dbc49-2e06-4f01-8031-bf283be58a60" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 457 + } + } + }, + { + "type": "NarrativeText", + "element_id": "6c9a1c66c3f1ef2814be722d6ff431b1", + "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "438dbc49-2e06-4f01-8031-bf283be58a60" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 457 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json b/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json deleted file mode 100644 index 93f6daa5d..000000000 --- a/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "b21b7e1a9374c90fad7b4ca0571a9a35", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "New Page", - "type": "Title" - }, - { - "element_id": "6c9a1c66c3f1ef2814be722d6ff431b1", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. 
This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.html.json b/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.html.json new file mode 100644 index 000000000..9e54c892e --- /dev/null +++ b/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.html.json @@ -0,0 +1,46 @@ +[ + { + "type": "Title", + "element_id": "23d6a73618cedf6ecc9f28279cb62421", + "text": "Morale Events", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "4695ea53-f2b3-45b4-8638-2212fd054d73" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 409 + } + } + }, + { + "type": "NarrativeText", + "element_id": "cca3a9ec1c93fe24880b41dd9988d72d", + "text": "Notion Tip: Morale events increase employee satisfaction, motivation, and well-being, while promoting community and teamwork, resulting in higher productivity and retention rates.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "4695ea53-f2b3-45b4-8638-2212fd054d73" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 409 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json b/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json deleted file mode 100644 index 025aa548b..000000000 --- 
a/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "23d6a73618cedf6ecc9f28279cb62421", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Morale Events", - "type": "Title" - }, - { - "element_id": "cca3a9ec1c93fe24880b41dd9988d72d", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Morale events increase employee satisfaction, motivation, and well-being, while promoting community and teamwork, resulting in higher productivity and retention rates.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.html.json b/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.html.json new file mode 100644 index 000000000..c0a5b682f --- /dev/null +++ b/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.html.json @@ -0,0 +1,68 @@ +[ + { + "type": "Title", + "element_id": "67500029518a859dc034db1601bf5fbe", + "text": "New Page With Verification", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "5481f29c-799a-4d7b-93ce-b11bcaede531" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T19:02:00.000Z", + "permissions_data": null, + "filesize_bytes": 765 + } + } + }, + { + "type": "NarrativeText", + "element_id": "49873871ff17a9ffb6b6d4e11f6ea86d", + "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. 
This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "5481f29c-799a-4d7b-93ce-b11bcaede531" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T19:02:00.000Z", + "permissions_data": null, + "filesize_bytes": 765 + } + } + }, + { + "type": "NarrativeText", + "element_id": "d32db2846683d992270e704251ca5c80", + "text": "Notion Tip: An owner of a page can verify it by clicking on the verification button above and choosing to verify the page for either a set amount of time or indefinitely!", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "5481f29c-799a-4d7b-93ce-b11bcaede531" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T19:02:00.000Z", + "permissions_data": null, + "filesize_bytes": 765 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json b/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json deleted file mode 100644 index 6d887d5fe..000000000 --- a/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json +++ /dev/null @@ -1,35 +0,0 @@ -[ - { - "element_id": "67500029518a859dc034db1601bf5fbe", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "New Page With Verification", - "type": "Title" - }, - { - "element_id": "49873871ff17a9ffb6b6d4e11f6ea86d", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. 
This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", - "type": "NarrativeText" - }, - { - "element_id": "d32db2846683d992270e704251ca5c80", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: An owner of a page can verify it by clicking on the verification button above and choosing to verify the page for either a set amount of time or indefinitely!", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.html.json b/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.html.json new file mode 100644 index 000000000..355f6d5f9 --- /dev/null +++ b/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.html.json @@ -0,0 +1,46 @@ +[ + { + "type": "Title", + "element_id": "4d5b94a60a5ae180faa4753897afbc5f", + "text": "Vacation Policy", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "60377009-e6b2-47f3-a8ff-f159fd8b69f5" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 393 + } + } + }, + { + "type": "NarrativeText", + "element_id": "e3b7316f50c3edad4ea72b199ad6b7d9", + "text": "Notion Tip: Vacation policies are crucial for employee well-being and productivity. 
They provide rest and recharge, reduce burnout and increase job satisfaction.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "60377009-e6b2-47f3-a8ff-f159fd8b69f5" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 393 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json b/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json deleted file mode 100644 index 33ea5be25..000000000 --- a/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "4d5b94a60a5ae180faa4753897afbc5f", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Vacation Policy", - "type": "Title" - }, - { - "element_id": "e3b7316f50c3edad4ea72b199ad6b7d9", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Vacation policies are crucial for employee well-being and productivity. 
They provide rest and recharge, reduce burnout and increase job satisfaction.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.html.json b/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.html.json new file mode 100644 index 000000000..70aa87203 --- /dev/null +++ b/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.html.json @@ -0,0 +1,46 @@ +[ + { + "type": "Title", + "element_id": "52c2888160339820dfa8bb604c031ee9", + "text": "Mission, Vision, Values", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "898538f2-26e1-4de7-81e6-354045d4d007" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 462 + } + } + }, + { + "type": "NarrativeText", + "element_id": "be9ab17406409efa59dc98966370b9e7", + "text": "Notion Tip: A company mission provides direction and purpose, aligning actions and decisions towards a common goal. 
It also helps attract like-minded individuals who share the same values and vision for the company.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "898538f2-26e1-4de7-81e6-354045d4d007" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 462 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json b/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json deleted file mode 100644 index 5f944a562..000000000 --- a/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "52c2888160339820dfa8bb604c031ee9", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Mission, Vision, Values", - "type": "Title" - }, - { - "element_id": "be9ab17406409efa59dc98966370b9e7", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: A company mission provides direction and purpose, aligning actions and decisions towards a common goal. 
It also helps attract like-minded individuals who share the same values and vision for the company.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.html.json b/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.html.json new file mode 100644 index 000000000..0d66e9cc9 --- /dev/null +++ b/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.html.json @@ -0,0 +1,30 @@ +[ + { + "type": "NarrativeText", + "element_id": "5bcc7126851f18b3a41c951030def658", + "text": "Planning notes", + "metadata": { + "emphasized_text_contents": [ + "Planning notes" + ], + "emphasized_text_tags": [ + "b" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "8d8bee42-2167-441c-af6c-7b2cff268809" + }, + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-08T19:17:00.000Z", + "permissions_data": null, + "filesize_bytes": 198 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json b/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json deleted file mode 100644 index 93c03b812..000000000 --- a/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json +++ /dev/null @@ -1,19 +0,0 @@ -[ - { - "element_id": "5bcc7126851f18b3a41c951030def658", - "metadata": { - "emphasized_text_contents": [ - "Planning notes" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Planning notes", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json 
b/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json new file mode 100644 index 000000000..4f42d6c93 --- /dev/null +++ b/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json @@ -0,0 +1,46 @@ +[ + { + "type": "Title", + "element_id": "6cb5211e45401c910bcc00e277092033", + "text": "Recent Press", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "8db7ccc9-0a9c-4168-94c3-f997e60cb8cf" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 390 + } + } + }, + { + "type": "NarrativeText", + "element_id": "d7335f2ec201cb754fc463da124e5970", + "text": "Notion Tip: Telling employees about news about your company is important because it helps them stay informed about the direction of the company and their role in it.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "8db7ccc9-0a9c-4168-94c3-f997e60cb8cf" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 390 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json b/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json deleted file mode 100644 index fe4b746af..000000000 --- a/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "6cb5211e45401c910bcc00e277092033", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Recent Press", - "type": "Title" - }, - { - "element_id": "d7335f2ec201cb754fc463da124e5970", - 
"metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Telling employees about news about your company is important because it helps them stay informed about the direction of the company and their role in it.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.html.json b/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.html.json new file mode 100644 index 000000000..fa9d2ece8 --- /dev/null +++ b/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.html.json @@ -0,0 +1,52 @@ +[ + { + "type": "Title", + "element_id": "bfbac21d794d26d6aaa6f71337a632d9", + "text": "Sprint 3", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "9e20be3d-cbe0-4e28-ad46-2170d40a8d37" + }, + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z", + "permissions_data": null, + "filesize_bytes": 214 + } + } + }, + { + "type": "NarrativeText", + "element_id": "e40bd670a8fbd37e3135ea5517c5dddc", + "text": "Planning notes", + "metadata": { + "emphasized_text_contents": [ + "Planning notes" + ], + "emphasized_text_tags": [ + "b" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "9e20be3d-cbe0-4e28-ad46-2170d40a8d37" + }, + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z", + "permissions_data": null, + "filesize_bytes": 214 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json b/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json deleted file mode 100644 index 09372ad68..000000000 --- 
a/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "element_id": "bfbac21d794d26d6aaa6f71337a632d9", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Sprint 3", - "type": "Title" - }, - { - "element_id": "e40bd670a8fbd37e3135ea5517c5dddc", - "metadata": { - "emphasized_text_contents": [ - "Planning notes" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Planning notes", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.html.json b/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.html.json new file mode 100644 index 000000000..167005d2a --- /dev/null +++ b/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.html.json @@ -0,0 +1,510 @@ +[ + { + "type": "Title", + "element_id": "3e43f998d46d9c8315e1abe4f0da9d72", + "text": "Getting Started", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "NarrativeText", + "element_id": "06f9c166ac2e4f5dbb8fb754d833e477", + "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. 
This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "Title", + "element_id": "6cabe4b9a0571c55a80de4b06013ec43", + "text": "The Basics", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "Title", + "element_id": "2005f3cccf27dc851ae57fba48531195", + "text": "Create a Page", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "NarrativeText", + "element_id": "2a7bbcfd5c237889b2fda563db7462cc", + "text": "In your sidebar, click the + that appears next to the word Workspace on hover. A new page will appear. 
Give it a title and start typing like you would in any other document.", + "metadata": { + "emphasized_text_contents": [ + "Workspace" + ], + "emphasized_text_tags": [ + "b" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "Title", + "element_id": "2cd82188a21bdfed4ab1c658180a968b", + "text": "Headings", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "NarrativeText", + "element_id": "64f7ee0a4c1563451c22061bb09d339c", + "text": "You can add headings and subheadings in one of two ways:", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "ListItem", + "element_id": "0f8b41821ecbdad9478f4dbc39b1f2a4", + "text": "Type /heading or /h1 , /h2 , or /h3 to choose the heading size you want.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + 
"filesize_bytes": 3332 + } + } + }, + { + "type": "ListItem", + "element_id": "545568cf4c5fc32d56606019eee27510", + "text": "Use Markdown shortcuts, like # , ## , and ### .", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "ListItem", + "element_id": "be84c348ae4756f33d83d3145dd711af", + "text": "Create inline code by wrapping text with ` (or with the shortcut cmd/ctrl + e ).", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "Title", + "element_id": "c131435fecd2bcd5fe5fef8cb322aa55", + "text": "Toggle Lists", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "NarrativeText", + "element_id": "94ea1bd8465604db79b8f2c29420f5de", + "text": "Toggle lists streamline your content. 
Click the arrow to open.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "Title", + "element_id": "4f733e296d95143a3c49dffc35ba64d0", + "text": "Callout Blocks", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "NarrativeText", + "element_id": "7e96268c98a95ade6f6dceb82fe91d1f", + "text": "Notion Tip: Create a callout block like this by typing /call and pressing enter . Helpful for adding inline instructions, warnings, disclaimers, and tips. 
Change the emoji icon by clicking on it.", + "metadata": { + "emphasized_text_contents": [ + "Notion Tip:" + ], + "emphasized_text_tags": [ + "b" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "Title", + "element_id": "cb33f03c0a3139caeb5607fc4ea55ffd", + "text": "Code Blocks", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "NarrativeText", + "element_id": "534f19a337f8114851ba68a69035da52", + "text": "You can add code notation to any Notion page:", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "NarrativeText", + "element_id": "70a3c8f18e2d5d32db68fe2150a5a72f", + "text": "Hover over this block to see the Copy to Clipboard option!", + "metadata": { + "emphasized_text_contents": [ + "Copy to Clipboard" + ], + "emphasized_text_tags": [ + "b" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": 
"2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "ListItem", + "element_id": "d23aeea612a881e54d4e91b26d795beb", + "text": "Your teammates can select any code to comment on it.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "Title", + "element_id": "49c4ebfe04f72a068e8c2e4545d997ef", + "text": "Organizing Pages", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "NarrativeText", + "element_id": "db092f68e3263f38d6ed1af651e30e6d", + "text": "Instead of using folders, Notion lets you nest pages inside pages. Type /page and press enter to create a sub-page inside a page. 
Like this:", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "Title", + "element_id": "41a38caacb638fa8311b89164cc2cab4", + "text": "Advanced Techniques", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + }, + { + "type": "NarrativeText", + "element_id": "ed1d5c56b71619eca5b877d0e2dc1e10", + "text": "Check out this Notion Editor 101 guide for more advanced tips and how-to's.", + "metadata": { + "link_texts": [ + "Notion Editor 101" + ], + "link_urls": [ + "https://www.notion.so/notion/Notion-editor-101-create-and-edit-68c7c67047494fdb87d50185429df93e" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 3332 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json b/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json deleted file mode 100644 index 816f3b02c..000000000 --- a/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json +++ /dev/null @@ -1,268 +0,0 @@ -[ - { - "element_id": 
"3e43f998d46d9c8315e1abe4f0da9d72", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Getting Started", - "type": "Title" - }, - { - "element_id": "06f9c166ac2e4f5dbb8fb754d833e477", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", - "type": "NarrativeText" - }, - { - "element_id": "6cabe4b9a0571c55a80de4b06013ec43", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "The Basics", - "type": "Title" - }, - { - "element_id": "2005f3cccf27dc851ae57fba48531195", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Create a Page", - "type": "Title" - }, - { - "element_id": "2a7bbcfd5c237889b2fda563db7462cc", - "metadata": { - "emphasized_text_contents": [ - "Workspace" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "In your sidebar, click the + that appears next to the word Workspace on hover. A new page will appear. 
Give it a title and start typing like you would in any other document.", - "type": "NarrativeText" - }, - { - "element_id": "2cd82188a21bdfed4ab1c658180a968b", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Headings", - "type": "Title" - }, - { - "element_id": "64f7ee0a4c1563451c22061bb09d339c", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "You can add headings and subheadings in one of two ways:", - "type": "NarrativeText" - }, - { - "element_id": "0f8b41821ecbdad9478f4dbc39b1f2a4", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Type /heading or /h1 , /h2 , or /h3 to choose the heading size you want.", - "type": "ListItem" - }, - { - "element_id": "545568cf4c5fc32d56606019eee27510", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Use Markdown shortcuts, like # , ## , and ### .", - "type": "ListItem" - }, - { - "element_id": "be84c348ae4756f33d83d3145dd711af", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Create inline code by wrapping text with ` (or with the shortcut cmd/ctrl + e ).", - "type": "ListItem" - }, - { - "element_id": "c131435fecd2bcd5fe5fef8cb322aa55", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Toggle Lists", - "type": "Title" - }, - { - "element_id": "94ea1bd8465604db79b8f2c29420f5de", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Toggle lists streamline your content. 
Click the arrow to open.", - "type": "NarrativeText" - }, - { - "element_id": "4f733e296d95143a3c49dffc35ba64d0", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Callout Blocks", - "type": "Title" - }, - { - "element_id": "7e96268c98a95ade6f6dceb82fe91d1f", - "metadata": { - "emphasized_text_contents": [ - "Notion Tip:" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Create a callout block like this by typing /call and pressing enter . Helpful for adding inline instructions, warnings, disclaimers, and tips. Change the emoji icon by clicking on it.", - "type": "NarrativeText" - }, - { - "element_id": "cb33f03c0a3139caeb5607fc4ea55ffd", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Code Blocks", - "type": "Title" - }, - { - "element_id": "534f19a337f8114851ba68a69035da52", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "You can add code notation to any Notion page:", - "type": "NarrativeText" - }, - { - "element_id": "70a3c8f18e2d5d32db68fe2150a5a72f", - "metadata": { - "emphasized_text_contents": [ - "Copy to Clipboard" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Hover over this block to see the Copy to Clipboard option!", - "type": "NarrativeText" - }, - { - "element_id": "d23aeea612a881e54d4e91b26d795beb", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Your teammates can select any code to comment on it.", - "type": "ListItem" - }, - { - "element_id": "49c4ebfe04f72a068e8c2e4545d997ef", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Organizing Pages", - "type": "Title" - }, - { - "element_id": "db092f68e3263f38d6ed1af651e30e6d", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": 
"Instead of using folders, Notion lets you nest pages inside pages. Type /page and press enter to create a sub-page inside a page. Like this:", - "type": "NarrativeText" - }, - { - "element_id": "41a38caacb638fa8311b89164cc2cab4", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Advanced Techniques", - "type": "Title" - }, - { - "element_id": "ed1d5c56b71619eca5b877d0e2dc1e10", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "Notion Editor 101" - ], - "link_urls": [ - "https://www.notion.so/notion/Notion-editor-101-create-and-edit-68c7c67047494fdb87d50185429df93e" - ] - }, - "text": "Check out this Notion Editor 101 guide for more advanced tips and how-to's.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html.json b/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html.json new file mode 100644 index 000000000..aaa42d7ec --- /dev/null +++ b/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html.json @@ -0,0 +1,881 @@ +[ + { + "type": "Title", + "element_id": "cd153f73463db45ea02bd9ba6ce4168e", + "text": "Page with every block", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "098442d39ccc8a9731627be8a843d02a", + "text": "Notion Tip: Tag pages to let collaborators know what they can expect to use the page for. 
You can add one or many tags to any page in a wiki.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "868a2b2294814990d664cf13ffd1e2a7", + "text": "Heading 2", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "af888c9a9a14c9c6616cf54ac230c20a", + "text": "This is some new text", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "UncategorizedText", + "element_id": "99388232115e119009419bd8b07c93b9", + "text": "Some/less \u2192 more formatted text with other content and stuff 2023-08-07 : @Roman Isecke", + "metadata": { + "emphasized_text_contents": [ + "formatted" + ], + "emphasized_text_tags": [ + "b" + ], + "link_texts": [ + "text" + ], + "link_urls": [ + "/9ba4d6da8a574cfc81ebceac1fde52bd" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + 
"date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Table", + "element_id": "91b9abcc226cbe676d827950030c6702", + "text": "column 1 column 2 pages c1r1 content c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell Page with every block c1r2 more content c2r2 table cell Untitled this is some green text this is an equation Untitled text1 text2 Multiline cell Another cell Untitled", + "metadata": { + "text_as_html": "
column 1column 2pages
c1r1 contentc2r1 table
2023-08-08T09:00:00.000-04:00
cell
Page with every block
c1r2 more contentc2r2 table cellUntitled
this is some green textthis is an equationUntitled
text1 text2 Multiline cellAnother cellUntitled
", + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "UncategorizedText", + "element_id": "0b73b1397f01db39dc98a983bd3aeb3d", + "text": "E = {mc^2}", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "7535c23e3c0bda50ea38df65f7a64bca", + "text": "Numbered list", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "155061ede32096c81085eabf421f9fe0", + "text": "A number child", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "1ff4a64dcc74b4cbdf4270776c2adab0", + "text": "A number grandchild", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": 
null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "9e0342a8c3a010f7802d874fa447f72b", + "text": "great", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "240e4a3a9b5843192b03086325da2169", + "text": "super great", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "d1e6a3da60ba834365b2230689c4d8a6", + "text": "with test text", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "db78c6b732dc265e380889e394c6354f", + "text": "Bullet one", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + 
"date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "f31b201c44870108f395a238bff36413", + "text": "A child bullet", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "5929608d0a4d2f055635bbab72df26ec", + "text": "A grandchild bullet", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "1e93d6f8cf7c8af51ddf222be77b4882", + "text": "great", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "ListItem", + "element_id": "c53244024b7b1e86b20bcc1489d9dc4a", + "text": "super great", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": 
"ListItem", + "element_id": "3602b0a8a126be064654623590163f49", + "text": "Bullet two", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "27d5b17e90250d77a76da1f6d93f8e8b", + "text": "I quote myself testings Notion", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "8831856d3670d91d6fa2121af0694022", + "text": "https://www.notion.so/icons/airplane_brown.svg I call this out", + "metadata": { + "link_texts": [ + "https://www.notion.so/icons/airplane_brown.svg" + ], + "link_urls": [ + "https://www.notion.so/icons/airplane_brown.svg" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "df59e087da5910b2cb1c98801bb24c85", + "text": "https://www.wikipedia.org/", + "metadata": { + "link_texts": [ + "https://www.wikipedia.org/" + ], + "link_urls": [ + "https://www.wikipedia.org/" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": 
"c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "0f215d56b4a1fc900dc2dad40b7df66f", + "text": "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk", + "metadata": { + "link_texts": [ + "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" + ], + "link_urls": [ + "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "5da75c186c36d3117e60f08d49e66085", + "text": "Child Database:", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "a82757a2b9004569ab1761d061847bd3", + "text": "Analytics", + "metadata": { + "link_texts": [ + "Analytics" + ], + "link_urls": [ + "https://www.notion.so/d1fad658f1cf4eedb0b5ee72b9f0b530" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": 
"2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "29a6be22a8770f106f54f4abcdc1de68", + "text": "Child Page:", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "d07d54a1ce286a7679952d4e4ce82c8e", + "text": "Untitled", + "metadata": { + "link_texts": [ + "Untitled" + ], + "link_urls": [ + "https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "d4c02f5b35a00e87ef7be603d82c5df3", + "text": "s = \"this is some code\"", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "59aab31c8b60641b906a81db51c596a6", + "text": "This is my code caption", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + 
"date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "7fc741d4226b15a910af95ff3fde6253", + "text": "This is some text", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "f67f0aef4f1ceb0fa98491872aa741ac", + "text": "This is text in next column", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "f08a88064f2c33164502652db93fad32", + "text": "Final text in column", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "fa3e9d761730605036aaf854d9edd5b4", + "text": "Heading 1 content", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + 
"filesize_bytes": 6342 + } + } + }, + { + "type": "UncategorizedText", + "element_id": "c087a92c7251ca836ff023d35cb0a1aa", + "text": "d3d87fc6-61cc-4bb5-89ed-e9dff0df1526", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "3126a68fa0a12481ca6dc64c16511a7e", + "text": "Stuff todo", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "Title", + "element_id": "8cfa5b216c8d3f774f8e1def029681e6", + "text": "more stuff todo", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": "b538abdbf0aff3f9f1ab11d79bb5bc26", + "text": "More things to do", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + }, + { + "type": "NarrativeText", + "element_id": 
"570c50d8758c5639a1dfd0f238f609d5", + "text": "Something to do", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z", + "permissions_data": null, + "filesize_bytes": 6342 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json b/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json deleted file mode 100644 index 4daf27cc8..000000000 --- a/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json +++ /dev/null @@ -1,463 +0,0 @@ -[ - { - "element_id": "cd153f73463db45ea02bd9ba6ce4168e", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Page with every block", - "type": "Title" - }, - { - "element_id": "098442d39ccc8a9731627be8a843d02a", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Tag pages to let collaborators know what they can expect to use the page for. 
You can add one or many tags to any page in a wiki.", - "type": "NarrativeText" - }, - { - "element_id": "868a2b2294814990d664cf13ffd1e2a7", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Heading 2", - "type": "Title" - }, - { - "element_id": "af888c9a9a14c9c6616cf54ac230c20a", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "This is some new text", - "type": "NarrativeText" - }, - { - "element_id": "99388232115e119009419bd8b07c93b9", - "metadata": { - "emphasized_text_contents": [ - "formatted" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "text" - ], - "link_urls": [ - "/9ba4d6da8a574cfc81ebceac1fde52bd" - ] - }, - "text": "Some/less → more formatted text with other content and stuff 2023-08-07 : @Roman Isecke", - "type": "UncategorizedText" - }, - { - "element_id": "91b9abcc226cbe676d827950030c6702", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "text_as_html": "
column 1column 2pages
c1r1 contentc2r1 table
2023-08-08T09:00:00.000-04:00
cell
Page with every block
c1r2 more contentc2r2 table cellUntitled
this is some green textthis is an equationUntitled
text1 text2 Multiline cellAnother cellUntitled
" - }, - "text": "column 1 column 2 pages c1r1 content c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell Page with every block c1r2 more content c2r2 table cell Untitled this is some green text this is an equation Untitled text1 text2 Multiline cell Another cell Untitled", - "type": "Table" - }, - { - "element_id": "0b73b1397f01db39dc98a983bd3aeb3d", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "E = {mc^2}", - "type": "UncategorizedText" - }, - { - "element_id": "7535c23e3c0bda50ea38df65f7a64bca", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Numbered list", - "type": "ListItem" - }, - { - "element_id": "155061ede32096c81085eabf421f9fe0", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "A number child", - "type": "ListItem" - }, - { - "element_id": "1ff4a64dcc74b4cbdf4270776c2adab0", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "A number grandchild", - "type": "ListItem" - }, - { - "element_id": "9e0342a8c3a010f7802d874fa447f72b", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "great", - "type": "ListItem" - }, - { - "element_id": "240e4a3a9b5843192b03086325da2169", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "super great", - "type": "ListItem" - }, - { - "element_id": "d1e6a3da60ba834365b2230689c4d8a6", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "with test text", - "type": "ListItem" - }, - { - "element_id": "db78c6b732dc265e380889e394c6354f", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Bullet one", - "type": "ListItem" - }, - { - "element_id": "f31b201c44870108f395a238bff36413", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "A child bullet", - "type": "ListItem" - }, - { - "element_id": 
"5929608d0a4d2f055635bbab72df26ec", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "A grandchild bullet", - "type": "ListItem" - }, - { - "element_id": "1e93d6f8cf7c8af51ddf222be77b4882", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "great", - "type": "ListItem" - }, - { - "element_id": "c53244024b7b1e86b20bcc1489d9dc4a", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "super great", - "type": "ListItem" - }, - { - "element_id": "3602b0a8a126be064654623590163f49", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Bullet two", - "type": "ListItem" - }, - { - "element_id": "27d5b17e90250d77a76da1f6d93f8e8b", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "I quote myself testings Notion", - "type": "NarrativeText" - }, - { - "element_id": "8831856d3670d91d6fa2121af0694022", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "https://www.notion.so/icons/airplane_brown.svg" - ], - "link_urls": [ - "https://www.notion.so/icons/airplane_brown.svg" - ] - }, - "text": "https://www.notion.so/icons/airplane_brown.svg I call this out", - "type": "NarrativeText" - }, - { - "element_id": "df59e087da5910b2cb1c98801bb24c85", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "https://www.wikipedia.org/" - ], - "link_urls": [ - "https://www.wikipedia.org/" - ] - }, - "text": "https://www.wikipedia.org/", - "type": "Title" - }, - { - "element_id": "0f215d56b4a1fc900dc2dad40b7df66f", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" - ], - "link_urls": [ - 
"https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" - ] - }, - "text": "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk", - "type": "Title" - }, - { - "element_id": "5da75c186c36d3117e60f08d49e66085", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Child Database:", - "type": "Title" - }, - { - "element_id": "a82757a2b9004569ab1761d061847bd3", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "Analytics" - ], - "link_urls": [ - "https://www.notion.so/d1fad658f1cf4eedb0b5ee72b9f0b530" - ] - }, - "text": "Analytics", - "type": "Title" - }, - { - "element_id": "29a6be22a8770f106f54f4abcdc1de68", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Child Page:", - "type": "Title" - }, - { - "element_id": "d07d54a1ce286a7679952d4e4ce82c8e", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "Untitled" - ], - "link_urls": [ - "https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd" - ] - }, - "text": "Untitled", - "type": "Title" - }, - { - "element_id": "d4c02f5b35a00e87ef7be603d82c5df3", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "s = \"this is some code\"", - "type": "NarrativeText" - }, - { - "element_id": "59aab31c8b60641b906a81db51c596a6", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "This is my code caption", - "type": "NarrativeText" - }, - { - "element_id": "7fc741d4226b15a910af95ff3fde6253", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "This is some text", - "type": "NarrativeText" - }, - { - "element_id": "f67f0aef4f1ceb0fa98491872aa741ac", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "This is text in next column", - "type": 
"NarrativeText" - }, - { - "element_id": "f08a88064f2c33164502652db93fad32", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Final text in column", - "type": "Title" - }, - { - "element_id": "fa3e9d761730605036aaf854d9edd5b4", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Heading 1 content", - "type": "NarrativeText" - }, - { - "element_id": "c087a92c7251ca836ff023d35cb0a1aa", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "d3d87fc6-61cc-4bb5-89ed-e9dff0df1526", - "type": "UncategorizedText" - }, - { - "element_id": "3126a68fa0a12481ca6dc64c16511a7e", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Stuff todo", - "type": "Title" - }, - { - "element_id": "8cfa5b216c8d3f774f8e1def029681e6", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "more stuff todo", - "type": "Title" - }, - { - "element_id": "b538abdbf0aff3f9f1ab11d79bb5bc26", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "More things to do", - "type": "NarrativeText" - }, - { - "element_id": "570c50d8758c5639a1dfd0f238f609d5", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Something to do", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.html.json b/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.html.json new file mode 100644 index 000000000..e6bcbf67a --- /dev/null +++ b/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.html.json @@ -0,0 +1,46 @@ +[ + { + "type": "Title", + "element_id": "ee4edbe949900c6988a62505a9325d47", + "text": "Corporate Travel", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + 
"version": null, + "record_locator": { + "page_id": "dfcbe584-30b1-4551-b533-e6a5759af842" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 419 + } + } + }, + { + "type": "NarrativeText", + "element_id": "756651f18284432aa247200d0bc0cc62", + "text": "Notion Tip: A corporate travel policy is crucial for controlling costs, ensuring compliance, and guaranteeing the safety of employees when traveling for the company.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "dfcbe584-30b1-4551-b533-e6a5759af842" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 419 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json b/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json deleted file mode 100644 index b51260990..000000000 --- a/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "ee4edbe949900c6988a62505a9325d47", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Corporate Travel", - "type": "Title" - }, - { - "element_id": "756651f18284432aa247200d0bc0cc62", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: A corporate travel policy is crucial for controlling costs, ensuring compliance, and guaranteeing the safety of employees when traveling for the company.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.html.json 
b/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.html.json new file mode 100644 index 000000000..64e8d5506 --- /dev/null +++ b/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.html.json @@ -0,0 +1,46 @@ +[ + { + "type": "Title", + "element_id": "12f0e8957240cb6d2bedffde59586918", + "text": "Benefits Policies", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "feccfcd4-8ca0-4638-8212-1a5726461029" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 423 + } + } + }, + { + "type": "NarrativeText", + "element_id": "3e394812bcc3403068dc1d92a42271ce", + "text": "Notion Tip: Benefits policies can attract and retain employees, promote well-being, create positive culture, differentiate from competitors, and increase morale and satisfaction.", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "feccfcd4-8ca0-4638-8212-1a5726461029" + }, + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z", + "permissions_data": null, + "filesize_bytes": 423 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json b/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json deleted file mode 100644 index 5a6a8a45a..000000000 --- a/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "12f0e8957240cb6d2bedffde59586918", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Benefits Policies", - "type": "Title" - }, - { - "element_id": 
"3e394812bcc3403068dc1d92a42271ce", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Benefits policies can attract and retain employees, promote well-being, create positive culture, differentiate from competitors, and increase morale and satisfaction.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.html.json b/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.html.json new file mode 100644 index 000000000..5b046d30d --- /dev/null +++ b/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.html.json @@ -0,0 +1,52 @@ +[ + { + "type": "Title", + "element_id": "665e346acfccd4fb6110bcd1a2e36155", + "text": "Sprint 1", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "fee2149e-6240-4431-8e98-a04a2e460a66" + }, + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z", + "permissions_data": null, + "filesize_bytes": 214 + } + } + }, + { + "type": "NarrativeText", + "element_id": "cfa5ea5800f7a2510d64c98b58742e45", + "text": "Planning notes", + "metadata": { + "emphasized_text_contents": [ + "Planning notes" + ], + "emphasized_text_tags": [ + "b" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": null, + "version": null, + "record_locator": { + "page_id": "fee2149e-6240-4431-8e98-a04a2e460a66" + }, + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z", + "permissions_data": null, + "filesize_bytes": 214 + } + } + } +] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json b/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json deleted file 
mode 100644 index f69514cd9..000000000 --- a/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "element_id": "665e346acfccd4fb6110bcd1a2e36155", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Sprint 1", - "type": "Title" - }, - { - "element_id": "cfa5ea5800f7a2510d64c98b58742e45", - "metadata": { - "emphasized_text_contents": [ - "Planning notes" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Planning notes", - "type": "NarrativeText" - } -] \ No newline at end of file From dba56b36bb76426713871c51d0e7466553beb157 Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Mon, 30 Dec 2024 15:04:21 -0500 Subject: [PATCH 44/48] make sure the recursive child block getter to point at the next page / cursor --- .../v2/processes/connectors/notion/client.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index 8fb769e90..97eb48a24 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -77,11 +77,12 @@ def iterate_list( block_id: str, **kwargs: Any, ) -> Generator[List[Block], None, None]: + next_cursor = None while True: response: dict = ( - self.retry_handler(super().list, block_id=block_id, **kwargs) + self.retry_handler(super().list, block_id=block_id, start_cursor=next_cursor, **kwargs) if self.retry_handler - else super().list(block_id=block_id, **kwargs) + else super().list(block_id=block_id, start_cursor=next_cursor, **kwargs) ) # type: ignore child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])] yield child_blocks @@ -148,11 +149,12 @@ def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]: return pages, resp 
def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]: + next_cursor = None while True: response: dict = ( - self.retry_handler(super().query, database_id=database_id, **kwargs) + self.retry_handler(super().query, database_id=database_id, start_cursor=next_cursor, **kwargs) if (self.retry_handler) - else (super().query(database_id=database_id, **kwargs)) + else (super().query(database_id=database_id, start_cursor=next_cursor, **kwargs)) ) # type: ignore pages = [Page.from_dict(data=p) for p in response.pop("results", [])] for p in pages: From 71b0c4477424615fd7b32a4cd2a949d9f985922b Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Wed, 1 Jan 2025 22:55:08 -0500 Subject: [PATCH 45/48] fix syntax --- .../v2/processes/connectors/notion/client.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py index 97eb48a24..f24bacc81 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/client.py +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -80,7 +80,9 @@ def iterate_list( next_cursor = None while True: response: dict = ( - self.retry_handler(super().list, block_id=block_id, start_cursor=next_cursor, **kwargs) + self.retry_handler( + super().list, block_id=block_id, start_cursor=next_cursor, **kwargs + ) if self.retry_handler else super().list(block_id=block_id, start_cursor=next_cursor, **kwargs) ) # type: ignore @@ -152,7 +154,9 @@ def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page] next_cursor = None while True: response: dict = ( - self.retry_handler(super().query, database_id=database_id, start_cursor=next_cursor, **kwargs) + self.retry_handler( + super().query, database_id=database_id, start_cursor=next_cursor, **kwargs + ) if (self.retry_handler) else (super().query(database_id=database_id, start_cursor=next_cursor, **kwargs)) 
) # type: ignore From c0f732579baddc49cd3ac45800870e0ef60cbfa4 Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Sun, 5 Jan 2025 19:33:20 -0500 Subject: [PATCH 46/48] fix block retrieval logic --- .../notion_database/directory_structure.json | 8 +- .../1572c3765a0a80d3a34ac5c0eecd1e88.json | 70 ++-- .../notion_page/directory_structure.json | 8 +- .../1572c3765a0a806299f0dd6999f9e4c7.html | 258 ++++++------ .../1572c3765a0a806299f0dd6999f9e4c7.json | 68 +-- .../v2/processes/connectors/notion/helpers.py | 388 ++++++------------ 6 files changed, 335 insertions(+), 465 deletions(-) diff --git a/test/integration/connectors/expected_results/notion_database/directory_structure.json b/test/integration/connectors/expected_results/notion_database/directory_structure.json index d9b196898..87906fe7d 100644 --- a/test/integration/connectors/expected_results/notion_database/directory_structure.json +++ b/test/integration/connectors/expected_results/notion_database/directory_structure.json @@ -1,5 +1,5 @@ { - "directory_structure": [ - "1572c3765a0a80d3a34ac5c0eecd1e88.html" - ] - } \ No newline at end of file + "directory_structure": [ + "1572c3765a0a80d3a34ac5c0eecd1e88.html" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json b/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json index 0b98bf07b..430d3fbbd 100644 --- a/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json +++ b/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json @@ -1,39 +1,39 @@ { - "identifier": "1572c3765a0a80d3a34ac5c0eecd1e88", - "connector_type": "notion", - "source_identifiers": { - "filename": "1572c3765a0a80d3a34ac5c0eecd1e88.html", - "fullpath": "1572c3765a0a80d3a34ac5c0eecd1e88.html", - "rel_path": 
"1572c3765a0a80d3a34ac5c0eecd1e88.html" + "identifier": "1572c3765a0a80d3a34ac5c0eecd1e88", + "connector_type": "notion", + "source_identifiers": { + "filename": "1572c3765a0a80d3a34ac5c0eecd1e88.html", + "fullpath": "1572c3765a0a80d3a34ac5c0eecd1e88.html", + "rel_path": "1572c3765a0a80d3a34ac5c0eecd1e88.html" + }, + "metadata": { + "url": null, + "version": null, + "record_locator": { + "database_id": "1572c3765a0a80d3a34ac5c0eecd1e88" }, - "metadata": { - "url": null, - "version": null, - "record_locator": { - "database_id": "1572c3765a0a80d3a34ac5c0eecd1e88" - }, - "date_created": "2024-12-09T11:54:00.000Z", - "date_modified": "2024-12-30T15:19:00.000Z", - "date_processed": "1735577045.091182", - "permissions_data": null, - "filesize_bytes": null + "date_created": "2024-12-09T11:54:00.000Z", + "date_modified": "2025-01-05T18:31:00.000Z", + "date_processed": "1736123419.51279", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "created_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" }, - "additional_metadata": { - "created_by": { - "id": "118d872b-594c-8171-b46f-00020d10d8b2", - "object": "user" - }, - "last_edited_by": { - "id": "118d872b-594c-8171-b46f-00020d10d8b2", - "object": "user" - }, - "parent": { - "page_id": "1572c376-5a0a-80d8-9619-cb35a622b8cc", - "type": "page_id" - }, - "url": "https://www.notion.so/1572c3765a0a80d3a34ac5c0eecd1e88" + "last_edited_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" }, - "reprocess": false, - "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmpnqb7824d/1572c3765a0a80d3a34ac5c0eecd1e88.html", - "display_name": null - } \ No newline at end of file + "parent": { + "page_id": "1572c376-5a0a-80d8-9619-cb35a622b8cc", + "type": "page_id" + }, + "url": "https://www.notion.so/1572c3765a0a80d3a34ac5c0eecd1e88" + }, + "reprocess": false, + "local_download_path": 
"/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmp_lvvqhyy/1572c3765a0a80d3a34ac5c0eecd1e88.html", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_page/directory_structure.json b/test/integration/connectors/expected_results/notion_page/directory_structure.json index f96f3ca8f..9d7654273 100644 --- a/test/integration/connectors/expected_results/notion_page/directory_structure.json +++ b/test/integration/connectors/expected_results/notion_page/directory_structure.json @@ -1,5 +1,5 @@ { - "directory_structure": [ - "1572c3765a0a806299f0dd6999f9e4c7.html" - ] - } \ No newline at end of file + "directory_structure": [ + "1572c3765a0a806299f0dd6999f9e4c7.html" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html b/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html index b6797d7b0..792e82426 100644 --- a/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html +++ b/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html @@ -8,136 +8,142 @@

test-doc1

-
- testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 -
-
- testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 -
- -
    -
  1. - Testdoc2 List Item 1 -
  2. -
      -
    1. - Testdoc2 List Item 1 Nested Item A +
      +
      + testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 +
      +
      + testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 +
      + +
      +
        +
      1. + Testdoc2 List Item 1 +
      2. +
      +
        +
      1. + Testdoc2 List Item 1 Nested Item A +
      2. +
      3. + Testdoc2 List Item 1 Nested Item B +
      4. +
      +
      +
        +
      1. + Testdoc2 List Item 2
      2. -
      3. - Testdoc2 List Item 1 Nested Item B +
      4. + Testdoc2 List Item 3
      -
    2. - Testdoc2 List Item 2 -
    3. -
    4. - Testdoc2 List Item 3 -
    5. -
    -
    -
    - - -
    -
    - Testdoc2 Checklist Item 1 -
    -
    - - -
    -
    - Testdoc2 Checklist Item 2 (checked) -
    -
    - -
    - - -
    - - Testdoc2 bold text - -
    -
    - - Testdoc2 italic text - -
    -
    - - Testdoc2 Heading 1 Sized Text - -
    -
    - - Testdoc2 Heading 2 Sized Text - -
    -
    - Testdoc2 Heading 3 Sized Text -
    -
    - Testdoc2 Heading 4 Sized Text -
    -
    - Testdoc2 Heading 5 Sized Text +
    +
    + + +
    +
    + Testdoc2 Checklist Item 1 +
    +
    + + +
    +
    + Testdoc2 Checklist Item 2 (checked) +
    +
    + +
    + + +
    + + Testdoc2 bold text + +
    +
    + + Testdoc2 italic text + +
    +
    + + Testdoc2 Heading 1 Sized Text + +
    +
    + + Testdoc2 Heading 2 Sized Text + +
    +
    + Testdoc2 Heading 3 Sized Text +
    +
    + Testdoc2 Heading 4 Sized Text +
    +
    + Testdoc2 Heading 5 Sized Text +
    + + + + + + + + + + + + + + + + +
    + + Testdoc2 Table: Column 1 Row 0 + + + + Testdoc2 Table: Column 2 Row 0 + + + + Testdoc2 Table: Column 3 Row 0 + +
    + + Testdoc2 Table: Column 1 Row 1 + + + + Testdoc2 Table: Column 2 Row 1 + + + + Testdoc2 Table: Column 3 Row 1 + +
    + + Testdoc2 Table: Column 1 Row 2 + + + + Testdoc2 Table: Column 2 Row 2 + + + + Testdoc2 Table: Column 3 Row 2 + +
    + +
    - - - - - - - - - - - - - - - - -
    - - Testdoc2 Table: Column 1 Row 0 - - - - Testdoc2 Table: Column 2 Row 0 - - - - Testdoc2 Table: Column 3 Row 0 - -
    - - Testdoc2 Table: Column 1 Row 1 - - - - Testdoc2 Table: Column 2 Row 1 - - - - Testdoc2 Table: Column 3 Row 1 - -
    - - Testdoc2 Table: Column 1 Row 2 - - - - Testdoc2 Table: Column 2 Row 2 - - - - Testdoc2 Table: Column 3 Row 2 - -
    - -
    diff --git a/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json b/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json index e58937d7e..bad4851cd 100644 --- a/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json +++ b/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json @@ -1,39 +1,39 @@ { - "identifier": "1572c3765a0a806299f0dd6999f9e4c7", - "connector_type": "notion", - "source_identifiers": { - "filename": "1572c3765a0a806299f0dd6999f9e4c7.html", - "fullpath": "1572c3765a0a806299f0dd6999f9e4c7.html", - "rel_path": "1572c3765a0a806299f0dd6999f9e4c7.html" + "identifier": "1572c3765a0a806299f0dd6999f9e4c7", + "connector_type": "notion", + "source_identifiers": { + "filename": "1572c3765a0a806299f0dd6999f9e4c7.html", + "fullpath": "1572c3765a0a806299f0dd6999f9e4c7.html", + "rel_path": "1572c3765a0a806299f0dd6999f9e4c7.html" + }, + "metadata": { + "url": null, + "version": null, + "record_locator": { + "page_id": "1572c3765a0a806299f0dd6999f9e4c7" }, - "metadata": { - "url": null, - "version": null, - "record_locator": { - "page_id": "1572c3765a0a806299f0dd6999f9e4c7" - }, - "date_created": "2024-12-09T18: 13: 00.000Z", - "date_modified": "2024-12-24T18: 58: 00.000Z", - "date_processed": "1735078568.778562", - "permissions_data": null, - "filesize_bytes": null + "date_created": "2024-12-09T18:13:00.000Z", + "date_modified": "2024-12-30T15:16:00.000Z", + "date_processed": "1736123422.122014", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "created_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" }, - "additional_metadata": { - "created_by": { - "id": "118d872b-594c-8171-b46f-00020d10d8b2", - "object": "user" - }, - "last_edited_by": { - "id": "118d872b-594c-8171-b46f-00020d10d8b2", - "object": 
"user" - }, - "parent": { - "page_id": "1182c376-5a0a-8042-9a2a-fb003e00d57b", - "type": "page_id" - }, - "url": "https://www.notion.so/test-doc1-1572c3765a0a806299f0dd6999f9e4c7" + "last_edited_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" }, - "reprocess": false, - "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmpw56i_s_f/1572c3765a0a806299f0dd6999f9e4c7", - "display_name": null + "parent": { + "page_id": "1182c376-5a0a-8042-9a2a-fb003e00d57b", + "type": "page_id" + }, + "url": "https://www.notion.so/test-doc1-1572c3765a0a806299f0dd6999f9e4c7" + }, + "reprocess": false, + "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmp59aqv6nt/1572c3765a0a806299f0dd6999f9e4c7.html", + "display_name": null } \ No newline at end of file diff --git a/unstructured_ingest/v2/processes/connectors/notion/helpers.py b/unstructured_ingest/v2/processes/connectors/notion/helpers.py index f1b78ee31..07654ddb3 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/helpers.py +++ b/unstructured_ingest/v2/processes/connectors/notion/helpers.py @@ -5,7 +5,7 @@ from urllib.parse import urlparse from uuid import UUID -from htmlBuilder.attributes import Style, Type +from htmlBuilder.attributes import Style from htmlBuilder.tags import ( Body, Div, @@ -24,18 +24,10 @@ import unstructured_ingest.v2.processes.connectors.notion.types.blocks as notion_blocks from unstructured_ingest.v2.processes.connectors.notion.client import Client -from unstructured_ingest.v2.processes.connectors.notion.interfaces import BlockBase from unstructured_ingest.v2.processes.connectors.notion.types.block import Block from unstructured_ingest.v2.processes.connectors.notion.types.database import Database -@dataclass -class TextExtractionResponse: - text: Optional[str] = None - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - @dataclass class 
HtmlExtractionResponse: html: Optional[HtmlTag] = None @@ -43,92 +35,140 @@ class HtmlExtractionResponse: child_databases: List[str] = field(default_factory=list) +def process_block( + current_block: dict, + parent_page_id: str, + client: Client, + child_pages: list, + child_databases: list, +) -> Tuple[dict, list, list, dict]: + if isinstance(current_block["block"].block, notion_blocks.ChildPage) and current_block[ + "block" + ].id != str(parent_page_id): + child_pages.append(current_block["block"].id) + return {}, child_pages, child_databases + if isinstance(current_block["block"].block, notion_blocks.ChildDatabase): + child_databases.append(current_block["block"].id) + return {}, child_pages, child_databases + + # recursively go through all blocks in a page, store each block in a dictionary + if current_block["block"].has_children: + children = [] + for children_block in client.blocks.children.iterate_list( + block_id=current_block["block"].id + ): + children.extend(children_block) + if children: + for child in children: + child_block = { + "block": child, + "level": current_block["level"] + 1, + "children": [], + "parent_id": current_block["block"].id, + } + child_element, child_pages, child_databases = process_block( + child_block, parent_page_id, client, child_pages, child_databases + ) + current_block["children"].append(child_element) + return current_block, child_pages, child_databases + + +def flush_list(type: str, item_list: list, html: list) -> Tuple[list, list]: + margin_left = 10 * (item_list[-1][1] - 1) + style = Style(f"margin-left: {margin_left}px") + if type == "bulleted_list": + html.append(Ul([style], [item[2] for item in item_list])) + else: + html.append(Ol([style], [item[2] for item in item_list])) + return [], html + + +def build_html( + current_block: dict, bulleted_list: list, numbered_list: list +) -> Tuple[list, list, list]: + html = [] + # extract current block's html + if isinstance(current_block["block"].block, 
notion_blocks.BulletedListItem): + if bulleted_list and current_block["parent_id"] != bulleted_list[-1][0]: + bulleted_list, html = flush_list("bulleted_list", bulleted_list, html) + bulleted_list.append( + (current_block["parent_id"], current_block["level"], current_block["block"].get_html()) + ) + if bulleted_list and current_block["peers_rank"] == current_block["peers_count"] - 1: + bulleted_list, html = flush_list("bulleted_list", bulleted_list, html) + elif isinstance(current_block["block"].block, notion_blocks.NumberedListItem): + if numbered_list and current_block["parent_id"] != numbered_list[-1][0]: + numbered_list, html = flush_list("numbered_list", numbered_list, html) + numbered_list.append( + (current_block["parent_id"], current_block["level"], current_block["block"].get_html()) + ) + if numbered_list and current_block["peers_rank"] == current_block["peers_count"] - 1: + numbered_list, html = flush_list("numbered_list", numbered_list, html) + else: + if bulleted_list: + bulleted_list, html = flush_list("bulleted_list", bulleted_list, html) + if numbered_list: + numbered_list, html = flush_list("numbered_list", numbered_list, html) + if ( + isinstance(current_block["block"].block, notion_blocks.TableRow) + and current_block["peers_rank"] == 0 + ): + current_block["block"].is_header = True + if current_block["block"].get_html(): + html.append(current_block["block"].get_html()) + else: + html.append([]) + # process current block's children + if current_block["children"]: + children_html = [] + for index, child in enumerate(current_block["children"]): + if child: + child["peers_rank"] = index + child["peers_count"] = len(current_block["children"]) + child_html, bulleted_list, numbered_list = build_html( + child, bulleted_list, numbered_list + ) + if child_html: + children_html.append(child_html) + if isinstance(current_block["block"].block, notion_blocks.Column): + html.append( + Div( + [Style(f"width:{100/current_block['peers_count']}%; float: left")], + 
children_html, + ) + ) + elif isinstance(current_block["block"].block, notion_blocks.Table): + html.append(Table([], children_html)) + else: + html.append(Div([], children_html)) + + return html, bulleted_list, numbered_list + + def extract_page_html( client: Client, page_id: str, logger: logging.Logger, ) -> HtmlExtractionResponse: - page_id_uuid = UUID(page_id) - html_elements: List[Tuple[BlockBase, HtmlTag]] = [] + parent_page_id = UUID(page_id) parent_block: Block = client.blocks.retrieve(block_id=page_id) # type: ignore head = None if isinstance(parent_block.block, notion_blocks.ChildPage): head = Head([], Title([], parent_block.block.title)) - child_pages: List[str] = [] - child_databases: List[str] = [] - parents: List[Tuple[int, Block]] = [(0, parent_block)] - processed_block_ids = [] - while len(parents) > 0: - level, parent = parents.pop(0) - parent_html = parent.get_html() - if parent_html: - html_elements.append((parent.block, parent_html)) - logger.debug(f"processing block: {parent}") - if isinstance(parent.block, notion_blocks.ChildPage) and parent.id != str(page_id_uuid): - child_pages.append(parent.id) - continue - if isinstance(parent.block, notion_blocks.ChildDatabase): - child_databases.append(parent.id) - continue - if isinstance(parent.block, notion_blocks.Table): - table_response = build_table(client=client, table=parent) - html_elements.append((parent.block, table_response.table_html)) - child_pages.extend(table_response.child_pages) - child_databases.extend(table_response.child_databases) - continue - if isinstance(parent.block, notion_blocks.ColumnList): - column_html = build_columned_list(client=client, column_parent=parent) - html_elements.append((parent.block, column_html)) - continue - if isinstance(parent.block, notion_blocks.BulletedListItem): - bullet_list_resp = build_bulleted_list_children( - client=client, - bulleted_list_item_parent=parent, - ) - if bullet_list_children := bullet_list_resp.child_list: - 
html_elements.append((parent.block, bullet_list_children)) - continue - if isinstance(parent.block, notion_blocks.NumberedListItem): - numbered_list_resp = build_numbered_list_children( - client=client, - numbered_list_item_parent=parent, - ) - if numbered_list_children := numbered_list_resp.child_list: - html_elements.append((parent.block, numbered_list_children)) - continue - if parent.block.can_have_children() and parent.has_children: - children = [] - for children_block in client.blocks.children.iterate_list( # type: ignore - block_id=parent.id, - ): - children.extend(children_block) - if children: - logger.debug(f"adding {len(children)} children from parent: {parent}") - for child in children: - if child.id not in processed_block_ids: - parents.append((level + 1, child)) - processed_block_ids.append(parent) - - # Join list items - joined_html_elements = [] - numbered_list_items = [] - bullet_list_items = [] - for block, html in html_elements: - if isinstance(block, notion_blocks.BulletedListItem): - bullet_list_items.append(html) - continue - if isinstance(block, notion_blocks.NumberedListItem): - numbered_list_items.append(html) - continue - if len(numbered_list_items) > 0: - joined_html_elements.append(Ol([], numbered_list_items)) - numbered_list_items = [] - if len(bullet_list_items) > 0: - joined_html_elements.append(Ul([], bullet_list_items)) - bullet_list_items = [] - joined_html_elements.append(html) - - body = Body([], joined_html_elements) + current_block = { + "block": parent_block, + "level": 0, + "children": [], + "parent_id": None, + "peers_rank": 0, + "peers_count": 1, + } + logger.debug(f"processing page id: {page_id}") + current_block, child_pages, child_databases = process_block( + current_block, parent_page_id, client, [], [] + ) + html, _, _ = build_html(current_block, [], []) + body = Body([], html) all_elements = [body] if head: all_elements = [head] + all_elements @@ -406,179 +446,3 @@ def is_database_url(client: Client, url: str): return 
False check_resp = client.databases.retrieve_status(database_id=database_uuid) return check_resp == 200 - - -@dataclass -class BuildTableResponse: - table_html: HtmlTag - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - -def build_table(client: Client, table: Block) -> BuildTableResponse: - if not isinstance(table.block, notion_blocks.Table): - raise ValueError(f"block type not table: {type(table.block)}") - rows: List[notion_blocks.TableRow] = [] - child_pages: List[str] = [] - child_databases: List[str] = [] - for row_chunk in client.blocks.children.iterate_list( # type: ignore - block_id=table.id, - ): - rows.extend( - [row.block for row in row_chunk if isinstance(row.block, notion_blocks.TableRow)], - ) - - # Extract child databases and pages - for row in rows: - for c in row.cells: - for rt in c.rich_texts: - if mention := rt.mention: - if mention.type == "page" and (page := mention.page): - child_pages.append(page.id) - if mention.type == "database" and (database := mention.database): - child_databases.append(database.id) - - header: Optional[notion_blocks.TableRow] = None - if table.block.has_column_header: - header = rows.pop(0) - table_html_rows = [] - if header: - header.is_header = True - table_html_rows.append(header.get_html()) - table_html_rows.extend([row.get_html() for row in rows]) - html_table = Table([], table_html_rows) - - return BuildTableResponse( - table_html=html_table, - child_pages=child_pages, - child_databases=child_databases, - ) - - -def build_columned_list(client: Client, column_parent: Block) -> HtmlTag: - if not isinstance(column_parent.block, notion_blocks.ColumnList): - raise ValueError(f"block type not column list: {type(column_parent.block)}") - columns: List[Block] = [] - for column_chunk in client.blocks.children.iterate_list( # type: ignore - block_id=column_parent.id, - ): - columns.extend(column_chunk) - num_columns = len(columns) - columns_content = [] - 
for column in columns: - for column_content_chunk in client.blocks.children.iterate_list( # type: ignore - block_id=column.id, - ): - columns_content.append( - Div( - [Style(f"width:{100/num_columns}%; float: left")], - [content.block.get_html() for content in column_content_chunk], - ), - ) - - return Div([], columns_content) - - -@dataclass -class BulletedListResponse: - html: HtmlTag - child_list: Optional[HtmlTag] = None - - -bulleted_list_styles = ["circle", "square", "disc"] - - -def build_bulleted_list_children( - client: Client, - bulleted_list_item_parent: Block, - list_style_ind: int = 0, -) -> BulletedListResponse: - if not isinstance(bulleted_list_item_parent.block, notion_blocks.BulletedListItem): - raise ValueError( - f"block type not bulleted list item: {type(bulleted_list_item_parent.block)}", - ) - html = bulleted_list_item_parent.get_html() - if html: - html.attributes = [Style("margin-left: 10px")] - if not bulleted_list_item_parent.has_children: - return BulletedListResponse( - html=html, - ) - children = [] - for child_block in client.blocks.children.iterate_list( # type: ignore - block_id=bulleted_list_item_parent.id, - ): - children.extend(child_block) - if not children: - return BulletedListResponse( - html=bulleted_list_item_parent.get_html(), - ) - child_html = [] - for child in children: - child_resp = build_bulleted_list_children( - client=client, - bulleted_list_item_parent=child, - list_style_ind=(list_style_ind + 1) % len(bulleted_list_styles), - ) - child_html.append(child_resp.html) - if child_children := child_resp.child_list: - child_html.append(child_children) - - return BulletedListResponse( - html=html, - child_list=Ul( - [Style(f"list-style-type: {bulleted_list_styles[list_style_ind]}")], - child_html, - ), - ) - - -@dataclass -class NumberedListResponse: - html: HtmlTag - child_list: Optional[HtmlTag] = None - - -numbered_list_types = ["a", "i", "1"] - - -def build_numbered_list_children( - client: Client, - 
numbered_list_item_parent: Block, - type_attr_ind=0, -) -> NumberedListResponse: - if not isinstance(numbered_list_item_parent.block, notion_blocks.NumberedListItem): - raise ValueError( - f"block type not numbered list item: {type(numbered_list_item_parent.block)}", - ) - html = numbered_list_item_parent.get_html() - if html: - html.attributes = [Style("margin-left: 10px")] - if not numbered_list_item_parent.has_children: - return NumberedListResponse( - html=html, - ) - children = [] - for child_block in client.blocks.children.iterate_list( # type: ignore - block_id=numbered_list_item_parent.id, - ): - children.extend(child_block) - if not children: - return NumberedListResponse( - html=numbered_list_item_parent.get_html(), - ) - child_html = [] - for child in children: - child_resp = build_numbered_list_children( - client=client, - numbered_list_item_parent=child, - type_attr_ind=(type_attr_ind + 1) % len(numbered_list_types), - ) - child_html.append(child_resp.html) - if child_children := child_resp.child_list: - child_html.append(child_children) - - return NumberedListResponse( - html=html, - child_list=Ol([Type(numbered_list_types[type_attr_ind])], child_html), - ) From f99982b03f68901c122888abdc6bd43e8487841f Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Mon, 6 Jan 2025 17:39:27 -0500 Subject: [PATCH 47/48] remove unnecessary e2e test for notion connector --- ...122b2c22996b435b9de2ee0e9d2b04bc.html.json | 25 - ...bc49-2e06-4f01-8031-bf283be58a60.html.json | 46 - ...ea53-f2b3-45b4-8638-2212fd054d73.html.json | 46 - ...f29c-799a-4d7b-93ce-b11bcaede531.html.json | 68 -- ...7009-e6b2-47f3-a8ff-f159fd8b69f5.html.json | 46 - ...38f2-26e1-4de7-81e6-354045d4d007.html.json | 46 - ...ee42-2167-441c-af6c-7b2cff268809.html.json | 30 - ...ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json | 46 - ...be3d-cbe0-4e28-ad46-2170d40a8d37.html.json | 52 -- ...2157-721e-4207-b3b7-527762b782c2.html.json | 510 ---------- ...4566-4c7a-488b-ac2a-1292ee507fcb.html.json | 881 
------------------ ...e584-30b1-4551-b533-e6a5759af842.html.json | 46 - ...fcd4-8ca0-4638-8212-1a5726461029.html.json | 46 - ...149e-6240-4431-8e98-a04a2e460a66.html.json | 52 -- test_e2e/src/notion.sh | 48 - 15 files changed, 1988 deletions(-) delete mode 100644 test_e2e/expected-structured-output/notion/122b2c22996b435b9de2ee0e9d2b04bc.html.json delete mode 100644 test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.html.json delete mode 100644 test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.html.json delete mode 100644 test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.html.json delete mode 100644 test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.html.json delete mode 100644 test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.html.json delete mode 100644 test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.html.json delete mode 100644 test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json delete mode 100644 test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.html.json delete mode 100644 test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.html.json delete mode 100644 test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html.json delete mode 100644 test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.html.json delete mode 100644 test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.html.json delete mode 100644 test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.html.json delete mode 100755 test_e2e/src/notion.sh diff --git a/test_e2e/expected-structured-output/notion/122b2c22996b435b9de2ee0e9d2b04bc.html.json b/test_e2e/expected-structured-output/notion/122b2c22996b435b9de2ee0e9d2b04bc.html.json 
deleted file mode 100644 index 321f96a0f..000000000 --- a/test_e2e/expected-structured-output/notion/122b2c22996b435b9de2ee0e9d2b04bc.html.json +++ /dev/null @@ -1,25 +0,0 @@ -[ - { - "type": "Table", - "element_id": "81a89d50e24f9f1a276c15641adff90f", - "text": "Created time Last edited time Owner Page Tags Verification 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke New Page unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Morale Events Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T19:02:00.000Z Roman Isecke New Page With Verification expired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Vacation Policy Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Mission, Vision, Values Vision Company Updates unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Recent Press Company Updates unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Getting Started unverified 2023-08-04T18:31:00.000Z 2023-08-17T18:48:00.000Z Roman Isecke Page with every block Company Updates Policies expired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Corporate Travel Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Benefits Policies Policies unverified", - "metadata": { - "text_as_html": "
    Created timeLast edited timeOwnerPageTagsVerification
    2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeNew Pageunverified
    2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeMorale EventsPoliciesunverified
    2023-08-04T18:31:00.000Z2023-08-04T19:02:00.000ZRoman IseckeNew Page With Verificationexpired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z
    2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeVacation PolicyPoliciesunverified
    2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeMission, Vision, ValuesVision Company Updatesunverified
    2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeRecent PressCompany Updatesunverified
    2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeGetting Startedunverified
    2023-08-04T18:31:00.000Z2023-08-17T18:48:00.000ZRoman IseckePage with every blockCompany Updates Policiesexpired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z
    2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeCorporate TravelPoliciesunverified
    2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeBenefits PoliciesPoliciesunverified
    ", - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "database_id": "122b2c22996b435b9de2ee0e9d2b04bc" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:44:00.000Z", - "permissions_data": null, - "filesize_bytes": 7710 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.html.json b/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.html.json deleted file mode 100644 index 511deafef..000000000 --- a/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.html.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "type": "Title", - "element_id": "b21b7e1a9374c90fad7b4ca0571a9a35", - "text": "New Page", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "438dbc49-2e06-4f01-8031-bf283be58a60" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 457 - } - } - }, - { - "type": "NarrativeText", - "element_id": "6c9a1c66c3f1ef2814be722d6ff431b1", - "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. 
This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "438dbc49-2e06-4f01-8031-bf283be58a60" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 457 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.html.json b/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.html.json deleted file mode 100644 index 9e54c892e..000000000 --- a/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.html.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "type": "Title", - "element_id": "23d6a73618cedf6ecc9f28279cb62421", - "text": "Morale Events", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "4695ea53-f2b3-45b4-8638-2212fd054d73" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 409 - } - } - }, - { - "type": "NarrativeText", - "element_id": "cca3a9ec1c93fe24880b41dd9988d72d", - "text": "Notion Tip: Morale events increase employee satisfaction, motivation, and well-being, while promoting community and teamwork, resulting in higher productivity and retention rates.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "4695ea53-f2b3-45b4-8638-2212fd054d73" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 
409 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.html.json b/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.html.json deleted file mode 100644 index c0a5b682f..000000000 --- a/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.html.json +++ /dev/null @@ -1,68 +0,0 @@ -[ - { - "type": "Title", - "element_id": "67500029518a859dc034db1601bf5fbe", - "text": "New Page With Verification", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "5481f29c-799a-4d7b-93ce-b11bcaede531" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T19:02:00.000Z", - "permissions_data": null, - "filesize_bytes": 765 - } - } - }, - { - "type": "NarrativeText", - "element_id": "49873871ff17a9ffb6b6d4e11f6ea86d", - "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. 
This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "5481f29c-799a-4d7b-93ce-b11bcaede531" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T19:02:00.000Z", - "permissions_data": null, - "filesize_bytes": 765 - } - } - }, - { - "type": "NarrativeText", - "element_id": "d32db2846683d992270e704251ca5c80", - "text": "Notion Tip: An owner of a page can verify it by clicking on the verification button above and choosing to verify the page for either a set amount of time or indefinitely!", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "5481f29c-799a-4d7b-93ce-b11bcaede531" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T19:02:00.000Z", - "permissions_data": null, - "filesize_bytes": 765 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.html.json b/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.html.json deleted file mode 100644 index 355f6d5f9..000000000 --- a/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.html.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "type": "Title", - "element_id": "4d5b94a60a5ae180faa4753897afbc5f", - "text": "Vacation Policy", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "60377009-e6b2-47f3-a8ff-f159fd8b69f5" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 393 - 
} - } - }, - { - "type": "NarrativeText", - "element_id": "e3b7316f50c3edad4ea72b199ad6b7d9", - "text": "Notion Tip: Vacation policies are crucial for employee well-being and productivity. They provide rest and recharge, reduce burnout and increase job satisfaction.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "60377009-e6b2-47f3-a8ff-f159fd8b69f5" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 393 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.html.json b/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.html.json deleted file mode 100644 index 70aa87203..000000000 --- a/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.html.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "type": "Title", - "element_id": "52c2888160339820dfa8bb604c031ee9", - "text": "Mission, Vision, Values", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "898538f2-26e1-4de7-81e6-354045d4d007" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 462 - } - } - }, - { - "type": "NarrativeText", - "element_id": "be9ab17406409efa59dc98966370b9e7", - "text": "Notion Tip: A company mission provides direction and purpose, aligning actions and decisions towards a common goal. 
It also helps attract like-minded individuals who share the same values and vision for the company.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "898538f2-26e1-4de7-81e6-354045d4d007" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 462 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.html.json b/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.html.json deleted file mode 100644 index 0d66e9cc9..000000000 --- a/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.html.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "type": "NarrativeText", - "element_id": "5bcc7126851f18b3a41c951030def658", - "text": "Planning notes", - "metadata": { - "emphasized_text_contents": [ - "Planning notes" - ], - "emphasized_text_tags": [ - "b" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "8d8bee42-2167-441c-af6c-7b2cff268809" - }, - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-08T19:17:00.000Z", - "permissions_data": null, - "filesize_bytes": 198 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json b/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json deleted file mode 100644 index 4f42d6c93..000000000 --- a/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.html.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "type": "Title", - "element_id": "6cb5211e45401c910bcc00e277092033", - "text": "Recent Press", - "metadata": { - "languages": [ - "eng" - ], - 
"filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "8db7ccc9-0a9c-4168-94c3-f997e60cb8cf" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 390 - } - } - }, - { - "type": "NarrativeText", - "element_id": "d7335f2ec201cb754fc463da124e5970", - "text": "Notion Tip: Telling employees about news about your company is important because it helps them stay informed about the direction of the company and their role in it.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "8db7ccc9-0a9c-4168-94c3-f997e60cb8cf" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 390 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.html.json b/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.html.json deleted file mode 100644 index fa9d2ece8..000000000 --- a/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.html.json +++ /dev/null @@ -1,52 +0,0 @@ -[ - { - "type": "Title", - "element_id": "bfbac21d794d26d6aaa6f71337a632d9", - "text": "Sprint 3", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "9e20be3d-cbe0-4e28-ad46-2170d40a8d37" - }, - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z", - "permissions_data": null, - "filesize_bytes": 214 - } - } - }, - { - "type": "NarrativeText", - "element_id": "e40bd670a8fbd37e3135ea5517c5dddc", - "text": "Planning notes", - "metadata": { - "emphasized_text_contents": [ - "Planning notes" - ], - 
"emphasized_text_tags": [ - "b" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "9e20be3d-cbe0-4e28-ad46-2170d40a8d37" - }, - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z", - "permissions_data": null, - "filesize_bytes": 214 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.html.json b/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.html.json deleted file mode 100644 index 167005d2a..000000000 --- a/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.html.json +++ /dev/null @@ -1,510 +0,0 @@ -[ - { - "type": "Title", - "element_id": "3e43f998d46d9c8315e1abe4f0da9d72", - "text": "Getting Started", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "NarrativeText", - "element_id": "06f9c166ac2e4f5dbb8fb754d833e477", - "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. 
This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "Title", - "element_id": "6cabe4b9a0571c55a80de4b06013ec43", - "text": "The Basics", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "Title", - "element_id": "2005f3cccf27dc851ae57fba48531195", - "text": "Create a Page", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "NarrativeText", - "element_id": "2a7bbcfd5c237889b2fda563db7462cc", - "text": "In your sidebar, click the + that appears next to the word Workspace on hover. A new page will appear. 
Give it a title and start typing like you would in any other document.", - "metadata": { - "emphasized_text_contents": [ - "Workspace" - ], - "emphasized_text_tags": [ - "b" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "Title", - "element_id": "2cd82188a21bdfed4ab1c658180a968b", - "text": "Headings", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "NarrativeText", - "element_id": "64f7ee0a4c1563451c22061bb09d339c", - "text": "You can add headings and subheadings in one of two ways:", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "ListItem", - "element_id": "0f8b41821ecbdad9478f4dbc39b1f2a4", - "text": "Type /heading or /h1 , /h2 , or /h3 to choose the heading size you want.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - 
"filesize_bytes": 3332 - } - } - }, - { - "type": "ListItem", - "element_id": "545568cf4c5fc32d56606019eee27510", - "text": "Use Markdown shortcuts, like # , ## , and ### .", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "ListItem", - "element_id": "be84c348ae4756f33d83d3145dd711af", - "text": "Create inline code by wrapping text with ` (or with the shortcut cmd/ctrl + e ).", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "Title", - "element_id": "c131435fecd2bcd5fe5fef8cb322aa55", - "text": "Toggle Lists", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "NarrativeText", - "element_id": "94ea1bd8465604db79b8f2c29420f5de", - "text": "Toggle lists streamline your content. 
Click the arrow to open.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "Title", - "element_id": "4f733e296d95143a3c49dffc35ba64d0", - "text": "Callout Blocks", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "NarrativeText", - "element_id": "7e96268c98a95ade6f6dceb82fe91d1f", - "text": "Notion Tip: Create a callout block like this by typing /call and pressing enter . Helpful for adding inline instructions, warnings, disclaimers, and tips. 
Change the emoji icon by clicking on it.", - "metadata": { - "emphasized_text_contents": [ - "Notion Tip:" - ], - "emphasized_text_tags": [ - "b" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "Title", - "element_id": "cb33f03c0a3139caeb5607fc4ea55ffd", - "text": "Code Blocks", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "NarrativeText", - "element_id": "534f19a337f8114851ba68a69035da52", - "text": "You can add code notation to any Notion page:", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "NarrativeText", - "element_id": "70a3c8f18e2d5d32db68fe2150a5a72f", - "text": "Hover over this block to see the Copy to Clipboard option!", - "metadata": { - "emphasized_text_contents": [ - "Copy to Clipboard" - ], - "emphasized_text_tags": [ - "b" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": 
"2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "ListItem", - "element_id": "d23aeea612a881e54d4e91b26d795beb", - "text": "Your teammates can select any code to comment on it.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "Title", - "element_id": "49c4ebfe04f72a068e8c2e4545d997ef", - "text": "Organizing Pages", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "NarrativeText", - "element_id": "db092f68e3263f38d6ed1af651e30e6d", - "text": "Instead of using folders, Notion lets you nest pages inside pages. Type /page and press enter to create a sub-page inside a page. 
Like this:", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "Title", - "element_id": "41a38caacb638fa8311b89164cc2cab4", - "text": "Advanced Techniques", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - }, - { - "type": "NarrativeText", - "element_id": "ed1d5c56b71619eca5b877d0e2dc1e10", - "text": "Check out this Notion Editor 101 guide for more advanced tips and how-to's.", - "metadata": { - "link_texts": [ - "Notion Editor 101" - ], - "link_urls": [ - "https://www.notion.so/notion/Notion-editor-101-create-and-edit-68c7c67047494fdb87d50185429df93e" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "b2a12157-721e-4207-b3b7-527762b782c2" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 3332 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html.json b/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html.json deleted file mode 100644 index aaa42d7ec..000000000 --- a/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html.json +++ /dev/null @@ -1,881 +0,0 @@ -[ - { - "type": "Title", - "element_id": 
"cd153f73463db45ea02bd9ba6ce4168e", - "text": "Page with every block", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "098442d39ccc8a9731627be8a843d02a", - "text": "Notion Tip: Tag pages to let collaborators know what they can expect to use the page for. You can add one or many tags to any page in a wiki.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "868a2b2294814990d664cf13ffd1e2a7", - "text": "Heading 2", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "af888c9a9a14c9c6616cf54ac230c20a", - "text": "This is some new text", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": 
"UncategorizedText", - "element_id": "99388232115e119009419bd8b07c93b9", - "text": "Some/less \u2192 more formatted text with other content and stuff 2023-08-07 : @Roman Isecke", - "metadata": { - "emphasized_text_contents": [ - "formatted" - ], - "emphasized_text_tags": [ - "b" - ], - "link_texts": [ - "text" - ], - "link_urls": [ - "/9ba4d6da8a574cfc81ebceac1fde52bd" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Table", - "element_id": "91b9abcc226cbe676d827950030c6702", - "text": "column 1 column 2 pages c1r1 content c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell Page with every block c1r2 more content c2r2 table cell Untitled this is some green text this is an equation Untitled text1 text2 Multiline cell Another cell Untitled", - "metadata": { - "text_as_html": "
    column 1column 2pages
    c1r1 contentc2r1 table
    2023-08-08T09:00:00.000-04:00
    cell
    Page with every block
    c1r2 more contentc2r2 table cellUntitled
    this is some green textthis is an equationUntitled
    text1 text2 Multiline cellAnother cellUntitled
    ", - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "UncategorizedText", - "element_id": "0b73b1397f01db39dc98a983bd3aeb3d", - "text": "E = {mc^2}", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "7535c23e3c0bda50ea38df65f7a64bca", - "text": "Numbered list", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "155061ede32096c81085eabf421f9fe0", - "text": "A number child", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "1ff4a64dcc74b4cbdf4270776c2adab0", - "text": "A number grandchild", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": 
null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "9e0342a8c3a010f7802d874fa447f72b", - "text": "great", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "240e4a3a9b5843192b03086325da2169", - "text": "super great", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "d1e6a3da60ba834365b2230689c4d8a6", - "text": "with test text", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "db78c6b732dc265e380889e394c6354f", - "text": "Bullet one", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - 
"date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "f31b201c44870108f395a238bff36413", - "text": "A child bullet", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "5929608d0a4d2f055635bbab72df26ec", - "text": "A grandchild bullet", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "1e93d6f8cf7c8af51ddf222be77b4882", - "text": "great", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "ListItem", - "element_id": "c53244024b7b1e86b20bcc1489d9dc4a", - "text": "super great", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": 
"ListItem", - "element_id": "3602b0a8a126be064654623590163f49", - "text": "Bullet two", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "27d5b17e90250d77a76da1f6d93f8e8b", - "text": "I quote myself testings Notion", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "8831856d3670d91d6fa2121af0694022", - "text": "https://www.notion.so/icons/airplane_brown.svg I call this out", - "metadata": { - "link_texts": [ - "https://www.notion.so/icons/airplane_brown.svg" - ], - "link_urls": [ - "https://www.notion.so/icons/airplane_brown.svg" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "df59e087da5910b2cb1c98801bb24c85", - "text": "https://www.wikipedia.org/", - "metadata": { - "link_texts": [ - "https://www.wikipedia.org/" - ], - "link_urls": [ - "https://www.wikipedia.org/" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": 
"c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "0f215d56b4a1fc900dc2dad40b7df66f", - "text": "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk", - "metadata": { - "link_texts": [ - "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" - ], - "link_urls": [ - "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "5da75c186c36d3117e60f08d49e66085", - "text": "Child Database:", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "a82757a2b9004569ab1761d061847bd3", - "text": "Analytics", - "metadata": { - "link_texts": [ - "Analytics" - ], - "link_urls": [ - "https://www.notion.so/d1fad658f1cf4eedb0b5ee72b9f0b530" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": 
"2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "29a6be22a8770f106f54f4abcdc1de68", - "text": "Child Page:", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "d07d54a1ce286a7679952d4e4ce82c8e", - "text": "Untitled", - "metadata": { - "link_texts": [ - "Untitled" - ], - "link_urls": [ - "https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "d4c02f5b35a00e87ef7be603d82c5df3", - "text": "s = \"this is some code\"", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "59aab31c8b60641b906a81db51c596a6", - "text": "This is my code caption", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - 
"date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "7fc741d4226b15a910af95ff3fde6253", - "text": "This is some text", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "f67f0aef4f1ceb0fa98491872aa741ac", - "text": "This is text in next column", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "f08a88064f2c33164502652db93fad32", - "text": "Final text in column", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "fa3e9d761730605036aaf854d9edd5b4", - "text": "Heading 1 content", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - 
"filesize_bytes": 6342 - } - } - }, - { - "type": "UncategorizedText", - "element_id": "c087a92c7251ca836ff023d35cb0a1aa", - "text": "d3d87fc6-61cc-4bb5-89ed-e9dff0df1526", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "3126a68fa0a12481ca6dc64c16511a7e", - "text": "Stuff todo", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "Title", - "element_id": "8cfa5b216c8d3f774f8e1def029681e6", - "text": "more stuff todo", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": "b538abdbf0aff3f9f1ab11d79bb5bc26", - "text": "More things to do", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - }, - { - "type": "NarrativeText", - "element_id": 
"570c50d8758c5639a1dfd0f238f609d5", - "text": "Something to do", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "c47a4566-4c7a-488b-ac2a-1292ee507fcb" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-17T18:48:00.000Z", - "permissions_data": null, - "filesize_bytes": 6342 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.html.json b/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.html.json deleted file mode 100644 index e6bcbf67a..000000000 --- a/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.html.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "type": "Title", - "element_id": "ee4edbe949900c6988a62505a9325d47", - "text": "Corporate Travel", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "dfcbe584-30b1-4551-b533-e6a5759af842" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 419 - } - } - }, - { - "type": "NarrativeText", - "element_id": "756651f18284432aa247200d0bc0cc62", - "text": "Notion Tip: A corporate travel policy is crucial for controlling costs, ensuring compliance, and guaranteeing the safety of employees when traveling for the company.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "dfcbe584-30b1-4551-b533-e6a5759af842" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 419 - } - } - } -] \ No newline at end of file diff --git 
a/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.html.json b/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.html.json deleted file mode 100644 index 64e8d5506..000000000 --- a/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.html.json +++ /dev/null @@ -1,46 +0,0 @@ -[ - { - "type": "Title", - "element_id": "12f0e8957240cb6d2bedffde59586918", - "text": "Benefits Policies", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "feccfcd4-8ca0-4638-8212-1a5726461029" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 423 - } - } - }, - { - "type": "NarrativeText", - "element_id": "3e394812bcc3403068dc1d92a42271ce", - "text": "Notion Tip: Benefits policies can attract and retain employees, promote well-being, create positive culture, differentiate from competitors, and increase morale and satisfaction.", - "metadata": { - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "feccfcd4-8ca0-4638-8212-1a5726461029" - }, - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z", - "permissions_data": null, - "filesize_bytes": 423 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.html.json b/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.html.json deleted file mode 100644 index 5b046d30d..000000000 --- a/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.html.json +++ /dev/null @@ -1,52 +0,0 @@ -[ - { - "type": "Title", - "element_id": "665e346acfccd4fb6110bcd1a2e36155", - "text": "Sprint 1", - "metadata": { - 
"languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "fee2149e-6240-4431-8e98-a04a2e460a66" - }, - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z", - "permissions_data": null, - "filesize_bytes": 214 - } - } - }, - { - "type": "NarrativeText", - "element_id": "cfa5ea5800f7a2510d64c98b58742e45", - "text": "Planning notes", - "metadata": { - "emphasized_text_contents": [ - "Planning notes" - ], - "emphasized_text_tags": [ - "b" - ], - "languages": [ - "eng" - ], - "filetype": "text/html", - "data_source": { - "url": null, - "version": null, - "record_locator": { - "page_id": "fee2149e-6240-4431-8e98-a04a2e460a66" - }, - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z", - "permissions_data": null, - "filesize_bytes": 214 - } - } - } -] \ No newline at end of file diff --git a/test_e2e/src/notion.sh b/test_e2e/src/notion.sh deleted file mode 100755 index 2c41ba837..000000000 --- a/test_e2e/src/notion.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=notion -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -CI=${CI:-"false"} - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup() { - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - if [ "$CI" == "true" ]; then - cleanup_dir "$DOWNLOAD_DIR" - fi -} -trap cleanup EXIT - -if [ -z "$NOTION_API_KEY" ]; then - echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." 
- exit 8 -fi - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} -PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ - notion \ - --api-key "$UNS_PAID_API_KEY" \ - --partition-by-api \ - --partition-endpoint "https://api.unstructuredapp.io" \ - --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --download-dir "$DOWNLOAD_DIR" \ - --notion-api-key "$NOTION_API_KEY" \ - --output-dir "$OUTPUT_DIR" \ - --database-ids "122b2c22996b435b9de2ee0e9d2b04bc" \ - --num-processes "$max_processes" \ - --recursive \ - --verbose \ - --work-dir "$WORK_DIR" - -"$SCRIPT_DIR"/check-diff-expected-output.py --output-folder-name $OUTPUT_FOLDER_NAME From f90080a58976dbabfc74997776f40636f5f390b9 Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Tue, 7 Jan 2025 14:28:45 -0500 Subject: [PATCH 48/48] Add more complex integration test --- .../notion_database/directory_structure.json | 2 +- .../1572c3765a0a80d3a34ac5c0eecd1e88.html | 24 -- .../1722c3765a0a8082b382ebc2c62d3f4c.html | 330 ++++++++++++++++++ .../1572c3765a0a80d3a34ac5c0eecd1e88.json | 39 --- .../1722c3765a0a8082b382ebc2c62d3f4c.json | 39 +++ .../1572c3765a0a806299f0dd6999f9e4c7.html | 113 +++++- .../1572c3765a0a806299f0dd6999f9e4c7.json | 6 +- test/integration/connectors/test_notion.py | 2 +- .../types/database_properties/people.py | 1 + .../types/database_properties/select.py | 1 + .../types/database_properties/status.py | 1 + 11 files changed, 481 insertions(+), 77 deletions(-) delete mode 100644 test/integration/connectors/expected_results/notion_database/downloads/1572c3765a0a80d3a34ac5c0eecd1e88.html create mode 100644 test/integration/connectors/expected_results/notion_database/downloads/1722c3765a0a8082b382ebc2c62d3f4c.html delete mode 100644 test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json create mode 100644 
test/integration/connectors/expected_results/notion_database/file_data/1722c3765a0a8082b382ebc2c62d3f4c.json diff --git a/test/integration/connectors/expected_results/notion_database/directory_structure.json b/test/integration/connectors/expected_results/notion_database/directory_structure.json index 87906fe7d..9962865c6 100644 --- a/test/integration/connectors/expected_results/notion_database/directory_structure.json +++ b/test/integration/connectors/expected_results/notion_database/directory_structure.json @@ -1,5 +1,5 @@ { "directory_structure": [ - "1572c3765a0a80d3a34ac5c0eecd1e88.html" + "1722c3765a0a8082b382ebc2c62d3f4c.html" ] } \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_database/downloads/1572c3765a0a80d3a34ac5c0eecd1e88.html b/test/integration/connectors/expected_results/notion_database/downloads/1572c3765a0a80d3a34ac5c0eecd1e88.html deleted file mode 100644 index 7851bbe0b..000000000 --- a/test/integration/connectors/expected_results/notion_database/downloads/1572c3765a0a80d3a34ac5c0eecd1e88.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - - - - -
    - Author - - Item -
    -
    - - test-author - -
    -
    -
    - test-page-in-database -
    -
    diff --git a/test/integration/connectors/expected_results/notion_database/downloads/1722c3765a0a8082b382ebc2c62d3f4c.html b/test/integration/connectors/expected_results/notion_database/downloads/1722c3765a0a8082b382ebc2c62d3f4c.html new file mode 100644 index 000000000..dce7fc3a5 --- /dev/null +++ b/test/integration/connectors/expected_results/notion_database/downloads/1722c3765a0a8082b382ebc2c62d3f4c.html @@ -0,0 +1,330 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + Author + + Email + + Formula + + ID + + Item + + Phone + + Priority Level + + Publication Date + + Status + + Tag + + URL + + Views +
    + + +
    + xyz@abc.com +
    +
    +
    + 12 +
    +
    +
    + None-4 +
    +
    +
    + test-page4-in-database +
    +
    +
    + 1234567890 +
    +
    +
    + High +
    +
    +
    + 2025-01-31 +
    +
    +
    + Not started +
    +
    +
    + + V1 + + + V5 + + + V7 + +
    +
    + + https://abcde.com + + +
    + 6 +
    +
    + + +
    + xyz@abc.com +
    +
    +
    + 90 +
    +
    +
    + None-3 +
    +
    +
    + test-page3-in-database +
    +
    +
    + 1234567890 +
    +
    +
    + Medium +
    +
    +
    + 2025-01-06 +
    +
    +
    + In Review +
    +
    +
    + + V5 + + + V6 + +
    +
    + + https://abcde.com + + +
    + 45 +
    +
    + + +
    + xyz@abc.com +
    +
    +
    + 46 +
    +
    +
    + None-2 +
    +
    +
    + test-page2-in-database +
    +
    +
    + 1234567890 +
    +
    +
    + Low +
    +
    +
    + 2025-01-04 +
    +
    +
    + Done +
    +
    +
    + + V1 + + + V2 + + + V4 + +
    +
    + + https://abcde.com + + +
    + 23 +
    +
    + + +
    + xyz@abc.com +
    +
    +
    + 4 +
    +
    +
    + None-1 +
    +
    +
    + test-page1-in-datab +
    +
    +
    + 1234567890 +
    +
    +
    + High +
    +
    +
    + 2024-12-01 +
    +
    +
    + In progress +
    +
    +
    + + V1 + + + V3 + +
    +
    + + https://abcde.com + + +
    + 2 +
    +
    diff --git a/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json b/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json deleted file mode 100644 index 430d3fbbd..000000000 --- a/test/integration/connectors/expected_results/notion_database/file_data/1572c3765a0a80d3a34ac5c0eecd1e88.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "identifier": "1572c3765a0a80d3a34ac5c0eecd1e88", - "connector_type": "notion", - "source_identifiers": { - "filename": "1572c3765a0a80d3a34ac5c0eecd1e88.html", - "fullpath": "1572c3765a0a80d3a34ac5c0eecd1e88.html", - "rel_path": "1572c3765a0a80d3a34ac5c0eecd1e88.html" - }, - "metadata": { - "url": null, - "version": null, - "record_locator": { - "database_id": "1572c3765a0a80d3a34ac5c0eecd1e88" - }, - "date_created": "2024-12-09T11:54:00.000Z", - "date_modified": "2025-01-05T18:31:00.000Z", - "date_processed": "1736123419.51279", - "permissions_data": null, - "filesize_bytes": null - }, - "additional_metadata": { - "created_by": { - "id": "118d872b-594c-8171-b46f-00020d10d8b2", - "object": "user" - }, - "last_edited_by": { - "id": "118d872b-594c-8171-b46f-00020d10d8b2", - "object": "user" - }, - "parent": { - "page_id": "1572c376-5a0a-80d8-9619-cb35a622b8cc", - "type": "page_id" - }, - "url": "https://www.notion.so/1572c3765a0a80d3a34ac5c0eecd1e88" - }, - "reprocess": false, - "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmp_lvvqhyy/1572c3765a0a80d3a34ac5c0eecd1e88.html", - "display_name": null -} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_database/file_data/1722c3765a0a8082b382ebc2c62d3f4c.json b/test/integration/connectors/expected_results/notion_database/file_data/1722c3765a0a8082b382ebc2c62d3f4c.json new file mode 100644 index 000000000..a49fbf0f2 --- /dev/null +++ 
b/test/integration/connectors/expected_results/notion_database/file_data/1722c3765a0a8082b382ebc2c62d3f4c.json @@ -0,0 +1,39 @@ +{ + "identifier": "1722c3765a0a8082b382ebc2c62d3f4c", + "connector_type": "notion", + "source_identifiers": { + "filename": "1722c3765a0a8082b382ebc2c62d3f4c.html", + "fullpath": "1722c3765a0a8082b382ebc2c62d3f4c.html", + "rel_path": "1722c3765a0a8082b382ebc2c62d3f4c.html" + }, + "metadata": { + "url": null, + "version": null, + "record_locator": { + "database_id": "1722c3765a0a8082b382ebc2c62d3f4c" + }, + "date_created": "2025-01-05T18:34:00.000Z", + "date_modified": "2025-01-07T19:15:00.000Z", + "date_processed": "1736277913.3980532", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "created_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "last_edited_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "parent": { + "type": "workspace", + "workspace": true + }, + "url": "https://www.notion.so/1722c3765a0a8082b382ebc2c62d3f4c" + }, + "reprocess": false, + "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmpxu906ary/1722c3765a0a8082b382ebc2c62d3f4c.html", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html b/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html index 792e82426..6d816a0e7 100644 --- a/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html +++ b/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html @@ -16,7 +16,7 @@
    testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2
    - +
    1. @@ -80,14 +80,9 @@ Testdoc2 Heading 2 Sized Text
    +
    - Testdoc2 Heading 3 Sized Text -
    -
    - Testdoc2 Heading 4 Sized Text -
    -
    - Testdoc2 Heading 5 Sized Text + Table
    @@ -142,7 +137,107 @@
    - + +
    + 2 Columns in ColumnList +
    +
    +
    +
      +
    • + Item 1 +
    • +
    • + Item 2 +
    • +
    +
    + Expandable Heading +
    +
      +
    • + First child item +
    • +
    • + Second child item +
    • +
    +
    +
      +
    1. + First item in Numbered list +
    2. +
    3. + Second item in Numbered list +
    4. +
    +
    +
    +
    +
      +
    • + Expandable Section +
    • +
    +
      +
    • + Child item 1 +
    • +
    • + Child item 2 +
    • +
    +
    +
    +
    + Column list with indented items +
    +
      +
    • + First level item +
    • +
    +
    +
      +
    • + Second level item +
    • +
    +
    +
      +
    • + Third level item +
    • +
    +
    +
    +
    +
    +
    +
    +

    + 💡 +

    + this is a Callout block +
    +
    + this is a Quote block +
    +
    +
    +
    + + this is a Code block + +
    + +
    + this()is()a()block()equation. +

    diff --git a/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json b/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json index bad4851cd..88e9b69dc 100644 --- a/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json +++ b/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json @@ -13,8 +13,8 @@ "page_id": "1572c3765a0a806299f0dd6999f9e4c7" }, "date_created": "2024-12-09T18:13:00.000Z", - "date_modified": "2024-12-30T15:16:00.000Z", - "date_processed": "1736123422.122014", + "date_modified": "2025-01-07T19:24:00.000Z", + "date_processed": "1736277919.434568", "permissions_data": null, "filesize_bytes": null }, @@ -34,6 +34,6 @@ "url": "https://www.notion.so/test-doc1-1572c3765a0a806299f0dd6999f9e4c7" }, "reprocess": false, - "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmp59aqv6nt/1572c3765a0a806299f0dd6999f9e4c7.html", + "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmpluf__jry/1572c3765a0a806299f0dd6999f9e4c7.html", "display_name": null } \ No newline at end of file diff --git a/test/integration/connectors/test_notion.py b/test/integration/connectors/test_notion.py index d49231034..aabc5cf5c 100644 --- a/test/integration/connectors/test_notion.py +++ b/test/integration/connectors/test_notion.py @@ -27,7 +27,7 @@ def test_notion_source_database(temp_dir): access_config=access_config, ) index_config = NotionIndexerConfig( - database_ids=["1572c3765a0a80d3a34ac5c0eecd1e88"], recursive=False + database_ids=["1722c3765a0a8082b382ebc2c62d3f4c"], recursive=False ) download_config = NotionDownloaderConfig(download_dir=temp_dir) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py 
b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py index 517fc082b..037822208 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py @@ -12,6 +12,7 @@ class People(DBPropertyBase): id: str name: str + description: Optional[str] = None type: str = "people" people: dict = field(default_factory=dict) diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py index 5d504b478..45ce681b7 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py @@ -17,6 +17,7 @@ class SelectOption(FromJSONMixin): color: str id: str name: str + description: Optional[str] = None @classmethod def from_dict(cls, data: dict): diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py index 8f6f5001d..1b1372098 100644 --- a/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +++ b/unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py @@ -17,6 +17,7 @@ class StatusOption(FromJSONMixin): color: str id: str name: str + description: Optional[str] = None @classmethod def from_dict(cls, data: dict):