diff --git a/CHANGELOG.md b/CHANGELOG.md index 1db625544..0b33500a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements +* **Migrate Notion Source Connector to V2** * **Migrate Vectara Destination Connector to v2** * **Added Redis destination connector** * **Improved Milvus error handling** diff --git a/test/integration/connectors/expected_results/notion_database/directory_structure.json b/test/integration/connectors/expected_results/notion_database/directory_structure.json new file mode 100644 index 000000000..9962865c6 --- /dev/null +++ b/test/integration/connectors/expected_results/notion_database/directory_structure.json @@ -0,0 +1,5 @@ +{ + "directory_structure": [ + "1722c3765a0a8082b382ebc2c62d3f4c.html" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_database/downloads/1722c3765a0a8082b382ebc2c62d3f4c.html b/test/integration/connectors/expected_results/notion_database/downloads/1722c3765a0a8082b382ebc2c62d3f4c.html new file mode 100644 index 000000000..dce7fc3a5 --- /dev/null +++ b/test/integration/connectors/expected_results/notion_database/downloads/1722c3765a0a8082b382ebc2c62d3f4c.html @@ -0,0 +1,330 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Author + + Email + + Formula + + ID + + Item + + Phone + + Priority Level + + Publication Date + + Status + + Tag + + URL + + Views +
+
+ + + Brian Raymond + + +
+
+
+ xyz@abc.com +
+
+
+ 12 +
+
+
+ None-4 +
+
+
+ test-page4-in-database +
+
+
+ 1234567890 +
+
+
+ High +
+
+
+ 2025-01-31 +
+
+
+ Not started +
+
+
+ + V1 + + + V5 + + + V7 + +
+
+ + https://abcde.com + + +
+ 6 +
+
+
+ + + Brian Raymond + + +
+
+
+ xyz@abc.com +
+
+
+ 90 +
+
+
+ None-3 +
+
+
+ test-page3-in-database +
+
+
+ 1234567890 +
+
+
+ Medium +
+
+
+ 2025-01-06 +
+
+
+ In Review +
+
+
+ + V5 + + + V6 + +
+
+ + https://abcde.com + + +
+ 45 +
+
+
+ + + Brian Raymond + + +
+
+
+ xyz@abc.com +
+
+
+ 46 +
+
+
+ None-2 +
+
+
+ test-page2-in-database +
+
+
+ 1234567890 +
+
+
+ Low +
+
+
+ 2025-01-04 +
+
+
+ Done +
+
+
+ + V1 + + + V2 + + + V4 + +
+
+ + https://abcde.com + + +
+ 23 +
+
+
+ + + Brian Raymond + + +
+
+
+ xyz@abc.com +
+
+
+ 4 +
+
+
+ None-1 +
+
+
+ test-page1-in-datab +
+
+
+ 1234567890 +
+
+
+ High +
+
+
+ 2024-12-01 +
+
+
+ In progress +
+
+
+ + V1 + + + V3 + +
+
+ + https://abcde.com + + +
+ 2 +
+
diff --git a/test/integration/connectors/expected_results/notion_database/file_data/1722c3765a0a8082b382ebc2c62d3f4c.json b/test/integration/connectors/expected_results/notion_database/file_data/1722c3765a0a8082b382ebc2c62d3f4c.json new file mode 100644 index 000000000..a49fbf0f2 --- /dev/null +++ b/test/integration/connectors/expected_results/notion_database/file_data/1722c3765a0a8082b382ebc2c62d3f4c.json @@ -0,0 +1,39 @@ +{ + "identifier": "1722c3765a0a8082b382ebc2c62d3f4c", + "connector_type": "notion", + "source_identifiers": { + "filename": "1722c3765a0a8082b382ebc2c62d3f4c.html", + "fullpath": "1722c3765a0a8082b382ebc2c62d3f4c.html", + "rel_path": "1722c3765a0a8082b382ebc2c62d3f4c.html" + }, + "metadata": { + "url": null, + "version": null, + "record_locator": { + "database_id": "1722c3765a0a8082b382ebc2c62d3f4c" + }, + "date_created": "2025-01-05T18:34:00.000Z", + "date_modified": "2025-01-07T19:15:00.000Z", + "date_processed": "1736277913.3980532", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "created_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "last_edited_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "parent": { + "type": "workspace", + "workspace": true + }, + "url": "https://www.notion.so/1722c3765a0a8082b382ebc2c62d3f4c" + }, + "reprocess": false, + "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmpxu906ary/1722c3765a0a8082b382ebc2c62d3f4c.html", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_page/directory_structure.json b/test/integration/connectors/expected_results/notion_page/directory_structure.json new file mode 100644 index 000000000..9d7654273 --- /dev/null +++ b/test/integration/connectors/expected_results/notion_page/directory_structure.json @@ -0,0 +1,5 @@ +{ + "directory_structure": [ + 
"1572c3765a0a806299f0dd6999f9e4c7.html" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html b/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html new file mode 100644 index 000000000..6d816a0e7 --- /dev/null +++ b/test/integration/connectors/expected_results/notion_page/downloads/1572c3765a0a806299f0dd6999f9e4c7.html @@ -0,0 +1,244 @@ + + + + test-doc1 + + + +

+ test-doc1 +

+
+
+
+ testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 +
+
+ testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 +
+ +
+
    +
  1. + Testdoc2 List Item 1 +
  2. +
+
    +
  1. + Testdoc2 List Item 1 Nested Item A +
  2. +
  3. + Testdoc2 List Item 1 Nested Item B +
  4. +
+
+
    +
  1. + Testdoc2 List Item 2 +
  2. +
  3. + Testdoc2 List Item 3 +
  4. +
+
+
+ + +
+
+ Testdoc2 Checklist Item 1 +
+
+ + +
+
+ Testdoc2 Checklist Item 2 (checked) +
+
+ +
+ + +
+ + Testdoc2 bold text + +
+
+ + Testdoc2 italic text + +
+
+ + Testdoc2 Heading 1 Sized Text + +
+
+ + Testdoc2 Heading 2 Sized Text + +
+
+
+ Table +
+ + + + + + + + + + + + + + + + +
+ + Testdoc2 Table: Column 1 Row 0 + + + + Testdoc2 Table: Column 2 Row 0 + + + + Testdoc2 Table: Column 3 Row 0 + +
+ + Testdoc2 Table: Column 1 Row 1 + + + + Testdoc2 Table: Column 2 Row 1 + + + + Testdoc2 Table: Column 3 Row 1 + +
+ + Testdoc2 Table: Column 1 Row 2 + + + + Testdoc2 Table: Column 2 Row 2 + + + + Testdoc2 Table: Column 3 Row 2 + +
+ +
+ 2 Columns in ColumnList +
+
+
+
    +
  • + Item 1 +
  • +
  • + Item 2 +
  • +
+
+ Expandable Heading +
+
    +
  • + First child item +
  • +
  • + Second child item +
  • +
+
+
    +
  1. + First item in Numbered list +
  2. +
  3. + Second item in Numbered list +
  4. +
+
+
+
+
    +
  • + Expandable Section +
  • +
+
    +
  • + Child item 1 +
  • +
  • + Child item 2 +
  • +
+
+
+
+ Column list with indented items +
+
    +
  • + First level item +
  • +
+
+
    +
  • + Second level item +
  • +
+
+
    +
  • + Third level item +
  • +
+
+
+
+
+
+
+

+ 💡 +

+ this is a Callout block +
+
+ this is a Quote block +
+
+
+
+ + this is a Code block + +
+
+ + https://www.notion.so/test-doc1-1572c3765a0a806299f0dd6999f9e4c7 + +
+
+ this()is()a()block()equation. +
+
+
+ + diff --git a/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json b/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json new file mode 100644 index 000000000..88e9b69dc --- /dev/null +++ b/test/integration/connectors/expected_results/notion_page/file_data/1572c3765a0a806299f0dd6999f9e4c7.json @@ -0,0 +1,39 @@ +{ + "identifier": "1572c3765a0a806299f0dd6999f9e4c7", + "connector_type": "notion", + "source_identifiers": { + "filename": "1572c3765a0a806299f0dd6999f9e4c7.html", + "fullpath": "1572c3765a0a806299f0dd6999f9e4c7.html", + "rel_path": "1572c3765a0a806299f0dd6999f9e4c7.html" + }, + "metadata": { + "url": null, + "version": null, + "record_locator": { + "page_id": "1572c3765a0a806299f0dd6999f9e4c7" + }, + "date_created": "2024-12-09T18:13:00.000Z", + "date_modified": "2025-01-07T19:24:00.000Z", + "date_processed": "1736277919.434568", + "permissions_data": null, + "filesize_bytes": null + }, + "additional_metadata": { + "created_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "last_edited_by": { + "id": "118d872b-594c-8171-b46f-00020d10d8b2", + "object": "user" + }, + "parent": { + "page_id": "1182c376-5a0a-8042-9a2a-fb003e00d57b", + "type": "page_id" + }, + "url": "https://www.notion.so/test-doc1-1572c3765a0a806299f0dd6999f9e4c7" + }, + "reprocess": false, + "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmpluf__jry/1572c3765a0a806299f0dd6999f9e4c7.html", + "display_name": null +} \ No newline at end of file diff --git a/test/integration/connectors/test_notion.py b/test/integration/connectors/test_notion.py new file mode 100644 index 000000000..aabc5cf5c --- /dev/null +++ b/test/integration/connectors/test_notion.py @@ -0,0 +1,145 @@ +import os + +from test.integration.connectors.utils.validation.source import ( + SourceValidationConfigs, + get_all_file_data, + run_all_validations, + 
update_fixtures, +) +from unstructured_ingest.v2.interfaces import Downloader, Indexer +from unstructured_ingest.v2.processes.connectors.notion.connector import ( + NotionAccessConfig, + NotionConnectionConfig, + NotionDownloader, + NotionDownloaderConfig, + NotionIndexer, + NotionIndexerConfig, +) + + +def test_notion_source_database(temp_dir): + # Retrieve environment variables + notion_api_key = os.environ["NOTION_API_KEY"] + + # Create connection and indexer configurations + access_config = NotionAccessConfig(notion_api_key=notion_api_key) + connection_config = NotionConnectionConfig( + access_config=access_config, + ) + index_config = NotionIndexerConfig( + database_ids=["1722c3765a0a8082b382ebc2c62d3f4c"], recursive=False + ) + + download_config = NotionDownloaderConfig(download_dir=temp_dir) + + # Instantiate indexer and downloader + indexer = NotionIndexer( + connection_config=connection_config, + index_config=index_config, + ) + downloader = NotionDownloader( + connection_config=connection_config, + download_config=download_config, + ) + + # Run the source connector validation + source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="notion_database", + expected_num_files=1, + validate_downloaded_files=True, + exclude_fields_extend=["metadata.date_created", "metadata.date_modified"], + ), + ) + + +def test_notion_source_page(temp_dir): + # Retrieve environment variables + notion_api_key = os.environ["NOTION_API_KEY"] + + # Create connection and indexer configurations + access_config = NotionAccessConfig(notion_api_key=notion_api_key) + connection_config = NotionConnectionConfig( + access_config=access_config, + ) + index_config = NotionIndexerConfig( + page_ids=["1572c3765a0a806299f0dd6999f9e4c7"], recursive=False + ) + + download_config = NotionDownloaderConfig(download_dir=temp_dir) + + # Instantiate indexer and downloader + indexer = NotionIndexer( + connection_config=connection_config, + 
index_config=index_config, + ) + downloader = NotionDownloader( + connection_config=connection_config, + download_config=download_config, + ) + + # Run the source connector validation + source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="notion_page", + expected_num_files=1, + validate_downloaded_files=True, + exclude_fields_extend=["metadata.date_created", "metadata.date_modified"], + ), + ) + + +def source_connector_validation( + indexer: Indexer, + downloader: Downloader, + configs: SourceValidationConfigs, + overwrite_fixtures: bool = os.getenv("OVERWRITE_FIXTURES", "False").lower() == "true", +) -> None: + # Run common validations on the process of running a source connector, supporting dynamic + # validators that get passed in along with comparisons on the saved expected values. + # If overwrite_fixtures is set to True, will ignore all validators but instead overwrite the + # expected values with what gets generated by this test. 
+ all_predownload_file_data = [] + all_postdownload_file_data = [] + indexer.precheck() + download_dir = downloader.download_config.download_dir + test_output_dir = configs.test_output_dir() + + for file_data in indexer.run(): + assert file_data + predownload_file_data = file_data.model_copy(deep=True) + all_predownload_file_data.append(predownload_file_data) + resp = downloader.run(file_data=file_data) + if isinstance(resp, list): + for r in resp: + postdownload_file_data = r["file_data"].model_copy(deep=True) + all_postdownload_file_data.append(postdownload_file_data) + else: + postdownload_file_data = resp["file_data"].model_copy(deep=True) + all_postdownload_file_data.append(postdownload_file_data) + + if not overwrite_fixtures: + print("Running validation") + run_all_validations( + configs=configs, + predownload_file_data=all_predownload_file_data, + postdownload_file_data=all_postdownload_file_data, + download_dir=download_dir, + test_output_dir=test_output_dir, + ) + else: + print("Running fixtures update") + update_fixtures( + output_dir=test_output_dir, + download_dir=download_dir, + all_file_data=get_all_file_data( + all_predownload_file_data=all_predownload_file_data, + all_postdownload_file_data=all_postdownload_file_data, + ), + save_downloads=configs.validate_downloaded_files, + save_filedata=configs.validate_file_data, + ) diff --git a/test_e2e/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json b/test_e2e/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json deleted file mode 100644 index a53e41bfc..000000000 --- a/test_e2e/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "element_id": "59a715faf8dcf15a6855a2c070f5d4cd", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "text_as_html": "
Created timeLast edited timeOwnerPageTagsVerification
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeNew Pageunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeMorale EventsPoliciesunverified
2023-08-04T18:31:00.000Z2023-08-04T19:02:00.000ZRoman IseckeNew Page With Verificationexpired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeVacation PolicyPoliciesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeMission, Vision, ValuesVision Company Updatesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeRecent PressCompany Updatesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeGetting Startedunverified
2023-08-04T18:31:00.000Z2023-08-17T18:48:00.000ZRoman IseckePage with every blockCompany Updates Policiesexpired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeCorporate TravelPoliciesunverified
2023-08-04T18:31:00.000Z2023-08-04T18:31:00.000ZRoman IseckeBenefits PoliciesPoliciesunverified
" - }, - "text": "Created time Last edited time Owner Page Tags Verification 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke New Page unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Morale Events Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T19:02:00.000Z Roman Isecke New Page With Verification expired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Vacation Policy Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Mission, Vision, Values Vision Company Updates unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Recent Press Company Updates unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Getting Started unverified 2023-08-04T18:31:00.000Z 2023-08-17T18:48:00.000Z Roman Isecke Page with every block Company Updates Policies expired Roman Isecke 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Corporate Travel Policies unverified 2023-08-04T18:31:00.000Z 2023-08-04T18:31:00.000Z Roman Isecke Benefits Policies Policies unverified", - "type": "Table" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json b/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json deleted file mode 100644 index 93f6daa5d..000000000 --- a/test_e2e/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "b21b7e1a9374c90fad7b4ca0571a9a35", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "New Page", - "type": "Title" - }, - { - "element_id": "6c9a1c66c3f1ef2814be722d6ff431b1", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: When creating a page, it's 
important to give it a clear title and provide some content. This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json b/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json deleted file mode 100644 index 025aa548b..000000000 --- a/test_e2e/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "23d6a73618cedf6ecc9f28279cb62421", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Morale Events", - "type": "Title" - }, - { - "element_id": "cca3a9ec1c93fe24880b41dd9988d72d", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Morale events increase employee satisfaction, motivation, and well-being, while promoting community and teamwork, resulting in higher productivity and retention rates.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json b/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json deleted file mode 100644 index 6d887d5fe..000000000 --- a/test_e2e/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json +++ /dev/null @@ -1,35 +0,0 @@ -[ - { - "element_id": "67500029518a859dc034db1601bf5fbe", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "New Page With Verification", - "type": "Title" - }, - { - "element_id": "49873871ff17a9ffb6b6d4e11f6ea86d", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. 
This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", - "type": "NarrativeText" - }, - { - "element_id": "d32db2846683d992270e704251ca5c80", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: An owner of a page can verify it by clicking on the verification button above and choosing to verify the page for either a set amount of time or indefinitely!", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json b/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json deleted file mode 100644 index 33ea5be25..000000000 --- a/test_e2e/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "4d5b94a60a5ae180faa4753897afbc5f", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Vacation Policy", - "type": "Title" - }, - { - "element_id": "e3b7316f50c3edad4ea72b199ad6b7d9", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Vacation policies are crucial for employee well-being and productivity. 
They provide rest and recharge, reduce burnout and increase job satisfaction.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json b/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json deleted file mode 100644 index 5f944a562..000000000 --- a/test_e2e/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "52c2888160339820dfa8bb604c031ee9", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Mission, Vision, Values", - "type": "Title" - }, - { - "element_id": "be9ab17406409efa59dc98966370b9e7", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: A company mission provides direction and purpose, aligning actions and decisions towards a common goal. It also helps attract like-minded individuals who share the same values and vision for the company.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json b/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json deleted file mode 100644 index 93c03b812..000000000 --- a/test_e2e/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json +++ /dev/null @@ -1,19 +0,0 @@ -[ - { - "element_id": "5bcc7126851f18b3a41c951030def658", - "metadata": { - "emphasized_text_contents": [ - "Planning notes" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Planning notes", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json b/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json deleted file 
mode 100644 index fe4b746af..000000000 --- a/test_e2e/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "6cb5211e45401c910bcc00e277092033", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Recent Press", - "type": "Title" - }, - { - "element_id": "d7335f2ec201cb754fc463da124e5970", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Telling employees about news about your company is important because it helps them stay informed about the direction of the company and their role in it.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json b/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json deleted file mode 100644 index 09372ad68..000000000 --- a/test_e2e/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "element_id": "bfbac21d794d26d6aaa6f71337a632d9", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Sprint 3", - "type": "Title" - }, - { - "element_id": "e40bd670a8fbd37e3135ea5517c5dddc", - "metadata": { - "emphasized_text_contents": [ - "Planning notes" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Planning notes", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json b/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json deleted file mode 100644 index 816f3b02c..000000000 --- a/test_e2e/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json +++ /dev/null @@ -1,268 +0,0 @@ -[ - { - "element_id": "3e43f998d46d9c8315e1abe4f0da9d72", - 
"metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Getting Started", - "type": "Title" - }, - { - "element_id": "06f9c166ac2e4f5dbb8fb754d833e477", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: When creating a page, it's important to give it a clear title and provide some content. This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you.", - "type": "NarrativeText" - }, - { - "element_id": "6cabe4b9a0571c55a80de4b06013ec43", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "The Basics", - "type": "Title" - }, - { - "element_id": "2005f3cccf27dc851ae57fba48531195", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Create a Page", - "type": "Title" - }, - { - "element_id": "2a7bbcfd5c237889b2fda563db7462cc", - "metadata": { - "emphasized_text_contents": [ - "Workspace" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "In your sidebar, click the + that appears next to the word Workspace on hover. A new page will appear. 
Give it a title and start typing like you would in any other document.", - "type": "NarrativeText" - }, - { - "element_id": "2cd82188a21bdfed4ab1c658180a968b", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Headings", - "type": "Title" - }, - { - "element_id": "64f7ee0a4c1563451c22061bb09d339c", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "You can add headings and subheadings in one of two ways:", - "type": "NarrativeText" - }, - { - "element_id": "0f8b41821ecbdad9478f4dbc39b1f2a4", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Type /heading or /h1 , /h2 , or /h3 to choose the heading size you want.", - "type": "ListItem" - }, - { - "element_id": "545568cf4c5fc32d56606019eee27510", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Use Markdown shortcuts, like # , ## , and ### .", - "type": "ListItem" - }, - { - "element_id": "be84c348ae4756f33d83d3145dd711af", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Create inline code by wrapping text with ` (or with the shortcut cmd/ctrl + e ).", - "type": "ListItem" - }, - { - "element_id": "c131435fecd2bcd5fe5fef8cb322aa55", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Toggle Lists", - "type": "Title" - }, - { - "element_id": "94ea1bd8465604db79b8f2c29420f5de", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Toggle lists streamline your content. 
Click the arrow to open.", - "type": "NarrativeText" - }, - { - "element_id": "4f733e296d95143a3c49dffc35ba64d0", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Callout Blocks", - "type": "Title" - }, - { - "element_id": "7e96268c98a95ade6f6dceb82fe91d1f", - "metadata": { - "emphasized_text_contents": [ - "Notion Tip:" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Create a callout block like this by typing /call and pressing enter . Helpful for adding inline instructions, warnings, disclaimers, and tips. Change the emoji icon by clicking on it.", - "type": "NarrativeText" - }, - { - "element_id": "cb33f03c0a3139caeb5607fc4ea55ffd", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Code Blocks", - "type": "Title" - }, - { - "element_id": "534f19a337f8114851ba68a69035da52", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "You can add code notation to any Notion page:", - "type": "NarrativeText" - }, - { - "element_id": "70a3c8f18e2d5d32db68fe2150a5a72f", - "metadata": { - "emphasized_text_contents": [ - "Copy to Clipboard" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Hover over this block to see the Copy to Clipboard option!", - "type": "NarrativeText" - }, - { - "element_id": "d23aeea612a881e54d4e91b26d795beb", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Your teammates can select any code to comment on it.", - "type": "ListItem" - }, - { - "element_id": "49c4ebfe04f72a068e8c2e4545d997ef", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Organizing Pages", - "type": "Title" - }, - { - "element_id": "db092f68e3263f38d6ed1af651e30e6d", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": 
"Instead of using folders, Notion lets you nest pages inside pages. Type /page and press enter to create a sub-page inside a page. Like this:", - "type": "NarrativeText" - }, - { - "element_id": "41a38caacb638fa8311b89164cc2cab4", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Advanced Techniques", - "type": "Title" - }, - { - "element_id": "ed1d5c56b71619eca5b877d0e2dc1e10", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "Notion Editor 101" - ], - "link_urls": [ - "https://www.notion.so/notion/Notion-editor-101-create-and-edit-68c7c67047494fdb87d50185429df93e" - ] - }, - "text": "Check out this Notion Editor 101 guide for more advanced tips and how-to's.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json b/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json deleted file mode 100644 index 4daf27cc8..000000000 --- a/test_e2e/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json +++ /dev/null @@ -1,463 +0,0 @@ -[ - { - "element_id": "cd153f73463db45ea02bd9ba6ce4168e", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Page with every block", - "type": "Title" - }, - { - "element_id": "098442d39ccc8a9731627be8a843d02a", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Tag pages to let collaborators know what they can expect to use the page for. 
You can add one or many tags to any page in a wiki.", - "type": "NarrativeText" - }, - { - "element_id": "868a2b2294814990d664cf13ffd1e2a7", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Heading 2", - "type": "Title" - }, - { - "element_id": "af888c9a9a14c9c6616cf54ac230c20a", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "This is some new text", - "type": "NarrativeText" - }, - { - "element_id": "99388232115e119009419bd8b07c93b9", - "metadata": { - "emphasized_text_contents": [ - "formatted" - ], - "emphasized_text_tags": [ - "b" - ], - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "text" - ], - "link_urls": [ - "/9ba4d6da8a574cfc81ebceac1fde52bd" - ] - }, - "text": "Some/less → more formatted text with other content and stuff 2023-08-07 : @Roman Isecke", - "type": "UncategorizedText" - }, - { - "element_id": "91b9abcc226cbe676d827950030c6702", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "text_as_html": "
column 1column 2pages
c1r1 contentc2r1 table
2023-08-08T09:00:00.000-04:00
cell
Page with every block
c1r2 more contentc2r2 table cellUntitled
this is some green textthis is an equationUntitled
text1 text2 Multiline cellAnother cellUntitled
" - }, - "text": "column 1 column 2 pages c1r1 content c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell Page with every block c1r2 more content c2r2 table cell Untitled this is some green text this is an equation Untitled text1 text2 Multiline cell Another cell Untitled", - "type": "Table" - }, - { - "element_id": "0b73b1397f01db39dc98a983bd3aeb3d", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "E = {mc^2}", - "type": "UncategorizedText" - }, - { - "element_id": "7535c23e3c0bda50ea38df65f7a64bca", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Numbered list", - "type": "ListItem" - }, - { - "element_id": "155061ede32096c81085eabf421f9fe0", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "A number child", - "type": "ListItem" - }, - { - "element_id": "1ff4a64dcc74b4cbdf4270776c2adab0", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "A number grandchild", - "type": "ListItem" - }, - { - "element_id": "9e0342a8c3a010f7802d874fa447f72b", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "great", - "type": "ListItem" - }, - { - "element_id": "240e4a3a9b5843192b03086325da2169", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "super great", - "type": "ListItem" - }, - { - "element_id": "d1e6a3da60ba834365b2230689c4d8a6", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "with test text", - "type": "ListItem" - }, - { - "element_id": "db78c6b732dc265e380889e394c6354f", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Bullet one", - "type": "ListItem" - }, - { - "element_id": "f31b201c44870108f395a238bff36413", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "A child bullet", - "type": "ListItem" - }, - { - "element_id": 
"5929608d0a4d2f055635bbab72df26ec", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "A grandchild bullet", - "type": "ListItem" - }, - { - "element_id": "1e93d6f8cf7c8af51ddf222be77b4882", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "great", - "type": "ListItem" - }, - { - "element_id": "c53244024b7b1e86b20bcc1489d9dc4a", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "super great", - "type": "ListItem" - }, - { - "element_id": "3602b0a8a126be064654623590163f49", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Bullet two", - "type": "ListItem" - }, - { - "element_id": "27d5b17e90250d77a76da1f6d93f8e8b", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "I quote myself testings Notion", - "type": "NarrativeText" - }, - { - "element_id": "8831856d3670d91d6fa2121af0694022", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "https://www.notion.so/icons/airplane_brown.svg" - ], - "link_urls": [ - "https://www.notion.so/icons/airplane_brown.svg" - ] - }, - "text": "https://www.notion.so/icons/airplane_brown.svg I call this out", - "type": "NarrativeText" - }, - { - "element_id": "df59e087da5910b2cb1c98801bb24c85", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "https://www.wikipedia.org/" - ], - "link_urls": [ - "https://www.wikipedia.org/" - ] - }, - "text": "https://www.wikipedia.org/", - "type": "Title" - }, - { - "element_id": "0f215d56b4a1fc900dc2dad40b7df66f", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" - ], - "link_urls": [ - 
"https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" - ] - }, - "text": "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk", - "type": "Title" - }, - { - "element_id": "5da75c186c36d3117e60f08d49e66085", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Child Database:", - "type": "Title" - }, - { - "element_id": "a82757a2b9004569ab1761d061847bd3", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "Analytics" - ], - "link_urls": [ - "https://www.notion.so/d1fad658f1cf4eedb0b5ee72b9f0b530" - ] - }, - "text": "Analytics", - "type": "Title" - }, - { - "element_id": "29a6be22a8770f106f54f4abcdc1de68", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Child Page:", - "type": "Title" - }, - { - "element_id": "d07d54a1ce286a7679952d4e4ce82c8e", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], - "link_texts": [ - "Untitled" - ], - "link_urls": [ - "https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd" - ] - }, - "text": "Untitled", - "type": "Title" - }, - { - "element_id": "d4c02f5b35a00e87ef7be603d82c5df3", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "s = \"this is some code\"", - "type": "NarrativeText" - }, - { - "element_id": "59aab31c8b60641b906a81db51c596a6", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "This is my code caption", - "type": "NarrativeText" - }, - { - "element_id": "7fc741d4226b15a910af95ff3fde6253", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "This is some text", - "type": "NarrativeText" - }, - { - "element_id": "f67f0aef4f1ceb0fa98491872aa741ac", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "This is text in next column", - "type": 
"NarrativeText" - }, - { - "element_id": "f08a88064f2c33164502652db93fad32", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Final text in column", - "type": "Title" - }, - { - "element_id": "fa3e9d761730605036aaf854d9edd5b4", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Heading 1 content", - "type": "NarrativeText" - }, - { - "element_id": "c087a92c7251ca836ff023d35cb0a1aa", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "d3d87fc6-61cc-4bb5-89ed-e9dff0df1526", - "type": "UncategorizedText" - }, - { - "element_id": "3126a68fa0a12481ca6dc64c16511a7e", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Stuff todo", - "type": "Title" - }, - { - "element_id": "8cfa5b216c8d3f774f8e1def029681e6", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "more stuff todo", - "type": "Title" - }, - { - "element_id": "b538abdbf0aff3f9f1ab11d79bb5bc26", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "More things to do", - "type": "NarrativeText" - }, - { - "element_id": "570c50d8758c5639a1dfd0f238f609d5", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Something to do", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json b/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json deleted file mode 100644 index b51260990..000000000 --- a/test_e2e/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "ee4edbe949900c6988a62505a9325d47", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Corporate Travel", - "type": "Title" - }, - { - "element_id": 
"756651f18284432aa247200d0bc0cc62", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: A corporate travel policy is crucial for controlling costs, ensuring compliance, and guaranteeing the safety of employees when traveling for the company.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json b/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json deleted file mode 100644 index 5a6a8a45a..000000000 --- a/test_e2e/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "element_id": "12f0e8957240cb6d2bedffde59586918", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Benefits Policies", - "type": "Title" - }, - { - "element_id": "3e394812bcc3403068dc1d92a42271ce", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Notion Tip: Benefits policies can attract and retain employees, promote well-being, create positive culture, differentiate from competitors, and increase morale and satisfaction.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json b/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json deleted file mode 100644 index f69514cd9..000000000 --- a/test_e2e/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "element_id": "665e346acfccd4fb6110bcd1a2e36155", - "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Sprint 1", - "type": "Title" - }, - { - "element_id": "cfa5ea5800f7a2510d64c98b58742e45", - "metadata": { - "emphasized_text_contents": [ - "Planning notes" - ], - "emphasized_text_tags": [ - "b" - ], - 
"filetype": "text/html", - "languages": [ - "eng" - ] - }, - "text": "Planning notes", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_e2e/src/notion.sh b/test_e2e/src/notion.sh deleted file mode 100755 index ce96b058a..000000000 --- a/test_e2e/src/notion.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=notion -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -CI=${CI:-"false"} - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup() { - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - if [ "$CI" == "true" ]; then - cleanup_dir "$DOWNLOAD_DIR" - fi -} -trap cleanup EXIT - -if [ -z "$NOTION_API_KEY" ]; then - echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." 
- exit 8 -fi - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} -PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ - notion \ - --api-key "$UNS_PAID_API_KEY" \ - --partition-by-api \ - --partition-endpoint "https://api.unstructuredapp.io" \ - --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --download-dir "$DOWNLOAD_DIR" \ - --notion-api-key "$NOTION_API_KEY" \ - --output-dir "$OUTPUT_DIR" \ - --database-ids "122b2c22996b435b9de2ee0e9d2b04bc" \ - --num-processes "$max_processes" \ - --recursive \ - --verbose \ - --work-dir "$WORK_DIR" \ - --max-retry-time 30 - -"$SCRIPT_DIR"/check-diff-expected-output.py --output-folder-name $OUTPUT_FOLDER_NAME diff --git a/test_e2e/test-dest.sh b/test_e2e/test-dest.sh index 1f1a325c3..b5e0e9321 100755 --- a/test_e2e/test-dest.sh +++ b/test_e2e/test-dest.sh @@ -49,7 +49,6 @@ trap print_last_run EXIT python_version=$(python --version 2>&1) tests_to_ignore=( - 'notion.sh' 'dropbox.sh' 'sharepoint.sh' ) diff --git a/unstructured_ingest/v2/processes/connectors/__init__.py b/unstructured_ingest/v2/processes/connectors/__init__.py index c5e466d3e..d9d20cd92 100644 --- a/unstructured_ingest/v2/processes/connectors/__init__.py +++ b/unstructured_ingest/v2/processes/connectors/__init__.py @@ -42,6 +42,8 @@ from .mongodb import mongodb_destination_entry, mongodb_source_entry from .neo4j import CONNECTOR_TYPE as NEO4J_CONNECTOR_TYPE from .neo4j import neo4j_destination_entry +from .notion.connector import CONNECTOR_TYPE as NOTION_CONNECTOR_TYPE +from .notion.connector import notion_source_entry from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE from .onedrive import onedrive_destination_entry, onedrive_source_entry from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE @@ -98,6 +100,7 @@ add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry) 
add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry) +add_source_entry(source_type=NOTION_CONNECTOR_TYPE, entry=notion_source_entry) add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry) diff --git a/unstructured_ingest/v2/processes/connectors/notion/__init__.py b/unstructured_ingest/v2/processes/connectors/notion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/unstructured_ingest/v2/processes/connectors/notion/client.py b/unstructured_ingest/v2/processes/connectors/notion/client.py new file mode 100644 index 000000000..f24bacc81 --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/client.py @@ -0,0 +1,349 @@ +from typing import Any, Generator, List, Optional, Tuple + +import httpx +import notion_client.errors +from notion_client import Client as NotionClient +from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint +from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint +from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint +from notion_client.api_endpoints import Endpoint +from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint +from notion_client.errors import HTTPResponseError, RequestTimeoutError + +from unstructured_ingest.ingest_backoff import RetryHandler +from unstructured_ingest.interfaces import RetryStrategyConfig +from unstructured_ingest.utils.dep_check import requires_dependencies +from unstructured_ingest.v2.processes.connectors.notion.types.block import Block +from unstructured_ingest.v2.processes.connectors.notion.types.database import Database +from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import map_cells +from unstructured_ingest.v2.processes.connectors.notion.types.page import Page + + +@requires_dependencies(["httpx"], extras="notion") +def _get_retry_strategy( + endpoint: Endpoint, 
def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]:
    """Build a retry handler for ``endpoint`` if it carries a retry configuration.

    Args:
        endpoint: Endpoint wrapper that may expose a ``retry_strategy_config``
            attribute.

    Returns:
        The handler produced by ``_get_retry_strategy`` when the attribute is
        present and truthy, otherwise ``None``.
    """
    # Pass an explicit default: an endpoint that never sets
    # ``retry_strategy_config`` (the Async* endpoints in this module do not)
    # must yield ``None`` here instead of raising AttributeError.
    if retry_strategy_config := getattr(endpoint, "retry_strategy_config", None):
        return _get_retry_strategy(endpoint=endpoint, retry_strategy_config=retry_strategy_config)
    return None
class DatabasesEndpoint(NotionDatabasesEndpoint):
    """Databases endpoint that returns typed ``Database`` / ``Page`` objects.

    Each request goes through a retry handler when a ``retry_strategy_config``
    was supplied at construction time, and is sent directly otherwise.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """Retry handler derived from ``retry_strategy_config``, or ``None``."""
        return get_retry_handler(self)

    def _call(self, func, *args: Any, **kwargs: Any) -> Any:
        """Invoke ``func`` through the retry handler when one is configured."""
        # Read the property once: every access builds a fresh handler.
        handler = self.retry_handler
        return handler(func, *args, **kwargs) if handler else func(*args, **kwargs)

    @staticmethod
    def _pages_from(response: dict) -> List[Page]:
        """Pop ``results`` from ``response`` and convert them to mapped ``Page`` objects."""
        # Default to an empty list so ``query`` and ``iterate_query`` treat a
        # response without ``results`` the same way.
        pages = [Page.from_dict(data=p) for p in response.pop("results", [])]
        for page in pages:
            page.properties = map_cells(page.properties)
        return pages

    def retrieve(self, database_id: str, **kwargs: Any) -> Database:
        """Fetch a database and return it as a ``Database`` object."""
        resp: dict = self._call(
            super().retrieve, database_id=database_id, **kwargs
        )  # type: ignore
        return Database.from_dict(data=resp)

    @requires_dependencies(["httpx"], extras="notion")
    def retrieve_status(self, database_id: str, **kwargs) -> int:
        """Send a HEAD request for the database and return the HTTP status code.

        Raises:
            RequestTimeoutError: if the underlying HTTP request times out.
        """
        import httpx

        request = self.parent._build_request(
            method="HEAD",
            path=f"databases/{database_id}",
            auth=kwargs.get("auth"),
        )
        try:
            response: httpx.Response = self._call(
                self.parent.client.send, request
            )  # type: ignore
        except httpx.TimeoutException as e:
            # Keep the original timeout as the cause for easier debugging.
            raise RequestTimeoutError() from e
        return response.status_code

    def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]:
        """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database.

        Returns the pages together with the remainder of the response
        (everything except ``results``).

        *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)*
        """  # noqa: E501
        resp: dict = self._call(super().query, database_id=database_id, **kwargs)  # type: ignore
        return self._pages_from(resp), resp

    def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]:
        """Yield the database's pages one response batch at a time.

        Follows ``next_cursor`` until the response reports no more results.
        """
        next_cursor = None
        while True:
            response: dict = self._call(
                super().query, database_id=database_id, start_cursor=next_cursor, **kwargs
            )  # type: ignore
            yield self._pages_from(response)

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
class Client(NotionClient):
    """Notion client whose blocks, pages and databases endpoints are retry-aware."""

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        # Each endpoint wrapper gets the same parent and retry configuration.
        endpoint_kwargs = {
            "parent": self,
            "retry_strategy_config": retry_strategy_config,
        }
        self.blocks = BlocksEndpoint(**endpoint_kwargs)
        self.pages = PagesEndpoint(**endpoint_kwargs)
        self.databases = DatabasesEndpoint(**endpoint_kwargs)
HTTPResponseError(f"Failed to list blocks: {str(e)}") + except httpx.TimeoutException: + raise RequestTimeoutError() + + resp = response.json() + child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])] + return child_blocks, resp + + async def iterate_list( + self, block_id: str, **kwargs: Any + ) -> Generator[List[Block], None, None]: + """Fetch the list of child blocks in pages asynchronously.""" + next_cursor = None + while True: + params = {"start_cursor": next_cursor} if next_cursor else {} + params.update(kwargs) + child_blocks, response = await self.list(block_id, **params) + yield child_blocks + + next_cursor = response.get("next_cursor") + if not response.get("has_more") or not next_cursor: + return + + async def close(self): + """Close the HTTP client.""" + await self._http_client.aclose() + + +class AsyncDatabasesEndpoint(NotionDatabasesEndpoint): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._http_client = httpx.AsyncClient() + + async def retrieve(self, database_id: str, **kwargs: Any) -> Database: + """Fetch a database by its ID asynchronously.""" + try: + response = await self._http_client.get( + f"{self.parent._api_base}/databases/{database_id}", **kwargs + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + raise HTTPResponseError(f"Failed to retrieve database: {str(e)}") + except httpx.TimeoutException: + raise RequestTimeoutError() + + return Database.from_dict(data=response.json()) + + async def query(self, database_id: str, **kwargs: Any) -> tuple[List[Page], dict]: + """Query a database asynchronously.""" + try: + response = await self._http_client.post( + f"{self.parent._api_base}/databases/{database_id}/query", + json=kwargs.get("json", {}), + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + raise HTTPResponseError(f"Failed to query database: {str(e)}") + except httpx.TimeoutException: + raise RequestTimeoutError() + + resp = response.json() + 
class AsyncClient(NotionClient):
    """Notion client exposing the async block-children and database endpoints."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.blocks = AsyncBlocksChildrenEndpoint(parent=self)
        self.databases = AsyncDatabasesEndpoint(parent=self)

    async def close(self):
        """Close all async endpoints.

        The databases endpoint is closed even if closing the blocks endpoint
        raises, so one failure cannot leak the other HTTP client.
        """
        # NOTE(review): only the two endpoint-owned HTTP clients are closed
        # here; confirm whether the parent client needs closing as well.
        try:
            await self.blocks.close()
        finally:
            await self.databases.close()
class NotionIndexerConfig(IndexerConfig):
    """Indexer settings: which Notion pages/databases to index and whether to recurse."""

    # IDs of the pages to start indexing from; None means no explicit pages.
    page_ids: Optional[list[str]] = Field(
        default=None, description="List of Notion page IDs to process"
    )

    # IDs of the databases to start indexing from; None means no explicit databases.
    database_ids: Optional[list[str]] = Field(
        default=None, description="List of Notion database IDs to process"
    )
    # When True, child pages and databases found while indexing are indexed too.
    recursive: bool = Field(
        default=False, description="Recursively process child pages and databases"
    )

    # NOTE(review): `__post_init__` is the dataclasses hook name, while the
    # fields above are declared with pydantic's `Field` — presumably this
    # method is never invoked automatically; confirm against `IndexerConfig`.
    # NOTE(review): `UUID4` is imported from pydantic; verify it can be called
    # as a constructor like this before relying on the normalisation below.
    def __post_init__(self):
        """Strip each configured ID and convert it with ``UUID4``."""
        if self.page_ids:
            self.page_ids: list[UUID4] = [UUID4(p.strip()) for p in self.page_ids]

        if self.database_ids:
            self.database_ids: list[UUID4] = [UUID4(p.strip()) for p in self.database_ids]
list(pages_to_process): + if page_id in processed_pages: + continue + + processed_pages.add(page_id) + pages_to_process.remove(page_id) + file_data = self.get_page_file_data(page_id=page_id, client=client) + if file_data: + yield file_data + + if self.index_config.recursive: + (child_pages, child_databases) = self.get_child_pages_and_databases( + page_id=page_id, + client=client, + processed_pages=processed_pages, + processed_databases=processed_databases, + ) + pages_to_process.update(child_pages) + databases_to_process.update(child_databases) + + # Process databases + for database_id in list(databases_to_process): + if database_id in processed_databases: + continue + processed_databases.add(database_id) + databases_to_process.remove(database_id) + file_data = self.get_database_file_data(database_id=database_id, client=client) + if file_data: + yield file_data + if self.index_config.recursive: + ( + child_pages, + child_databases, + ) = self.get_child_pages_and_databases_from_database( + database_id=database_id, + client=client, + processed_pages=processed_pages, + processed_databases=processed_databases, + ) + pages_to_process.update(child_pages) + databases_to_process.update(child_databases) + + @requires_dependencies(["notion_client"], extras="notion") + def get_page_file_data(self, page_id: str, client: "Client") -> Optional[FileData]: + try: + page_metadata = client.pages.retrieve(page_id=page_id) # type: ignore + date_created = page_metadata.created_time + date_modified = page_metadata.last_edited_time + identifier = page_id + source_identifiers = SourceIdentifiers( + filename=f"{page_id}.html", + fullpath=f"{page_id}.html", + rel_path=f"{page_id}.html", + ) + metadata = FileDataSourceMetadata( + date_created=date_created, + date_modified=date_modified, + record_locator={"page_id": page_id}, + date_processed=str(time()), + ) + # additional_metadata = page_metadata + additional_metadata = { + "created_by": page_metadata.created_by, + "last_edited_by": 
@requires_dependencies(["notion_client"], extras="notion")
def get_database_file_data(self, database_id: str, client: "Client") -> Optional[FileData]:
    """Describe one Notion database as a ``FileData`` record.

    Retrieves the database through ``client`` and packages its timestamps,
    authorship, parent and URL. Any failure is logged and reported as ``None``.
    """
    try:
        db = client.databases.retrieve(database_id=database_id)  # type: ignore
        # The same "<id>.html" name serves as filename, full path and relative path.
        html_name = f"{database_id}.html"
        return FileData(
            identifier=database_id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(
                filename=html_name,
                fullpath=html_name,
                rel_path=html_name,
            ),
            metadata=FileDataSourceMetadata(
                date_created=db.created_time,
                date_modified=db.last_edited_time,
                record_locator={"database_id": database_id},
                date_processed=str(time()),
            ),
            additional_metadata={
                "created_by": db.created_by,
                "last_edited_by": db.last_edited_by,
                "parent": db.parent,
                "url": db.url,
            },
        )
    except Exception as e:
        logger.error(f"Error retrieving database {database_id}: {e}")
        return None
@dataclass
class NotionDownloader(Downloader):
    """Download Notion pages and databases as rendered HTML files.

    ``run`` dispatches on the ``record_locator`` stored in the file data:
    a ``page_id`` entry downloads a page, a ``database_id`` entry downloads a
    database.
    """

    connection_config: NotionConnectionConfig
    download_config: NotionDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the page or database referenced by ``file_data``.

        Raises:
            ValueError: if the record locator names neither a page nor a database.
        """
        client = self.connection_config.get_client()
        # A missing locator is reported as the ValueError below instead of a
        # TypeError from the membership test.
        record_locator = file_data.metadata.record_locator or {}

        if "page_id" in record_locator:
            return self.download_page(
                client=client,
                page_id=record_locator["page_id"],
                file_data=file_data,
            )
        if "database_id" in record_locator:
            return self.download_database(
                client=client,
                database_id=record_locator["database_id"],
                file_data=file_data,
            )
        raise ValueError("Invalid record_locator in file_data")

    def _write_html(
        self, text_extraction: Any, file_data: FileData, kind: str, object_id: str
    ) -> Optional[DownloadResponse]:
        """Write the extracted HTML to the download path and build the response.

        Returns ``None`` (after logging) when the extraction carries no HTML.
        """
        if not text_extraction.html:
            logger.error(f"No HTML content for {kind} {object_id}")
            return None
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the locale default encoding may be unable to encode
        # non-ASCII content and would make output differ between machines.
        with download_path.open("w", encoding="utf-8") as html_file:
            html_file.write(text_extraction.html.render(pretty=True))
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def download_page(
        self, client, page_id: str, file_data: FileData
    ) -> Optional[DownloadResponse]:
        """Extract a page as HTML and save it; returns ``None`` on any failure."""
        from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_page_html

        try:
            text_extraction = extract_page_html(
                client=client,
                page_id=page_id,
                logger=logger,
            )
            return self._write_html(text_extraction, file_data, "page", page_id)
        except Exception as e:
            # Best-effort download: log with traceback and skip this page.
            logger.error(f"Error downloading page {page_id}: {e}", exc_info=True)
            return None

    def download_database(
        self, client, database_id: str, file_data: FileData
    ) -> Optional[DownloadResponse]:
        """Extract a database as HTML and save it; returns ``None`` on any failure."""
        from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html

        try:
            text_extraction = extract_database_html(
                client=client,
                database_id=database_id,
                logger=logger,
            )
            return self._write_html(text_extraction, file_data, "database", database_id)
        except Exception as e:
            # Best-effort download: log with traceback and skip this database.
            logger.error(f"Error downloading database {database_id}: {e}", exc_info=True)
            return None
def process_block(
    current_block: dict,
    parent_page_id: str,
    client: Client,
    child_pages: list,
    child_databases: list,
) -> Tuple[dict, list, list]:
    """Recursively expand a block into a tree and collect child page/database ids.

    Args:
        current_block: dict with at least the keys ``"block"``, ``"level"``,
            ``"children"`` and ``"parent_id"``.
        parent_page_id: id of the page being extracted; it is compared through
            ``str()``, so a ``UUID`` instance is accepted as well.
        client: Notion client used to list a block's children.
        child_pages: accumulator of child page ids (appended to in place).
        child_databases: accumulator of child database ids (appended to in place).

    Returns:
        ``(element, child_pages, child_databases)``. ``element`` is ``{}`` when
        the block is a child page (other than the parent page itself) or a
        child database: those are only recorded, never descended into.
        Otherwise it is ``current_block`` with ``"children"`` populated.
    """
    block = current_block["block"]
    if isinstance(block.block, notion_blocks.ChildPage) and block.id != str(parent_page_id):
        child_pages.append(block.id)
        return {}, child_pages, child_databases
    if isinstance(block.block, notion_blocks.ChildDatabase):
        child_databases.append(block.id)
        return {}, child_pages, child_databases

    # recursively go through all blocks in a page, store each block in a dictionary
    if block.has_children:
        # Drain every result chunk before recursing so all listing calls for
        # this level complete first.
        children = []
        for children_block in client.blocks.children.iterate_list(block_id=block.id):
            children.extend(children_block)
        for child in children:
            child_block = {
                "block": child,
                "level": current_block["level"] + 1,
                "children": [],
                "parent_id": block.id,
            }
            child_element, child_pages, child_databases = process_block(
                child_block, parent_page_id, client, child_pages, child_databases
            )
            current_block["children"].append(child_element)
    return current_block, child_pages, child_databases
def extract_page_html(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> HtmlExtractionResponse:
    """Render a Notion page as an HTML document.

    The page's root block is retrieved, expanded with ``process_block`` and
    converted with ``build_html``. A ``<head>`` with the page title is emitted
    only when the root block is a ``ChildPage``.

    Returns:
        An ``HtmlExtractionResponse`` carrying the document plus the ids of
        child pages and child databases found while walking the page.
    """
    root_uuid = UUID(page_id)
    root_block: Block = client.blocks.retrieve(block_id=page_id)  # type: ignore
    head = (
        Head([], Title([], root_block.block.title))
        if isinstance(root_block.block, notion_blocks.ChildPage)
        else None
    )
    root_entry = dict(
        block=root_block,
        level=0,
        children=[],
        parent_id=None,
        peers_rank=0,
        peers_count=1,
    )
    logger.debug(f"processing page id: {page_id}")
    root_entry, child_pages, child_databases = process_block(
        root_entry, root_uuid, client, [], []
    )
    html, _, _ = build_html(root_entry, [], [])
    document_parts = [Body([], html)]
    if head:
        # The head, when present, precedes the body.
        document_parts = [head] + document_parts
    return HtmlExtractionResponse(
        Html([], document_parts),
        child_pages=child_pages,
        child_databases=child_databases,
    )
def get_recursive_content_from_page(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Collect child pages and databases reachable from a page.

    Wraps ``page_id`` (parsed as a UUID) in a PAGE queue entry and hands it to
    ``get_recursive_content``.
    """
    root_entry = QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id))
    return get_recursive_content(client=client, init_entry=root_entry, logger=logger)
init_entry=QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id)), + logger=logger, + ) + + +def get_recursive_content( + client: Client, + init_entry: QueueEntry, + logger: logging.Logger, +) -> ChildExtractionResponse: + parents: List[QueueEntry] = [init_entry] + child_pages: List[str] = [] + child_dbs: List[str] = [] + processed: List[str] = [] + while len(parents) > 0: + parent: QueueEntry = parents.pop() + processed.append(str(parent.id)) + if parent.type == QueueEntryType.PAGE: + logger.debug(f"getting child data from page: {parent.id}") + page_children = [] + try: + for children_block in client.blocks.children.iterate_list( # type: ignore + block_id=str(parent.id), + ): + page_children.extend(children_block) + except APIResponseError as api_error: + logger.error(f"failed to get page with id {parent.id}: {api_error}") + if str(parent.id) in child_pages: + child_pages.remove(str(parent.id)) + continue + if not page_children: + continue + + # Extract child pages + child_pages_from_page = [ + c for c in page_children if isinstance(c.block, notion_blocks.ChildPage) + ] + if child_pages_from_page: + child_page_blocks: List[notion_blocks.ChildPage] = [ + p.block + for p in child_pages_from_page + if isinstance(p.block, notion_blocks.ChildPage) + ] + logger.debug( + "found child pages from parent page {}: {}".format( + parent.id, + ", ".join([block.title for block in child_page_blocks]), + ), + ) + new_pages = [p.id for p in child_pages_from_page if p.id not in processed] + new_pages = list(set(new_pages)) + child_pages.extend(new_pages) + parents.extend( + [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], + ) + + # Extract child databases + child_dbs_from_page = [ + c for c in page_children if isinstance(c.block, notion_blocks.ChildDatabase) + ] + if child_dbs_from_page: + child_db_blocks: List[notion_blocks.ChildDatabase] = [ + c.block + for c in page_children + if isinstance(c.block, notion_blocks.ChildDatabase) + ] + logger.debug( + 
"found child database from parent page {}: {}".format( + parent.id, + ", ".join([block.title for block in child_db_blocks]), + ), + ) + new_dbs = [db.id for db in child_dbs_from_page if db.id not in processed] + new_dbs = list(set(new_dbs)) + child_dbs.extend(new_dbs) + parents.extend( + [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], + ) + + linked_to_others: List[notion_blocks.LinkToPage] = [ + c.block for c in page_children if isinstance(c.block, notion_blocks.LinkToPage) + ] + for link in linked_to_others: + if (page_id := link.page_id) and ( + page_id not in processed and page_id not in child_pages + ): + child_pages.append(page_id) + parents.append(QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id))) + if (database_id := link.database_id) and ( + database_id not in processed and database_id not in child_dbs + ): + child_dbs.append(database_id) + parents.append( + QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id)), + ) + + elif parent.type == QueueEntryType.DATABASE: + logger.debug(f"getting child data from database: {parent.id}") + database_pages = [] + try: + for page_entries in client.databases.iterate_query( # type: ignore + database_id=str(parent.id), + ): + database_pages.extend(page_entries) + except APIResponseError as api_error: + logger.error(f"failed to get database with id {parent.id}: {api_error}") + if str(parent.id) in child_dbs: + child_dbs.remove(str(parent.id)) + continue + if not database_pages: + continue + + child_pages_from_db = [ + p for p in database_pages if is_page_url(client=client, url=p.url) + ] + if child_pages_from_db: + logger.debug( + "found child pages from parent database {}: {}".format( + parent.id, + ", ".join([p.url for p in child_pages_from_db]), + ), + ) + new_pages = [p.id for p in child_pages_from_db if p.id not in processed] + child_pages.extend(new_pages) + parents.extend( + [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], + ) + + child_dbs_from_db = [ + p 
def is_valid_uuid(uuid_str: str) -> bool:
    """Return True when ``uuid_str`` can be parsed by ``uuid.UUID``.

    Both the 32-character hex form and the hyphenated form are accepted.
    Non-string input (``None``, numbers, bytes) returns False rather than
    raising.
    """
    try:
        UUID(uuid_str)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed hex string; TypeError / AttributeError:
        # non-string input. Anything else is unexpected and should surface.
        return False
    return True
class GetHTMLMixin(ABC):
    """Interface for objects that can render themselves as an htmlBuilder tag."""

    @abstractmethod
    def get_html(self) -> Optional[HtmlTag]:
        """Return the HTML tag for this object, or ``None`` when there is nothing to render."""
@dataclass
class Block(FromJSONMixin, GetHTMLMixin):
    """A Notion block envelope plus its type-specific payload.

    ``block`` holds the payload object built from the entry of
    ``block_type_mapping`` selected by ``type``; the remaining fields are the
    envelope attributes passed through from the source dict.
    """

    id: str
    type: str
    created_time: str
    created_by: PartialUser
    last_edited_time: str
    last_edited_by: PartialUser
    archived: bool
    in_trash: bool
    has_children: bool
    parent: Parent
    block: BlockBase
    object: str = "block"
    request_id: Optional[str] = None

    def __repr__(self):
        return f"{self.__class__.__name__}(id={self.id}, type={self.type})"

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Block`` from a block dict.

        The top-level dict is shallow-copied first, so the keys consumed here
        are not removed from the caller's object.

        Raises:
            KeyError: if ``type`` (or its payload key) is missing, or the type
                has no entry in ``block_type_mapping``.
            TypeError: if the remaining keys do not match the dataclass fields.
        """
        # Copy before popping: the caller's dict must keep its keys.
        data = dict(data)
        t = data["type"]
        block_data = data.pop(t)
        created_by = data.pop("created_by")
        last_edited_by = data.pop("last_edited_by")
        parent = data.pop("parent")
        try:
            block = cls(
                created_by=PartialUser.from_dict(created_by),
                last_edited_by=PartialUser.from_dict(last_edited_by),
                parent=Parent.from_dict(parent),
                block=block_type_mapping[t].from_dict(block_data),  # type: ignore
                **data,
            )
        except KeyError as ke:
            raise KeyError(f"failed to map to associated block type -> {t}: {block_data}") from ke
        except TypeError as te:
            raise TypeError(f"failed to map to associated block type -> {t}: {block_data}") from te

        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Delegate rendering to the payload; ``None`` when there is no payload."""
        if self.block:
            return self.block.get_html()
        return None
b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py @@ -0,0 +1,63 @@ +from .bookmark import Bookmark +from .breadcrumb import Breadcrumb +from .bulleted_list_item import BulletedListItem +from .callout import Callout +from .child_database import ChildDatabase +from .child_page import ChildPage +from .code import Code +from .column_list import Column, ColumnList +from .divider import Divider +from .embed import Embed +from .equation import Equation +from .file import File +from .heading import Heading +from .image import Image +from .link_preview import LinkPreview +from .link_to_page import LinkToPage +from .numbered_list import NumberedListItem +from .paragraph import Paragraph +from .pdf import PDF +from .quote import Quote +from .synced_block import DuplicateSyncedBlock, OriginalSyncedBlock, SyncBlock +from .table import Table, TableRow +from .table_of_contents import TableOfContents +from .template import Template +from .todo import ToDo +from .toggle import Toggle +from .unsupported import Unsupported +from .video import Video + +__all__ = [ + "Bookmark", + "Breadcrumb", + "BulletedListItem", + "Callout", + "ChildDatabase", + "ChildPage", + "Code", + "Column", + "ColumnList", + "Divider", + "Embed", + "Equation", + "File", + "Heading", + "Image", + "LinkPreview", + "LinkToPage", + "NumberedListItem", + "Paragraph", + "PDF", + "Quote", + "SyncBlock", + "OriginalSyncedBlock", + "DuplicateSyncedBlock", + "Table", + "TableRow", + "TableOfContents", + "Template", + "ToDo", + "Toggle", + "Unsupported", + "Video", +] diff --git a/unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py new file mode 100644 index 000000000..6f9e66c2c --- /dev/null +++ b/unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py @@ -0,0 +1,40 @@ +# https://developers.notion.com/reference/block#bookmark +from dataclasses import dataclass, field 
@dataclass
class Bookmark(BlockBase):
    """Bookmark block: a URL with an optional rich-text caption."""

    url: str
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Bookmark``; ``caption`` is removed from ``data`` while parsing."""
        raw_captions = data.pop("caption", [])
        return cls(
            url=data["url"],
            caption=[RichText.from_dict(entry) for entry in raw_captions],
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Render the link and the caption, separated by a line break.

        Returns ``None`` when there is neither a URL nor a caption.
        """
        parts = []
        if self.url:
            parts.append(A([Href(self.url)], self.url))
        if self.caption:
            parts.append(Div([], [rt.get_html() for rt in self.caption]))
        if not parts:
            return None

        # Interleave the rendered parts with one shared <br> separator.
        line_break = Br()
        interleaved = []
        for part in parts:
            if interleaved:
                interleaved.append(line_break)
            interleaved.append(part)
        return Div([], interleaved)
@dataclass
class BulletedListItem(BlockBase):
    """Bulleted list item block, rendered as a single ``<li>``."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the item; ``rich_text`` is removed from ``data`` while parsing."""
        raw_rich_text = data.pop("rich_text", [])
        color = data["color"]
        children = data.get("children", [])
        parsed_rich_text = [RichText.from_dict(rt) for rt in raw_rich_text]
        return cls(color=color, children=children, rich_text=parsed_rich_text)

    @staticmethod
    def can_have_children() -> bool:
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Return an ``<li>`` wrapping the rendered rich-text fragments."""
        fragments = [rt.get_html() for rt in self.rich_text]
        return Li([], fragments)
@dataclass
class Callout(BlockBase):
    """Callout block: optional icon followed by rich text, tinted with ``color``."""

    color: str
    icon: Optional[Union[EmojiIcon, ExternalIcon]] = None
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Callout``; ``rich_text`` and ``icon`` are removed from ``data``.

        A missing or null ``icon`` yields ``icon=None``, matching the field's
        declared default, instead of failing.

        Raises:
            ValueError: if an icon is present but of a type ``Icon`` does not handle.
        """
        rich_text = data.pop("rich_text", [])
        icon_data = data.pop("icon", None)
        return cls(
            color=data["color"],
            icon=Icon.from_dict(icon_data) if icon_data else None,
            rich_text=[RichText.from_dict(rt) for rt in rich_text],
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``<div>`` holding the icon (if any) and the rich text."""
        elements = []
        # Render the icon once and reuse the result.
        icon_html = self.icon.get_html() if self.icon else None
        if icon_html:
            elements.append(icon_html)
        if self.rich_text:
            elements.extend([rt.get_html() for rt in self.rich_text])
        attributes = []
        if self.color:
            attributes.append(Style(f"color:{self.color}"))
        return Div(attributes, elements)
@dataclass
class ChildDatabase(BlockBase):
    """Child database block; only the database title is carried."""

    title: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block directly from ``data``'s keys."""
        return cls(**data)

    @staticmethod
    def can_have_children() -> bool:
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Return the title wrapped in a ``<p>``."""
        title_paragraph = P([], self.title)
        return title_paragraph
@dataclass
class Code(BlockBase):
    """Code block: rich-text source with a language tag and optional caption."""

    language: str
    rich_text: List[RichText] = field(default_factory=list)
    caption: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Code`` block; ``rich_text`` and ``caption`` are removed from ``data``."""
        raw_rich_text = data.pop("rich_text", [])
        raw_caption = data.pop("caption", [])
        return cls(
            language=data["language"],
            rich_text=[RichText.from_dict(rt) for rt in raw_rich_text],
            caption=[RichText.from_dict(c) for c in raw_caption],
        )

    @staticmethod
    def can_have_children() -> bool:
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render the code and the caption, separated by a line break.

        Returns ``None`` when both the code text and the caption are empty.
        """
        parts = []
        if self.rich_text:
            parts.append(HtmlCode([], [rt.get_html() for rt in self.rich_text]))
        if self.caption:
            parts.append(Div([], [rt.get_html() for rt in self.caption]))
        if not parts:
            return None

        # Interleave the rendered parts with one shared <br> separator.
        line_break = Br()
        interleaved = []
        for part in parts:
            if interleaved:
                interleaved.append(line_break)
            interleaved.append(part)
        return Div([], interleaved)
@dataclass
class Divider(BlockBase):
    """Notion ``divider`` block; it stores no data of its own."""

    @classmethod
    def from_dict(cls, data: dict):
        """Return an empty instance; ``data`` is not read."""
        return cls()

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render an ``Hr`` tag carrying a fixed top-border style."""
        border_style = Style("border-top: 3px solid #bbb")
        return Hr([border_style])
@dataclass
class Equation(BlockBase):
    """Notion ``equation`` block holding the expression as plain text."""

    expression: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block by forwarding every key of ``data`` as a keyword argument."""
        return cls(**data)

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render the raw expression string inside a ``Div`` with no attributes."""
        container = Div([], self.expression)
        return container
@dataclass
class File(BlockBase):
    """Notion ``file`` block: an external or hosted file plus a caption."""

    type: str
    external: Optional[External] = None
    file: Optional[FileContent] = None
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``File`` block from its raw payload.

        ``data`` is only read, never modified. Only the sub-object named by
        ``data["type"]`` (``"external"`` or ``"file"``) is parsed; for any
        other type both ``external`` and ``file`` stay ``None``.

        Raises:
            KeyError: if ``"type"`` is missing, or if the sub-object named by
                the type is missing.
        """
        file_type = data["type"]
        block = cls(
            type=file_type,
            caption=[RichText.from_dict(rt) for rt in data.get("caption", [])],
        )
        if file_type == "external":
            block.external = External.from_dict(data["external"])
        elif file_type == "file":
            block.file = FileContent.from_dict(data["file"])
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Render links to the file URL(s) and the caption, separated by line breaks.

        Returns:
            A ``Div`` wrapping the non-empty parts, or ``None`` when there is
            nothing to render.
        """
        sections = []
        if self.file:
            sections.append(A([Href(self.file.url)], self.file.url))
        if self.external:
            sections.append(A([Href(self.external.url)], self.external.url))
        if self.caption:
            sections.append(Div([], [rt.get_html() for rt in self.caption]))
        if not sections:
            return None
        # Lay out as: section, Br, section, ... by filling every even slot.
        joined = [Br()] * (len(sections) * 2 - 1)
        joined[0::2] = sections

        return Div([], joined)
class Image(BlockBase, FileObject):
    """Notion ``image`` block; the file fields come from ``FileObject``."""

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render an ``Img`` tag for the external source, else the hosted file.

        Returns ``None`` when neither source is set.
        """
        source = self.external or self.file
        if not source:
            return None
        return Img([Src(source.url)], [])
@dataclass
class LinkToPage(BlockBase):
    """Notion ``link_to_page`` block referencing a page or a database by id."""

    type: str
    page_id: Optional[str] = None
    database_id: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block by forwarding every key of ``data`` as a keyword argument."""
        return cls(**data)

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render the first non-empty id (page first, then database) in a ``Div``.

        Returns ``None`` when neither id is set.
        """
        for target_id in (self.page_id, self.database_id):
            if target_id:
                return Div([], target_id)
        return None
@dataclass
class Paragraph(BlockBase):
    """Notion ``paragraph`` block: a color, raw children and rich text."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from ``data``.

        ``"rich_text"`` is removed from ``data`` first so the remaining keys
        can be forwarded to the constructor; it is parsed afterwards.
        """
        raw_texts = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = [RichText.from_dict(raw) for raw in raw_texts]
        return block

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type may have child blocks."""
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Render the rich text in a ``Div``; an empty paragraph becomes a ``Br``."""
        if self.rich_text:
            return Div([], [rt.get_html() for rt in self.rich_text])
        return Br()
@dataclass
class PDF(BlockBase):
    """Notion ``pdf`` block: an external or hosted PDF plus a caption."""

    type: str
    caption: List[RichText] = field(default_factory=list)
    external: Optional[External] = None
    file: Optional[File] = None

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``PDF`` block from its raw payload.

        ``data`` is only read, never modified. Only the sub-object named by
        ``data["type"]`` (``"external"`` or ``"file"``) is parsed; for any
        other type both ``external`` and ``file`` stay ``None``.

        Raises:
            KeyError: if ``"type"`` is missing, or if the sub-object named by
                the type is missing.
        """
        pdf_type = data["type"]
        pdf = cls(type=pdf_type)
        pdf.caption = [RichText.from_dict(c) for c in data.get("caption", [])]
        if pdf_type == "external":
            pdf.external = External.from_dict(data["external"])
        elif pdf_type == "file":
            pdf.file = File.from_dict(data["file"])
        return pdf

    def get_html(self) -> Optional[HtmlTag]:
        """Render links to the PDF URL(s) and the caption, separated by line breaks.

        Returns:
            A ``Div`` wrapping the non-empty parts, or ``None`` when there is
            nothing to render.
        """
        sections = []
        if self.external:
            sections.append(A([Href(self.external.url)], self.external.url))
        if self.file:
            sections.append(A([Href(self.file.url)], self.file.url))
        if self.caption:
            sections.append(Div([], [rt.get_html() for rt in self.caption]))
        if not sections:
            return None
        # Lay out as: section, Br, section, ... by filling every even slot.
        joined = [Br()] * (len(sections) * 2 - 1)
        joined[0::2] = sections

        return Div([], joined)
@dataclass
class OriginalSyncedBlock(BlockBase):
    """The original (source) side of a Notion ``synced_block``."""

    synced_from: Optional[str] = None
    children: List[dict] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type may have child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from its raw payload.

        Only ``"children"`` is read. A payload without inline children (or
        with ``children`` set to ``None``) yields an empty list instead of
        raising ``KeyError``.
        """
        # NOTE(review): presumably retrieved blocks omit inline children and
        # expose them through a separate call - confirm against the caller.
        return cls(children=data.get("children") or [])

    def get_html(self) -> Optional[HtmlTag]:
        """Return ``None``: the block itself renders nothing."""
        return None
class SyncBlock(BlockBase):
    """Dispatcher that turns a ``synced_block`` payload into the right concrete class."""

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type may have child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Return an ``OriginalSyncedBlock`` or a ``DuplicateSyncedBlock`` for ``data``.

        Dispatch is on the *value* of ``"synced_from"``, not just its presence:

        * a non-empty ``synced_from`` object is a reference to another block,
          so it is parsed as a ``DuplicateSyncedBlock``;
        * ``synced_from`` present but empty/``None`` marks the original block;
        * a payload without the key is handed to ``DuplicateSyncedBlock``
          as-is, which keeps the previous behavior for flat dicts.
        """
        # NOTE(review): the Notion block reference describes originals as
        # ``synced_from: null`` and duplicates as ``synced_from: {type,
        # block_id}`` - confirm against live API responses.
        synced_from = data.get("synced_from")
        if synced_from:
            return DuplicateSyncedBlock.from_dict(synced_from)
        if "synced_from" in data:
            return OriginalSyncedBlock.from_dict(data)
        return DuplicateSyncedBlock.from_dict(data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return ``None``: the dispatcher itself renders nothing."""
        return None
# https://developers.notion.com/reference/block#table-rows
@dataclass
class TableRow(BlockBase):
    """Notion ``table_row`` block: an ordered list of cells."""

    is_header: bool = False
    cells: List[TableCell] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the row, wrapping each entry of ``data["cells"]`` as a ``TableCell``.

        A missing ``"cells"`` key yields a row without cells; ``is_header``
        keeps its default.
        """
        parsed_cells = []
        for raw_cell in data.get("cells", []):
            parsed_cells.append(TableCell.from_dict({"rich_texts": raw_cell}))
        return cls(cells=parsed_cells)

    def get_html(self) -> Optional[HtmlTag]:
        """Render a ``Tr`` whose cells are rendered as header or data cells per ``is_header``."""
        rendered_cells = [cell.get_html(is_header=self.is_header) for cell in self.cells]
        return Tr([], rendered_cells)
@dataclass
class Template(BlockBase):
    """Notion ``template`` block: raw children and rich text."""

    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from ``data``.

        ``"rich_text"`` is removed from ``data`` first so the remaining keys
        can be forwarded to the constructor; it is parsed afterwards.
        """
        raw_texts = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = [RichText.from_dict(raw) for raw in raw_texts]
        return block

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type may have child blocks."""
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Render the rich text in a ``Div``, or return ``None`` when there is none."""
        if self.rich_text:
            return Div([], [rt.get_html() for rt in self.rich_text])
        return None
@dataclass
class Toggle(BlockBase):
    """Notion ``toggle`` block: a color, raw children and rich text."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from ``data``.

        ``"rich_text"`` is removed from ``data`` first so the remaining keys
        can be forwarded to the constructor; it is parsed afterwards.
        """
        raw_texts = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = [RichText.from_dict(raw) for raw in raw_texts]
        return block

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type may have child blocks."""
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Render the rich text in a ``Div``, colored unless the color is ``"default"``.

        Returns ``None`` when there is no rich text.
        """
        if not self.rich_text:
            return None

        has_custom_color = bool(self.color) and self.color != "default"
        attributes = [Style(f"color: {self.color}")] if has_custom_color else []
        rendered = [rt.get_html() for rt in self.rich_text]
        return Div(attributes, rendered)
class Video(BlockBase, FileObject):
    """Notion ``video`` block; the file fields come from ``FileObject``.

    Reference: https://developers.notion.com/reference/block#video
    """

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render a video tag with one ``Source`` for the external URL, else the hosted file.

        Returns ``None`` when neither source is set.
        """
        media = self.external or self.file
        if not media:
            return None
        return VideoHtml([], [Source([Src(media.url)], [media.url])])
@dataclass
class Database(FromJSONMixin, GetHTMLMixin):
    """Notion database object with its metadata, property schema and title/description."""

    id: str
    created_time: str
    created_by: PartialUser
    last_edited_time: str
    last_edited_by: PartialUser
    archived: bool
    in_trash: bool
    parent: Parent
    url: str
    is_inline: bool
    public_url: str
    request_id: Optional[str] = None
    properties: Dict[str, DBPropertyBase] = field(default_factory=dict)
    title: List[RichText] = field(default_factory=list)
    description: List[RichText] = field(default_factory=list)
    icon: Optional[FileObject] = None
    cover: Optional[FileObject] = None
    object: str = "database"

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Database`` from its raw payload.

        Nested entries are removed from ``data`` and parsed into their own
        types; whatever is left in ``data`` is forwarded to the constructor
        unchanged. ``icon`` and ``cover`` are parsed only when non-empty.

        Raises:
            KeyError: if any of ``created_by``, ``last_edited_by``, ``icon``,
                ``cover``, ``parent``, ``title`` or ``description`` is missing.
        """
        raw_created_by = data.pop("created_by")
        raw_last_edited_by = data.pop("last_edited_by")
        raw_icon = data.pop("icon")
        raw_cover = data.pop("cover")
        raw_parent = data.pop("parent")
        raw_title = data.pop("title")
        raw_description = data.pop("description")

        properties = map_properties(data.pop("properties", {}))
        created_by = PartialUser.from_dict(raw_created_by)
        last_edited_by = PartialUser.from_dict(raw_last_edited_by)
        icon = FileObject.from_dict(raw_icon) if raw_icon else None
        cover = FileObject.from_dict(raw_cover) if raw_cover else None
        parent = Parent.from_dict(raw_parent)
        title = [RichText.from_dict(data=r) for r in raw_title]
        description = [RichText.from_dict(data=r) for r in raw_description]

        return cls(
            properties=properties,
            created_by=created_by,
            last_edited_by=last_edited_by,
            icon=icon,
            cover=cover,
            parent=parent,
            title=title,
            description=description,
            **data,
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Render title and description as ``Span`` tags inside one ``Div``.

        Empty parts are skipped; returns ``None`` when both are empty.
        """
        spans = [
            Span([], [rt.get_html() for rt in texts])
            for texts in (self.title, self.description)
            if texts
        ]
        return Div([], spans) if spans else None
def map_cells(props: Dict[str, dict]) -> Dict[str, DBCellBase]:
    """Convert raw cell payloads into cell objects, keyed by the same names.

    Each payload's ``"type"`` selects the class in ``db_cell_type_mapping``
    whose ``from_dict`` parses it.

    Raises:
        KeyError: if a payload has no ``"type"``, the type is not in the
            mapping, or parsing itself raises ``KeyError``; the message names
            the offending key and payload.
    """
    cells = {}
    for name, raw in props.items():
        try:
            cell_cls = db_cell_type_mapping[raw["type"]]
            cells[name] = cell_cls.from_dict(raw)  # type: ignore
        except KeyError as ke:
            raise KeyError(
                f"failed to map to associated database property -> {name}: {raw}"
            ) from ke

    return cells
@dataclass
class CheckboxCell(DBCellBase):
    """Value of a ``checkbox`` property in one database row."""

    id: str
    checkbox: bool
    name: Optional[str] = None
    type: str = "checkbox"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by forwarding every key of ``data`` as a keyword argument."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render a checkbox ``Input`` inside a ``Div``, checked when the value is truthy."""
        input_attributes = [Type("checkbox")]
        if self.checkbox:
            input_attributes.append(Checked(""))
        checkbox_input = Input(input_attributes)
        return Div([], checkbox_input)
@dataclass
class CreatedTimeCell(DBCellBase):
    """Cell value of a created-time property; the timestamp is kept as a string."""

    id: str
    created_time: str
    type: str = "created_time"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by passing every key of ``data`` to the constructor."""
        cell = cls(**data)
        return cell

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` whose content is the stored ``created_time`` string."""
        timestamp = self.created_time
        return Div([], timestamp)
@dataclass
class DateCell(DBCellBase):
    """Cell value of a date property; ``date`` is ``None`` when the cell is empty."""

    id: str
    date: Optional[DateType] = None
    name: Optional[str] = None
    type: str = "date"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell, parsing the ``"date"`` entry only when it is non-empty.

        Raises:
            KeyError: If ``data`` has no ``"date"`` key.
        """
        raw_date = data.pop("date")
        parsed = DateType.from_dict(raw_date) if raw_date else None
        return cls(date=parsed, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Delegate rendering to the parsed date, or return ``None`` if unset."""
        if not self.date:
            return None
        return self.date.get_html()
@dataclass
class FilesCell(DBCellBase):
    """Cell value of a files property: a list of parsed ``FileObject`` entries."""

    id: str
    files: List[FileObject]
    type: str = "files"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell, parsing each entry of ``"files"`` (default: empty list)."""
        return cls(files=[FileObject.from_dict(f) for f in data.pop("files", [])], **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render every file that has HTML output inside a single ``Div``.

        Returns:
            ``None`` when there are no files, or when none of them renders;
            otherwise a ``Div`` holding the rendered children.
        """
        # A file's get_html() is typed Optional; skip entries that render to
        # nothing rather than handing None children to Div.
        rendered = [html for f in self.files if (html := f.get_html()) is not None]
        if not rendered:
            return None
        return Div([], rendered)
@dataclass
class FormulaCell(DBCellBase):
    """Cell holding the computed result of a formula property.

    ``formula`` is kept as the raw dict; ``get_html`` reads its ``"type"`` entry
    and renders the value stored under that same key.
    """

    id: str
    formula: dict
    type: str = "formula"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by passing every key of ``data`` to the constructor."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the formula result as text inside a ``Div``.

        Returns:
            ``None`` when the payload carries no result (no ``"type"`` entry, or
            the value under that type is missing or ``None``); otherwise a
            ``Div`` containing ``str(value)``.
        """
        formula = self.formula
        # Use .get() for both lookups: a missing type or result means an empty
        # cell, not a KeyError, and a None result must not be emitted as the
        # literal text "None". Falsy-but-real results (0, False, "") still render.
        value = formula.get(formula.get("type"))
        if value is None:
            return None
        return Div([], str(value))
@dataclass
class LastEditedTimeCell(DBCellBase):
    """Cell value of a last-edited-time property; the timestamp is kept as a string."""

    id: str
    last_edited_time: str
    type: str = "last_edited_time"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by passing every key of ``data`` to the constructor."""
        cell = cls(**data)
        return cell

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` whose content is the stored ``last_edited_time`` string."""
        timestamp = self.last_edited_time
        return Div([], timestamp)
@dataclass
class MultiSelectCell(DBCellBase):
    """Cell value of a multi-select property: the list of chosen options."""

    id: str
    multi_select: List[MultiSelectOption]
    type: str = "multi_select"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell, parsing each entry of ``"multi_select"`` (default: empty)."""
        options = []
        for raw_option in data.pop("multi_select", []):
            options.append(MultiSelectOption.from_dict(raw_option))
        return cls(multi_select=options, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render each chosen option as a ``Span`` inside one ``Div``.

        An option whose color is set and is not ``"default"`` gets an inline
        ``color`` style. Returns ``None`` when no option is selected.
        """
        if not self.multi_select:
            return None

        def _as_span(option: MultiSelectOption) -> HtmlTag:
            has_color = option.color and option.color != "default"
            attributes = [Style(f"color: {option.color}")] if has_color else []
            return Span(attributes, option.name)

        return Div([], [_as_span(option) for option in self.multi_select])
@dataclass
class NumberCell(DBCellBase):
    """Cell value of a number property; ``number`` is ``None`` when the cell is empty."""

    id: str
    # Annotated as float (which also admits int) so decimal values are covered.
    number: Optional[float] = None
    type: str = "number"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by passing every key of ``data`` to the constructor."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the number as text inside a ``Div``.

        Returns:
            ``None`` only when no number is set; a stored ``0`` is rendered as
            ``"0"`` rather than being treated as an empty cell.
        """
        # Compare against None explicitly: a truthiness test would drop zero.
        if self.number is None:
            return None
        return Div([], str(self.number))
@dataclass
class PhoneNumberCell(DBCellBase):
    """Cell value of a phone-number property, stored as an optional string."""

    id: str
    phone_number: Optional[str]
    name: Optional[str] = None
    type: str = "phone_number"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by passing every key of ``data`` to the constructor."""
        cell = cls(**data)
        return cell

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` with the phone number, or ``None`` when it is empty."""
        if not self.phone_number:
            return None
        return Div([], self.phone_number)
@dataclass
class RelationProp(FromJSONMixin):
    """Configuration of a relation property.

    ``type`` names which of the two payload fields is populated:
    ``dual_property`` (parsed into ``DualProperty``) or ``single_property``
    (kept as the raw dict).
    """

    database_id: str
    type: str
    # Both payloads default to None so that either relation kind can be built;
    # positional and keyword construction used before this change still work.
    dual_property: Optional[DualProperty] = None
    single_property: Optional[dict] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the relation config from its raw dict.

        The entry named by ``data["type"]`` is popped and parsed; the remaining
        keys are passed to the constructor unchanged.

        Raises:
            ValueError: If ``"type"`` is neither ``"dual_property"`` nor
                ``"single_property"``.
            KeyError: If ``"type"`` is ``"dual_property"`` but that entry is absent.
        """
        t = data.get("type")
        if t == "dual_property":
            return cls(dual_property=DualProperty.from_dict(data.pop(t)), **data)
        if t == "single_property":
            # One-way relations previously raised ValueError here; their payload
            # is stored as-is (an absent or null entry becomes an empty dict).
            return cls(single_property=data.pop(t, None) or {}, **data)
        raise ValueError(f"{t} type not recognized")
@dataclass
class RichTextCell(DBCellBase):
    """Cell value of a rich-text property: a list of parsed rich-text fragments."""

    id: str
    rich_text: List[RichTextType]
    name: Optional[str] = None
    type: str = "rich_text"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell, parsing each entry of ``"rich_text"`` (default: empty)."""
        fragments = []
        for raw_fragment in data.pop("rich_text", []):
            fragments.append(RichTextType.from_dict(raw_fragment))
        return cls(rich_text=fragments, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Wrap each fragment's HTML in a ``Span`` and return them in one ``Div``.

        Returns ``None`` when the cell has no fragments.
        """
        if not self.rich_text:
            return None
        wrapped = []
        for fragment in self.rich_text:
            wrapped.append(Span([], fragment.get_html()))
        return Div([], wrapped)
@dataclass
class Select(DBPropertyBase):
    """Schema definition of a select property, including its option list."""

    id: str
    name: str
    select: SelectProp
    type: str = "select"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the property, parsing the ``"select"`` entry into ``SelectProp``.

        A missing ``"select"`` entry is treated as an empty dict; the remaining
        keys of ``data`` are passed to the constructor unchanged.
        """
        raw_select = data.pop("select", {})
        parsed_select = SelectProp.from_dict(raw_select)
        return cls(select=parsed_select, **data)
@dataclass
class StatusOption(FromJSONMixin):
    """One selectable value of a status property."""

    color: str
    id: str
    name: str
    description: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the option by passing every key of ``data`` to the constructor."""
        return cls(**data)


@dataclass
class StatusGroup(FromJSONMixin):
    """A named group of status options, referenced by their ids."""

    color: str
    id: str
    name: str
    # default_factory must be a zero-argument callable producing the default;
    # the subscripted typing alias List[str] is an annotation, not a factory.
    option_ids: List[str] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the group by passing every key of ``data`` to the constructor."""
        return cls(**data)


@dataclass
class StatusProp(FromJSONMixin):
    """Configuration of a status property: its options and option groups."""

    options: List[StatusOption] = field(default_factory=list)
    groups: List[StatusGroup] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Parse the ``"options"`` and ``"groups"`` lists (each defaults to empty)."""
        return cls(
            options=[StatusOption.from_dict(o) for o in data.get("options", [])],
            groups=[StatusGroup.from_dict(g) for g in data.get("groups", [])],
        )


@dataclass
class Status(DBPropertyBase):
    """Schema definition of a status property."""

    id: str
    name: str
    status: StatusProp
    type: str = "status"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the property, parsing the ``"status"`` entry into ``StatusProp``."""
        return cls(status=StatusProp.from_dict(data.pop("status", {})), **data)


@dataclass
class StatusCell(DBCellBase):
    """Cell value of a status property; ``status`` is ``None`` when unset."""

    id: str
    status: Optional[StatusOption]
    type: str = "status"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell, parsing ``"status"`` only when it is non-empty.

        A missing, ``None`` or empty ``"status"`` entry yields ``status=None``
        instead of failing inside ``StatusOption.from_dict``.
        """
        status_data = data.pop("status", None)
        status = StatusOption.from_dict(status_data) if status_data else None
        return cls(status=status, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the status name in a ``Div``, colored unless the color is default.

        Returns ``None`` when no status is set.
        """
        if status := self.status:
            select_attr = []
            if status.color and status.color != "default":
                select_attr.append(Style(f"color: {status.color}"))
            return Div(select_attr, status.name)
        return None
@dataclass
class UniqueIDCell(DBCellBase):
    """Cell value of a unique-id property; ``unique_id`` is ``None`` when unset."""

    id: str
    unique_id: Optional[UniqueIDCellData]
    # Was "title" (copy-paste slip); every other cell defaults to its own type key.
    type: str = "unique_id"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell, parsing ``"unique_id"`` only when it is non-empty.

        Raises:
            KeyError: If ``data`` has no ``"unique_id"`` key.
        """
        unique_id_data = data.pop("unique_id")
        # The field is Optional and get_html() already handles None, so an
        # empty payload must not be forwarded to UniqueIDCellData.from_dict.
        unique_id = UniqueIDCellData.from_dict(unique_id_data) if unique_id_data else None
        return cls(unique_id=unique_id, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render ``"<prefix>-<number>"`` in a ``Div``, or ``None`` when unset."""
        if unique_id := self.unique_id:
            # NOTE(review): a None prefix renders as the text "None-<number>";
            # left unchanged because existing expected output appears to rely
            # on it, so confirm before altering.
            return Div([], f"{unique_id.prefix}-{unique_id.number}")
        return None
@dataclass
class URLCell(DBCellBase):
    """Cell value of a URL property, stored as an optional string."""

    id: str
    url: Optional[str] = None
    name: Optional[str] = None
    type: str = "url"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by passing every key of ``data`` to the constructor."""
        cell = cls(**data)
        return cell

    def get_html(self) -> Optional[HtmlTag]:
        """Return an anchor whose href and text are the URL, or ``None`` if empty."""
        if not self.url:
            return None
        link_target = self.url
        return A([Href(link_target)], link_target)
@dataclass
class VerificationCell(DBCellBase):
    """Cell value of a verification property; ``verification`` may be ``None``."""

    id: str
    verification: Optional[VerificationData]
    name: Optional[str] = None
    type: str = "verification"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell, parsing ``"verification"`` only when it is non-empty.

        Raises:
            KeyError: If ``data`` has no ``"verification"`` key.
        """
        verification_data = data.pop("verification")
        # The field is Optional and get_html() already handles None, so an
        # empty payload must not be forwarded to VerificationData.from_dict
        # (which would call .pop() on it).
        verification = (
            VerificationData.from_dict(verification_data) if verification_data else None
        )
        return cls(verification=verification, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the cell name and the verification details inside one ``Div``.

        Returns:
            ``None`` when there is neither a name nor any verification HTML.
        """
        elements = []
        if name := self.name:
            elements.append(Span([], name))
        if (verification := self.verification) and (verification_html := verification.get_html()):
            elements.append(verification_html)

        if elements:
            return Div([], elements)
        return None
@dataclass
class FileObject(FromJSONMixin, GetHTMLMixin):
    """A file reference that is either externally hosted or a ``file`` payload.

    ``type`` records which kind was parsed; at most one of ``external`` and
    ``file`` is populated by ``from_dict``.
    """

    type: str
    external: Optional[External] = None
    file: Optional[File] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the object, parsing only the payload named by ``data["type"]``.

        Types other than ``"external"`` and ``"file"`` produce an object with
        both payload fields left as ``None``.

        Raises:
            KeyError: If ``"type"`` is missing, or the payload it names is missing.
        """
        kind = data["type"]
        external = External.from_dict(data["external"]) if kind == "external" else None
        file = File.from_dict(data["file"]) if kind == "file" else None
        return cls(type=kind, external=external, file=file)

    def get_html(self) -> Optional[HtmlTag]:
        """Return an anchor to the file URL, preferring ``file`` over ``external``.

        Returns ``None`` when neither payload is set.
        """
        source = self.file or self.external
        if not source:
            return None
        return A([Href(source.url)], source.url)
# unstructured_ingest/v2/processes/connectors/notion/types/page.py (continued)
from dataclasses import dataclass
from typing import Optional

from unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin
from unstructured_ingest.v2.processes.connectors.notion.types.file import FileObject
from unstructured_ingest.v2.processes.connectors.notion.types.parent import Parent
from unstructured_ingest.v2.processes.connectors.notion.types.user import PartialUser


@dataclass
class Page(FromJSONMixin):
    """Typed view of a Notion page object."""

    id: str
    created_time: str
    created_by: PartialUser
    last_edited_time: str
    last_edited_by: PartialUser
    archived: bool
    in_trash: bool
    properties: dict
    parent: Parent
    url: str
    # The Notion page reference documents this value as nullable.
    public_url: Optional[str]
    request_id: Optional[str] = None
    object: str = "page"
    icon: Optional[FileObject] = None
    cover: Optional[FileObject] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Page`` from the raw mapping without modifying ``data``.

        ``created_by``, ``last_edited_by`` and ``parent`` are required and are
        converted to their typed counterparts; ``icon`` and ``cover`` are
        converted when present and non-empty. All remaining keys are passed to
        the constructor unchanged.

        Raises:
            KeyError: if ``created_by``, ``last_edited_by`` or ``parent`` is missing.
            TypeError: if ``data`` holds keys that are not fields of ``Page``.
        """
        # Work on a shallow copy so the caller's dict keeps all of its keys.
        payload = dict(data)
        created_by = payload.pop("created_by")
        last_edited_by = payload.pop("last_edited_by")
        parent = payload.pop("parent")
        # Both fields default to None on the dataclass, so absence is tolerated.
        icon = payload.pop("icon", None)
        cover = payload.pop("cover", None)
        return cls(
            created_by=PartialUser.from_dict(created_by),
            last_edited_by=PartialUser.from_dict(last_edited_by),
            icon=FileObject.from_dict(icon) if icon else None,
            cover=FileObject.from_dict(cover) if cover else None,
            parent=Parent.from_dict(parent),
            **payload,
        )


# unstructured_ingest/v2/processes/connectors/notion/types/parent.py
# (new file in this patch, mode 100644, blob 8adf6a39a)
# https://developers.notion.com/reference/parent-object
from dataclasses import dataclass

from unstructured_ingest.v2.processes.connectors.notion.interfaces import FromJSONMixin


# https://developers.notion.com/reference/parent-object#database-parent
@dataclass
class DatabaseParent(FromJSONMixin):
    """Parent reference identified by ``database_id``."""

    database_id: str
    type: str = "database_id"

    @classmethod
    def from_dict(cls, data: dict):
        """Read ``database_id`` from ``data``; other keys are ignored."""
        return cls(database_id=data["database_id"])


# https://developers.notion.com/reference/parent-object#page-parent
@dataclass
class PageParent(FromJSONMixin):
    """Parent reference identified by ``page_id``."""

    page_id: str
    type: str = "page_id"

    @classmethod
    def from_dict(cls, data: dict):
        """Read ``page_id`` from ``data``; other keys are ignored."""
        return cls(page_id=data["page_id"])


# https://developers.notion.com/reference/parent-object#workspace-parent
@dataclass
class WorkspaceParent(FromJSONMixin):
    """Parent reference for workspace-level objects."""

    type: str = "workspace"
    workspace: bool = True

    @classmethod
    def from_dict(cls, data: dict):
        """Return an instance with default field values; ``data`` is not read."""
        return cls()


# https://developers.notion.com/reference/parent-object#block-parent
@dataclass
class BlockParent(FromJSONMixin):
    """Parent reference identified by ``block_id``."""

    block_id: str
    type: str = "block_id"

    @classmethod
    def from_dict(cls, data: dict):
        """Read ``block_id`` from ``data``; other keys are ignored."""
        return cls(block_id=data["block_id"])


@dataclass
class Parent(FromJSONMixin):
    """Entry point that parses a parent mapping into a concrete parent class.

    ``from_dict`` never returns a ``Parent`` instance itself; it returns one of
    ``DatabaseParent``, ``PageParent``, ``WorkspaceParent`` or ``BlockParent``.
    """

    block_id: str
    type: str = "block_id"

    @classmethod
    def from_dict(cls, data: dict):
        """Dispatch on ``data["type"]`` to the matching parent class.

        Returns:
            The parsed parent object, or ``None`` when the type is not one of
            ``database_id``, ``page_id``, ``workspace`` or ``block_id``.

        Raises:
            KeyError: if ``data`` has no ``type`` key.
        """
        parsers = {
            "database_id": DatabaseParent,
            "page_id": PageParent,
            "workspace": WorkspaceParent,
            "block_id": BlockParent,
        }
        parser = parsers.get(data["type"])
        if parser is None:
            # Unknown parent types are reported as "no parent" rather than failing.
            return None
        return parser.from_dict(data)


# unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py
# (new file in this patch, mode 100644, blob 3764b177c)
# https://developers.notion.com/reference/rich-text
from dataclasses import dataclass
from typing import Optional

from htmlBuilder.attributes import Href, Style
from htmlBuilder.tags import A, B, Code, Div, HtmlTag, I, S, Span, U
from htmlBuilder.tags import Text as HtmlText

from unstructured_ingest.v2.processes.connectors.notion.interfaces import (
    FromJSONMixin,
    GetHTMLMixin,
)
from unstructured_ingest.v2.processes.connectors.notion.types.date import Date
from unstructured_ingest.v2.processes.connectors.notion.types.user import People


@dataclass
class Annotations(FromJSONMixin):
    """Styling flags and color attached to a rich-text fragment."""

    bold: bool
    code: bool
    italic: bool
    strikethrough: bool
    underline: bool
    color: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance, using the keys of ``data`` as field names."""
        return cls(**data)


@dataclass
class Equation(FromJSONMixin, GetHTMLMixin):
    """Equation payload of a rich-text fragment."""

    expression: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance, using the keys of ``data`` as field names."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return the expression wrapped in ``<code>``, or ``None`` when it is empty."""
        return Code([], self.expression) if self.expression else None


@dataclass
class MentionDatabase(FromJSONMixin, GetHTMLMixin):
    """Database mention, identified by ``id``."""

    id: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance, using the keys of ``data`` as field names."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return the id wrapped in ``<div>``, or ``None`` when it is empty."""
        return Div([], self.id) if self.id else None


@dataclass
class MentionLinkPreview(FromJSONMixin, GetHTMLMixin):
    """Link-preview mention, identified by ``url``."""

    url: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance, using the keys of ``data`` as field names."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return an anchor to the URL, or ``None`` when it is empty."""
        return A([Href(self.url)], self.url) if self.url else None


@dataclass
class MentionPage(FromJSONMixin, GetHTMLMixin):
    """Page mention, identified by ``id``."""

    id: str

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance, using the keys of ``data`` as field names."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return the id wrapped in ``<div>``, or ``None`` when it is empty."""
        return Div([], self.id) if self.id else None


@dataclass
class MentionTemplate(FromJSONMixin):
    """Template mention carrying a date or a user placeholder."""

    template_mention_date: Optional[str] = None
    template_mention_user: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Read the two template fields from ``data``.

        The Notion rich-text reference shows this payload with a ``type``
        discriminator plus only one of the two fields, so each field is read
        individually and any other key is ignored.
        """
        return cls(
            template_mention_date=data.get("template_mention_date"),
            template_mention_user=data.get("template_mention_user"),
        )


@dataclass
class Mention(FromJSONMixin, GetHTMLMixin):
    """Mention tagged by ``type``, with at most one typed payload populated."""

    type: str
    database: Optional[MentionDatabase] = None
    date: Optional[Date] = None
    link_preview: Optional[MentionLinkPreview] = None
    page: Optional[MentionPage] = None
    template_mention: Optional[MentionTemplate] = None
    user: Optional[People] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Parse ``data`` and populate the payload that matches ``data["type"]``.

        Unrecognized types yield a ``Mention`` with no payload set.
        """
        t = data["type"]
        mention = cls(type=t)
        if t == "date":
            mention.date = Date.from_dict(data["date"])
        elif t == "database":
            mention.database = MentionDatabase.from_dict(data["database"])
        elif t == "link_preview":
            mention.link_preview = MentionLinkPreview.from_dict(data["link_preview"])
        elif t == "page":
            mention.page = MentionPage.from_dict(data["page"])
        elif t == "template_mention":
            mention.template_mention = MentionTemplate.from_dict(data["template_mention"])
        elif t == "user":
            mention.user = People.from_dict(data["user"])

        return mention

    def get_html(self) -> Optional[HtmlTag]:
        """Return the HTML of the payload selected by ``type``.

        Returns ``None`` when that payload is unset, when the type is
        ``template_mention`` (it has no HTML rendering), or when the type is
        unrecognized.
        """
        t = self.type
        if t == "date":
            return self.date.get_html() if self.date else None
        elif t == "database":
            return self.database.get_html() if self.database else None
        elif t == "link_preview":
            return self.link_preview.get_html() if self.link_preview else None
        elif t == "page":
            return self.page.get_html() if self.page else None
        elif t == "user":
            return self.user.get_html() if self.user else None
        return None


@dataclass
class Text(FromJSONMixin):
    """Text payload of a rich-text fragment."""

    content: str
    link: Optional[dict]

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance, using the keys of ``data`` as field names."""
        return cls(**data)


@dataclass
class RichText(FromJSONMixin, GetHTMLMixin):
    """Rich-text fragment with plain text, annotations and a typed payload."""

    type: str
    plain_text: str
    annotations: Optional[Annotations] = None
    href: Optional[str] = None
    text: Optional[Text] = None
    mention: Optional[Mention] = None
    equation: Optional[Equation] = None

    def get_html(self) -> Optional[HtmlTag]:
        """Render ``plain_text`` as HTML, applying the link and annotations.

        The text is wrapped in an anchor when ``href`` is set, then in one tag
        per enabled annotation (bold, code, italic, strikethrough, underline,
        in that order). A non-default color is added as an inline ``color``
        style on the outermost tag.
        """
        text = HtmlText(self.plain_text)
        if self.href:
            text = A([Href(self.href)], text)
        if self.annotations:
            annotations = self.annotations
            if annotations.bold:
                text = B([], text)
            if annotations.code:
                text = Code([], text)
            if annotations.italic:
                text = I([], text)
            if annotations.strikethrough:
                text = S([], text)
            if annotations.underline:
                text = U([], text)
            if annotations.color and annotations.color != "default":
                # A bare text node is wrapped in a span so the style has a tag to sit on.
                if isinstance(text, HtmlText):
                    text = Span([], text)
                text.attributes.append(Style(f"color:{annotations.color}"))
        return text

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``RichText`` from the raw mapping without modifying ``data``.

        ``annotations`` is parsed when present; the payload matching
        ``data["type"]`` (``text``, ``mention`` or ``equation``) replaces the
        raw mapping with its typed object.

        Raises:
            KeyError: if ``type`` or the payload named by it is missing.
            TypeError: if ``data`` holds keys that are not fields of ``RichText``.
        """
        # Work on a shallow copy so the caller's dict keeps its ``annotations``.
        payload = dict(data)
        kind = payload["type"]
        raw_annotations = payload.pop("annotations", None)
        rich_text = cls(
            annotations=Annotations.from_dict(raw_annotations) if raw_annotations else None,
            **payload,
        )
        # The raw payload passed through ``**payload`` is replaced by its typed form.
        if kind == "text":
            rich_text.text = Text.from_dict(payload["text"])
        elif kind == "mention":
            rich_text.mention = Mention.from_dict(payload["mention"])
        elif kind == "equation":
            rich_text.equation = Equation.from_dict(payload["equation"])

        return rich_text


# unstructured_ingest/v2/processes/connectors/notion/types/user.py
# (new file in this patch, mode 100644, blob 38417a9b3)
# https://developers.notion.com/reference/user
from dataclasses import dataclass, field
from typing import Optional

from htmlBuilder.attributes import Href
from htmlBuilder.tags import A, Div, HtmlTag

from unstructured_ingest.v2.processes.connectors.notion.interfaces import (
    FromJSONMixin,
    GetHTMLMixin,
)


@dataclass
class PartialUser(FromJSONMixin):
    """User reference that keeps only the ``id``."""

    id: str
    object: str = "user"

    @classmethod
    def from_dict(cls, data: dict):
        """Read ``id`` from ``data``; other keys are ignored."""
        return cls(id=data["id"])


@dataclass
class User(FromJSONMixin, GetHTMLMixin):
    """User with optional name and avatar."""

    # PartialUser above stores the same key as the string "user".
    object: str
    id: str
    type: Optional[str] = None
    name: Optional[str] = None
    avatar_url: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance, using the keys of ``data`` as field names."""
        return cls(**data)

    def get_text(self) -> Optional[str]:
        """Return the name, as a Markdown link to the avatar when one is set."""
        text = self.name
        if self.avatar_url:
            text = f"[{text}]({self.avatar_url})"
        return text

    def get_html(self) -> Optional[HtmlTag]:
        """Return the name as an anchor to the avatar, or in a ``<div>`` without one."""
        if self.avatar_url:
            return A([Href(self.avatar_url)], self.name)
        else:
            return Div([], self.name)


@dataclass
class People(User):
    """User that additionally carries a ``person`` mapping."""

    person: dict = field(default_factory=dict)


@dataclass
class Bots(FromJSONMixin, GetHTMLMixin):
    """Bot user with its ``bot``/``owner`` mappings and workspace name."""

    # PartialUser above stores the same key as the string "user".
    object: str
    id: str
    bot: dict
    owner: dict
    type: str
    workspace_name: str
    name: Optional[str] = None
    avatar_url: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance, using the keys of ``data`` as field names."""
        return cls(**data)

    def get_text(self) -> Optional[str]:
        """Return the name, as a Markdown link to the avatar when one is set."""
        text = self.name
        if self.avatar_url:
            text = f"[{text}]({self.avatar_url})"
        return text

    def get_html(self) -> Optional[HtmlTag]:
        """Return the name as an anchor to the avatar, or in a ``<div>`` without one."""
        if self.avatar_url:
            return A([Href(self.avatar_url)], self.name)
        else:
            return Div([], self.name)