Skip to content

Commit 5abf754

Browse files
feat: add display name to connectors (#532)
# Add `display_name` attribute for `FileData` in connectors that lack it. The display name can be helpful when debugging failed processing. This PR: - adds this attribute to the connectors that lack it - updates tests to check for that field - updates fixtures/expected processed data files (=fixtures update) - __MOST OF THE CHANGES IN THIS PR__
1 parent b5d3268 commit 5abf754

File tree

172 files changed

+20104
-756
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

172 files changed

+20104
-756
lines changed

.github/workflows/ingest-test-fixtures-update-pr.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,9 @@ jobs:
1616
runs-on: ubuntu-latest
1717
steps:
1818
- uses: actions/checkout@v3
19-
- uses: ./.github/actions/base-cache
19+
- uses: ./.github/actions/setup-environment
2020
with:
2121
python-version: ${{ env.PYTHON_VERSION }}
22-
check-only: 'true'
2322

2423
update-fixtures-and-pr:
2524
runs-on: ubuntu-latest-m
@@ -35,7 +34,7 @@ jobs:
3534
id: full-python-version
3635
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
3736
- name: Setup virtual environment
38-
uses: ./.github/actions/base-cache
37+
uses: ./.github/actions/setup-environment
3938
with:
4039
python-version: ${{ env.PYTHON_VERSION }}
4140
- name: Update test fixtures

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.0.41
2+
3+
* **Add `display_name` to FileData in 14 connectors**
4+
15
## 1.0.40
26

37
* **Fix extracting embedded files from Confluence pages**

test/integration/connectors/databricks/test_volumes_native.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from test.integration.connectors.utils.validation.source import (
1919
SourceValidationConfigs,
2020
source_connector_validation,
21+
source_filedata_display_name_set_check,
2122
)
2223
from test.integration.utils import requires_env
2324
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
@@ -114,6 +115,8 @@ async def test_volumes_native_source(tmp_path: Path):
114115
configs=SourceValidationConfigs(
115116
test_id="databricks_volumes_native",
116117
expected_num_files=1,
118+
predownload_file_data_check=source_filedata_display_name_set_check,
119+
postdownload_file_data_check=source_filedata_display_name_set_check,
117120
),
118121
)
119122

@@ -144,6 +147,8 @@ async def test_volumes_native_source_pat(tmp_path: Path):
144147
configs=SourceValidationConfigs(
145148
test_id="databricks_volumes_native_pat",
146149
expected_num_files=1,
150+
predownload_file_data_check=source_filedata_display_name_set_check,
151+
postdownload_file_data_check=source_filedata_display_name_set_check,
147152
),
148153
)
149154

test/integration/connectors/discord/test_discord.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from test.integration.connectors.utils.validation.source import (
1111
SourceValidationConfigs,
1212
source_connector_validation,
13+
source_filedata_display_name_set_check,
1314
)
1415
from test.integration.utils import requires_env
1516
from unstructured_ingest.error import SourceConnectionError
@@ -62,6 +63,8 @@ async def test_discord_source():
6263
expected_num_files=expected_num_files,
6364
expected_number_indexed_file_data=expected_num_files,
6465
validate_downloaded_files=True,
66+
predownload_file_data_check=source_filedata_display_name_set_check,
67+
postdownload_file_data_check=source_filedata_display_name_set_check,
6568
),
6669
)
6770

test/integration/connectors/elasticsearch/test_elasticsearch.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from test.integration.connectors.utils.validation.source import (
2121
SourceValidationConfigs,
2222
source_connector_validation,
23+
source_filedata_display_name_set_check,
2324
)
2425
from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
2526
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
@@ -203,6 +204,9 @@ async def test_elasticsearch_source(source_index: str, movies_dataframe: pd.Data
203204
expected_num_files=expected_num_files,
204205
expected_number_indexed_file_data=1,
205206
validate_downloaded_files=True,
207+
predownload_file_data_check=source_filedata_display_name_set_check,
208+
postdownload_file_data_check=source_filedata_display_name_set_check,
209+
exclude_fields_extend=["display_name"] # includes dynamic ids, might change
206210
),
207211
)
208212

test/integration/connectors/elasticsearch/test_opensearch.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from test.integration.connectors.utils.validation.source import (
2020
SourceValidationConfigs,
2121
source_connector_validation,
22+
source_filedata_display_name_set_check,
2223
)
2324
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
2425
from unstructured_ingest.error import (
@@ -193,6 +194,9 @@ async def test_opensearch_source(source_index: str, movies_dataframe: pd.DataFra
193194
expected_num_files=expected_num_files,
194195
expected_number_indexed_file_data=1,
195196
validate_downloaded_files=True,
197+
predownload_file_data_check=source_filedata_display_name_set_check,
198+
postdownload_file_data_check=source_filedata_display_name_set_check,
199+
exclude_fields_extend=["display_name"] # includes dynamic ids, might change
196200
),
197201
)
198202

test/integration/connectors/expected_results/astradb/file_data/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
},
1515
"date_created": null,
1616
"date_modified": null,
17-
"date_processed": "1742508009.2832398",
17+
"date_processed": "1749474599.0632136",
1818
"permissions_data": null,
1919
"filesize_bytes": null
2020
},
@@ -23,6 +23,6 @@
2323
"keyspace": null
2424
},
2525
"reprocess": false,
26-
"local_download_path": "/private/var/folders/7s/nsbls8k12qngtdvkhhkpf1n00000gn/T/pytest-of-nathan/pytest-325/test_astra_search_source0/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv",
27-
"display_name": null
26+
"local_download_path": "/tmp/pytest-of-runner/pytest-0/test_astra_search_source0/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv",
27+
"display_name": "ingest_test_src-None-[762c0093-2277-4f3e-ac00-932277af3e0e..25b75f1d-a2ea-4c97-b75f-1da2eadc97f7]"
2828
}

test/integration/connectors/expected_results/astradb/file_data/43d02113-723f-5ec1-acbb-c8da8d3650dc.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"record_locator": null,
99
"date_created": null,
1010
"date_modified": null,
11-
"date_processed": "1742508009.054028",
11+
"date_processed": "1749474598.4987292",
1212
"permissions_data": null,
1313
"filesize_bytes": null
1414
},
@@ -18,7 +18,7 @@
1818
},
1919
"reprocess": false,
2020
"local_download_path": null,
21-
"display_name": null,
21+
"display_name": "ingest_test_src-None-[762c0093-2277-4f3e-ac00-932277af3e0e..25b75f1d-a2ea-4c97-b75f-1da2eadc97f7]",
2222
"batch_items": [
2323
{
2424
"identifier": "25b75f1d-a2ea-4c97-b75f-1da2eadc97f7",

test/integration/connectors/expected_results/astradb/file_data/60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
},
1515
"date_created": null,
1616
"date_modified": null,
17-
"date_processed": "1742508009.284526",
17+
"date_processed": "1749474599.0628886",
1818
"permissions_data": null,
1919
"filesize_bytes": null
2020
},
@@ -23,6 +23,6 @@
2323
"keyspace": null
2424
},
2525
"reprocess": false,
26-
"local_download_path": "/private/var/folders/7s/nsbls8k12qngtdvkhhkpf1n00000gn/T/pytest-of-nathan/pytest-325/test_astra_search_source0/60297eea-73d7-4fca-a97e-ea73d7cfca62.csv",
27-
"display_name": null
26+
"local_download_path": "/tmp/pytest-of-runner/pytest-0/test_astra_search_source0/60297eea-73d7-4fca-a97e-ea73d7cfca62.csv",
27+
"display_name": "ingest_test_src-None-[762c0093-2277-4f3e-ac00-932277af3e0e..25b75f1d-a2ea-4c97-b75f-1da2eadc97f7]"
2828
}

test/integration/connectors/expected_results/astradb/file_data/641d99e3-9941-4c18-9d99-e399414c183d.csv.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
},
1515
"date_created": null,
1616
"date_modified": null,
17-
"date_processed": "1742508009.284161",
17+
"date_processed": "1749474599.0630517",
1818
"permissions_data": null,
1919
"filesize_bytes": null
2020
},
@@ -23,6 +23,6 @@
2323
"keyspace": null
2424
},
2525
"reprocess": false,
26-
"local_download_path": "/private/var/folders/7s/nsbls8k12qngtdvkhhkpf1n00000gn/T/pytest-of-nathan/pytest-325/test_astra_search_source0/641d99e3-9941-4c18-9d99-e399414c183d.csv",
27-
"display_name": null
26+
"local_download_path": "/tmp/pytest-of-runner/pytest-0/test_astra_search_source0/641d99e3-9941-4c18-9d99-e399414c183d.csv",
27+
"display_name": "ingest_test_src-None-[762c0093-2277-4f3e-ac00-932277af3e0e..25b75f1d-a2ea-4c97-b75f-1da2eadc97f7]"
2828
}

0 commit comments

Comments
 (0)