
Commit 6fc428f

feat/split databricks into each auth type supported (#182)
* split databricks into each auth type supported
* migrate volumes source connector e2e test
* migrate volumes destination connector e2e test
* bump changelog
* Add databricks secrets to int test CI
* Add s3 secrets to destination CI
* expose azure databricks connector
* add missing connector type definition
* update changelog to be a minor bump
1 parent fe85640 commit 6fc428f
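For orientation, a minimal sketch of how the native (service-principal) Volumes connector produced by this split is configured. The class names come from the new integration test in this diff; the host and credential values are placeholders, and the Azure variant mentioned in the commit message is not shown in this excerpt.

import os

from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
    DatabricksNativeVolumesAccessConfig,
    DatabricksNativeVolumesConnectionConfig,
)

# Sketch only: the native variant takes service-principal (client id/secret)
# credentials; other auth types now get their own connector classes.
connection_config = DatabricksNativeVolumesConnectionConfig(
    host=os.environ["DATABRICKS_HOST"],  # e.g. https://<workspace>.cloud.databricks.com
    access_config=DatabricksNativeVolumesAccessConfig(
        client_id=os.environ["DATABRICKS_CLIENT_ID"],
        client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
    ),
)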

File tree

19 files changed: +801 -226 lines changed


.github/workflows/e2e.yml

Lines changed: 14 additions & 6 deletions
@@ -84,7 +84,12 @@ jobs:
         uses: ./.github/actions/base-cache
         with:
           python-version: "3.10"
-      - name: Test (end-to-end)
+      - name: Run Integration Tests
+        env:
+          DATABRICKS_HOST: ${{secrets.DATABRICKS_HOST}}
+          DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}}
+          DATABRICKS_CLIENT_ID: ${{secrets.DATABRICKS_CLIENT_ID}}
+          DATABRICKS_CLIENT_SECRET: ${{secrets.DATABRICKS_CLIENT_SECRET}}
         run : |
           source .venv/bin/activate
           sudo make install-docker-compose
@@ -108,7 +113,14 @@ jobs:
         uses: ./.github/actions/base-cache
         with:
           python-version: "3.10"
-      - name: Test (end-to-end)
+      - name: Run Integration Tests
+        env:
+          DATABRICKS_HOST: ${{secrets.DATABRICKS_HOST}}
+          DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}}
+          DATABRICKS_CLIENT_ID: ${{secrets.DATABRICKS_CLIENT_ID}}
+          DATABRICKS_CLIENT_SECRET: ${{secrets.DATABRICKS_CLIENT_SECRET}}
+          S3_INGEST_TEST_ACCESS_KEY: ${{ secrets.S3_INGEST_TEST_ACCESS_KEY }}
+          S3_INGEST_TEST_SECRET_KEY: ${{ secrets.S3_INGEST_TEST_SECRET_KEY }}
         run : |
           source .venv/bin/activate
           sudo make install-docker-compose
@@ -261,10 +273,6 @@ jobs:
           ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_APPLICATION_TOKEN}}
           ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
           CLARIFAI_API_KEY: ${{secrets.CLARIFAI_API_KEY}}
-          DATABRICKS_HOST: ${{secrets.DATABRICKS_HOST}}
-          DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}}
-          DATABRICKS_CLIENT_ID: ${{secrets.DATABRICKS_CLIENT_ID}}
-          DATABRICKS_CLIENT_SECRET: ${{secrets.DATABRICKS_CLIENT_SECRET}}
           SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
           SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
           KDBAI_BEARER_TOKEN: ${{ secrets.KDBAI_BEARER_TOKEN }}
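These secrets feed the Databricks integration tests added below, which gate themselves with the requires_env decorator from test.integration.utils. That helper is not part of this diff; a hypothetical sketch of such a guard, assuming it simply skips the test when any named variable is unset:

import os

import pytest


def requires_env(*names: str):
    # Hypothetical reimplementation (the real helper may differ): skip the
    # test unless every named environment variable is set, mirroring the
    # CI secrets wired in above.
    missing = [name for name in names if not os.environ.get(name)]
    return pytest.mark.skipif(
        bool(missing), reason=f"missing environment variables: {', '.join(missing)}"
    )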

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,4 @@
-## 0.0.26-dev7
+## 0.1.0
 
 ### Enhancements
 
@@ -8,6 +8,7 @@
 * **Refactor sqlite and postgres to be distinct connectors to support better input validation**
 * **Added MongoDB source V2 connector**
 * **Support optional access configs on connection configs**
+* **Refactor databricks into distinct connectors based on auth type**
 
 ### Fixes
 

test/integration/connectors/databricks_tests/__init__.py

Whitespace-only changes.
Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
import json
import os
import tempfile
import uuid
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path

import pytest
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors.platform import NotFound

from test.integration.connectors.utils.constants import DESTINATION_TAG, SOURCE_TAG
from test.integration.connectors.utils.validation import (
    ValidationConfigs,
    source_connector_validation,
)
from test.integration.utils import requires_env
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
    CONNECTOR_TYPE,
    DatabricksNativeVolumesAccessConfig,
    DatabricksNativeVolumesConnectionConfig,
    DatabricksNativeVolumesDownloader,
    DatabricksNativeVolumesDownloaderConfig,
    DatabricksNativeVolumesIndexer,
    DatabricksNativeVolumesIndexerConfig,
    DatabricksNativeVolumesUploader,
    DatabricksNativeVolumesUploaderConfig,
)


@dataclass
class EnvData:
    host: str
    client_id: str
    client_secret: str
    catalog: str

    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
        return DatabricksNativeVolumesConnectionConfig(
            host=self.host,
            access_config=DatabricksNativeVolumesAccessConfig(
                client_id=self.client_id,
                client_secret=self.client_secret,
            ),
        )


def get_env_data() -> EnvData:
    return EnvData(
        host=os.environ["DATABRICKS_HOST"],
        client_id=os.environ["DATABRICKS_CLIENT_ID"],
        client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
        catalog=os.environ["DATABRICKS_CATALOG"],
    )


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env(
    "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
)
async def test_volumes_native_source():
    env_data = get_env_data()
    indexer_config = DatabricksNativeVolumesIndexerConfig(
        recursive=True,
        volume="test-platform",
        volume_path="databricks-volumes-test-input",
        catalog=env_data.catalog,
    )
    connection_config = env_data.get_connection_config()
    with tempfile.TemporaryDirectory() as tempdir:
        tempdir_path = Path(tempdir)
        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
        indexer = DatabricksNativeVolumesIndexer(
            connection_config=connection_config, index_config=indexer_config
        )
        downloader = DatabricksNativeVolumesDownloader(
            connection_config=connection_config, download_config=download_config
        )
        await source_connector_validation(
            indexer=indexer,
            downloader=downloader,
            configs=ValidationConfigs(
                test_id="databricks_volumes_native",
                expected_num_files=1,
            ),
        )


def _get_volume_path(catalog: str, volume: str, volume_path: str):
    return f"/Volumes/{catalog}/default/{volume}/{volume_path}"


@contextmanager
def databricks_destination_context(env_data: EnvData, volume: str, volume_path) -> WorkspaceClient:
    client = WorkspaceClient(
        host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
    )
    try:
        yield client
    finally:
        # Cleanup
        try:
            for file in client.files.list_directory_contents(
                directory_path=_get_volume_path(env_data.catalog, volume, volume_path)
            ):
                client.files.delete(file.path)
            client.files.delete_directory(_get_volume_path(env_data.catalog, volume, volume_path))
        except NotFound:
            # Directory was never created, don't need to delete
            pass


def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_path: str):
    files = list(
        client.files.list_directory_contents(
            directory_path=_get_volume_path(catalog, volume, volume_path)
        )
    )

    assert len(files) == 1

    resp = client.files.download(files[0].path)
    data = json.loads(resp.contents.read())

    assert len(data) == 22
    element_types = {v["type"] for v in data}
    assert len(element_types) == 1
    assert "CompositeElement" in element_types


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG)
@requires_env(
    "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
)
async def test_volumes_native_destination(upload_file: Path):
    env_data = get_env_data()
    volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
    mock_file_data = FileData(identifier="mock file data", connector_type=CONNECTOR_TYPE)
    with databricks_destination_context(
        volume="test-platform", volume_path=volume_path, env_data=env_data
    ) as workspace_client:
        connection_config = env_data.get_connection_config()
        uploader = DatabricksNativeVolumesUploader(
            connection_config=connection_config,
            upload_config=DatabricksNativeVolumesUploaderConfig(
                volume="test-platform",
                volume_path=volume_path,
                catalog=env_data.catalog,
            ),
        )
        if uploader.is_async():
            await uploader.run_async(path=upload_file, file_data=mock_file_data)
        else:
            uploader.run(path=upload_file, file_data=mock_file_data)

        validate_upload(
            client=workspace_client,
            catalog=env_data.catalog,
            volume="test-platform",
            volume_path=volume_path,
        )
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
{
    "directory_structure": [
        "fake-memo.pdf"
    ]
}
@@ -0,0 +1,26 @@
{
    "identifier": "9a6eb650-98d6-5465-8f1d-aa7118eee87e",
    "connector_type": "databricks_volumes",
    "source_identifiers": {
        "filename": "fake-memo.pdf",
        "fullpath": "/Volumes/utic-dev-tech-fixtures/default/test-platform/databricks-volumes-test-input/fake-memo.pdf",
        "rel_path": "fake-memo.pdf"
    },
    "doc_type": "file",
    "metadata": {
        "url": "/Volumes/utic-dev-tech-fixtures/default/test-platform/databricks-volumes-test-input/fake-memo.pdf",
        "version": null,
        "record_locator": null,
        "date_created": null,
        "date_modified": "1729186569000",
        "date_processed": null,
        "permissions_data": null,
        "filesize_bytes": null
    },
    "additional_metadata": {
        "catalog": "utic-dev-tech-fixtures",
        "path": "/Volumes/utic-dev-tech-fixtures/default/test-platform/databricks-volumes-test-input/fake-memo.pdf"
    },
    "reprocess": false,
    "local_download_path": "/private/var/folders/n8/rps3wl195pj4p_0vyxqj5jrw0000gn/T/tmpv8xsw7kf/fake-memo.pdf"
}

test_e2e/dest/databricks-volumes.sh

Lines changed: 0 additions & 63 deletions
This file was deleted.

test_e2e/expected-structured-output/databricks-volumes/fake-memo.pdf.json

Lines changed: 0 additions & 77 deletions
This file was deleted.
