Skip to content

feat: experimental transfer sharepoint authentication to AzureAD #326

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 0.3.13-dev2

### Enchancements

* **Sharepoint coonner no longer uses depracted authorisation method. Uses EntraID instead via msal.**

## 0.3.13-dev1

### Fixes
Expand Down
88 changes: 88 additions & 0 deletions test/integration/connectors/test_sharepoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# test_sharepoint_integration.py
import os

import pytest

from test.integration.connectors.utils.validation.source import (
SourceValidationConfigs,
source_connector_validation,
)
from test.integration.utils import requires_env
from unstructured_ingest.v2.processes.connectors.sharepoint import (
CONNECTOR_TYPE,
SharepointAccessConfig,
SharepointConnectionConfig,
SharepointDownloader,
SharepointDownloaderConfig,
SharepointIndexer,
SharepointIndexerConfig,
SharepointPermissionsConfig,
)

SOURCE_TAG = "source"


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
@requires_env(
"SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "SHAREPOINT_PERMISSIONS_TENANT", "SHAREPOINT_SITE"
)
async def test_sharepoint_source(temp_dir):
"""
Integration test that:
1) Creates a SharepointIndexer to list/enumerate items in a given site
2) Creates a SharepointDownloader to fetch each enumerated item
3) Runs a validation helper to confirm the end-to-end pipeline
"""
client_id = os.getenv("SHAREPOINT_CLIENT_ID")
client_cred = os.getenv("SHAREPOINT_CRED")
tenant = os.getenv("SHAREPOINT_PERMISSIONS_TENANT")
site_url = os.getenv("SHAREPOINT_SITE")

access_config = SharepointAccessConfig(client_cred=client_cred)

permissions_config = SharepointPermissionsConfig(
permissions_application_id=None,
permissions_tenant=tenant,
permissions_client_cred=None, # or SecretStr(...)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we provide with a value for permissions_client_cred and permissions_application_id here? otherwise this integration test would fail in CI pipeline

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It won't actually fail (and it doesn't) as this connector can work in 2 modes, either with or without permissions. There's a separate e2e test for that in the test_e2e/dest/sharepoint.sh and sharepoint_with_permissions.sh

)

connection_config = SharepointConnectionConfig(
client_id=client_id,
site=site_url,
access_config=access_config,
permissions_config=permissions_config,
)

index_config = SharepointIndexerConfig(
path=None,
recursive=True,
omit_files=False,
omit_pages=False,
omit_lists=True,
)

download_config = SharepointDownloaderConfig(
download_dir=temp_dir # Directory where the files get saved
)

indexer = SharepointIndexer(
connection_config=connection_config,
index_config=index_config,
)
downloader = SharepointDownloader(
connection_config=connection_config,
download_config=download_config,
)

expected_files = 7

await source_connector_validation(
indexer=indexer,
downloader=downloader,
configs=SourceValidationConfigs(
test_id="sharepoint",
expected_num_files=expected_files,
validate_downloaded_files=True,
),
)
2 changes: 1 addition & 1 deletion unstructured_ingest/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.13-dev1" # pragma: no cover
__version__ = "0.3.13-dev2" # pragma: no cover
27 changes: 22 additions & 5 deletions unstructured_ingest/v2/processes/connectors/sharepoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,16 +91,33 @@ class SharepointConnectionConfig(ConnectionConfig):
access_config: Secret[SharepointAccessConfig]
permissions_config: Optional[SharepointPermissionsConfig] = None

@requires_dependencies(["office365"], extras="sharepoint")
@requires_dependencies(["msal"], extras="sharepoint")
def get_client(self) -> "ClientContext":
from office365.runtime.auth.client_credential import ClientCredential
from msal import ConfidentialClientApplication
from office365.sharepoint.client_context import ClientContext

try:
credentials = ClientCredential(
self.client_id, self.access_config.get_secret_value().client_cred
# Acquire the token using MSAL, similar to get_permissions_token
app = ConfidentialClientApplication(
authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
f"{self.permissions_config.permissions_tenant}",
client_id=self.permissions_config.permissions_application_id,
client_credential=self.permissions_config.permissions_client_cred.get_secret_value(), # noqa: E501
)
token_result = app.acquire_token_for_client(
scopes=[
f"https://{self.permissions_config.permissions_tenant}.sharepoint.com/.default"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when I use SHAREPOINT_PERMISSIONS_TENANT value in Keeper, this line reports an error. After I remove the second part of the string and only keeps the first part, it works.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might need a little more explanation on this not sure which part do you mean? Like it works without the /.default/ but it doesn't with it?

]
)
site_client = ClientContext(self.site).with_credentials(credentials)
if "access_token" not in token_result:
raise SourceConnectionNetworkError(
f"Failed to obtain token for SharePoint: \
{token_result.get('error_description', '')}"
)
access_token = token_result["access_token"]

# Then set up the SharePoint client context with that token
site_client = ClientContext(self.site).with_access_token(access_token)
except Exception as e:
logger.error(f"Couldn't set Sharepoint client: {e}")
raise e
Expand Down
Loading