Files whose names contain characters such as "#" are not URL-encoded, so downloading them fails with a 404 error.
I'm currently using this monkey patch as a workaround:
from urllib.parse import quote
def _patch_sharepoint_downloader_path_quoting():
    """Monkey-patch ``SharepointDownloader._fetch_file`` to URL-encode the path.

    Characters like '#' in SharePoint filenames break the Office365 SDK's
    REST URL construction. Quoting with ``safe="/"`` turns '#' into '%23'
    while preserving path separators.

    The encoded path is only visible while the original ``_fetch_file`` runs;
    ``file_data.source_identifiers.fullpath`` is restored afterwards. This
    keeps repeated calls on the same ``file_data`` from double-encoding the
    path ('%23' -> '%2523') and keeps the escaped name out of anything that
    reads ``fullpath`` after the fetch.

    Calling this function more than once is a no-op after the first call.
    """
    from functools import wraps

    from unstructured_ingest.processes.connectors.sharepoint import SharepointDownloader

    _original_fetch = SharepointDownloader._fetch_file

    # Skip if already patched
    if getattr(_original_fetch, "_path_quoting_patched", False):
        return

    @wraps(_original_fetch)
    def _patched_fetch_file(self, file_data):
        identifiers = file_data.source_identifiers
        # Nothing to encode: delegate unchanged.
        if not identifiers or not identifiers.fullpath:
            return _original_fetch(self, file_data)

        raw_path = identifiers.fullpath
        # Encode the fullpath so '#' and other URL-unsafe chars are escaped
        identifiers.fullpath = quote(raw_path, safe="/")
        try:
            return _original_fetch(self, file_data)
        finally:
            # Undo the temporary rewrite so a later call does not quote an
            # already-quoted path.
            identifiers.fullpath = raw_path

    # Marker read by the "already patched" guard above.
    _patched_fetch_file._path_quoting_patched = True
    SharepointDownloader._fetch_file = _patched_fetch_file
    logger.info("Patched SharepointDownloader._fetch_file with URL path quoting")
Files whose names contain characters such as "#" are not URL-encoded, so downloading them fails with a 404 error.
I'm currently using this monkey patch as a workaround: