Skip to content

Commit dee9b40

Browse files
authored
feat: Sharepoint connector (#918)
1 parent ef5091f commit dee9b40

File tree

25 files changed

+913
-8
lines changed

25 files changed

+913
-8
lines changed

Diff for: .github/workflows/ci.yml

+3
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,9 @@ jobs:
200200
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
201201
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
202202
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
203+
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
204+
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
205+
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
203206
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
204207
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
205208
run: |

Diff for: .github/workflows/ingest-test-fixtures-update-pr.yml

+3
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ jobs:
7272
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
7373
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
7474
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
75+
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
76+
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
77+
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
7578
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
7679
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
7780
OVERWRITE_FIXTURES: "true"

Diff for: CHANGELOG.md

+11
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
## 0.9.2-dev3
2+
=======
3+
4+
### Enhancements
5+
6+
### Features
7+
8+
* Adds Sharepoint connector.
9+
10+
### Fixes
11+
112
## 0.9.2-dev2
213
=======
314

Diff for: examples/ingest/onedrive/ingest.sh

100755100644
+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
2626
--authority-url "<Authority URL, default is https://login.microsoftonline.com>" \
2727
--tenant "<Azure AD tenant_id, default is 'common'>" \
2828
--user-pname "<Azure AD principal name, in most cases is the email linked to the drive>" \
29+
--path "<Path to start parsing files from>" \
2930
--structured-output-dir onedrive-ingest-output \
3031
--num-processes 2 \
3132
--verbose

Diff for: examples/ingest/sharepoint/ingest.sh

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/usr/bin/env bash
2+
3+
# Processes the files and pages of a Microsoft Sharepoint site
4+
# through Unstructured's library in 2 processes.
5+
6+
# Structured outputs are stored in sharepoint-ingest-output/
7+
8+
# NOTE, this script is not ready-to-run!
9+
# You must enter an MS Sharepoint app client-id, client secret, and Sharepoint site URL
10+
# before running.
11+
12+
# To get the credentials for your Sharepoint app, follow these steps:
13+
# https://github.com/vgrem/Office365-REST-Python-Client/wiki/How-to-connect-to-SharePoint-Online-and-and-SharePoint-2013-2016-2019-on-premises--with-app-principal
14+
15+
16+
17+
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
18+
cd "$SCRIPT_DIR"/../../.. || exit 1
19+
20+
PYTHONPATH=. ./unstructured/ingest/main.py \
21+
sharepoint \
22+
--client-id "<Microsoft Sharepoint app client-id>" \
23+
--client-cred "<Microsoft Sharepoint app client-secret>" \
24+
--site "<e.g https://contoso.sharepoint.com or https://contoso.admin.sharepoint.com to process all sites within tenant>" \
25+
--files-only "Flag to process only files within the site(s)" \
26+
--structured-output-dir sharepoint-ingest-output \
27+
--num-processes 2 \
28+
--verbose

Diff for: requirements/ingest-sharepoint.in

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
-c constraints.in
2+
-c base.txt
3+
msal==1.23.0
4+
Office365-REST-Python-Client==2.4.2
5+
pyjwt==2.8.0
6+
cryptography==41.0.2

Diff for: requirements/ingest-sharepoint.txt

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#
2+
# This file is autogenerated by pip-compile with Python 3.8
3+
# by the following command:
4+
#
5+
# pip-compile requirements/ingest-sharepoint.in
6+
#
7+
certifi==2023.7.22
8+
# via
9+
# -c requirements/base.txt
10+
# -c requirements/constraints.in
11+
# requests
12+
cffi==1.15.1
13+
# via cryptography
14+
charset-normalizer==3.2.0
15+
# via
16+
# -c requirements/base.txt
17+
# requests
18+
cryptography==41.0.2
19+
# via
20+
# -r requirements/ingest-sharepoint.in
21+
# msal
22+
# pyjwt
23+
idna==3.4
24+
# via
25+
# -c requirements/base.txt
26+
# requests
27+
msal==1.23.0
28+
# via
29+
# -r requirements/ingest-sharepoint.in
30+
# office365-rest-python-client
31+
office365-rest-python-client==2.4.2
32+
# via -r requirements/ingest-sharepoint.in
33+
pycparser==2.21
34+
# via cffi
35+
pyjwt[crypto]==2.8.0
36+
# via
37+
# -r requirements/ingest-sharepoint.in
38+
# msal
39+
pytz==2023.3
40+
# via office365-rest-python-client
41+
requests==2.31.0
42+
# via
43+
# -c requirements/base.txt
44+
# msal
45+
# office365-rest-python-client
46+
urllib3==1.26.16
47+
# via
48+
# -c requirements/base.txt
49+
# -c requirements/constraints.in
50+
# requests
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
[
2+
{
3+
"type": "NarrativeText",
4+
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
5+
"metadata": {
6+
"data_source": {
7+
"record_locator": {
8+
"site": "https://unstructuredio.sharepoint.com/",
9+
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
10+
"server_relative_url": "/Shared Documents/fake-text.txt"
11+
},
12+
"date_created": "2023-06-16T05:04:55Z",
13+
"date_modified": "2023-06-16T05:04:55Z"
14+
},
15+
"filename": "fake-text.txt",
16+
"filetype": "text/plain"
17+
},
18+
"text": "This is a test document to use for unit tests."
19+
},
20+
{
21+
"type": "Address",
22+
"element_id": "a9d4657034aa3fdb5177f1325e912362",
23+
"metadata": {
24+
"data_source": {
25+
"record_locator": {
26+
"site": "https://unstructuredio.sharepoint.com/",
27+
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
28+
"server_relative_url": "/Shared Documents/fake-text.txt"
29+
},
30+
"date_created": "2023-06-16T05:04:55Z",
31+
"date_modified": "2023-06-16T05:04:55Z"
32+
},
33+
"filename": "fake-text.txt",
34+
"filetype": "text/plain"
35+
},
36+
"text": "Doylestown, PA 18901"
37+
},
38+
{
39+
"type": "Title",
40+
"element_id": "9c218520320f238595f1fde74bdd137d",
41+
"metadata": {
42+
"data_source": {
43+
"record_locator": {
44+
"site": "https://unstructuredio.sharepoint.com/",
45+
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
46+
"server_relative_url": "/Shared Documents/fake-text.txt"
47+
},
48+
"date_created": "2023-06-16T05:04:55Z",
49+
"date_modified": "2023-06-16T05:04:55Z"
50+
},
51+
"filename": "fake-text.txt",
52+
"filetype": "text/plain"
53+
},
54+
"text": "Important points:"
55+
},
56+
{
57+
"type": "ListItem",
58+
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
59+
"metadata": {
60+
"data_source": {
61+
"record_locator": {
62+
"site": "https://unstructuredio.sharepoint.com/",
63+
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
64+
"server_relative_url": "/Shared Documents/fake-text.txt"
65+
},
66+
"date_created": "2023-06-16T05:04:55Z",
67+
"date_modified": "2023-06-16T05:04:55Z"
68+
},
69+
"filename": "fake-text.txt",
70+
"filetype": "text/plain"
71+
},
72+
"text": "Hamburgers are delicious"
73+
},
74+
{
75+
"type": "ListItem",
76+
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
77+
"metadata": {
78+
"data_source": {
79+
"record_locator": {
80+
"site": "https://unstructuredio.sharepoint.com/",
81+
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
82+
"server_relative_url": "/Shared Documents/fake-text.txt"
83+
},
84+
"date_created": "2023-06-16T05:04:55Z",
85+
"date_modified": "2023-06-16T05:04:55Z"
86+
},
87+
"filename": "fake-text.txt",
88+
"filetype": "text/plain"
89+
},
90+
"text": "Dogs are the best"
91+
},
92+
{
93+
"type": "ListItem",
94+
"element_id": "0b61e826b1c4ab05750184da72b89f83",
95+
"metadata": {
96+
"data_source": {
97+
"record_locator": {
98+
"site": "https://unstructuredio.sharepoint.com/",
99+
"unique_id": "880f80ca-cebf-48d0-b639-aeb671b3c431",
100+
"server_relative_url": "/Shared Documents/fake-text.txt"
101+
},
102+
"date_created": "2023-06-16T05:04:55Z",
103+
"date_modified": "2023-06-16T05:04:55Z"
104+
},
105+
"filename": "fake-text.txt",
106+
"filetype": "text/plain"
107+
},
108+
"text": "I love fuzzy blankets"
109+
}
110+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
[
2+
{
3+
"type": "NarrativeText",
4+
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
5+
"metadata": {
6+
"data_source": {
7+
"record_locator": {
8+
"site": "https://unstructuredio.sharepoint.com/",
9+
"unique_id": "0dfe3d76-00c0-42db-ae1b-8cf22d4b3f10",
10+
"server_relative_url": "/Shared Documents/ideas-page.html"
11+
},
12+
"date_created": "2023-06-16T05:04:47Z",
13+
"date_modified": "2023-06-16T05:04:47Z"
14+
},
15+
"filename": "ideas-page.html",
16+
"filetype": "text/html",
17+
"page_number": 1,
18+
"links": [
19+
{
20+
"text": null,
21+
"url": "index.html"
22+
},
23+
{
24+
"text": null,
25+
"url": "https://twitter.com/stef/status/1617222428727586816"
26+
}
27+
],
28+
"emphasized_texts": [
29+
{
30+
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
31+
"tag": "i"
32+
}
33+
]
34+
},
35+
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
36+
}
37+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
[
2+
{
3+
"type": "Table",
4+
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
5+
"metadata": {
6+
"data_source": {
7+
"record_locator": {
8+
"site": "https://unstructuredio.sharepoint.com/",
9+
"unique_id": "b9956a33-8079-4321-91ea-609def07394d",
10+
"server_relative_url": "/Shared Documents/stanley-cups.xlsx"
11+
},
12+
"date_created": "2023-06-16T05:05:05Z",
13+
"date_modified": "2023-06-16T05:05:05Z"
14+
},
15+
"filename": "stanley-cups.xlsx",
16+
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
17+
"page_number": 1,
18+
"page_name": "Stanley Cups",
19+
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
20+
},
21+
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
22+
},
23+
{
24+
"type": "Table",
25+
"element_id": "31421b5cd94fedb10dc82738503b4505",
26+
"metadata": {
27+
"data_source": {
28+
"record_locator": {
29+
"site": "https://unstructuredio.sharepoint.com/",
30+
"unique_id": "b9956a33-8079-4321-91ea-609def07394d",
31+
"server_relative_url": "/Shared Documents/stanley-cups.xlsx"
32+
},
33+
"date_created": "2023-06-16T05:05:05Z",
34+
"date_modified": "2023-06-16T05:05:05Z"
35+
},
36+
"filename": "stanley-cups.xlsx",
37+
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
38+
"page_number": 2,
39+
"page_name": "Stanley Cups Since 67",
40+
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
41+
},
42+
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
43+
}
44+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
[
2+
{
3+
"type": "Title",
4+
"element_id": "b4e929d8bcfe04189801a8ed61496d17",
5+
"metadata": {
6+
"data_source": {
7+
"version": "1.2",
8+
"record_locator": {
9+
"site": "https://unstructuredio.sharepoint.com/",
10+
"unique_id": "2b564fff-e9bb-4b64-9822-64f96a20ea10",
11+
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/Home.aspx"
12+
},
13+
"date_created": "0001-01-01T08:00:00Z",
14+
"date_modified": "2023-06-16T05:12:51Z"
15+
},
16+
"filename": "Home.html",
17+
"filetype": "text/html",
18+
"page_number": 1
19+
},
20+
"text": "Documents"
21+
},
22+
{
23+
"type": "Title",
24+
"element_id": "8d14f6e72de8f18ab1ee5c5330f00653",
25+
"metadata": {
26+
"data_source": {
27+
"version": "1.2",
28+
"record_locator": {
29+
"site": "https://unstructuredio.sharepoint.com/",
30+
"unique_id": "2b564fff-e9bb-4b64-9822-64f96a20ea10",
31+
"absolute_url": "https://unstructuredio.sharepoint.com/SitePages/Home.aspx"
32+
},
33+
"date_created": "0001-01-01T08:00:00Z",
34+
"date_modified": "2023-06-16T05:12:51Z"
35+
},
36+
"filename": "Home.html",
37+
"filetype": "text/html",
38+
"page_number": 1
39+
},
40+
"text": "Events"
41+
}
42+
]

0 commit comments

Comments
 (0)