1
- from typing import IO , Callable , Optional
1
+ import io
2
+ from typing import IO , Callable , Optional , Tuple
3
+
4
+ import requests
2
5
3
6
from unstructured .file_utils .filetype import FileType , detect_filetype
7
+ from unstructured .partition .common import exactly_one
4
8
from unstructured .partition .doc import partition_doc
5
9
from unstructured .partition .docx import partition_docx
6
10
from unstructured .partition .email import partition_email
@@ -22,6 +26,7 @@ def partition(
22
26
content_type : Optional [str ] = None ,
23
27
file : Optional [IO ] = None ,
24
28
file_filename : Optional [str ] = None ,
29
+ url : Optional [str ] = None ,
25
30
include_page_breaks : bool = False ,
26
31
strategy : str = "hi_res" ,
27
32
encoding : str = "utf-8" ,
@@ -42,6 +47,9 @@ def partition(
42
47
A file-like object using "rb" mode --> open(filename, "rb").
43
48
file_filename
44
49
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
50
+ url
51
+ The url for a remote document. Pass in content_type if you want partition to treat
52
+ the document as a specific content_type.
45
53
include_page_breaks
46
54
If True, the output will include page breaks if the filetype supports it
47
55
strategy
@@ -51,37 +59,50 @@ def partition(
51
59
encoding
52
60
The encoding method used to decode the text input. If None, utf-8 will be used.
53
61
"""
54
- filetype = detect_filetype (
55
- filename = filename ,
56
- file = file ,
57
- file_filename = file_filename ,
58
- content_type = content_type ,
59
- )
62
+ exactly_one (file = file , filename = filename , url = url )
63
+
64
+ if url is not None :
65
+ file , filetype = file_and_type_from_url (url = url , content_type = content_type )
66
+ else :
67
+ filetype = detect_filetype (
68
+ filename = filename ,
69
+ file = file ,
70
+ file_filename = file_filename ,
71
+ content_type = content_type ,
72
+ )
60
73
61
74
if file is not None :
62
75
file .seek (0 )
63
76
64
77
if filetype == FileType .DOC :
65
- return partition_doc (filename = filename , file = file )
66
- if filetype == FileType .DOCX :
67
- return partition_docx (filename = filename , file = file )
78
+ elements = partition_doc (filename = filename , file = file )
79
+ elif filetype == FileType .DOCX :
80
+ elements = partition_docx (filename = filename , file = file )
68
81
elif filetype == FileType .EML :
69
- return partition_email (filename = filename , file = file , encoding = encoding )
82
+ elements = partition_email (filename = filename , file = file , encoding = encoding )
70
83
elif filetype == FileType .MSG :
71
- return partition_msg (filename = filename , file = file )
84
+ elements = partition_msg (filename = filename , file = file )
72
85
elif filetype == FileType .HTML :
73
- return partition_html (
86
+ elements = partition_html (
74
87
filename = filename ,
75
88
file = file ,
76
89
include_page_breaks = include_page_breaks ,
77
90
encoding = encoding ,
78
91
)
79
92
elif filetype == FileType .EPUB :
80
- return partition_epub (filename = filename , file = file , include_page_breaks = include_page_breaks )
93
+ elements = partition_epub (
94
+ filename = filename ,
95
+ file = file ,
96
+ include_page_breaks = include_page_breaks ,
97
+ )
81
98
elif filetype == FileType .MD :
82
- return partition_md (filename = filename , file = file , include_page_breaks = include_page_breaks )
99
+ elements = partition_md (
100
+ filename = filename ,
101
+ file = file ,
102
+ include_page_breaks = include_page_breaks ,
103
+ )
83
104
elif filetype == FileType .PDF :
84
- return partition_pdf (
105
+ elements = partition_pdf (
85
106
filename = filename , # type: ignore
86
107
file = file , # type: ignore
87
108
url = None ,
@@ -90,27 +111,56 @@ def partition(
90
111
strategy = strategy ,
91
112
)
92
113
elif (filetype == FileType .PNG ) or (filetype == FileType .JPG ):
93
- return partition_image (
114
+ elements = partition_image (
94
115
filename = filename , # type: ignore
95
116
file = file , # type: ignore
96
117
url = None ,
97
118
include_page_breaks = include_page_breaks ,
98
119
)
99
120
elif filetype == FileType .TXT :
100
- return partition_text (
121
+ elements = partition_text (
101
122
filename = filename ,
102
123
file = file ,
103
124
encoding = encoding ,
104
125
paragraph_grouper = paragraph_grouper ,
105
126
)
106
127
elif filetype == FileType .RTF :
107
- return partition_rtf (filename = filename , file = file , include_page_breaks = include_page_breaks )
128
+ elements = partition_rtf (
129
+ filename = filename ,
130
+ file = file ,
131
+ include_page_breaks = include_page_breaks ,
132
+ )
108
133
elif filetype == FileType .PPT :
109
- return partition_ppt (filename = filename , file = file , include_page_breaks = include_page_breaks )
134
+ elements = partition_ppt (
135
+ filename = filename ,
136
+ file = file ,
137
+ include_page_breaks = include_page_breaks ,
138
+ )
110
139
elif filetype == FileType .PPTX :
111
- return partition_pptx (filename = filename , file = file , include_page_breaks = include_page_breaks )
140
+ elements = partition_pptx (
141
+ filename = filename ,
142
+ file = file ,
143
+ include_page_breaks = include_page_breaks ,
144
+ )
112
145
elif filetype == FileType .JSON :
113
- return partition_json (filename = filename , file = file )
146
+ elements = partition_json (filename = filename , file = file )
114
147
else :
115
148
msg = "Invalid file" if not filename else f"Invalid file { filename } "
116
149
raise ValueError (f"{ msg } . The { filetype } file type is not supported in partition." )
150
+
151
+ for element in elements :
152
+ element .metadata .url = url
153
+
154
+ return elements
155
+
156
+
157
def file_and_type_from_url(
    url: str,
    content_type: Optional[str] = None,
    timeout: Optional[float] = 30.0,
) -> Tuple[io.BytesIO, Optional[FileType]]:
    """Download the document at ``url`` and detect its file type.

    Parameters
    ----------
    url
        The location of the remote document to fetch with an HTTP GET.
    content_type
        Optional MIME type to use for detection. When given (and non-empty) it
        takes precedence over the ``Content-Type`` header of the response.
    timeout
        Seconds to wait for the server before giving up; forwarded to
        ``requests.get``. ``None`` waits indefinitely (the previous behavior).

    Returns
    -------
    A ``(file, filetype)`` tuple: an in-memory ``io.BytesIO`` holding the
    response body, and whatever ``detect_filetype`` returns for it.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with a 4xx/5xx
        status (``requests.HTTPError``).
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of treating the server's error
    # page as if it were the requested document.
    response.raise_for_status()
    file = io.BytesIO(response.content)

    if not content_type:
        # The header may carry parameters, e.g. "text/html; charset=utf-8".
        # Keep only the media type itself.
        # NOTE(review): assumes detect_filetype expects a bare MIME type -- confirm.
        header_value = response.headers.get("Content-Type") or ""
        content_type = header_value.split(";", 1)[0].strip() or None

    filetype = detect_filetype(file=file, content_type=content_type)
    return file, filetype
0 commit comments