Skip to content

Commit e2e473d

Browse files
authored
feat: add url kwarg to partition (#470)
* added url option to auto partition * add test for partition from url * version and changelog * update docs * add url to element metadata
1 parent 2110a26 commit e2e473d

File tree

5 files changed

+97
-24
lines changed

5 files changed

+97
-24
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.5.12-dev5
1+
## 0.5.12
22

33
### Enhancements
44

@@ -10,6 +10,7 @@
1010

1111
* Add --partition-by-api parameter to unstructured-ingest
1212
* Added `partition_rtf` for processing rich text files.
13+
* `partition` now accepts a `url` kwarg in addition to `file` and `filename`.
1314

1415
### Fixes
1516

Diff for: docs/source/bricks.rst

+15
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,21 @@ faster processing and `"hi_res"` for
116116
elements = partition(filename="example-docs/layout-parser-paper-fast.pdf")
117117
118118
119+
The ``partition`` function also accepts a ``url`` kwarg for remotely hosted documents. If you want
120+
to force ``partition`` to treat the document as a particular MIME type, use the ``content_type``
121+
kwarg in conjunction with ``url``. Otherwise, ``partition`` will use the information from
122+
the ``Content-Type`` header in the HTTP response.
123+
124+
125+
.. code:: python
126+
127+
from unstructured.partition.auto import partition
128+
129+
url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
130+
elements = partition(url=url)
131+
elements = partition(url=url, content_type="text/markdown")
132+
133+
119134
``partition_docx``
120135
------------------
121136

Diff for: test_unstructured/partition/test_auto.py

+7
Original file line numberDiff line numberDiff line change
@@ -367,3 +367,10 @@ def test_auto_partition_rtf_from_filename():
367367
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
368368
elements = partition(filename=filename)
369369
assert elements[0] == Title("My First Heading")
370+
371+
372+
def test_auto_partition_from_url():
    """Partition a remotely hosted document and check the first element and its url metadata."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
    # content_type is passed explicitly so the result does not depend on the
    # Content-Type header returned by the server.
    first_element = partition(url=license_url, content_type="text/plain")[0]
    assert first_element == Title("Apache License")
    assert first_element.metadata.url == license_url

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.12-dev5" # pragma: no cover
1+
__version__ = "0.5.12" # pragma: no cover

Diff for: unstructured/partition/auto.py

+72-22
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
1-
from typing import IO, Callable, Optional
1+
import io
2+
from typing import IO, Callable, Optional, Tuple
3+
4+
import requests
25

36
from unstructured.file_utils.filetype import FileType, detect_filetype
7+
from unstructured.partition.common import exactly_one
48
from unstructured.partition.doc import partition_doc
59
from unstructured.partition.docx import partition_docx
610
from unstructured.partition.email import partition_email
@@ -22,6 +26,7 @@ def partition(
2226
content_type: Optional[str] = None,
2327
file: Optional[IO] = None,
2428
file_filename: Optional[str] = None,
29+
url: Optional[str] = None,
2530
include_page_breaks: bool = False,
2631
strategy: str = "hi_res",
2732
encoding: str = "utf-8",
@@ -42,6 +47,9 @@ def partition(
4247
A file-like object using "rb" mode --> open(filename, "rb").
4348
file_filename
4449
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
50+
url
51+
The url for a remote document. Pass in content_type if you want partition to treat
52+
the document as a specific content_type.
4553
include_page_breaks
4654
If True, the output will include page breaks if the filetype supports it
4755
strategy
@@ -51,37 +59,50 @@ def partition(
5159
encoding
5260
The encoding method used to decode the text input. If None, utf-8 will be used.
5361
"""
54-
filetype = detect_filetype(
55-
filename=filename,
56-
file=file,
57-
file_filename=file_filename,
58-
content_type=content_type,
59-
)
62+
exactly_one(file=file, filename=filename, url=url)
63+
64+
if url is not None:
65+
file, filetype = file_and_type_from_url(url=url, content_type=content_type)
66+
else:
67+
filetype = detect_filetype(
68+
filename=filename,
69+
file=file,
70+
file_filename=file_filename,
71+
content_type=content_type,
72+
)
6073

6174
if file is not None:
6275
file.seek(0)
6376

6477
if filetype == FileType.DOC:
65-
return partition_doc(filename=filename, file=file)
66-
if filetype == FileType.DOCX:
67-
return partition_docx(filename=filename, file=file)
78+
elements = partition_doc(filename=filename, file=file)
79+
elif filetype == FileType.DOCX:
80+
elements = partition_docx(filename=filename, file=file)
6881
elif filetype == FileType.EML:
69-
return partition_email(filename=filename, file=file, encoding=encoding)
82+
elements = partition_email(filename=filename, file=file, encoding=encoding)
7083
elif filetype == FileType.MSG:
71-
return partition_msg(filename=filename, file=file)
84+
elements = partition_msg(filename=filename, file=file)
7285
elif filetype == FileType.HTML:
73-
return partition_html(
86+
elements = partition_html(
7487
filename=filename,
7588
file=file,
7689
include_page_breaks=include_page_breaks,
7790
encoding=encoding,
7891
)
7992
elif filetype == FileType.EPUB:
80-
return partition_epub(filename=filename, file=file, include_page_breaks=include_page_breaks)
93+
elements = partition_epub(
94+
filename=filename,
95+
file=file,
96+
include_page_breaks=include_page_breaks,
97+
)
8198
elif filetype == FileType.MD:
82-
return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks)
99+
elements = partition_md(
100+
filename=filename,
101+
file=file,
102+
include_page_breaks=include_page_breaks,
103+
)
83104
elif filetype == FileType.PDF:
84-
return partition_pdf(
105+
elements = partition_pdf(
85106
filename=filename, # type: ignore
86107
file=file, # type: ignore
87108
url=None,
@@ -90,27 +111,56 @@ def partition(
90111
strategy=strategy,
91112
)
92113
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
93-
return partition_image(
114+
elements = partition_image(
94115
filename=filename, # type: ignore
95116
file=file, # type: ignore
96117
url=None,
97118
include_page_breaks=include_page_breaks,
98119
)
99120
elif filetype == FileType.TXT:
100-
return partition_text(
121+
elements = partition_text(
101122
filename=filename,
102123
file=file,
103124
encoding=encoding,
104125
paragraph_grouper=paragraph_grouper,
105126
)
106127
elif filetype == FileType.RTF:
107-
return partition_rtf(filename=filename, file=file, include_page_breaks=include_page_breaks)
128+
elements = partition_rtf(
129+
filename=filename,
130+
file=file,
131+
include_page_breaks=include_page_breaks,
132+
)
108133
elif filetype == FileType.PPT:
109-
return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
134+
elements = partition_ppt(
135+
filename=filename,
136+
file=file,
137+
include_page_breaks=include_page_breaks,
138+
)
110139
elif filetype == FileType.PPTX:
111-
return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
140+
elements = partition_pptx(
141+
filename=filename,
142+
file=file,
143+
include_page_breaks=include_page_breaks,
144+
)
112145
elif filetype == FileType.JSON:
113-
return partition_json(filename=filename, file=file)
146+
elements = partition_json(filename=filename, file=file)
114147
else:
115148
msg = "Invalid file" if not filename else f"Invalid file {filename}"
116149
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
150+
151+
for element in elements:
152+
element.metadata.url = url
153+
154+
return elements
155+
156+
157+
def file_and_type_from_url(
    url: str,
    content_type: Optional[str] = None,
    request_timeout: Optional[float] = 30.0,
) -> Tuple[io.BytesIO, Optional[FileType]]:
    """Download the document at ``url`` and detect its file type.

    Parameters
    ----------
    url
        The url of the remote document to fetch with an HTTP GET request.
    content_type
        The MIME type to treat the document as. If not provided, the media type from the
        ``Content-Type`` header of the HTTP response is used instead.
    request_timeout
        Seconds to wait for the server before giving up. Pass None to wait indefinitely.

    Returns
    -------
    A tuple of an in-memory binary file holding the response body and the file type
    reported by ``detect_filetype``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx or 5xx status code.
    """
    # Without a timeout, requests waits forever on a server that stops responding.
    response = requests.get(url, timeout=request_timeout)
    # Fail loudly on an error status rather than partitioning the error page as the document.
    response.raise_for_status()
    file = io.BytesIO(response.content)

    if not content_type:
        # The header may carry parameters (e.g. "text/html; charset=utf-8"); keep only the
        # bare media type. NOTE(review): assumes detect_filetype expects a bare MIME type
        # without parameters -- confirm against its implementation.
        header = response.headers.get("Content-Type") or ""
        content_type = header.split(";", 1)[0].strip().lower() or None

    filetype = detect_filetype(file=file, content_type=content_type)
    return file, filetype

0 commit comments

Comments
 (0)