feat: add url kwarg to partition (#470)

MthwRobinson · web-flow · commit e2e473dddd71 · 2023-04-12T18:31:01.000Z
* added url option to auto partition

* add test for partition from url

* version and changelog

* update docs

* add url to element metadata
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.5.12-dev5
+## 0.5.12
 
 ### Enhancements
 
@@ -10,6 +10,7 @@
 
 * Add --partition-by-api parameter to unstructured-ingest
 * Added `partition_rtf` for processing rich text files.
+* `partition` now accepts a `url` kwarg in addition to `file` and `filename`.
 
 ### Fixes
 
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -116,6 +116,21 @@ faster processing and `"hi_res"` for
   elements = partition(filename="example-docs/layout-parser-paper-fast.pdf")
 
 
+The ``partition`` function also accepts a ``url`` kwarg for remotely hosted documents. If you want
+to force ``partition`` to treat the document as a particular MIME type, use the ``content_type``
+kwarg in conjunction with ``url``. Otherwise, ``partition`` will use the information from
+the ``Content-Type`` header in the HTTP response.
+
+
+.. code:: python
+
+  from unstructured.partition.auto import partition
+
+  url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
+  elements = partition(url=url)
+  elements = partition(url=url, content_type="text/markdown")
+
+
 ``partition_docx``
 ------------------
 
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -367,3 +367,10 @@ def test_auto_partition_rtf_from_filename():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
     elements = partition(filename=filename)
     assert elements[0] == Title("My First Heading")
+
+
+def test_auto_partition_from_url():
+    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"
+    elements = partition(url=url, content_type="text/plain")
+    assert elements[0] == Title("Apache License")
+    assert elements[0].metadata.url == url
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.12-dev5"  # pragma: no cover
+__version__ = "0.5.12"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -1,6 +1,10 @@
-from typing import IO, Callable, Optional
+import io
+from typing import IO, Callable, Optional, Tuple
+
+import requests
 
 from unstructured.file_utils.filetype import FileType, detect_filetype
+from unstructured.partition.common import exactly_one
 from unstructured.partition.doc import partition_doc
 from unstructured.partition.docx import partition_docx
 from unstructured.partition.email import partition_email
@@ -22,6 +26,7 @@ def partition(
     content_type: Optional[str] = None,
     file: Optional[IO] = None,
     file_filename: Optional[str] = None,
+    url: Optional[str] = None,
     include_page_breaks: bool = False,
     strategy: str = "hi_res",
     encoding: str = "utf-8",
@@ -42,6 +47,9 @@ def partition(
         A file-like object using "rb" mode --> open(filename, "rb").
     file_filename
         When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
+    url
+        The url for a remote document. Pass in content_type if you want partition to treat
+        the document as a specific content_type.
     include_page_breaks
         If True, the output will include page breaks if the filetype supports it
     strategy
@@ -51,37 +59,50 @@ def partition(
     encoding
         The encoding method used to decode the text input. If None, utf-8 will be used.
     """
-    filetype = detect_filetype(
-        filename=filename,
-        file=file,
-        file_filename=file_filename,
-        content_type=content_type,
-    )
+    exactly_one(file=file, filename=filename, url=url)
+
+    if url is not None:
+        file, filetype = file_and_type_from_url(url=url, content_type=content_type)
+    else:
+        filetype = detect_filetype(
+            filename=filename,
+            file=file,
+            file_filename=file_filename,
+            content_type=content_type,
+        )
 
     if file is not None:
         file.seek(0)
 
     if filetype == FileType.DOC:
-        return partition_doc(filename=filename, file=file)
-    if filetype == FileType.DOCX:
-        return partition_docx(filename=filename, file=file)
+        elements = partition_doc(filename=filename, file=file)
+    elif filetype == FileType.DOCX:
+        elements = partition_docx(filename=filename, file=file)
     elif filetype == FileType.EML:
-        return partition_email(filename=filename, file=file, encoding=encoding)
+        elements = partition_email(filename=filename, file=file, encoding=encoding)
     elif filetype == FileType.MSG:
-        return partition_msg(filename=filename, file=file)
+        elements = partition_msg(filename=filename, file=file)
     elif filetype == FileType.HTML:
-        return partition_html(
+        elements = partition_html(
             filename=filename,
             file=file,
             include_page_breaks=include_page_breaks,
             encoding=encoding,
         )
     elif filetype == FileType.EPUB:
-        return partition_epub(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_epub(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
     elif filetype == FileType.MD:
-        return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_md(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
     elif filetype == FileType.PDF:
-        return partition_pdf(
+        elements = partition_pdf(
             filename=filename,  # type: ignore
             file=file,  # type: ignore
             url=None,
@@ -90,27 +111,56 @@ def partition(
             strategy=strategy,
         )
     elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
-        return partition_image(
+        elements = partition_image(
             filename=filename,  # type: ignore
             file=file,  # type: ignore
             url=None,
             include_page_breaks=include_page_breaks,
         )
     elif filetype == FileType.TXT:
-        return partition_text(
+        elements = partition_text(
             filename=filename,
             file=file,
             encoding=encoding,
             paragraph_grouper=paragraph_grouper,
         )
     elif filetype == FileType.RTF:
-        return partition_rtf(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_rtf(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
     elif filetype == FileType.PPT:
-        return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_ppt(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
     elif filetype == FileType.PPTX:
-        return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
+        elements = partition_pptx(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+        )
     elif filetype == FileType.JSON:
-        return partition_json(filename=filename, file=file)
+        elements = partition_json(filename=filename, file=file)
     else:
         msg = "Invalid file" if not filename else f"Invalid file {filename}"
         raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
+
+    for element in elements:
+        element.metadata.url = url
+
+    return elements
+
+
+def file_and_type_from_url(
+    url: str,
+    content_type: Optional[str] = None,
+) -> Tuple[io.BytesIO, Optional[FileType]]:
+    response = requests.get(url)
+    file = io.BytesIO(response.content)
+
+    content_type = content_type or response.headers.get("Content-Type")
+    filetype = detect_filetype(file=file, content_type=content_type)
+    return file, filetype

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.12-dev5" # pragma: no cover`
	`1`	`+__version__ = "0.5.12" # pragma: no cover`