Skip to content

Commit 0df50fe

Browse files
authored
Fix file detection when a spooled file is passed (#3932)
This pull request fixes the scenario where a `SpooledTemporaryFile` is passed to `detect_filetype`. In such cases an arbitrary number was assigned as the file's `name` (and it couldn't be overwritten, because a `SpooledTemporaryFile` can't have fields assigned 😩), so I added one more scenario to our object factory that handles this type of file by copying its contents into a `BytesIO`. For `BytesIO` the `name` attribute is `None`, as it should be, and other metadata fields are leveraged for file type recognition.
1 parent 147add9 commit 0df50fe

File tree

4 files changed

+33
-4
lines changed

4 files changed

+33
-4
lines changed

CHANGELOG.md

+10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.16.23
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
- **Fixes `detect_filetype` when a `SpooledTemporaryFile` is passed**. Previously an arbitrary name was assigned to the file, which caused the function to raise an error.
9+
10+
111
## 0.16.22
212

313
### Enhancements

test_unstructured/partition/test_auto.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import warnings
1010
from importlib import import_module
1111
from typing import Iterator
12-
from unittest.mock import patch
12+
from unittest.mock import MagicMock, patch
1313

1414
import pytest
1515
from PIL import Image
@@ -42,6 +42,7 @@
4242
Text,
4343
Title,
4444
)
45+
from unstructured.file_utils.filetype import detect_filetype
4546
from unstructured.file_utils.model import FileType
4647
from unstructured.partition.auto import _PartitionerLoader, partition
4748
from unstructured.partition.common import UnsupportedFileFormatError
@@ -1241,6 +1242,17 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(
12411242
)
12421243

12431244

1245+
def test_detect_filetype_maps_file_to_bytes_io_when_spooled_temp_file_used(mocker):
1246+
detect_filetype_mock = MagicMock(return_value=FileType.JSON)
1247+
mocker.patch("unstructured.file_utils.filetype._FileTypeDetector", detect_filetype_mock)
1248+
with tempfile.SpooledTemporaryFile() as f:
1249+
f.write(b'{"text": Hello, world!}')
1250+
f.seek(0)
1251+
detect_filetype(file=f)
1252+
file_detection_context = detect_filetype_mock.file_type.call_args[0][0]
1253+
assert file_detection_context.text_head == '{"text": Hello, world!}'
1254+
1255+
12441256
# -- .languages -----------------------------------------------------------
12451257

12461258

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.22" # pragma: no cover
1+
__version__ = "0.16.23" # pragma: no cover

unstructured/file_utils/filetype.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,11 @@
3131
import contextlib
3232
import functools
3333
import importlib.util
34+
import io
3435
import json
3536
import os
3637
import re
38+
import tempfile
3739
import zipfile
3840
from typing import IO, Callable, Iterator, Optional
3941

@@ -60,7 +62,7 @@
6062

6163
def detect_filetype(
6264
file_path: str | None = None,
63-
file: IO[bytes] | None = None,
65+
file: IO[bytes] | tempfile.SpooledTemporaryFile | None = None,
6466
encoding: str | None = None,
6567
content_type: str | None = None,
6668
metadata_file_path: Optional[str] = None,
@@ -92,9 +94,14 @@ def detect_filetype(
9294
filesystem.
9395
- Neither `file_path` nor `file` were specified.
9496
"""
97+
file_buffer = file
98+
if isinstance(file, tempfile.SpooledTemporaryFile):
99+
file_buffer = io.BytesIO(file.read())
100+
file.seek(0)
101+
95102
ctx = _FileTypeDetectionContext.new(
96103
file_path=file_path,
97-
file=file,
104+
file=file_buffer,
98105
encoding=encoding,
99106
content_type=content_type,
100107
metadata_file_path=metadata_file_path,

0 commit comments

Comments
 (0)