Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 74b0647

Browse files
authored Mar 7, 2025
Fix json bytes content type detection (#3941)
Fixes the order of content type detection strategies for byte-encoded JSON.

Example:
```
json_bytes = json.dumps([{"example": "data"}]).encode("utf-8")
file_buffer = io.BytesIO(json_bytes)
detect_filetype(file=file_buffer, metadata_file_path="filename.pdf")
```
Before: PDF
Now: JSON
1 parent 961c8d5 commit 74b0647

21 files changed

+317
-260
lines changed
 

‎CHANGELOG.md

+12
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
## 0.16.25
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
9+
- **Fixes filetype detection for JSON passed as byte streams** - Magic mimetype prediction is now prioritized over the file extension when detecting filetypes
10+
11+
112
## 0.16.24
213

314
### Enhancements
@@ -14,6 +25,7 @@
1425

1526
### Fixes
1627

28+
1729
## 0.16.23
1830

1931
### Enhancements

‎requirements/base.txt

+28-28
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
# This file is autogenerated by pip-compile with Python 3.9
33
# by the following command:
44
#
5-
# pip-compile base.in
5+
# pip-compile ./base.in
66
#
77
anyio==4.8.0
88
# via httpx
99
backoff==2.2.1
10-
# via -r base.in
10+
# via -r ./base.in
1111
beautifulsoup4==4.13.3
12-
# via -r base.in
12+
# via -r ./base.in
1313
certifi==2025.1.31
1414
# via
1515
# httpcore
@@ -19,7 +19,7 @@ certifi==2025.1.31
1919
cffi==1.17.1
2020
# via cryptography
2121
chardet==5.2.0
22-
# via -r base.in
22+
# via -r ./base.in
2323
charset-normalizer==3.4.1
2424
# via
2525
# requests
@@ -28,24 +28,24 @@ click==8.1.8
2828
# via
2929
# nltk
3030
# python-oxmsg
31-
cryptography==44.0.1
31+
cryptography==44.0.2
3232
# via unstructured-client
3333
dataclasses-json==0.6.7
3434
# via
35-
# -r base.in
35+
# -r ./base.in
3636
# unstructured-client
37-
deepdiff==8.2.0
37+
deepdiff==8.3.0
3838
# via unstructured-client
3939
emoji==2.14.1
40-
# via -r base.in
40+
# via -r ./base.in
4141
exceptiongroup==1.2.2
4242
# via anyio
4343
filetype==1.2.0
44-
# via -r base.in
44+
# via -r ./base.in
4545
h11==0.14.0
4646
# via httpcore
4747
html5lib==1.1
48-
# via -r base.in
48+
# via -r ./base.in
4949
httpcore==1.0.7
5050
# via httpx
5151
httpx==0.28.1
@@ -61,9 +61,9 @@ joblib==1.4.2
6161
jsonpath-python==1.0.6
6262
# via unstructured-client
6363
langdetect==1.0.9
64-
# via -r base.in
64+
# via -r ./base.in
6565
lxml==5.3.1
66-
# via -r base.in
66+
# via -r ./base.in
6767
marshmallow==3.26.1
6868
# via
6969
# dataclasses-json
@@ -75,9 +75,9 @@ mypy-extensions==1.0.0
7575
nest-asyncio==1.6.0
7676
# via unstructured-client
7777
nltk==3.9.1
78-
# via -r base.in
78+
# via -r ./base.in
7979
numpy==1.26.4
80-
# via -r base.in
80+
# via -r ./base.in
8181
olefile==0.47
8282
# via python-oxmsg
8383
orderly-set==5.3.0
@@ -87,26 +87,26 @@ packaging==24.2
8787
# marshmallow
8888
# unstructured-client
8989
psutil==7.0.0
90-
# via -r base.in
90+
# via -r ./base.in
9191
pycparser==2.22
9292
# via cffi
93-
pypdf==5.3.0
93+
pypdf==5.3.1
9494
# via unstructured-client
9595
python-dateutil==2.9.0.post0
9696
# via unstructured-client
9797
python-iso639==2025.2.18
98-
# via -r base.in
98+
# via -r ./base.in
9999
python-magic==0.4.27
100-
# via -r base.in
100+
# via -r ./base.in
101101
python-oxmsg==0.0.2
102-
# via -r base.in
103-
rapidfuzz==3.12.1
104-
# via -r base.in
102+
# via -r ./base.in
103+
rapidfuzz==3.12.2
104+
# via -r ./base.in
105105
regex==2024.11.6
106106
# via nltk
107107
requests==2.32.3
108108
# via
109-
# -r base.in
109+
# -r ./base.in
110110
# requests-toolbelt
111111
# unstructured-client
112112
requests-toolbelt==1.0.0
@@ -123,11 +123,11 @@ soupsieve==2.6
123123
# via beautifulsoup4
124124
tqdm==4.67.1
125125
# via
126-
# -r base.in
126+
# -r ./base.in
127127
# nltk
128128
typing-extensions==4.12.2
129129
# via
130-
# -r base.in
130+
# -r ./base.in
131131
# anyio
132132
# beautifulsoup4
133133
# pypdf
@@ -140,14 +140,14 @@ typing-inspect==0.9.0
140140
# unstructured-client
141141
unstructured-client==0.25.9
142142
# via
143-
# -c ./deps/constraints.txt
144-
# -r base.in
143+
# -c ././deps/constraints.txt
144+
# -r ./base.in
145145
urllib3==1.26.20
146146
# via
147-
# -c ./deps/constraints.txt
147+
# -c ././deps/constraints.txt
148148
# requests
149149
# unstructured-client
150150
webencodings==0.5.1
151151
# via html5lib
152152
wrapt==1.17.2
153-
# via -r base.in
153+
# via -r ./base.in

‎requirements/deps/constraints.txt

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
# we are using v3 client https://weaviate.io/developers/weaviate/client-libraries/python/python_v3
77
weaviate-client>=3.26.7,<4.0.0
88
# TODO: Constraint due to multiple versions being installed during pip-compile
9+
protobuf>=6.30.0
10+
# TODO: Constraint due to multiple versions being installed during pip-compile
911
grpcio>=1.65.5
1012
# TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py)
1113
tokenizers>=0.21,<0.22

‎requirements/dev.txt

+13-13
Original file line numberDiff line numberDiff line change
@@ -2,56 +2,56 @@
22
# This file is autogenerated by pip-compile with Python 3.9
33
# by the following command:
44
#
5-
# pip-compile dev.in
5+
# pip-compile ./dev.in
66
#
77
build==1.2.2.post1
88
# via pip-tools
99
cfgv==3.4.0
1010
# via pre-commit
1111
click==8.1.8
1212
# via
13-
# -c base.txt
14-
# -c test.txt
13+
# -c ./base.txt
14+
# -c ./test.txt
1515
# pip-tools
1616
distlib==0.3.9
1717
# via virtualenv
1818
filelock==3.17.0
1919
# via virtualenv
20-
identify==2.6.7
20+
identify==2.6.8
2121
# via pre-commit
2222
importlib-metadata==8.6.1
2323
# via
24-
# -c ./deps/constraints.txt
24+
# -c ././deps/constraints.txt
2525
# build
2626
nodeenv==1.9.1
2727
# via pre-commit
2828
packaging==24.2
2929
# via
30-
# -c base.txt
31-
# -c test.txt
30+
# -c ./base.txt
31+
# -c ./test.txt
3232
# build
3333
pip-tools==7.4.1
34-
# via -r dev.in
34+
# via -r ./dev.in
3535
platformdirs==4.3.6
3636
# via
37-
# -c test.txt
37+
# -c ./test.txt
3838
# virtualenv
3939
pre-commit==4.1.0
40-
# via -r dev.in
40+
# via -r ./dev.in
4141
pyproject-hooks==1.2.0
4242
# via
4343
# build
4444
# pip-tools
4545
pyyaml==6.0.2
4646
# via
47-
# -c test.txt
47+
# -c ./test.txt
4848
# pre-commit
4949
tomli==2.2.1
5050
# via
51-
# -c test.txt
51+
# -c ./test.txt
5252
# build
5353
# pip-tools
54-
virtualenv==20.29.2
54+
virtualenv==20.29.3
5555
# via pre-commit
5656
wheel==0.45.1
5757
# via pip-tools

‎requirements/extra-csv.txt

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,23 @@
22
# This file is autogenerated by pip-compile with Python 3.9
33
# by the following command:
44
#
5-
# pip-compile extra-csv.in
5+
# pip-compile ./extra-csv.in
66
#
77
numpy==1.26.4
88
# via
9-
# -c base.txt
9+
# -c ./base.txt
1010
# pandas
1111
pandas==2.2.3
12-
# via -r extra-csv.in
12+
# via -r ./extra-csv.in
1313
python-dateutil==2.9.0.post0
1414
# via
15-
# -c base.txt
15+
# -c ./base.txt
1616
# pandas
1717
pytz==2025.1
1818
# via pandas
1919
six==1.17.0
2020
# via
21-
# -c base.txt
21+
# -c ./base.txt
2222
# python-dateutil
2323
tzdata==2025.1
2424
# via pandas

‎requirements/extra-docx.txt

+4-4
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@
22
# This file is autogenerated by pip-compile with Python 3.9
33
# by the following command:
44
#
5-
# pip-compile extra-docx.in
5+
# pip-compile ./extra-docx.in
66
#
77
lxml==5.3.1
88
# via
9-
# -c base.txt
9+
# -c ./base.txt
1010
# python-docx
1111
python-docx==1.1.2
12-
# via -r extra-docx.in
12+
# via -r ./extra-docx.in
1313
typing-extensions==4.12.2
1414
# via
15-
# -c base.txt
15+
# -c ./base.txt
1616
# python-docx

‎requirements/extra-epub.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# This file is autogenerated by pip-compile with Python 3.9
33
# by the following command:
44
#
5-
# pip-compile extra-epub.in
5+
# pip-compile ./extra-epub.in
66
#
77
pypandoc==1.15
8-
# via -r extra-epub.in
8+
# via -r ./extra-epub.in

‎requirements/extra-markdown.txt

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
# This file is autogenerated by pip-compile with Python 3.9
33
# by the following command:
44
#
5-
# pip-compile extra-markdown.in
5+
# pip-compile ./extra-markdown.in
66
#
77
importlib-metadata==8.6.1
88
# via
9-
# -c ./deps/constraints.txt
9+
# -c ././deps/constraints.txt
1010
# markdown
1111
markdown==3.7
12-
# via -r extra-markdown.in
12+
# via -r ./extra-markdown.in
1313
zipp==3.21.0
1414
# via importlib-metadata

‎requirements/extra-odt.txt

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,17 @@
22
# This file is autogenerated by pip-compile with Python 3.9
33
# by the following command:
44
#
5-
# pip-compile extra-odt.in
5+
# pip-compile ./extra-odt.in
66
#
77
lxml==5.3.1
88
# via
9-
# -c base.txt
9+
# -c ./base.txt
1010
# python-docx
1111
pypandoc==1.15
12-
# via -r extra-odt.in
12+
# via -r ./extra-odt.in
1313
python-docx==1.1.2
14-
# via -r extra-odt.in
14+
# via -r ./extra-odt.in
1515
typing-extensions==4.12.2
1616
# via
17-
# -c base.txt
17+
# -c ./base.txt
1818
# python-docx

0 commit comments

Comments
 (0)
Please sign in to comment.