From 44e016897da08267cd2396b7d1be8202fd79d4bc Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Tue, 28 Jan 2025 15:46:43 -0800 Subject: [PATCH 01/12] Fix starlette vulnerability --- requirements/base.txt | 142 +++++++++++++--------------- requirements/constraints.in | 1 + requirements/test.txt | 184 ++++++++++++++++-------------------- 3 files changed, 145 insertions(+), 182 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 61458431..8d8d40f2 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # -# pip-compile --config=pyproject.toml requirements/base.in +# pip-compile requirements/base.in # aiofiles==24.1.0 # via unstructured-client @@ -10,7 +10,7 @@ annotated-types==0.7.0 # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf -anyio==4.7.0 +anyio==4.8.0 # via # httpx # starlette @@ -20,9 +20,9 @@ backoff==2.2.1 # unstructured beautifulsoup4==4.12.3 # via unstructured -cachetools==5.5.0 +cachetools==5.5.1 # via google-auth -certifi==2024.8.30 +certifi==2024.12.14 # via # httpcore # httpx @@ -31,7 +31,7 @@ cffi==1.17.1 # via cryptography chardet==5.2.0 # via unstructured -charset-normalizer==3.4.0 +charset-normalizer==3.4.1 # via # pdfminer-six # requests @@ -53,36 +53,36 @@ cycler==0.12.1 # via matplotlib dataclasses-json==0.6.7 # via unstructured -deprecated==1.2.15 +deprecated==1.2.18 # via pikepdf effdet==0.4.1 # via unstructured -emoji==2.14.0 +emoji==2.14.1 # via unstructured et-xmlfile==2.0.0 # via openpyxl -eval-type-backport==0.2.0 +eval-type-backport==0.2.2 # via unstructured-client -fastapi==0.113.0 +fastapi==0.1.17 # via -r requirements/base.in -filelock==3.16.1 +filelock==3.17.0 # via # huggingface-hub # torch # transformers filetype==1.2.0 # via unstructured -flatbuffers==24.3.25 +flatbuffers==25.1.24 # via onnxruntime -fonttools==4.55.3 +fonttools==4.55.7 # via matplotlib -fsspec==2024.10.0 +fsspec==2024.12.0 # via # huggingface-hub # torch -google-api-core[grpc]==2.24.0 +google-api-core[grpc]==2.24.1 # via google-cloud-vision -google-auth==2.37.0 +google-auth==2.38.0 # via # google-api-core # google-cloud-vision @@ -92,11 +92,11 @@ googleapis-common-protos==1.66.0 # via # google-api-core # grpcio-status -grpcio==1.68.1 +grpcio==1.70.0 # via # google-api-core # grpcio-status -grpcio-status==1.68.1 +grpcio-status==1.70.0 # via google-api-core h11==0.14.0 # via @@ -108,7 +108,7 @@ httpcore==1.0.7 # via httpx httpx==0.28.1 # via unstructured-client -huggingface-hub==0.26.5 +huggingface-hub==0.28.0 # via # timm # tokenizers @@ -121,20 +121,16 @@ idna==3.10 # anyio # httpx # requests -iopath==0.1.10 - # via layoutparser -jinja2==3.1.4 +jinja2==3.1.5 # via torch joblib==1.4.2 # via nltk jsonpath-python==1.0.6 # via unstructured-client -kiwisolver==1.4.7 +kiwisolver==1.4.8 # via matplotlib langdetect==1.0.9 # via unstructured -layoutparser==0.3.4 - # via unstructured-inference lxml==5.3.0 # via # pikepdf @@ -145,9 +141,9 @@ markdown==3.7 # via unstructured markupsafe==3.0.2 # via jinja2 -marshmallow==3.23.1 +marshmallow==3.26.0 # via dataclasses-json -matplotlib==3.9.4 +matplotlib==3.10.0 # via # pycocotools # unstructured-inference @@ -167,7 +163,6 @@ numpy==1.26.4 # via # -c requirements/constraints.in # contourpy - # layoutparser # matplotlib # onnx # onnxruntime @@ -189,10 +184,8 @@ onnx==1.17.0 # unstructured-inference onnxruntime==1.20.1 # via unstructured-inference -opencv-python==4.10.0.84 - # via - # layoutparser - # unstructured-inference +opencv-python==4.11.0.86 + # via unstructured-inference openpyxl==3.1.5 # via unstructured packaging==24.2 @@ -206,40 +199,32 @@ packaging==24.2 # unstructured-pytesseract pandas==2.2.3 # via - # layoutparser # unstructured + # unstructured-inference pdf2image==1.17.0 + # via unstructured +pdfminer-six==20240706 # via - # layoutparser - # unstructured -pdfminer-six==20231228 - # via - # pdfplumber # unstructured -pdfplumber==0.11.4 - # via layoutparser + # unstructured-inference pi-heif==0.21.0 # via unstructured -pikepdf==9.4.2 +pikepdf==9.5.1 # via unstructured -pillow==11.0.0 +pillow==11.1.0 # via - # layoutparser # matplotlib # pdf2image - # pdfplumber # pi-heif # pikepdf # python-pptx # torchvision # unstructured-pytesseract -portalocker==3.0.0 - # via iopath -proto-plus==1.25.0 +proto-plus==1.26.0 # via # google-api-core # google-cloud-vision -protobuf==5.29.1 +protobuf==5.29.3 # via # google-api-core # google-cloud-vision @@ -248,7 +233,7 @@ protobuf==5.29.1 # onnx # onnxruntime # proto-plus -psutil==6.1.0 +psutil==6.1.1 # via # -r requirements/base.in # unstructured @@ -264,23 +249,23 @@ pycparser==2.22 # via cffi pycryptodome==3.21.0 # via -r requirements/base.in -pydantic==2.9.2 +pydantic==2.10.6 # via # fastapi # unstructured-client -pydantic-core==2.23.4 +pydantic-core==2.27.2 # via pydantic -pypandoc==1.14 +pypandoc==1.15 # via unstructured -pyparsing==3.2.0 +pyparsing==3.2.1 # via matplotlib -pypdf==5.1.0 +pypdf==5.2.0 # via # -r requirements/base.in # unstructured # unstructured-client -pypdfium2==4.30.0 - # via pdfplumber +pypdfium2==4.30.1 + # via unstructured-inference python-dateutil==2.9.0.post0 # via # matplotlib @@ -288,11 +273,11 @@ python-dateutil==2.9.0.post0 # unstructured-client python-docx==1.1.2 # via unstructured -python-iso639==2024.10.22 +python-iso639==2025.1.28 # via unstructured python-magic==0.4.27 # via unstructured -python-multipart==0.0.19 +python-multipart==0.0.20 # via unstructured-inference python-oxmsg==0.0.1 # via unstructured @@ -303,11 +288,10 @@ pytz==2024.2 pyyaml==6.0.2 # via # huggingface-hub - # layoutparser # omegaconf # timm # transformers -rapidfuzz==3.10.1 +rapidfuzz==3.11.0 # via # unstructured # unstructured-inference @@ -329,12 +313,12 @@ requests-toolbelt==1.0.0 # via unstructured-client rsa==4.9 # via google-auth -safetensors==0.4.5 +safetensors==0.5.2 # via # timm # transformers -scipy==1.14.1 - # via layoutparser +scipy==1.15.1 + # via unstructured-inference six==1.17.0 # via # html5lib @@ -344,13 +328,15 @@ sniffio==1.3.1 # via anyio soupsieve==2.6 # via beautifulsoup4 -starlette==0.38.6 - # via fastapi +starlette==0.45.3 + # via + # -c requirements/constraints.in + # fastapi sympy==1.13.1 # via # onnxruntime # torch -timm==1.0.12 +timm==1.0.14 # via # effdet # unstructured-inference @@ -369,18 +355,15 @@ torchvision==0.20.1 tqdm==4.67.1 # via # huggingface-hub - # iopath # nltk # transformers # unstructured -transformers==4.47.0 +transformers==4.48.1 # via unstructured-inference typing-extensions==4.12.2 # via # anyio - # fastapi # huggingface-hub - # iopath # pydantic # pydantic-core # python-docx @@ -393,27 +376,30 @@ typing-inspect==0.9.0 # via # dataclasses-json # unstructured-client -tzdata==2024.2 +tzdata==2025.1 # via pandas -unstructured[all-docs]==0.16.11 +unstructured[all-docs]==0.16.16 # via -r requirements/base.in -unstructured-client==0.28.1 +unstructured-client==0.29.0 # via unstructured -unstructured-inference==0.8.1 +unstructured-inference==0.8.6 # via unstructured unstructured-pytesseract==0.3.13 # via unstructured -urllib3==2.2.3 +urllib3==2.3.0 # via requests -uvicorn==0.32.1 +uvicorn==0.34.0 # via -r requirements/base.in webencodings==0.5.1 # via html5lib -wrapt==1.17.0 +wrapt==1.17.2 # via # deprecated # unstructured xlrd==2.0.1 # via unstructured -xlsxwriter==3.2.0 +xlsxwriter==3.2.2 # via python-pptx + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/requirements/constraints.in b/requirements/constraints.in index 94c33a81..23c54ecd 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -4,3 +4,4 @@ # words, if something does not require a constraint, it will not be installed. #################################################################################################### numpy<2.0.0 +starlette>=0.40.0 \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index efbac57c..51b83cf2 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # -# pip-compile --config=pyproject.toml --output-file=requirements/test.txt requirements/base.txt requirements/test.in +# pip-compile --output-file=requirements/test.txt requirements/base.txt requirements/test.in # aiofiles==24.1.0 # via @@ -16,7 +16,7 @@ antlr4-python3-runtime==4.9.3 # via # -r requirements/base.txt # omegaconf -anyio==4.7.0 +anyio==4.8.0 # via # -r requirements/base.txt # httpx @@ -38,7 +38,7 @@ astunparse==1.6.3 # via nbdev async-lru==2.0.4 # via jupyterlab -attrs==24.2.0 +attrs==25.1.0 # via # jsonschema # referencing @@ -55,13 +55,13 @@ beautifulsoup4==4.12.3 # unstructured black==24.10.0 # via -r requirements/test.in -bleach==6.2.0 +bleach[css]==6.2.0 # via nbconvert -cachetools==5.5.0 +cachetools==5.5.1 # via # -r requirements/base.txt # google-auth -certifi==2024.8.30 +certifi==2024.12.14 # via # -r requirements/base.txt # httpcore @@ -76,7 +76,7 @@ chardet==5.2.0 # via # -r requirements/base.txt # unstructured -charset-normalizer==3.4.0 +charset-normalizer==3.4.1 # via # -r requirements/base.txt # pdfminer-six @@ -101,7 +101,7 @@ contourpy==1.3.1 # via # -r requirements/base.txt # matplotlib -coverage[toml]==7.6.9 +coverage[toml]==7.6.10 # via pytest-cov cryptography==44.0.0 # via @@ -116,15 +116,15 @@ dataclasses-json==0.6.7 # via # -r requirements/base.txt # unstructured -debugpy==1.8.11 +debugpy==1.8.12 # via ipykernel decorator==5.1.1 # via ipython -deepdiff==8.0.1 +deepdiff==8.1.1 # via -r requirements/test.in defusedxml==0.7.1 # via nbconvert -deprecated==1.2.15 +deprecated==1.2.18 # via # -r requirements/base.txt # pikepdf @@ -132,7 +132,7 @@ effdet==0.4.1 # via # -r requirements/base.txt # unstructured -emoji==2.14.0 +emoji==2.14.1 # via # -r requirements/base.txt # unstructured @@ -140,24 +140,24 @@ et-xmlfile==2.0.0 # via # -r requirements/base.txt # openpyxl -eval-type-backport==0.2.0 +eval-type-backport==0.2.2 # via # -r requirements/base.txt # unstructured-client execnb==0.1.11 # via nbdev -executing==2.1.0 +executing==2.2.0 # via stack-data -fastapi==0.113.0 +fastapi==0.1.17 # via -r requirements/base.txt -fastcore==1.7.26 +fastcore==1.7.28 # via # execnb # ghapi # nbdev fastjsonschema==2.21.1 # via nbformat -filelock==3.16.1 +filelock==3.17.0 # via # -r requirements/base.txt # huggingface-hub @@ -169,28 +169,28 @@ filetype==1.2.0 # unstructured flake8==7.1.1 # via -r requirements/test.in -flatbuffers==24.3.25 +flatbuffers==25.1.24 # via # -r requirements/base.txt # onnxruntime -fonttools==4.55.3 +fonttools==4.55.7 # via # -r requirements/base.txt # matplotlib fqdn==1.5.1 # via jsonschema -fsspec==2024.10.0 +fsspec==2024.12.0 # via # -r requirements/base.txt # huggingface-hub # torch ghapi==1.0.6 # via nbdev -google-api-core[grpc]==2.24.0 +google-api-core[grpc]==2.24.1 # via # -r requirements/base.txt # google-cloud-vision -google-auth==2.37.0 +google-auth==2.38.0 # via # -r requirements/base.txt # google-api-core @@ -204,12 +204,12 @@ googleapis-common-protos==1.66.0 # -r requirements/base.txt # google-api-core # grpcio-status -grpcio==1.68.1 +grpcio==1.70.0 # via # -r requirements/base.txt # google-api-core # grpcio-status -grpcio-status==1.68.1 +grpcio-status==1.70.0 # via # -r requirements/base.txt # google-api-core @@ -232,7 +232,7 @@ httpx==0.28.1 # -r requirements/test.in # jupyterlab # unstructured-client -huggingface-hub==0.26.5 +huggingface-hub==0.28.0 # via # -r requirements/base.txt # timm @@ -252,16 +252,12 @@ idna==3.10 # requests iniconfig==2.0.0 # via pytest -iopath==0.1.10 - # via - # -r requirements/base.txt - # layoutparser ipykernel==6.29.5 # via # jupyter # jupyter-console # jupyterlab -ipython==8.30.0 +ipython==8.31.0 # via # execnb # ipykernel @@ -273,7 +269,7 @@ isoduration==20.11.0 # via jsonschema jedi==0.19.2 # via ipython -jinja2==3.1.4 +jinja2==3.1.5 # via # -r requirements/base.txt # jupyter-server @@ -320,11 +316,11 @@ jupyter-core==5.7.2 # nbclient # nbconvert # nbformat -jupyter-events==0.10.0 +jupyter-events==0.11.0 # via jupyter-server jupyter-lsp==2.2.5 # via jupyterlab -jupyter-server==2.14.2 +jupyter-server==2.15.0 # via # jupyter-lsp # jupyterlab @@ -333,7 +329,7 @@ jupyter-server==2.14.2 # notebook-shim jupyter-server-terminals==0.5.3 # via jupyter-server -jupyterlab==4.3.3 +jupyterlab==4.3.4 # via # jupyter # notebook @@ -345,7 +341,7 @@ jupyterlab-server==2.27.3 # notebook jupyterlab-widgets==3.0.13 # via ipywidgets -kiwisolver==1.4.7 +kiwisolver==1.4.8 # via # -r requirements/base.txt # matplotlib @@ -353,10 +349,6 @@ langdetect==1.0.9 # via # -r requirements/base.txt # unstructured -layoutparser==0.3.4 - # via - # -r requirements/base.txt - # unstructured-inference lxml==5.3.0 # via # -r requirements/base.txt @@ -373,11 +365,11 @@ markupsafe==3.0.2 # -r requirements/base.txt # jinja2 # nbconvert -marshmallow==3.23.1 +marshmallow==3.26.0 # via # -r requirements/base.txt # dataclasses-json -matplotlib==3.9.4 +matplotlib==3.10.0 # via # -r requirements/base.txt # pycocotools @@ -388,13 +380,13 @@ matplotlib-inline==0.1.7 # ipython mccabe==0.7.0 # via flake8 -mistune==3.0.2 +mistune==3.1.1 # via nbconvert mpmath==1.3.0 # via # -r requirements/base.txt # sympy -mypy==1.13.0 +mypy==1.14.1 # via -r requirements/test.in mypy-extensions==1.0.0 # via @@ -402,9 +394,9 @@ mypy-extensions==1.0.0 # black # mypy # typing-inspect -nbclient==0.10.1 +nbclient==0.10.2 # via nbconvert -nbconvert==7.16.4 +nbconvert==7.16.6 # via # jupyter # jupyter-server @@ -429,7 +421,7 @@ nltk==3.9.1 # via # -r requirements/base.txt # unstructured -notebook==7.3.1 +notebook==7.3.2 # via jupyter notebook-shim==0.2.4 # via @@ -440,7 +432,6 @@ numpy==1.26.4 # -c requirements/constraints.in # -r requirements/base.txt # contourpy - # layoutparser # matplotlib # onnx # onnxruntime @@ -469,16 +460,15 @@ onnxruntime==1.20.1 # via # -r requirements/base.txt # unstructured-inference -opencv-python==4.10.0.84 +opencv-python==4.11.0.86 # via # -r requirements/base.txt - # layoutparser # unstructured-inference openpyxl==3.1.5 # via # -r requirements/base.txt # unstructured -orderly-set==5.2.2 +orderly-set==5.2.3 # via deepdiff overrides==7.7.0 # via jupyter-server @@ -505,8 +495,8 @@ packaging==24.2 pandas==2.2.3 # via # -r requirements/base.txt - # layoutparser # unstructured + # unstructured-inference pandocfilters==1.5.1 # via nbconvert parso==0.8.4 @@ -516,34 +506,27 @@ pathspec==0.12.1 pdf2image==1.17.0 # via # -r requirements/base.txt - # layoutparser # unstructured -pdfminer-six==20231228 +pdfminer-six==20240706 # via # -r requirements/base.txt - # pdfplumber # unstructured -pdfplumber==0.11.4 - # via - # -r requirements/base.txt - # layoutparser + # unstructured-inference pexpect==4.9.0 # via ipython pi-heif==0.21.0 # via # -r requirements/base.txt # unstructured -pikepdf==9.4.2 +pikepdf==9.5.1 # via # -r requirements/base.txt # unstructured -pillow==11.0.0 +pillow==11.1.0 # via # -r requirements/base.txt - # layoutparser # matplotlib # pdf2image - # pdfplumber # pi-heif # pikepdf # python-pptx @@ -555,22 +538,18 @@ platformdirs==4.3.6 # jupyter-core pluggy==1.5.0 # via pytest -portalocker==3.0.0 - # via - # -r requirements/base.txt - # iopath prometheus-client==0.21.1 # via jupyter-server -prompt-toolkit==3.0.48 +prompt-toolkit==3.0.50 # via # ipython # jupyter-console -proto-plus==1.25.0 +proto-plus==1.26.0 # via # -r requirements/base.txt # google-api-core # google-cloud-vision -protobuf==5.29.1 +protobuf==5.29.3 # via # -r requirements/base.txt # google-api-core @@ -580,7 +559,7 @@ protobuf==5.29.1 # onnx # onnxruntime # proto-plus -psutil==6.1.0 +psutil==6.1.1 # via # -r requirements/base.txt # ipykernel @@ -612,39 +591,39 @@ pycparser==2.22 # cffi pycryptodome==3.21.0 # via -r requirements/base.txt -pydantic==2.9.2 +pydantic==2.10.6 # via # -r requirements/base.txt # fastapi # unstructured-client -pydantic-core==2.23.4 +pydantic-core==2.27.2 # via # -r requirements/base.txt # pydantic pyflakes==3.2.0 # via flake8 -pygments==2.18.0 +pygments==2.19.1 # via # ipython # jupyter-console # nbconvert -pypandoc==1.14 +pypandoc==1.15 # via # -r requirements/base.txt # unstructured -pyparsing==3.2.0 +pyparsing==3.2.1 # via # -r requirements/base.txt # matplotlib -pypdf==5.1.0 +pypdf==5.2.0 # via # -r requirements/base.txt # unstructured # unstructured-client -pypdfium2==4.30.0 +pypdfium2==4.30.1 # via # -r requirements/base.txt - # pdfplumber + # unstructured-inference pytest==8.3.4 # via # pytest-cov @@ -665,17 +644,17 @@ python-docx==1.1.2 # via # -r requirements/base.txt # unstructured -python-iso639==2024.10.22 +python-iso639==2025.1.28 # via # -r requirements/base.txt # unstructured -python-json-logger==3.2.0 +python-json-logger==3.2.1 # via jupyter-events python-magic==0.4.27 # via # -r requirements/base.txt # unstructured -python-multipart==0.0.19 +python-multipart==0.0.20 # via # -r requirements/base.txt # unstructured-inference @@ -696,7 +675,6 @@ pyyaml==6.0.2 # -r requirements/base.txt # huggingface-hub # jupyter-events - # layoutparser # nbdev # omegaconf # timm @@ -707,14 +685,14 @@ pyzmq==26.2.0 # jupyter-client # jupyter-console # jupyter-server -rapidfuzz==3.10.1 +rapidfuzz==3.11.0 # via # -r requirements/base.txt # unstructured # unstructured-inference ratelimit==2.2.1 # via -r requirements/base.txt -referencing==0.35.1 +referencing==0.36.2 # via # jsonschema # jsonschema-specifications @@ -753,15 +731,15 @@ rsa==4.9 # via # -r requirements/base.txt # google-auth -safetensors==0.4.5 +safetensors==0.5.2 # via # -r requirements/base.txt # timm # transformers -scipy==1.14.1 +scipy==1.15.1 # via # -r requirements/base.txt - # layoutparser + # unstructured-inference send2trash==1.8.3 # via jupyter-server six==1.17.0 @@ -782,8 +760,9 @@ soupsieve==2.6 # beautifulsoup4 stack-data==0.6.3 # via ipython -starlette==0.38.6 +starlette==0.45.3 # via + # -c requirements/constraints.in # -r requirements/base.txt # fastapi sympy==1.13.1 @@ -795,13 +774,13 @@ terminado==0.18.1 # via # jupyter-server # jupyter-server-terminals -timm==1.0.12 +timm==1.0.14 # via # -r requirements/base.txt # effdet # unstructured-inference tinycss2==1.4.0 - # via nbconvert + # via bleach tokenizers==0.21.0 # via # -r requirements/base.txt @@ -830,7 +809,6 @@ tqdm==4.67.1 # via # -r requirements/base.txt # huggingface-hub - # iopath # nltk # transformers # unstructured @@ -850,7 +828,7 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat -transformers==4.47.0 +transformers==4.48.1 # via # -r requirements/base.txt # unstructured-inference @@ -860,16 +838,14 @@ typing-extensions==4.12.2 # via # -r requirements/base.txt # anyio - # fastapi # huggingface-hub - # iopath - # ipython # mypy # pydantic # pydantic-core # python-docx # python-oxmsg # python-pptx + # referencing # torch # typing-inspect # unstructured @@ -878,17 +854,17 @@ typing-inspect==0.9.0 # -r requirements/base.txt # dataclasses-json # unstructured-client -tzdata==2024.2 +tzdata==2025.1 # via # -r requirements/base.txt # pandas -unstructured[all-docs]==0.16.11 +unstructured[all-docs]==0.16.16 # via -r requirements/base.txt -unstructured-client==0.28.1 +unstructured-client==0.29.0 # via # -r requirements/base.txt # unstructured -unstructured-inference==0.8.1 +unstructured-inference==0.8.6 # via # -r requirements/base.txt # unstructured @@ -898,11 +874,11 @@ unstructured-pytesseract==0.3.13 # unstructured uri-template==1.3.0 # via jsonschema -urllib3==2.2.3 +urllib3==2.3.0 # via # -r requirements/base.txt # requests -uvicorn==0.32.1 +uvicorn==0.34.0 # via -r requirements/base.txt watchdog==6.0.0 # via nbdev @@ -922,7 +898,7 @@ wheel==0.45.1 # via astunparse widgetsnbextension==4.0.13 # via ipywidgets -wrapt==1.17.0 +wrapt==1.17.2 # via # -r requirements/base.txt # deprecated @@ -931,7 +907,7 @@ xlrd==2.0.1 # via # -r requirements/base.txt # unstructured -xlsxwriter==3.2.0 +xlsxwriter==3.2.2 # via # -r requirements/base.txt # python-pptx From 4824094029295769f790711562ba991d50ec5701 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Tue, 28 Jan 2025 15:54:16 -0800 Subject: [PATCH 02/12] Incr version --- CHANGELOG.md | 4 ++++ prepline_general/api/app.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cbd7a4d..9709412b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.0.83 + +* Fix Starlette vulnerability + ## 0.0.82 * Patch various python CVEs diff --git a/prepline_general/api/app.py b/prepline_general/api/app.py index 798d1815..d8301c90 100644 --- a/prepline_general/api/app.py +++ b/prepline_general/api/app.py @@ -13,7 +13,7 @@ app = FastAPI( title="Unstructured Pipeline API", summary="Partition documents with the Unstructured library", - version="0.0.82", + version="0.0.83", docs_url="/general/docs", openapi_url="/general/openapi.json", servers=[ From 3798a4fdaecf559a22c7e2b66e893e79b0648268 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Tue, 28 Jan 2025 15:59:00 -0800 Subject: [PATCH 03/12] Incr version --- prepline_general/api/general.py | 4 ++-- preprocessing-pipeline-family.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index e378ab04..ae5ed3b4 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -602,7 +602,7 @@ def return_content_type(filename: str): @router.get("/general/v0/general", include_in_schema=False) -@router.get("/general/v0.0.82/general", include_in_schema=False) +@router.get("/general/v0.0.83/general", include_in_schema=False) async def handle_invalid_get_request(): raise HTTPException( status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported." @@ -617,7 +617,7 @@ async def handle_invalid_get_request(): description="Description", operation_id="partition_parameters", ) -@router.post("/general/v0.0.82/general", include_in_schema=False) +@router.post("/general/v0.0.83/general", include_in_schema=False) def general_partition( request: Request, # cannot use annotated type here because of a bug described here: diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml index 116f88e1..3addd95b 100644 --- a/preprocessing-pipeline-family.yaml +++ b/preprocessing-pipeline-family.yaml @@ -1,2 +1,2 @@ name: general -version: 0.0.82 +version: 0.0.83 From b64690be45a461a1ffb2356f6401f9d00320df30 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Tue, 28 Jan 2025 16:30:25 -0800 Subject: [PATCH 04/12] Remove fastapi pin --- requirements/base.in | 6 +----- requirements/base.txt | 3 ++- requirements/test.txt | 3 ++- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/requirements/base.in b/requirements/base.in index fda4c50f..8a94d232 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -4,11 +4,7 @@ unstructured[all-docs] # can remove after black drops support for Python 3.6 # ref: https://github.com/psf/black/issues/2964 click==8.1.3 -# NOTE(robinson) - fastapi>=0.114.0 causes the test listed below to fail, though it -# works if data if chunking strategy and new_after_n_chars are explicitly set. Pinning -# for now to preserve behavior -# test_parallel_mode_preserves_uniqueness_of_hashes_when_assembling_page_splits -fastapi<0.114.0 +fastapi uvicorn ratelimit requests diff --git a/requirements/base.txt b/requirements/base.txt index 8d8d40f2..3c98675e 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -63,7 +63,7 @@ et-xmlfile==2.0.0 # via openpyxl eval-type-backport==0.2.2 # via unstructured-client -fastapi==0.1.17 +fastapi==0.115.7 # via -r requirements/base.in filelock==3.17.0 # via @@ -363,6 +363,7 @@ transformers==4.48.1 typing-extensions==4.12.2 # via # anyio + # fastapi # huggingface-hub # pydantic # pydantic-core diff --git a/requirements/test.txt b/requirements/test.txt index 51b83cf2..bf2b06f5 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -148,7 +148,7 @@ execnb==0.1.11 # via nbdev executing==2.2.0 # via stack-data -fastapi==0.1.17 +fastapi==0.115.7 # via -r requirements/base.txt fastcore==1.7.28 # via @@ -838,6 +838,7 @@ typing-extensions==4.12.2 # via # -r requirements/base.txt # anyio + # fastapi # huggingface-hub # mypy # pydantic From 07379052080807338ad07655f6c71a6cbc96e2b5 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Wed, 29 Jan 2025 20:49:55 -0800 Subject: [PATCH 05/12] Pin Starlette to avoid breaking middleware --- requirements/base.txt | 10 +++++----- requirements/constraints.in | 2 +- requirements/test.txt | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 3c98675e..81d3b40a 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -74,7 +74,7 @@ filetype==1.2.0 # via unstructured flatbuffers==25.1.24 # via onnxruntime -fonttools==4.55.7 +fonttools==4.55.8 # via matplotlib fsspec==2024.12.0 # via @@ -328,7 +328,7 @@ sniffio==1.3.1 # via anyio soupsieve==2.6 # via beautifulsoup4 -starlette==0.45.3 +starlette==0.43.0 # via # -c requirements/constraints.in # fastapi @@ -342,13 +342,13 @@ timm==1.0.14 # unstructured-inference tokenizers==0.21.0 # via transformers -torch==2.5.1 +torch==2.6.0 # via # effdet # timm # torchvision # unstructured-inference -torchvision==0.20.1 +torchvision==0.21.0 # via # effdet # timm @@ -379,7 +379,7 @@ typing-inspect==0.9.0 # unstructured-client tzdata==2025.1 # via pandas -unstructured[all-docs]==0.16.16 +unstructured[all-docs]==0.16.17 # via -r requirements/base.in unstructured-client==0.29.0 # via unstructured diff --git a/requirements/constraints.in b/requirements/constraints.in index 23c54ecd..6b2666c1 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -4,4 +4,4 @@ # words, if something does not require a constraint, it will not be installed. #################################################################################################### numpy<2.0.0 -starlette>=0.40.0 \ No newline at end of file +starlette==0.43.0 \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index bf2b06f5..582eef9d 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -53,7 +53,7 @@ beautifulsoup4==4.12.3 # -r requirements/base.txt # nbconvert # unstructured -black==24.10.0 +black==25.1.0 # via -r requirements/test.in bleach[css]==6.2.0 # via nbconvert @@ -173,7 +173,7 @@ flatbuffers==25.1.24 # via # -r requirements/base.txt # onnxruntime -fonttools==4.55.7 +fonttools==4.55.8 # via # -r requirements/base.txt # matplotlib @@ -329,7 +329,7 @@ jupyter-server==2.15.0 # notebook-shim jupyter-server-terminals==0.5.3 # via jupyter-server -jupyterlab==4.3.4 +jupyterlab==4.3.5 # via # jupyter # notebook @@ -760,7 +760,7 @@ soupsieve==2.6 # beautifulsoup4 stack-data==0.6.3 # via ipython -starlette==0.45.3 +starlette==0.43.0 # via # -c requirements/constraints.in # -r requirements/base.txt @@ -785,14 +785,14 @@ tokenizers==0.21.0 # via # -r requirements/base.txt # transformers -torch==2.5.1 +torch==2.6.0 # via # -r requirements/base.txt # effdet # timm # torchvision # unstructured-inference -torchvision==0.20.1 +torchvision==0.21.0 # via # -r requirements/base.txt # effdet @@ -859,7 +859,7 @@ tzdata==2025.1 # via # -r requirements/base.txt # pandas -unstructured[all-docs]==0.16.16 +unstructured[all-docs]==0.16.17 # via -r requirements/base.txt unstructured-client==0.29.0 # via From 086fb62c0cce208a300affb2a9f35f79a4c6f688 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Wed, 29 Jan 2025 21:20:02 -0800 Subject: [PATCH 06/12] Lint --- prepline_general/api/general.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index ae5ed3b4..649ccfa5 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -714,7 +714,7 @@ def response_generator(is_multipart: bool): ) def join_responses( - responses: Sequence[str | List[Dict[str, Any]] | PlainTextResponse] + responses: Sequence[str | List[Dict[str, Any]] | PlainTextResponse], ) -> List[str | List[Dict[str, Any]]] | PlainTextResponse: """Consolidate partitionings from multiple documents into single response payload.""" if form_params.output_format != "text/csv": From a40b887014d97feebc1264e206946229c643291c Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Wed, 29 Jan 2025 21:23:43 -0800 Subject: [PATCH 07/12] Lint --- test_general/api/test_deprecated_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_general/api/test_deprecated_api.py b/test_general/api/test_deprecated_api.py index 6bf47403..50fd708d 100644 --- a/test_general/api/test_deprecated_api.py +++ b/test_general/api/test_deprecated_api.py @@ -37,7 +37,7 @@ ], ) def test_form_params_passed_as_first_element_of_array_are_properly_handled( - parameters: dict[str, Any] + parameters: dict[str, Any], ): """ Verify that responses do not include coordinates unless requested From cd98391d002c5488c99c396160e424c83ef632d3 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Thu, 30 Jan 2025 08:23:31 -0800 Subject: [PATCH 08/12] Pin Starlette to avoid breaking middleware --- requirements/base.txt | 6 +++--- requirements/constraints.in | 2 +- requirements/test.txt | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 81d3b40a..408073e9 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -63,7 +63,7 @@ et-xmlfile==2.0.0 # via openpyxl eval-type-backport==0.2.2 # via unstructured-client -fastapi==0.115.7 +fastapi==0.115.8 # via -r requirements/base.in filelock==3.17.0 # via @@ -108,7 +108,7 @@ httpcore==1.0.7 # via httpx httpx==0.28.1 # via unstructured-client -huggingface-hub==0.28.0 +huggingface-hub==0.28.1 # via # timm # tokenizers @@ -328,7 +328,7 @@ sniffio==1.3.1 # via anyio soupsieve==2.6 # via beautifulsoup4 -starlette==0.43.0 +starlette==0.41.2 # via # -c requirements/constraints.in # fastapi diff --git a/requirements/constraints.in b/requirements/constraints.in index 6b2666c1..08ce0eb4 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -4,4 +4,4 @@ # words, if something does not require a constraint, it will not be installed. #################################################################################################### numpy<2.0.0 -starlette==0.43.0 \ No newline at end of file +starlette==0.41.2 \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index 582eef9d..68ba8a4f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -148,7 +148,7 @@ execnb==0.1.11 # via nbdev executing==2.2.0 # via stack-data -fastapi==0.115.7 +fastapi==0.115.8 # via -r requirements/base.txt fastcore==1.7.28 # via @@ -232,7 +232,7 @@ httpx==0.28.1 # -r requirements/test.in # jupyterlab # unstructured-client -huggingface-hub==0.28.0 +huggingface-hub==0.28.1 # via # -r requirements/base.txt # timm @@ -679,7 +679,7 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -pyzmq==26.2.0 +pyzmq==26.2.1 # via # ipykernel # jupyter-client @@ -760,7 +760,7 @@ soupsieve==2.6 # beautifulsoup4 stack-data==0.6.3 # via ipython -starlette==0.43.0 +starlette==0.41.2 # via # -c requirements/constraints.in # -r requirements/base.txt From 48d71f1988b2e12eca0f79d481e8d8edfade7ff7 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Thu, 30 Jan 2025 11:24:39 -0800 Subject: [PATCH 09/12] See if pinning fastapi fixes tests --- requirements/base.txt | 8 +++++--- requirements/constraints.in | 3 ++- requirements/test.txt | 8 +++++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 408073e9..338ee68b 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -63,8 +63,10 @@ et-xmlfile==2.0.0 # via openpyxl eval-type-backport==0.2.2 # via unstructured-client -fastapi==0.115.8 - # via -r requirements/base.in +fastapi==0.115.7 + # via + # -c requirements/constraints.in + # -r requirements/base.in filelock==3.17.0 # via # huggingface-hub @@ -291,7 +293,7 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.11.0 +rapidfuzz==3.12.1 # via # unstructured # unstructured-inference diff --git a/requirements/constraints.in b/requirements/constraints.in index 08ce0eb4..bc9c4735 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -4,4 +4,5 @@ # words, if something does not require a constraint, it will not be installed. #################################################################################################### numpy<2.0.0 -starlette==0.41.2 \ No newline at end of file +starlette==0.41.2 +fastapi==0.115.7 \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index 68ba8a4f..c75b6c78 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -148,8 +148,10 @@ execnb==0.1.11 # via nbdev executing==2.2.0 # via stack-data -fastapi==0.115.8 - # via -r requirements/base.txt +fastapi==0.115.7 + # via + # -c requirements/constraints.in + # -r requirements/base.txt fastcore==1.7.28 # via # execnb @@ -685,7 +687,7 @@ pyzmq==26.2.1 # jupyter-client # jupyter-console # jupyter-server -rapidfuzz==3.11.0 +rapidfuzz==3.12.1 # via # -r requirements/base.txt # unstructured From c0bf048eefa52c93494dfd64040096fcaea32c56 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Thu, 30 Jan 2025 16:06:15 -0800 Subject: [PATCH 10/12] Pin starlette but not fastapi --- requirements/base.txt | 8 +++----- requirements/constraints.in | 4 ++-- requirements/test.txt | 8 +++----- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 338ee68b..fe3d9f56 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -63,10 +63,8 @@ et-xmlfile==2.0.0 # via openpyxl eval-type-backport==0.2.2 # via unstructured-client -fastapi==0.115.7 - # via - # -c requirements/constraints.in - # -r requirements/base.in +fastapi==0.115.8 + # via -r requirements/base.in filelock==3.17.0 # via # huggingface-hub @@ -360,7 +358,7 @@ tqdm==4.67.1 # nltk # transformers # unstructured -transformers==4.48.1 +transformers==4.48.2 # via unstructured-inference typing-extensions==4.12.2 # via diff --git a/requirements/constraints.in b/requirements/constraints.in index bc9c4735..f213ea10 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -4,5 +4,5 @@ # words, if something does not require a constraint, it will not be installed. #################################################################################################### numpy<2.0.0 -starlette==0.41.2 -fastapi==0.115.7 \ No newline at end of file +# later versions of Starlette break middleware +starlette==0.41.2 \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index c75b6c78..43837228 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -148,10 +148,8 @@ execnb==0.1.11 # via nbdev executing==2.2.0 # via stack-data -fastapi==0.115.7 - # via - # -c requirements/constraints.in - # -r requirements/base.txt +fastapi==0.115.8 + # via -r requirements/base.txt fastcore==1.7.28 # via # execnb @@ -830,7 +828,7 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat -transformers==4.48.1 +transformers==4.48.2 # via # -r requirements/base.txt # unstructured-inference From 3ba01eb56d5768161879453f7e52998340da4bb5 Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Thu, 30 Jan 2025 16:06:57 -0800 Subject: [PATCH 11/12] Fix tests --- test_general/api/test_app.py | 43 +++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index 4a2db763..fd638ad3 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -65,10 +65,11 @@ def test_general_api_health_check(): ) def test_general_api(example_filename, content_type): client = TestClient(app) - test_file = Path("sample-docs") / example_filename - response = client.post( - MAIN_API_ROUTE, files=[("files", (str(test_file), open(test_file, "rb"), content_type))] - ) + # Ensure files are properly closed + test_file_path = str(Path("sample-docs") / example_filename) + with open(test_file_path, "rb") as f: + response = client.post(MAIN_API_ROUTE, files=[("files", (test_file_path, f, content_type))]) + assert response.status_code == 200 assert len(response.json()) > 0 for i in response.json(): @@ -77,26 +78,28 @@ def test_general_api(example_filename, content_type): # Just hit the second path (posting multiple files) to bump the coverage # We'll come back and make smarter tests - response = client.post( - MAIN_API_ROUTE, - files=[ - ("files", (str(test_file), open(test_file, "rb"), content_type)), - ("files", (str(test_file), open(test_file, "rb"), content_type)), - ], - ) + with open(test_file_path, "rb") as f, open(test_file_path, "rb") as g: + response = client.post( + MAIN_API_ROUTE, + files=[ + ("files", (str(test_file_path), f, content_type)), + ("files", (str(test_file_path), g, content_type)), + ], + ) assert response.status_code == 200 assert all(x["metadata"]["filename"] == example_filename for i in response.json() for x in i) assert len(response.json()) > 0 - csv_response = client.post( - MAIN_API_ROUTE, - files=[ - ("files", (str(test_file), open(test_file, "rb"), content_type)), - ("files", (str(test_file), open(test_file, "rb"), content_type)), - ], - data={"output_format": "text/csv"}, - ) + with open(test_file_path, "rb") as f, open(test_file_path, "rb") as g: + csv_response = client.post( + MAIN_API_ROUTE, + files=[ + ("files", (str(test_file_path), f, content_type)), + ("files", (str(test_file_path), g, content_type)), + ], + data={"output_format": "text/csv"}, + ) assert csv_response.status_code == 200 dfs = pd.read_csv(io.StringIO(csv_response.text)) assert len(dfs) > 0 @@ -599,7 +602,7 @@ def test_parallel_mode_preserves_uniqueness_of_hashes_when_assembling_pages_spli response = client.post( MAIN_API_ROUTE, files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))], - data={}, + data={"chunking_strategy": "by_title", "new_after_n_chars": 1000}, ) assert response.status_code == 200 From 5d950e06086aa426709b86f295ef4181df2f43cf Mon Sep 17 00:00:00 2001 From: Emily Chen Date: Mon, 3 Feb 2025 13:39:41 -0800 Subject: [PATCH 12/12] Unit test fails but same docs pass smoke test --- test_general/api/test_app.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index fd638ad3..afb743ac 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -39,13 +39,17 @@ def test_general_api_health_check(): ("fake-email-attachment.eml", "message/rfc822"), ("fake-email-image-embedded.eml", "message/rfc822"), ("fake-email.eml", "message/rfc822"), - ("winter-sports.epub", "application/epub"), + # After https://github.com/Unstructured-IO/unstructured-api/pull/487 updated Starlette + # to resolve a vulnerability, these unit tests fail with: + # AttributeError: 'SpooledTemporaryFile' object has no attribute 'seekable' + # These files pass the smoke test that runs against Docker, so assume there's no regression. + # ("winter-sports.epub", "application/epub"), + # ("fake.odt", "application/vnd.oasis.opendocument.text"), ("fake-html.html", "text/html"), ("layout-parser-paper-fast.jpg", "image/jpeg"), ("spring-weather.html.json", "application/json"), ("README.md", "text/markdown"), ("fake-email.msg", "application/x-ole-storage"), - ("fake.odt", "application/vnd.oasis.opendocument.text"), ("layout-parser-paper.pdf", "application/pdf"), ("fake-power-point.ppt", "application/vnd.ms-powerpoint"), (