Description
Describe the bug
No matter how I use it, it always fails with: raise BadZipFile("File is not a zip file")
zipfile.BadZipFile: File is not a zip file
Environment Info
Traceback (most recent call last):
File "D:\pythonprojects\LANGCHAIN\main.py", line 87, in <module>
elements = partition_pdf("D:\pythonprojects\LANGCHAIN\inputs\智能传感器装配调试台架-产品手册.pdf")
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\documents\elements.py", line 581, in wrapper
elements = func(*args, **kwargs)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\file_utils\filetype.py", line 725, in wrapper
elements = func(*args, **kwargs)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\file_utils\filetype.py", line 683, in wrapper
elements = func(*args, **kwargs)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\chunking\dispatch.py", line 74, in wrapper
elements = func(*args, **kwargs)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\partition\pdf.py", line 209, in partition_pdf
return partition_pdf_or_image(
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\partition\pdf.py", line 350, in partition_pdf_or_image
out_elements = _process_uncategorized_text_elements(elements)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\partition\pdf.py", line 930, in _process_uncategorized_text_elements
new_el = element_from_text(cast(Text, el).text)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\partition\text.py", line 149, in element_from_text
elif is_possible_narrative_text(text):
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\partition\text_type.py", line 74, in is_possible_narrative_text
if exceeds_cap_ratio(text, threshold=cap_threshold):
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\partition\text_type.py", line 270, in exceeds_cap_ratio
if sentence_count(text, 3) > 1:
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\partition\text_type.py", line 219, in sentence_count
sentences = sent_tokenize(text)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\nlp\tokenize.py", line 56, in sent_tokenize
_download_nltk_packages_if_not_present()
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\nlp\tokenize.py", line 41, in _download_nltk_packages_if_not_present
tagger_available = check_for_nltk_package(
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\unstructured\nlp\tokenize.py", line 29, in check_for_nltk_package
nltk.find(f"{package_category}/{package_name}", paths=paths)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\nltk\data.py", line 551, in find
return find(modified_name, paths)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\nltk\data.py", line 538, in find
return ZipFilePathPointer(p, zipentry)
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\nltk\data.py", line 391, in __init__
zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))
File "D:\miniconda\envs\LANGCHAIN\lib\site-packages\nltk\data.py", line 1020, in __init__
zipfile.ZipFile.__init__(self, filename)
File "D:\miniconda\envs\LANGCHAIN\lib\zipfile.py", line 1268, in __init__
self._RealGetContents()
File "D:\miniconda\envs\LANGCHAIN\lib\zipfile.py", line 1335, in _RealGetContents
raise BadZipFile("File is not a zip file")
zipfile.BadZipFile: File is not a zip file