Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions integrations/paddleocr/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ name = "paddleocr-haystack"
dynamic = ["version"]
description = 'An integration of PaddleOCR with Haystack'
readme = "README.md"
requires-python = ">=3.9"
requires-python = ">=3.10"
license = "Apache-2.0"
keywords = []
authors = [
Expand All @@ -16,15 +16,14 @@ authors = [
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"haystack-ai>=2.19.0",
"haystack-ai>=2.22.0",
"paddleocr>=3.3.2",
"paddlex[serving]>=3.3.10",
"requests>=2.25.0",
Expand Down Expand Up @@ -80,7 +79,6 @@ check_untyped_defs = true
disallow_incomplete_defs = true

[tool.ruff]
target-version = "py39"
line-length = 120

[tool.ruff.lint]
Expand Down Expand Up @@ -127,10 +125,6 @@ ignore = [
"B008",
"S101",
]
unfixable = [
# Don't touch unused imports
"F401",
]

[tool.ruff.lint.isort]
known-first-party = ["haystack_integrations"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
import base64
from pathlib import Path
from typing import Any, Literal, Optional, Union
from typing import Any, Literal

import requests
from haystack import Document, component, default_from_dict, default_to_dict, logging
Expand All @@ -24,7 +24,7 @@
logger = logging.getLogger(__name__)


FileTypeInput = Union[Literal["pdf", "image"], None]
FileTypeInput = Literal["pdf", "image"] | None

# Supported image file extensions
_IMAGE_EXTENSIONS = {
Expand All @@ -41,9 +41,9 @@


def _infer_file_type_from_source(
source: Union[str, Path, ByteStream],
mime_type: Optional[str] = None,
) -> Optional[FileType]:
source: str | Path | ByteStream,
mime_type: str | None = None,
) -> FileType | None:
"""
Infer file type from file extension or MIME type.

Expand All @@ -56,7 +56,7 @@ def _infer_file_type_from_source(
determined.
"""
# Try to get extension from file path
file_path: Optional[str] = None
file_path: str | None = None

# Check if source is a file path
if isinstance(source, (str, Path)):
Expand Down Expand Up @@ -86,7 +86,7 @@ def _infer_file_type_from_source(
return None


def _normalize_file_type(file_type: Optional[FileTypeInput]) -> Optional[FileType]:
def _normalize_file_type(file_type: FileTypeInput) -> FileType | None:
"""
Normalize file type input to the numeric format expected by the API.

Expand Down Expand Up @@ -145,26 +145,26 @@ def __init__(
*,
api_url: str,
access_token: Secret = Secret.from_env_var("AISTUDIO_ACCESS_TOKEN"),
file_type: Optional[FileTypeInput] = None,
use_doc_orientation_classify: Optional[bool] = None,
use_doc_unwarping: Optional[bool] = None,
use_layout_detection: Optional[bool] = None,
use_chart_recognition: Optional[bool] = None,
layout_threshold: Optional[Union[float, dict]] = None,
layout_nms: Optional[bool] = None,
layout_unclip_ratio: Optional[Union[float, tuple[float, float], dict]] = None,
layout_merge_bboxes_mode: Optional[Union[str, dict]] = None,
prompt_label: Optional[str] = None,
format_block_content: Optional[bool] = None,
repetition_penalty: Optional[float] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
prettify_markdown: Optional[bool] = None,
show_formula_number: Optional[bool] = None,
visualize: Optional[bool] = None,
additional_params: Optional[dict[str, Any]] = None,
file_type: FileTypeInput = None,
use_doc_orientation_classify: bool | None = None,
use_doc_unwarping: bool | None = None,
use_layout_detection: bool | None = None,
use_chart_recognition: bool | None = None,
layout_threshold: float | dict | None = None,
layout_nms: bool | None = None,
layout_unclip_ratio: float | tuple[float, float] | dict | None = None,
layout_merge_bboxes_mode: str | dict | None = None,
prompt_label: str | None = None,
format_block_content: bool | None = None,
repetition_penalty: float | None = None,
temperature: float | None = None,
top_p: float | None = None,
min_pixels: int | None = None,
max_pixels: int | None = None,
prettify_markdown: bool | None = None,
show_formula_number: bool | None = None,
visualize: bool | None = None,
additional_params: dict[str, Any] | None = None,
):
"""
Create a `PaddleOCRVLDocumentConverter` component.
Expand Down Expand Up @@ -421,8 +421,8 @@ def _parse(self, data: bytes, file_type: FileType) -> tuple[str, dict[str, Any]]
@component.output_types(documents=list[Document], raw_paddleocr_responses=list[dict[str, Any]])
def run(
self,
sources: list[Union[str, Path, ByteStream]],
meta: Optional[Union[dict[str, Any], list[dict[str, Any]]]] = None,
sources: list[str | Path | ByteStream],
meta: dict[str, Any] | list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
"""
Convert image or PDF files to Documents.
Expand All @@ -448,7 +448,7 @@ def run(

meta_list = normalize_metadata(meta, sources_count=len(sources))

for source, metadata in zip(sources, meta_list):
for source, metadata in zip(sources, meta_list, strict=True):
try:
bytestream = get_bytestream_from_source(source)
except Exception as e:
Expand Down