forked from firecrawl/pdf-inspector
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_inspector.pyi
More file actions
167 lines (140 loc) · 4.86 KB
/
Copy pathpdf_inspector.pyi
File metadata and controls
167 lines (140 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""Type stubs for pdf_inspector."""
from typing import Optional
class PdfResult:
    """Result of processing a PDF file."""

    pdf_type: str
    """'text_based', 'scanned', 'image_based', or 'mixed'."""
    markdown: Optional[str]
    page_count: int
    processing_time_ms: int
    # NOTE(review): the indexing base of the page lists below is not stated
    # on this class. PdfClassification documents its list as 0-indexed while
    # PagesExtractionResult documents 1-indexed lists -- confirm which applies.
    pages_needing_ocr: list[int]
    title: Optional[str]
    confidence: float
    is_complex_layout: bool
    pages_with_tables: list[int]
    pages_with_columns: list[int]
    has_encoding_issues: bool
class PdfClassification:
    """Lightweight PDF classification result."""

    pdf_type: str
    """'text_based', 'scanned', 'image_based', or 'mixed'."""
    page_count: int
    pages_needing_ocr: list[int]
    """0-indexed page numbers that need OCR."""
    confidence: float
class TextItem:
    """A positioned text item extracted from a PDF."""

    text: str
    # NOTE(review): coordinate origin and units for x/y/width/height are not
    # stated in this stub -- confirm against the implementation.
    x: float
    y: float
    width: float
    height: float
    font: str
    font_size: float
    # NOTE(review): indexing base of `page` is not stated here -- confirm.
    page: int
    is_bold: bool
    is_italic: bool
    # NOTE(review): the set of valid `item_type` values is not listed here.
    item_type: str
class RegionText:
    """Extracted text for a single region."""

    text: str
    needs_ocr: bool
    """True when the text should not be trusted."""
class PageRegionTexts:
    """Extracted text for one page's regions."""

    page: int
    """0-indexed page number."""
    regions: list[RegionText]
class PageMarkdown:
    """Per-page markdown extraction result."""

    page: int
    """0-indexed page number."""
    markdown: str
    """Formatted markdown for this page (empty string when needs_ocr is True)."""
    needs_ocr: bool
    """True when text on this page is unreliable and OCR should be used instead."""
class PagesExtractionResult:
    """Per-page markdown output with document-wide layout classification."""

    pages: list[PageMarkdown]
    """Per-page markdown results, in the order requested."""
    # NOTE(review): the three page lists below are documented as 1-indexed,
    # whereas PageMarkdown.page is documented as 0-indexed -- confirm the
    # mismatch is intentional before correlating the two.
    pages_with_tables: list[int]
    """1-indexed pages where tables were detected."""
    pages_with_columns: list[int]
    """1-indexed pages where multi-column layout was detected."""
    pages_needing_ocr: list[int]
    """1-indexed pages that need OCR."""
    is_complex: bool
    """True if any page has tables or multi-column layout."""
def process_pdf(path: str, pages: Optional[list[int]] = None) -> PdfResult:
    """Process a PDF: detect type, extract text, convert to Markdown.

    Args:
        path: Path to the PDF file.
        pages: Optional list of page numbers; defaults to ``None``.
            NOTE(review): the indexing base and the meaning of ``None`` are
            not stated in this stub -- confirm against the implementation.

    Returns:
        A PdfResult for the document.
    """
    ...
def process_pdf_bytes(data: bytes, pages: Optional[list[int]] = None) -> PdfResult:
    """Process a PDF from bytes in memory.

    Args:
        data: PDF file contents as bytes.
        pages: Optional list of page numbers; defaults to ``None``.
            NOTE(review): the indexing base and the meaning of ``None`` are
            not stated in this stub -- confirm against the implementation.

    Returns:
        A PdfResult for the document.
    """
    ...
def detect_pdf(path: str) -> PdfResult:
    """Fast detection only — no text extraction.

    Args:
        path: Path to the PDF file.

    Returns:
        A PdfResult. NOTE(review): since no text is extracted, ``markdown``
        is presumably ``None`` here -- confirm against the implementation.
    """
    ...
def detect_pdf_bytes(data: bytes) -> PdfResult:
    """Fast detection from bytes.

    Args:
        data: PDF file contents as bytes.

    Returns:
        A PdfResult for the document.
    """
    ...
def classify_pdf(path: str) -> PdfClassification:
    """Lightweight classification — type, page count, and OCR pages (0-indexed).

    Args:
        path: Path to the PDF file.

    Returns:
        A PdfClassification for the document.
    """
    ...
def classify_pdf_bytes(data: bytes) -> PdfClassification:
    """Lightweight classification from bytes.

    Args:
        data: PDF file contents as bytes.

    Returns:
        A PdfClassification for the document.
    """
    ...
def extract_text(path: str) -> str:
    """Extract plain text from a PDF.

    Args:
        path: Path to the PDF file.

    Returns:
        The extracted plain text.
    """
    ...
def extract_text_bytes(data: bytes) -> str:
    """Extract plain text from PDF bytes.

    Args:
        data: PDF file contents as bytes.

    Returns:
        The extracted plain text.
    """
    ...
def extract_text_with_positions(
    path: str,
    pages: Optional[list[int]] = None,
) -> list[TextItem]:
    """Extract text with position information.

    Args:
        path: Path to the PDF file.
        pages: Optional list of page numbers; defaults to ``None``.
            NOTE(review): the indexing base and the meaning of ``None`` are
            not stated in this stub -- confirm against the implementation.

    Returns:
        A list of TextItem objects.
    """
    ...
def extract_text_with_positions_bytes(
    data: bytes,
    pages: Optional[list[int]] = None,
) -> list[TextItem]:
    """Extract text with position information from bytes.

    Args:
        data: PDF file contents as bytes.
        pages: Optional list of page numbers; defaults to ``None``.
            NOTE(review): the indexing base and the meaning of ``None`` are
            not stated in this stub -- confirm against the implementation.

    Returns:
        A list of TextItem objects.
    """
    ...
def extract_text_in_regions(
    path: str,
    page_regions: list[tuple[int, list[list[float]]]],
) -> list[PageRegionTexts]:
    """Extract text within bounding-box regions from a PDF file.

    Args:
        path: Path to the PDF file.
        page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples.

    Returns:
        A list of PageRegionTexts objects.
    """
    ...
def extract_text_in_regions_bytes(
    data: bytes,
    page_regions: list[tuple[int, list[list[float]]]],
) -> list[PageRegionTexts]:
    """Extract text within bounding-box regions from PDF bytes.

    Args:
        data: PDF file contents as bytes.
        page_regions: List of (page_0indexed, [[x1, y1, x2, y2], ...]) tuples.

    Returns:
        A list of PageRegionTexts objects.
    """
    ...
def extract_pages_markdown(
    path: str,
    pages: Optional[list[int]] = None,
) -> PagesExtractionResult:
    """Extract formatted markdown for pages of a PDF, with layout classification.

    Args:
        path: Path to the PDF file.
        pages: Optional list of 0-indexed pages. When ``None`` (default), every
            page is returned in document order. Otherwise, output matches the
            caller-supplied order.

    Returns:
        PagesExtractionResult with per-page markdown and document-wide layout
        classification (tables, columns, OCR needs).
    """
    ...
def extract_pages_markdown_bytes(
    data: bytes,
    pages: Optional[list[int]] = None,
) -> PagesExtractionResult:
    """Extract formatted markdown for pages of a PDF from bytes.

    See :func:`extract_pages_markdown` for details.

    Args:
        data: PDF file contents as bytes.
        pages: Optional list of pages; defaults to ``None``.
            NOTE(review): presumably 0-indexed with the same ordering rules
            as :func:`extract_pages_markdown` -- confirm.

    Returns:
        A PagesExtractionResult for the requested pages.
    """
    ...