-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathexample_03_extract_schema.py
More file actions
88 lines (70 loc) · 2.13 KB
/
example_03_extract_schema.py
File metadata and controls
88 lines (70 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from pathlib import Path
import json
from mellea.backends import model_ids
from docling_agent.agents import DoclingExtractingAgent, logger
def run_task(
    schema: dict,
    sources: list[Path],
    opath: Path,
    model_id=model_ids.OPENAI_GPT_OSS_20B,
    tools: list | None = None,
) -> None:
    """Run a schema-driven extraction over `sources` and save an HTML report.

    Args:
        schema: JSON-serializable description of the fields to extract
            (e.g. ``{"title": "string"}``); it is dumped to a JSON string
            and handed to the agent as its task.
        sources: Paths of the documents passed to the agent.
        opath: Destination path of the HTML report. Missing parent
            directories are created.
        model_id: Model identifier forwarded to ``DoclingExtractingAgent``.
        tools: Optional tools forwarded to the agent; ``None`` (or an empty
            list) is passed on as ``[]``.
    """
    agent = DoclingExtractingAgent(model_id=model_id, tools=tools or [])
    document = agent.run(
        task=json.dumps(schema),
        sources=sources,
    )

    # Create the output directory up front so the finished extraction is not
    # lost to a missing-directory error at the very last step.
    Path(opath).parent.mkdir(parents=True, exist_ok=True)
    document.save_as_html(filename=opath)

    logger.info(f"report written to `{opath}`")
def main():
    """Run the extraction example over each bundled document collection.

    For every (schema, sub-directory) pair below, the PDF and image files
    found recursively under ``docdir / <sub-directory>`` are collected and
    passed to ``run_task``, which writes
    ``docdir / "<sub-directory>_extraction_report.html"``. Collections with
    no matching files are skipped with a warning.
    """
    model_id = model_ids.OPENAI_GPT_OSS_20B

    schema_01 = {
        "name": "string",
        "birth year": "integer",
        "nationality": "string",
        "contact details": "string",
        "latest education": "string",
        "languages": "string",
        "skills": "string",
    }
    schema_02 = {
        "title": "string",
        "authors": "string",
    }
    schema_03 = {
        "invoice-number": "string",
        "total": "float",
        "currency": "string",
    }

    docdir = Path("./examples/example_03_extract")  # Adjust to your data root

    # File types accepted as sources; compared case-insensitively so that
    # e.g. `scan.PDF` or `photo.JPG` are picked up on every platform.
    suffixes = {".pdf", ".png", ".jpg", ".jpeg"}

    for schema, name in (
        (schema_01, "curriculum_vitae"),
        (schema_02, "papers"),
        (schema_03, "invoices"),
    ):
        cdir = docdir / name

        # Collect PDFs and images (PNG/JPG/JPEG) recursively in a single walk.
        sources: list[Path] = sorted(
            p
            for p in cdir.rglob("*")
            if p.suffix.lower() in suffixes and p.is_file()
        )

        # Nothing to extract from: do not invoke the model on an empty input.
        if not sources:
            logger.warning(f"no documents found under `{cdir}`; skipping")
            continue

        logger.info(f"documents [{len(sources)}]:\n\n\t" + ",\n\t".join(str(p) for p in sources))

        run_task(
            schema=schema,
            sources=sources,
            opath=docdir / f"{name}_extraction_report.html",
            model_id=model_id,
        )
# Script entry point: run the example only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()