-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathschema.py
More file actions
174 lines (136 loc) · 5.62 KB
/
Copy pathschema.py
File metadata and controls
174 lines (136 loc) · 5.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""Shared data models for CapScribe.
A single permissive `CapitalEvent` model covers every event family — the four
core ones (allotment, bonus_issue, rights_issue, authorised_capital_change)
plus the extended ones (dividend_declaration, share_repurchase,
warrant_exercise) — because each family carries a different field set.
`extra="allow"` keeps any
unmodelled fields rather than dropping them, while the typed fields below
give validation and editor support for the common ones.
Provenance fields (page_number, source_snippet, confidence) make every
event citable back to the exact page and verbatim text it came from.
"""
from __future__ import annotations
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
# Allowed values for `CapitalEvent.event_type`.
EventType = Literal[
    # core families
    "allotment",
    "bonus_issue",
    "rights_issue",
    "authorised_capital_change",
    # extended coverage: annual reports, financial statements, diligence files
    "dividend_declaration",
    "share_repurchase",
    "warrant_exercise",
]

# Allowed values for `SourceProvenance.extraction_method`.
ExtractionMethod = Literal["table", "text", "llm", "ocr"]
class SourceProvenance(BaseModel):
    """Structured citation record attached to an event.

    Mirrors the flat provenance fields on the event in a backwards-compatible
    way. Events carry it optionally (the default is ``None``), so previously
    extracted JSON and existing Chroma collections stay valid. When present,
    it captures *how* the event was extracted, which the evaluation harness
    breaks down per method.
    """

    # 1-indexed source page
    page: int | None = None
    # nearest preceding heading
    section: str | None = None
    # [x0, y0, x1, y1] for tables
    bbox: list[float] | None = None
    # one of: table | text | llm | ocr
    extraction_method: ExtractionMethod = "llm"
    # documented range 0.0-1.0; no range validation is declared here
    confidence: float = 1.0
class CapitalEvent(BaseModel):
    """A single capital event extracted from a source document.

    One permissive model serves every ``event_type``: fields that belong to
    other event families are simply left as ``None``, and ``extra="allow"``
    keeps unmodelled fields instead of dropping them.
    """

    model_config = ConfigDict(extra="allow")

    event_type: EventType
    date: str | None = None

    # provenance — every event traces back to a page and a verbatim excerpt
    page_number: int | None = None  # 1-indexed PDF page the event was found on
    source_snippet: str | None = None  # verbatim 1-2 sentence excerpt from that page
    bbox: list[float] | None = None  # [x0, y0, x1, y1] when table-sourced
    confidence: float = 1.0  # 0.0-1.0; lowered when provenance is weak
    # structured citation; optional + backwards-compatible (default None)
    source_provenance: SourceProvenance | None = None

    # allotment
    shares: int | None = None
    face_value: float | None = None
    issue_price: float | None = None
    consideration: str | None = None
    allottee_category: str | None = None

    # bonus / rights
    ratio: str | None = None
    shares_issued: int | None = None
    pre_issue_capital: int | None = None
    post_issue_capital: int | None = None
    price: float | None = None
    shares_offered: int | None = None

    # authorised capital change
    old_capital: int | None = None
    new_capital: int | None = None
    resolution_type: str | None = None

    # dividend_declaration
    amount_per_share: float | None = None
    record_date: str | None = None
    payment_date: str | None = None
    total_outflow: int | None = None

    # share_repurchase
    shares_bought_back: int | None = None
    remaining_buyback_authority: int | None = None

    # warrant_exercise
    warrants_exercised: int | None = None
    exercise_price: float | None = None

    def dedup_key(self) -> tuple:
        """Return a stable identity used for de-duplication across overlapping chunks.

        The first six entries are the same for every event family. None of
        the fields declared for the extended families (dividend_declaration,
        share_repurchase, warrant_exercise) appear in that base key, so on
        its own it would collapse distinct events of those types that share a
        date (or have no date). For those three types the family-specific
        quantities are appended; keys for all other event types are exactly
        the six-entry base key.

        Returns:
            A hashable tuple; equal tuples identify the same event.
        """
        base_key = (
            self.event_type,
            self.date,
            self.shares,
            self.ratio,
            self.new_capital,
            self.shares_offered,
        )
        # Extra discriminators, only for families the base key cannot tell apart.
        extended_parts = {
            "dividend_declaration": (self.amount_per_share, self.record_date),
            "share_repurchase": (self.shares_bought_back,),
            "warrant_exercise": (self.warrants_exercised, self.exercise_price),
        }
        return base_key + extended_parts.get(self.event_type, ())
class Citation(BaseModel):
    """Page-level provenance backing a claim or a retrieved event.

    Constructed only when both the page number and the source snippet are
    genuinely known; a page number is never guessed.
    """

    event_id: str
    # required: there is no default, so a citation cannot exist without a page
    page_number: int
    # required: the text excerpt the citation points at
    source_snippet: str
    section_heading: str | None = None
    bbox: list[float] | None = None
class PageText(BaseModel):
    """Extracted text of a single page, together with its OCR provenance."""

    # 1-indexed
    page_number: int
    text: str
    # False unless set otherwise by the producer of this record
    ocr_used: bool = False
    confidence: float = 1.0
class ExtractionResult(BaseModel):
    """Outcome of extracting one source file: metadata plus its capital events.

    ``extra="allow"`` means fields not declared here are kept rather than
    dropped.
    """

    model_config = ConfigDict(extra="allow")

    source_file: str
    total_pages: int | None = None
    extraction_date: str | None = None
    # a fresh empty list per instance when no events are supplied
    capital_events: list[CapitalEvent] = Field(default_factory=list)
class SearchHit(BaseModel):
    """One retrieval result: the matched event, its score and its text."""

    # event payload as a plain dict (keys are read with ``.get`` by consumers)
    event: dict[str, Any]
    score: float
    text: str
    # optional identifier; may be absent
    event_id: str | None = None
class AskResponse(BaseModel):
    """Answer to a question, with the evidence and retrieval details behind it."""

    question: str
    answer: str
    mode: Literal["extractive", "llm"]

    # backward-compatible: full event dicts backing the answer
    citations: list[dict[str, Any]] = Field(default_factory=list)

    # page-level provenance; only events with a known page appear here
    page_citations: list[Citation] = Field(default_factory=list)

    # "hybrid" | "vector" | "bm25"
    retrieval_strategy: str = "vector"
    query_time_ms: float = 0.0
def citation_from_hit(hit: SearchHit) -> Citation | None:
    """Build a Citation from a search hit, or None when provenance is unusable.

    Never invents a page number: the stored event metadata must contain both
    a ``page_number`` that can be read as an integer and a non-blank
    ``source_snippet``; otherwise no citation is produced.

    Args:
        hit: Search hit whose ``event`` dict holds the stored event metadata.

    Returns:
        A ``Citation`` for the hit, or ``None`` when the page number or the
        snippet is missing, the page number is not integer-like, or the
        snippet is empty/whitespace.
    """
    event = hit.event
    raw_page = event.get("page_number")
    raw_snippet = event.get("source_snippet")
    if raw_page is None or raw_snippet is None:
        return None

    # The event dict is untyped (dict[str, Any]); a page value that cannot be
    # converted is treated as unknown instead of raising out of the caller.
    try:
        page_number = int(raw_page)
    except (TypeError, ValueError, OverflowError):
        return None

    # A blank excerpt is not a genuinely known snippet.
    snippet = str(raw_snippet)
    if not snippet.strip():
        return None

    bbox = event.get("bbox")
    return Citation(
        # fallback id keeps the page exactly as stored, as before
        event_id=hit.event_id or f"{event.get('event_type', 'event')}@p{raw_page}",
        page_number=page_number,
        source_snippet=snippet,
        bbox=bbox if isinstance(bbox, list) else None,
    )