ra-mcp/packages/browse-mcp/src/ra_mcp_browse_mcp/formatter.py at main · AI-Riksarkivet/ra-mcp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
"""
Plain text formatter for MCP/LLM output without any Rich markup.
"""

from ra_mcp_common.formatting import highlight_keyword_markdown, iiif_manifest_to_bildvisaren, page_id_to_number


class PlainTextFormatter:
    """Formatter that produces plain text without any Rich markup."""

    def format_text(self, text_content: str, style_name: str = "") -> str:
        """Return plain text without any styling."""
        return text_content

    def format_table(
        self,
        column_headers: list[str],
        table_rows: list[list[str]],
        table_title: str = "",
    ) -> str:
        """
        Create a plain text table without borders.

        Args:
            column_headers: List of column headers
            table_rows: List of row data
            table_title: Optional table title

        Returns:
            Plain text formatted table
        """
        formatted_lines = []
        if table_title:
            formatted_lines.append(table_title)
            formatted_lines.append("")

        # Calculate column widths
        all_table_rows = [column_headers, *table_rows]
        column_widths = [max(len(str(row[column_index])) for row in all_table_rows) for column_index in range(len(column_headers))]

        # Format header
        formatted_header = " | ".join(column_headers[column_index].ljust(column_widths[column_index]) for column_index in range(len(column_headers)))
        formatted_lines.append(formatted_header)

        # Add simple separator
        formatted_lines.append("-" * len(formatted_header))

        # Format rows
        for data_row in table_rows:
            formatted_row = " | ".join(str(data_row[column_index]).ljust(column_widths[column_index]) for column_index in range(len(data_row)))
            formatted_lines.append(formatted_row)

        return "\n".join(formatted_lines)

    def format_panel(self, panel_content: str, panel_title: str = "", panel_border_style: str = "") -> str:
        """
        Format content as plain text without panels or borders.

        Args:
            panel_content: Content to display
            panel_title: Optional title
            panel_border_style: Ignored in plain text mode

        Returns:
            Plain text formatted content
        """
        formatted_lines = []
        if panel_title:
            formatted_lines.append(panel_title)
            formatted_lines.append("")
        formatted_lines.append(panel_content)
        return "\n".join(formatted_lines)

    def highlight_search_keyword(self, text_content: str, search_keyword: str) -> str:
        """
        Mark occurrences of a search keyword inside a piece of text.

        This is a thin wrapper: the work is done by ``highlight_keyword_markdown``
        from ``ra_mcp_common.formatting``. Going by that helper's name the markup
        is markdown-style; its implementation is not part of this module.

        Args:
            text_content: Text to search in
            search_keyword: Keyword to mark

        Returns:
            Whatever ``highlight_keyword_markdown`` returns for the two arguments
        """
        highlighted_text = highlight_keyword_markdown(text_content, search_keyword)
        return highlighted_text

    def _iiif_manifest_to_bildvisaren(self, iiif_manifest_url: str) -> str:
        """
        Translate an IIIF manifest URL into a bildvisaren URL.

        Delegates to the module-level ``iiif_manifest_to_bildvisaren`` helper
        imported from ``ra_mcp_common.formatting``.

        Args:
            iiif_manifest_url: IIIF manifest URL to convert

        Returns:
            The helper's result; callers in this class treat a falsy value as
            "no bildvisaren URL available"
        """
        bildvisaren_url = iiif_manifest_to_bildvisaren(iiif_manifest_url)
        return bildvisaren_url

    def format_no_results_message(self, search_result) -> str:
        """
        Generate appropriate message when no results are found.

        Args:
            search_result: SearchResult containing keyword, offset, and total_hits

        Returns:
            Formatted no results message
        """
        if search_result.offset > 0:
            return f"No more results found for '{search_result.keyword}' at offset {search_result.offset}. Total results: {search_result.total_hits}"
        return f"No results found for '{search_result.keyword}'. make sure to use \"\" "

    def _format_document_header(self, lines: list[str], document) -> None:
        """Emit ref_code, institution, date, and title lines for a document."""
        lines.append(f"📚 Document: {document.metadata.reference_code}")

        if document.metadata.archival_institution:
            institution = document.metadata.archival_institution[0].caption
            lines.append(f"🏛️  Institution: {institution}")

        if document.metadata.date:
            lines.append(f"📅 Date: {document.metadata.date}")

        title = document.get_title()
        title_display = title[:100] + "..." if len(title) > 100 else title
        lines.append(f"📄 Title: {title_display}")

    def _format_metadata_fields(self, lines: list[str], document) -> None:
        """Emit type, hierarchy, provenance, link, and IIIF lines for metadata-only documents."""
        if document.object_type:
            type_info = document.object_type
            if document.type:
                type_info += f" / {document.type}"
            lines.append(f"🏷️  Type: {type_info}")

        if document.metadata.hierarchy:
            hierarchy_parts = [h.caption for h in document.metadata.hierarchy[:3]]
            if hierarchy_parts:
                hierarchy_text = " → ".join(hierarchy_parts)
                if len(hierarchy_text) > 150:
                    hierarchy_text = hierarchy_text[:147] + "..."
                lines.append(f"📂 Context: {hierarchy_text}")

        if document.metadata.provenance:
            prov = document.metadata.provenance[0]
            prov_text = prov.caption
            if prov.date:
                prov_text += f" ({prov.date})"
            lines.append(f"👤 Creator: {prov_text}")

        if document.links and document.links.html:
            lines.append(f"🔗 View: {document.links.html}")

        if document.links and document.links.image:
            lines.append(f"🖼️  IIIF: {document.links.image[0]}")

    def _format_document_snippets(self, lines: list[str], document, keyword: str) -> None:
        """Emit page numbers, hit counts, and first 3 snippets for a transcribed document."""
        page_numbers = sorted({page.id for snippet in document.transcribed_text.snippets for page in snippet.pages})
        trimmed_page_numbers = [str(page_id_to_number(pid)) for pid in page_numbers]

        doc_snippet_count = len(document.transcribed_text.snippets)
        total_hits = document.get_total_hits()
        hit_label = "hit" if doc_snippet_count == 1 else "hits"

        lines.append(f"📖 Pages with hits: {', '.join(trimmed_page_numbers)}")

        if total_hits > doc_snippet_count:
            lines.append(f"💡 {doc_snippet_count} {hit_label} shown ({total_hits} total)")
        else:
            lines.append(f"💡 {doc_snippet_count} {hit_label} found")

        for snippet in document.transcribed_text.snippets[:3]:
            snippet_text = self.highlight_search_keyword(snippet.text, keyword)
            if snippet.pages:
                page_id = snippet.pages[0].id
                lines.append(f"   Page {page_id}: {snippet_text}")

        if len(document.transcribed_text.snippets) > 3:
            lines.append(f"   ...and {len(document.transcribed_text.snippets) - 3} more pages with hits")

    def format_search_results(self, search_result, maximum_documents_to_display: int = 20) -> str:
        """
        Format search results as plain text with emojis for MCP/LLM consumption.

        Args:
            search_result: SearchResult containing documents and metadata
            maximum_documents_to_display: Maximum number of documents to display

        Returns:
            Formatted plain text search results
        """
        if not search_result.items:
            return self.format_no_results_message(search_result)

        lines = []
        snippet_count = search_result.count_snippets()

        # Show "100+" if we hit the max limit, indicating more are available
        document_count = len(search_result.items)
        if document_count >= search_result.limit:
            document_display = f"{document_count}+"
        else:
            document_display = str(document_count)

        if snippet_count > 0:
            lines.append(f"Found {snippet_count} page-level hits across {document_display} volumes")
        else:
            lines.append(f"Found {document_display} volumes matching metadata")
        lines.append("")

        for _idx, document in enumerate(search_result.items[:maximum_documents_to_display]):
            has_snippets = document.transcribed_text and document.transcribed_text.snippets
            if snippet_count > 0 and not has_snippets:
                continue

            self._format_document_header(lines, document)

            if not has_snippets:
                self._format_metadata_fields(lines, document)

            if has_snippets:
                self._format_document_snippets(lines, document, search_result.keyword)

            lines.append("")

        total_document_count = len(search_result.items)
        if total_document_count > maximum_documents_to_display:
            remaining_documents = total_document_count - maximum_documents_to_display
            lines.append(f"... and {remaining_documents} more documents")

        return "\n".join(lines)

    def _format_non_digitised_metadata(self, lines: list[str], browse_result) -> bool:
        """Format metadata for non-digitised materials. Returns True if handled (early return)."""
        if browse_result.contexts or not browse_result.oai_metadata:
            return False

        lines.append("⚠️ This material is not digitised or transcribed - no page images or text available.")
        lines.append("Showing metadata only:")
        lines.append("")

        metadata = browse_result.oai_metadata
        lines.append(f"📄 Reference Code: {browse_result.reference_code}")

        if metadata.title and metadata.title != "(No title)":
            lines.append(f"📋 Title: {metadata.title}")

        if metadata.unitdate:
            lines.append(f"📅 Date Range: {metadata.unitdate}")

        if metadata.repository:
            lines.append(f"🏛️  Repository: {metadata.repository}")

        if metadata.unitid and metadata.unitid != browse_result.reference_code:
            lines.append(f"🔖 Unit ID: {metadata.unitid}")

        if metadata.description:
            lines.append(f"📝 Description: {metadata.description}")

        if metadata.nad_link:
            lines.append(f"🔗 View Online: {metadata.nad_link}")

        if metadata.iiif_manifest:
            bildvisaren_url = self._iiif_manifest_to_bildvisaren(metadata.iiif_manifest)
            if bildvisaren_url:
                lines.append(f"🖼️  View Images: {bildvisaren_url}")
            else:
                lines.append(f"🖼️  IIIF Manifest: {metadata.iiif_manifest}")

        if metadata.iiif_image:
            lines.append(f"🎨 Preview Image: {metadata.iiif_image}")

        return True

    def _format_oai_metadata(self, lines: list[str], metadata, ref_code: str) -> None:
        """Format OAI-PMH metadata section for a digitised document."""
        if metadata.title and metadata.title != "(No title)":
            lines.append(f"📋 Title: {metadata.title}")

        if metadata.unitdate:
            lines.append(f"📅 Date Range: {metadata.unitdate}")

        if metadata.repository:
            lines.append(f"🏛️  Repository: {metadata.repository}")

        if metadata.unitid and metadata.unitid != ref_code:
            lines.append(f"🔖 Unit ID: {metadata.unitid}")

        if metadata.description:
            desc = metadata.description
            if len(desc) > 200:
                desc = desc[:197] + "..."
            lines.append(f"📝 {desc}")

        if metadata.nad_link:
            lines.append(f"🔗 NAD Link: {metadata.nad_link}")

    def format_browse_results(
        self,
        browse_result,
        highlight_term=None,
        show_links: bool = False,
        show_success_message: bool = True,
        seen_page_numbers: set[int] | None = None,
    ) -> str:
        """
        Format browse results as plain text with emojis for MCP/LLM consumption.

        Args:
            browse_result: BrowseResult containing page contexts and metadata
            highlight_term: Optional term to highlight in text
            show_links: Whether to show ALTO/Image/Bildvisning links
            show_success_message: Whether to show success message (ignored in plain text)
            seen_page_numbers: Optional set of page numbers already shown in this session.
                               When provided, previously-shown pages get a one-liner stub.

        Returns:
            Formatted plain text browse results
        """
        lines: list[str] = []

        if self._format_non_digitised_metadata(lines, browse_result):
            return "\n".join(lines)

        if not browse_result.contexts:
            return f"No page contexts found for {browse_result.reference_code}"

        seen = seen_page_numbers or set()

        lines.append(f"📚 Document: {browse_result.reference_code}")

        if browse_result.oai_metadata:
            self._format_oai_metadata(lines, browse_result.oai_metadata, browse_result.reference_code)

        # Summary line with dedup info
        new_count = sum(1 for c in browse_result.contexts if c.page_number not in seen)
        reseen_count = len(browse_result.contexts) - new_count
        if seen and reseen_count > 0:
            lines.append(f"📖 Pages loaded: {len(browse_result.contexts)} ({new_count} new, {reseen_count} previously shown)")
        else:
            lines.append(f"📖 Pages loaded: {len(browse_result.contexts)}")
        lines.append("")

        for context in browse_result.contexts:
            if context.page_number in seen:
                lines.append(f"📄 Page {context.page_number} (previously shown in this session)")
                lines.append("")
                continue

            lines.append(f"📄 Page {context.page_number}")
            lines.append("─" * 40)

            if context.full_text.strip():
                display_text = context.full_text
                if highlight_term:
                    display_text = self.highlight_search_keyword(display_text, highlight_term)
                lines.append(display_text)
            else:
                lines.append("(Empty page - no transcribed text)")

            lines.append("")
            lines.append("🔗 Links:")
            lines.append(f"  📝 ALTO XML: {context.alto_url}")
            if context.image_url:
                lines.append(f"  🖼️  Image: {context.image_url}")
            if context.bildvisning_url:
                lines.append(f"  👁️  Bildvisning: {context.bildvisning_url}")

            lines.append("")

        lines.append(
            "Tip: Present the original text (quoted), provide a translation in the user's language, and include the bildvisaren link. Note uncertain readings."
        )

        return "\n".join(lines)