diff --git a/PATCHES.md b/PATCHES.md
new file mode 100644
index 000000000..efeab2093
--- /dev/null
+++ b/PATCHES.md
@@ -0,0 +1,73 @@
+# Prefix-free list itemization — fork of datalab/marker
+
+This fork removes marker's reliance on prefix-pattern heuristics (`a.`, `1.`,
+`•`, etc.) for list detection. List items are surfaced as individual blocks
+with their own bounding boxes, driven only by surya's layout model and
+line-level layout. Useful when prefix-based detection produces false
+positives on dense forms / problem sets.
+
+Upstream:
+
+## What changed
+
+1. **`marker/builders/structure.py`** — disabled `group_lists` and
+   `unmark_lists` in `StructureBuilder.__call__`. Surya-labeled `ListItem`
+   blocks are no longer merged into `ListGroup`s, and isolated items are no
+   longer demoted to `Text`. Each item stays as its own page-level block
+   with its original bbox.
+
+2. **`marker/processors/list_line_explode.py`** *(new)* — splits every
+   multi-line `ListItem` into one `ListItem` per `Line` child. Each new
+   item's polygon is the line's polygon. Prefix-free.
+
+3. **`marker/processors/list_gap_cluster.py`** *(new)* — alternate
+   strategy. Same input, but clusters lines by vertical gap: lines whose
+   inter-line gap is at most `gap_factor * median_gap` (default 1.5×) stay
+   merged into one `ListItem`; larger gaps start a new item. Preserves
+   wrapped items in docs with even line spacing; collapses to a single item
+   when there's no measurable gap between sub-items.
+
+4. **`marker/converters/pdf.py`** — registers
+   `ListItemLineExplodeProcessor` in the default pipeline immediately after
+   `LineMergeProcessor`. To switch strategies, replace it with
+   `ListItemGapClusterProcessor` in the `default_processors` tuple.
+
+5. **`scripts/patch_surya_label.py`** *(new)* — patches the installed surya
+   package so the layout model never emits the `<form>` → `Form` label;
+   instead, those regions come through as `ListItem` so the patched
+   structure builder and list-itemization processors handle them.
+   Idempotent. Run once after installing or upgrading `surya-ocr`.
+
+6. **`scripts/marker_view.py`** *(new)* — standalone HTML viewer. Renders
+   each PDF page and overlays marker's JSON bboxes with color-coded labels
+   and a "ListItems only" filter.
+
+## Setup
+
+```bash
+pip install -e .
+pip install surya-ocr pypdfium2
+python scripts/patch_surya_label.py
+```
+
+## Usage
+
+```bash
+# convert
+SURYA_INFERENCE_BACKEND=llamacpp marker_single path/to.pdf \
+    --output_dir out --output_format json --disable_multiprocessing
+
+# visualize
+python scripts/marker_view.py path/to.pdf out/<name>/<name>.json
+```
+
+## Choosing a strategy
+
+| Strategy | When to use | Tradeoff |
+|---|---|---|
+| `ListItemLineExplodeProcessor` (default) | You want **every visual line** to be its own bbox | Wrapped 2-line items get over-split |
+| `ListItemGapClusterProcessor` | Source PDF uses extra spacing between items | Collapses to one item when sub-items are visually flush |
+
+To swap, edit `marker/converters/pdf.py` and replace the entry in
+`default_processors`. Both processors are prefix-free; they look at line
+bboxes only.
diff --git a/README.md b/README.md
index c9ce259a9..c6cfdcbc7 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
+> **margati fork notice** — this fork modifies marker's list detection to
+> be prefix-free (no reliance on `1.`, `a.`, `•`, etc.). See
+> [PATCHES.md](./PATCHES.md) for the full list of changes (structure
+> builder, two new processors, surya label patch, viewer script).
+

Datalab Logo

diff --git a/marker/builders/structure.py b/marker/builders/structure.py index 32a3c84ff..46aaaf089 100644 --- a/marker/builders/structure.py +++ b/marker/builders/structure.py @@ -28,8 +28,8 @@ def __init__(self, config=None): def __call__(self, document: Document): for page in document.pages: self.group_caption_blocks(page) - self.group_lists(page) - self.unmark_lists(page) + # self.group_lists(page) # keep each ListItem standalone (no merging into ListGroup) + # self.unmark_lists(page) # don't demote lone ListItems to Text def group_caption_blocks(self, page: PageGroup): gap_threshold_px = self.gap_threshold * page.polygon.height diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index d46a8647b..fa935613a 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -50,6 +50,8 @@ from marker.processors.order import OrderProcessor from marker.services.gemini import GoogleGeminiService from marker.processors.line_merge import LineMergeProcessor +from marker.processors.list_line_explode import ListItemLineExplodeProcessor +from marker.processors.list_gap_cluster import ListItemGapClusterProcessor # noqa: F401 # alternate strategy from marker.processors.llm.llm_mathblock import LLMMathBlockProcessor from marker.processors.llm.llm_page_correction import LLMPageCorrectionProcessor from marker.processors.llm.llm_sectionheader import LLMSectionHeaderProcessor @@ -75,6 +77,7 @@ class PdfConverter(BaseConverter): OrderProcessor, BlockRelabelProcessor, LineMergeProcessor, + ListItemLineExplodeProcessor, # swap to ListItemGapClusterProcessor for gap-clustered itemization BlockquoteProcessor, CodeProcessor, DocumentTOCProcessor, diff --git a/marker/processors/list_gap_cluster.py b/marker/processors/list_gap_cluster.py new file mode 100644 index 000000000..ad76ec170 --- /dev/null +++ b/marker/processors/list_gap_cluster.py @@ -0,0 +1,94 @@ +from copy import deepcopy +from statistics import median +from typing import Annotated, List + +from 
marker.processors import BaseProcessor +from marker.schema import BlockTypes +from marker.schema.document import Document +from marker.schema.registry import get_block_class + + +class ListItemGapClusterProcessor(BaseProcessor): + """ + Split multi-line ListItem blocks into per-item ListItems by clustering + Line children on vertical gap. Lines whose top-to-prev-bottom gap is at + most `gap_factor * median_gap` (with a minimum of `min_gap_pt`) are + treated as continuations of the previous item; larger gaps start a new + item. Each emitted ListItem's polygon is the merge of its lines' + polygons. + + Prefix-free: uses only line bboxes. + """ + block_types = (BlockTypes.ListItem,) + min_lines: Annotated[ + int, "Minimum number of Line children before clustering applies.", + ] = 2 + gap_factor: Annotated[ + float, + "Multiplier on the median inter-line gap. Anything <= this is treated as 'tight'.", + ] = 1.5 + min_gap_pt: Annotated[ + float, + "Floor on the 'tight' gap (PDF points). Below this, lines always belong to the same item.", + ] = 2.0 + + def __call__(self, document: Document): + ListItemCls = get_block_class(BlockTypes.ListItem) + for page in document.pages: + if not page.structure: + continue + new_structure = [] + for bid in list(page.structure): + block = page.get_block(bid) + if block is None or block.block_type != BlockTypes.ListItem: + new_structure.append(bid) + continue + + line_ids = list(block.structure or []) + lines = [] + for lid in line_ids: + ln = page.get_block(lid) + if ln is not None and ln.block_type == BlockTypes.Line: + lines.append(ln) + + if len(lines) < self.min_lines: + new_structure.append(bid) + continue + + # Sort top-to-bottom (defensive — should already be in order) + lines.sort(key=lambda l: l.polygon.y_start) + + gaps = [ + max(0.0, lines[i].polygon.y_start - lines[i - 1].polygon.y_end) + for i in range(1, len(lines)) + ] + if not gaps: + new_structure.append(bid) + continue + + med = median(gaps) + tight_threshold = 
max(self.min_gap_pt, med * self.gap_factor) + + clusters: List[List] = [[lines[0]]] + for i in range(1, len(lines)): + g = lines[i].polygon.y_start - lines[i - 1].polygon.y_end + if g <= tight_threshold: + clusters[-1].append(lines[i]) + else: + clusters.append([lines[i]]) + + if len(clusters) == 1: + # Nothing to split — keep the original block intact + new_structure.append(bid) + continue + + for cluster in clusters: + merged_poly = deepcopy(cluster[0].polygon) + if len(cluster) > 1: + merged_poly = merged_poly.merge([l.polygon for l in cluster[1:]]) + new_li = page.add_block(ListItemCls, merged_poly) + new_li.structure = [l.id for l in cluster] + new_structure.append(new_li.id) + block.removed = True + + page.structure = new_structure diff --git a/marker/processors/list_line_explode.py b/marker/processors/list_line_explode.py new file mode 100644 index 000000000..f0f853c2b --- /dev/null +++ b/marker/processors/list_line_explode.py @@ -0,0 +1,53 @@ +from copy import deepcopy +from typing import Annotated + +from marker.processors import BaseProcessor +from marker.schema import BlockTypes +from marker.schema.document import Document +from marker.schema.registry import get_block_class + + +class ListItemLineExplodeProcessor(BaseProcessor): + """ + Explode every multi-line ListItem into one ListItem per Line child. + Each new ListItem inherits the line's polygon and points at that single + Line as its structure. The parent block is dropped from the page structure. + + Prefix-free: relies only on line-level layout from LineBuilder. 
+ """ + block_types = (BlockTypes.ListItem,) + min_lines: Annotated[ + int, + "Minimum number of Line children before a ListItem is exploded.", + ] = 2 + + def __call__(self, document: Document): + ListItemCls = get_block_class(BlockTypes.ListItem) + for page in document.pages: + if not page.structure: + continue + new_structure = [] + for bid in list(page.structure): + block = page.get_block(bid) + if block is None or block.block_type != BlockTypes.ListItem: + new_structure.append(bid) + continue + + line_ids = list(block.structure or []) + lines = [] + for lid in line_ids: + ln = page.get_block(lid) + if ln is not None and ln.block_type == BlockTypes.Line: + lines.append(ln) + + if len(lines) < self.min_lines: + new_structure.append(bid) + continue + + for line in lines: + new_li = page.add_block(ListItemCls, deepcopy(line.polygon)) + new_li.structure = [line.id] + new_structure.append(new_li.id) + block.removed = True + + page.structure = new_structure diff --git a/scripts/marker_view.py b/scripts/marker_view.py new file mode 100644 index 000000000..6821ff0a2 --- /dev/null +++ b/scripts/marker_view.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +"""Render a marker JSON result over the original PDF pages as an HTML viewer.""" +import argparse, json, html, base64, io, subprocess +from pathlib import Path +import pypdfium2 as pdfium + +LABEL_COLORS = { + "Text": "#3b82f6", "SectionHeader": "#ef4444", "PageHeader": "#a855f7", + "PageFooter": "#a855f7", "Picture": "#f59e0b", "PictureGroup": "#f59e0b", + "Figure": "#f59e0b", "FigureGroup": "#f59e0b", + "Table": "#10b981", "TableGroup": "#10b981", + "ListGroup": "#06b6d4", "ListItem": "#06b6d4", + "Form": "#22d3ee", + "Equation": "#ec4899", "Caption": "#f97316", "Footnote": "#94a3b8", + "Code": "#facc15", +} +DEFAULT_COLOR = "#6b7280" + +def render_pages(pdf, dpi=150): + doc = pdfium.PdfDocument(str(pdf)) + scale = dpi / 72 + pages = [] + for i in range(len(doc)): + page = doc[i] + pil = page.render(scale=scale).to_pil() 
+ buf = io.BytesIO() + pil.save(buf, format="PNG") + pages.append({ + "b64": base64.b64encode(buf.getvalue()).decode(), + "w_px": pil.width, "h_px": pil.height, + "w_pt": page.get_width(), "h_pt": page.get_height(), + }) + return pages + +def collect_blocks(page_json): + """Top-level blocks on a marker page (ListItems remain individual after patch).""" + out = [] + for child in (page_json.get("children") or []): + out.append({ + "label": child.get("block_type", "?"), + "bbox": child.get("bbox"), + }) + return out + +def build_html(pdf_name, pages, marker_pages): + parts = [f""" +Marker (patched): {html.escape(pdf_name)} + +
+

{html.escape(pdf_name)} — patched marker (no ListGroup merging)

+
+ + + +
+
+
"""] + + labels_seen = set() + for idx, (img, mp) in enumerate(zip(pages, marker_pages), 1): + # marker bbox is in PDF points; image is rendered at dpi → multiply by (px / pt) + sx = img["w_px"] / img["w_pt"] + sy = img["h_px"] / img["h_pt"] + blocks = collect_blocks(mp) + li_count = sum(1 for b in blocks if b["label"] == "ListItem") + parts.append(f'
page {idx} — {len(blocks)} blocks ({li_count} ListItem)
') + parts.append(f'
') + parts.append(f'') + for b in blocks: + if not b["bbox"]: + continue + x0, y0, x1, y1 = b["bbox"] + x0, y0, x1, y1 = x0*sx, y0*sy, x1*sx, y1*sy + label = b["label"] + color = LABEL_COLORS.get(label, DEFAULT_COLOR) + labels_seen.add((label, color)) + parts.append( + f'
' + f'{html.escape(label)}
' + ) + parts.append("
") + + legend = "".join( + f'{html.escape(l)}' + for l, c in sorted(labels_seen) + ) + parts.append(f"") + return "".join(parts) + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("pdf") + ap.add_argument("marker_json") + ap.add_argument("--out", default="/tmp/marker_view") + ap.add_argument("--no-open", action="store_true") + ap.add_argument("--tag", default="", help="suffix added to the output html name") + args = ap.parse_args() + + pdf = Path(args.pdf).resolve() + out = Path(args.out); out.mkdir(parents=True, exist_ok=True) + data = json.loads(Path(args.marker_json).read_text()) + marker_pages = data["children"] + print(f"[1/2] rendering {len(marker_pages)} page(s) ...", flush=True) + pages = render_pages(pdf) + print(f"[2/2] writing HTML ...", flush=True) + suffix = f".{args.tag}" if args.tag else "" + html_path = out / f"{pdf.stem}.marker{suffix}.html" + html_path.write_text(build_html(pdf.name, pages, marker_pages)) + print(f"-> {html_path}") + if not args.no_open: + subprocess.Popen(["firefox", str(html_path)], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + +if __name__ == "__main__": + main() diff --git a/scripts/patch_surya_label.py b/scripts/patch_surya_label.py new file mode 100644 index 000000000..32ab728d6 --- /dev/null +++ b/scripts/patch_surya_label.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +""" +Patch the installed surya package so the layout model never emits the `Form` +label. Anything surya would have called `` becomes `ListItem` instead, +which lets marker's normal list path handle the region (including the +prefix-free itemization processors in marker.processors.list_line_explode +and marker.processors.list_gap_cluster). + +Idempotent. Run once after `pip install surya-ocr` (or after upgrading). 
+"""
+import importlib.util
+import sys
+from pathlib import Path
+
+
+def main():
+    """Swap surya's `"<form>": "Form"` label-map entry for `ListItem`; exit 1 on failure."""
+    try:
+        spec = importlib.util.find_spec("surya.layout.label")
+    except ModuleNotFoundError:  # find_spec imports the parent package; raised if `surya` is absent
+        spec = None
+    if spec is None or spec.origin is None:
+        sys.exit("ERROR: could not find surya.layout.label — is surya installed?")
+
+    path = Path(spec.origin)
+    src = path.read_text(encoding="utf-8")
+    target_old = '"<form>": "Form",'
+    target_new = '"<form>": "ListItem",'
+
+    if target_new in src:
+        print(f"already patched: {path}")
+        return
+    if target_old not in src:
+        print(f"ERROR: expected line not found in {path}", file=sys.stderr)
+        sys.exit("Surya may have changed its label map. Patch manually.")
+    path.write_text(src.replace(target_old, target_new), encoding="utf-8")
+    print(f"patched: {path}")
+
+
+if __name__ == "__main__":
+    main()