Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 146 additions & 28 deletions packages/paper-qa-nemotron/src/paperqa_nemotron/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@
# all with temperature of 0 and DPI 300
DEFAULT_BORDER_SIZE = 60 # pixels

# Nemotron-parse's native input dimensions, see:
# https://docs.nvidia.com/nim/vision-language-models/latest/examples/nemotron-parse/overview.html
# https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/preprocessor_config.json
NEMOTRON_PARSE_TARGET_WIDTH = 1648
NEMOTRON_PARSE_TARGET_HEIGHT = 2048


def pad_image_with_border(
image: "Image.Image",
Expand Down Expand Up @@ -72,6 +78,66 @@ def pad_image_with_border(
return canvas, border_x, border_y


def fit_image_to_target_aspect_ratio(
    image: "Image.Image",
    target_w: int = NEMOTRON_PARSE_TARGET_WIDTH,
    target_h: int = NEMOTRON_PARSE_TARGET_HEIGHT,
    pad_color: float | tuple[float, ...] | str = WHITE_RGB,
    resample: "Image.Resampling" = Image.Resampling.LANCZOS,
) -> "tuple[Image.Image, float, int, int]":
    """Scale up and center the image onto the smallest possible canvas with nemotron-parse's aspect ratio.

    Began from preprocess_image_like_eclair:
    https://github.com/xinyu-dev/nemotron-parse-prod-hf/blob/9535a08f560e90c11f700d08c2870c40defa8aab/Step_2_Extract.ipynb

    Args:
        image: Input image.
        target_w: Minimum output width, and width component of output aspect ratio.
        target_h: Minimum output height, and height component of output aspect ratio.
        pad_color: Color to use for canvas padding.
        resample: Resampling filter to use when scaling up.

    Returns:
        Four-tuple of:
            - Image on canvas with target aspect ratio
            - Scale factor applied (1.0 if no scaling)
            - X offset (px) where scaled image starts on canvas.
            - Y offset (px) where scaled image starts on canvas.
    """
    original_width, original_height = image.size
    target_aspect_ratio = target_w / target_h

    # Scale up small images to at least target dimensions
    scale = max(target_w / original_width, target_h / original_height, 1.0)
    if scale > 1.0:
        # Use round(), not int() truncation: when scale == target_w / original_width,
        # original_width * scale can float to target_w - epsilon, and truncation would
        # leave the scaled dimension one pixel below the promised minimum.
        scaled_image = image.resize(
            (round(original_width * scale), round(original_height * scale)), resample
        )
    else:
        scaled_image = image
    scaled_width, scaled_height = scaled_image.size

    # Calculate smallest canvas with target aspect ratio that fits the scaled image
    if scaled_width / scaled_height > target_aspect_ratio:
        # Image is wider than target ratio: match width, extend height.
        # int() truncation is safe here: the quotient strictly exceeds the integer
        # scaled_height, so the canvas still contains the scaled image.
        canvas_width = scaled_width
        canvas_height = int(scaled_width / target_aspect_ratio)
    else:
        # Image is taller than target ratio: match height, extend width
        canvas_height = scaled_height
        canvas_width = int(scaled_height * target_aspect_ratio)

    # Create canvas and center the scaled image
    canvas = Image.new(
        scaled_image.mode, (canvas_width, canvas_height), pad_color  # type: ignore[arg-type]
    )
    center_offset_x = (canvas_width - scaled_width) // 2
    center_offset_y = (canvas_height - scaled_height) // 2
    canvas.paste(scaled_image, (center_offset_x, center_offset_y))
    return canvas, scale, center_offset_x, center_offset_y


async def parse_pdf_to_pages(
path: str | os.PathLike,
page_size_limit: int | None = None,
Expand All @@ -82,6 +148,7 @@ async def parse_pdf_to_pages(
api_params: Mapping[str, Any] | None = None,
concurrency: int | asyncio.Semaphore | None = 128,
border: int | tuple[int, int] = DEFAULT_BORDER_SIZE,
optimize_aspect_ratio: bool = True,
**_: Any,
) -> ParsedText:
"""Parse a PDF using Nvidia's nemotron-parse VLM.
Expand All @@ -106,6 +173,8 @@ async def parse_pdf_to_pages(
border: Border size (pixels) to add on all sides.
If a two-tuple it's the x border and y border,
otherwise both x and y borders are symmetric.
optimize_aspect_ratio: Flag (default is enabled) to preprocess images to the
aspect ratio used when training nemotron-parse before sending to the API.
**_: Thrown away kwargs.

Returns:
Expand Down Expand Up @@ -154,17 +223,34 @@ async def process_page( # noqa: PLR0912

rendered_page = page.render(**render_kwargs)
rendered_page_pil = rendered_page.to_pil()

# Initialize transformation tracking variables
aspect_scale = 1.0
aspect_offset_x = aspect_offset_y = 0
border_offset_x = border_offset_y = 0
if parse_media and not full_page: # If we need bounding boxes
if optimize_aspect_ratio:
aspect_image, aspect_scale, aspect_offset_x, aspect_offset_y = (
fit_image_to_target_aspect_ratio(rendered_page_pil)
)
else:
aspect_image = rendered_page_pil
# Apply white border padding to increase bounding box reliability
rendered_page_padded_pil, offset_x, offset_y = pad_image_with_border(
rendered_page_pil, border
rendered_page_padded_pil, border_offset_x, border_offset_y = (
pad_image_with_border(aspect_image, border)
)
image_for_api = np.array(rendered_page_padded_pil)
tool_name: Literal["markdown_bbox", "markdown_no_bbox"] = (
"markdown_bbox"
)
else:
image_for_api = rendered_page.to_numpy()
if optimize_aspect_ratio:
aspect_image, aspect_scale, aspect_offset_x, aspect_offset_y = (
fit_image_to_target_aspect_ratio(rendered_page_pil)
)
image_for_api = np.array(aspect_image)
else:
image_for_api = rendered_page.to_numpy()
tool_name = "markdown_no_bbox"
del rendered_page # Free pdfium bitmap memory

Expand All @@ -185,22 +271,38 @@ async def process_page( # noqa: PLR0912
async def extract_text(
detection: NemotronParseAnnotatedBBox,
) -> NemotronParseMarkdownBBox:
# Convert bbox from normalized [0, 1] to padded image pixel coordinates
padded_bbox = detection.bbox.to_page_coordinates(
rendered_page_padded_pil.height,
rendered_page_padded_pil.width,
# Convert bbox from normalized [0, 1] to padded image pixel
# coordinates, then convert to original image coordinates by
# removing offsets and scaling
pad_xmin, pad_ymin, pad_xmax, pad_ymax = (
detection.bbox.to_page_coordinates(
rendered_page_padded_pil.height,
rendered_page_padded_pil.width,
)
)
original_bbox = (
# xmin, ymin
# pylint: disable-next=possibly-used-before-assignment
max(0, padded_bbox[0] - offset_x),
# pylint: disable-next=possibly-used-before-assignment
max(0, padded_bbox[1] - offset_y),
# xmax, ymax
min(rendered_page_pil.width, padded_bbox[2] - offset_x),
min(rendered_page_pil.height, padded_bbox[3] - offset_y),
max(
0,
(pad_xmin - border_offset_x - aspect_offset_x)
/ aspect_scale,
),
max(
0,
(pad_ymin - border_offset_y - aspect_offset_y)
/ aspect_scale,
),
min(
rendered_page_pil.width,
(pad_xmax - border_offset_x - aspect_offset_x)
/ aspect_scale,
),
min(
rendered_page_pil.height,
(pad_ymax - border_offset_y - aspect_offset_y)
/ aspect_scale,
),
)
# Crop original image at bbox (without border)
# Crop original image at bbox (without border or aspect ratio padding)
region_pil = rendered_page_pil.crop(original_bbox)
# Use markdown_no_bbox to get text for this region
# abandoning the text if we're still hitting a length error
Expand Down Expand Up @@ -292,20 +394,36 @@ async def extract_text(
for item in cast(list[NemotronParseMarkdownBBox], response)
if item.type in CLASSIFICATIONS_WITH_MEDIA
):
# Convert bbox from normalized [0, 1] to padded image pixel coordinates
padded_bbox = item.bbox.to_page_coordinates(
rendered_page_padded_pil.height, rendered_page_padded_pil.width
# Convert bbox from normalized [0, 1] to padded image pixel
# coordinates, then convert to original image coordinates by
# removing offsets and scaling
pad_xmin, pad_ymin, pad_xmax, pad_ymax = (
item.bbox.to_page_coordinates(
rendered_page_padded_pil.height,
rendered_page_padded_pil.width,
)
)
# Adjust bbox to account for padding offsets
# Also if the bbox had extended into the padding zone,
# clamp it here as we're ditching the padding
original_bbox = (
# xmin, ymin
max(0, padded_bbox[0] - offset_x),
max(0, padded_bbox[1] - offset_y),
# xmax, ymax
min(rendered_page_pil.width, padded_bbox[2] - offset_x),
min(rendered_page_pil.height, padded_bbox[3] - offset_y),
max(
0,
(pad_xmin - border_offset_x - aspect_offset_x)
/ aspect_scale,
),
max(
0,
(pad_ymin - border_offset_y - aspect_offset_y)
/ aspect_scale,
),
min(
rendered_page_pil.width,
(pad_xmax - border_offset_x - aspect_offset_x)
/ aspect_scale,
),
min(
rendered_page_pil.height,
(pad_ymax - border_offset_y - aspect_offset_y)
/ aspect_scale,
),
)
region_pix = rendered_page_pil.crop(original_bbox)
img_bytes = io.BytesIO()
Expand Down
79 changes: 77 additions & 2 deletions packages/paper-qa-nemotron/tests/test_paperqa_nemotron.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
from PIL import Image

from paperqa_nemotron import parse_pdf_to_pages
from paperqa_nemotron.reader import pad_image_with_border
from paperqa_nemotron.reader import (
NEMOTRON_PARSE_TARGET_HEIGHT,
NEMOTRON_PARSE_TARGET_WIDTH,
fit_image_to_target_aspect_ratio,
pad_image_with_border,
)

REPO_ROOT = Path(__file__).parents[3]
STUB_DATA_DIR = REPO_ROOT / "tests" / "stub_data"
Expand Down Expand Up @@ -46,7 +51,7 @@ async def test_parse_pdf_to_pages(api_params_base: dict[str, Any]) -> None:
# between Abstract and Introduction
matches = re.findall(
r"(?:###? 1 Introduction[\n]+)?We introduce Pa ?S[as],"
r" an advanced Paper Search agent powered by large language models\.",
r" an advanced (?:\*\*)?Paper Search(?:\*\*)? agent powered by large language models\.",
p1_text,
)
assert len(matches) == 1, f"Parsing failed to handle abstract in {p1_text}."
Expand Down Expand Up @@ -376,6 +381,76 @@ def test_pad_image_with_border(subtests: pytest.Subtests) -> None:
assert padded.height == grayscale_image.height + 60 * 2


def test_fit_image_to_target_aspect_ratio(subtests: pytest.Subtests) -> None:
    """Exercise aspect-ratio fitting across image sizes, ratios, and modes."""
    expected_ratio = NEMOTRON_PARSE_TARGET_WIDTH / NEMOTRON_PARSE_TARGET_HEIGHT
    gray = (128, 128, 128)

    def check_centering(
        source: Image.Image, canvas: Image.Image, applied_scale: float, dx: int, dy: int
    ) -> None:
        # The scaled image is centered on the canvas, so each offset is half the slack
        assert dx == (canvas.width - int(source.width * applied_scale)) // 2
        assert dy == (canvas.height - int(source.height * applied_scale)) // 2

    with subtests.test(msg="large-image-no-scale"):
        # Image larger than target dims → no scaling, only aspect ratio adjustment
        source = Image.new("RGB", (2550, 3300), gray)  # US letter paper size
        canvas, applied_scale, dx, dy = fit_image_to_target_aspect_ratio(source)
        assert canvas.mode == "RGB"
        assert applied_scale == 1.0, "Large image should not be scaled"
        assert (
            abs(canvas.width / canvas.height - expected_ratio) < 0.01
        ), "Result should have target ratio"
        assert canvas.width >= source.width
        assert canvas.height >= source.height
        check_centering(source, canvas, applied_scale, dx, dy)

    with subtests.test(msg="small-image-scales-up"):
        # Image smaller than target dims → scales up
        source = Image.new("RGB", (800, 1000), gray)
        canvas, applied_scale, dx, dy = fit_image_to_target_aspect_ratio(source)
        assert canvas.mode == "RGB"
        assert applied_scale > 1.0, "Small image should be scaled up"
        assert (
            abs(canvas.width / canvas.height - expected_ratio) < 0.01
        ), "Result should have target ratio"
        check_centering(source, canvas, applied_scale, dx, dy)

    with subtests.test(msg="already-correct-ratio"):
        # Image already has target ratio → should just be scaled/padded appropriately
        source = Image.new(
            "RGB",
            (NEMOTRON_PARSE_TARGET_WIDTH, NEMOTRON_PARSE_TARGET_HEIGHT),
            gray,
        )
        canvas, applied_scale, dx, dy = fit_image_to_target_aspect_ratio(source)
        assert canvas.mode == "RGB"
        assert applied_scale == 1.0
        assert abs(canvas.width / canvas.height - expected_ratio) < 0.01
        assert dx == 0
        assert dy == 0

    with subtests.test(msg="wide-image"):
        # Wide image (landscape) → should be placed on portrait canvas
        source = Image.new("RGB", (3000, 2000), gray)
        canvas, applied_scale, dx, dy = fit_image_to_target_aspect_ratio(source)
        assert canvas.mode == "RGB"
        assert abs(canvas.width / canvas.height - expected_ratio) < 0.01
        check_centering(source, canvas, applied_scale, dx, dy)

    with subtests.test(msg="preserves-mode"):
        # Ensure image mode is preserved
        source = Image.new("RGBA", (2000, 3000), (100, 100, 100, 255))
        canvas, applied_scale, dx, dy = fit_image_to_target_aspect_ratio(source)
        assert canvas.mode == "RGBA"
        assert applied_scale == 1.0
        check_centering(source, canvas, applied_scale, dx, dy)


@pytest.mark.asyncio
async def test_media_enrichment_filters_irrelevant() -> None:
parsed_text = await parse_pdf_to_pages(
Expand Down
Loading