Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pygexml/image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from dataclasses import dataclass
from dataclasses_json import DataClassJsonMixin


@dataclass
class Image(DataClassJsonMixin):
    """Reference to an image: a filename plus optional width and height."""

    # Name of the image file.
    filename: str
    # Image width; may be None.
    width: int | None
    # Image height; may be None.
    height: int | None
35 changes: 31 additions & 4 deletions pygexml/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from lxml.etree import _Element as Element, QName

from .geometry import Point, Box, Polygon, GeometryError
from .image import Image


def find_child(element: Element, name: str) -> Element | None:
Expand Down Expand Up @@ -225,7 +226,7 @@ def all_words(self) -> Iterable[str]:

@dataclass
class Page(DataClassJsonMixin):
image_filename: str
image: Image
regions: dict[ID, TextRegion]

@classmethod
Expand All @@ -234,12 +235,24 @@ def from_xml(cls, element: Element) -> "Page":
raise PageXMLError("Wrong element given")

if "imageFilename" not in element.attrib:
raise PageXMLError("No filename found")
raise PageXMLError("No image filename found")

regions = find_children(element, "TextRegion")

return Page(
image_filename=str(element.attrib["imageFilename"]),
image=Image(
filename=str(element.attrib["imageFilename"]),
width=(
int(element.attrib["imageWidth"])
if "imageWidth" in element.attrib
else None
),
height=(
int(element.attrib["imageHeight"])
if "imageHeight" in element.attrib
else None
),
),
regions={
tr.id: tr for tr in (TextRegion.from_xml(region) for region in regions)
},
Expand Down Expand Up @@ -289,8 +302,22 @@ def from_alto(cls, element: Element) -> "Page":

text_blocks = find_children(printspace_element, "TextBlock")

# ALTO allows for float values, but we convert to int for consistency with PAGE XML
image_width = (
int(float(page_element.attrib["WIDTH"]))
if "WIDTH" in page_element.attrib
else None
)
image_height = (
int(float(page_element.attrib["HEIGHT"]))
if "HEIGHT" in page_element.attrib
else None
)

return Page(
image_filename=image_filename,
image=Image(
filename=image_filename, width=image_width, height=image_height
),
regions={
tb.id: tb for tb in (TextRegion.from_alto(tb) for tb in text_blocks)
},
Expand Down
27 changes: 25 additions & 2 deletions pygexml/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import hypothesis.strategies as st

from pygexml.geometry import Point, Box, Polygon
from pygexml.image import Image
from pygexml.page import Coords, Page, TextLine, TextRegion

st_points = st.builds(Point, x=st.integers(min_value=0), y=st.integers(min_value=0))
Expand Down Expand Up @@ -60,10 +61,32 @@ def st_simple_text(**kwargs):
),
)

# Strategy for Image values: the filename comes from st_simple_text(), and
# width and height are each independently either None or an integer >= 1.
st_images = st.builds(
    Image,
    filename=st_simple_text(),
    width=st.one_of(st.none(), st.integers(min_value=1)),
    height=st.one_of(st.none(), st.integers(min_value=1)),
)

# Strategy for Image values whose width and height are always set
# (integers >= 1, never None); the filename comes from st_simple_text().
st_images_with_dimensions = st.builds(
    Image,
    filename=st_simple_text(),
    width=st.integers(min_value=1),
    height=st.integers(min_value=1),
)


@st.composite
def st_pages(draw):
    """Draw a Page built from an image from ``st_images`` and a list of regions.

    Regions are keyed by their id, so regions drawn with the same id collapse
    into a single entry (the last one drawn wins).
    """
    # The former standalone image filename draw is gone: the filename now
    # lives inside the drawn Image, and the extra value was never used.
    image = draw(st_images)
    regions = {tr.id: tr for tr in draw(st.lists(st_text_regions))}
    return Page(image=image, regions=regions)


@st.composite
def st_pages_with_dimensions(draw):
    """Draw a Page whose image always has a width and a height.

    The image comes from ``st_images_with_dimensions``. Regions are keyed by
    their id, so regions drawn with the same id collapse into a single entry
    (the last one drawn wins).
    """
    image = draw(st_images_with_dimensions)
    regions = {tr.id: tr for tr in draw(st.lists(st_text_regions))}
    # Build the page from the Image only; the stale construction through the
    # removed ``image_filename`` field referenced an undefined name.
    return Page(image=image, regions=regions)
133 changes: 133 additions & 0 deletions pygexml/svg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from lxml import etree
from lxml.etree import _Element as Element

from .page import Page, TextRegion, TextLine

# XML namespace URIs used below to build qualified names for SVG elements
# and for xlink attributes such as href.
SVG_NS = "http://www.w3.org/2000/svg"
XLINK_NS = "http://www.w3.org/1999/xlink"


class SVGError(Exception):
    """Error raised by the SVG export code."""


def _coords_path(coords_str: str) -> str:
    """Wrap a coordinate string into SVG path data.

    The string is prefixed with the ``M`` (moveto) command and followed by
    the ``Z`` (closepath) command, each separated by a single space.
    """
    return "M {} Z".format(coords_str)


def _baseline_path_d(line: TextLine) -> str:
    """Build SVG path data for an approximated baseline of a text line.

    The baseline is a horizontal segment spanning the bounding box of the
    line polygon from its top-left x to its bottom-right x. Its y value lies
    two thirds of the way from the top-left y to the bottom-right y, computed
    with integer (floor) division.
    """
    bbox = line.coords.polygon.bounding_box()
    y_top = bbox.top_left.y
    y_bottom = bbox.bottom_right.y
    baseline_y = y_top + (y_bottom - y_top) * 2 // 3
    x_start = bbox.top_left.x
    x_end = bbox.bottom_right.x
    return f"M {x_start},{baseline_y} {x_end},{baseline_y}"


def _line_to_svg(line: TextLine) -> Element:
    """Render a text line as an SVG ``g`` element.

    The group carries the line id and the class "TextLine". It contains a
    "Coords" path built from the line coordinates and a "Baseline" path whose
    id is "bl-<line id>". When the line has non-empty text, a ``text``
    element is added whose ``textPath`` references the baseline path and
    holds the text inside a ``tspan`` of class "Text".
    """
    path_tag = f"{{{SVG_NS}}}path"
    baseline_id = f"bl-{line.id}"

    group = etree.Element(
        f"{{{SVG_NS}}}g", attrib={"id": line.id, "class": "TextLine"}
    )

    outline_attrs = {"d": _coords_path(str(line.coords)), "class": "Coords"}
    etree.SubElement(group, path_tag, attrib=outline_attrs)

    baseline_attrs = {
        "id": baseline_id,
        "d": _baseline_path_d(line),
        "class": "Baseline",
    }
    etree.SubElement(group, path_tag, attrib=baseline_attrs)

    # Lines without text get outline and baseline only.
    if not line.text:
        return group

    text_el = etree.SubElement(group, f"{{{SVG_NS}}}text")
    text_path_attrs = {
        f"{{{XLINK_NS}}}href": f"#{baseline_id}",
        "textLength": "100%",
    }
    text_path_el = etree.SubElement(
        text_el, f"{{{SVG_NS}}}textPath", attrib=text_path_attrs
    )
    span = etree.SubElement(
        text_path_el, f"{{{SVG_NS}}}tspan", attrib={"class": "Text"}
    )
    span.text = line.text
    return group


def _region_to_svg(region: TextRegion) -> Element:
    """Render a text region as an SVG ``g`` element.

    The group carries the region id and the class "TextRegion". Its first
    child is a "Coords" path built from the region coordinates, followed by
    one group per text line in the iteration order of ``region.textlines``.
    """
    group = etree.Element(
        f"{{{SVG_NS}}}g", attrib={"id": region.id, "class": "TextRegion"}
    )
    outline_attrs = {"d": _coords_path(str(region.coords)), "class": "Coords"}
    etree.SubElement(group, f"{{{SVG_NS}}}path", attrib=outline_attrs)
    group.extend(_line_to_svg(text_line) for text_line in region.textlines.values())
    return group


def _default_style(width: int, height: int) -> Element:
    """Create the default SVG ``style`` element, scaled to the image size.

    Stroke widths and the font size are derived from the longer image side
    by integer division and are clamped to a minimum of 1.

    Args:
        width: Image width.
        height: Image height.

    Returns:
        A ``style`` element in the SVG namespace holding the CSS rules.
    """
    longest_side = max(width, height)
    # Integer division yields 0 for small images (below 1500, 2000 and 60
    # respectively); a zero stroke width or font size would make the outlines
    # and the text invisible, so every derived size is at least 1.
    coords_stroke = max(1, longest_side // 1500)
    baseline_stroke = max(1, longest_side // 2000)
    font_size = max(1, longest_side // 60)

    style = etree.Element(f"{{{SVG_NS}}}style")
    style.text = (
        f"\n"
        f" path.Coords {{ fill: rgba(100,160,255,0.12); stroke: steelblue; stroke-width: {coords_stroke}; }}\n"
        f" path.Baseline {{ stroke: #e74c3c; stroke-width: {baseline_stroke}; fill: none; }}\n"
        f" .TextLine text {{ font-size: {font_size}px; font-family: serif; fill: #000; opacity: 0; transition: opacity 0.15s; }}\n"
        f" .TextLine:hover text {{ opacity: 1; }}\n"
        f" "
    )
    return style


def page_to_svg(page: Page, include_style: bool = True) -> Element:
    """Render a page as an SVG element tree.

    The root ``svg`` element uses the image width and height for its size and
    viewBox. Its children are, in order: the default style element (only when
    ``include_style`` is true), an ``image`` element referencing the page
    image filename, and one group per text region.

    Raises:
        SVGError: If the page image has no width or no height.
    """
    image = page.image
    if image.width is None:
        raise SVGError("Image width is required for SVG generation")
    if image.height is None:
        raise SVGError("Image height is required for SVG generation")

    width, height = image.width, image.height
    width_str, height_str = str(width), str(height)

    root = etree.Element(
        f"{{{SVG_NS}}}svg",
        # the official way to do it although stubs are wrong:
        nsmap={None: SVG_NS, "xlink": XLINK_NS},  # type: ignore
        attrib={
            "width": width_str,
            "height": height_str,
            "viewBox": f"0 0 {width} {height}",
        },
    )

    # The style element has to come first, ahead of the image.
    if include_style:
        root.append(_default_style(width, height))

    image_attrs = {
        "x": "0",
        "y": "0",
        "width": width_str,
        "height": height_str,
        f"{{{XLINK_NS}}}href": image.filename,
        "preserveAspectRatio": "none",
    }
    etree.SubElement(root, f"{{{SVG_NS}}}image", attrib=image_attrs)

    for text_region in page.regions.values():
        root.append(_region_to_svg(text_region))

    return root


def page_to_svg_string(page: Page, include_style: bool = True) -> str:
    """Return the SVG rendering of a page as a pretty-printed unicode string.

    ``include_style`` is passed through to ``page_to_svg``.
    """
    root = page_to_svg(page, include_style=include_style)
    return etree.tostring(root, encoding="unicode", pretty_print=True)
29 changes: 29 additions & 0 deletions test/test_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from hypothesis import given
import hypothesis.strategies as st

from pygexml.strategies import st_images
from pygexml.image import Image


def test_image_example() -> None:
    """A hand-written Image exposes the values it was constructed with."""
    image = Image(filename="a.jpg", width=800, height=600)
    observed = (image.filename, image.width, image.height)
    assert observed == ("a.jpg", 800, 600)


@given(
    st.text(),
    st.one_of(st.none(), st.integers(min_value=1)),
    st.one_of(st.none(), st.integers(min_value=1)),
)
def test_image_arbitrary(
    filename: str, width: int | None, height: int | None
) -> None:
    """An Image stores arbitrary constructor values unchanged.

    Width and height are annotated as optional because the strategies above
    also generate None for them.
    """
    image = Image(filename=filename, width=width, height=height)
    assert image.filename == filename
    assert image.width == width
    assert image.height == height


@given(st_images)
def test_image_serialization_roundtrip_arbitrary(image: Image) -> None:
    """Converting an Image to a dict and back yields an equal Image."""
    as_dict = image.to_dict()
    restored = Image.from_dict(as_dict)
    assert restored == image
Loading
Loading