Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
FROM ghcr.io/astral-sh/uv:python3.12-bookworm

RUN apt-get update && apt-get install -y just

# Create non-root user with UID/GID commonly used by VS Code (1000:1000)
RUN useradd -ms /bin/bash -u 1000 vscode \
&& apt-get update && apt-get install -y sudo \
Expand Down
8 changes: 7 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,20 @@ dependencies = [
"rdflib>=7.1.4",
"SPARQLWrapper>=2.0.0",
"rapidfuzz>=3.0.0",
"typer>=0.12.5",
]
[dependency-groups]
dev = [
"notebook>=7.3.2",
]

[project.scripts]
cli = 'strings2things.cli:main'
# Primary multi-command CLI (Typer based)
strings2things = 'strings2things.cli:app'
# Shortcut to launch API server directly
strings2things-serve = 'strings2things.cli:serve'
# Backward compatibility alias if previous users relied on `cli`
cli = 'strings2things.cli:app'

[build-system]
requires = ["hatchling"]
Expand Down
188 changes: 152 additions & 36 deletions src/strings2things/app/api/endpoints.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,171 @@
# app/api/endpoints.py
"""RDF transformation API endpoints.

from fastapi import APIRouter, UploadFile, File, Form
We intentionally split raw-body and multipart upload into two endpoints so that
the generated OpenAPI schema (Swagger UI) shows both usage patterns. A single
endpoint mixing `UploadFile` with a normal JSON/Turtle body would appear only
as `multipart/form-data` in the docs.

Endpoints
---------
POST /v1/things Raw JSON-LD (object/array) or Turtle string body
POST /v1/things/upload Multipart file upload

Shared Parameters
-----------------
serialization (required) jsonld | turtle
fuzzy (optional, default false)
fuzzy_threshold (0-100, default 90)
"""

from __future__ import annotations

from typing import Literal
import json
import logging
from fastapi import APIRouter, UploadFile, File, Query, Form, Request, Body
from fastapi.responses import Response
from rdflib import Graph

from strings2things.app.core.rdf_transformer import RDFTransformer
from strings2things.app.core.ontology_manager import OntologyManager
from strings2things.app.utils.rdf_utils import parse_rdf, serialize_rdf
import logging

router = APIRouter()
ontology_manager = OntologyManager() # Lazy load on first access


def _detect_input_format(content_type: str, data: bytes) -> str:
"""Infer whether payload is JSON-LD or Turtle.

Uses explicit content-type when present; falls back to leading character
heuristic (object/array starts imply JSON-LD).
"""
ct = (content_type or "").split(";")[0].strip().lower()
if ct in {"application/ld+json", "application/json"}:
return "json-ld"
if ct in {"text/turtle", "application/x-turtle", "text/plain"}:
return "turtle"
lead = data.lstrip()[:1]
return "json-ld" if lead in (b"{", b"[") else "turtle"


ontology_manager = OntologyManager()
ontology_manager.load_ontologies()
def _label_map() -> dict[str, str]:
    """Return the ontology label map, attempting a load while it is empty.

    A failure during loading is logged and swallowed, so callers always get a
    mapping back (possibly empty) instead of an exception.
    """
    current = ontology_manager.get_label_map()
    if not current:
        try:
            ontology_manager.load_ontologies()
        except Exception:  # pragma: no cover (resilience path)
            logging.exception("Ontology loading failed; proceeding with empty label map")
    # Re-read after the load attempt; a falsy result becomes an empty dict.
    refreshed = ontology_manager.get_label_map()
    return refreshed if refreshed else {}

# Keep transformer as a base instance
base_label_map = ontology_manager.get_label_map()

def _transform_bytes(
    payload: bytes,
    content_type: str,
    serialization: Literal["jsonld", "turtle"],
    fuzzy: bool,
    fuzzy_threshold: int,
) -> tuple[str, str]:
    """Parse, transform and re-serialize an RDF payload.

    Shared by both endpoints: the input format is inferred from
    ``content_type`` / ``payload``, the graph is run through an
    ``RDFTransformer`` built from the current label map, and the result is
    serialized in the requested output format.

    Returns:
        A ``(serialized_graph, response_media_type)`` pair.
    """
    source_graph = parse_rdf(payload, format=_detect_input_format(content_type, payload))
    engine = RDFTransformer(_label_map(), fuzzy=fuzzy, fuzzy_threshold=fuzzy_threshold)
    result_graph = engine.transform(source_graph)
    # Map the public parameter value onto the serializer's format name and
    # the media type reported back to the client.
    if serialization == "jsonld":
        out_fmt, media_type = "json-ld", "application/ld+json"
    else:
        out_fmt, media_type = "turtle", "text/turtle"
    return serialize_rdf(result_graph, output_format=out_fmt), media_type

@router.post("/transform")
async def transform_rdf(
file: UploadFile = File(...),
serialization: str = Form("turtle"),
fuzzy: bool = Form(False), # <-- new parameter
fuzzy_threshold: int = Form(90), # <-- configurable fuzzy matching threshold

@router.post(
"/v1/things",
tags=["transform"],
summary="Transform RDF (raw body)",
description=(
"Send RDF as either JSON-LD (application/ld+json or application/json) or plain Turtle (text/turtle or text/plain). "
"When sending Turtle, set the Content-Type header to text/turtle (no JSON wrapper needed)."
),
openapi_extra={
"requestBody": {
"required": True,
"content": {
"application/ld+json": {
"schema": {"type": "object", "description": "JSON-LD object graph"},
"examples": {
"jsonldObject": {
"summary": "Minimal JSON-LD",
"value": {"@context": {}, "@id": "_:b0", "name": "Example"},
}
},
},
"application/json": {
"schema": {"type": "object", "description": "Interpreted as JSON-LD"},
"examples": {
"genericJson": {
"summary": "Generic JSON treated as JSON-LD",
"value": {"label": "Some Value"},
}
},
},
"text/turtle": {
"schema": {"type": "string", "description": "Turtle serialization"},
"examples": {
"turtle": {
"summary": "Turtle example",
"value": "@prefix ex: <http://example/> . ex:s ex:p ex:o .",
}
},
},
"text/plain": {
"schema": {"type": "string", "description": "Plain text (assumed Turtle)"}
},
},
}
},
)
async def transform_rdf_raw(
request: Request,
serialization: Literal["jsonld", "turtle"] = Query(..., description="Output format"),
fuzzy: bool = Query(False, description="Enable fuzzy matching"),
fuzzy_threshold: int = Query(90, ge=0, le=100, description="Fuzzy threshold 0-100"),
) -> Response:
"""
Accepts an RDF file upload, transforms it using the label map,
and returns the modified RDF graph in the requested format.

Args:
file: The uploaded RDF file
serialization: Desired output RDF serialization (default: turtle)
fuzzy: Whether to use fuzzy matching (default: False)
threshold: Similarity threshold for fuzzy matching (0-100, default: 90)
"""
content = await file.read()
"""Accept raw body as JSON-LD or Turtle without requiring JSON wrapping for Turtle.

We read the raw bytes to allow multiple media types. Format detection uses
the Content-Type header first, then a leading character heuristic.
"""
try:
input_graph = parse_rdf(content)

transformer = RDFTransformer(
base_label_map,
fuzzy=fuzzy,
fuzzy_threshold=fuzzy_threshold,
body = await request.body()
if not body:
return Response(content="Empty body", status_code=400)
content_type = request.headers.get("content-type", "")
serialized, media_type = _transform_bytes(
body, content_type, serialization, fuzzy, fuzzy_threshold
)
return Response(content=serialized, media_type=media_type)
except Exception as e: # pragma: no cover
logging.exception("Transformation failed")
return Response(content=str(e), status_code=400)

transformed_graph = transformer.transform(input_graph)
serialized = serialize_rdf(transformed_graph, output_format=serialization)

return Response(content=serialized, media_type="text/plain")

except Exception as e:
@router.post(
    "/v1/things/upload",
    tags=["transform"],
    summary="Transform RDF (file upload)",
    description="Upload an RDF file (JSON-LD or Turtle) via multipart/form-data",
)
async def transform_rdf_upload(
    file: UploadFile = File(..., description="RDF file (JSON-LD or Turtle)"),
    serialization: Literal["jsonld", "turtle"] = Form(..., description="Output format"),
    fuzzy: bool = Form(False, description="Enable fuzzy matching"),
    # Same 0-100 bounds as the raw-body endpoint so both routes validate alike.
    fuzzy_threshold: int = Form(90, ge=0, le=100, description="Fuzzy threshold 0-100"),
) -> Response:
    """Transform an RDF file sent as ``multipart/form-data``.

    The uploaded bytes are passed to ``_transform_bytes`` together with the
    part's content type (empty string when the client sent none).

    Returns:
        The serialized graph with the matching media type on success; a 400
        response with a plain-text message when the file is empty or when
        any exception is raised while transforming.
    """
    try:
        data = await file.read()
        if not data:
            return Response(content="Uploaded file is empty", status_code=400)
        serialized, media_type = _transform_bytes(
            data, file.content_type or "", serialization, fuzzy, fuzzy_threshold
        )
        return Response(content=serialized, media_type=media_type)
    except Exception as e:  # pragma: no cover
        # Every failure is logged with its traceback and reported as a 400.
        logging.exception("Transformation failed")
        return Response(content=str(e), status_code=400)

19 changes: 12 additions & 7 deletions src/strings2things/app/core/ontology_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,33 @@
from rdflib import Literal
from rdflib import XSD

settings = Settings()


class OntologyManager:
def __init__(self):
self.graph = Graph()
self.label_map: dict[str, str] = {}
# Defer settings creation until load time to avoid import-time failures
self.settings: Settings | None = None

def load_ontologies(self):
# Initialize settings on first load
if self.settings is None:
self.settings = Settings()

print(
f"[INFO] Connecting to SPARQL endpoint: {settings.ONTOLOGY_SPARQL_ENDPOINT}"
f"[INFO] Connecting to SPARQL endpoint: {self.settings.ONTOLOGY_SPARQL_ENDPOINT}"
)
for graph_iri in settings.get_graph_iris():
for graph_iri in self.settings.get_graph_iris():
print(f"[INFO] Loading named graph: {graph_iri}")
g = self._load_named_graph(settings.ONTOLOGY_SPARQL_ENDPOINT, graph_iri)
g = self._load_named_graph(self.settings.ONTOLOGY_SPARQL_ENDPOINT, graph_iri)
self.graph += g
print(f"[INFO] Loaded {len(self.graph)} triples.")
self._build_label_map()

def _load_named_graph(self, endpoint: str, graph_iri: str) -> Graph:
sparql = SPARQLWrapper(endpoint)
sparql.setCredentials(settings.GRAPHDB_USERNAME, settings.GRAPHDB_PASSWORD)
assert self.settings is not None, "Settings must be initialized before loading ontologies"
sparql.setCredentials(self.settings.GRAPHDB_USERNAME, self.settings.GRAPHDB_PASSWORD)
sparql.setQuery(
f"""
CONSTRUCT {{ ?s ?p ?o }}
Expand Down Expand Up @@ -80,7 +85,7 @@ def _check_ambiguities(self, seen: dict[str, str | list[str]]) -> dict[str, str]

if ambiguous_labels:
msg = f"Found ambiguous labels: {', '.join(sorted(ambiguous_labels))} \n Please resolve these in your ontology before proceeding."
if settings.FAIL_ON_AMBIGUOUS_LABELS:
if (self.settings or Settings()).FAIL_ON_AMBIGUOUS_LABELS:
raise ValueError(msg)
else:
print(f"[WARNING] {msg}")
Expand Down
104 changes: 104 additions & 0 deletions src/strings2things/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""Command Line Interface for strings2things.

Provides:
- `strings2things serve` : Run the FastAPI application (uvicorn)
- `strings2things transform` : Transform an RDF input file (Turtle or JSON-LD)

Installation (editable):
uv pip install -e .

Examples:
strings2things serve --host 0.0.0.0 --port 8080
strings2things transform --input data.ttl --serialization jsonld --output out.jsonld

Environment:
Ontology loading relies on environment variables (see config). Use `--no-ontology`
to run transformation without loading ontologies.
"""
from __future__ import annotations

import json
import sys
import typer
import uvicorn
from pathlib import Path
from typing import Optional, Literal

from strings2things.app.core.ontology_manager import OntologyManager
from strings2things.app.core.rdf_transformer import RDFTransformer
from strings2things.app.utils.rdf_utils import parse_rdf, serialize_rdf

app = typer.Typer(help="CLI tools for the strings2things RDF transformer")


def _load_label_map(disable: bool) -> dict[str, str]:
if disable:
return {}
manager = OntologyManager()
try:
manager.load_ontologies()
except Exception as e: # pragma: no cover - resilience path
typer.echo(f"[warn] Ontology load failed: {e}", err=True)
return {}
return manager.get_label_map() or {}


@app.command()
def serve(
    host: str = typer.Option("127.0.0.1", help="Host interface"),
    port: int = typer.Option(8000, help="Port to bind"),
    reload: bool = typer.Option(False, help="Enable uvicorn reload (dev only)"),
    log_level: str = typer.Option("info", help="Uvicorn log level"),
):
    """Run the FastAPI application via uvicorn."""
    # The docstring above doubles as the command's help text, so it is kept
    # short; implementation notes live in comments instead.
    #
    # This function is also exposed directly as a console-script entry point,
    # which calls it with no arguments. In that case every parameter still
    # holds its ``typer.Option(...)`` placeholder instead of a parsed value,
    # and handing those to uvicorn fails. Detect that situation and let Typer
    # parse ``sys.argv`` and call back in with real values.
    placeholders = (host, port, reload, log_level)
    if all(isinstance(value, typer.models.OptionInfo) for value in placeholders):
        typer.run(serve)
        return
    uvicorn.run(
        "strings2things.app.main:app",
        host=host,
        port=port,
        reload=reload,
        log_level=log_level,
    )


@app.command()
def transform(
    input: Path = typer.Option(
        ...,
        exists=True,
        dir_okay=False,
        readable=True,
        help="Input RDF file (Turtle or JSON-LD)",
    ),
    serialization: Literal["jsonld", "turtle"] = typer.Option(
        ..., "--serialization", "-s", help="Output serialization format"
    ),
    output: Optional[Path] = typer.Option(
        None, "--output", "-o", help="Output file path (stdout if omitted)"
    ),
    fuzzy: bool = typer.Option(False, help="Enable fuzzy label matching"),
    fuzzy_threshold: int = typer.Option(90, min=0, max=100, help="Fuzzy threshold 0-100"),
    no_ontology: bool = typer.Option(
        False, help="Disable ontology loading (use empty label map)"
    ),
):
    """Transform an RDF graph from a local file and emit the result."""
    raw = input.read_bytes()
    # Format inference: a leading "{" or "[" (after whitespace) is treated as
    # JSON-LD, everything else as Turtle.
    in_format = "json-ld" if raw.lstrip()[:1] in (b"{", b"[") else "turtle"

    source_graph = parse_rdf(raw, format=in_format)
    engine = RDFTransformer(
        _load_label_map(disable=no_ontology),
        fuzzy=fuzzy,
        fuzzy_threshold=fuzzy_threshold,
    )
    result_graph = engine.transform(source_graph)
    # Translate the CLI choice into the serializer's format name.
    out_fmt = {"jsonld": "json-ld"}.get(serialization, "turtle")
    rendered = serialize_rdf(result_graph, output_format=out_fmt)

    # Without --output the result goes to stdout.
    if not output:
        typer.echo(rendered)
        return
    output.write_text(rendered, encoding="utf-8")


def main() -> None:  # legacy entry point if needed
    """Invoke the Typer ``app`` object; also used by the ``__main__`` guard."""
    app()

if __name__ == "__main__": # pragma: no cover
main()
Loading