21 commits
8bc4a32
Fresh start for LDP Container
benosteen Aug 5, 2025
e7a7942
Initial plumbing in of the LDPContainer
benosteen Aug 7, 2025
5b4c285
Some fixes
benosteen Aug 7, 2025
1e5e33f
handling record rollback in graphstore
benosteen Aug 7, 2025
39fa61e
Refactoring fix to allow for easy container handling
benosteen Aug 7, 2025
8f916d4
Adding in the LDP Container autogeneration
benosteen Aug 7, 2025
547b609
Ensure the root container exists
benosteen Aug 8, 2025
afd0cd1
Fixing the testing pattern which creates app THEN db. It affected the…
benosteen Aug 8, 2025
9e75074
Aligning the postgres/migration
benosteen Aug 8, 2025
1de8dfc
Not sure why pcre is in here
benosteen Aug 14, 2025
8fcdd77
Adding the beginnings of the LDP docs
benosteen Aug 14, 2025
84d258c
Finished adding all the LDP BACKEND hooks and utils for the API
benosteen Aug 14, 2025
c2665e7
Fix segmentation + refresh capability to ingest + root LDP view
benosteen Aug 15, 2025
49d7740
Added LDP_API paginated GET Response for containers
benosteen Aug 15, 2025
6622e84
Making the container page etag depend on the page number too
benosteen Aug 20, 2025
0124762
Making the page and container etags different
benosteen Aug 20, 2025
5f97a04
Cleaning up the LDP representation
benosteen Aug 27, 2025
f3e2e31
Adding in the base representation handling for POST/PUT resources
benosteen Aug 27, 2025
ab8adaf
If the JSON-LD gets prefixed to FQDN, remove the @context.@base if pr…
benosteen Aug 27, 2025
d60a8ed
Adding RDF parsing to framed JSON-LD capability for LDP
benosteen Aug 28, 2025
b2bcce6
Merge branch 'main' into feature-ldp
benosteen Aug 29, 2025
3 changes: 3 additions & 0 deletions .env.example
@@ -114,4 +114,7 @@ RDF_CONTEXT_CACHE_EXPIRES=30
# a .env file DOES. Very annoying.
# When defining environment variables in a file to be used with docker run --env-file, avoid quoting values unless you explicitly intend for the quotes to be part of the variable's content. If a value contains spaces or special characters, simply define it without quotes, as Docker will treat the entire string after the = as the value.

LDP_BACKEND=True
LDP_AUTOCREATE_CONTAINERS=True

CONTENT_PROFILE_DATA={"name": "Simplified Dublin Core", "description": "No description given.", "url": "https://raw.githubusercontent.com/thegetty/getty-jsonld-sparql-patterns/refs/heads/main/src/gettysparqlpatterns/data/crmtodublincore.json", "patterns": [{"name": "InformationObject as DC", "description": "Views a CRM InformationObject in terms of Simplified Dublin Core", "sparql_pattern": "# MOCKED DCMI RESPONSE\nPREFIX aat: <http://vocab.getty.edu/aat/>\nPREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\nPREFIX dc: <http://purl.org/dc/elements/1.1/>\nPREFIX dcterm: <http://purl.org/dc/terms/>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n\n\nCONSTRUCT {\n <$URI> dc:identifier ?identifier ;\n dc:title ?title ;\n dc:description ?note ;\n dc:creator ?agent_name ;\n dc:date ?date_range ;\n dc:date ?date_expression ;\n dc:language ?lang ;\n dc:language ?lang_as_text ;\n dc:subject ?subject .\n ?subject rdfs:label ?subject_text .\n \n}\nWHERE {\nSELECT ?identifier ?title ?note ?subject ?subject_text ?date_range ?date_expression ?lang_as_text ?lang ?agent_name\n WHERE {\n GRAPH <$URI> {\n <$URI> crm:P1_is_identified_by ?id_block .\n ?id_block crm:P2_has_type ?id_type ;\n crm:P190_has_symbolic_content ?identifier_text \n BIND(CONCAT(?identifier_text, \" (\", STR(?id_type), \")\") as ?identifier) .\n <$URI> rdfs:label ?title .\n OPTIONAL {\n <$URI> crm:P72_has_language ?lang .\n ?lang rdfs:label ?lang_as_text\n }\n OPTIONAL {\n <$URI> crm:P129_is_about ?subject .\n ?subject a crm:E33_Linguistic_Object ;\n rdfs:label ?subject_text\n }\n OPTIONAL {\n <$URI> crm:P67i_is_referred_to_by ?notes .\n ?notes crm:P2_has_type ?note_type ;\n crm:P190_has_symbolic_content ?note_text \n BIND(CONCAT(\"(\", STR(?note_type), \") \", ?note_text) as ?note) .\n }\n OPTIONAL {\n ?prod_act a crm:E12_Production ;\n OPTIONAL {\n \t?prod_act crm:P9_consists_of ?activities .\n ?activities crm:P14_carried_out_by ?agent .\n ?agent rdfs:label ?agent_name .\n }\n OPTIONAL {\n ?prod_act crm:P4_has_time-span ?timespan .\n ?timespan crm:P82a_begin_of_the_begin ?begin ;\n \t\t\t crm:P82b_end_of_the_end ?end \n OPTIONAL {\n BIND(CONCAT(STR(?begin), \" - \", STR(?end)) as ?date_range)\n ?timespan crm:P1_is_identified_by ?expression .\n ?expression a crm:E33_E41_Linguistic_Appellation ;\n \t\t\t\t crm:P190_has_symbolic_content ?date_expression ;\n }\n }\n }\n }\n}\n}", "stype": "construct", "keyword_parameters": ["URI"], "default_values": {}, "applies_to": ["InformationObject"], "ask_filter": null, "framing": null, "profile_uri": "urn:getty:dublincore"}, {"name": "LinguisticObject to dublincore", "description": "No description given", "sparql_pattern": "PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>\nPREFIX dc: <http://purl.org/dc/elements/1.1/>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nCONSTRUCT {\n <$URI> dc:title ?title ;\n dc:description ?content ;\n dc:type ?type ;\n dc:type ?type_name .\n} WHERE {\n SELECT ?title ?content ?type ?type_name WHERE {\n <$URI> a crm:E33_Linguistic_Object ;\n rdfs:label ?title ;\n crm:P2_has_type ?type ;\n crm:P190_has_symbolic_content ?content .\n ?type rdfs:label ?type_name .\n }\n}", "stype": "construct", "keyword_parameters": ["URI"], "default_values": {}, "applies_to": ["LinguisticObject"], "ask_filter": null, "framing": null, "profile_uri": "urn:getty:dublincore"}]}
2 changes: 1 addition & 1 deletion documentation/content_negotiation.md
@@ -1,5 +1,5 @@
# Content Negotiation

([back to ToC](/README.md))
## Overview

The LOD Gateway, when configured to support RDF Processing, provides both standard HTTP Content Negotiation of mimetype and data-specific support for [Content Negotiation by Profile from the W3C](https://www.w3.org/TR/dx-prof-conneg/) (aka CNBP).
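For illustration, a minimal sketch of how a client might negotiate both mimetype and profile; the resource URL is hypothetical, the profile URI is taken from the example CONTENT_PROFILE_DATA configuration, and the exact negotiation mechanism (an Accept-Profile header versus a profile parameter on Accept) depends on how the gateway is configured:

```python
import requests

# Hypothetical resource URL; "urn:getty:dublincore" matches the example CONTENT_PROFILE_DATA config.
resp = requests.get(
    "https://lod.example.org/object/123",
    headers={
        "Accept": "application/ld+json",
        "Accept-Profile": "urn:getty:dublincore",  # assumption: CNBP via an Accept-Profile header
    },
)
print(resp.headers.get("Content-Profile"))  # profile actually applied, if the server reports one
print(resp.json())
```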
30 changes: 30 additions & 0 deletions documentation/ldp.md
@@ -0,0 +1,30 @@
# Linked Data Platform Backend and API
([back to ToC](/README.md))
## Overview
The [Linked Data Platform (LDP)](https://www.w3.org/TR/ldp/#ldpc) is a W3C standard that defines a set of rules for interacting with web resources using HTTP, enabling a read-write Linked Data architecture on the web. It provides a way to access, create, update, and delete RDF resources over HTTP in a standardized manner, facilitating data integration and interoperability. LDP builds upon the principles of Linked Data: using URIs as names for things, and providing useful information, including links to other URIs, when a URI is looked up.

One of the core concepts is the idea of a 'container' which contains zero or more Linked Data resources as a set. A container may have resources added or removed from it, and when resolving its URI, it should be possible to find out more about the container as well as list every resource it contains. A container may contain almost anything that can be referenced with a URI.

When the LDP features are enabled, the LOD Gateway is extended with container structures, and the service itself is viewed as having a single 'root' container that everything else (resources and other containers) is part of. The API is also extended to allow for a subset of the LDP API functionality defined in the specification.
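As a rough sketch of what that looks like from a client's point of view (the base URL is hypothetical; the paging behaviour is described in the limitations below):

```python
import requests

ROOT = "https://lod.example.org/"  # hypothetical LOD Gateway root container

resp = requests.get(ROOT, headers={"Accept": "application/ld+json"})
print(resp.headers.get("Link"))  # LDP type / paging links advertised by the service
print(resp.json())               # JSON-LD description of the root container and a page of its members
```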

### Limited Support for the Full LDP Specification
The LOD Gateway and the LDP specification are based around different ideas on how to handle and store Linked Data:
- The LOD Gateway is built around a CRUD API for JSON documents and extends that by adding optional Named Graph JSON-LD RDF capabilities, Content Negotiation, SPARQL, ActivityStreams, and fast bulk ingest and updating.
- LDP is a much broader specification which can involve both RDF and non-RDF resources such as images, and specifies the concepts and API for containers. Its focus is on the REST API required to manage these resources and containers.

The LOD Gateway implementation supports a narrower portion of the full LDP specification:

- The LOD Gateway's LDP support depends on RDF support also being enabled (PROCESS_RDF=True).
- The LOD Gateway root path is the root in the container hierarchy.
- The container hierarchy mirrors the URL path, e.g. /annotations/test/12345 will have the following container hierarchy:
  - root '/' <-member- '/annotations/' <-member- '/annotations/test/' <-member- '/annotations/test/12345'
- A container will contain either other containers or named graphs (the JSON-LD documents).
- Adding binary or non-RDF content is not supported.
- The LOD Gateway will use ldp:BasicContainer and only supports the dc:title and dc:description fields for user-editable container metadata.
- Pagination using the [LDP-PAGING specification](https://www.w3.org/TR/ldp-paging/) will be the default, and members will be listed containers first, followed by the member JSON-LD resources. This will be an ordered but dynamic response, so page responses may vary after resources are deleted.
  - This is usefully different from the existing static ActivityStreams responses, as the container listing will only contain members that are part of that container at the time of querying, and will not list deleted members.
- Containers and Named Graphs must be added to an existing container resource, or the request will fail (see the sketch below):
  - A POST request to a container, using the Slug header to specify an optional relative identifier.
  - A PUT request to a specific path. The slug will be inferred from the path.
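A minimal sketch of those two request styles, assuming a hypothetical gateway base URL, an existing /annotations/test/ container, and a simple JSON-LD payload; authentication and exact content types depend on the deployment:

```python
import json
import requests

BASE = "https://lod.example.org/annotations/test/"  # hypothetical, pre-existing container

doc = {
    "@context": {"dc": "http://purl.org/dc/elements/1.1/"},
    "dc:title": "An example resource",
}

# POST to the container, suggesting a relative identifier via the Slug header
resp = requests.post(
    BASE,
    headers={"Content-Type": "application/ld+json", "Slug": "12345"},
    data=json.dumps(doc),
)

# PUT to a specific path; the slug is inferred from the path itself
resp = requests.put(
    BASE + "12345",
    headers={"Content-Type": "application/ld+json"},
    data=json.dumps(doc),
)
```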

NB There is an optional feature (LDP_AUTOCREATE_CONTAINERS) that can be enabled to make the LOD Gateway automatically create the necessary container structure if it does not already exist.
54 changes: 52 additions & 2 deletions source/web-service/flaskapp/__init__.py
@@ -26,6 +26,7 @@
from flaskapp.models import db
from flaskapp.models.activity import Activity
from flaskapp.models.record import Record
from flaskapp.models.container import LDPContainer, LDPContainerContents
from flaskapp import local_thesaurus
from flaskapp.base_graph_utils import base_graph_filter, document_loader

@@ -131,6 +132,10 @@ def create_app():
app.config["RDF_FILTER_SET"] = None
app.config["CONTENT_PROFILE_DATA_URL"] = None
app.config["CONTENT_PROFILE_PATTERNS"] = {}
app.config["LDP_BACKEND"] = False
app.config["LDP_API"] = False
app.config["LDP_AUTOCREATE_CONTAINERS"] = False
app.config["LDP_VALIDATE_SLUGS"] = False

# Set up RDF/Content Profile defaults:
app.config["USE_PYLD_REFORMAT"] = True
@@ -143,6 +148,42 @@
app.config["SPARQL_QUERY_ENDPOINT"] = environ["SPARQL_QUERY_ENDPOINT"]
app.config["SPARQL_UPDATE_ENDPOINT"] = environ["SPARQL_UPDATE_ENDPOINT"]

# LDP Support (NB all dependent on PROCESS_RDF being true)
# LDP_BACKEND -> This flag controls the backend bookkeeping where the container lists are kept
# up-to-date based on record changes (add or delete).
# LDP_API -> This flag switches on the LDP support in the API (Link headers, API endpoint behaviors and responses)
# Also dependent on the LDP_BACKEND flag
if ldp_backend := (environ.get("LDP_BACKEND", "False").lower() == "true"):
# LDP_API is dependent on the backend being switched on as well
app.config["LDP_BACKEND"] = ldp_backend

# Switch on the LDP API? This turns on both the advertising using Link headers, and also the routing support
# for containers, paging through their contents, and adding/updating resources and containers
app.config["LDP_API"] = environ.get("LDP_API", "False").lower() == "true"

# Should the backend automatically instantiate containers when a record is put through the /ingest route?
# eg if True, Adding '/objects/magazines/123456' will create an LDP container for '/objects' and for
# '/objects/magazines' with the parent-child relationship you'd expect, and add the resource
# '/objects/magazines/123456' to the container '/objects/magazines'
# If False, adding a resource through /ingest without the expected containers in place will fail, which will mimic
# the behavior of the LDP API if you attempt to add a resource to a container that does not exist
app.config["LDP_AUTOCREATE_CONTAINERS"] = (
environ.get("LDP_AUTOCREATE_CONTAINERS", "False").lower() == "true"
)

# Perform an extra step to validate that the generated slug for a resource doesn't exist in the container yet?
app.config["LDP_VALIDATE_SLUGS"] = (
environ.get("LDP_VALIDATE_SLUGS", "False").lower() == "true"
)

app.logger.info(
f"LDP Support: Backend active? {ldp_backend}, LDP API active? {app.config['LDP_API']}, Autocreate Containers on /ingest? {app.config['LDP_AUTOCREATE_CONTAINERS']}"
)
elif environ.get("LDP_API", "False").lower() == "true":
app.logger.error(
"LDP_API was set to True BUT LDP_BACKEND was not. LDP_API will NOT BE ACTIVE!"
)

# Content Profiles - by URL resource
app.config["CONTENT_PROFILE_DATA_URL"] = environ.get("CONTENT_PROFILE_DATA_URL")
# or by JSON-encoded env variable:
@@ -304,6 +345,13 @@ def create_app():
app.config["RDF_BASE_GRAPH"], app.config["FULL_BASE_GRAPH"]
)

# Are we able to use postgresql optimizations?
engine = db.engine
if engine.dialect.name == "postgresql":
app.config["DB_DIALECT"] = "postgresql"
else:
app.config["DB_DIALECT"] = "base"

app.config["SERVER_CAPABILITIES"] = (
", ".join(
[
@@ -314,6 +362,8 @@
("SUBADDRESSING", "Subaddressing"),
("KEEP_LAST_VERSION", "Versioning"),
("CONTENT_PROFILE_PATTERNS_AVAILABLE", "Content Profiles"),
("LDP_BACKEND", "LDP Container Backend"),
("LDP_API", "LDP API Support"),
]
if app.config.get(k)
]
@@ -326,9 +376,9 @@
if link_bank_str:
try:
app.config["LINK_BANK"] = json.loads(link_bank_str)
except json.decoder.JSONDecodeError as e:
except json.decoder.JSONDecodeError:
app.logger.error(
f"The data in ENV: 'LINK_BANK' is not valid JSON! Will not load Link Bank values"
"The data in ENV: 'LINK_BANK' is not valid JSON! Will not load Link Bank values"
)

app.register_blueprint(home_page, url_prefix=f"/{ns}")
4 changes: 2 additions & 2 deletions source/web-service/flaskapp/base_graph_utils.py
@@ -93,9 +93,9 @@ def base_graph_filter(basegraphobj, fqdn_id):
try:
record = get_record(basegraphobj)

if record and record.data:
if record and "record" in record and record["record"].data:
# only change the named graph to be a FQDN
data = dict(record.data)
data = dict(record["record"].data)
else:
current_app.logger.warning(
f"No base graph was present at {basegraphobj} - adding an empty base graph."
83 changes: 82 additions & 1 deletion source/web-service/flaskapp/conneg.py
@@ -1,6 +1,9 @@
import re
import json
import requests

from pyld import jsonld

from werkzeug.http import parse_accept_header

# type hint imports
@@ -9,7 +12,13 @@

from flaskapp.errors import status_graphstore_error, status_nt

from .graph_prefix_bindings import FORMATS
from .graph_prefix_bindings import (
FORMATS,
get_bound_graph,
get_frame,
BASE_FRAME_CONTEXT,
)
from .utilities import triples_to_quads

# Trying to use a regex to parse out a profile="" statement from the Accept header
# Not in use yet, but is close to workable so keeping this here.
@@ -150,3 +159,75 @@ def get_data_using_profile_query(
except Exception as e:
print(f"Hit unexpected Exception {str(e)}")
raise e


def reformat_rdf(data, shortformat="turtle", use_pyld=True, rdf_docloader=None):
if shortformat == "json-ld":
# Assume data is *already* JSON-LD
return data
if use_pyld is True:
# Use the PyLD library to parse into nquads, and rdflib to convert
# rdflib's json-ld import has not been tested on our data, so not relying on it
proc = jsonld.JsonLdProcessor()
serialized_rdf = proc.to_rdf(
data,
{
"format": "application/n-quads",
"documentLoader": rdf_docloader,
},
)

ident = data.get("id") or data.get("@id")

# rdflib to load and format the nquads
# forcing it, because of pyld's awful nquad export
g = get_bound_graph(identifier=ident)

# May not be nquads, even though we requested it:
serialized_rdf = triples_to_quads(serialized_rdf, ident)

g.parse(data=serialized_rdf, format="nquads")
data = g.serialize(format=shortformat)
return data
else:
ident = data.get("id") or data.get("@id")

# using rdflib to both parse and re-serialize the RDF:
g = get_bound_graph(identifier=ident)

g.parse(data=json.dumps(data), format="json-ld")
data = g.serialize(format=shortformat)
# blank out the etag for now
return data


def reformat_to_jsonld(data, incoming_format, target_base=None, top_level_id=None):
if incoming_format == "json-ld":
return data

# Rebase to target
if incoming_format in ["turtle", "n3"] and target_base is not None:
# rewrite the incoming base to the new one
# the json-ld property setter will validate and finish the rebase properly
data = "\n".join(
[f"@base <{target_base}> ."]
+ [x for x in data.split("\n") if not x.startswith("@base")]
)

# using rdflib to both parse and re-serialize the RDF:
g = get_bound_graph(identifier=top_level_id)
g.parse(data=data, format=incoming_format)

# data = g.serialize(format="json-ld", base=target_base, compact=True)
data = g.serialize(format="json-ld", compact=True)

# load it as JSON
return json.loads(data)


def frame_jsonld(data, target_uri):
# This will be used on data pulled in via turtle or similar, not JSON-LD
frame = get_frame(target_uri)
expanded = jsonld.expand(data)
framed = jsonld.frame(expanded, frame)
return jsonld.compact(framed, BASE_FRAME_CONTEXT)
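For context, a rough sketch of how these helpers might be chained to turn an incoming Turtle payload into framed, compacted JSON-LD; the sample data, base URL and resource URI are made up:

```python
from flaskapp.conneg import reformat_to_jsonld, frame_jsonld

turtle_doc = """@base <http://example.org/> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
<resource/1> dc:title "A sample title" .
"""

# Parse the Turtle, rebasing it onto the gateway's FQDN, and return plain JSON-LD
doc = reformat_to_jsonld(
    turtle_doc,
    incoming_format="turtle",
    target_base="https://lod.example.org/",
    top_level_id="https://lod.example.org/resource/1",
)

# Frame and compact it around the single top-level resource
framed = frame_jsonld(doc, "https://lod.example.org/resource/1")
```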
12 changes: 12 additions & 0 deletions source/web-service/flaskapp/errors.py
@@ -22,6 +22,9 @@
status_record_not_found = status_nt(
404, "Record Not Found", "Unable to obtain matching record from database"
)
status_container_not_found = status_nt(
404, "Container Not Found", "Unable to find container in database"
)

status_page_not_found = status_nt(404, "Page Not Found", "Page number out of bounds")

@@ -50,6 +53,15 @@
)


class RDFDataError(ValueError):
"""Error representing a general problem treating some data as RDF"""


class ResourceValidationError(RDFDataError):
"""An error occurred trying to validate the given resource. This can happen if say,
the JSONLD doc has a @base but its top level 'id' has a scheme like https/urn/etc"""


# Construct 'error response' object
def construct_error_response(status, source: int = None, detail: str = None):

21 changes: 21 additions & 0 deletions source/web-service/flaskapp/graph_prefix_bindings.py
@@ -1,5 +1,7 @@
from rdflib import ConjunctiveGraph, Namespace

from rdflib.namespace import DC, DCTERMS


BINDING = {
"crm": Namespace("http://www.cidoc-crm.org/cidoc-crm/"),
@@ -17,6 +19,14 @@
"prov": Namespace("http://www.w3.org/ns/prov#"),
}

# For items uploaded outside of JSON-LD
BASE_FRAME_CONTEXT = {
"id": "@id",
"type": "@type",
}
for k, v in BINDING.items():
BASE_FRAME_CONTEXT[k] = str(v)

FORMATS = {
# RDF triple formats
"application/n-triples; charset=UTF-8": "nt11",
@@ -33,8 +43,19 @@
}


# Basic framing, anticipating a single top-level URI
def get_frame(identifier):
return {
"@context": BASE_FRAME_CONTEXT,
"@id": identifier,
"@embed": "@always",
}


def get_bound_graph(identifier):
g = ConjunctiveGraph(identifier=identifier)
g.bind("dc", DC)
g.bind("dcterm", DCTERMS)
for k, v in BINDING.items():
g.bind(k, v)
return g
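A quick illustration of the two helpers above; the identifier is a placeholder:

```python
from flaskapp.graph_prefix_bindings import get_bound_graph, get_frame

ident = "https://lod.example.org/objects/1"  # placeholder identifier

g = get_bound_graph(identifier=ident)  # ConjunctiveGraph with dc, dcterm, crm, etc. prefixes bound
frame = get_frame(ident)               # minimal frame targeting that single top-level URI, embedding all nodes
```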