Skip to content

Commit 7b04995

Browse files
committed
feat: add Crossref DOI provider
* support more flexible DataCite DOI prefixes, similar to the Crossref provider
1 parent 06022bc commit 7b04995

20 files changed

Lines changed: 1919 additions & 367 deletions

invenio_rdm_records/config.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from . import tokens
3131
from .requests.community_inclusion import CommunityInclusion
3232
from .requests.community_submission import CommunitySubmission
33-
from .resources.serializers import CrossrefXMLSerializer, DataCite45JSONSerializer
33+
from .resources.serializers import DataCite45JSONSerializer
3434
from .services import facets
3535
from .services.config import lock_edit_published_files
3636
from .services.permissions import RDMRecordPermissionPolicy
@@ -536,14 +536,15 @@ def always_valid(identifier):
536536
"""
537537

538538
RDM_PERSISTENT_IDENTIFIERS = {
539-
# DOI automatically removed if DATACITE_ENABLED is False.
539+
# DOI automatically removed if DATACITE_ENABLED and CROSSREF_ENABLED are False.
540540
"doi": {
541-
"providers": ["datacite", "external"],
541+
"providers": ["datacite", "crossref", "external"],
542542
"required": True,
543543
"label": _("DOI"),
544544
"validator": idutils.is_doi,
545545
"normalizer": idutils.normalize_doi,
546-
"is_enabled": providers.DataCitePIDProvider.is_enabled,
546+
"is_enabled": providers.DataCitePIDProvider.is_enabled
547+
or providers.CrossrefPIDProvider.is_enabled,
547548
"ui": {"default_selected": "yes"}, # "yes", "no" or "not_needed"
548549
},
549550
"oai": {
@@ -612,6 +613,9 @@ def always_valid(identifier):
612613
DATACITE_PREFIX = ""
613614
"""DataCite DOI prefix."""
614615

616+
DATACITE_ADDITIONAL_PREFIXES = []
617+
"""List of additional DataCite DOI prefixes supported for registration."""
618+
615619
DATACITE_TEST_MODE = True
616620
"""DataCite test mode enabled."""
617621

@@ -651,6 +655,9 @@ def make_doi(prefix, record):
651655
CROSSREF_PREFIX = ""
652656
"""Crossref DOI prefix."""
653657

658+
CROSSREF_ADDITIONAL_PREFIXES = []
659+
"""List of additional Crossref DOI prefixes supported for registration."""
660+
654661
CROSSREF_DEPOSITOR = ""
655662
"""Crossref depositor name."""
656663

invenio_rdm_records/resources/serializers/crossref/__init__.py

Lines changed: 61 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
CrossrefError,
1313
CrossrefXMLSchema,
1414
Metadata,
15+
tostring,
1516
write_crossref_xml,
1617
)
1718
from flask import current_app
@@ -24,39 +25,86 @@ class CrossrefXMLSerializer(MarshmallowSerializer):
2425

2526
def __init__(self, **options):
    """Constructor.

    :param options: keyword arguments forwarded to the parent serializer.
        An ``encoder`` entry, if given, replaces the default ``tostring``
        encoder.
    """
    # Put the default into ``options`` instead of also passing ``encoder=``
    # explicitly: supplying it both ways raises ``TypeError`` (multiple
    # values for keyword argument 'encoder') as soon as a caller provides
    # their own encoder.
    options.setdefault("encoder", tostring)
    super().__init__(
        format_serializer_cls=SimpleSerializer,
        object_schema_cls=CrossrefXMLSchema,
        list_schema_cls=BaseListSchema,
        **options,
    )
3436

35-
def dump_obj(self, record):
37+
def serialize_object(self, obj):
    """Return the Crossref XML produced by ``dump_obj`` for one record.

    The default implementation is overridden so that the output of
    ``dump_obj``, which is already serialized XML, is handed back unchanged
    rather than being encoded a second time.

    :param obj: the record to serialize.
    """
    xml_payload = self.dump_obj(obj)
    return xml_payload
44+
45+
def dump_obj(self, record, url=None):
    """Dump a single record as Crossref XML.

    The ``CROSSREF_DEPOSITOR``, ``CROSSREF_EMAIL`` and ``CROSSREF_REGISTRANT``
    config variables populate the XML head element.

    :param record: Record instance (dict, Record model, or ChainObject).
    :param url: the landing page URL for the DOI.
        Falls back to ``SITE_UI_URL``/records/<id> if not provided.
    :returns: the output of ``tostring`` for the generated Crossref XML, or
        an empty string if a ``CrossrefError`` was raised.
    """
    is_mapping = isinstance(record, dict)
    record_id = record.get("id") if is_mapping else getattr(record, "id", None)

    # ``metadata`` may be missing, ``None`` or not dict-like depending on
    # the kind of record object; only read identifiers when it offers
    # ``.get``.
    if is_mapping:
        record_metadata = record.get("metadata", {})
    else:
        record_metadata = getattr(record, "metadata", None)
    get_field = getattr(record_metadata, "get", None)
    identifiers = get_field("identifiers", []) if callable(get_field) else []

    # Determine the URL that the DOI resolves to, in the following order:
    #
    # 1. identifier of type url in ``metadata.identifiers``
    #    (e.g. archived original content)
    # 2. The landing page URL passed by the PID service
    # 3. Default constructed from ``SITE_UI_URL`` and record ID
    #    (e.g. for Celery tasks or tests without UI endpoints)
    identifier_url = next(
        (
            item.get("identifier")
            for item in (identifiers or [])
            if item.get("scheme") == "url" and item.get("identifier") is not None
        ),
        None,
    )
    registered_url = identifier_url or url

    # Treat an empty string like a missing URL so the default still applies.
    if not registered_url:
        site_url = current_app.config.get("SITE_UI_URL", "")
        if site_url and record_id:
            # Avoid a double slash when SITE_UI_URL ends with "/".
            registered_url = f"{site_url.rstrip('/')}/records/{record_id}"

    head = {
        "depositor": current_app.config.get("CROSSREF_DEPOSITOR"),
        "email": current_app.config.get("CROSSREF_EMAIL"),
        "registrant": current_app.config.get("CROSSREF_REGISTRANT"),
    }

    # Convert the metadata to crossref_xml format via the commonmeta intermediary format.
    # XML Schema validation errors raise CrossrefError.
    try:
        metadata = Metadata(
            record,
            via="inveniordm",
            url=registered_url,
        )
        crossref_xml = write_crossref_xml(metadata)
        return tostring(crossref_xml, head=head)
    except CrossrefError as e:
        # Log the record id rather than ``metadata.id``: ``metadata`` is
        # unbound when ``Metadata(...)`` itself raised, which would turn the
        # handled error into an ``UnboundLocalError``.
        current_app.logger.error(
            f"CrossrefError while converting {record_id} to Crossref XML: {str(e)}"
        )
        return ""
58-
59-
@classmethod
60-
def crossref_xml_tostring(cls, record):
61-
"""Stringify a Crossref XML record."""
62-
return record

invenio_rdm_records/services/components/pids.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,34 @@ def publish(self, identity, draft=None, record=None):
304304
# already published records that don't have one (i.e. legacy records).
305305
# Create all missing PIDs (this happens only on first publish)
306306
missing_required_schemes = required_schemes - current_schemes
307+
308+
# Clean up any PIDs without identifier BEFORE adding new ones
309+
# This handles cases where previous runs may have set provider/prefix without identifier
310+
for scheme in list(current_pids.keys()):
311+
if "identifier" not in current_pids[scheme]:
312+
del current_pids[scheme]
313+
314+
# Copy provider and prefix from child record PIDs to ensure consistency
315+
child_pids = draft.get("pids", {})
316+
for scheme in missing_required_schemes:
317+
# Only add provider/prefix info if:
318+
# 1. Child has this PID type
319+
# 2. Parent doesn't already have it in current_pids (after cleanup)
320+
# 3. Parent doesn't have it in the actual record (defensive check)
321+
if (
322+
scheme in child_pids
323+
and scheme not in current_pids
324+
and not record.parent.pids.get(scheme)
325+
):
326+
# Copy provider from child to parent
327+
current_pids[scheme] = {
328+
"provider": child_pids[scheme].get("provider"),
329+
}
330+
# Extract prefix from the child's identifier
331+
child_identifier = child_pids[scheme].get("identifier", "")
332+
if child_identifier and "/" in child_identifier:
333+
current_pids[scheme]["prefix"] = child_identifier.split("/")[0]
334+
307335
pids = self.service.pids.parent_pid_manager.create_all(
308336
record.parent,
309337
pids=current_pids,

invenio_rdm_records/services/pids/manager.py

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,11 @@ def create(self, draft, scheme, identifier=None, provider_name=None):
143143
"""
144144
provider = self._get_provider(scheme, provider_name)
145145
pid_attrs = {}
146+
# Extract prefix from existing PID metadata if available
147+
prefix = (
148+
draft.pids.get(scheme, {}).get("prefix") if draft.pids.get(scheme) else None
149+
)
150+
146151
if identifier is not None:
147152
try:
148153
pid = provider.get(identifier)
@@ -157,7 +162,9 @@ def create(self, draft, scheme, identifier=None, provider_name=None):
157162
)
158163
pid_attrs = {"identifier": identifier, "provider": provider.name}
159164
else:
160-
if draft.pids.get(scheme):
165+
# Only raise error if an identifier already exists
166+
# PIDs with only provider/prefix (without identifier) are incomplete and should be allowed
167+
if draft.pids.get(scheme, {}).get("identifier"):
161168
raise ValidationError(
162169
message=_("A PID already exists for type {scheme}").format(
163170
scheme=scheme
@@ -169,7 +176,11 @@ def create(self, draft, scheme, identifier=None, provider_name=None):
169176
message=_("External identifier value is required."),
170177
field_name=f"pids.{scheme}",
171178
)
172-
pid = provider.create(draft)
179+
# Generate ID with optional prefix override
180+
pid_kwargs = {}
181+
if prefix:
182+
pid_kwargs["prefix"] = prefix
183+
pid = provider.create(draft, **pid_kwargs)
173184
pid_attrs = {"identifier": pid.pid_value, "provider": provider.name}
174185

175186
if provider.client: # provider and identifier already in dict
@@ -183,16 +194,42 @@ def create_all(self, draft, pids=None, schemes=None):
183194

184195
# Create with an identifier value provided
185196
for scheme, pid_attrs in (pids or {}).items():
197+
# Temporarily store prefix in draft.pids for create() to read
198+
prefix = pid_attrs.get("prefix")
199+
if prefix:
200+
if scheme not in draft.pids:
201+
draft.pids[scheme] = {"prefix": prefix}
202+
elif "prefix" not in draft.pids[scheme]:
203+
draft.pids[scheme]["prefix"] = prefix
204+
186205
result[scheme] = self.create(
187206
draft,
188207
scheme,
189-
pid_attrs["identifier"],
208+
pid_attrs.get("identifier"),
190209
pid_attrs.get("provider"),
191210
)
192211

193212
# Create without an identifier value provided (only the scheme)
213+
# Use provider and prefix from pids dict if available
194214
for scheme in schemes or []:
195-
result[scheme] = self.create(draft, scheme)
215+
pid_attrs = (pids or {}).get(scheme, {})
216+
provider_name = pid_attrs.get("provider")
217+
218+
# Temporarily store prefix in draft.pids for create() to read
219+
prefix = pid_attrs.get("prefix")
220+
if prefix and scheme not in draft.pids:
221+
draft.pids[scheme] = {"prefix": prefix}
222+
elif prefix and scheme in draft.pids:
223+
# Preserve existing prefix if not already set
224+
if "prefix" not in draft.pids[scheme]:
225+
draft.pids[scheme]["prefix"] = prefix
226+
227+
result[scheme] = self.create(draft, scheme, provider_name=provider_name)
228+
229+
# Strip transient 'prefix' field from results - it's not part of the
230+
# JSON schema and should not be persisted on the record.
231+
for scheme_attrs in result.values():
232+
scheme_attrs.pop("prefix", None)
196233

197234
return result
198235

@@ -247,10 +284,7 @@ def discard(self, scheme, identifier, provider_name=None, soft_delete=False):
247284
if not provider.can_modify(pid) and not soft_delete:
248285
raise ValidationError(
249286
message=[
250-
_(
251-
"Cannot discard a reserved or registered persistent "
252-
"identifier."
253-
),
287+
_("Cannot discard a reserved or registered persistent identifier."),
254288
],
255289
field_name=f"pids.{scheme}",
256290
)
@@ -303,5 +337,5 @@ def create_and_reserve(self, record, **kwargs):
303337
"""Create and reserve a PID for the given record, and update the record with the reserved PID."""
304338
pids = record.get("pids", {})
305339
provider_pid_dicts = self._get_providers(pids)
306-
for provider, _ in provider_pid_dicts:
340+
for provider, pid_dict in provider_pid_dicts:
307341
provider.create_and_reserve(record)

invenio_rdm_records/services/pids/providers/base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def create(self, record, pid_value=None, status=None, **kwargs):
8383
if pid_value is None:
8484
if not self.is_managed():
8585
raise ValueError(_("You must provide a pid value."))
86-
pid_value = self.generate_id(record)
86+
pid_value = self.generate_id(record, **kwargs)
8787

8888
try:
8989
pid = self.get(pid_value)
@@ -98,10 +98,14 @@ def create(self, record, pid_value=None, status=None, **kwargs):
9898
status=status or self.default_status,
9999
)
100100

101-
# re-activate if previously deleted
101+
# PID exists - check if it can be reused
102+
# Re-activate if previously deleted
102103
if pid.is_deleted():
103104
pid.sync_status(PIDStatus.NEW)
104105
return pid
106+
# If PID exists and belongs to the same record, return it (idempotent)
107+
elif pid.object_uuid == record.id:
108+
return pid
105109
else:
106110
raise PIDAlreadyExists(self.pid_type, pid_value)
107111

0 commit comments

Comments
 (0)