Skip to content

Commit e96f0ff

Browse files
authored
Support METS of DSpace transfer
1 parent 25e9783 commit e96f0ff

File tree

6 files changed

+3662
-11
lines changed

6 files changed

+3662
-11
lines changed

fixtures/mets_with_dmdsecs_in_filesec.xml

Lines changed: 3468 additions & 0 deletions
Large diffs are not rendered by default.

metsrw/fsentry.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,9 @@ def _add_metadata_element(self, md, subsection, mdtype, mode="mdwrap", **kwargs)
263263
loctype = kwargs.get("loctype")
264264
label = kwargs.get("label")
265265
otherloctype = kwargs.get("otherloctype")
266-
mdsec = MDRef(md, mdtype, loctype, label, otherloctype)
266+
xptr = kwargs.get("xptr")
267+
othermdtype = kwargs.get("othermdtype")
268+
mdsec = MDRef(md, mdtype, loctype, label, otherloctype, xptr, othermdtype)
267269
subsection = SubSection(subsection, mdsec)
268270
if subsection.subsection == "dmdSec":
269271
self.dmdsecs.append(subsection)
@@ -425,6 +427,8 @@ def serialize_filesec(self):
425427
el.attrib["GROUPID"] = self.group_id()
426428
if self.admids:
427429
el.set("ADMID", " ".join(self.admids))
430+
if self.dmdids and self.use == "original":
431+
el.set("DMDID", " ".join(self.dmdids))
428432
if self.checksum and self.checksumtype:
429433
el.attrib["CHECKSUM"] = self.checksum
430434
el.attrib["CHECKSUMTYPE"] = self.checksumtype

metsrw/metadata.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,16 @@ class MDRef:
433433

434434
VALID_LOCTYPE = ("ARK", "URN", "URL", "PURL", "HANDLE", "DOI", "OTHER")
435435

436-
def __init__(self, target, mdtype, loctype, label=None, otherloctype=None):
436+
def __init__(
437+
self,
438+
target,
439+
mdtype,
440+
loctype,
441+
label=None,
442+
otherloctype=None,
443+
xptr=None,
444+
othermdtype=None,
445+
):
437446
self.target = target
438447
self.mdtype = mdtype
439448
self.loctype = loctype
@@ -443,6 +452,8 @@ def __init__(self, target, mdtype, loctype, label=None, otherloctype=None):
443452
)
444453
self.label = label
445454
self.otherloctype = otherloctype
455+
self.xptr = xptr
456+
self.othermdtype = othermdtype
446457

447458
@classmethod
448459
def parse(cls, root):
@@ -475,8 +486,10 @@ def parse(cls, root):
475486
# Optional attributes
476487
label = root.get("LABEL")
477488
otherloctype = root.get("OTHERLOCTYPE")
489+
xptr = root.get("XPTR")
490+
othermdtype = root.get("OTHERMDTYPE")
478491

479-
return cls(target, mdtype, loctype, label, otherloctype)
492+
return cls(target, mdtype, loctype, label, otherloctype, xptr, othermdtype)
480493

481494
def serialize(self):
482495
# If the source document is a METS document, the XPTR attribute of
@@ -491,7 +504,9 @@ def serialize(self):
491504
]
492505
XPTR = "xpointer(id('{}'))".format(" ".join(dmdsecs))
493506
except Exception:
494-
pass
507+
# Otherwise use the Xpointer passed to the constructor.
508+
if self.xptr is not None:
509+
XPTR = self.xptr
495510

496511
el = etree.Element(utils.lxmlns("mets") + "mdRef")
497512
if self.label:
@@ -510,6 +525,8 @@ def serialize(self):
510525
el.attrib["OTHERLOCTYPE"] = self.otherloctype
511526
if XPTR:
512527
el.attrib["XPTR"] = XPTR
528+
if self.othermdtype:
529+
el.attrib["OTHERMDTYPE"] = self.othermdtype
513530
return el
514531

515532

metsrw/mets.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
AIP_ENTRY_TYPE = "archival information package"
2121
FPtr = namedtuple(
2222
"FPtr",
23-
"file_uuid derived_from use path amdids checksum checksumtype fileid transform_files",
23+
"file_uuid derived_from use path amdids dmdids checksum checksumtype fileid transform_files",
2424
)
2525
TRANSFORM_PREFIX = "TRANSFORM"
2626
TRANSFORM_PREFIX_LEN = len(TRANSFORM_PREFIX)
@@ -300,17 +300,41 @@ def _filesec(self, files=None):
300300
# Get fileGrp, or create if not exist
301301
filegrp = filegrps.get(file_.use)
302302
if filegrp is None:
303-
filegrp = etree.SubElement(
304-
filesec, utils.lxmlns("mets") + "fileGrp", USE=file_.use
305-
)
303+
filegrp = etree.Element(utils.lxmlns("mets") + "fileGrp", USE=file_.use)
306304
filegrps[file_.use] = filegrp
307305

308306
file_el = file_.serialize_filesec()
309307
if file_el is not None:
310308
filegrp.append(file_el)
309+
for filegrp in self._sort_filegrps(filegrps):
310+
filesec.append(filegrp)
311311

312312
return filesec
313313

314+
def _sort_filegrps(self, filegrps):
315+
uses_order = [
316+
"original",
317+
"submissionDocumentation",
318+
"preservation",
319+
"service",
320+
"access",
321+
"license",
322+
"text/ocr",
323+
"metadata",
324+
"derivative",
325+
]
326+
result = []
327+
count = len(filegrps)
328+
for i, use in enumerate(filegrps.keys()):
329+
filegrp = filegrps[use]
330+
try:
331+
filegrp_position = uses_order.index(use)
332+
except ValueError:
333+
filegrp_position = count + i
334+
result.append((filegrp_position, filegrp))
335+
336+
return [v for i, v in sorted(result)]
337+
314338
def serialize(self, fully_qualified=True, normative_structmap=True):
315339
"""
316340
Returns this document serialized to an xml Element.
@@ -401,6 +425,7 @@ def _parse_tree_structmap(self, tree, parent_elem, normative_parent_elem=None):
401425
fs_entry = fsentry.FSEntry.from_fptr(
402426
label=None, type_="Item", fptr=fptr
403427
)
428+
self._add_dmdsecs_to_fs_entry(elem, fs_entry, fptr.dmdids)
404429
self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
405430
siblings.append(fs_entry)
406431
continue
@@ -409,7 +434,7 @@ def _parse_tree_structmap(self, tree, parent_elem, normative_parent_elem=None):
409434
continue
410435
fptr = self._analyze_fptr(fptr_elems[0], tree, entry_type)
411436
fs_entry = fsentry.FSEntry.from_fptr(label, entry_type, fptr)
412-
self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree)
437+
self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree, fptr.dmdids)
413438
self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
414439
siblings.append(fs_entry)
415440
return siblings
@@ -466,6 +491,7 @@ def _analyze_fptr(fptr_elem, tree, entry_type):
466491
" URL.".format(path)
467492
)
468493
amdids = file_elem.get("ADMID")
494+
dmdids = file_elem.get("DMDID")
469495
checksum = file_elem.get("CHECKSUM")
470496
checksumtype = file_elem.get("CHECKSUMTYPE")
471497
file_id_prefix = utils.FILE_ID_PREFIX
@@ -510,15 +536,21 @@ def _analyze_fptr(fptr_elem, tree, entry_type):
510536
use,
511537
path,
512538
amdids,
539+
dmdids,
513540
checksum,
514541
checksumtype,
515542
file_id,
516543
transform_files,
517544
)
518545

519546
@staticmethod
520-
def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree):
521-
for dmdid in elem.get("DMDID", "").split():
547+
def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree, dmdids=None):
548+
dmdids_to_add = elem.get("DMDID", "").split()
549+
if dmdids is not None:
550+
dmdids_to_add.extend(
551+
[dmdid for dmdid in dmdids.split() if dmdid not in dmdids_to_add]
552+
)
553+
for dmdid in dmdids_to_add:
522554
dmdsec_elem = tree.find(
523555
'mets:dmdSec[@ID="' + dmdid + '"]', namespaces=utils.NAMESPACES
524556
)

tests/test_metadata.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,13 +398,17 @@ def test_create_extra_params(self):
398398
label="Label",
399399
loctype="OTHER",
400400
otherloctype="OUTSIDE",
401+
xptr="xpointer(id('dmdSec_366 dmdSec_367'))",
402+
othermdtype="METSRIGHTS",
401403
)
402404
mdreffed = mdref.serialize()
403405

404406
assert mdreffed.get("LOCTYPE") == "OTHER"
405407
assert mdreffed.get("OTHERLOCTYPE") == "OUTSIDE"
406408
assert mdreffed.get(metsrw.lxmlns("xlink") + "href") == "path/to/file.txt"
407409
assert mdreffed.get("MDTYPE") == "OTHER"
410+
assert mdreffed.get("XPTR") == "xpointer(id('dmdSec_366 dmdSec_367'))"
411+
assert mdreffed.get("OTHERMDTYPE") == "METSRIGHTS"
408412

409413
def test_create_bad_loctype(self):
410414
metsrw.MDRef(None, None, loctype="ARK")

tests/test_mets.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,31 @@ def test_analyze_fptr(self):
350350
use="original",
351351
path="objects/AM68.csv",
352352
amdids="amdSec_3",
353+
dmdids=None,
354+
checksum=None,
355+
checksumtype=None,
356+
transform_files=[],
357+
)
358+
359+
def test_analyze_fptr_with_dmdsecs_in_filesec(self):
360+
parser = etree.XMLParser(remove_blank_text=True)
361+
tree = etree.parse("fixtures/mets_with_dmdsecs_in_filesec.xml", parser=parser)
362+
fptr_elem = tree.find(
363+
'/mets:structMap[@TYPE="physical"]//mets:div[@LABEL="bitstream_8266.pdf"]/mets:fptr',
364+
namespaces=metsrw.utils.NAMESPACES,
365+
)
366+
367+
# Test the integrity of the ``FPtr`` object returned.
368+
mw = metsrw.METSDocument()
369+
fptr = mw._analyze_fptr(fptr_elem, tree, "Item")
370+
assert fptr == metsrw.mets.FPtr(
371+
fileid="file-33f5f35a-8bde-4b94-b7cd-3d2c8b8f7a23",
372+
file_uuid="33f5f35a-8bde-4b94-b7cd-3d2c8b8f7a23",
373+
derived_from=None,
374+
use="original",
375+
path="objects/ITEM_2429-2700.zip-2023-07-07T23_05_58.201656_00_00/bitstream_8266.pdf",
376+
amdids="amdSec_3",
377+
dmdids="dmdSec_3 dmdSec_4",
353378
checksum=None,
354379
checksumtype=None,
355380
transform_files=[],
@@ -1001,6 +1026,107 @@ def test_serialize_normative_structmap(self):
10011026
tree = mw.serialize(normative_structmap=False)
10021027
assert tree.find(xpath, namespaces=metsrw.NAMESPACES) is None
10031028

1029+
def test_dspace_mets_dmdsecs(self):
    """Check the two dmdSecs attached to the original file of a DSpace METS."""
    document = metsrw.METSDocument.fromfile(
        "fixtures/mets_with_dmdsecs_in_filesec.xml"
    )
    entry = document.get_file(
        type="Item",
        path="objects/ITEM_2429-2700.zip-2023-07-07T23_05_58.201656_00_00/bitstream_8266.pdf",
    )

    # Exactly two dmdSecs are expected on the original object.
    assert len(entry.dmdsecs) == 2

    # First dmdSec: an mdRef carrying Xpointers to the descriptive metadata
    # of the original mets.xml files exported from DSpace.
    mdref_sections = [
        section
        for section in entry.dmdsecs
        if isinstance(section.contents, metsrw.MDRef)
    ]
    xpointer_dmdsec = mdref_sections[0].serialize()
    assert xpointer_dmdsec.attrib.get("STATUS") == "original"
    assert xpointer_dmdsec.attrib.get("CREATED") == "2023-07-07T23:06:15"

    xpointer = xpointer_dmdsec.find(
        "mets:mdRef",
        namespaces=metsrw.utils.NAMESPACES,
    )
    assert xpointer is not None
    expected_mdref_attrs = {
        "LABEL": "mets.xml-Group-33f5f35a-8bde-4b94-b7cd-3d2c8b8f7a23",
        "MDTYPE": "OTHER",
        "LOCTYPE": "OTHER",
        "OTHERLOCTYPE": "SYSTEM",
        "XPTR": "xpointer(id('dmdSec_366 dmdSec_367'))",
    }
    for attr_name, attr_value in expected_mdref_attrs.items():
        assert xpointer.attrib.get(attr_name) == attr_value

    # Second dmdSec: Dublin Core expressing the parent-child relationship
    # between a DSpace object and its collection, identified by handles.
    mdwrap_sections = [
        section
        for section in entry.dmdsecs
        if isinstance(section.contents, metsrw.MDWrap)
    ]
    dc_dmdsec = mdwrap_sections[0].serialize()
    assert dc_dmdsec.attrib.get("STATUS") == "original"
    assert dc_dmdsec.attrib.get("CREATED") == "2023-07-07T23:06:15"

    identifier = dc_dmdsec.find(
        'mets:mdWrap[@MDTYPE="DC"]/mets:xmlData/dcterms:dublincore/dc:identifier',
        namespaces=metsrw.utils.NAMESPACES,
    )
    assert identifier is not None
    assert identifier.text == "hdl:2429/2700"

    terms = dc_dmdsec.find(
        'mets:mdWrap[@MDTYPE="DC"]/mets:xmlData/dcterms:dublincore/dcterms:isPartOf',
        namespaces=metsrw.utils.NAMESPACES,
    )
    assert terms is not None
    assert terms.text == "hdl:2429/1314"
1087+
1088+
def test_dspace_mets_amdsec(self):
    """Check the rights mdRef in the amdSec of a DSpace original file."""
    document = metsrw.METSDocument.fromfile(
        "fixtures/mets_with_dmdsecs_in_filesec.xml"
    )
    entry = document.get_file(
        type="Item",
        path="objects/ITEM_2429-2700.zip-2023-07-07T23_05_58.201656_00_00/bitstream_8266.pdf",
    )

    # The original object of a DSpace transfer should carry Xpointers to the
    # rights metadata of the original mets.xml files exported from DSpace.
    mdrefs = [
        subsection.contents
        for subsection in entry.amdsecs[0].subsections
        if isinstance(subsection.contents, metsrw.MDRef)
    ]
    rights_md = mdrefs[0].serialize()

    expected_attrs = {
        "LABEL": "mets.xml-988d7030-3cde-43f2-ac1f-2b8cf9d5a70b",
        "MDTYPE": "OTHER",
        "OTHERMDTYPE": "METSRIGHTS",
        "LOCTYPE": "OTHER",
        "OTHERLOCTYPE": "SYSTEM",
        "XPTR": "xpointer(id('rightsMD_371 rightsMD_374 rightsMD_384 rightsMD_393 rightsMD_401 rightsMD_409 rightsMD_417 rightsMD_425'))",
    }
    for attr_name, attr_value in expected_attrs.items():
        assert rights_md.attrib.get(attr_name) == attr_value
1116+
1117+
def test_dspace_filegrp_sorting_in_filesec(self):
    """Check the order of the fileGrp USE values in the built fileSec."""
    document = metsrw.METSDocument.fromfile(
        "fixtures/mets_with_dmdsecs_in_filesec.xml"
    )
    expected_uses = [
        "original",
        "submissionDocumentation",
        "preservation",
        "license",
        "text/ocr",
    ]

    observed_uses = []
    for filegrp in document._filesec():
        observed_uses.append(filegrp.attrib["USE"])

    assert observed_uses == expected_uses
1129+
10041130

10051131
@pytest.mark.parametrize(
10061132
"mets_path, expected_counts",

0 commit comments

Comments
 (0)