Skip to content

Commit 0ba0781

Browse files
committed
Use @Normal attribute for ASpace EAD dates
Why these changes are being introduced: We received a Sentry error that a record had an invalid date range. As described in the linked ticket, it became apparent this was related to a combination of date parsing and validation, and how they vary between Transmogrifier and OpenSearch environments. It has been noted that the EAD unitdate field @Normal attribute might contain a more normalized form of the date that could help with these kinds of errors or ambiguity. How this addresses that need: The date parsing for ASpace EADs has been updated to use the @Normal attribute, which provides a cleaner and more predictable string to work with for splitting as a date range. Side effects of this change: There are 60 ASpace EADs that do not have a @Normal attribute in their archdesc.unitdate element. For these, it's possible they will lose a date value if they had been parsed previously. But it's believed this will not be the case, given their element text values were all also somewhat unusual. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-235 * https://mitlibraries.atlassian.net/browse/TIMX-92
1 parent 5b880eb commit 0ba0781

File tree

4 files changed

+87
-99
lines changed

4 files changed

+87
-99
lines changed

tests/fixtures/ead/ead_record_all_fields.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
<num>VC.0002</num>
3333
</unittitle>
3434
<unittitle>Title 3</unittitle>
35-
<unitdate certainty="approximate" datechar="creation">1905-2012</unitdate>
35+
<unitdate certainty="approximate" datechar="creation" normal="1905/2012">1905-2012</unitdate>
3636
<unitid>
3737
1234
3838
</unitid>

tests/fixtures/ead/ead_record_attribute_and_subfield_variations.xml

+10-30
Original file line numberDiff line numberDiff line change
@@ -26,36 +26,16 @@
2626
<emph></emph>
2727
<num></num>
2828
</unittitle>
29-
<unitdate datechar=""></unitdate>
30-
<unitdate datechar="">1905-2012</unitdate>
31-
<unitdate datechar="">1905</unitdate>
32-
<unitdate certainty=""></unitdate>
33-
<unitdate certainty="">1905-2012</unitdate>
34-
<unitdate certainty="">1905</unitdate>
35-
<unitdate datechar="" certainty=""></unitdate>
36-
<unitdate datechar="" certainty="">1905-2012</unitdate>
37-
<unitdate datechar="" certainty="">1905</unitdate>
38-
<unitdate datechar="creation"></unitdate>
39-
<unitdate datechar="creation">1905-2012</unitdate>
40-
<unitdate datechar="creation">1905</unitdate>
41-
<unitdate datechar="creation">abcd-efgh</unitdate>
42-
<unitdate datechar="creation">abcd</unitdate>
43-
<unitdate datechar="creation" certainty=""></unitdate>
44-
<unitdate datechar="creation" certainty="">1905-2012</unitdate>
45-
<unitdate datechar="creation" certainty="">1905</unitdate>
46-
<unitdate datechar="creation" certainty="approximate"></unitdate>
47-
<unitdate datechar="creation" certainty="approximate">1905-2012</unitdate>
48-
<unitdate datechar="creation" certainty="approximate">1905</unitdate>
49-
<unitdate datechar="creation" certainty="approximate">abcd-efgh</unitdate>
50-
<unitdate datechar="creation" certainty="approximate">abcd</unitdate>
51-
<unitdate certainty="approximate"></unitdate>
52-
<unitdate certainty="approximate">1905-2012</unitdate>
53-
<unitdate certainty="approximate">1905</unitdate>
54-
<unitdate certainty="approximate">abcd-efgh</unitdate>
55-
<unitdate certainty="approximate">abcd</unitdate>
56-
<unitdate datechar="" certainty="approximate"></unitdate>
57-
<unitdate datechar="" certainty="approximate">1905-2012</unitdate>
58-
<unitdate datechar="" certainty="approximate">1905</unitdate>
29+
<unitdate normal="1905/2012">1905-2012</unitdate>
30+
<unitdate datechar="creation" normal="1905/2012">1905-2012</unitdate>
31+
<unitdate certainty="approximate" normal="1905/2012">1905-2012</unitdate>
32+
<unitdate datechar="creation" certainty="approximate" normal="1905/2012">1905-2012</unitdate>
33+
<unitdate normal="1953-11-09/1953-11-10">1953 November 9–10</unitdate>
34+
<unitdate normal="1969-03-04/1969-03-04">1969-03-04</unitdate> <!-- same date in normal range, collapses to single date -->
35+
<unitdate normal="2023">2023</unitdate> <!-- non-range value unlikely but maybe possible -->
36+
<unitdate>1984-1989</unitdate> <!-- missing @normal attribute, skipped -->
37+
<unitdate normal="undated">undated</unitdate> <!-- invalid single date, skipped -->
38+
<unitdate normal="2001/1999">2001-1999</unitdate> <!-- invalid range, skipped -->
5939
<unitid>
6040
<emph>Data enclosed in subelement</emph>
6141
</unitid>

tests/test_ead.py

+16-36
Original file line numberDiff line numberDiff line change
@@ -385,41 +385,24 @@ def test_ead_record_with_attribute_and_subfield_variations_transforms_correctly(
385385
timdex.Date(
386386
range=timdex.Date_Range(gte="1905", lte="2012"),
387387
),
388-
timdex.Date(value="1905"),
389-
timdex.Date(
390-
range=timdex.Date_Range(gte="1905", lte="2012"),
391-
),
392-
timdex.Date(value="1905"),
393-
timdex.Date(
394-
range=timdex.Date_Range(gte="1905", lte="2012"),
395-
),
396-
timdex.Date(value="1905"),
397-
timdex.Date(
398-
kind="creation",
399-
range=timdex.Date_Range(gte="1905", lte="2012"),
400-
),
401-
timdex.Date(kind="creation", value="1905"),
402388
timdex.Date(
403389
kind="creation",
404390
range=timdex.Date_Range(gte="1905", lte="2012"),
405391
),
406-
timdex.Date(kind="creation", value="1905"),
407392
timdex.Date(
408-
kind="creation",
409393
note="approximate",
410394
range=timdex.Date_Range(gte="1905", lte="2012"),
411395
),
412-
timdex.Date(kind="creation", note="approximate", value="1905"),
413396
timdex.Date(
397+
kind="creation",
414398
note="approximate",
415399
range=timdex.Date_Range(gte="1905", lte="2012"),
416400
),
417-
timdex.Date(note="approximate", value="1905"),
418401
timdex.Date(
419-
note="approximate",
420-
range=timdex.Date_Range(gte="1905", lte="2012"),
402+
range=timdex.Date_Range(gte="1953-11-09", lte="1953-11-10"),
421403
),
422-
timdex.Date(note="approximate", value="1905"),
404+
timdex.Date(value="1969-03-04"),
405+
timdex.Date(value="2023"),
423406
],
424407
identifiers=[
425408
timdex.Identifier(
@@ -509,21 +492,18 @@ def test_ead_record_invalid_date_and_date_range_are_omitted(caplog):
509492
"tests/fixtures/ead/ead_record_attribute_and_subfield_variations.xml"
510493
)
511494
output_record = next(Ead("aspace", ead_xml_records))
512-
assert "abcd" not in [d.value for d in output_record.dates]
513-
assert "abcd" not in [
514-
d.range.gte for d in output_record.dates if "gte" in dir(d.range)
515-
]
516-
assert "efgh" not in [
517-
d.range.lte for d in output_record.dates if "lte" in dir(d.range)
518-
]
519-
assert (
520-
"Record ID 'repositories/2/resources/6' has invalid values in a date range: "
521-
"'abcd', 'efgh'"
522-
) in caplog.text
523-
assert (
524-
"Record ID 'repositories/2/resources/6' has a date that couldn't be parsed: "
525-
"'abcd'"
526-
) in caplog.text
495+
496+
for date in output_record.dates:
497+
assert date.value != "undated"
498+
assert date.value != "1984"
499+
if date.range is not None:
500+
assert date.range.gte != "1984"
501+
assert date.range.lte != "1989"
502+
assert date.range.gte != "2001"
503+
assert date.range.lte != "1999"
504+
505+
assert ("has a date that couldn't be parsed: 'undated'") in caplog.text
506+
assert ("has a later start date than end date: '2001', '1999'") in caplog.text
527507

528508

529509
def test_ead_record_correct_identifiers_from_multiple_unitid(caplog):

transmogrifier/sources/ead.py

+60-32
Original file line numberDiff line numberDiff line change
@@ -103,39 +103,11 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
103103
identifier=self.generate_name_identifier_url(name_element),
104104
)
105105
)
106+
106107
# dates
107-
for date_element in collection_description_did.find_all("unitdate"):
108-
if date_value := self.create_string_from_mixed_value(
109-
date_element,
110-
" ",
111-
):
112-
date_instance = timdex.Date()
113-
if "-" in date_value:
114-
split = date_value.index("-")
115-
gte_date = date_value[:split].strip()
116-
lte_date = date_value[split + 1 :].strip()
117-
if validate_date_range(
118-
gte_date,
119-
lte_date,
120-
source_record_id,
121-
):
122-
date_instance.range = timdex.Date_Range(
123-
gte=gte_date,
124-
lte=lte_date,
125-
)
126-
else:
127-
date_instance.value = (
128-
date_value.strip()
129-
if validate_date(
130-
date_value,
131-
source_record_id,
132-
)
133-
else None
134-
)
135-
if date_instance.range or date_instance.value:
136-
date_instance.kind = date_element.get("datechar") or None
137-
date_instance.note = date_element.get("certainty") or None
138-
fields.setdefault("dates", []).append(date_instance)
108+
dates = self.parse_dates(collection_description_did, source_record_id)
109+
if dates:
110+
fields.setdefault("dates", []).extend(dates)
139111

140112
# edition field not used in EAD
141113

@@ -452,3 +424,59 @@ def parse_mixed_value(
452424
elif isinstance(item, Tag) and item.name not in skipped_elements:
453425
for child in item.children:
454426
yield from cls.parse_mixed_value(child, skipped_elements)
427+
428+
def parse_dates(
    self, collection_description_did: Tag, source_record_id: str
) -> list[timdex.Date]:
    """
    Parse dates from the unitdate elements of a collection description.

    Only unitdate elements with a non-empty @normal attribute are used; the
    element text is ignored. The @normal value is handled as follows:

    * "start/end" with different halves produces a date range, if
      validate_date_range() accepts it.
    * "start/end" with identical halves collapses to a single date value.
    * A value without "/" is treated as a single date value.

    Dates that fail validation are omitted from the result.
    NOTE(review): validate_date() and validate_date_range() are defined
    elsewhere; presumably they log the problem and return a falsy value.

    Args:
        collection_description_did: Element searched for unitdate children.
        source_record_id: Record identifier passed to the validators.

    Returns:
        A list of timdex.Date instances, possibly empty. Each has either a
        range or a value set, plus kind (@datechar) and note (@certainty)
        when those attributes are non-empty.
    """
    dates: list[timdex.Date] = []
    for date_element in collection_description_did.find_all("unitdate"):
        # `or ""` guards against a missing attribute before stripping
        normal_date = (date_element.get("normal") or "").strip()
        if not normal_date:
            continue

        date_instance = timdex.Date()

        # partition() never raises, unlike unpacking split("/"), which fails
        # with ValueError when the value contains more than one "/". Any
        # extra "/" stays in lte_date and is left for validation to reject.
        gte_date, separator, lte_date = normal_date.partition("/")
        gte_date = gte_date.strip()
        lte_date = lte_date.strip()

        if separator and gte_date != lte_date:
            # date range
            if validate_date_range(gte_date, lte_date, source_record_id):
                date_instance.range = timdex.Date_Range(
                    gte=gte_date,
                    lte=lte_date,
                )
        else:
            # single date: either no "/" was present (gte_date is then the
            # whole value) or both halves of the range are identical
            if validate_date(gte_date, source_record_id):
                date_instance.value = gte_date

        # skip dates that did not validate
        if not (date_instance.range or date_instance.value):
            continue

        # include @datechar and @certainty attributes; empty attribute
        # values are normalised to None rather than kept as ""
        date_instance.kind = date_element.get("datechar") or None
        date_instance.note = date_element.get("certainty") or None
        dates.append(date_instance)

    return dates

0 commit comments

Comments
 (0)