From 195d085a1ecea27b04d1ff9116bf713438101724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Mon, 27 Feb 2023 14:38:40 +0100 Subject: [PATCH 1/2] Overcome backwards xl-id form by handling both This adds the "normal" form (used in XL) as a sameAs to generated records if they are to have the backwards form mistakenly minted by this repository. An upper timestamp per dataset is used to determine whether its member records are to have the backwards form. (Eventually we want to "garbage collect" this backwards form from XL, to ensure they don't "squat" on XL ids in the future.) --- common.py | 1 + lxltools/datacompiler.py | 60 ++++++++++++++++++++++++++++++---------- lxltools/lxlslug.py | 12 ++++---- syscore.py | 5 ++-- 4 files changed, 56 insertions(+), 22 deletions(-) diff --git a/common.py b/common.py index 7ad3199db..9d97a0b45 100644 --- a/common.py +++ b/common.py @@ -11,6 +11,7 @@ context="sys/context/base.jsonld", system_base_iri="", union="common.jsonld.lines", + last_backwards_id_time="2022-10-14T16:26:16Z" ) if __name__ == "__main__": diff --git a/lxltools/datacompiler.py b/lxltools/datacompiler.py index d94be9e25..34c72e90d 100644 --- a/lxltools/datacompiler.py +++ b/lxltools/datacompiler.py @@ -33,7 +33,8 @@ def __init__(self, *, context=None, record_thing_link='mainEntity', system_base_iri=None, - union='all.jsonld.lines'): + union='all.jsonld.lines', + last_backwards_id_time=None): self.datasets_description = datasets_description self.datasets = {} self.current_ds_resources = set() @@ -49,6 +50,11 @@ def __init__(self, *, self.current_ds_file = None self.no_records = False + self.last_backwards_id_time = ( + timeutil.w3c_dtz_to_ms(last_backwards_id_time) + if isinstance(last_backwards_id_time, str) + else None) + if datasets_description: self._handlers_from_datasets_description(datasets_description) @@ -155,7 +161,8 @@ def _compile_dataset(self, name, result): data = self.to_jsonld(data) ds_url = urljoin(self.dataset_id, name) -
self._create_dataset_description(ds_url, ds_created_ms, ds_modified_ms) + self._create_dataset_description( + ds_url, ds_created_ms, ds_created_ms=ds_created_ms) base_id = urljoin(self.dataset_id, base) @@ -172,10 +179,6 @@ def _compile_dataset(self, name, result): modified_ms = None fpath = urlparse(nodeid).path[1:] - if self.no_records: - self.write(node, fpath) - continue - meta = node.pop('meta', None) if meta: if 'created' in meta: @@ -189,10 +192,25 @@ def _compile_dataset(self, name, result): node, created_ms, modified_ms, - datasets=[self.dataset_id, ds_url]) - self.write(desc, fpath) + datasets=[self.dataset_id, ds_url], + ds_created_ms=ds_created_ms) + + # Keep sameAs "fowards" form in meta even if no_records is used + if self.no_records: + meta = meta or {} + sameas = meta.setdefault('sameAs', []) + rec = desc['@graph'][0] + if 'sameAs' in rec: + sameas.append({"@id": rec['@id']}) + for same in rec.get('sameAs', []): + sameas.append(same) + node['meta'] = meta + self.write(node, fpath) + else: + self.write(desc, fpath) - def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, label=None): + def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, + label=None, ds_created_ms=None): if not label: label = ds_url.rsplit('/', 1)[-1] ds = { @@ -211,7 +229,7 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe return desc = self._to_node_description(ds, created_ms, modified_ms, - datasets={self.dataset_id, ds_url}) + datasets={self.dataset_id, ds_url}, ds_created_ms=ds_created_ms) record = desc['@graph'][0] if self.tool_id: @@ -220,14 +238,16 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe self.write(desc, ds_path) def _to_node_description(self, node, created_ms, - modified_ms=None, datasets=None): + modified_ms=None, datasets=None, ds_created_ms=None): assert self.record_thing_link not in node node_id = node['@id'] record = OrderedDict() record['@type'] = 
'Record' - record['@id'] = self.generate_record_id(created_ms, node_id) + + self.set_record_id(record, created_ms, node_id, ds_created_ms) + record[self.record_thing_link] = {'@id': node_id} # Add provenance @@ -241,9 +261,19 @@ def _to_node_description(self, node, created_ms, return {'@graph': items} - def generate_record_id(self, created_ms, node_id): - # FIXME: backwards_form=created_ms < 2015 - slug = lxlslug.librisencode(created_ms, lxlslug.checksum(node_id)) + def set_record_id(self, record, created_ms, node_id, ds_created_ms=None): + if ds_created_ms is None: + ds_created_ms = created_ms + backwards_form = ds_created_ms < self.last_backwards_id_time + # TODO: use normal form and keep backwards_form as sameAs until "GC:able"? + record['@id'] = self.generate_record_id(created_ms, node_id, backwards_form) + if backwards_form: + record['sameAs'] = [{'@id': self.generate_record_id(created_ms, node_id)}] + + def generate_record_id(self, created_ms, node_id, backwards_form=False): + slug = lxlslug.librisencode( + created_ms, lxlslug.checksum(node_id), backwards_form=backwards_form + ) return urljoin(self.system_base_iri, slug) def write(self, node, name): diff --git a/lxltools/lxlslug.py b/lxltools/lxlslug.py index ee514925f..e6e72ef2e 100755 --- a/lxltools/lxlslug.py +++ b/lxltools/lxlslug.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -from __future__ import unicode_literals, print_function - +from typing import Any from zlib import crc32 import string import time @@ -32,9 +31,12 @@ def rotate(c): def checksum(data): return crc32(data.encode('utf-8')) & 0xffffffff -def librisencode(a, b): +def librisencode(a, b, backwards_form=False): alphabet = lower_consonants_numbers - timepart = "".join(reversed(caesarize(alphabet, tobase(alphabet, a)))) + chars = caesarize(alphabet, tobase(alphabet, a)) + if backwards_form: + chars = reversed(chars) + timepart = "".join(chars) codepart = tobase(alphabet, b) codelen = len(codepart) if codelen < 7: @@ -53,7 +55,7 @@ def 
librisencode(a, b): print("Usage: %s TIMESTAMP IDENTIFIER" % (cmd), file=sys.stderr) exit(1) - timestamp = args.pop(0) + timestamp: Any = args.pop(0) identifiers = args try: diff --git a/syscore.py b/syscore.py index 246ae2451..8bd7ba977 100644 --- a/syscore.py +++ b/syscore.py @@ -31,7 +31,8 @@ def _get_repo_version(): context='sys/context/base.jsonld', record_thing_link='mainEntity', system_base_iri='', - union='syscore.jsonld.lines') + union='syscore.jsonld.lines', + last_backwards_id_time='2022-11-20T00:00:00Z') @compiler.handler @@ -145,7 +146,7 @@ def _insert_record(graph, created_ms, dataset_id): record = {'@type': 'SystemRecord'} record[compiler.record_thing_link] = {'@id': entity['@id']} graph.insert(0, record) - record['@id'] = compiler.generate_record_id(created_ms, entity['@id']) + compiler.set_record_id(record, created_ms, entity['@id']) record['inDataset'] = [{'@id': compiler.dataset_id}, {'@id': dataset_id}] From f3abf54d84938ca249a73b857103cd2d04efee3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= <51744858+olovy@users.noreply.github.com> Date: Wed, 1 Mar 2023 16:18:06 +0100 Subject: [PATCH 2/2] Fix typo --- lxltools/datacompiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lxltools/datacompiler.py b/lxltools/datacompiler.py index 34c72e90d..64a47ed63 100644 --- a/lxltools/datacompiler.py +++ b/lxltools/datacompiler.py @@ -195,7 +195,7 @@ def _compile_dataset(self, name, result): datasets=[self.dataset_id, ds_url], ds_created_ms=ds_created_ms) - # Keep sameAs "fowards" form in meta even if no_records is used + # Keep sameAs "forwards" form in meta even if no_records is used if self.no_records: meta = meta or {} sameas = meta.setdefault('sameAs', [])