Skip to content

Commit fcd147d

Browse files
committed
Overcome backwards xl-id form by handling both
This adds the "normal" form (used in XL) as a sameAs to generated records that are to have the backwards form mistakenly minted by this repository. An upper timestamp per dataset is used to check whether its member records are to have the backwards form. (Eventually we want to "garbage collect" this backwards form from XL, to ensure that these ids don't "squat" on XL ids in the future.)
1 parent f399f23 commit fcd147d

File tree

4 files changed

+52
-19
lines changed

4 files changed

+52
-19
lines changed

common.py

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
context="sys/context/base.jsonld",
1212
system_base_iri="",
1313
union="common.jsonld.lines",
14+
last_backwards_id_time="2022-10-14T16:26:16Z"
1415
)
1516

1617
if __name__ == "__main__":

lxltools/datacompiler.py

+45-15
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ def __init__(self, *,
3333
context=None,
3434
record_thing_link='mainEntity',
3535
system_base_iri=None,
36-
union='all.jsonld.lines'):
36+
union='all.jsonld.lines',
37+
last_backwards_id_time=None):
3738
self.datasets_description = datasets_description
3839
self.datasets = {}
3940
self.current_ds_resources = set()
@@ -49,6 +50,11 @@ def __init__(self, *,
4950
self.current_ds_file = None
5051
self.no_records = False
5152

53+
self.last_backwards_id_time = (
54+
timeutil.w3c_dtz_to_ms(last_backwards_id_time)
55+
if isinstance(last_backwards_id_time, str)
56+
else None)
57+
5258
if datasets_description:
5359
self._handlers_from_datasets_description(datasets_description)
5460

@@ -155,7 +161,8 @@ def _compile_dataset(self, name, result):
155161
data = self.to_jsonld(data)
156162

157163
ds_url = urljoin(self.dataset_id, name)
158-
self._create_dataset_description(ds_url, ds_created_ms, ds_modified_ms)
164+
self._create_dataset_description(
165+
ds_url, ds_created_ms, ds_created_ms=ds_created_ms)
159166

160167
base_id = urljoin(self.dataset_id, base)
161168

@@ -172,10 +179,6 @@ def _compile_dataset(self, name, result):
172179
modified_ms = None
173180
fpath = urlparse(nodeid).path[1:]
174181

175-
if self.no_records:
176-
self.write(node, fpath)
177-
continue
178-
179182
meta = node.pop('meta', None)
180183
if meta:
181184
if 'created' in meta:
@@ -189,10 +192,25 @@ def _compile_dataset(self, name, result):
189192
node,
190193
created_ms,
191194
modified_ms,
192-
datasets=[self.dataset_id, ds_url])
193-
self.write(desc, fpath)
195+
datasets=[self.dataset_id, ds_url],
196+
ds_created_ms=ds_created_ms)
197+
198+
# Keep sameAs "forwards" form in meta even if no_records is used
199+
if self.no_records:
200+
meta = meta or {}
201+
sameas = meta.setdefault('sameAs', [])
202+
rec = desc['@graph'][0]
203+
if 'sameAs' in rec:
204+
sameas.append({"@id": rec['@id']})
205+
for same in rec.get('sameAs', []):
206+
sameas.append(same)
207+
node['meta'] = meta
208+
self.write(node, fpath)
209+
else:
210+
self.write(desc, fpath)
194211

195-
def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, label=None):
212+
def _create_dataset_description(self, ds_url, created_ms, modified_ms=None,
213+
label=None, ds_created_ms=None):
196214
if not label:
197215
label = ds_url.rsplit('/', 1)[-1]
198216
ds = {
@@ -211,7 +229,7 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
211229
return
212230

213231
desc = self._to_node_description(ds, created_ms, modified_ms,
214-
datasets={self.dataset_id, ds_url})
232+
datasets={self.dataset_id, ds_url}, ds_created_ms=ds_created_ms)
215233

216234
record = desc['@graph'][0]
217235
if self.tool_id:
@@ -220,14 +238,16 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
220238
self.write(desc, ds_path)
221239

222240
def _to_node_description(self, node, created_ms,
223-
modified_ms=None, datasets=None):
241+
modified_ms=None, datasets=None, ds_created_ms=None):
224242
assert self.record_thing_link not in node
225243

226244
node_id = node['@id']
227245

228246
record = OrderedDict()
229247
record['@type'] = 'Record'
230-
record['@id'] = self.generate_record_id(created_ms, node_id)
248+
249+
self.set_record_id(record, created_ms, node_id, ds_created_ms)
250+
231251
record[self.record_thing_link] = {'@id': node_id}
232252

233253
# Add provenance
@@ -241,9 +261,19 @@ def _to_node_description(self, node, created_ms,
241261

242262
return {'@graph': items}
243263

244-
def generate_record_id(self, created_ms, node_id):
245-
# FIXME: backwards_form=created_ms < 2015
246-
slug = lxlslug.librisencode(created_ms, lxlslug.checksum(node_id))
264+
def set_record_id(self, record, created_ms, node_id, ds_created_ms=None):
265+
if ds_created_ms is None:
266+
ds_created_ms = created_ms
267+
backwards_form = ds_created_ms < self.last_backwards_id_time
268+
# TODO: use normal form and keep backwards_form as sameAs until "GC:able"?
269+
record['@id'] = self.generate_record_id(created_ms, node_id, backwards_form)
270+
if backwards_form:
271+
record['sameAs'] = [{'@id': self.generate_record_id(created_ms, node_id)}]
272+
273+
def generate_record_id(self, created_ms, node_id, backwards_form=False):
274+
slug = lxlslug.librisencode(
275+
created_ms, lxlslug.checksum(node_id), backwards_form=backwards_form
276+
)
247277
return urljoin(self.system_base_iri, slug)
248278

249279
def write(self, node, name):

lxltools/lxlslug.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ def rotate(c):
3232
def checksum(data):
3333
return crc32(data.encode('utf-8')) & 0xffffffff
3434

35-
def librisencode(a, b):
35+
def librisencode(a, b, backwards_form=True):
36+
form = reversed if backwards_form else lambda x: x
3637
alphabet = lower_consonants_numbers
37-
timepart = "".join(reversed(caesarize(alphabet, tobase(alphabet, a))))
38+
timepart = "".join(form(caesarize(alphabet, tobase(alphabet, a))))
3839
codepart = tobase(alphabet, b)
3940
codelen = len(codepart)
4041
if codelen < 7:

syscore.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ def _get_repo_version():
3131
context='sys/context/base.jsonld',
3232
record_thing_link='mainEntity',
3333
system_base_iri='',
34-
union='syscore.jsonld.lines')
34+
union='syscore.jsonld.lines',
35+
last_backwards_id_time='2022-11-20T00:00:00Z')
3536

3637

3738
@compiler.handler
@@ -145,7 +146,7 @@ def _insert_record(graph, created_ms, dataset_id):
145146
record = {'@type': 'SystemRecord'}
146147
record[compiler.record_thing_link] = {'@id': entity['@id']}
147148
graph.insert(0, record)
148-
record['@id'] = compiler.generate_record_id(created_ms, entity['@id'])
149+
compiler.set_record_id(record, created_ms, entity['@id'])
149150
record['inDataset'] = [{'@id': compiler.dataset_id}, {'@id': dataset_id}]
150151

151152

0 commit comments

Comments
 (0)