diff --git a/Dockerfile b/Dockerfile index fc2bbdd..88d6667 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,8 +26,8 @@ ARG BUILDPLATFORM=linux/amd64 FROM --platform=$BUILDPLATFORM registry.cern.ch/inveniosoftware/almalinux:1 -# Use XRootD 5.9.1 -ENV XROOTD_VERSION=5.9.1 +# Use XRootD 5.9.2 +ENV XROOTD_VERSION=5.9.2 # Install the CERN CA COPY docker/carepo.repo /etc/yum.repos.d/ diff --git a/cernopendata/config.py b/cernopendata/config.py index 9aba393..855482f 100644 --- a/cernopendata/config.py +++ b/cernopendata/config.py @@ -29,6 +29,11 @@ from celery.schedules import timedelta from flask import request +from invenio_accounts.views import rest as accounts_rest +from invenio_cern_sync.sso import cern_keycloak, cern_remote_app_name, handlers +from invenio_cern_sync.sso.api import confirm_registration_form +from invenio_cern_sync.users.profile import CERNUserProfileSchema +from invenio_oauthclient.views.client import auto_redirect_login from invenio_records_files.api import _Record from invenio_records_rest.config import RECORDS_REST_ENDPOINTS from invenio_records_rest.facets import nested_filter, range_filter, terms_filter @@ -43,8 +48,18 @@ from urllib3.exceptions import InsecureRequestWarning from cernopendata.cold_storage.tasks import CheckTransfersTask -from cernopendata.modules.pages.config import * + +# noinspection PyUnresolvedReferences +# from cernopendata.modules.pages.config import * +# noinspection PyUnresolvedReferences +from cernopendata.modules.releases import models +from cernopendata.modules.releases.utils import ( + user_info_with_cern_roles, + user_payload_with_cern_roles, +) from cernopendata.modules.search_ui.helpers import CODSearchAppInvenioRestConfigHelper + +# noinspection PyUnresolvedReferences from cernopendata.modules.theme.config import * from .views import search_legacy @@ -67,7 +82,23 @@ # Piwik tracking code: set None to disabled it THEME_PIWIK_ID = os.environ.get("PIWIK_ID", None) -ACCOUNTS_SESSION_ACTIVITY_ENABLED = None +# 
ACCOUNTS_SESSION_ACTIVITY_ENABLED = True +# ACCOUNTS_REGISTER = True # allow registration +# ACCOUNTS_CONFIRM_EMAIL = None# require email confirmation (recommended) +# ACCOUNTS_SESSION_RESTORATION = True +# ACCOUNTS_REST_AUTH_VIEWS= True +# SECURITY_REGISTERABLE = True +# SECURITY_RECOVERABLE = True +# SECURITY_CHANGEABLE = True + +# SECURITY_LOGIN_URL = "/login" +# SECURITY_LOGOUT_URL = "/logout" +# SECURITY_REGISTER_URL = "/signup" + +# Required for sessions +SECRET_KEY = "change-me-pleaaaseeeee!!!" +# SECURITY_PASSWORD_SALT = "change-me-too" + SITE_URL = os.environ.get("CERNOPENDATA_SITE_URL", "opendata.cern.ch") # Logging - Set up Sentry for Invenio-Logging @@ -162,7 +193,6 @@ "params": {"preprocessors": [flag_robots, anonymize_user, build_record_unique_id]}, } - STATS_AGGREGATIONS = { "file-download-agg": { "templates": "invenio_stats.contrib.aggregations.aggr_file_download", @@ -252,7 +282,6 @@ }, } - CELERY_BEAT_SCHEDULE = { # indexing of statistics events & aggregations "stats-process-events": { @@ -271,11 +300,17 @@ # JSONSchemas JSONSCHEMAS_ENDPOINT = "/schema" JSONSCHEMAS_HOST = "opendata.cern.ch" -JSONSCHEMAS_URL_SCHEME = "http" +# JSONSCHEMAS_URL_SCHEME = "http" +JSONSCHEMAS_REPLACE_REFS = True # HOST_URI -HOST_URI = "{}://{}".format(JSONSCHEMAS_URL_SCHEME, JSONSCHEMAS_HOST) - +# HOST_URI = "{}://{}".format(JSONSCHEMAS_URL_SCHEME, JSONSCHEMAS_HOST) +# +# TODO: This is a hack! There is an issue with the resolver. 
Setting these two things make the test fail +# If we don't set them, the curation process can't validate the json schema +if "CERNOPENDATA_CERN_APP_CREDENTIALS" in os.environ: + RECORDS_REFRESOLVER_STORE = "invenio_jsonschemas.proxies.current_refresolver_store" + RECORDS_REFRESOLVER_CLS = "invenio_records.resolver.InvenioRefResolver" # Records # Add tuple as array type on record validation # http://python-jsonschema.readthedocs.org/en/latest/validate/#validating-types @@ -306,47 +341,57 @@ ), ) +_RECORD_PERMISSION_FACTORY = ( + "cernopendata.modules.records.permissions:record_read_permission_factory" +) + RECORDS_UI_ENDPOINTS = dict( recid=dict( pid_type="recid", route="/record/", - permission_factory_imp=None, + permission_factory_imp=_RECORD_PERMISSION_FACTORY, record_class="cernopendata.api:RecordFilesWithIndex", view_imp="cernopendata.modules.records.utils:record_metadata_view", ), recid_files=dict( pid_type="recid", route="/record//files/", + permission_factory_imp=_RECORD_PERMISSION_FACTORY, view_imp="cernopendata.modules.records.utils:file_download_ui", record_class="cernopendata.api:RecordFilesWithIndex", ), recid_file_index=dict( pid_type="recid", route="/record//file_index/", + permission_factory_imp=_RECORD_PERMISSION_FACTORY, view_imp="cernopendata.modules.records.utils:get_file_index", record_class="cernopendata.api:RecordFilesWithIndex", ), recid_files_assets=dict( pid_type="recid", route="/record//files/assets/", + permission_factory_imp=_RECORD_PERMISSION_FACTORY, view_imp="cernopendata.modules.records.utils:eos_file_download_ui", record_class="cernopendata.api:RecordFilesWithIndex", ), recid_files_page=dict( pid_type="recid", route="/record//filepage/", + permission_factory_imp=_RECORD_PERMISSION_FACTORY, view_imp="cernopendata.modules.records.utils:record_file_page", record_class="cernopendata.api:RecordFilesWithIndex", ), recid_export=dict( pid_type="recid", route="/record//export/", + permission_factory_imp=_RECORD_PERMISSION_FACTORY, 
view_imp="cernopendata.modules.records.utils:export_json_view", record_class="cernopendata.api:RecordFilesWithIndex", ), recid_stage=dict( pid_type="recid", route="/record//stage", + permission_factory_imp=_RECORD_PERMISSION_FACTORY, view_imp="cernopendata.modules.records.utils:stage", methods=["POST"], record_class="cernopendata.api:RecordFilesWithIndex", @@ -354,6 +399,7 @@ recid_subscribe=dict( pid_type="recid", route="/record//subscribe", + permission_factory_imp=_RECORD_PERMISSION_FACTORY, view_imp="cernopendata.modules.records.utils:subscribe", methods=["POST"], record_class="cernopendata.api:RecordFilesWithIndex", @@ -367,13 +413,14 @@ docid=dict( pid_type="docid", route="/docs/", - permission_factory_imp=None, + permission_factory_imp=_RECORD_PERMISSION_FACTORY, record_class="invenio_records_files.api:Record", view_imp="cernopendata.modules.records.utils:doc_metadata_view", ), docid_export=dict( pid_type="docid", route="/docs//export/", + permission_factory_imp=_RECORD_PERMISSION_FACTORY, view_imp="invenio_records_ui.views.export", template="cernopendata_records_ui/default_export.html", ), @@ -425,6 +472,7 @@ def _query_parser_and(qstr=None): ), }, "search_factory_imp": "cernopendata.modules.records.queries:search_factory", + "read_permission_factory_imp": _RECORD_PERMISSION_FACTORY, } ) @@ -496,6 +544,15 @@ def _query_parser_and(qstr=None): ), } } +COVER_TEMPLATE = "invenio_theme/page_cover.html" +ACCOUNTS_COVER_TEMPLATE = "invenio_accounts/base_cover.html" +ACCOUNTS_BASE_TEMPLATE = "invenio_accounts/base.html" +# After 'Create app' +# MAIL_SERVER='smtp.example.com' +# MAIL_PORT = 465 +# MAIL_USE_SSL= True +# MAIL_USERNAME = 'username' +COMMUNITIES_IDENTITIES_CACHE_REDIS_URL = "redis://cache:6379/0" # TODO: based on invenio-records-rest default config RECORDS_REST_DEFAULT_SORT = dict( @@ -629,7 +686,6 @@ def _query_parser_and(qstr=None): "invenio_records_rest": CODSearchAppInvenioRestConfigHelper, } - SEARCH_UI_SEARCH_VIEW = search_legacy # OAI-PMH # 
======= @@ -713,3 +769,52 @@ def _query_parser_and(qstr=None): # THIS ONE IS ONLY FOR THE DEVELOPMENT RATELIMIT_PER_ENDPOINT = {"static": "600 per minute"} + +# Checking communities +THEME_FRONTPAGE = False +# Enable communities +COMMUNITIES_ENABLED = True + +ACCOUNTS_LOCAL_LOGIN_ENABLED = False +ACCOUNTS_LOGIN_VIEW_FUNCTION = auto_redirect_login + +ACCOUNTS_USER_PROFILE_SCHEMA = CERNUserProfileSchema() + +USERPROFILES_EXTEND_SECURITY_FORMS = True +CERN_SYNC_KEYCLOAK_BASE_URL = "https://auth.cern.ch/" + +OAUTHCLIENT_SIGNUP_FORM = confirm_registration_form +OAUTHCLIENT_CERN_REALM_URL = "https://auth.cern.ch/auth/realms/cern" +OAUTHCLIENT_CERN_USER_INFO_URL = ( + "https://auth.cern.ch/auth/realms/cern/protocol/openid-connect/userinfo" +) +OAUTHCLIENT_SETTINGS_TEMPLATE = "invenio_oauthclient/settings/base.html" +OAUTHCLIENT_CERN_VERIFY_EXP = True +OAUTHCLIENT_CERN_VERIFY_AUD = False +OAUTHCLIENT_CERN_USER_INFO_FROM_ENDPOINT = True +OAUTHCLIENT_CERN_OPENID_ALLOWED_ROLES = [ + "cms-curator", + "atlas-curator", + "delphi-curator", + "alice-curator", + "default-role", +] +OAUTHCLIENT_AUTO_REDIRECT_TO_EXTERNAL_LOGIN = True + +OAUTH_REMOTE_APP_NAME = "cern_openid" +OAUTHCLIENT_REMOTE_APPS = { + cern_remote_app_name: cern_keycloak.remote_app, +} +CERN_APP_CREDENTIALS = { + "consumer_key": "opendata-dev", + "consumer_secret": os.environ.get( + "CERNOPENDATA_CERN_APP_CREDENTIALS", "" + ), +} + +accounts_rest.default_user_payload = user_payload_with_cern_roles +handlers["signup_handler"]["info"] = user_info_with_cern_roles + +# OAUTHCLIENT_SETTINGS_TEMPLATE = 'invenio_theme/page_settings.html' +COMMUNITIES_CUSTOM_FIELDS = None +MAX_CONTENT_LENGTH = 1000 * 1024 * 1024 diff --git a/cernopendata/modules/fixtures/cli.py b/cernopendata/modules/fixtures/cli.py index 3127ba5..0a72262 100644 --- a/cernopendata/modules/fixtures/cli.py +++ b/cernopendata/modules/fixtures/cli.py @@ -147,19 +147,6 @@ def _handle_record_files(record, data, logger=None): record.check_availability() -def 
def record_read_permission_factory(record, *args, **kwargs):
    """Build the read-permission object for ``record``.

    A record without a truthy ``prerelease`` field is readable by everyone.
    Otherwise the field is expected to look like ``<experiment>/<rest>`` and
    the record is readable only by an authenticated user whose curator
    experiments include ``<experiment>``.

    :param record: Mapping-like record; only ``.get()`` is used on it.
    :returns: An object exposing ``can()``, which returns a boolean.
    """

    def can(self):
        prerelease = record.get("prerelease")
        if prerelease:
            try:
                experiment, _ = prerelease.split("/", 1)
            except (ValueError, AttributeError):
                # No "/" separator, or the field is not a string: deny access.
                logger.error(
                    f"Malformed prerelease field on record {record.get('recid')}: {prerelease}"
                )
                return False

            if current_user.is_authenticated:
                return experiment in curator_experiments()["curator_experiments"]
            return False

        # No pre-release marker: the record is public.
        return True

    permission_cls = type("RecordReadPermission", (), {"can": can})
    return permission_cls()
+ +"""CERN Open Data Releases.""" diff --git a/cernopendata/modules/releases/api.py b/cernopendata/modules/releases/api.py new file mode 100644 index 0000000..11189aa --- /dev/null +++ b/cernopendata/modules/releases/api.py @@ -0,0 +1,483 @@ +# -*- coding: utf-8 -*- +# +# This file is part of CERN Open Data Portal. +# Copyright (C) 2024 CERN. +# +# CERN Open Data Portal is free software; you can redistribute it +# and/or modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# CERN Open Data Portal is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with CERN Open Data Portal; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307, USA. +# +# In applying this license, CERN does not +# waive the privileges and immunities granted to it by virtue of its status +# as an Intergovernmental Organization or submit itself to any jurisdiction. 
class ReleaseValidation:
    """Runtime wrapper around one ``ReleaseValidationMetadata`` row.

    Pairs the stored row (name, status, enabled flag) with the validator
    registered under the same name in ``Validation.registry``.
    """

    def __init__(self, metadata):
        """Wrap ``metadata`` and look up the validator with the same name.

        ``self.validator`` is ``None`` when no registered validation matches.
        """
        self._metadata = metadata
        self.validator = next(
            (v() for v in Validation.registry if v.name == self.name), None
        )

    @property
    def fixable(self):
        """Check if a validation can be fixed automatically."""
        return self.validator.fixable

    @property
    def error_message(self):
        """Error message that should be presented."""
        return self.validator.error_message

    @property
    def optional(self):
        """Boolean to check if the validation is optional."""
        return self.validator.optional

    @property
    def name(self):
        """Name of the validation."""
        return self._metadata.name

    @property
    def status(self):
        """Status of the validation (boolean stored on the row)."""
        return self._metadata.status

    def set_status(self, status):
        """Store the status of the validation on the row."""
        self._metadata.status = status

    @property
    def enabled(self):
        """Return if the validation is enabled for this particular release."""
        return self._metadata.enabled

    def validate(self):
        """Run the validation against the release that owns this row."""
        return self.validator.validate(self._metadata.release)

    def fix(self):
        """Execute the automatic fix against the release that owns this row."""
        return self.validator.fix(self._metadata.release)

    def to_dict(self):
        """Convert into a dictionary."""
        return {
            "id": self._metadata.id,
            "name": self.name,
            "enabled": self.enabled,
            "optional": self.optional,
            "status": self.status,
            "error_message": self.error_message,
            "release_id": self._metadata.release_id,
        }

    @classmethod
    def get(cls, validation_id):
        """Get a validation from the database by its primary key.

        No not-found handling is done here: the wrapper is built around
        whatever ``.first()`` returns.
        """
        metadata = ReleaseValidationMetadata.query.filter_by(
            id=validation_id,
        ).first()

        return cls(metadata)

    @classmethod
    def get_by_name(cls, release_id, name):
        """Get the validation called ``name`` that belongs to ``release_id``.

        Replaces a first ``get(release_id, name)`` definition that was
        shadowed by the later ``get(validation_id)`` and compared the release
        id against the validation's own primary key.
        """
        metadata = ReleaseValidationMetadata.query.filter_by(
            release_id=release_id, name=name
        ).first()

        return cls(metadata)


class Release:
    """Business-logic wrapper around a ``ReleaseMetadata`` row."""

    record_schema = "local://records/record-v1.0.0.json"

    def __init__(self, metadata):
        """Initialize the object."""
        self._metadata = metadata

    @property
    def status(self):
        """Status of the release (string value of a ``ReleaseStatus``)."""
        return self._metadata.status

    @property
    def records(self):
        """Records of the release."""
        return self._metadata.records

    @property
    def validations(self):
        """Validation rows of the release, wrapped as ``ReleaseValidation``."""
        return [ReleaseValidation(v) for v in self._metadata.validations]

    @classmethod
    def create(
        cls, *, experiment, records, current_user, name=None, discussion_url=None
    ):
        """Create, validate and commit a new release.

        :raises ValueError: if ``records`` is not a list.
        :returns: The new ``Release``.
        """
        if not isinstance(records, list):
            raise ValueError("records must be a list")
        release = ReleaseMetadata(
            name=name,
            discussion_url=discussion_url,
            experiment=experiment,
            records=records,
            status=ReleaseStatus.DRAFT.value,
        )
        obj = cls(release)
        # NOTE(review): validate() runs before the validation rows exist, so
        # no validator is executed on creation - confirm this is intended.
        obj.validate(current_user)
        obj.create_validations()
        db.session.add(release)

        db.session.commit()

        return obj

    def create_validations(self):
        """Create one validation row per validation that applies.

        Validations bound to a different experiment are skipped; optional
        validations start disabled.
        """
        for validation in VALIDATIONS:
            if (
                validation.experiment
                and validation.experiment != self._metadata.experiment
            ):
                continue
            new_validation = ReleaseValidationMetadata(
                release=self._metadata,
                name=validation.name,
                status=False,
                enabled=not validation.optional,
            )
            self._metadata.validations.append(new_validation)
            db.session.add(new_validation)

    @classmethod
    def list_releases(cls, experiment):
        """Return all releases of ``experiment``, newest id first.

        History events and their users are eager-loaded.
        """
        return (
            db.session.query(ReleaseMetadata)
            .options(
                selectinload(ReleaseMetadata.history_events).selectinload(
                    ReleaseHistory.user
                )
            )
            .filter(ReleaseMetadata.experiment == experiment)
            .order_by(ReleaseMetadata.id.desc())
            .all()
        )

    @classmethod
    def validate_experiment(cls, experiment):
        """Ensure that the requested experiment exists."""
        return experiment in {"lhcb", "opera", "alice", "atlas", "cms", "delphi"}

    @classmethod
    def get(cls, experiment, release_id):
        """Get a release of ``experiment`` from the database by id.

        No not-found handling is done here: the wrapper is built around
        whatever ``.first()`` returns.
        """
        metadata = ReleaseMetadata.query.filter_by(
            id=release_id, experiment=experiment
        ).first()

        return cls(metadata)

    def is_status(self, status):
        """Check if the release is in ``status`` (one status or a collection)."""
        if isinstance(status, (list, tuple, set)):
            return self.status in {s.value for s in status}
        return self.status == status.value

    def lock(self, status, lock_status, current_user):
        """Take the row lock and move the release to ``lock_status``.

        :param status: Status (or collection of statuses) the release must be
            in; a falsy value skips the check.
        :param lock_status: Status recorded once the lock is obtained.
        :returns: ``True`` on success, ``False`` if the row is locked by
            someone else or the release is not in ``status``.
        """
        try:
            # populate_existing() refreshes the already-loaded instance from
            # the locked row, so the status check below sees current data.
            db.session.query(ReleaseMetadata).filter_by(
                id=self._metadata.id
            ).populate_existing().with_for_update(
                nowait=True, of=ReleaseMetadata
            ).one()
        except OperationalError:
            # A failed NOWAIT lock leaves the transaction aborted; reset it
            # so the session stays usable.
            db.session.rollback()
            return False
        if status and not self.is_status(status):
            return False

        self.change_status(lock_status, current_user)
        db.session.commit()
        return True

    def change_status(self, status, current_user):
        """Set the release status and record a history event (no commit)."""
        event = ReleaseHistory(
            release=self._metadata,
            status=status.value,
            timestamp=datetime.utcnow(),
            user_id=current_user.id,
        )
        self._metadata.status = status.value
        db.session.add(event)
        return event

    def delete(self):
        """Delete a release."""
        db.session.delete(self._metadata)
        db.session.commit()

    def update_records(self, records, current_user):
        """Replace the records of a release, re-validate and commit."""
        self._metadata.records = records
        self.validate(current_user)
        db.session.add(self._metadata)
        db.session.commit()

    def validate(self, current_user):
        """Re-run the enabled validations and refresh counters and status.

        Resets the record/file counters and the error list, runs every
        enabled validation, and flags every file whose ``uri`` is missing or
        ends in ``*``. The release becomes READY when no error was collected
        and DRAFT otherwise. Nothing is committed here.
        """
        metadata = self._metadata
        metadata.num_records = len(metadata.records)
        metadata.num_files = 0
        metadata.num_file_indices = 0
        metadata.errors = []

        for validation in self.validations:
            if validation.enabled:
                # Tolerate validators that return None instead of a list.
                errors = validation.validate() or []
                metadata.errors.extend(errors)
                validation.set_status(not errors)
                db.session.add(validation._metadata)

        for i, entry in enumerate(metadata.records):
            if "files" in entry:
                for j, file in enumerate(entry["files"]):
                    if "uri" not in file or "*" == file["uri"][-1:]:
                        metadata.errors.append(
                            f"Entry {i + 1}, file {j + 1}: The path is not expanded"
                        )
                    if "type" in file and file["type"] == "index.json":
                        metadata.num_file_indices += 1
                    else:
                        metadata.num_files += 1

        # JSON columns are mutated in place; tell SQLAlchemy they changed.
        flag_modified(metadata, "records")
        flag_modified(metadata, "errors")
        metadata.num_errors = len(metadata.errors)
        if metadata.num_errors == 0:
            self.change_status(ReleaseStatus.READY, current_user)
        else:
            self.change_status(ReleaseStatus.DRAFT, current_user)

    def stage(self, schema, current_user):
        """Create the records of a release, tagged as pre-release.

        :param schema: Not used by this method; kept for callers.
        :raises RuntimeError: if the release is not in STAGING.
        """
        if not self.is_status(ReleaseStatus.STAGING):
            raise RuntimeError("Release is not STAGING")

        for record_data in self._metadata.records:
            record_data["$schema"] = Release.record_schema
            if "abstract" not in record_data:
                record_data["abstract"] = {"description": ""}
            if "description" not in record_data["abstract"]:
                record_data["abstract"]["description"] = ""
            record_data["prerelease"] = (
                f"{self._metadata.experiment}/{self._metadata.id}"
            )
            record = create_record(record_data, False)
            record.commit()
        self.change_status(ReleaseStatus.STAGED, current_user)
        db.session.add(self._metadata)
        db.session.commit()

    def publish(self, current_user):
        """Update and index every staged record, then mark PUBLISHED.

        :raises RuntimeError: if the release is not STAGED.
        """
        if not self.is_status(ReleaseStatus.STAGED):
            raise RuntimeError("Release is not STAGED")

        indexer = RecordIndexer()
        for record_data in self._metadata.records:
            record_data["$schema"] = Release.record_schema
            pid_object = PersistentIdentifier.get("recid", record_data["recid"])
            record = update_record(pid_object, record_data, True)
            record.commit()
            indexer.index(record)
        self.change_status(ReleaseStatus.PUBLISHED, current_user)
        db.session.add(self._metadata)
        db.session.commit()

    def rollback(self, current_user):
        """Remove the STAGED entries of a release and go back to READY.

        :raises RuntimeError: if the release is not STAGED.
        """
        import logging

        if not self.is_status(ReleaseStatus.STAGED):
            raise RuntimeError("Release is not STAGED")

        # The second argument of delete_record is a logger, not a pid type.
        # NOTE(review): confirm where delete_record now lives and that it
        # still takes (pid, logger).
        logger = logging.getLogger(__name__)
        for record_data in self._metadata.records:
            pid_object = PersistentIdentifier.get("recid", record_data["recid"])
            delete_record(pid_object, logger)

        self.change_status(ReleaseStatus.READY, current_user)
        db.session.add(self._metadata)
        db.session.commit()

    @staticmethod
    def _apply_bulk_ops(record, updates):
        """Apply the ``set``/``delete`` operations of ``updates`` to ``record``.

        Keys in ``ReleaseMetadata.BULK_IMMUTABLE_FIELDS`` are ignored.
        Returns ``True`` if a key was set or an existing key was deleted.
        """
        changed = False
        if "set" in updates:
            for key, value in updates["set"].items():
                if key in ReleaseMetadata.BULK_IMMUTABLE_FIELDS:
                    continue
                record[key] = value
                changed = True
        if "delete" in updates:
            for key in updates["delete"]:
                if key in ReleaseMetadata.BULK_IMMUTABLE_FIELDS:
                    continue
                if key in record:
                    del record[key]
                    changed = True
        return changed

    def bulk_preview(self, updates, max_preview=10):
        """Preview a bulk update on the first ``max_preview`` records.

        The stored records are not modified. Only records that would really
        change are reported.

        :returns: List of ``{"index", "recid", "diff"}`` dictionaries.
        """
        diffs = []

        for idx, record in enumerate(self._metadata.records[:max_preview]):
            modified = deepcopy(record)
            self._apply_bulk_ops(modified, updates)

            diff = DeepDiff(record, modified, ignore_order=True)
            # Test the diff object itself: its JSON form "{}" is a non-empty
            # string and would make every record look changed.
            if diff:
                diffs.append(
                    {
                        "index": idx,
                        "recid": record.get("recid"),
                        # Round-trip through JSON so the result can be jsonified.
                        "diff": json.loads(diff.to_json()),
                    }
                )
        return diffs

    def bulk_update(self, updates, current_user):
        """Apply a bulk update to the records of a release and commit.

        :returns: Number of records that were modified.
        """
        records_modified = sum(
            1
            for record in self._metadata.records
            if self._apply_bulk_ops(record, updates)
        )

        if records_modified:
            flag_modified(self._metadata, "records")
            self.validate(current_user)

        db.session.add(self._metadata)
        db.session.commit()

        return records_modified

    def generate_doi(self, current_user=None):
        """Assign a placeholder DOI to every record that has none.

        :param current_user: When given, the release is re-validated on
            behalf of this user. Nothing is committed here.
        :raises RuntimeError: if every record already carries a ``doi``.
        """
        # NOTE(review): the original guard read an attribute (valid_doi) that
        # does not exist on this class; "all records have a doi" is the
        # closest reading - confirm.
        records = self.records
        if records and all("doi" in record for record in records):
            raise RuntimeError("DOIs already generated")

        for record in records:
            if "doi" not in record:
                record["doi"] = f"FAKE DOI FOR {self._metadata.experiment}"
        # This is to tell alchemy that the field has been modified
        flag_modified(self._metadata, "records")
        if current_user is not None:
            self.validate(current_user)

    def fix_checks(self, current_user):
        """Fix all the validations that can be fixed automatically.

        Runs ``fix()`` on every failed, enabled and fixable validation. If a
        fix reports errors the release goes to DRAFT with those errors;
        otherwise it is re-validated. The result is committed.
        """
        errors = []
        for validation in self.validations:
            if not validation.status:
                if validation.enabled and validation.fixable:
                    errors.extend(validation.fix() or [])

        if errors:
            self._metadata.errors = errors
            # Keep the counter in step with the error list.
            self._metadata.num_errors = len(errors)
            self.change_status(ReleaseStatus.DRAFT, current_user)
        else:
            self.validate(current_user)
        flag_modified(self._metadata, "records")
        db.session.add(self._metadata)
        db.session.commit()

    def enable_validation(self, validation_id, enabled, current_user):
        """Enable or disable an optional validation, re-validate and commit.

        :raises RuntimeError: if the validation is not optional.
        """
        validation = ReleaseValidation.get(validation_id)

        if not validation.optional:
            raise RuntimeError(f"The validation {validation.name} can't be disabled")
        validation._metadata.enabled = enabled
        self.validate(current_user)
        db.session.add(validation._metadata)
        db.session.commit()
class ReleaseStatus(Enum):
    """Possible statuses for a release.

    The string values are what gets stored in the ``status`` columns.
    """

    DRAFT = "DRAFT"
    READY = "READY"
    EDITING = "EDITING"
    STAGED = "STAGED"
    STAGING = "STAGING"
    PUBLISHED = "PUBLISHED"


class ReleaseValidationMetadata(db.Model):
    """Result of one named validation for one release.

    A (release_id, name) pair is unique.
    """

    __tablename__ = "releases_validations"

    id = db.Column(db.Integer, primary_key=True)

    # Owning release; the foreign key is declared with ON DELETE CASCADE.
    release_id = db.Column(
        db.Integer,
        db.ForeignKey("releases_metadata.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Name of the validation this row stores the result for.
    name = db.Column(
        db.String(100),
        nullable=False,
        index=True,
    )

    # Boolean outcome of the validation; defaults to False.
    status = db.Column(
        db.Boolean,
        nullable=False,
        default=False,
    )

    release = db.relationship(
        "ReleaseMetadata",
        back_populates="validations",
    )

    # Whether the validation is switched on for this release.
    enabled = db.Column(db.Boolean, nullable=False, default=True)

    __table_args__ = (
        db.UniqueConstraint("release_id", "name", name="uq_release_validation"),
    )


class ReleaseHistory(db.Model):
    """One status change of a release: status, timestamp and user."""

    __tablename__ = "releases_history"

    id = db.Column(db.Integer, primary_key=True)

    release_id = db.Column(
        db.Integer, db.ForeignKey("releases_metadata.id"), nullable=False
    )

    status = db.Column(db.String, nullable=False)
    timestamp = db.Column(db.DateTime, nullable=False)
    # Nullable reference to the account that triggered the change.
    user_id = db.Column(db.Integer, db.ForeignKey("accounts_user.id"))

    release = db.relationship("ReleaseMetadata", back_populates="history_events")
    user = db.relationship("User")


class ReleaseMetadata(db.Model):
    """Release model: identity, JSON content, counters and relationships."""

    __tablename__ = "releases_metadata"

    # Record keys listed here are not meant to be touched by bulk operations.
    BULK_IMMUTABLE_FIELDS = ["recid", "title", "DOI"]
    status = db.Column(
        db.String(20),
        nullable=False,
        index=True,
    )

    # --- Identifiers ---
    id = db.Column(db.Integer, primary_key=True)

    name = db.Column(
        db.String(255),
        nullable=False,
    )

    discussion_url = db.Column(
        db.String(2048),
        nullable=True,
    )

    # Status changes ordered by timestamp; removed together with the release.
    history_events = db.relationship(
        "ReleaseHistory",
        back_populates="release",
        order_by="ReleaseHistory.timestamp",
        cascade="all, delete-orphan",
    )

    experiment = db.Column(db.String(50), nullable=False)

    # --- Content ---
    # One JSONB column per name, created by writing into the class namespace.
    # "errors" defaults to an empty list, the others to an empty dict.
    # The helper names (json_fields, f, default) remain as class attributes.
    json_fields = ["records", "documents", "glossary", "errors"]
    for f in json_fields:
        default = list if f == "errors" else dict
        locals()[f] = db.Column(JSONB, nullable=False, default=default)

    # --- Counters ---
    # One integer column per name, created the same way, defaulting to 0.
    int_fields = [
        "num_records",
        "num_errors",
        "num_docs",
        "num_files",
        "num_file_indices",
    ]
    for f in int_fields:
        locals()[f] = db.Column(db.Integer, nullable=False, default=0)

    size_files = db.Column(
        db.BigInteger,
        nullable=False,
        default=0,
    )

    size_indexFiles = db.Column(
        db.BigInteger,
        nullable=False,
        default=0,
    )

    # --- Validation flags ---
    validations = db.relationship(
        "ReleaseValidationMetadata",
        back_populates="release",
        cascade="all, delete-orphan",
    )

    max_recid = db.Column(
        db.Integer,
        nullable=False,
        default=0,
    )
-0,0 +1,104 @@ +# -*- coding: utf-8 -*- +# +# This file is part of CERN Open Data Portal. +# Copyright (C) 2024 CERN. +# +# CERN Open Data Portal is free software; you can redistribute it +# and/or modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# CERN Open Data Portal is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with CERN Open Data Portal; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307, USA. +# +# In applying this license, CERN does not +# waive the privileges and immunities granted to it by virtue of its status +# as an Intergovernmental Organization or submit itself to any jurisdiction. 
"""CERN Open Data Release utils."""
import logging

from flask import current_app
from flask_login import current_user
from invenio_accounts import current_accounts
from invenio_accounts.views.rest import role_to_dict
from invenio_db import db
from invenio_oauthclient.contrib.keycloak.helpers import get_user_info
from invenio_oauthclient.models import RemoteAccount

# NOTE(review): DEBUG-level logging of the OAuth libraries is enabled
# unconditionally at import time; it may write sensitive request data to the
# logs. Confirm this is intended outside of development.
logging.getLogger("oauthlib").setLevel(logging.DEBUG)
logging.getLogger("requests_oauthlib").setLevel(logging.DEBUG)

# Suffix of the SSO role names that mark a user as curator of an experiment.
_CURATOR_SUFFIX = "-curator"


def _stored_cern_roles(user_id):
    """Return the CERN roles saved on the user's remote account.

    An empty list is returned when the user has no remote account for the
    configured CERN application, or when no roles were stored on it.
    """
    client_id = current_app.config["CERN_APP_CREDENTIALS"]["consumer_key"]
    remote_user = RemoteAccount.get(user_id, client_id)
    if remote_user is None:
        return []
    return (remote_user.extra_data or {}).get("cern_roles", [])


def user_info_with_cern_roles(remote, resp):
    """Return a user with the roles from the SSO application.

    The roles found in the token are also stored in the ``extra_data`` of the
    remote account of an already existing user, so that they can be looked up
    later without the token.

    :param remote: The OAuth remote application (its ``name`` is reported as
        the external method).
    :param resp: The OAuth response passed on to ``get_user_info``.
    :returns: A dict with the keys ``user``, ``external_id``,
        ``external_method`` and ``cern_roles``.
    :raises KeyError: If the token carries no ``sub`` or ``email``.
    """
    token_user_info, user_info = get_user_info(remote, resp)
    # The second element might be missing; fall back to the token values.
    user_info = user_info or {}
    # Not every account has roles: read them once, with the same default that
    # is used for the returned payload.
    cern_roles = token_user_info.get("cern_roles", [])
    username = token_user_info["sub"]
    email = token_user_info["email"]
    # cern_person_id might be missing for non-CERN users (EduGain)
    identity_id = token_user_info.get("cern_person_id") or username
    preferred_language = user_info.get("cern_preferred_language", "en").lower()
    client_id = current_app.config["CERN_APP_CREDENTIALS"]["consumer_key"]
    user = current_accounts.datastore.get_user_by_email(email)
    if user:
        remote_user = RemoteAccount.get(user.id, client_id)
        if remote_user:
            remote_user.extra_data["cern_roles"] = cern_roles
            db.session.add(remote_user)
            db.session.commit()
    return {
        "user": {
            "active": True,
            "email": email,
            "profile": {
                "affiliations": user_info.get("home_institute", ""),
                "full_name": user_info.get("name", token_user_info.get("name", "")),
                "username": username,
            },
            "prefs": {
                "visibility": "public",
                "email_visibility": "public",
                "locale": preferred_language,
            },
        },
        "external_id": identity_id,
        "external_method": remote.name,
        "cern_roles": cern_roles,
    }


def user_payload_with_cern_roles(user):
    """Serialize a user, including the CERN roles of its remote account.

    :param user: The user object to serialize.
    :returns: A dict with ``id``, ``email``, ``confirmed_at``,
        ``last_login_at``, ``roles`` and ``cern_roles``. ``cern_roles`` is an
        empty list for users without a CERN remote account.
    """
    fmt_last_login_at = None
    if user.login_info and user.login_info.last_login_at:
        fmt_last_login_at = user.login_info.last_login_at.isoformat()
    return {
        "id": user.id,
        "email": user.email,
        "confirmed_at": user.confirmed_at.isoformat() if user.confirmed_at else None,
        "last_login_at": fmt_last_login_at,
        "roles": [role_to_dict(role) for role in user.roles],
        "cern_roles": _stored_cern_roles(user.id),
    }


def curator_experiments():
    """Return experiments where the user is curator.

    :returns: ``{"curator_experiments": [...]}``; the list holds every role
        name ending in ``-curator`` with that suffix removed, and is empty for
        anonymous users or users without stored roles.
    """
    exps = []
    if current_user.is_authenticated:
        roles = _stored_cern_roles(current_user.id)
        exps = [
            role[: -len(_CURATOR_SUFFIX)]
            for role in roles
            if role.endswith(_CURATOR_SUFFIX)
        ]
    return dict(curator_experiments=exps)
"""CERN Open Data Validations."""
from importlib import import_module
from pathlib import Path

from .base import Validation

package_dir = Path(__file__).parent

# Import every sibling module so that the validation classes they define get
# added to ``Validation.registry``. The glob result is sorted because its
# order is not guaranteed, and the import order decides the order of
# ``VALIDATIONS``.
for file in sorted(package_dir.glob("*.py")):
    if file.name not in ("__init__.py", "base.py"):
        import_module(f"{__name__}.{file.stem}")

# One instance of every registered (non-abstract) validation.
VALIDATIONS = [cls() for cls in Validation.registry]
"""Top Class to validate releases."""


class Validation:
    """Common parent of all the release validations.

    Every concrete subclass is appended to ``registry`` at class creation
    time. Subclasses flagged with ``abstract = True`` are left out.
    """

    # Shared by the whole hierarchy: the list of concrete validation classes.
    registry = []
    abstract = False

    name = None
    error_message = None
    experiment = None
    optional = False

    def __init_subclass__(cls, **kwargs):
        """Register the new subclass unless it is marked as abstract."""
        super().__init_subclass__(**kwargs)
        if cls.__name__ == "Validation" or cls.abstract:
            return
        Validation.registry.append(cls)

    def validate(self, release):
        """Validate a release. Child classes have to provide the implementation."""
        raise NotImplementedError

    def fix(self, release):
        """Repair a release. Child classes may provide an implementation."""
        raise NotImplementedError

    @property
    def fixable(self):
        """Tell whether the class overrides the default ``fix`` method."""
        bound_fix = self.fix
        return bound_fix.__func__ is not Validation.fix
"""Validation process."""
from .expected_fields import ExpectedFieldsValidation


class CMS(ExpectedFieldsValidation):
    """Check that the records carry the standard CMS fields."""

    # The parent class is abstract; this one has to be registered.
    abstract = False

    name = "CMS"
    expected_fields = {
        "collaboration": {"name": "CMS Collaboration"},
        "accelerator": "CERN-LHC",
    }

    # Join the field names explicitly: interpolating ``.keys()`` would put the
    # ``dict_keys([...])`` representation into the user-facing message.
    error_message = f"Standard CMS fields: {', '.join(expected_fields)}"
    experiment = "cms"
+"""Validation process.""" +from .base import Validation +from .expected_fields import ExpectedFieldsValidation + + +class CMS_2016_Simulated(ExpectedFieldsValidation): + """Check that the experiment is properly defined.""" + + abstract = False + + name = "CMS 2016 Simulated" + error_message = ( + "The records should follow the conventions of 2016 for simulated data." + ) + experiment = "cms" + optional = True + + def get_abstract(release, record): + """Getting the title.""" + parts = [p for p in record["title"].split("/") if p] + + dataset = parts[0] + data_format = parts[-1] + return { + "description": ( + f"

Simulated dataset {dataset} in {data_format} format for 2016 collision data.

" + f"

See the description of the simulated dataset names in: " + 'About CMS simulated dataset names.

' + "

These simulated datasets correspond to the collision data collected by the CMS " + "experiment in 2016.

" + ) + } + + @staticmethod + def get_record_type(record): + """Get the type of record.""" + title = record.get("title", "") + + if "MINIAODSIM" in title: + return "mini" + elif "NANOAODSIM" in title: + return "nano" + return None + + def get_usage(release, record): + """Get the usage for a particular record.""" + record_type = CMS_2016_Simulated.get_record_type(record) + usage = { + "mini": { + "description": ( + "You can access these data through the CMS Open Data container or the CMS Virtual " + "Machine. See the instructions for setting up one of the two alternative " + "environments and getting started in" + ), + "links": [ + { + "description": "Running CMS analysis code using Docker", + "url": "/docs/cms-guide-docker#images", + }, + { + "description": "How to install the CMS Virtual Machine", + "url": "/docs/cms-virtual-machine-cc7", + }, + { + "description": "Getting started with CMS open data", + "url": "/docs/cms-getting-started-miniaod", + }, + ], + }, + "nano": { + "links": [ + { + "url": "/docs/cms-guide-docker#nanoaod", + "description": "Using Docker containers", + }, + { + "url": "/docs/cms-getting-started-nanoaod", + "description": "Getting started with CMS NanoAOD", + }, + ], + "description": ( + "You can access these data through XRootD protocol or direct download, " + "and they can be analysed with common ROOT and Python tools. 
See the instructions" + "for getting started in" + ), + }, + } + return usage.get(record_type, None) + + def get_system_details(release, record): + """Get the system details for a record.""" + record_type = CMS_2016_Simulated.get_record_type(record) + usage = { + "mini": { + "container_images": [ + { + "name": "docker.io/cmsopendata/cmssw_10_6_30-slc7_amd64_gcc700:latest", + "registry": "dockerhub", + }, + { + "name": ( + "gitlab-registry.cern.ch/cms-cloud/cmssw-docker-opendata/" + "cmssw_10_6_30-slc7_amd64_gcc700:latest" + ), + "registry": "gitlab", + }, + ], + "global_tag": "106X_mcRun2_asymptotic_v17", + "release": "CMSSW_10_6_30", + }, + "nano": { + "description": ( + '

NANOAODSIM datasets are in the ROOT' + "tree format and their analysis does not require the use of CMSSW or CMS open" + "data environments. They can be analysed with common ROOT and Python tools.

" + ), + "container_images": [ + { + "name": "gitlab-registry.cern.ch/cms-cloud/root-vnc", + "registry": "gitlab", + }, + { + "name": "gitlab-registry.cern.ch/cms-cloud/python-vnc", + "registry": "gitlab", + }, + ], + }, + } + return usage.get(record_type, None) + + def parse_title(title): + """Parse the title of a record, extracting the title and the type of record.""" + parts = [p for p in title.split("/") if p] + + dataset = parts[0] if len(parts) > 0 else None + tier = parts[-1] if len(parts) > 0 else None + + return dataset, tier + + def get_relations(release, record): + """Get the relations of a record.""" + title = record.get("title", "") + dataset, tier = CMS_2016_Simulated.parse_title(title) + + if tier == "MINIAODSIM": + target_tier = "NANOAODSIM" + related = "NANO" + rel_type = "isChildOf" + + elif tier == "NANOAODSIM": + target_tier = "MINIAODSIM" + related = "MINI" + rel_type = "isParentOf" + + else: + return None + + # πŸ” find matching record + relation_record = next( + ( + r + for r in release.records + if CMS_2016_Simulated.parse_title(r.get("title", "")) + == (dataset, target_tier) + ), + None, + ) + + if relation_record is None: + return f"IS THE TITLE {target_title} ?" 
+ return [ + { + "description": f"The corresponding {related}AODSIM dataset:", + "recid": relation_record["recid"], + "type": rel_type, + } + ] + + def get_distribution_formats(release, record): + """Get the distribution formats for a record.""" + record_type = CMS_2016_Simulated.get_record_type(record) + return [f"{record_type}aodsim", "root"] + + def get_title_aditional(release, record): + """Get the additional title for a record.""" + dataset, tier = CMS_2016_Simulated.parse_title(record.get("title")) + return f"Simulated dataset {dataset} in {tier} format for 2016 collision data" + + expected_fields = { + "abstract": get_abstract, + "collections": ["CMS-Simulated-Datasets"], + "collision_information": {"energy": "13TeV", "type": "pp"}, + "date_created": ["2016"], + "distribution.formats": get_distribution_formats, + "methodology.description": ( + "

These data were generated in several steps (see also " + 'CMS Monte Carlo ' + "production overview):

" + ), + "pileup": { + "description": ( + "

To make these simulated data comparable with the collision data, " + 'pile-up events are added to the' + "simulated event in the DIGI2RAW step.

" + ), + "links": [ + { + "recid": "30595", + "title": "/Neutrino_E-10_gun/RunIISummer20ULPrePremix-UL16_106X_mcRun2_asymptotic_v13-v1/PREMIX", + } + ], + }, + "run_period": ["Run2016G", "Run2016H"], + "system_details": get_system_details, + "type": {"primary": "Dataset", "secondary": ["Simulated"]}, + "relations": get_relations, + "title_additional": get_title_aditional, + "usage": get_usage, + "validation": { + "description": ( + "The generation and simulation of Monte Carlo data has been validated through general" + "CMS validation procedures." + ) + }, + } diff --git a/cernopendata/modules/releases/validations/duplicate_files.py b/cernopendata/modules/releases/validations/duplicate_files.py new file mode 100644 index 0000000..88c749c --- /dev/null +++ b/cernopendata/modules/releases/validations/duplicate_files.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# +# This file is part of CERN Open Data Portal. +# Copyright (C) 2024 CERN. +# +# CERN Open Data Portal is free software; you can redistribute it +# and/or modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# CERN Open Data Portal is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with CERN Open Data Portal; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307, USA. +# +# In applying this license, CERN does not +# waive the privileges and immunities granted to it by virtue of its status +# as an Intergovernmental Organization or submit itself to any jurisdiction. 
"""Validation process."""
from invenio_files_rest.models import FileInstance

from ..models import ReleaseStatus
from .base import Validation


class CheckDuplicateFiles(Validation):
    """Validation that detects files which are already stored in the system."""

    name = "Duplicate files"
    error_message = "Some of the files of the records are already registered"

    def validate(self, release):
        """Report the file URIs of the release that already exist as file instances.

        The check only runs for releases without status, or in the DRAFT,
        READY or EDITING status. Returns a list of error messages.
        """
        if release.status and release.status not in [
            ReleaseStatus.DRAFT.value,
            ReleaseStatus.READY.value,
            ReleaseStatus.EDITING.value,
        ]:
            return []

        # Gather the URI of every file entry that declares one.
        candidate_uris = set()
        for record in release.records:
            for entry in record.get("files", []):
                if "uri" in entry:
                    candidate_uris.add(entry["uri"])
        if not candidate_uris:
            return []

        # Look for file instances that already use any of those URIs.
        matches = FileInstance.query.filter(
            FileInstance.uri.in_(candidate_uris)
        ).all()
        colliding = {match.uri for match in matches}
        if not colliding:
            return []

        return [
            f"The following file URIs are already stored in the system: "
            f"{', '.join(sorted(colliding))}"
        ]
"""Expand Files validation."""
import gfal2

from .files import ValidFiles


class FileExpansionError(Exception):
    """Exception for the File Expansion."""


class CheckExpandDirectories(ValidFiles):
    """Validation to check for directories (wildcard entries) among the files."""

    name = "Expand directories"
    error_message = (
        "Some of the entries in the records are directories instead of files"
    )

    modified = False

    @staticmethod
    def _is_wildcard(file):
        """Tell whether a file entry has a uri ending in ``*``."""
        uri = file.get("uri")
        return isinstance(uri, str) and uri.endswith("*")

    def _walk(self, ctx, base_uri):
        """Recursively yield all file entries under base_uri.

        :param ctx: The gfal2 context used to list and inspect the entries.
        :param base_uri: The directory to expand. It is accessed with
            ``root://`` replaced by ``https://``.
        :returns: A generator of dicts with ``uri``, ``size`` and ``checksum``.
        :raises FileExpansionError: If gfal2 fails on a directory or an entry.
        """
        http_uri = base_uri.replace("root://", "https://")
        try:
            entries = ctx.listdir(http_uri)
        except gfal2.GError as err:
            raise FileExpansionError(f"Cannot list {base_uri}: {err}") from err

        for entry in entries:
            full_uri = f"{base_uri}/{entry}"
            try:
                is_dir, size, checksum = self._get_entry_details(
                    ctx, f"{http_uri}/{entry}"
                )
            except gfal2.GError as err:
                # Report every gfal2 failure with the exception that fix() handles.
                raise FileExpansionError(f"Cannot stat {full_uri}: {err}") from err

            # If it's a directory, recurse
            if is_dir:
                yield from self._walk(ctx, full_uri)
            else:
                yield {"uri": full_uri, "size": size, "checksum": checksum}

    def validate(self, release):
        """Check if there are any directories as input for a record.

        :returns: A list with one message per wildcard entry.
        """
        errors = []
        for i, record in enumerate(release.records):
            for file in record.get("files") or []:
                if self._is_wildcard(file):
                    errors.append(f"The record {i} has a path like {file['uri']}")
        return errors

    def fix(self, release):
        """Replace the wildcard entries by the files found under them.

        The expanded files are appended after the remaining entries. A
        wildcard that cannot be expanded (or that matches nothing) is kept, so
        that no information is lost and the validation keeps reporting it.

        :returns: A list with one message per path that could not be accessed.
        """
        errors = []
        ctx = gfal2.creat_context()

        for record in release.records:
            files = record.get("files")
            if not files:
                continue

            kept = []
            expanded = []
            for file in files:
                if not self._is_wildcard(file):
                    kept.append(file)
                    continue
                basedir = file["uri"][:-1]
                try:
                    # Materialize first: a failure halfway must not leave a
                    # partial expansion behind.
                    found = list(self._walk(ctx, basedir))
                except FileExpansionError:
                    errors.append(
                        f"Error accessing the path {basedir} while expanding the file names"
                    )
                    found = []
                if found:
                    expanded.extend(found)
                else:
                    kept.append(file)

            if expanded:
                record["files"] = kept + expanded
        return errors
# ---- validations/expected_fields.py ----
"""Validation process."""
from .base import Validation


class ExpectedFieldsValidation(Validation):
    """Abstract class that offers a dictionary of fields and expected values.

    ``expected_fields`` maps a (possibly dotted) field path to the expected
    value, or to a callable ``(release, record)`` that computes it.
    """

    expected_fields = {}
    expected_record_fields = {}

    abstract = True

    def get_nested(self, data, path):
        """Get a value (which might be nested) following a dotted path.

        Returns None as soon as a level of the path is missing or is not a
        dictionary.
        """
        for key in path.split("."):
            if not isinstance(data, dict):
                return None
            data = data.get(key)
        return data

    def resolve_expected_value(self, expected, release, record):
        """Get the expected value of a field for a record."""
        # If callable -> compute value dynamically
        if callable(expected):
            return expected(release, record)
        return expected

    def _missing_expected_message(self, i, field):
        """Build the message used when the expected value cannot be computed."""
        return (
            f"Record {i}, field {field}: can't figure out what the value is supposed to be"
        )

    def validate(self, release):
        """Validate that all the fields have the expected values.

        :returns: A list of messages, one per record and field that differs.
        """
        errors = []

        for field, expected in self.expected_fields.items():
            for i, record in enumerate(release.records):
                expected_value = self.resolve_expected_value(expected, release, record)
                if not expected_value:
                    errors.append(self._missing_expected_message(i, field))
                    continue
                actual_value = self.get_nested(record, field)

                if actual_value != expected_value:
                    errors.append(
                        f"Record {i}, field {field}: expected: '{expected_value}' and got '{actual_value}'"
                    )

        return errors

    def fix(self, release):
        """Fix all the fields, setting the expected value for each of them.

        :returns: A list of messages for the fields whose expected value could
            not be computed (those fields are left untouched).
        """
        errors = []
        for field, expected in self.expected_fields.items():
            for i, record in enumerate(release.records):
                expected_value = self.resolve_expected_value(expected, release, record)
                if not expected_value:
                    errors.append(self._missing_expected_message(i, field))
                    continue
                self.set_nested(record, field, expected_value)

        return errors

    def set_nested(self, data, path, value):
        """Set a value for a (possibly dotted) field in a record.

        Intermediate levels that are missing, or that are not dictionaries,
        are replaced by new dictionaries.
        """
        keys = path.split(".")
        for key in keys[:-1]:
            child = data.get(key)
            if not isinstance(child, dict):
                child = data[key] = {}
            data = child
        data[keys[-1]] = value


# ---- validations/experiment.py ----
"""Validation process."""
from .expected_fields import ExpectedFieldsValidation


class ValidExperiment(ExpectedFieldsValidation):
    """Check that the experiment is properly defined."""

    abstract = False

    name = "Valid experiment"
    error_message = "The records should be of the correct experiment."
    # The expected value is a one-element list with the upper-cased experiment
    # of the release metadata.
    expected_fields = {
        "experiment": lambda release, record=None: [
            release._metadata.experiment.upper()
        ]
    }


# ---- validations/files.py ----
"""Validation process."""

import gfal2

from .base import Validation


class ValidFiles(Validation):
    """Check if the files have the proper metadata."""

    name = "File metadata"
    error_message = "Some of the files are missing the size or checksum"

    def validate(self, release):
        """Check that the files have size and checksum."""
        errors = []
        for i, record in enumerate(release.records):
            if "files" in record:
                for j, file in enumerate(record["files"]):
                    if "checksum" not in file or "size" not in file:
                        errors.append(
                            f"Entry {i + 1}, file {j + 1}: Missing size/checksum"
                        )
        return errors

    def _get_entry_details(self, ctx, url):
        """Given a url, return if the entry is a directory or a file.

        :returns: A tuple ``(is_directory, size, checksum)``. Size and
            checksum are None for directories. The checksum is ``"UNKNOWN"``
            when gfal2 fails to compute it.
        """
        import stat

        st = ctx.stat(url)

        # A bare ``st_mode & 0o40000`` test would also match block devices
        # and sockets, whose type codes include that bit.
        if stat.S_ISDIR(st.st_mode):
            return True, None, None

        try:
            checksum = ctx.checksum(url, "ADLER32")
        except gfal2.GError:
            checksum = "UNKNOWN"

        return False, st.st_size, checksum

    def fix(self, release):
        """Add the size and checksum to the files.

        :returns: A list of messages for the files that could not be inspected.
        """
        ctx = gfal2.creat_context()
        errors = []
        for record in release.records:
            if "files" not in record:
                continue
            for file in record["files"]:
                if "checksum" in file and "size" in file:
                    continue
                if "uri" not in file:
                    # Nothing to query; report instead of failing with KeyError.
                    errors.append(
                        f"Errors getting the metadata of {file}: missing 'uri'"
                    )
                    continue
                try:
                    _, size, checksum = self._get_entry_details(
                        ctx, file["uri"].replace("root://", "https://")
                    )
                except Exception as e:
                    errors.append(f"Errors getting the metadata of {file['uri']}: {e}")
                    continue
                if "checksum" not in file:
                    file["checksum"] = checksum
                if "size" not in file:
                    file["size"] = size
        return errors


# ---- validations/metadata.py ----
"""Validation process."""

from datetime import datetime

from flask_login import current_user

from .expected_fields import ExpectedFieldsValidation


class ValidMetadata(ExpectedFieldsValidation):
    """Check some common fields of the records."""

    abstract = False

    name = "Valid standard fields"
    error_message = "Some records are missing some of the standard fields"
    expected_fields = {
        "license.attribution": "CC0-1.0",
        "publisher": "CERN Open Data Portal",
        # Computed on every validation: a plain string would freeze the year
        # at import time.
        "date_published": lambda release, record=None: f"{datetime.now().year}",
    }


# ---- validations/recid.py ----
"""Validation process."""

from invenio_db import db
from invenio_pidstore.models import PersistentIdentifier
from sqlalchemy.sql import func

from ..models import ReleaseMetadata, ReleaseStatus
from .base import Validation


class ValidRecid(Validation):
    """Check the record ids."""

    name = "valid recid"
    error_message = "The records should have record id that do not exist."

    def validate(self, release):
        """Check that all the entries have recid, and that they are not registered yet."""
        errors = []
        for i, entry in enumerate(release.records):
            if not entry.get("recid"):
                errors.append(f"Entry {i + 1}: Missing or empty required field 'recid'")

        # If the release is not STAGED, check for duplicates
        if release.status != ReleaseStatus["STAGED"].value:
            used = self._duplicate_pids(release)
            if used:
                errors.append(f"RECIDs already registered: {', '.join(used)}")

        return errors

    def _duplicate_pids(self, release):
        """Return the recids of the release that are already registered PIDs."""
        # Compare as strings, like the pid values that are joined as text above.
        recids = [str(r["recid"]) for r in release.records if r.get("recid")]
        if not recids:
            return []
        existing_pid = PersistentIdentifier.query.filter(
            PersistentIdentifier.pid_type == "recid",
            PersistentIdentifier.pid_value.in_(recids),
            PersistentIdentifier.status == "R",
        ).all()
        return [pid.pid_value for pid in existing_pid]

    def next_recid_start(self, release):
        """Find the next available recid for the experiment of the release."""
        # Aggregate over the column: passing ``release.max_recid`` would take
        # the maximum of a constant instead of looking at the stored releases.
        max_value = (
            db.session.query(func.max(ReleaseMetadata.max_recid))
            .filter(ReleaseMetadata.experiment == release.experiment)
            .scalar()
        )
        return (max_value or 0) + 1

    def fix(self, release):
        """Assign RECIDs to the records without one, or with one already registered."""
        next_id = self.next_recid_start(release)
        duplicates = set(self._duplicate_pids(release))
        last_assigned = None

        for record in release.records:
            recid = record.get("recid")
            # An empty recid is reported by validate(), so replace it as well.
            if not recid or str(recid) in duplicates:
                record["recid"] = f"{release.experiment}-{next_id}"
                last_assigned = next_id
                next_id += 1

        # Remember the highest number handed out, if any.
        if last_assigned is not None:
            release.max_recid = last_assigned
        return []


# ---- validations/schema.py ----
"""Validation process."""

import json

from flask import current_app
from jsonschema import Draft4Validator

from .base import Validation


class ValidRecordSchema(Validation):
    """Check that the record validates the json schema."""

    name = "Valid record schema"
    error_message = "The records do not comply with the json schema."

    def validate(self, release):
        """Validate all records against record-v1.0.0.json.

        :returns: A list of messages, one per schema violation, or a single
            message if the schema cannot be loaded or applied.
        """
        if not isinstance(release.records, list):
            return ["The field 'records' is not a list"]

        schema_name = "records/record-v1.0.0.json"
        errors = []
        try:
            schema_dir = current_app.extensions["invenio-jsonschemas"].get_schema_dir(
                schema_name
            )
            with open(f"{schema_dir}/{schema_name}", encoding="utf-8") as f:
                schema = json.load(f)
            validator = Draft4Validator(schema)

            for i, record in enumerate(release.records):
                if not isinstance(record, dict):
                    errors.append(f"Record {i} is not an object")
                    continue
                for error in validator.iter_errors(record):
                    path = ".".join(str(p) for p in error.path)

                    errors.append(f"Record {i} -> {path}: {error.message}")
        except Exception as e:
            return [f"Could not validate the schema {e}"]

        return errors
+# +# CERN Open Data Portal is free software; you can redistribute it +# and/or modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# CERN Open Data Portal is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with CERN Open Data Portal; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307, USA. +# +# In applying this license, CERN does not +# waive the privileges and immunities granted to it by virtue of its status +# as an Intergovernmental Organization or submit itself to any jurisdiction. + +"""CERN Open Data Release views.""" + +import json +from datetime import datetime + +import requests +from flask import Blueprint, abort, flash, jsonify, redirect, render_template, request +from flask_login import current_user, login_required + +from .api import Release +from .models import ReleaseStatus +from .utils import curator_experiments + +blueprint = Blueprint( + "cernopendata_curate", + __name__, + template_folder="templates", + static_folder="static", +) + + +@blueprint.route("/releases/api/list/", methods=["GET"]) +@login_required +def list_releases(experiment=None): + """Return list of releases.""" + if not Release.validate_experiment(experiment): + abort(404) + if experiment not in curator_experiments()["curator_experiments"]: + abort(403) + releases = Release.list_releases(experiment) + + # Serialize + def serialize_last_update(event): + return { + "status": event.status, + "timestamp": event.timestamp.isoformat(), + "user": event.user.email if event.user else None, + } + + return jsonify( + [ + { + "id": r.id, + "name": 
# NOTE(review): the URL placeholder is reconstructed from the view signature;
# confirm it (and any converter) against the original route definition.
@blueprint.route("/releases/<experiment>", methods=["POST"])
@login_required
def release_upload(experiment):
    """Upload a new release into the system.

    The records come either from an uploaded ``.json`` file (form field
    ``source=file``) or from a JSON document fetched from a URL
    (``source=url``).  A single JSON object is wrapped into a one-element
    list; when it has a ``metadata`` key, that sub-object is used as the
    record after dropping the file/bucket bookkeeping fields.

    :param experiment: experiment the release belongs to.
    :returns: the rendered releases page, or a 400 response when the input
        is missing or is not valid JSON.
    """
    if not Release.validate_experiment(experiment):
        abort(404)

    if experiment not in curator_experiments()["curator_experiments"]:
        abort(403)
    source = request.form.get("source")

    if source == "file":
        file = request.files.get("file")
        # The field may be absent, or present with an empty filename.
        if file is None or not file.filename:
            return jsonify({"error": "No file provided"}), 400
        if not file.filename.endswith(".json"):
            return jsonify({"error": "Only JSON files are allowed"}), 400

        try:
            payload = json.load(file)
        except ValueError as e:
            # Covers both JSON syntax errors and undecodable bytes.
            return jsonify({"error": "Invalid JSON", "details": str(e)}), 400
        release_name = file.filename.rsplit("/", 1)[-1]

    elif source == "url":
        url = request.form.get("url")
        if not url:
            abort(400, "Missing URL")
        if not url.startswith(("http://", "https://")):
            abort(400, "Invalid URL")

        # NOTE(review): the server fetches a user-supplied URL; consider
        # restricting the allowed hosts to limit server-side request forgery.
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            payload = resp.json()
        except (requests.RequestException, ValueError) as e:
            abort(400, f"Could not retrieve JSON from URL: {e}")
        release_name = url.rsplit("/", 1)[-1]

    else:
        abort(400, "Invalid source")

    if isinstance(payload, dict):
        # In case we are reading from the cernopendata api, where the record
        # is in the 'metadata' field
        if "metadata" in payload:
            for field in "_files", "_bucket", "bucket", "_file_indices":
                if field in payload["metadata"]:
                    del payload["metadata"][field]
            payload = [payload["metadata"]]
        else:
            payload = [payload]
    elif not isinstance(payload, list):
        abort(400, "The release must be a JSON object or a list of objects")

    release = Release.create(
        records=payload,
        experiment=experiment,
        current_user=current_user,
        name=release_name,
    )

    flash(f"Release {release._metadata.id} created.", "success")

    return render_template(
        "cernopendata_pages/releases.html", experiment=experiment.upper()
    )
def _get_release(experiment, release_id, lock=False, status=None):
    """Return the release identified by ``experiment`` and ``release_id``.

    Aborts with 404 for an unknown experiment or release, 403 when the
    experiment is not among the current user's curator experiments, and 409
    when the release is not in the requested ``status`` or the requested
    ``lock`` cannot be taken.

    :param experiment: experiment name taken from the URL.
    :param release_id: identifier of the release inside the experiment.
    :param lock: falsy for no locking, otherwise the status passed to
        ``release.lock`` as the lock target.
    :param status: optional status requirement forwarded to
        ``release.is_status``.
    :returns: the release object.
    """
    if not Release.validate_experiment(experiment):
        abort(404)

    allowed_experiments = curator_experiments()["curator_experiments"]
    if experiment not in allowed_experiments:
        abort(403)

    release = Release.get(experiment, release_id)
    if not release:
        abort(404)

    if status:
        if not release.is_status(status):
            abort(409)

    if lock:
        current_status = ReleaseStatus(release.status)
        if not release.lock(current_status, lock, current_user):
            abort(409)

    return release
# NOTE(review): the URL placeholders are reconstructed from the view signature;
# confirm them (and any converter, e.g. ``int``) against the original route.
@blueprint.route(
    "/releases/<experiment>/<release_id>/bulk_records/preview",
    methods=["POST"],
)
@login_required
def bulk_edit_records_preview(experiment, release_id):
    """Preview the changes that a bulk action would do.

    Expects a JSON object body with an optional ``updates`` entry, which is
    forwarded to ``release.bulk_preview``.

    :param experiment: experiment name taken from the URL.
    :param release_id: identifier of the release inside the experiment.
    :returns: dict with the total number of records, the number of records
        that would change and the diffs themselves.
    """
    # ``silent=True`` plus the type check turns a missing, malformed or
    # non-object body into a 400 instead of an AttributeError on ``.get``.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        abort(400, "Invalid JSON payload")
    updates = payload.get("updates", {})

    release = _get_release(experiment, release_id)
    diffs = release.bulk_preview(updates)
    return {
        "total_records": len(release.records),
        "diffed_records": len(diffs),
        "diffs": diffs,
    }
"/releases///validations//enable", + methods=["POST"], +) +@login_required +def enable_validation(experiment, release_id, validation_id): + """Enables a validation.""" + release = _get_release( + experiment, + release_id, + status=[ReleaseStatus.DRAFT, ReleaseStatus.READY], + ) + data = request.get_json() or {} + enabled = data.get("enabled") + + release.enable_validation(validation_id, enabled, current_user) + + return {"success": True}, 200 + # return redirect(f"/releases/{experiment}/{release_id}") diff --git a/cernopendata/modules/search/component_templates/os-v2/opendata-common-v1.0.0.json b/cernopendata/modules/search/component_templates/os-v2/opendata-common-v1.0.0.json index 5d3617f..8e3b868 100644 --- a/cernopendata/modules/search/component_templates/os-v2/opendata-common-v1.0.0.json +++ b/cernopendata/modules/search/component_templates/os-v2/opendata-common-v1.0.0.json @@ -38,8 +38,7 @@ "type": "percolator" }, "recid": { - "null_value": 0, - "type": "integer" + "type": "keyword" }, "title": { "fields": { diff --git a/cernopendata/modules/theme/assets/semantic-ui/js/curate/CurateApp.js b/cernopendata/modules/theme/assets/semantic-ui/js/curate/CurateApp.js new file mode 100644 index 0000000..d0c6aaf --- /dev/null +++ b/cernopendata/modules/theme/assets/semantic-ui/js/curate/CurateApp.js @@ -0,0 +1,96 @@ +import $ from "jquery"; + +import { + AccordionField, + CustomFields, + FieldLabel, + RemoteSelectField, + SelectField, + TextField, + TextAreaField, + AffiliationsSuggestions, +} from "react-invenio-forms"; + + + + +import React from "react"; +import ReactDOM from "react-dom"; + +import ReleasesTable from "./ReleasesTable"; +import RecordsTable from "./RecordsTable"; +import ValidationToggle from "./ValidationToggle"; + +const container = document.getElementById("releases-react-root"); + +if (container) { + const experiment = container.dataset.experiment || null; + ReactDOM.render( + , + container + ); + + $('.ui.checkbox').checkbox(); + + 
$('#open-create-release').on('click', () => { + $('#create-release-modal').modal('show'); + }); + + function updateSource() { + const source = $('input[name="source"]:checked').val(); + + if (source === 'file') { + $('#file-source').show(); + $('#url-source').hide(); + } else { + $('#file-source').hide(); + $('#url-source').show(); + } + + validateForm(); + } + + function validateForm() { + const source = $('input[name="source"]:checked').val(); + let valid = false; + + if (source === 'file') { + valid = $('input[name="file"]').val() !== ''; + } else { + valid = $('input[name="url"]').val().trim() !== ''; + } + + $('#uploadButton').toggleClass('disabled', !valid); + } + + $('input[name="source"]').on('change', updateSource); + $('input[name="file"], input[name="url"]').on('input change', validateForm); + + updateSource(); + } + + const records_table = document.getElementById('records-table-root'); + if (records_table) { + ReactDOM.render( + , + records_table + ); + } + +document.querySelectorAll(".validation-toggle-root").forEach((el) => { + const validation = JSON.parse(el.dataset.validation); + + ReactDOM.render( + window.location.reload()} + />, + el + ); +}); \ No newline at end of file diff --git a/cernopendata/modules/theme/assets/semantic-ui/js/curate/RecordsTable.js b/cernopendata/modules/theme/assets/semantic-ui/js/curate/RecordsTable.js new file mode 100644 index 0000000..6fd4e91 --- /dev/null +++ b/cernopendata/modules/theme/assets/semantic-ui/js/curate/RecordsTable.js @@ -0,0 +1,411 @@ +import React, { useEffect, useState, useRef } from "react"; +import { Icon, Table, Loader, Button, Pagination, Modal, Form, TextArea } from "semantic-ui-react"; +import $ from 'jquery'; + +export default function RecordsTable({ + experiment, releaseId, + initialRecords, editDisabled = false, + viewDisabled = false,}) { + const [records, setRecords] = useState(initialRecords); + const [editingRecord, setEditingRecord] = useState(null); + const [editAllMode, 
setEditAllMode] = useState(false); + + +const tableRef = useRef(null); + const [page, setPage] = useState(0); + + const pageSize = 5; + const visible = records.slice(page * pageSize, (page + 1) * pageSize); + + const [bulkModalOpen, setBulkModalOpen] = useState(false); + const [bulkActions, setBulkActions] = useState([]); // each action: { mode, key, value } + const [bulkPreview, setBulkPreview] = useState([]); // preview diffs + const [bulkPreviewDone, setBulkPreviewDone] = useState(false); + +function typesetMath() { + if (window.MathJax && window.MathJax.Hub) { + + window.MathJax.Hub.Queue(['Typeset', window.MathJax.Hub, tableRef.current]); + } +} + + useEffect(() => { + typesetMath(); + }, [page, records]); + + const openEditModal = record => { + // reuse your existing global logic + window.editingRecordId = record.recid; + + $('#records-json-textarea').val( + JSON.stringify(record, null, 2) + ); + + $('#edit-records-modal').modal('show'); + }; + + function addBulkRow() { + setBulkActions([...bulkActions, { mode: 'set', key: '', value: '' }]); + setBulkPreviewDone(false); + } + + function removeBulkRow(idx) { + const updated = bulkActions.filter((_, i) => i !== idx); + setBulkActions(updated); + setBulkPreviewDone(false); + } + + function updateBulkRow(idx, field, newValue) { + const updated = [...bulkActions]; + updated[idx][field] = newValue; + setBulkActions(updated); + setBulkPreviewDone(false); + } + + async function previewBulk() { + if (bulkActions.length === 0) return; + + let updates; + try { + updates = collectBulkOperations(); + } catch (e) { + alert(e.message); + return; + } + + const res = await fetch(`/releases/${experiment}/${releaseId}/bulk_records/preview`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ updates }) + }); + + if (!res.ok) { + alert('Preview failed'); + return; + } + + const data = await res.json(); + setBulkPreview(data.diffs || []); + setBulkPreviewDone(true); + } + + async 
function applyBulk() { + if (!bulkPreviewDone) return; + + let updates; + try { + updates = collectBulkOperations(); + } catch (e) { + alert(e.message); + return; + } + + const res = await fetch(`/releases/${experiment}/${releaseId}/bulk_records/apply`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ updates }) + }); + + if (!res.ok) { + alert('Apply failed'); + return; + } + + window.location.reload(); + } + + function collectBulkOperations() { + const setOps = {}; + const deleteOps = []; + + bulkActions.forEach(({ mode, key, value }) => { + if (!key.trim()) return; + + if (mode === 'delete') { + deleteOps.push(key.trim()); + return; + } + + if (!value.trim()) throw new Error(`Missing value for key "${key}"`); + + try { + setOps[key.trim()] = JSON.parse(value); + } catch (e) { + throw new Error(`Invalid JSON for key "${key}"`); + } + }); + + return { set: setOps, delete: deleteOps }; + } + +function DiffObject({ diff }) { + return ( +
+ {diff.values_changed && + Object.entries(diff.values_changed).map(([path, change], i) => ( +
+ 🟑 {path}
+ - {JSON.stringify(change.old_value)}
+ + {JSON.stringify(change.new_value)} +
+ )) + } + {diff.type_changes && + Object.entries(diff.type_changes).map(([path, change], i) => ( +
+ 🟠 {path}
+ - {JSON.stringify(change.old_value)} ({change.old_type})
+ + {JSON.stringify(change.new_value)} ({change.new_type}) +
+ )) + } + {diff.dictionary_item_added && + diff.dictionary_item_added.map((path, i) => ( +
🟒 {path} added
+ )) + } + {diff.dictionary_item_removed && + diff.dictionary_item_removed.map((path, i) => ( +
πŸ”΄ {path} removed
+ )) + } +
+ ); +} + + return ( + <> + + +
+ + + + RecId + DOI + Title + + Actions + + + + + {visible.map(record => ( + + {record.recid} + {record.doi} + {record.title || 'β€”'} + + + + + + ))} + +
+
+ setPage(d.activePage - 1)} + /> + + { + setEditingRecord(null); + setEditAllMode(false); + }} + closeIcon + size="fullscreen" + > + {editAllMode + ? `Edit all records` + : `Edit record ${editingRecord?.recid}`} + +
+ + + + `); + invalidatePreview(); + }); + + tableBody.on('click', '.remove-row', function () { + $(this).closest('tr').remove(); + invalidatePreview(); + }); + + tableBody.on('input', '.bulk-key, .bulk-value', invalidatePreview); + + /* ------------------------ + * Preview + * ------------------------ */ + + $('#bulk-preview-btn').on('click', async function () { + if ($(this).hasClass('disabled')) return; + + let updates; + try { + updates = collectBulkOperations(); + } catch (e) { + alert(e.message); + return; + } + + const res = await fetch( + `/releases/{{ experiment }}/{{ release._metadata.id }}/bulk_records/preview`, + { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ updates }) + } + ); + + if (!res.ok) { + alert('Preview failed'); + return; + } + + const data = await res.json(); + renderDiffs(data.diffs); + + previewBox.show(); + bulkPreviewDone = true; + updateButtons(); + }); + + /* ------------------------ + * Apply + * ------------------------ */ + + $('#bulk-apply-btn').on('click', async function () { + if ($(this).hasClass('disabled')) return; + + let updates; + try { + updates = collectBulkOperations(); + } catch (e) { + alert(e.message); + return; + } + + const res = await fetch( + `/releases/{{ experiment }}/{{ release._metadata.id }}/bulk_records/apply`, + { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ updates }) + } + ); + + if (!res.ok) { + alert('Apply failed'); + return; + } + + location.reload(); + }); + $('#bulk-edit-table').on('change', '.bulk-mode', function () { + const row = $(this).closest('tr'); + const value = row.find('.bulk-value'); + + if (this.value === 'delete') { + value.prop('disabled', true) + .addClass('disabled') + .val(''); + } else { + value.prop('disabled', false) + .removeClass('disabled'); + } + + invalidatePreview(); + }); + + })(); + $('.ui.checkbox').checkbox(); + + const form = 
document.getElementById('primary-action-form'); + const checkbox = document.getElementById('generate-doi-checkbox'); + + if (form && checkbox) { + form.addEventListener('submit', function (e) { + if (!checkbox.checked) { + e.preventDefault(); + + $('#primary-action-btn') + .popup({ + content: 'You can generate DOIs for all entries before publishing.', + position: 'top center', + on: 'manual' + }) + .popup('show'); + + // Hide popup automatically after a moment + setTimeout(() => { + $('#primary-action-btn').popup('hide'); + }, 3000); + } + }); + } + +{%- endblock %} \ No newline at end of file diff --git a/cernopendata/templates/cernopendata_pages/release_details/_body.html b/cernopendata/templates/cernopendata_pages/release_details/_body.html new file mode 100644 index 0000000..7115319 --- /dev/null +++ b/cernopendata/templates/cernopendata_pages/release_details/_body.html @@ -0,0 +1,127 @@ +{% if release.status == 'STAGED' %} +
+ + This release is staged and read-only. To modify it, go back to DRAFT + + + +
+{% elif release.status == 'EDITING' %} +
+ The release is blocked because there are some changes being applied. If this message stays here for a long time, there might have been some issues staging the entries :( +
+ +{% elif release.status == 'STAGING' %} +
+ The release is being staged. If this message stays here for a long time, there might have been some issues staging the entries :( +
+{% elif release.status == 'PUBLISHED' %} +
+ + This release is published and locked. +
+{% endif %} + +{% if release.status in ['DRAFT', 'READY', 'EDITING'] %} + +{% set ns = namespace(automatic_fix=False) %} + +
+ {% set failed = release.validations | selectattr('enabled') | rejectattr('status') | sort(attribute='name')| list %} + {% set optional = release.validations | selectattr('optional') | rejectattr('enabled') | sort(attribute='name') | list %} + {% set success = release.validations | selectattr('enabled') | selectattr('status') | sort(attribute='name')| list %} + + {% for validation in failed + optional + success %} +
+
+ {{ validation.name }} +
+ +
+ {% if validation.enabled %} + + {% else %} + + {% endif %} +
+ +
+ {% if validation.enabled and not validation.status %} +
+ {{ validation.error_message }} +
+ {% endif %} +
+ +
+ + {% if validation.optional %} + +
+
+ {% endif %} + {% if validation.enabled and validation.fixable and not validation.status %} + + {% set ns.automatic_fix = True %} + Can be fixed automatically + {% endif %} + +
+ +
+ {% endfor %} + {% if ns.automatic_fix %} +
+
+

+ We can automatically fix some of the issues. +

+ +
+ +
+
+
+ {% else %} +
+ + All automatic validations are satisfied. +
+ {% endif %} +
+ {% if release._metadata.errors %} +
+
Showing {{ release._metadata.errors|length }} of {{ release._metadata.num_errors }} + errors: +
+ {% for item in release._metadata.errors %} +
  • + {{ item }}: +
  • + {% endfor %} +
    + {% endif %} +
    +
    + +{% endif %} + +{% set edit_disabled = release.status not in ['DRAFT', 'READY'] %} +{% set view_disabled = release.status not in ['STAGED', 'PUBLISHED'] %} + +
    diff --git a/cernopendata/templates/cernopendata_pages/release_details/_navigation.html b/cernopendata/templates/cernopendata_pages/release_details/_navigation.html new file mode 100644 index 0000000..9c76783 --- /dev/null +++ b/cernopendata/templates/cernopendata_pages/release_details/_navigation.html @@ -0,0 +1,75 @@ +{% set steps = ['DRAFT', 'STAGED', 'PUBLISHED'] %} + +{% set status_to_step = { +'DRAFT': 'DRAFT', +'EDITING': 'DRAFT', +'READY': 'DRAFT', +'STAGING': 'DRAFT', +'STAGED': 'STAGED', +'PUBLISHED': 'PUBLISHED' +} %} + +{% set current_step = status_to_step[release.status] %} +{% set current_index = steps.index(current_step) %} + +
    + {% for step in steps %} + {% set i = loop.index0 %} +
    +
    +
    {{ step }} {%if step == "DRAFT" and release.status!="DRAFT" %} ({{release.status}}) {%endif%}
    +
    +
    + {% endfor %} +
    +{% set transitions = { +'DRAFT': 'stage', +'STAGED': 'publish', +'PUBLISHED': None +} %} + +{% set next_action = transitions[release.status] %} + +{% set next_tooltip = { +'DRAFT': 'This release has unresolved issues.', +'STAGED': 'You are about to publish this release publicly. Please review carefully before proceeding.', +'EDITING': 'The release is being edited at the moment.', +'STAGING': 'The release is being staged.' +}[release.status] %} + +
    + + {# Extra options only when publishing #} + {% if release.status == 'STAGED' %} +
    + + +
    + {% endif %} + + + {# Next button #} +
    + +
    +
    + diff --git a/cernopendata/templates/cernopendata_pages/release_details/_summary.html b/cernopendata/templates/cernopendata_pages/release_details/_summary.html new file mode 100644 index 0000000..907ebba --- /dev/null +++ b/cernopendata/templates/cernopendata_pages/release_details/_summary.html @@ -0,0 +1,55 @@ +

    + Release #{{ release._metadata.id }}

    +

    {{ release._metadata.name }} +

    +{% set stats = [ +{"label": "Records", "value": release._metadata.num_records}, +{"label": "File indices", "value": release._metadata.num_file_indices}, +{"label": "Files", "value": release._metadata.num_files}, +{"label": "Documents", "value": release._metadata.num_docs}, +] %} + +
    + {% for stat in stats %} +
    +
    {{ stat.value }}
    +
    {{ stat.label }}
    +
    + {% endfor %} +
    +
    + + Timeline: +
    + + {% set events = release._metadata.history_events %} + + {% if events|length > 8 %} + {% set display_events = events[:3] + events[-5:] %} + {% else %} + {% set display_events = events %} + {% endif %} + + {% for event in display_events %} + + {% if events|length > 8 and loop.index == 4 %} +
    +
    ...
    +
    + {% endif %} + +
    +
    + {{ event.status|capitalize }} +
    + + {% if not loop.last %} + + {% endif %} +
    + + {% endfor %} +
    +
    diff --git a/cernopendata/templates/cernopendata_pages/releases.html b/cernopendata/templates/cernopendata_pages/releases.html new file mode 100644 index 0000000..f15e45f --- /dev/null +++ b/cernopendata/templates/cernopendata_pages/releases.html @@ -0,0 +1,106 @@ +{%- extends config.BASE_TEMPLATE %} + + +{%- block page_header %} +{% include 'cernopendata_theme/header_index.html' %} +{%- endblock page_header %} + + +{%- block page_body %} +
    +
    + +

    + Welcome to the curation page +
    Experiment: {{ experiment|upper }}
    +

    + +
    +

    + This page allows you to register new records in the repository. +
    + You will only be able to register records for + {{ experiment }}. +

    + +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    + +
    +
    +{%- endblock %} + +{% block javascript %} + {{ super() }} + {{ webpack['cernopendata_curate.js'] }} +{%- endblock %} \ No newline at end of file diff --git a/cernopendata/templates/cernopendata_records_ui/records/record_detail.html b/cernopendata/templates/cernopendata_records_ui/records/record_detail.html index 4bf5989..8417752 100644 --- a/cernopendata/templates/cernopendata_records_ui/records/record_detail.html +++ b/cernopendata/templates/cernopendata_records_ui/records/record_detail.html @@ -1,6 +1,9 @@ {%- extends "cernopendata_records_ui/records/detail.html" %} {% block heading %} + {% if record.prerelease %} +
    WARNING: This record is part of a release that has not been published yet. Only authorized people can see the record.
    + {% endif %}

    {{ record.title_additional or record.title }}

    diff --git a/cernopendata/templates/cernopendata_theme/header.html b/cernopendata/templates/cernopendata_theme/header.html index df796dc..74a24eb 100644 --- a/cernopendata/templates/cernopendata_theme/header.html +++ b/cernopendata/templates/cernopendata_theme/header.html @@ -6,6 +6,25 @@