diff --git a/components/renku_data_services/data_connectors/api.spec.yaml b/components/renku_data_services/data_connectors/api.spec.yaml index e3367c836..6c9817592 100644 --- a/components/renku_data_services/data_connectors/api.spec.yaml +++ b/components/renku_data_services/data_connectors/api.spec.yaml @@ -96,6 +96,33 @@ paths: $ref: "#/components/responses/Error" tags: - data_connectors + /data_connectors/search: + get: + summary: Get data connector details by DOI + parameters: + - in: query + name: doi + required: true + schema: + $ref: "#/components/schemas/DOI" + description: The DOI of the data connector + responses: + "200": + description: The data connector + content: + application/json: + schema: + $ref: "#/components/schemas/DataConnector" + "404": + description: The data connector with the given DOI does not exist or user does not have access to it + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + default: + $ref: "#/components/responses/Error" + tags: + - data_connectors /data_connectors/{data_connector_id}: parameters: - in: path @@ -289,6 +316,14 @@ paths: description: the ID of the data connector get: summary: Get all links from a given data connector to projects + parameters: + - in: query + description: query parameters + name: params + style: form + explode: true + schema: + $ref: "#/components/schemas/PaginationRequest" responses: "200": description: List of data connector to project links @@ -296,6 +331,27 @@ paths: application/json: schema: $ref: "#/components/schemas/DataConnectorToProjectLinksList" + headers: + page: + description: The index of the current page (starting at 1). + required: true + schema: + type: integer + per-page: + description: The number of items per page. + required: true + schema: + type: integer + total: + description: The total number of items. + required: true + schema: + type: integer + total-pages: + description: The total number of pages. + required: true + schema: + type: integer default: $ref: "#/components/responses/Error" tags: @@ -639,6 +695,10 @@ components: type: array items: $ref: "#/components/schemas/DataConnectorToProjectLink" + ProjectPath: + description: The path to the project page + type: string + example: "namespace/project-slug" DataConnectorToProjectLink: description: A link from a data connector to a project in Renku 2.0 type: object @@ -650,6 +710,8 @@ components: $ref: "#/components/schemas/Ulid" project_id: $ref: "#/components/schemas/Ulid" + project_path: + $ref: "#/components/schemas/ProjectPath" creation_date: $ref: "#/components/schemas/CreationDate" created_by: @@ -658,6 +720,7 @@ components: - id - data_connector_id - project_id + - project_path - creation_date - created_by DataConnectorToProjectLinkPost: @@ -961,7 +1024,6 @@ components: type: integer minimum: 0 description: The number of data links the user does not have access to - responses: Error: description: The schema for all 4xx and 5xx responses diff --git a/components/renku_data_services/data_connectors/apispec.py b/components/renku_data_services/data_connectors/apispec.py index c5d5303ba..1deedf159 100644 --- a/components/renku_data_services/data_connectors/apispec.py +++ b/components/renku_data_services/data_connectors/apispec.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: api.spec.yaml -# timestamp: 2025-12-03T09:49:11+00:00 +# timestamp: 2026-01-27T08:42:00+00:00 from __future__ import annotations @@ -129,6 +129,14 @@ class InaccessibleDataConnectorLinks(BaseAPISpec): ) +class DataConnectorsSearchGetParametersQuery(BaseAPISpec): + doi: str = Field(..., description="A DOI.", examples=["10.16904/envidat.33"]) + + +class DataConnectorsDataConnectorIdProjectLinksGetParametersQuery(BaseAPISpec): + params: Optional[PaginationRequest] = None + + class CloudStorageCore(BaseAPISpec): model_config = ConfigDict( extra="forbid", @@ -241,6 +249,11 @@ class DataConnectorToProjectLink(BaseAPISpec): min_length=26, pattern="^[0-7][0-9A-HJKMNP-TV-Z]{25}$", ) + project_path: str = Field( + ..., + description="The path to the project page", + examples=["namespace/project-slug"], + ) creation_date: datetime = Field( ..., description="The date and time the resource was created (in UTC and ISO-8601 format)", diff --git a/components/renku_data_services/data_connectors/blueprints.py b/components/renku_data_services/data_connectors/blueprints.py index bfe1a200e..cd4a0adec 100644 --- a/components/renku_data_services/data_connectors/blueprints.py +++ b/components/renku_data_services/data_connectors/blueprints.py @@ -308,20 +308,43 @@ async def _get_permissions(_: Request, user: base_models.APIUser, data_connector return "/data_connectors//permissions", ["GET"], _get_permissions + def get_one_by_doi(self) -> BlueprintFactoryResponse: + """Get data connector by DOI.""" + + @authenticate(self.authenticator) + @validate(query=apispec.DataConnectorsSearchGetParametersQuery) + async def _get_one_by_doi( + _: Request, + user: base_models.APIUser, + query: apispec.DataConnectorsSearchGetParametersQuery, + validator: RCloneValidator, + ) -> JSONResponse: + data_connector = await self.data_connector_repo.get_data_connector_by_doi(user=user, doi=query.doi) + return validated_json( + apispec.DataConnector, + self._dump_data_connector(data_connector, validator=validator), + ) + + return "/data_connectors/search", ["GET"], _get_one_by_doi + def get_all_project_links(self) -> BlueprintFactoryResponse: """List all links from a given data connector to projects.""" @authenticate(self.authenticator) + @paginate async def _get_all_project_links( _: Request, user: base_models.APIUser, data_connector_id: ULID, - ) -> JSONResponse: - links = await self.data_connector_repo.get_links_from(user=user, data_connector_id=data_connector_id) - return validated_json( - apispec.DataConnectorToProjectLinksList, - [self._dump_data_connector_to_project_link(link) for link in links], + pagination: PaginationRequest, + ) -> tuple[list[dict[str, Any]], int]: + links, total_num = await self.data_connector_repo.get_links_from( + user=user, data_connector_id=data_connector_id, pagination=pagination ) + return [ + validate_and_dump(apispec.DataConnectorToProjectLink, self._dump_data_connector_to_project_link(link)) + for link in links + ], total_num return "/data_connectors//project_links", ["GET"], _get_all_project_links @@ -529,6 +552,7 @@ def _dump_data_connector_to_project_link(link: models.DataConnectorToProjectLink id=str(link.id), data_connector_id=str(link.data_connector_id), project_id=str(link.project_id), + project_path=link.project_path, creation_date=link.creation_date, created_by=link.created_by, ) diff --git a/components/renku_data_services/data_connectors/db.py b/components/renku_data_services/data_connectors/db.py index 0ca942904..6585a5fed 100644 --- a/components/renku_data_services/data_connectors/db.py +++ b/components/renku_data_services/data_connectors/db.py @@ -195,6 +195,33 @@ async def get_data_connector_by_slug( return data_connector.dump() + async def get_data_connector_by_doi( + self, + user: base_models.APIUser, + doi: str, + ) -> models.DataConnector | models.GlobalDataConnector: + """Get a data connector from the database by DOI.""" + not_found_msg = f"Data connector with DOI '{doi}' does not exist or you do not have access to it." + + async with self.session_maker() as session: + stmt = select(schemas.DataConnectorORM).where(schemas.DataConnectorORM.doi == doi) + result = await session.scalars(stmt) + data_connector = result.one_or_none() + + if data_connector is None: + raise errors.MissingResourceError(message=not_found_msg) + + authorized = await self.authz.has_permission( + user=user, + resource_type=ResourceType.data_connector, + resource_id=data_connector.id, + scope=Scope.READ, + ) + if not authorized: + raise errors.MissingResourceError(message=not_found_msg) + + return data_connector.dump() + async def get_global_data_connector_by_slug( self, user: base_models.APIUser, @@ -612,8 +639,8 @@ async def get_data_connector_permissions( return permissions async def get_links_from( - self, user: base_models.APIUser, data_connector_id: ULID - ) -> list[models.DataConnectorToProjectLink]: + self, user: base_models.APIUser, data_connector_id: ULID, pagination: PaginationRequest + ) -> tuple[list[models.DataConnectorToProjectLink], int]: """Get links from a given data connector.""" authorized = await self.authz.has_permission(user, ResourceType.data_connector, data_connector_id, Scope.READ) if not authorized: @@ -628,10 +655,20 @@ async def get_links_from( select(schemas.DataConnectorToProjectLinkORM) .where(schemas.DataConnectorToProjectLinkORM.data_connector_id == data_connector_id) .where(schemas.DataConnectorToProjectLinkORM.project_id.in_(project_ids)) + .limit(pagination.per_page) + .offset(pagination.offset) + .order_by(schemas.DataConnectorToProjectLinkORM.id.desc()) + ) + stmt_count = ( + select(func.count()) + .select_from(schemas.DataConnectorToProjectLinkORM) + .where(schemas.DataConnectorToProjectLinkORM.data_connector_id == data_connector_id) + .where(schemas.DataConnectorToProjectLinkORM.project_id.in_(project_ids)) ) result = await session.scalars(stmt) links_orm = result.all() - return [link.dump() for link in links_orm] + total_elements = await session.scalar(stmt_count) or 0 + return [link.dump() for link in links_orm], total_elements async def get_links_to( self, user: base_models.APIUser, project_id: ULID diff --git a/components/renku_data_services/data_connectors/models.py b/components/renku_data_services/data_connectors/models.py index 7f212e0fd..3ec37775b 100644 --- a/components/renku_data_services/data_connectors/models.py +++ b/components/renku_data_services/data_connectors/models.py @@ -174,6 +174,7 @@ class DataConnectorToProjectLink(UnsavedDataConnectorToProjectLink): """A link from a data connector to a project.""" id: ULID + project_path: str created_by: str creation_date: datetime updated_at: datetime diff --git a/components/renku_data_services/data_connectors/orm.py b/components/renku_data_services/data_connectors/orm.py index ed7581f9d..7615e6400 100644 --- a/components/renku_data_services/data_connectors/orm.py +++ b/components/renku_data_services/data_connectors/orm.py @@ -193,12 +193,18 @@ class DataConnectorToProjectLinkORM(BaseORM): nullable=False, ) + project: Mapped["ProjectORM"] = relationship(init=False, repr=False, viewonly=True, lazy="joined") + """The project this link points to.""" + def dump(self) -> models.DataConnectorToProjectLink: """Create a link model from the DataConnectorProjectLinkORM.""" + project_path = f"{self.project.slug.namespace.slug}/{self.project.slug.slug}" + return models.DataConnectorToProjectLink( id=self.id, data_connector_id=self.data_connector_id, project_id=self.project_id, + project_path=project_path, created_by=self.created_by_id, creation_date=self.creation_date, updated_at=self.updated_at, diff --git a/components/renku_data_services/project/api.spec.yaml b/components/renku_data_services/project/api.spec.yaml index 276f27668..5133cd021 100644 --- a/components/renku_data_services/project/api.spec.yaml +++ b/components/renku_data_services/project/api.spec.yaml @@ -992,32 +992,6 @@ components: type: string description: Entity Tag example: "9EE498F9D565D0C41E511377425F32F3" - DataConnectorToProjectLinksList: - description: A list of links from a data connector to a project - type: array - items: - $ref: "#/components/schemas/DataConnectorToProjectLink" - DataConnectorToProjectLink: - description: A link from a data connector to a project in Renku 2.0 - type: object - additionalProperties: false - properties: - id: - $ref: "#/components/schemas/Ulid" - data_connector_id: - $ref: "#/components/schemas/Ulid" - project_id: - $ref: "#/components/schemas/Ulid" - creation_date: - $ref: "#/components/schemas/CreationDate" - created_by: - $ref: "#/components/schemas/UserId" - required: - - id - - data_connector_id - - project_id - - creation_date - - created_by ProjectGetQuery: description: Query params for project get request allOf: diff --git a/components/renku_data_services/project/apispec.py b/components/renku_data_services/project/apispec.py index 6bb149cb3..226c8ba7e 100644 --- a/components/renku_data_services/project/apispec.py +++ b/components/renku_data_services/project/apispec.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: api.spec.yaml -# timestamp: 2025-05-06T08:33:41+00:00 +# timestamp: 2026-01-23T15:08:41+00:00 from __future__ import annotations @@ -27,44 +27,6 @@ class Role(Enum): owner = "owner" -class DataConnectorToProjectLink(BaseAPISpec): - model_config = ConfigDict( - extra="forbid", - ) - id: str = Field( - ..., - description="ULID identifier", - max_length=26, - min_length=26, - pattern="^[0-7][0-9A-HJKMNP-TV-Z]{25}$", - ) - data_connector_id: str = Field( - ..., - description="ULID identifier", - max_length=26, - min_length=26, - pattern="^[0-7][0-9A-HJKMNP-TV-Z]{25}$", - ) - project_id: str = Field( - ..., - description="ULID identifier", - max_length=26, - min_length=26, - pattern="^[0-7][0-9A-HJKMNP-TV-Z]{25}$", - ) - creation_date: datetime = Field( - ..., - description="The date and time the resource was created (in UTC and ISO-8601 format)", - examples=["2023-11-01T17:32:28Z"], - ) - created_by: str = Field( - ..., - description="Keycloak user ID", - examples=["f74a228b-1790-4276-af5f-25c2424e9b0c"], - pattern="^[A-Za-z0-9]{1}[A-Za-z0-9-]+$", - ) - - class ProjectMigrationInfo(BaseAPISpec): project_id: str = Field( ..., @@ -138,22 +100,22 @@ class ErrorResponse(BaseAPISpec): error: Error -class NamespacesNamespaceProjectsSlugGetParametersQuery(BaseAPISpec): +class ProjectsProjectIdGetParametersQuery(BaseAPISpec): with_documentation: Optional[bool] = Field( None, description="Projects with or without possibly extensive documentation?" ) -class ProjectsProjectIdCopiesGetParametersQuery(BaseAPISpec): - writable: bool = False - - -class ProjectsProjectIdGetParametersQuery(BaseAPISpec): +class NamespacesNamespaceProjectsSlugGetParametersQuery(BaseAPISpec): with_documentation: Optional[bool] = Field( None, description="Projects with or without possibly extensive documentation?" ) +class ProjectsProjectIdCopiesGetParametersQuery(BaseAPISpec): + writable: bool = False + + class MigrationSessionLauncherPost(BaseAPISpec): model_config = ConfigDict( extra="forbid", @@ -235,12 +197,6 @@ class ProjectMemberResponse(BaseAPISpec): role: Role -class DataConnectorToProjectLinksList(RootModel[List[DataConnectorToProjectLink]]): - root: List[DataConnectorToProjectLink] = Field( - ..., description="A list of links from a data connector to a project" - ) - - class ProjectGetQuery(PaginationRequest): namespace: str = Field("", description="A namespace, used as a filter.") direct_member: bool = Field( @@ -386,11 +342,6 @@ class ProjectsGetParametersQuery(BaseAPISpec): params: Optional[ProjectGetQuery] = None -class RenkuV1ProjectsMigrationsGetParametersQuery(BaseAPISpec): - """This class no longer includes any parameters.""" - pass - - class Project(BaseAPISpec): id: str = Field( ..., diff --git a/pyproject.toml b/pyproject.toml index 0f8f523d6..a3ed998b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -274,6 +274,10 @@ module = [ ] ignore_missing_imports = true +[[tool.mypy.overrides]] +module = ["test.*"] +disable_error_code = ["no-untyped-def"] + [tool.coverage.run] source = ["bases/", "components/"] omit = ["components/renku_data_services/notebooks"] diff --git a/test/bases/renku_data_services/data_api/conftest.py b/test/bases/renku_data_services/data_api/conftest.py index d7d41d137..e9ce400ff 100644 --- a/test/bases/renku_data_services/data_api/conftest.py +++ b/test/bases/renku_data_services/data_api/conftest.py @@ -1,7 +1,7 @@ import asyncio import contextlib import json -from collections.abc import AsyncGenerator, Callable +from collections.abc import AsyncGenerator, Callable, Coroutine from copy import deepcopy from datetime import timedelta from typing import Any, Protocol @@ -9,7 +9,6 @@ import pytest import pytest_asyncio from authzed.api.v1 import Relationship, RelationshipUpdate, SubjectReference, WriteRelationshipsRequest -from httpx import Response from sanic import Sanic from sanic_testing.testing import SanicASGITestClient from ulid import ULID @@ -21,7 +20,7 @@ from renku_data_services.base_models.core import APIUser, InternalServiceAdmin, NamespacePath, ServiceAdminId from renku_data_services.data_api.app import register_all_handlers from renku_data_services.data_api.dependencies import DependencyManager -from renku_data_services.data_connectors.apispec import DataConnector as ApiDataConnector +from renku_data_services.data_connectors.apispec import DataConnector, DataConnectorToProjectLink from renku_data_services.k8s.clients import K8sClusterClient from renku_data_services.k8s.config import from_kubeconfig_file, get_clusters from renku_data_services.k8s.constants import ClusterId @@ -622,15 +621,36 @@ async def create_data_connector_helper( return create_data_connector_helper +@pytest_asyncio.fixture +async def create_global_data_connector( + sanic_client: SanicASGITestClient, user_headers: dict[str, str] +) -> Callable[[str], Coroutine[Any, Any, DataConnector]]: + async def create_global_data_connector_helper(**storage: str) -> DataConnector: + payload = { + "storage": { + "configuration": {"type": "s3", "provider": "AWS", "region": "us-east-1"}, + "source_path": "", + "target_path": "", + }, + } + payload["storage"].update(storage) + _, response = await sanic_client.post("/api/data/data_connectors/global", headers=user_headers, json=payload) + assert response.status_code == 201, response.text + + return DataConnector.model_validate(response.json) + + return create_global_data_connector_helper + + class CreateDataConnectorCall(Protocol): - async def __call__(self, name: str, user: UserInfo | None = None, **payload) -> ApiDataConnector: ... + async def __call__(self, name: str, user: UserInfo | None = None, **payload) -> DataConnector: ... @pytest_asyncio.fixture async def create_data_connector_model( sanic_client: SanicASGITestClient, regular_user: UserInfo, admin_user: UserInfo ) -> CreateDataConnectorCall: - async def create_dc_helper(name: str, user: UserInfo | None = None, **payload) -> ApiDataConnector: + async def create_dc_helper(name: str, user: UserInfo | None = None, **payload) -> DataConnector: user = user or regular_user headers = __make_headers(user, admin=user.id == admin_user.id) dc_payload = { @@ -652,7 +672,7 @@ async def create_dc_helper(name: str, user: UserInfo | None = None, **payload) - _, response = await sanic_client.post("/api/data/data_connectors", headers=headers, json=dc_payload) assert response.status_code == 201, response.text - return ApiDataConnector.model_validate(response.json) + return DataConnector.model_validate(response.json) return create_dc_helper @@ -702,23 +722,28 @@ async def create_data_connector_and_link_project_helper( data_connector = await create_data_connector(name, user=user, headers=headers, **payload) data_connector_id = data_connector["id"] - response = await link_data_connector(project_id, data_connector_id, headers=headers) - data_connector_link = response.json + data_connector_link = await link_data_connector(project_id, data_connector_id, headers=headers) - return data_connector, data_connector_link + return data_connector, data_connector_link.model_dump() return create_data_connector_and_link_project_helper @pytest.fixture -def link_data_connector(sanic_client: SanicASGITestClient): - async def _link_data_connector(project_id: str, dc_id: str, headers: dict[str, str]) -> Response: +def link_data_connector( + sanic_client: SanicASGITestClient, user_headers: dict[str, str] +) -> Callable[[str, str, dict[str, str] | None], Coroutine[Any, Any, DataConnectorToProjectLink]]: + async def _link_data_connector( + project_id: str, data_connector_id: str, headers: dict[str, str] | None = None + ) -> DataConnectorToProjectLink: + headers = headers or user_headers payload = {"project_id": project_id} _, response = await sanic_client.post( - f"/api/data/data_connectors/{dc_id}/project_links", headers=headers, json=payload + f"/api/data/data_connectors/{data_connector_id}/project_links", headers=headers, json=payload ) assert response.status_code == 201, response.text - return response + + return DataConnectorToProjectLink.model_validate(response.json) return _link_data_connector diff --git a/test/bases/renku_data_services/data_api/test_data_connectors.py b/test/bases/renku_data_services/data_api/test_data_connectors.py index d9e93d7fb..27de2593f 100644 --- a/test/bases/renku_data_services/data_api/test_data_connectors.py +++ b/test/bases/renku_data_services/data_api/test_data_connectors.py @@ -1,9 +1,8 @@ import warnings from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING import pytest -from httpx import Response from sanic_testing.testing import SanicASGITestClient from renku_data_services.authz.models import Visibility @@ -21,25 +20,6 @@ from pytest import MonkeyPatch -async def create_data_connector( - sanic_client: SanicASGITestClient, headers: dict[str, Any], namespace: str, slug: str, private: bool -) -> Response: - storage_config = { - "configuration": {"type": "s3", "endpoint": "http://s3.aws.com"}, - "source_path": "giab", - "target_path": "giab", - } - payload = { - "name": slug, - "namespace": namespace, - "slug": slug, - "storage": storage_config, - "visibility": "private" if private else "public", - } - _, response = await sanic_client.post("/api/data/data_connectors", headers=headers, json=payload) - return cast(Response, response) - - @pytest.mark.asyncio async def test_post_data_connector( sanic_client: SanicASGITestClient, regular_user: UserInfo, user_headers, app_manager @@ -118,7 +98,7 @@ async def test_post_global_data_connector( description="""0.7.2 (2019-11-15)\nBug Fixes\n""", # noqa E501 keywords=[], ) - _mock_get_dataset_metadata(metadata=zenodo_metadata, sanic_client=sanic_client, monkeypatch=monkeypatch) + _mock_get_dataset_metadata(metadata=zenodo_metadata, monkeypatch=monkeypatch) payload = { "storage": { @@ -178,7 +158,7 @@ async def test_post_global_data_connector_dataverse( description="""

This dataset contains the metadata of the datasets published in 101 Dataverse installations, information about the metadata blocks of 106 installations, and the lists of pre-defined licenses or dataset terms that depositors can apply to datasets in the 88 installations that were running versions of the Dataverse software that include the "multiple-license" feature.\n\n

The data is useful for improving understandings about how certain Dataverse features and metadata fields are used and for learning about the quality of dataset and file-level metadata within and across Dataverse installations.\n\n

How the metadata was downloaded\n

The dataset metadata and metadata block JSON files were downloaded from each installation between August 25 and August 30, 2024 using a "get_dataverse_installations_metadata" function in a collection of Python functions at https://github.com/jggautier/dataverse-scripts/blob/main/dataverse_repository_curation_assistant/dataverse_repository_curation_assistant_functions.py.\n\n

In order to get the metadata from installations that require an installation account API token to use certain Dataverse software APIs, I created a CSV file with two columns: one column named "hostname" listing each installation URL for which I was able to create an account and another column named "apikey" listing my accounts\' API tokens. The Python script expects the CSV file and the listed API tokens to get metadata and other information from installations that require API tokens in order to use certain API endpoints.\n\n

How the files are organized\n\n

\n├── csv_files_with_metadata_from_most_known_dataverse_installations\n│\xa0\xa0 ├── author_2024.08.25-2024.08.30.csv\n│\xa0\xa0 ├── contributor_2024.08.25-2024.08.30.csv\n│\xa0\xa0 ├── data_source_2024.08.25-2024.08.30.csv\n│\xa0\xa0 ├── ...\n│\xa0\xa0 └── topic_classification_2024.08.25-2024.08.30.csv\n├── dataverse_json_metadata_from_each_known_dataverse_installation\n│\xa0\xa0 ├── Abacus_2024.08.26_15.52.42.zip\n│\xa0\xa0\xa0\xa0\xa0\xa0 ├── dataset_pids_Abacus_2024.08.26_15.52.42.csv\n│\xa0\xa0\xa0\xa0\xa0\xa0 ├── Dataverse_JSON_metadata_2024.08.26_15.52.42\n│\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0  ├── hdl_11272.1_AB2_0AQZNT_v1.0(latest_version).json\n│\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0  ├── ...\n│\xa0\xa0\xa0\xa0\xa0\xa0 ├── metadatablocks_v5.9\n│\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0  ├── astrophysics_v5.9.json\n│\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0  ├── biomedical_v5.9.json\n│\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0  ├── citation_v5.9.json\n│\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0  ├── ...\n│\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0  ├── socialscience_v5.6.json\n│\xa0\xa0 ├── ACSS_Dataverse_2024.08.26_00.02.51.zip\n│\xa0\xa0 ├── ...\n│\xa0\xa0 └── Yale_Dataverse_2024.08.25_03.52.57.zip\n└── dataverse_installations_summary_2024.08.30.csv\n└── dataset_pids_from_most_known_dataverse_installations_2024.08.csv\n└── license_options_for_each_dataverse_installation_2024.08.28_14.42.54.csv\n└── metadatablocks_from_most_known_dataverse_installations_2024.08.30.csv\n\n
\n\n

This dataset contains two directories and four CSV files not in a directory.\n

One directory, "csv_files_with_metadata_from_most_known_dataverse_installations", contains 20 CSV files that list the values of many of the metadata fields in the "Citation" metadata block and "Geospatial" metadata block of datasets in the 101 Dataverse installations. For example, author_2024.08.25-2024.08.30.csv contains the "Author" metadata for the latest versions of all published, non-deaccessioned datasets in 101 installations, with a column for each of the four child fields: author name, affiliation, identifier type, and identifier.\n

The other directory, "dataverse_json_metadata_from_each_known_dataverse_installation", contains 106 zip files, one zip file for each of the 106 Dataverse installations whose sites were functioning when I attempted to collect their metadata. Each zip file contains a directory with JSON files that have information about the installation\'s metadata fields, such as the field names and how they\'re organized. For installations that had published datasets, and I was able to use Dataverse APIs to download the dataset metadata, the zip file also contains:\n

\n

The dataverse_installations_summary_2024.08.30.csv file contains information about each installation, including its name, URL, Dataverse software version, and counts of dataset metadata included and not included in this dataset.\n

The dataset_pids_from_most_known_dataverse_installations_2024.08.csv file contains the dataset PIDs of published datasets in 101 Dataverse installations, with a column to indicate if the Python script was able to download the dataset\'s metadata. It\'s a union of all "dataset_pids_....csv" files in each of the 101 zip files in the dataverse_json_metadata_from_each_known_dataverse_installation directory.\n

The license_options_for_each_dataverse_installation_2024.08.28_14.42.54.csv file contains information about the licenses and data use agreements that some installations let depositors choose when creating datasets. When I collected this data, 88 of the available 106 installations were running versions of the Dataverse software that allow depositors to choose a "predefined license or data use agreement" from a dropdown menu in the dataset deposit form. For more information about this Dataverse feature, see https://guides.dataverse.org/en/5.14/user/dataset-management.html#choosing-a-license.\n

The metadatablocks_from_most_known_dataverse_installations_2024.08.30.csv file contains the metadata block names, field names, child field names (if the field is a compound field), display names, descriptions/tooltip text, and watermarks of fields in the 106 Dataverse installations\' metadata blocks. This file is useful for learning about the metadata fields and field structures used in each installation.\n\n

Known errors\n

The metadata of a few datasets from several known and functioning installations could not be downloaded.\n

In some cases, this is because of download timeouts caused by the datasets\' relatively large metadata exports, which contain information about the datasets\' large number of versions and files.\n

In other cases, datasets were publicly findable but in unpublished or deaccessioned states that prevented me from downloading their metadata export.\n\n

About metadata blocks\n

Read about the Dataverse software\'s metadata blocks system at http://guides.dataverse.org/en/6.3/admin/metadatacustomization.html""", # noqa E501 keywords=["dataset metadata", "dataverse", "metadata blocks"], ) - _mock_get_dataset_metadata(metadata=dataverse_metadata, sanic_client=sanic_client, monkeypatch=monkeypatch) + _mock_get_dataset_metadata(metadata=dataverse_metadata, monkeypatch=monkeypatch) doi = "10.7910/DVN/2SA6SN" payload = { @@ -259,7 +239,7 @@ async def test_post_global_data_connector_no_duplicates( description="""0.7.2 (2019-11-15)\nBug Fixes\n

""", # noqa E501 keywords=[], ) - _mock_get_dataset_metadata(metadata=zenodo_metadata, sanic_client=sanic_client, monkeypatch=monkeypatch) + _mock_get_dataset_metadata(metadata=zenodo_metadata, monkeypatch=monkeypatch) doi = "10.5281/zenodo.2600782" payload = { @@ -344,7 +324,7 @@ async def test_post_data_connector_with_azure_url( "visibility": "public", "namespace": regular_user.namespace.path.serialize(), "storage": { - "storage_url": "azure://mycontainer/myfolder", + "storage_url": "azure://my-container/myfolder", "target_path": "my/target", }, "keywords": ["keyword 1", "keyword.2", "keyword-3", "KEYWORD_4"], @@ -361,7 +341,7 @@ async def test_post_data_connector_with_azure_url( assert data_connector.get("storage") is not None storage = data_connector["storage"] assert storage.get("storage_type") == "azureblob" - assert storage.get("source_path") == "mycontainer/myfolder" + assert storage.get("source_path") == "my-container/myfolder" assert storage.get("target_path") == "my/target" assert storage.get("readonly") is True assert data_connector.get("created_by") == "user" @@ -583,6 +563,46 @@ async def test_get_one_by_slug_data_connector( assert data_connector.get("slug") == "a-new-data-connector" +@pytest.mark.asyncio +async def test_get_data_connector_by_doi(create_global_data_connector, monkeypatch, sanic_client, user_headers) -> None: + doi = "10.5281/zenodo.2600782" + metadata = RCloneDOIMetadata( + DOI=doi, + URL="https://doi.org/10.5281/zenodo.2600782", + metadataURL="https://zenodo.org/api/records/3542869", + provider="zenodo", + ) + _mock_get_doi_metadata(metadata=metadata, sanic_client=sanic_client, monkeypatch=monkeypatch) + zenodo_metadata = DOIMetadata( + name="SwissDataScienceCenter/renku-python: Version 0.7.2", + description="""0.7.2 (2019-11-15)\nBug Fixes\n""", # noqa E501 + keywords=[], + ) + _mock_get_dataset_metadata(metadata=zenodo_metadata, monkeypatch=monkeypatch) + + data_connector = await create_global_data_connector(configuration={"type": "doi", "doi": doi}) + + _, response = await sanic_client.get(f"/api/data/data_connectors/search?doi={doi}", headers=user_headers) + + assert response.status_code == 200, response.text + assert response.json is not None + assert response.json["id"] == data_connector.id + assert response.json["name"] == data_connector.name + assert response.json["slug"] == data_connector.slug + assert response.json["doi"] == doi + + +@pytest.mark.asyncio +async def test_get_data_connector_by_doi_fails_if_not_found(sanic_client, user_headers) -> None: + non_existing_doi = "10.5281/zenodo.9999999" + + _, response = await sanic_client.get( + f"/api/data/data_connectors/search?doi={non_existing_doi}", headers=user_headers + ) + + assert response.status_code == 404, response.text + + @pytest.mark.asyncio @pytest.mark.parametrize("headers_name", ["unauthorized_headers", "member_1_headers"]) async def test_get_one_data_connector_unauthorized( @@ -867,7 +887,7 @@ async def test_patch_data_connector_as_group_editor( headers = merge_headers(user_headers, {"If-Match": data_connector["etag"]}) patch = { - # Test that we do require DELETE permission when sending the current namepace + # Test that we do require DELETE permission when sending the current namespace "namespace": data_connector["namespace"], # Test that we do require DELETE permission when sending the current visibility "visibility": data_connector["visibility"], @@ -955,7 +975,7 @@ async def test_patch_global_data_connector( description="""0.7.2 (2019-11-15)\nBug Fixes\n""", # noqa E501 keywords=[], ) - _mock_get_dataset_metadata(metadata=zenodo_metadata, sanic_client=sanic_client, monkeypatch=monkeypatch) + _mock_get_dataset_metadata(metadata=zenodo_metadata, monkeypatch=monkeypatch) doi = "10.5281/zenodo.2600782" payload = { @@ -1037,7 +1057,7 @@ async def test_delete_global_data_connector( description="""0.7.2 (2019-11-15)\nBug Fixes\n""", # noqa E501 keywords=[], ) - _mock_get_dataset_metadata(metadata=zenodo_metadata, sanic_client=sanic_client, monkeypatch=monkeypatch) + _mock_get_dataset_metadata(metadata=zenodo_metadata, monkeypatch=monkeypatch) doi = "10.5281/zenodo.2600782" payload = { @@ -1088,7 +1108,31 @@ async def test_get_data_connector_project_links_empty( @pytest.mark.asyncio -async def test_post_data_connector_project_link( +async def test_get_data_connector_project_link_pagination( + create_data_connector, create_project, link_data_connector, sanic_client, user_headers +) -> None: + data_connector = await create_data_connector("Data Connector") + for i in range(1, 10): + project = await create_project(sanic_client, f"project-{i}") + await link_data_connector(project["id"], data_connector["id"]) + + parameters = {"page": 2, "per_page": 3} + _, response = await sanic_client.get( + f"/api/data/data_connectors/{data_connector["id"]}/project_links", headers=user_headers, params=parameters + ) + + assert response.status_code == 200, response.text + assert response.json is not None + projects = response.json + assert {p["project_path"] for p in projects} == {f"user.doe/{p}" for p in ("project-4", "project-5", "project-6")} + assert response.headers["page"] == "2" + assert response.headers["per-page"] == "3" + assert response.headers["total"] == "9" + assert response.headers["total-pages"] == "3" + + +@pytest.mark.asyncio +async def test_post_data_connector_project_links( sanic_client: SanicASGITestClient, create_data_connector, create_project, user_headers ) -> None: data_connector = await create_data_connector("Data connector 1") @@ -2293,6 +2337,7 @@ async def test_move_data_connector( origin: DataConnectorTestCase, destination: DataConnectorTestCase, dc_visibility: Visibility, + create_data_connector, ) -> None: # Create origin namespace linked_project_id: str | None = None @@ -2346,13 +2391,12 @@ async def test_move_data_connector( destination_id = response.json["id"] # Create the data connector - response = await create_data_connector( - sanic_client, member_1_headers, origin_path.serialize(), "dc1", private=dc_visibility == Visibility.PRIVATE + data_connector = await create_data_connector( + "dc1", headers=member_1_headers, namespace=origin_path.serialize(), visibility=dc_visibility.value ) - assert response.status_code == 201, response.text - assert response.json["namespace"] == origin_path.serialize() - dc_id = response.json["id"] - dc_etag = response.json["etag"] + assert data_connector["namespace"] == origin_path.serialize() + dc_id = data_connector["id"] + dc_etag = data_connector["etag"] # Create a project to link the DC to if the origin is not project if not isinstance(origin_path, ProjectPath): @@ -2381,7 +2425,7 @@ async def test_move_data_connector( assert response.json["namespace"] == destination_path.serialize() assert response.json["visibility"] == dc_visibility.value - # Check the data connector link remains unchaged after moving + # Check the data connector link remains unchanged after moving _, response = await sanic_client.get( f"/api/data/projects/{linked_project_id}/data_connector_links", headers=headers ) @@ -2449,7 +2493,7 @@ async def _mock_get_doi_metadata(*args, **kwargs) -> RCloneDOIMetadata: monkeypatch.setattr(validator, "get_doi_metadata", _mock_get_doi_metadata) -def _mock_get_dataset_metadata(metadata: DOIMetadata, sanic_client: SanicASGITestClient, monkeypatch: "MonkeyPatch"): +def _mock_get_dataset_metadata(metadata: DOIMetadata, monkeypatch: "MonkeyPatch"): """Mock the _get_dataset_metadata_invenio method.""" # The Zenodo API may be unresponsive, so we mock its response @@ -2480,10 +2524,10 @@ async def _mock(*args, **kwargs) -> DOIMetadata | None: ) -def _mock_get_envidat_metadata(metadata: DOIMetadata, sanic_client: SanicASGITestClient, monkeypatch: "MonkeyPatch"): +def _mock_get_envidat_metadata(metadata: DOIMetadata, monkeypatch: "MonkeyPatch"): """Mock the _get_envidat_metadata method.""" - # The Evnidat API may be unresponsive, so we mock its response + # The Envidat API may be unresponsive, so we mock its response from renku_data_services.data_connectors.doi import metadata as metadata_mod _orig_get_envidat_metadata = metadata_mod._get_envidat_metadata @@ -2513,7 +2557,7 @@ async def test_validate_envidat_data_connector( sanic_client: SanicASGITestClient, monkeypatch: "MonkeyPatch", ) -> None: - _mock_get_envidat_metadata(envidat_metadata, sanic_client, monkeypatch) + _mock_get_envidat_metadata(envidat_metadata, monkeypatch) body = GlobalDataConnectorPost( storage=CloudStorageCorePost( storage_type="doi", @@ -2549,7 +2593,7 @@ async def test_add_envidat_data_connector( envidat_metadata: DOIMetadata, monkeypatch: "MonkeyPatch", ) -> None: - _mock_get_envidat_metadata(envidat_metadata, sanic_client, monkeypatch) + _mock_get_envidat_metadata(envidat_metadata, monkeypatch) payload = { "storage": { "configuration": {"type": "doi", "doi": "10.16904/12"}, diff --git a/test/bases/renku_data_services/data_api/test_projects.py b/test/bases/renku_data_services/data_api/test_projects.py index f54703261..bb3a840eb 100644 --- a/test/bases/renku_data_services/data_api/test_projects.py +++ b/test/bases/renku_data_services/data_api/test_projects.py @@ -1468,10 +1468,8 @@ async def test_project_copy_includes_public_data_connector_links_owned_by_others dc2 = await create_data_connector("dc2", member_1_user, member_1_headers, visibility="public") assert "id" in dc1 assert "id" in dc2 - link1_res = await link_data_connector(project_id, dc1["id"], user_headers) - link2_res = await link_data_connector(project_id, dc2["id"], user_headers) - link1 = link1_res.json - link2 = link2_res.json + link1 = await link_data_connector(project_id, dc1["id"]) + link2 = await link_data_connector(project_id, dc2["id"]) copy_project = await create_project_copy( sanic_client, @@ -1489,7 +1487,7 @@ async def test_project_copy_includes_public_data_connector_links_owned_by_others assert {d["data_connector_id"] for d in data_connector_links} == {dc1["id"], dc2["id"]} assert data_connector_links[0]["project_id"] == data_connector_links[1]["project_id"] == project_copy_id # NOTE: Check that new data connector links are created - assert {d["id"] for d in data_connector_links} != {link1["id"], link2["id"]} + assert {d["id"] for d in data_connector_links} != {link1.id, link2.id} @pytest.mark.asyncio