Skip to content

454 migrate resources from old cms to aiod platform #469

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 20 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions connectors/ai4europe_cms/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Connector image for the AI4Europe CMS: the metadata-catalogue image plus cron
# and the shell scripts that cron runs to synchronise organisations, events and news.
# NOTE(review): the base image is untagged; presumably it is built locally by
# docker compose — confirm, and pin a tag if it is ever pulled from a registry.
FROM aiod_metadata_catalogue

# Installing packages and registering the crontab both require root, and the
# cron daemon started by entry.sh has to run as root as well.
USER root

# Refresh the package index in the same layer as the install so the build does
# not depend on (possibly missing or stale) lists in the base image, and drop
# the lists afterwards to keep the layer small. apt-get is used because the
# `apt` front end has no stable CLI for scripts.
RUN apt-get update \
    && apt-get install -y --no-install-recommends cron \
    && rm -rf /var/lib/apt/lists/*

# Scripts and schedule are copied after the package install so that editing
# them does not invalidate the cached cron layer.
COPY organisations.sh /opt/connectors/script/organisations.sh
COPY events.sh /opt/connectors/script/events.sh
COPY news.sh /opt/connectors/script/news.sh
COPY entry.sh /opt/connectors/script/entry.sh
COPY cron /etc/cron.d/aiod

# Make the scripts executable and install the schedule as root's crontab
# in a single layer.
RUN chmod +x \
        /opt/connectors/script/entry.sh \
        /opt/connectors/script/events.sh \
        /opt/connectors/script/news.sh \
        /opt/connectors/script/organisations.sh \
    && crontab /etc/cron.d/aiod

WORKDIR /app
3 changes: 3 additions & 0 deletions connectors/ai4europe_cms/cron
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Run every AI4Europe CMS connector once per hour, at minute 40.
# The log directories must match the ones created by entry.sh
# (organisation, event, news): if the redirect target directory does not
# exist, the shell cannot open the log file and the job is never started.
40 * * * * bash /opt/connectors/script/organisations.sh >> /opt/connectors/data/ai4europe_cms/organisation/cron.log 2>&1
40 * * * * bash /opt/connectors/script/events.sh >> /opt/connectors/data/ai4europe_cms/event/cron.log 2>&1
40 * * * * bash /opt/connectors/script/news.sh >> /opt/connectors/data/ai4europe_cms/news/cron.log 2>&1
7 changes: 7 additions & 0 deletions connectors/ai4europe_cms/entry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Container entry point: prepare the per-resource working directories and
# then hand the process over to cron, which runs the connector scripts.

# The connector scripts and the crontab redirect their logs into these
# directories, so they must exist before the first job fires.
mkdir -p \
    /opt/connectors/data/ai4europe_cms/organisation \
    /opt/connectors/data/ai4europe_cms/event \
    /opt/connectors/data/ai4europe_cms/news

# NOTE(review): cron starts jobs with a minimal environment, so variables
# passed to the container (e.g. KEYCLOAK_CLIENT_SECRET) are presumably not
# visible to the connector scripts — confirm whether they need them.

# Run cron in the foreground (-f). `exec` replaces this shell so that cron
# receives the container's stop signal directly instead of being left behind
# as a child of a shell that does not forward signals.
exec /usr/sbin/cron -f -l 4
19 changes: 19 additions & 0 deletions connectors/ai4europe_cms/events.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Runs one synchronization pass of the AI4Europe CMS event connector.
# Progress messages go to stdout (redirected by cron into cron.log); the
# output of the Python connector itself is appended to connector.log in
# the working directory.

WORK_DIR=/opt/connectors/data/ai4europe_cms/event
CONNECTOR=connectors.ai4europe_cms.ai4europe_cms_event_connector.AI4EuropeCmsEventConnector

# Report that a previous run still holds the lock and abort this run.
another_instance()
{
    echo $(date -u) "This script is already running in a different thread."
    exit 1
}

# Use the script file itself as the lock: open it on fd 9 and try to take
# an exclusive, non-blocking lock so overlapping cron runs are skipped.
exec 9< "$0"
flock -n -x 9 || another_instance

echo $(date -u) "Starting synchronization..."
PYTHONPATH=/app /usr/local/bin/python3 /app/connectors/synchronization.py \
    -c "$CONNECTOR" \
    -w "$WORK_DIR" \
    --save-every 100 >> "${WORK_DIR}/connector.log" 2>&1
status=$?

# Do not claim success when the connector failed; propagate its exit code.
if [ "$status" -ne 0 ]; then
    echo $(date -u) "Synchronization failed with exit code ${status}."
    exit "$status"
fi
echo $(date -u) "Synchronization Done."
19 changes: 19 additions & 0 deletions connectors/ai4europe_cms/news.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Runs one synchronization pass of the AI4Europe CMS news connector.
# Progress messages go to stdout (redirected by cron into cron.log); the
# output of the Python connector itself is appended to connector.log in
# the working directory.

WORK_DIR=/opt/connectors/data/ai4europe_cms/news
CONNECTOR=connectors.ai4europe_cms.ai4europe_cms_news_connector.AI4EuropeCmsNewsConnector

# Report that a previous run still holds the lock and abort this run.
another_instance()
{
    echo $(date -u) "This script is already running in a different thread."
    exit 1
}

# Use the script file itself as the lock: open it on fd 9 and try to take
# an exclusive, non-blocking lock so overlapping cron runs are skipped.
exec 9< "$0"
flock -n -x 9 || another_instance

echo $(date -u) "Starting synchronization..."
PYTHONPATH=/app /usr/local/bin/python3 /app/connectors/synchronization.py \
    -c "$CONNECTOR" \
    -w "$WORK_DIR" \
    --save-every 100 >> "${WORK_DIR}/connector.log" 2>&1
status=$?

# Do not claim success when the connector failed; propagate its exit code.
if [ "$status" -ne 0 ]; then
    echo $(date -u) "Synchronization failed with exit code ${status}."
    exit "$status"
fi
echo $(date -u) "Synchronization Done."
19 changes: 19 additions & 0 deletions connectors/ai4europe_cms/organisations.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Runs one synchronization pass of the AI4Europe CMS organisation connector.
# Progress messages go to stdout (redirected by cron into cron.log); the
# output of the Python connector itself is appended to connector.log in
# the working directory.

WORK_DIR=/opt/connectors/data/ai4europe_cms/organisation
CONNECTOR=connectors.ai4europe_cms.ai4europe_cms_organisation_connector.AI4EuropeCmsOrganisationConnector

# Report that a previous run still holds the lock and abort this run.
another_instance()
{
    echo $(date -u) "This script is already running in a different thread."
    exit 1
}

# Use the script file itself as the lock: open it on fd 9 and try to take
# an exclusive, non-blocking lock so overlapping cron runs are skipped.
exec 9< "$0"
flock -n -x 9 || another_instance

echo $(date -u) "Starting synchronization..."
PYTHONPATH=/app /usr/local/bin/python3 /app/connectors/synchronization.py \
    -c "$CONNECTOR" \
    -w "$WORK_DIR" \
    --save-every 100 >> "${WORK_DIR}/connector.log" 2>&1
status=$?

# Do not claim success when the connector failed; propagate its exit code.
if [ "$status" -ne 0 ]; then
    echo $(date -u) "Synchronization failed with exit code ${status}."
    exit "$status"
fi
echo $(date -u) "Synchronization Done."
18 changes: 18 additions & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,24 @@ services:
app:
condition: service_healthy

# Connector service for the AI4Europe CMS; it is only part of the stack when
# the "ai4europe_cms" profile is enabled.
ai4europe_cms-connector:
profiles: ["ai4europe_cms"]
# The image is built from the connector's own directory and Dockerfile.
build:
context: connectors/ai4europe_cms
dockerfile: Dockerfile
image: aiod_ai4europe_cms_connector
container_name: ai4europe_cms-connector
environment:
- KEYCLOAK_CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET
volumes:
# Host directory mounted at the path under which the connector scripts keep
# their working directories and logs.
- ${DATA_PATH}/connectors:/opt/connectors/data
# Configuration override, mounted read-only.
- ./src/config.override.toml:/app/config.override.toml:ro
# entry.sh creates the data directories and then runs cron in the foreground.
command: >
/bin/bash -c "/opt/connectors/script/entry.sh"
# Start only after the app service reports healthy.
depends_on:
app:
condition: service_healthy

aibuilder-connector:
profiles: ["aibuilder"]
build:
Expand Down
Empty file.
95 changes: 95 additions & 0 deletions src/connectors/ai4europe_cms/ai4europe_cms_event_connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import requests

from requests.exceptions import HTTPError
from typing import Iterator

from connectors.abstract.resource_connector import ResourceConnector
from connectors.record_error import RecordError
from database.model.platform.platform_names import PlatformName
from database.model.resource_read_and_create import resource_create
from database.model.event.event import Event
from connectors.resource_with_relations import ResourceWithRelations
from database.model.ai_resource.text import Text


class AI4EuropeCmsEventConnector(ResourceConnector[Event]):
@property
def resource_class(self) -> type[Event]:
return Event

@property
def platform_name(self) -> PlatformName:
return PlatformName.ai4europe_cms

def run(self, state: dict, **kwargs) -> Iterator[ResourceWithRelations[Event] | RecordError]:
"""Fetch resources and update the state"""

url_data = "https://community-dev-api.aiod.eu/api/events/"

headers = {"AuthorizationToken": "1234567890"}

response = requests.get(url_data, headers=headers, timeout=600)

if not response.ok:
status_code = response.status_code
msg = response.json()["error"]["message"]
err_msg = f"Error while fetching {url_data} from AI4Europe CMS: ({status_code}) {msg}"
err = HTTPError(err_msg)
yield RecordError(identifier=None, error=err)
return

try:
events = response.json()
except Exception as e:
yield RecordError(identifier=None, error=e)
return

for event in events:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like you are fetching -all- events from ai4eu every hour. This is very wasteful: it imposes unneeded strain on both servers and databases, and it is more prone to connection errors. If you do not fix this from the ai4eu side, please consider at least updating the connector. For example, you could make sure the date_published exceeds that of the last added entry. You can use the state dictionary that's passed as an argument to this function for this. The dictionary is saved to a file every x items, and automatically loaded and passed through on the next invocation of the connector.

E.g., adding if state.get("last_entry_published", "2000-01-01T00:00:00") > event["date_published"]: continue at the start of the loop body and setting state["last_entry_published"] = event["date_published"] after the yield statement should work, and would at least avoid trying to insert many events which already exist in the database.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the "cleaner" way to resolve this is by adding the ai4eu identifier to the response of the API. This way we could utilize the ResourceConnectorById class, since we know that every new asset (organisation / news / event) will get an identifier that is higher than the current highest identifier.
@AlexJoom do you have any thoughts on this one? Can we add the identifier into the ai4eu API response?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@antonis96 the ai4europe api response has a platform_resource_identifier property.

For example page https://ai4europe.eu/node/2420
has platform_resource_identifier: "node-2420"

Is this sufficient?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AlexJoom Yes, this is totally fine. I will proceed with the necessary changes.

pydantic_class = resource_create(Event)
desc = event.get("description") or {}

yield ResourceWithRelations[Event](
resource=pydantic_class(
platform_resource_identifier=(
event["platform_resource_identifier"]
if event.get("platform_resource_identifier") is not None
else None
),
platform=event["platform"] if event.get("platform") is not None else None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
platform=event["platform"] if event.get("platform") is not None else None,
platform=event.get("platform")

Safely retrieving the value for a key and otherwise returning None is exactly what dict.get does, so you can replace this pattern with just a call to dict.get.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Platform and Platform Identifier are specifically for connectors and are limited to exactly those platforms we have connectors for. In this case I think the platform should simply be PlatformName.ai4europe_cms with the identifier of the asset in the ai4europe_cms catalogue.

name=event["name"] if event.get("name") is not None else None,
date_published=event["date_published"]
if event.get("date_published") is not None
else None,
start_date=event["start_date"] if event.get("start_date") is not None else None,
end_date=event["end_date"] if event.get("end_date") is not None else None,
registration_link=event["registration_link"]
if event.get("registration_link") is not None
and len(event.get("registration_link")) <= 256
else None,
mode=event["mode"] if event.get("mode") is not None else None,
scientific_domain=[sd for sd in event.get("scientific_domain")]
if event.get("scientific_domain") is not None
else [],
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In these cases you may also need to use the second parameter which specifies the value to return if the key is not present in the dictionary, e.g.:

event.get("scientific_domain", [])
event["scientific_domain"] if event.get("scientific_domain") is not None else []

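are equivalent, as long as the key is never present with an explicit None value (in that case dict.get returns None rather than the default).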

industrial_sector=[ins for ins in event.get("industrial_sector")]
if event.get("industrial_sector") is not None
else [],
relevant_link=[rl for rl in event.get("relevant_link")]
if event.get("relevant_link") is not None
else [],
alternate_name=[an for an in event.get("alternate_name")]
if event.get("alternate_name") is not None
else [],
application_area=[ar for ar in event.get("application_area")]
if event.get("application_area") is not None
else [],
keyword=[k for k in event.get("keyword")]
if event.get("keyword") is not None
else [],
same_as=event["same_as"] if event.get("same_as") is not None else None,
description=Text(
plain=desc.get("plain") or "",
html=desc.get("html") or "",
),
),
resource_ORM_class=Event,
)
88 changes: 88 additions & 0 deletions src/connectors/ai4europe_cms/ai4europe_cms_news_connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import requests

from requests.exceptions import HTTPError
from typing import Iterator

from connectors.abstract.resource_connector import ResourceConnector
from connectors.record_error import RecordError
from database.model.platform.platform_names import PlatformName
from database.model.resource_read_and_create import resource_create
from database.model.news.news import News
from connectors.resource_with_relations import ResourceWithRelations


class AI4EuropeCmsNewsConnector(ResourceConnector[News]):
    """Connector that imports news items from the AI4Europe CMS API.

    Each invocation of ``run`` downloads the complete news listing and yields
    one ``ResourceWithRelations[News]`` per item. Problems (network failure,
    HTTP error status, malformed payload, an item that cannot be converted)
    are reported by yielding ``RecordError`` objects rather than by raising.
    """

    @property
    def resource_class(self) -> type[News]:
        """The ORM class of the resources this connector produces."""
        return News

    @property
    def platform_name(self) -> PlatformName:
        """The platform this connector fetches from."""
        return PlatformName.ai4europe_cms

    @staticmethod
    def _as_list(value) -> list:
        """Return a list copy of ``value``, or an empty list when it is None."""
        return [] if value is None else list(value)

    @staticmethod
    def _error_message(response) -> str:
        """Extract the error message of a failed response.

        Falls back to the HTTP reason phrase when the body is not JSON or
        does not have the ``{"error": {"message": ...}}`` shape.
        """
        try:
            return response.json()["error"]["message"]
        except (ValueError, KeyError, TypeError):
            return response.reason or "unknown error"

    def run(self, state: dict, **kwargs) -> Iterator[ResourceWithRelations[News] | RecordError]:
        """Fetch all news items from the CMS and yield them as resources.

        Args:
            state: connector state passed in by the caller; it is currently
                neither read nor modified here.
            **kwargs: accepted for interface compatibility; unused.

        Yields:
            A ``ResourceWithRelations[News]`` for every item that could be
            converted, or a ``RecordError`` describing what went wrong.
        """
        # NOTE(review): endpoint and token are hard-coded and look like
        # development placeholders — confirm and move them to configuration.
        url_data = "https://community-dev-api.aiod.eu/api/news/"
        headers = {"AuthorizationToken": "1234567890"}

        # Connection problems and timeouts are reported like any other
        # fetch failure instead of escaping from the generator.
        try:
            response = requests.get(url_data, headers=headers, timeout=600)
        except requests.RequestException as e:
            yield RecordError(identifier=None, error=e)
            return

        if not response.ok:
            status_code = response.status_code
            msg = self._error_message(response)
            err_msg = f"Error while fetching {url_data} from AI4Europe CMS: ({status_code}) {msg}"
            yield RecordError(identifier=None, error=HTTPError(err_msg))
            return

        try:
            news = response.json()
        except Exception as e:
            yield RecordError(identifier=None, error=e)
            return

        # The loop below needs a list of objects; anything else (for example
        # an error object returned with status 200) cannot be processed.
        if not isinstance(news, list):
            err = ValueError(f"Unexpected response from {url_data}: expected a list of news items")
            yield RecordError(identifier=None, error=err)
            return

        # The create-model does not depend on the item, so build it once.
        pydantic_class = resource_create(News)

        for n in news:
            try:
                resource = pydantic_class(
                    platform_resource_identifier=n.get("platform_resource_identifier"),
                    platform=n.get("platform"),
                    name=n.get("name"),
                    date_published=n.get("date_published"),
                    headline=n.get("headline"),
                    alternative_headline=n.get("alternative_headline"),
                    category=self._as_list(n.get("category")),
                    source=n.get("source"),
                    scientific_domain=self._as_list(n.get("scientific_domain")),
                    industrial_sector=self._as_list(n.get("industrial_sector")),
                    relevant_link=self._as_list(n.get("relevant_link")),
                    alternate_name=self._as_list(n.get("alternate_name")),
                    application_area=self._as_list(n.get("application_area")),
                    keyword=self._as_list(n.get("keyword")),
                    same_as=n.get("same_as"),
                )
            except Exception as e:
                # One malformed item must not abort the remaining items.
                identifier = n.get("platform_resource_identifier") if isinstance(n, dict) else None
                yield RecordError(identifier=identifier, error=e)
                continue

            yield ResourceWithRelations[News](
                resource=resource,
                resource_ORM_class=News,
            )
Loading