Skip to content

Commit 9c5f6e4

Browse files
committed
feat(fao-open-knowledge): implement FAO Open Knowledge data models and URL collector
1 parent 795610a commit 9c5f6e4

File tree

3 files changed

+207
-0
lines changed

3 files changed

+207
-0
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import logging
2+
from typing import List
3+
4+
import requests # type: ignore
5+
from requests.adapters import HTTPAdapter # type: ignore
6+
from welearn_database.data.models import Corpus, WeLearnDocument
7+
8+
from welearn_datastack import constants
9+
from welearn_datastack.data.source_models.fao_open_knowledge import FaoOKModel
10+
from welearn_datastack.data.url_collector import URLCollector
11+
from welearn_datastack.utils_.http_client_utils import get_new_https_session
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
class FAOOpenKnowledgeURLCollector(URLCollector):
    """URL collector for the FAO Open Knowledge repository.

    Requests the first page of the ``dateavailable`` browse index from the
    repository's REST API and turns every returned item into a
    :class:`WeLearnDocument` attached to the configured corpus.
    """

    related_corpus = "fao-open-knowledge"

    def __init__(self, corpus: Corpus, timeout: float = 30.0):
        """Store the target corpus and the endpoints used by the collector.

        Args:
            corpus: Corpus every collected document is attached to.
            timeout: Seconds to wait for the API to answer before the
                request is aborted.
        """
        self.corpus = corpus

        self.api_base_url = "https://openknowledge.fao.org/server/api/"
        self.application_base_url = "https://openknowledge.fao.org/"
        self.headers = constants.HEADERS
        self.timeout = timeout

    def _extract_fao_ok_urls(
        self, fao_ok_api_response: FaoOKModel
    ) -> List[WeLearnDocument]:
        """Build one document per item found in the API response.

        Args:
            fao_ok_api_response: Validated payload of the browse endpoint.

        Returns:
            One :class:`WeLearnDocument` per embedded item, whose URL is the
            public ``items/<uuid>`` page and whose external id is the uuid.
        """
        return [
            WeLearnDocument(
                url=self.application_base_url + f"items/{item.uuid}",
                external_id=item.uuid,
                corpus=self.corpus,
            )
            for item in fao_ok_api_response.embedded.items
        ]

    def collect(self) -> List[WeLearnDocument]:
        """Fetch the browse index and return the documents it lists.

        Only page 0 (50 items, sorted ``default,DESC``) is requested.

        Returns:
            The documents built from the items of that page.

        Raises:
            requests.HTTPError: If the API answers with an error status
                (raised by ``raise_for_status``).
        """
        discover_url = f"{self.api_base_url}discover/browses/dateavailable/items"
        params = {
            "sort": "default,DESC",
            "page": 0,
            "size": 50,
        }

        session = get_new_https_session()
        try:
            # Without an explicit timeout the call can block forever on a
            # stalled connection.
            fao_ok_resp = session.get(
                url=discover_url,
                headers=self.headers,
                params=params,
                timeout=self.timeout,
            )
            fao_ok_resp.raise_for_status()
            payload = fao_ok_resp.json()
        finally:
            # Release pooled connections whether or not the request worked.
            session.close()

        fao_ok_response = FaoOKModel.model_validate(payload)
        return self._extract_fao_ok_urls(fao_ok_response)
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
from typing import Any
2+
3+
from pydantic import BaseModel, Field
4+
5+
6+
class Bundles(BaseModel):
    """Link object used for the ``bundles`` entry of an item's ``_links``."""

    # Target URL of the link.
    href: str
8+
9+
10+
class MappedCollections(BaseModel):
    """Link object used for the ``mappedCollections`` entry of an item's ``_links``."""

    # Target URL of the link.
    href: str
12+
13+
14+
class OwningCollection(BaseModel):
    """Link object used for the ``owningCollection`` entry of an item's ``_links``."""

    # Target URL of the link.
    href: str
16+
17+
18+
class Relationships(BaseModel):
    """Link object used for the ``relationships`` entry of an item's ``_links``."""

    # Target URL of the link.
    href: str
20+
21+
22+
class Version(BaseModel):
    """Link object used for the ``version`` entry of an item's ``_links``."""

    # Target URL of the link.
    href: str
24+
25+
26+
class TemplateItemOf(BaseModel):
    """Link object used for the ``templateItemOf`` entry of an item's ``_links``."""

    # Target URL of the link.
    href: str
28+
29+
30+
class Thumbnail(BaseModel):
    """Link object used for the ``thumbnail`` entry of an item's ``_links``."""

    # Target URL of the link.
    href: str
32+
33+
34+
class Relateditemlistconfigs(BaseModel):
    """Link object used for the ``relateditemlistconfigs`` entry of an item's ``_links``."""

    # Target URL of the link.
    href: str
36+
37+
38+
class Self(BaseModel):
    """Link object used for the ``self`` entry of an item's ``_links``."""

    # Target URL of the link.
    href: str
40+
41+
42+
class _Links(BaseModel):
    """Set of links attached to a single item.

    Every entry is required and wraps a single ``href`` string.
    """

    # Field names mirror the keys of the API payload, hence the camelCase.
    bundles: Bundles
    mappedCollections: MappedCollections
    owningCollection: OwningCollection
    relationships: Relationships
    version: Version
    templateItemOf: TemplateItemOf
    thumbnail: Thumbnail
    relateditemlistconfigs: Relateditemlistconfigs
    self: Self
52+
53+
54+
class Item(BaseModel):
    """One repository item listed in the ``items`` array of the API response.

    Field names mirror the keys of the API payload, hence the camelCase.
    """

    id: str
    uuid: str
    name: str
    handle: str
    metadata: dict[str, Any]
    inArchive: bool
    discoverable: bool
    withdrawn: bool
    lastModified: str
    entityType: Any
    type: str
    # Pydantic treats underscore-prefixed names as private attributes and
    # never fills them from input data, so the payload's ``_links`` key is
    # exposed under a public name through an alias (same approach as
    # ``FaoOKModel.links``). Optional so that items which validated before
    # this field was actually parsed keep validating.
    links: _Links | None = Field(default=None, alias="_links")
67+
68+
69+
class Embedded(BaseModel):
    """Content of the ``_embedded`` section of the API response."""

    # Items returned for the requested page.
    items: list[Item]
71+
72+
73+
class Next(BaseModel):
    """Link object used for the ``next`` entry of the response-level ``_links``."""

    # Target URL of the link.
    href: str
75+
76+
77+
class Last(BaseModel):
    """Link object used for the ``last`` entry of the response-level ``_links``."""

    # Target URL of the link.
    href: str
79+
80+
81+
class Self1(BaseModel):
    """Link object used for the ``self`` entry of the response-level ``_links``."""

    # Target URL of the link.
    href: str
83+
84+
85+
class Links1(BaseModel):
    """Response-level navigation links (the top-level ``_links`` object)."""

    # NOTE(review): paginated responses are expected to omit ``next`` (and
    # possibly ``last``) when the result set fits in a single page or the
    # last page is reached -- confirm against the API. Keeping them optional
    # avoids rejecting such an otherwise valid response.
    next: Next | None = None
    last: Last | None = None
    self: Self1
89+
90+
91+
class Page(BaseModel):
    """Pagination counters found in the ``page`` section of the API response.

    Field names mirror the keys of the API payload, hence the camelCase.
    """

    number: int
    size: int
    totalPages: int
    totalElements: int
96+
97+
98+
class FaoOKModel(BaseModel):
    """Top-level shape of the response validated by the URL collector.

    The underscore-prefixed payload keys are mapped to public field names
    through aliases.
    """

    # Payload key ``_embedded``: the items of the page.
    embedded: Embedded = Field(..., alias="_embedded")
    # Payload key ``_links``: the navigation links.
    links: Links1 = Field(..., alias="_links")
    # Payload key ``page``: the pagination counters.
    page: Page
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import logging
2+
import os
3+
4+
from welearn_database.data.models import Corpus
5+
6+
from welearn_datastack.collectors.fao_open_knowledge_collector import (
7+
FAOOpenKnowledgeURLCollector,
8+
)
9+
from welearn_datastack.nodes_workflow.URLCollectors.nodes_helpers.collect import (
10+
insert_urls,
11+
)
12+
from welearn_datastack.utils_.database_utils import create_db_session
13+
14+
# Logging is configured from the environment:
#   LOG_LEVEL  -- name of a logging level (defaults to "INFO")
#   LOG_FORMAT -- format string handed to logging.basicConfig
_log_level_name: str = os.getenv("LOG_LEVEL", "INFO")
# getLevelName() returns the numeric level for a known level name and a
# "Level <name>" string otherwise. Level names are upper-case, so the value
# is normalised first to accept e.g. "info" as well.
log_level: int = logging.getLevelName(_log_level_name.upper())
log_format: str = os.getenv(
    "LOG_FORMAT", "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
)

if not isinstance(log_level, int):
    # Report the value that was actually configured, not the placeholder
    # string produced by getLevelName().
    raise ValueError(f"Log level is not recognized : '{_log_level_name}'")

# The numeric level is passed as-is; converting it back to a name is not needed.
logging.basicConfig(
    level=log_level,
    format=log_format,
)
logger = logging.getLogger(__name__)
27+
28+
29+
if __name__ == "__main__":
    # Script entry point: look up the corpus, collect its URLs and hand
    # them to insert_urls.
    logger.info("FAOOpenKnowledgeURLCollector collector starting...")
    db_session = create_db_session()

    corpus_name = FAOOpenKnowledgeURLCollector.related_corpus
    corpus_query = db_session.query(Corpus).filter_by(source_name=corpus_name)
    fao_corpus: Corpus | None = corpus_query.one_or_none()

    # Nothing can be collected without the corpus row to attach documents to.
    if fao_corpus is None:
        raise ValueError(f"Corpus {corpus_name} not found")

    collected_urls = FAOOpenKnowledgeURLCollector(corpus=fao_corpus).collect()

    logger.info("URLs retrieved : '%s'", len(collected_urls))
    insert_urls(session=db_session, urls=collected_urls)

    logger.info("FAOOpenKnowledgeURLCollector collector ended")

0 commit comments

Comments
 (0)