Skip to content

Commit 85acbe9

Browse files
committed
feat(fao-open-knowledge): improve metadata parsing and update test cases for consistency
1 parent 8caade9 commit 85acbe9

File tree

3 files changed

+47
-42
lines changed

3 files changed

+47
-42
lines changed

tests/document_collector_hub/plugins_test/test_fao_open_knowledge.py

Lines changed: 15 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def setUp(self):
3232
"language": "en",
3333
"authority": "FAO",
3434
"confidence": 1,
35-
"place": "Rome",
35+
"place": 0,
3636
}
3737
],
3838
"dc.contributor.author": [
@@ -41,7 +41,7 @@ def setUp(self):
4141
"language": "en",
4242
"authority": "FAO",
4343
"confidence": 1,
44-
"place": "Rome",
44+
"place": 0,
4545
}
4646
],
4747
"dc.description.abstract": [
@@ -50,7 +50,7 @@ def setUp(self):
5050
"language": "en",
5151
"authority": "FAO",
5252
"confidence": 1,
53-
"place": "Rome",
53+
"place": 0,
5454
}
5555
],
5656
"dc.identifier.doi": [
@@ -59,7 +59,7 @@ def setUp(self):
5959
"language": "en",
6060
"authority": "FAO",
6161
"confidence": 1,
62-
"place": "Rome",
62+
"place": 0,
6363
}
6464
],
6565
"dc.date.available": [
@@ -68,7 +68,7 @@ def setUp(self):
6868
"language": "en",
6969
"authority": "FAO",
7070
"confidence": 1,
71-
"place": "Rome",
71+
"place": 0,
7272
}
7373
],
7474
"dc.date.lastModified": [
@@ -77,7 +77,7 @@ def setUp(self):
7777
"language": "en",
7878
"authority": "FAO",
7979
"confidence": 1,
80-
"place": "Rome",
80+
"place": 0,
8181
}
8282
],
8383
"fao.taxonomy.type": [
@@ -86,7 +86,7 @@ def setUp(self):
8686
"language": "en",
8787
"authority": "FAO",
8888
"confidence": 1,
89-
"place": "Rome",
89+
"place": 0,
9090
}
9191
],
9292
},
@@ -140,17 +140,22 @@ def test_run_success(self, mock_get_metadata, mock_get_bundle, mock_get_pdf):
140140
# Simulate a successful run with valid PDF and metadata
141141
mock_get_metadata.return_value = self.item
142142
mock_get_bundle.return_value = [self.bundle]
143-
mock_get_pdf.return_value = "PDF content extracted."
143+
mock_get_pdf.return_value = "PDF content extracted. Lorem ispum"
144144
result = self.collector.run([self.doc])
145145
self.assertEqual(len(result), 1)
146146
doc_result = result[0]
147147
self.assertIsNone(doc_result.error_info)
148148
self.assertIsInstance(doc_result.document, WeLearnDocument)
149-
self.assertEqual(doc_result.document.full_content, "PDF content extracted.")
149+
self.assertEqual(
150+
doc_result.document.full_content, "PDF content extracted. Lorem ispum"
151+
)
150152
self.assertEqual(doc_result.document.title, "FAO Document Title")
151153
self.assertEqual(doc_result.document.description, "A description.")
152154
self.assertEqual(doc_result.document.details["doi"], "10.1234/fao.5678")
153-
self.assertEqual(doc_result.document.details["license_url"], "cc-by-4.0")
155+
self.assertEqual(
156+
doc_result.document.details["license_url"],
157+
"https://creativecommons.org/licenses/by/4.0/",
158+
)
154159
self.assertEqual(doc_result.document.details["type"], "Report")
155160
self.assertTrue(
156161
doc_result.document.details["contrent_from_pdf"]
@@ -238,19 +243,6 @@ def test_run_embargo(self, mock_get_metadata, mock_get_bundle, mock_get_pdf):
238243
self.assertIn("unauthorized state", result[0].error_info)
239244
self.assertTrue(result[0].is_error)
240245

241-
# @patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
242-
# @patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
243-
# @patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")
244-
# def test_run_pydantic_validation_error(
245-
# self, mock_get_metadata, mock_get_bundle, mock_get_pdf
246-
# ):
247-
# # Simulate pydantic validation error
248-
# mock_get_metadata.side_effect = pydantic.ValidationError([], "error")
249-
# result = self.collector.run([self.doc])
250-
# self.assertEqual(len(result), 1)
251-
# self.assertIn("validation error", result[0].error_info)
252-
# self.assertTrue(result[0].is_error)
253-
254246
@patch.object(FAOOpenKnowledgeCollector, "_get_pdf_content")
255247
@patch.object(FAOOpenKnowledgeCollector, "get_bundle_json")
256248
@patch.object(FAOOpenKnowledgeCollector, "get_metadata_json")

welearn_datastack/data/source_models/fao_open_knowledge.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,8 @@ class MetadataEntry(BaseModel):
9191
authority: str | None
9292
confidence: int | None
9393
place: int
94+
95+
96+
#
97+
# class MetadataEntries(BaseModel):
98+
# metadata: list[MetadataEntry]

welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import io
22
import logging
33
import os
4+
from collections import defaultdict
45
from datetime import datetime
56
from typing import Any
67

@@ -201,39 +202,46 @@ def _extract_external_sdgs(sdgs_str: list[MetadataEntry]) -> list[int]:
201202
return ret
202203

203204
def _extract_details(self, fao_document: Item) -> dict:
    """Build the ``details`` dictionary for an FAO Open Knowledge item.

    Every key of ``fao_document.metadata`` maps to a list of raw entries;
    each entry is validated as a ``MetadataEntry``. A key whose entries
    fail validation is logged and left out of the parsed result.

    Single-valued details (dates, ISBN, DOI, type) use the first parsed
    entry of their key, so a key carrying several entries, or none at all,
    no longer raises ``ValueError`` during unpacking.

    :param fao_document: item whose ``metadata`` mapping is parsed.
    :return: dict with the keys ``publication_date``, ``update_date``,
        ``isbn``, ``license_url``, ``authors``, ``external_sdg``,
        ``contrent_from_pdf``, ``doi`` and ``type``. Dates are POSIX
        timestamps, or ``None`` when the date is absent; missing text
        fields are ``""``.
    :raises ValueError: if a date value is present but does not match the
        expected ``YYYY-MM-DDTHH:MM:SSZ`` layout.
    """
    parsed_metadata: dict[str, list[MetadataEntry]] = {}
    for metadata in fao_document.metadata:
        mds = fao_document.metadata.get(metadata)
        try:
            # Validate the whole list before storing it, so a failing key
            # never leaves a partial or empty list behind.
            entries = [MetadataEntry.model_validate(md) for md in (mds or [])]
        except pydantic.ValidationError as e:
            logger.warning(f"Cannot parse metadata entry: {metadata}: {e}")
            continue
        if entries:
            parsed_metadata[metadata] = entries

    # ``%z`` accepts the trailing "Z" and yields a timezone-aware datetime;
    # a literal "Z" would give a naive value that ``timestamp()`` reads as
    # local time instead of UTC.
    date_format = "%Y-%m-%dT%H:%M:%S%z"

    def first_value(key: str) -> str:
        # First entry wins for fields the details dict stores as a scalar.
        entries = parsed_metadata.get(key)
        return entries[0].value if entries else ""

    def to_timestamp(raw: str) -> float | None:
        if not raw:
            return None
        return datetime.strptime(raw, date_format).timestamp()

    ret: dict[str, Any] = {
        "publication_date": to_timestamp(first_value("dc.date.available")),
        "update_date": to_timestamp(first_value("dc.date.lastModified")),
        "isbn": first_value("dc.identifier.isbn"),
        "license_url": self._extract_licence(fao_document),
        "authors": self._extract_authors(fao_document),
        "external_sdg": self._extract_external_sdgs(
            parsed_metadata.get("fao.sdgs", [])
        ),
        "contrent_from_pdf": True,
        "doi": first_value("dc.identifier.doi"),
        "type": first_value("fao.taxonomy.type"),
    }
    return ret
239247

0 commit comments

Comments
 (0)