Skip to content

Commit a89e554

Browse files
committed
feat(fao_open_knowledge): implement serialization for dataclass instances and enhance document processing
1 parent 0a6d5a5 commit a89e554

File tree

3 files changed

+37
-1
lines changed

3 files changed

+37
-1
lines changed

welearn_datastack/modules/computed_metadata.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import re
55
from collections import deque
6+
from dataclasses import asdict, is_dataclass
67
from functools import cache
78

89
from lingua import LanguageDetectorBuilder
@@ -268,3 +269,34 @@ def compute_readability(
268269
document.full_content, document.lang
269270
)
270271
return document
272+
273+
274+
def is_dataclass_instance(obj):
    """Return True when ``obj`` is an instance of a dataclass, not the class itself.

    ``dataclasses.is_dataclass`` is true for both a dataclass type and its
    instances, so type objects are ruled out first.
    """
    if isinstance(obj, type):
        return False
    return is_dataclass(obj)
276+
277+
278+
def _inner_serialize_dataclass(value):
279+
match value:
280+
case list():
281+
return [_inner_serialize_dataclass(item) for item in value]
282+
case dict():
283+
return {k: _inner_serialize_dataclass(v) for k, v in value.items()}
284+
if is_dataclass_instance(value):
285+
return asdict(value)
286+
return value
287+
288+
289+
def serialize_dataclass_instance(document: WeLearnDocument) -> WeLearnDocument:
    """Convert dataclass instances stored in ``document.details`` to plain dicts.

    Every entry of ``document.details`` is passed through
    ``_inner_serialize_dataclass`` and written back under the same key.

    :param document: Document whose ``details`` mapping is rewritten in place.
    :return: The same document object that was passed in.
    """
    # NOTE(review): whether ``details`` can be None is not visible from here;
    # this guard makes a missing or empty mapping a no-op instead of an error.
    if not document.details:
        return document

    # ``_inner_serialize_dataclass`` already recurses into lists and dicts, so
    # no per-type branching is needed at this level. Only existing keys are
    # reassigned, so the mapping's size never changes while it is iterated.
    for detail_key, detail_value in document.details.items():
        document.details[detail_key] = _inner_serialize_dataclass(detail_value)
    return document

welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33
import os
44
import uuid
5+
from dataclasses import asdict
56
from typing import Dict, List, Tuple
67
from uuid import UUID
78

@@ -16,6 +17,7 @@
1617
compute_duration,
1718
compute_readability,
1819
identify_document_language,
20+
serialize_dataclass_instance,
1921
)
2022
from welearn_datastack.modules.validation import validate_non_null_fields_document
2123
from welearn_datastack.plugins.interface import IPlugin
@@ -91,6 +93,7 @@ def main() -> None:
9193
identify_document_language(doc)
9294
compute_duration(doc)
9395
compute_readability(doc)
96+
serialize_dataclass_instance(doc)
9497
flag_modified(doc, "details")
9598

9699
db_session.add_all(states)

welearn_datastack/plugins/rest_requesters/fao_open_knowledge.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,8 @@ def _extract_authors(fao_document: Item) -> list[AuthorDetails]:
133133
contributors_names: list[str] = []
134134
for contributor_entry in messy_authors:
135135
for name in contributor_entry.value.split(";"):
136-
contributors_names.append(name.strip())
136+
if name.strip():
137+
contributors_names.append(name.strip())
137138

138139
for contributor in contributors_names:
139140
ret.append(AuthorDetails(name=contributor, misc=""))

0 commit comments

Comments
 (0)