3 files changed: +37 -1 lines changed
nodes_workflow/DocumentHubCollector Expand file tree Collapse file tree 3 files changed +37
-1
lines changed Original file line number Diff line number Diff line change 33import os
44import re
55from collections import deque
6+ from dataclasses import asdict , is_dataclass
67from functools import cache
78
89from lingua import LanguageDetectorBuilder
@@ -268,3 +269,34 @@ def compute_readability(
268269 document .full_content , document .lang
269270 )
270271 return document
272+
273+
def is_dataclass_instance(obj):
    """Return True when ``obj`` is an instance of a dataclass.

    ``dataclasses.is_dataclass`` is true for both dataclass classes and
    their instances, so classes are ruled out first.
    """
    if isinstance(obj, type):
        return False
    return is_dataclass(obj)
276+
277+
278+ def _inner_serialize_dataclass (value ):
279+ match value :
280+ case list ():
281+ return [_inner_serialize_dataclass (item ) for item in value ]
282+ case dict ():
283+ return {k : _inner_serialize_dataclass (v ) for k , v in value .items ()}
284+ if is_dataclass_instance (value ):
285+ return asdict (value )
286+ return value
287+
288+
def serialize_dataclass_instance(document: WeLearnDocument) -> WeLearnDocument:
    """Convert dataclass instances stored in ``document.details`` to dicts.

    Every value of ``document.details`` is replaced by the result of
    ``_inner_serialize_dataclass``, which already walks nested lists and
    dicts, so no per-container handling is needed here.

    :param document: Document whose ``details`` mapping is rewritten in place.
    :return: The same document object, for call chaining.
    """
    # Nothing to convert when details is missing or empty.
    if not document.details:
        return document

    # Only existing keys are reassigned, so the mapping keeps its size and
    # can be iterated while it is updated.
    for detail_key, detail_value in document.details.items():
        document.details[detail_key] = _inner_serialize_dataclass(detail_value)
    return document
Original file line number Diff line number Diff line change 22import logging
33import os
44import uuid
5+ from dataclasses import asdict
56from typing import Dict , List , Tuple
67from uuid import UUID
78
1617 compute_duration ,
1718 compute_readability ,
1819 identify_document_language ,
20+ serialize_dataclass_instance ,
1921)
2022from welearn_datastack .modules .validation import validate_non_null_fields_document
2123from welearn_datastack .plugins .interface import IPlugin
@@ -91,6 +93,7 @@ def main() -> None:
9193 identify_document_language (doc )
9294 compute_duration (doc )
9395 compute_readability (doc )
96+ serialize_dataclass_instance (doc )
9497 flag_modified (doc , "details" )
9598
9699 db_session .add_all (states )
Original file line number Diff line number Diff line change @@ -133,7 +133,8 @@ def _extract_authors(fao_document: Item) -> list[AuthorDetails]:
133133 contributors_names : list [str ] = []
134134 for contributor_entry in messy_authors :
135135 for name in contributor_entry .value .split (";" ):
136- contributors_names .append (name .strip ())
136+ if name .strip ():
137+ contributors_names .append (name .strip ())
137138
138139 for contributor in contributors_names :
139140 ret .append (AuthorDetails (name = contributor , misc = "" ))
You can’t perform that action at this time.
0 commit comments