Skip to content

Commit 4e3c789

Browse files
committed
Improve ID generation logic in ParsedText to handle empty or missing IDs in metadata
1 parent 916ec0b commit 4e3c789

1 file changed

Lines changed: 11 additions & 3 deletions

File tree

src/headhunter/models.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -216,9 +216,17 @@ class ParsedText:
216 216
warnings: list[str]
217 217

218 218
def __post_init__(self) -> None:
219-
"""Auto-generate ID from content hash if not provided in metadata."""
220-
if "id" not in self.metadata:
221-
warning_msg = "No document ID found. Generating one from content hash."
219+
"""Auto-generate ID from content hash if not provided or empty in metadata."""
220+
existing_id = self.metadata.get("id")
221+
id_is_empty = (
222+
existing_id is None
223+
or pd.isna(existing_id)
224+
or (isinstance(existing_id, str) and not existing_id.strip())
225+
)
226+
if id_is_empty:
227+
warning_msg = (
228+
"No valid document ID found. Generating one from content hash."
229+
)
222 230
self.warnings.append(warning_msg)
223 231
logger.warning(warning_msg)
224 232
self.metadata["id"] = hashlib.sha256(self.text.encode("utf-8")).hexdigest()

0 commit comments

Comments
 (0)