Skip to content

Commit 8cb030d

Browse files
committed
Fix hash ID generation to properly handle empty id values in structured data processing
1 parent e2a1070 commit 8cb030d

1 file changed

Lines changed: 13 additions & 1 deletion

File tree

src/headhunter/models.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,19 @@ def __post_init__(self) -> None:
256256
)
257257
self.warnings.append(warning_msg)
258258
logger.warning(warning_msg)
259-
self.metadata["id"] = hashlib.sha256(self.text.encode("utf-8")).hexdigest()
259+
260+
# Structured data has an empty text field, so when text is empty,
261+
# hash the row_index and content values instead to keep IDs unique.
262+
if self.text:
263+
hash_source = self.text
264+
else:
265+
row_index = self.metadata["row_index"]
266+
content_values = [t.content for t in self.tokens if t.type == "content"]
267+
hash_source = f"{row_index}:{'|'.join(content_values)}"
268+
269+
self.metadata["id"] = hashlib.sha256(
270+
hash_source.encode("utf-8")
271+
).hexdigest()
260272

261273
def __repr__(self) -> str:
262274
"""Return a readable string representation."""

0 commit comments

Comments
 (0)