We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent e2a1070 commit 8cb030d (Copy full SHA for 8cb030d)
1 file changed
src/headhunter/models.py
@@ -256,7 +256,19 @@ def __post_init__(self) -> None:
256
)
257
self.warnings.append(warning_msg)
258
logger.warning(warning_msg)
259
- self.metadata["id"] = hashlib.sha256(self.text.encode("utf-8")).hexdigest()
+
260
+ # Since structured data has empty text field
261
+ # include content values and row_index to ensure unique IDs
262
+ if self.text:
263
+ hash_source = self.text
264
+ else:
265
+ row_index = self.metadata["row_index"]
266
+ content_values = [t.content for t in self.tokens if t.type == "content"]
267
+ hash_source = f"{row_index}:{'|'.join(content_values)}"
268
269
+ self.metadata["id"] = hashlib.sha256(
270
+ hash_source.encode("utf-8")
271
+ ).hexdigest()
272
273
def __repr__(self) -> str:
274
"""Return a readable string representation."""
0 commit comments