|
5 | 5 | from __future__ import annotations |
6 | 6 |
|
7 | 7 | import random |
| 8 | +import re |
8 | 9 | import string |
| 10 | +from dataclasses import replace as dc_replace |
9 | 11 | from datetime import date, datetime, time, timedelta |
10 | 12 | from typing import TYPE_CHECKING, Any, Callable |
11 | 13 |
|
@@ -448,6 +450,13 @@ def generate_column( |
448 | 450 | PERSON_RELATED_PRESETS = {"name", "name_full", "first_name", "last_name", "email", "user_name"} |
449 | 451 | BUSINESS_RELATED_PRESETS = {"job", "company"} |
450 | 452 |
|
| 453 | +# Default working-age bounds applied when business coherence is active |
| 454 | +_WORKING_AGE_MIN = 22 |
| 455 | +_WORKING_AGE_MAX = 65 |
| 456 | + |
| 457 | +# Matches column names containing "age" as a whole underscore-delimited token, case-insensitively (e.g. "age", "user_age", "Age_years"; not "page" or "stage") |
| 458 | +_AGE_COLUMN_RE = re.compile(r"(?:^|_)age(?:$|_)", re.IGNORECASE) |
| 459 | + |
451 | 460 |
|
452 | 461 | def _get_coherence_needs(fields: dict[str, Field]) -> tuple[bool, bool, bool]: |
453 | 462 | """Check what coherence is needed for the given fields.""" |
@@ -549,6 +558,25 @@ def generate_dataframe( |
549 | 558 | if needs_business: |
550 | 559 | coherent_presets.update(BUSINESS_RELATED_PRESETS) |
551 | 560 |
|
| 561 | + # When business coherence is active, constrain age-like integer columns to |
| 562 | + # working-age range so generated data doesn't have 15-year-old professionals |
| 563 | + # or 85-year-old active employees with fictitious employers. |
| 564 | + if needs_business: |
| 565 | + fields = dict(fields) # shallow copy so we don't mutate the caller's dict |
| 566 | + for col_name, col_field in list(fields.items()): |
| 567 | + if col_field.is_integer() and _AGE_COLUMN_RE.search(col_name): |
| 568 | + cur_min = getattr(col_field, "min_val", None) |
| 569 | + cur_max = getattr(col_field, "max_val", None) |
| 570 | + new_min = ( |
| 571 | + max(cur_min, _WORKING_AGE_MIN) if cur_min is not None else _WORKING_AGE_MIN |
| 572 | + ) |
| 573 | + new_max = ( |
| 574 | + min(cur_max, _WORKING_AGE_MAX) if cur_max is not None else _WORKING_AGE_MAX |
| 575 | + ) |
| 576 | + # Only replace if the bounds actually changed |
| 577 | + if new_min != cur_min or new_max != cur_max: |
| 578 | + fields[col_name] = dc_replace(col_field, min_val=new_min, max_val=new_max) |
| 579 | + |
552 | 580 | # Generate data for each column |
553 | 581 | data: dict[str, list[Any]] = {} |
554 | 582 | for col_name, field in fields.items(): |
|
0 commit comments