posit-dev · rich-iannone · Feb 18, 2026 · Feb 18, 2026 · Feb 18, 2026 · Feb 18, 2026
diff --git a/docs/user-guide/test-data-generation.qmd b/docs/user-guide/test-data-generation.qmd
@@ -420,6 +420,62 @@ rows. A French row will have a French name with a matching French email; a Japan
 Japanese name with a matching Japanese email. Non-preset columns (integers, floats, booleans, dates)
 are generated independently for each batch but still respect their field constraints.
 
+### Frequency-Weighted Sampling
+
+By default, names and cities are sampled uniformly at random from the locale data, giving every
+entry the same probability of being selected. Real-world distributions are far from uniform, though:
+"James" and "Maria" appear orders of magnitude more often than "Thaddeus" or "Xiomara", and more
+people live in New York City than in Flagstaff. The `weighted=True` parameter makes generated data
+reflect this natural skew.
+
+```{python}
+schema = pb.Schema(
+    name=pb.string_field(preset="name"),
+    city=pb.string_field(preset="city"),
+)
+
+pb.preview(pb.generate_dataset(schema, n=200, seed=23, country="US", weighted=True))
+```
+
+With weighting enabled, you will see popular names like James, John, Mary, and Patricia appear more
+frequently, while unusual names surface only occasionally. Similarly, cities like New York, Los
+Angeles, and Chicago dominate the output while smaller cities appear less often.
+
+The feature works by organizing locale data into four frequency tiers. Each tier has a sampling
+probability, i.e., the chance that any single generated value is drawn from that tier as a whole:
+
+| Tier | Probability | Contents |
+|------|-------------|----------|
+| very_common | 45% | The top ~10% of entries by real-world frequency |
+| common | 30% | The next ~20% of entries |
+| uncommon | 20% | The next ~30% of entries |
+| rare | 5% | The remaining ~40% of entries |
+
+When a value is needed, a tier is first chosen according to these probabilities and then a single
+entry is picked uniformly at random within that tier. This two-step approach keeps sampling fast
+while producing a realistic long-tail distribution. Setting `weighted=False` pools all entries
+across every tier and samples them uniformly, which can be useful when you want an even spread
+rather than a realistic distribution.
+
+Weighted sampling combines seamlessly with multi-country mixing. Each country's batch uses its own
+tiered data independently, so a mixed dataset will have weighted US names alongside weighted German
+names:
+
+```{python}
+pb.preview(
+    pb.generate_dataset(
+        schema,
+        n=200,
+        seed=23,
+        country={"US": 0.6, "DE": 0.4},
+        weighted=True,
+    )
+)
+```
+
+All 55 supported country locales have tiered name and location data, so `weighted=True` produces
+realistic frequency distributions for every country.
+
 ## Output Formats
 
 The `generate_dataset()` function supports multiple output formats via the `output=` parameter,

diff --git a/pointblank/countries/__init__.py b/pointblank/countries/__init__.py
@@ -540,6 +540,43 @@ def clear_cache(self) -> None:
         self._cache.clear()
 
 
+# Default frequency tier weights for weighted sampling (the four weights sum to 1.0).
+# Keys must match the tier names used in tiered data files.
+FREQUENCY_TIERS: dict[str, float] = {
+    "very_common": 0.45,
+    "common": 0.30,
+    "uncommon": 0.20,
+    "rare": 0.05,
+}
+
+_TIER_KEYS = frozenset(FREQUENCY_TIERS)
+
+
+def _is_tiered(data: Any) -> bool:
+    """Return True if *data* is a dict with at least one frequency tier name among its keys."""
+    return isinstance(data, dict) and bool(_TIER_KEYS & set(data.keys()))
+
+
+def _flatten_tiered(data: dict[str, list]) -> list:
+    """Flatten a tiered dict into one list in FREQUENCY_TIERS order; other keys are ignored."""
+    items: list = []
+    for tier in FREQUENCY_TIERS:
+        items.extend(data.get(tier, []))
+    return items
+
+
+def _pick_from_tiered(tiered_data: dict[str, list], rng: random.Random) -> Any:
+    """Pick a non-empty tier by weight, then an item uniformly from it; None if nothing to pick."""
+    available_tiers = [t for t in FREQUENCY_TIERS if t in tiered_data and tiered_data[t]]
+    if not available_tiers:
+        # Fallback: no known tier has items, so pool every value list and pick uniformly
+        all_items = [item for tier_list in tiered_data.values() for item in tier_list]
+        return rng.choice(all_items) if all_items else None
+    weights = [FREQUENCY_TIERS[t] for t in available_tiers]
+    chosen_tier = rng.choices(available_tiers, weights=weights, k=1)[0]
+    return rng.choice(tiered_data[chosen_tier])
+
+
 class LocaleGenerator:
     """
     Generator for country-specific test data.
@@ -548,7 +585,7 @@ class LocaleGenerator:
     addresses, etc. based on country-specific patterns and data.
     """
 
-    def __init__(self, country: str = "US", seed: int | None = None):
+    def __init__(self, country: str = "US", seed: int | None = None, weighted: bool = True):
         """
         Initialize the country data generator.
 
@@ -559,9 +596,14 @@ def __init__(self, country: str = "US", seed: int | None = None):
             Also accepts legacy locale codes like "en_US" for backwards compatibility.
         seed
             Random seed for reproducibility.
+        weighted
+            When True, names and locations are sampled according to real-world frequency
+            tiers (common names appear far more often than rare names). Only affects data
+            files using the tiered format; flat-list data always uses uniform sampling.
         """
         self.country_code = _normalize_country(country)
         self.rng = random.Random(seed)
+        self.weighted = weighted
         self._registry = LocaleRegistry()
         self._data = self._registry.get(self.country_code)
 
@@ -649,43 +691,70 @@ def _generate_first_name(self, gender: str | None = None) -> str:
         names = self._data.person.get("first_names", {})
 
         if gender and gender in names:
-            name_list = names[gender]
+            name_data = names[gender]
         elif "neutral" in names:
             # Combine all available names
-            all_names = []
+            name_data = []
             for category in ["male", "female", "neutral"]:
-                all_names.extend(names.get(category, []))
-            name_list = all_names if all_names else ["Alex"]
+                cat_data = names.get(category, [])
+                if _is_tiered(cat_data):
+                    name_data.extend(_flatten_tiered(cat_data))
+                elif isinstance(cat_data, list):
+                    name_data.extend(cat_data)
+            name_data = name_data if name_data else ["Alex"]
         else:
             # Flatten all categories
-            all_names = []
+            name_data = []
             for category_names in names.values():
-                if isinstance(category_names, list):
-                    all_names.extend(category_names)
-            name_list = all_names if all_names else ["Alex"]
+                if _is_tiered(category_names):
+                    name_data.extend(_flatten_tiered(category_names))
+                elif isinstance(category_names, list):
+                    name_data.extend(category_names)
+            name_data = name_data if name_data else ["Alex"]
+
+        # Tiered weighted sampling
+        if self.weighted and _is_tiered(name_data):
+            return _pick_from_tiered(name_data, self.rng)
+
+        # Flat list (or tiered with weighted=False — flatten first)
+        if _is_tiered(name_data):
+            name_data = _flatten_tiered(name_data)
 
-        return self.rng.choice(name_list)
+        return self.rng.choice(name_data)
 
     def _generate_last_name(self, gender: str | None = None) -> str:
         """Generate a random last name (internal, no caching).
 
         If last_names is a dict with 'male'/'female' keys (e.g., IS patronymics), picks from the
-        gender-appropriate list.
+        gender-appropriate list.  Also handles frequency-tiered dicts when ``self.weighted``.
         """
         names = self._data.person.get("last_names", ["Smith"])
 
+        # Tiered last names (top-level tiers without gender sub-keys)
+        if _is_tiered(names):
+            if self.weighted:
+                return _pick_from_tiered(names, self.rng)
+            return self.rng.choice(_flatten_tiered(names))
+
         if isinstance(names, dict):
             # Gendered last names (e.g., Icelandic patronymics)
             if gender and gender in names:
-                name_list = names[gender]
+                name_data = names[gender]
             else:
                 # Flatten all categories
                 all_names = []
                 for cat_names in names.values():
                     if isinstance(cat_names, list):
                         all_names.extend(cat_names)
-                name_list = all_names if all_names else ["Smith"]
-            return self.rng.choice(name_list)
+                    elif _is_tiered(cat_names):
+                        all_names.extend(_flatten_tiered(cat_names))
+                name_data = all_names if all_names else ["Smith"]
+
+            if self.weighted and _is_tiered(name_data):
+                return _pick_from_tiered(name_data, self.rng)
+            if _is_tiered(name_data):
+                name_data = _flatten_tiered(name_data)
+            return self.rng.choice(name_data)
 
         return self.rng.choice(names)
 
@@ -1178,8 +1247,21 @@ def name_full(self, gender: str | None = None) -> str:
     def _get_location(self) -> dict[str, str]:
         """Get a coherent location (city, state, postcode_prefix) from the data."""
         locations = self._data.address.get("locations", [])
-        if locations:
+
+        # Tiered locations
+        if _is_tiered(locations):
+            if self.weighted:
+                loc = _pick_from_tiered(locations, self.rng)
+                if loc is not None:
+                    return loc
+            else:
+                flat = _flatten_tiered(locations)
+                if flat:
+                    return self.rng.choice(flat)
+
+        elif locations:
             return self.rng.choice(locations)
+
         # Fallback for old-style data
         return {
             "city": "Springfield",
@@ -2656,7 +2738,9 @@ def _generate_from_format(self, fmt: str) -> str:
 _default_registry = LocaleRegistry()
 
 
-def get_generator(country: str = "US", seed: int | None = None) -> LocaleGenerator:
+def get_generator(
+    country: str = "US", seed: int | None = None, weighted: bool = True
+) -> LocaleGenerator:
     """
     Get a country data generator instance.
 
@@ -2667,10 +2751,12 @@ def get_generator(country: str = "US", seed: int | None = None) -> LocaleGenerat
         Also accepts legacy locale codes like "en_US" for backwards compatibility.
     seed
         Random seed for reproducibility.
+    weighted
+        When True, names and locations are sampled with real-world frequency tiers.
 
     Returns
     -------
     LocaleGenerator
         A generator configured for the specified country.
     """
-    return LocaleGenerator(country=country, seed=seed)
+    return LocaleGenerator(country=country, seed=seed, weighted=weighted)