|
52 | 52 | } |
53 | 53 |
|
54 | 54 |
|
55 | | -def _generate_random_word(seed: int, min_length: int, max_length: int) -> str: |
56 | | - """Generate deterministic random word from seed.""" |
57 | | - random.seed(seed) |
58 | | - word_length = random.randint(min_length, max_length) |
59 | | - return "".join(random.choices(ALPHABET, k=word_length)) |
| 55 | +def _generate_random_word(rng: random.Random, min_length: int, max_length: int) -> str: |
| 56 | + """Generate deterministic random word using local RNG instance.""" |
| 57 | + word_length = rng.randint(min_length, max_length) |
| 58 | + return "".join(rng.choices(ALPHABET, k=word_length)) |
60 | 59 |
|
61 | 60 |
|
62 | | -def _apply_fuzzy_edit(word: str, edit_type: str) -> str: |
63 | | - """Apply single Levenshtein edit operation to word.""" |
| 61 | +def _apply_fuzzy_edit(word: str, edit_type: str, rng: random.Random) -> str: |
| 62 | + """Apply single Levenshtein edit operation using local RNG instance.""" |
64 | 63 | if len(word) < 2: |
65 | 64 | return word |
66 | 65 |
|
67 | 66 | if edit_type == "insert": |
68 | | - pos = random.randint(0, len(word)) |
69 | | - return word[:pos] + random.choice(ALPHABET) + word[pos:] |
| 67 | + pos = rng.randint(0, len(word)) |
| 68 | + return word[:pos] + rng.choice(ALPHABET) + word[pos:] |
70 | 69 | elif edit_type == "delete": |
71 | | - pos = random.randint(0, len(word) - 1) |
| 70 | + pos = rng.randint(0, len(word) - 1) |
72 | 71 | return word[:pos] + word[pos + 1 :] |
73 | 72 | elif edit_type == "substitute": |
74 | | - pos = random.randint(0, len(word) - 1) |
75 | | - return word[:pos] + random.choice(ALPHABET) + word[pos + 1 :] |
| 73 | + # Retry until we get a different character (avoid no-op) |
| 74 | + pos = rng.randint(0, len(word) - 1) |
| 75 | + original_char = word[pos] |
| 76 | + new_char = rng.choice(ALPHABET) |
| 77 | + while new_char == original_char and len(ALPHABET) > 1: |
| 78 | + new_char = rng.choice(ALPHABET) |
| 79 | + return word[:pos] + new_char + word[pos + 1 :] |
76 | 80 |
|
77 | 81 | return word |
78 | 82 |
|
@@ -252,20 +256,23 @@ def apply_transforms( |
252 | 256 | within_term = (doc_num - 1) % docs_per_term |
253 | 257 | variant_id = within_term // docs_per_variant |
254 | 258 |
|
255 | | - # Generate base word (consistent per term) |
256 | | - base_word = _generate_random_word(term_id, min_word_length, max_word_length) |
| 259 | + # Generate base word using isolated RNG |
| 260 | + term_rng = random.Random(term_id) |
| 261 | + base_word = _generate_random_word( |
| 262 | + term_rng, min_word_length, max_word_length |
| 263 | + ) |
257 | 264 |
|
258 | 265 | # Generate variant (misspelling) |
259 | 266 | if variant_id == 0: |
260 | 267 | variant = base_word # First variant is correct spelling |
261 | 268 | else: |
262 | | - # Generate consistent misspelling for this variant_id |
263 | | - random.seed(term_id * 1000 + variant_id) |
| 269 | + # Seed from the (term_id, variant_id) tuple hash; avoids the old term_id * 1000 overlap, though hash collisions remain possible |
| 270 | + variant_rng = random.Random(hash((term_id, variant_id))) |
264 | 271 | variant = base_word |
265 | 272 | # Apply target_distance edits |
266 | 273 | for _ in range(target_distance): |
267 | | - edit_type = random.choice(["insert", "delete", "substitute"]) |
268 | | - variant = _apply_fuzzy_edit(variant, edit_type) |
| 274 | + edit_type = variant_rng.choice(["insert", "delete", "substitute"]) |
| 275 | + variant = _apply_fuzzy_edit(variant, edit_type, variant_rng) |
269 | 276 |
|
270 | 277 | # Store just the variant (no term prefix) |
271 | 278 | content = variant |
@@ -522,8 +529,9 @@ def generate_queries(output_dir: Path, config: dict, filename: str) -> Path: |
522 | 529 |
|
523 | 530 | writer.writerow(["term"]) |
524 | 531 | for term_id in range(1, num_queries + 1): |
525 | | - base_word = _generate_random_word(term_id, min_length, max_length) |
526 | | - # Query just the base word (no term prefix) |
| 532 | + # Use isolated RNG for reproducibility |
| 533 | + term_rng = random.Random(term_id) |
| 534 | + base_word = _generate_random_word(term_rng, min_length, max_length) |
527 | 535 | writer.writerow([base_word]) |
528 | 536 |
|
529 | 537 | logging.info(f"Complete: {filename} ({num_queries} queries)") |
|
0 commit comments