Skip to content

Commit 8c57ec1

Browse files
author
Ram Prasad Voleti
committed
Address comments of PR to use local RNG for stable seed generation
Address comments of PR to use local RNG for stable seed generation. Signed-off-by: Ram Prasad Voleti <ramvolet@amazon.com>
1 parent c7d342d commit 8c57ec1

File tree

1 file changed

+28
-20
lines changed

1 file changed

+28
-20
lines changed

scripts/setup_datasets.py

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -52,27 +52,31 @@
5252
}
5353

5454

55-
def _generate_random_word(seed: int, min_length: int, max_length: int) -> str:
56-
"""Generate deterministic random word from seed."""
57-
random.seed(seed)
58-
word_length = random.randint(min_length, max_length)
59-
return "".join(random.choices(ALPHABET, k=word_length))
55+
def _generate_random_word(rng: random.Random, min_length: int, max_length: int) -> str:
56+
"""Generate deterministic random word using local RNG instance."""
57+
word_length = rng.randint(min_length, max_length)
58+
return "".join(rng.choices(ALPHABET, k=word_length))
6059

6160

62-
def _apply_fuzzy_edit(word: str, edit_type: str) -> str:
63-
"""Apply single Levenshtein edit operation to word."""
61+
def _apply_fuzzy_edit(word: str, edit_type: str, rng: random.Random) -> str:
62+
"""Apply single Levenshtein edit operation using local RNG instance."""
6463
if len(word) < 2:
6564
return word
6665

6766
if edit_type == "insert":
68-
pos = random.randint(0, len(word))
69-
return word[:pos] + random.choice(ALPHABET) + word[pos:]
67+
pos = rng.randint(0, len(word))
68+
return word[:pos] + rng.choice(ALPHABET) + word[pos:]
7069
elif edit_type == "delete":
71-
pos = random.randint(0, len(word) - 1)
70+
pos = rng.randint(0, len(word) - 1)
7271
return word[:pos] + word[pos + 1 :]
7372
elif edit_type == "substitute":
74-
pos = random.randint(0, len(word) - 1)
75-
return word[:pos] + random.choice(ALPHABET) + word[pos + 1 :]
73+
# Retry until we get a different character (avoid no-op)
74+
pos = rng.randint(0, len(word) - 1)
75+
original_char = word[pos]
76+
new_char = rng.choice(ALPHABET)
77+
while new_char == original_char and len(ALPHABET) > 1:
78+
new_char = rng.choice(ALPHABET)
79+
return word[:pos] + new_char + word[pos + 1 :]
7680

7781
return word
7882

@@ -252,20 +256,23 @@ def apply_transforms(
252256
within_term = (doc_num - 1) % docs_per_term
253257
variant_id = within_term // docs_per_variant
254258

255-
# Generate base word (consistent per term)
256-
base_word = _generate_random_word(term_id, min_word_length, max_word_length)
259+
# Generate base word (consistent per term) using an isolated RNG seeded with term_id
260+
term_rng = random.Random(term_id)
261+
base_word = _generate_random_word(
262+
term_rng, min_word_length, max_word_length
263+
)
257264

258265
# Generate variant (misspelling)
259266
if variant_id == 0:
260267
variant = base_word # First variant is correct spelling
261268
else:
262-
# Generate consistent misspelling for this variant_id
263-
random.seed(term_id * 1000 + variant_id)
269+
# Seed a per-variant RNG from the (term_id, variant_id) tuple hash (collisions are unlikely, not impossible)
270+
variant_rng = random.Random(hash((term_id, variant_id)))
264271
variant = base_word
265272
# Apply target_distance edits
266273
for _ in range(target_distance):
267-
edit_type = random.choice(["insert", "delete", "substitute"])
268-
variant = _apply_fuzzy_edit(variant, edit_type)
274+
edit_type = variant_rng.choice(["insert", "delete", "substitute"])
275+
variant = _apply_fuzzy_edit(variant, edit_type, variant_rng)
269276

270277
# Store just the variant (no term prefix)
271278
content = variant
@@ -522,8 +529,9 @@ def generate_queries(output_dir: Path, config: dict, filename: str) -> Path:
522529

523530
writer.writerow(["term"])
524531
for term_id in range(1, num_queries + 1):
525-
base_word = _generate_random_word(term_id, min_length, max_length)
526-
# Query just the base word (no term prefix)
532+
# Use isolated RNG for reproducibility
533+
term_rng = random.Random(term_id)
534+
base_word = _generate_random_word(term_rng, min_length, max_length)
527535
writer.writerow([base_word])
528536

529537
logging.info(f"Complete: {filename} ({num_queries} queries)")

0 commit comments

Comments
 (0)