-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_data.py
More file actions
100 lines (78 loc) · 3.51 KB
/
prepare_data.py
File metadata and controls
100 lines (78 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 25 13:17:12 2022
@author: will
"""
import os
import numpy as np
import itertools as it
# Colour codes for a single letter of a guess. They double as the ternary
# digits of the integer pattern built by generate_pattern_matrix.
MISS = np.uint8(0)
MISPLACED = np.uint8(1)
EXACT = np.uint8(2)


def words_to_int_arrays(words):
    """Return the character codes of *words* as a 2-D ``uint8`` array.

    Row ``w`` holds ``ord(c)`` for each character ``c`` of ``words[w]``.
    All words must therefore have the same length, and every character
    code must fit in a ``uint8``.
    """
    return np.array([[ord(c) for c in w] for w in words], dtype=np.uint8)


def generate_pattern_matrix(words1, words2):
    """
    https://github.com/3b1b/videos/blob/master/_2022/wordle/simulations.py

    A pattern for two words represents the wordle-similarity
    pattern (grey -> 0, yellow -> 1, green -> 2) but as an integer
    between 0 and 3^n - 1, where n is the word length (0..242 for
    five-letter words). Reading this integer in ternary gives the
    associated pattern, with the first letter of the guess as the least
    significant digit.

    This function computes the pairwise patterns between two lists
    of words, returning the result as a grid of hash values. Since
    this can be time-consuming, many operations that can be are vectorized
    (perhaps at the expense of easier readability). The caller is expected
    to save the result so that it only needs to be evaluated once, and
    all remaining pattern matching is a lookup.

    Parameters
    ----------
    words1 : sequence of str
        The guesses; must be non-empty (the word length is read from
        ``words1[0]``).
    words2 : sequence of str
        The answers; expected to have the same word length as ``words1``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(words1), len(words2))`` whose entry ``[a, b]``
        is the pattern for guess ``words1[a]`` against answer ``words2[b]``.
        The dtype is the smallest unsigned integer type able to hold
        ``3**n - 1`` (``uint8`` for words of up to five letters).
    """
    # Number of letters/words
    nl = len(words1[0])
    nw1 = len(words1)  # Number of guesses
    nw2 = len(words2)  # Number of answers

    # Convert word lists to integer arrays
    word_arr1, word_arr2 = map(words_to_int_arrays, (words1, words2))

    # equality_grid keeps track of all equalities between all pairs
    # of letters in words. Specifically, equality_grid[a, b, i, j]
    # is true when words1[a][i] == words2[b][j]
    equality_grid = np.zeros((nw1, nw2, nl, nl), dtype=bool)
    for i, j in it.product(range(nl), range(nl)):
        equality_grid[:, :, i, j] = np.equal.outer(word_arr1[:, i], word_arr2[:, j])

    # full_pattern_matrix[a, b] should represent the per-letter colour pattern
    # for guess a and answer b, with 0 -> grey, 1 -> yellow, 2 -> green
    full_pattern_matrix = np.zeros((nw1, nw2, nl), dtype=np.uint8)

    # Green pass
    for i in range(nl):
        # (a, b) pairs where words1[a][i] == words2[b][i]. The indices are
        # materialised once so that clearing the grid below cannot alter
        # the selection part-way through.
        rows, cols = np.nonzero(equality_grid[:, :, i, i])
        full_pattern_matrix[rows, cols, i] = EXACT
        # If it's a match, mark all elements associated with
        # that letter, both from the guess and answer, as covered.
        # That way, it won't trigger the yellow pass.
        equality_grid[rows, cols, :, i] = False
        equality_grid[rows, cols, i, :] = False

    # Yellow pass
    for i, j in it.product(range(nl), range(nl)):
        rows, cols = np.nonzero(equality_grid[:, :, i, j])
        full_pattern_matrix[rows, cols, i] = MISPLACED
        # Similar to above, we want to mark this letter
        # as taken care of, both for answer and guess
        equality_grid[rows, cols, :, j] = False
        equality_grid[rows, cols, i, :] = False

    # Rather than representing a color pattern as a list of integers,
    # store it as a single integer, whose ternary representation corresponds
    # to that list of integers. The weight dtype is sized to the largest
    # possible pattern so the sum cannot wrap around for longer words.
    pattern_dtype = np.min_scalar_type(3 ** nl - 1)
    pattern_matrix = np.dot(
        full_pattern_matrix,
        (3 ** np.arange(nl)).astype(pattern_dtype)
    )
    return pattern_matrix


def _load_words(path):
    """Return the non-blank lines of the file at *path*, stripped of whitespace."""
    with open(path, encoding="utf-8") as fp:
        stripped = (line.strip() for line in fp)
        return [word for word in stripped if word]


def main():
    """Build the pattern matrix from the word-list files and save it.

    Reads ``possible_words.txt`` and ``allowed_words.txt`` from the current
    working directory, computes the pattern of every allowed word (guess)
    against every possible word (answer), and writes the result to
    ``matrix.npy``.
    """
    possible_words = _load_words('possible_words.txt')
    allowed_words = _load_words('allowed_words.txt')
    matrix = generate_pattern_matrix(allowed_words, possible_words)
    np.save('matrix.npy', matrix)


# Run only when executed as a script, and only after the functions above
# have been defined.
if __name__ == "__main__":
    main()