4 from typing import Dict, Mapping
10 cfg = config.add_commandline_args(
11 f'Unscramble! ({__file__})', 'A fast word unscrambler.'
14 "--unscramble_indexfile",
15 help="Path to a file of signature -> word index.",
17 default="/usr/share/dict/sparse_index",
20 logger = logging.getLogger(__name__)
23 letters_mask = 2 ** letters_bits - 1
26 fprint_mask = (2 ** fprint_bits - 1) << letters_bits
28 fprint_feature_bit = {
87 class Unscrambler(object):
88 """A class that unscrambles words quickly by computing a signature
89 (sig) for the word based on its position independent letter
90 population and then using a pregenerated index to look up known
91 words the same set of letters.
93 Note that each instance of Unscrambler caches its index to speed
94 up lookups number 2..N; careless reinstantiation will by slower.
96 Sigs are designed to cluster similar words near each other so both
97 lookup methods support a "fuzzy match" argument that can be set to
98 request similar words that do not match exactly in addition to any
104 # Cached index per instance.
108 if 'unscramble_indexfile' in config.config:
109 indexfile = config.config['unscramble_indexfile']
111 indexfile = "/usr/share/dict/sparse_index"
113 with open(indexfile, 'r') as rf:
114 lines = rf.readlines()
117 (fsig, word) = line.split('+')
119 self.sigs.append(fsig)
120 self.words.append(word)
124 def _compute_word_fingerprint(word: str, population: Mapping[str, int]) -> int:
126 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
128 if letter in fprint_feature_bit:
132 shift = fprint_feature_bit[letter]
135 return fp << letters_bits
139 def _compute_word_letter_sig(
140 letter_sigs: Mapping[str, int],
142 population: Mapping[str, int],
145 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
147 if letter not in letter_sigs:
149 s = letter_sigs[letter]
165 @decorator_utils.memoized
166 def compute_word_sig(word: str) -> int:
167 """Given a word, compute its signature for subsequent lookup
168 operations. Signatures are computed based on the letters in
169 the word and their frequencies. We try to cluster "similar"
170 words close to each other in the signature space.
172 >>> train = Unscrambler.compute_word_sig('train')
176 >>> retain = Unscrambler.compute_word_sig('retrain')
184 population = list_utils.population_counts(word)
185 fprint = Unscrambler._compute_word_fingerprint(word, population)
186 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
187 assert fprint & letter_sig == 0
188 sig = fprint | letter_sig
193 letter_sigs: Dict[str, int],
194 dictfile: str = '/usr/share/dict/words',
195 indexfile: str = '/usr/share/dict/sparse_index',
197 """Before calling this method, change letter_sigs from the default above
198 unless you want to populate the same exact files.
203 with open(dictfile, "r") as f:
205 word = word.replace('\n', '')
207 sig = Unscrambler.compute_word_sig(letter_sigs, word)
208 logger.debug("%s => 0x%x" % (word, sig))
212 if sig in words_by_sigs:
213 words_by_sigs[sig] += ",%s" % word
215 words_by_sigs[sig] = word
216 with open(indexfile, 'w') as f:
217 for sig in sorted(words_by_sigs.keys()):
218 word = words_by_sigs[sig]
219 print(f'0x{sig:x}+{word}', file=f)
221 def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
222 """Looks up a potentially scrambled word optionally including near
225 >>> u = Unscrambler()
226 >>> u.lookup('eanycleocipd', window_size=0)
227 {'encyclopedia': True}
230 sig = Unscrambler.compute_word_sig(word)
231 return self.lookup_by_sig(sig, window_size=window_size)
233 def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
234 """Looks up a word that has already been translated into a signature by
235 a previous call to Unscrambler.compute_word_sig. Optionally returns
236 near "fuzzy" matches.
238 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
242 >>> u = Unscrambler()
243 >>> u.lookup_by_sig(sig)
244 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
248 (exact, location) = list_utils.binary_search(self.sigs, sig)
249 start = location - window_size
252 end = location + 1 + window_size
253 if end > len(self.words):
254 end = len(self.words)
256 for x in range(start, end):
259 if window_size > 0 or (fsig == sig):
260 ret[word] = fsig == sig
265 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
266 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
270 if __name__ == "__main__":