4 from typing import Dict, Mapping
10 cfg = config.add_commandline_args(
11 f'Unscramble! ({__file__})', 'A fast word unscrambler.'
14 "--unscramble_indexfile",
15 help="Path to a file of signature -> word index.",
17 default="/usr/share/dict/sparse_index",
20 logger = logging.getLogger(__name__)
23 letters_mask = 2 ** letters_bits - 1
26 fprint_mask = (2 ** fprint_bits - 1) << letters_bits
28 fprint_feature_bit = {
87 class Unscrambler(object):
88 """A class that unscrambles words quickly by computing a signature
89 (sig) for the word based on its position independent letter
90 population and then using a pregenerated index to look up known
91 words the same set of letters.
93 Note that each instance of Unscrambler caches its index to speed
94 up lookups number 2..N; careless reinstantiation will by slower.
96 Sigs are designed to cluster similar words near each other so both
97 lookup methods support a "fuzzy match" argument that can be set to
98 request similar words that do not match exactly in addition to any
104 # Cached index per instance.
108 if 'unscramble_indexfile' in config.config:
109 indexfile = config.config['unscramble_indexfile']
111 indexfile = "/usr/share/dict/sparse_index"
113 with open(indexfile, 'r') as rf:
114 lines = rf.readlines()
117 (fsig, word) = line.split('+')
119 self.sigs.append(fsig)
120 self.words.append(word)
124 def _compute_word_fingerprint(
125 word: str, population: Mapping[str, int]
129 population.items(), key=lambda x: x[1], reverse=True
132 if letter in fprint_feature_bit:
136 shift = fprint_feature_bit[letter]
139 return fp << letters_bits
143 def _compute_word_letter_sig(
144 letter_sigs: Mapping[str, int],
146 population: Mapping[str, int],
150 population.items(), key=lambda x: x[1], reverse=True
153 if letter not in letter_sigs:
155 s = letter_sigs[letter]
171 @decorator_utils.memoized
172 def compute_word_sig(word: str) -> int:
173 """Given a word, compute its signature for subsequent lookup
174 operations. Signatures are computed based on the letters in
175 the word and their frequencies. We try to cluster "similar"
176 words close to each other in the signature space.
178 >>> train = Unscrambler.compute_word_sig('train')
182 >>> retain = Unscrambler.compute_word_sig('retrain')
190 population = list_utils.population_counts(word)
191 fprint = Unscrambler._compute_word_fingerprint(word, population)
192 letter_sig = Unscrambler._compute_word_letter_sig(
193 letter_sigs, word, population
195 assert fprint & letter_sig == 0
196 sig = fprint | letter_sig
201 letter_sigs: Dict[str, int],
202 dictfile: str = '/usr/share/dict/words',
203 indexfile: str = '/usr/share/dict/sparse_index',
205 """Before calling this method, change letter_sigs from the default above
206 unless you want to populate the same exact files.
211 with open(dictfile, "r") as f:
213 word = word.replace('\n', '')
215 sig = Unscrambler.compute_word_sig(letter_sigs, word)
216 logger.debug("%s => 0x%x" % (word, sig))
220 if sig in words_by_sigs:
221 words_by_sigs[sig] += ",%s" % word
223 words_by_sigs[sig] = word
224 with open(indexfile, 'w') as f:
225 for sig in sorted(words_by_sigs.keys()):
226 word = words_by_sigs[sig]
227 print(f'0x{sig:x}+{word}', file=f)
230 self, word: str, *, include_fuzzy_matches: bool = False
231 ) -> Dict[str, bool]:
232 """Looks up a potentially scrambled word optionally including near
235 >>> u = Unscrambler()
236 >>> u.lookup('eanycleocipd', include_fuzzy_matches=False)
237 {'encyclopedia': True}
240 sig = Unscrambler.compute_word_sig(word)
241 return self.lookup_by_sig(
242 sig, include_fuzzy_matches=include_fuzzy_matches
246 self, sig: int, *, include_fuzzy_matches: bool = False
247 ) -> Dict[str, bool]:
248 """Looks up a word that has already been translated into a signature by
249 a previous call to Unscrambler.compute_word_sig. Optionally returns
250 near "fuzzy" matches.
252 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
256 >>> u = Unscrambler()
257 >>> u.lookup_by_sig(sig, include_fuzzy_matches=True)
258 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
262 (exact, location) = list_utils.binary_search(self.sigs, sig)
267 if end > len(self.words):
268 end = len(self.words)
270 for x in range(start, end):
273 if include_fuzzy_matches is True or (fsig == sig):
274 ret[word] = fsig == sig
279 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
280 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
284 if __name__ == "__main__":