4 from typing import Dict, Mapping
10 cfg = config.add_commandline_args(
11 f'Unscramble! ({__file__})', 'A fast word unscrambler.'
14 "--unscramble_indexfile",
15 help="Path to a file of signature -> word index.",
17 default="/usr/share/dict/sparse_index",
20 logger = logging.getLogger(__name__)
23 letters_mask = 2 ** letters_bits - 1
26 fprint_mask = (2 ** fprint_bits - 1) << letters_bits
28 fprint_feature_bit = {
87 class Unscrambler(object):
88 """A class that unscrambles words quickly by computing a signature
89 (sig) for the word based on its position independent letter
90 population and then using a pregenerated index to look up known
91 words the same set of letters.
93 Note that each instance of Unscrambler caches its index to speed
94 up lookups number 2..N; careless reinstantiation will by slower.
96 Sigs are designed to cluster similar words near each other so both
97 lookup methods support a "fuzzy match" argument that can be set to
98 request similar words that do not match exactly in addition to any
104 # Cached index per instance.
108 if 'unscramble_indexfile' in config.config:
109 indexfile = config.config['unscramble_indexfile']
111 indexfile = "/usr/share/dict/sparse_index"
113 with open(indexfile, 'r') as rf:
114 lines = rf.readlines()
117 (fsig, word) = line.split('+')
119 self.sigs.append(fsig)
120 self.words.append(word)
124 def _compute_word_fingerprint(
126 population: Mapping[str, int]
129 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
131 if letter in fprint_feature_bit:
135 shift = fprint_feature_bit[letter]
138 return fp << letters_bits
142 def _compute_word_letter_sig(
143 letter_sigs: Mapping[str, int],
145 population: Mapping[str, int],
148 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
150 if letter not in letter_sigs:
152 s = letter_sigs[letter]
168 @decorator_utils.memoized
169 def compute_word_sig(word: str) -> int:
170 """Given a word, compute its signature for subsequent lookup
171 operations. Signatures are computed based on the letters in
172 the word and their frequencies. We try to cluster "similar"
173 words close to each other in the signature space.
175 >>> train = Unscrambler.compute_word_sig('train')
179 >>> retain = Unscrambler.compute_word_sig('retrain')
187 population = list_utils.population_counts(word)
188 fprint = Unscrambler._compute_word_fingerprint(word, population)
189 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
190 assert fprint & letter_sig == 0
191 sig = fprint | letter_sig
196 letter_sigs: Dict[str, int],
197 dictfile: str = '/usr/share/dict/words',
198 indexfile: str = '/usr/share/dict/sparse_index',
200 """Before calling this method, change letter_sigs from the default above
201 unless you want to populate the same exact files.
206 with open(dictfile, "r") as f:
208 word = word.replace('\n', '')
210 sig = Unscrambler.compute_word_sig(letter_sigs, word)
211 logger.debug("%s => 0x%x" % (word, sig))
215 if sig in words_by_sigs:
216 words_by_sigs[sig] += ",%s" % word
218 words_by_sigs[sig] = word
219 with open(indexfile, 'w') as f:
220 for sig in sorted(words_by_sigs.keys()):
221 word = words_by_sigs[sig]
222 print(f'0x{sig:x}+{word}', file=f)
228 include_fuzzy_matches: bool = False
229 ) -> Dict[str, bool]:
230 """Looks up a potentially scrambled word optionally including near
233 >>> u = Unscrambler()
234 >>> u.lookup('eanycleocipd', include_fuzzy_matches=False)
235 {'encyclopedia': True}
238 sig = Unscrambler.compute_word_sig(word)
239 return self.lookup_by_sig(sig, include_fuzzy_matches=include_fuzzy_matches)
245 include_fuzzy_matches:bool = False
246 ) -> Dict[str, bool]:
247 """Looks up a word that has already been translated into a signature by
248 a previous call to Unscrambler.compute_word_sig. Optionally returns
249 near "fuzzy" matches.
251 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
255 >>> u = Unscrambler()
256 >>> u.lookup_by_sig(sig, include_fuzzy_matches=True)
257 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
261 (exact, location) = list_utils.binary_search(self.sigs, sig)
266 if end > len(self.words):
267 end = len(self.words)
269 for x in range(start, end):
272 if include_fuzzy_matches is True or (fsig == sig):
273 ret[word] = fsig == sig
278 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
279 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
283 if __name__ == "__main__":