4 from typing import Dict
# Register this module's command-line argument group with the shared config
# module; the group title embeds this file's path.
10 cfg = config.add_commandline_args(
11 f'Unscramble! ({__file__})', 'A fast word unscrambler.'
# --unscramble_indexfile: path of the signature -> word index file; defaults
# to /usr/share/dict/sparse_index.
# NOTE(review): the call that attaches this flag to cfg is not visible in this
# excerpt -- confirm against the full file.
14 "--unscramble_indexfile",
15 help="Path to a file of signature -> word index.",
17 default="/usr/share/dict/sparse_index",
# Module-level logger named after this module.
20 logger = logging.getLogger(__name__)
# Mask selecting the low letters_bits bits of a signature.
23 letters_mask = 2 ** letters_bits - 1
# Mask selecting fprint_bits bits located directly above the low letters_bits
# bits.
26 fprint_mask = (2 ** fprint_bits - 1) << letters_bits
# Lookup table keyed by letter; its values are used as bit-shift amounts when
# the word fingerprint is built.
28 fprint_feature_bit = {
# Word unscrambler. Its methods are invoked through the class itself (e.g.
# Unscrambler.compute_word_sig, Unscrambler.lookup_by_sig) and the loaded index
# is cached in the class-level lists Unscrambler.sigs and Unscrambler.words.
87 class Unscrambler(object):
# Compute the fingerprint part of a word signature from the word's letter
# population. `population` is consumed via .items() as two-element pairs --
# presumably (letter, count); TODO confirm against
# list_utils.population_counts.
96 def _compute_word_fingerprint(word: str, population) -> int:
# Walk the population entries ordered by their second element, largest first.
98 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
# Only letters that have an entry in fprint_feature_bit take part.
100 if letter in fprint_feature_bit:
# The table value is this letter's shift amount.
104 shift = fprint_feature_bit[letter]
# Move the accumulated value up by letters_bits so it occupies the bits above
# the low letters_bits bits of the final signature.
107 return fp << letters_bits
# Compute the letter-signature part of a word signature using the per-letter
# values supplied in `letter_sigs`. `population` is consumed via .items() as
# two-element pairs -- presumably (letter, count); TODO confirm against
# list_utils.population_counts.
111 def _compute_word_letter_sig(letter_sigs, word: str, population) -> int:
# Walk the population entries ordered by their second element, largest first.
113 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
# Letters with no entry in letter_sigs get separate handling.
# NOTE(review): the body of this branch is not visible in this excerpt.
115 if letter not in letter_sigs:
# Per-letter signature value for this letter.
117 s = letter_sigs[letter]
# Wrapped by decorator_utils.memoized -- presumably results are cached per
# word; TODO confirm against decorator_utils.
133 @decorator_utils.memoized
134 def compute_word_sig(word: str) -> int:
135 """Given a word, compute its signature for subsequent lookup
136 operations. Signatures are computed based on the letters in
137 the word and their frequencies. We try to cluster "similar"
138 words close to each other in the signature space.
140 >>> train = Unscrambler.compute_word_sig('train')
144 >>> retain = Unscrambler.compute_word_sig('retrain')
# Letter population of the word; both halves of the signature are derived
# from it.
152 population = list_utils.population_counts(word)
153 fprint = Unscrambler._compute_word_fingerprint(word, population)
# NOTE(review): letter_sigs is not a parameter of this function, so it must
# resolve to a name defined outside it (not visible in this excerpt).
154 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
# The two halves must not share any set bit, so OR-ing them together loses no
# information from either half.
155 assert fprint & letter_sig == 0
156 sig = fprint | letter_sig
161 letter_sigs: Dict[str, int],
162 dictfile: str = '/usr/share/dict/words',
163 indexfile: str = '/usr/share/dict/sparse_index',
165 """Before calling this method, change letter_sigs from the default above
166 unless you want to populate the same exact files."""
# Read the dictionary file and compute a signature for each entry.
170 with open(dictfile, "r") as f:
# Remove newline characters from the entry.
172 word = word.replace('\n', '')
# NOTE(review): compute_word_sig is declared in this file with a single `word`
# parameter, but is called here with (letter_sigs, word). Verify this call;
# it looks like it would either fail or ignore the letter_sigs passed to this
# method.
174 sig = Unscrambler.compute_word_sig(letter_sigs, word)
175 logger.debug("%s => 0x%x" % (word, sig))
# Words that share a signature are accumulated into one comma-separated
# string stored under that signature.
179 if sig in words_by_sigs:
180 words_by_sigs[sig] += ",%s" % word
182 words_by_sigs[sig] = word
# Write the index in ascending signature order, one line per signature in the
# form 0x<hex signature>+<comma-separated words>.
183 with open(indexfile, 'w') as f:
184 for sig in sorted(words_by_sigs.keys()):
185 word = words_by_sigs[sig]
186 print(f'0x{sig:x}+{word}', file=f)
# Convenience entry point: computes the signature of `word` and delegates to
# Unscrambler.lookup_by_sig, forwarding include_fuzzy_matches unchanged.
189 def lookup(word: str, *, include_fuzzy_matches=False) -> Dict[str, bool]:
190 """Looks up a potentially scrambled word optionally including near
193 >>> Unscrambler.lookup('eanycleocipd', include_fuzzy_matches=False)
194 {'encyclopedia': True}
# Translate the word into its signature, then search the index by signature.
197 sig = Unscrambler.compute_word_sig(word)
198 return Unscrambler.lookup_by_sig(
199 sig, include_fuzzy_matches=include_fuzzy_matches
# Search the index for a precomputed signature. The result maps each reported
# word to a bool that is True when that entry's signature equals `sig`.
203 def lookup_by_sig(sig, *, include_fuzzy_matches=False) -> Dict[str, bool]:
204 """Looks up a word that has already been translated into a signature by
205 a previous call to Unscrambler.compute_word_sig. Optionally returns
206 near "fuzzy" matches.
208 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
212 >>> Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=True)
213 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
216 # Cache the index; it doesn't change and this may be called
# The index file is only read while the class-level signature list is empty.
218 if len(Unscrambler.sigs) == 0:
# Use the configured index path when the config has one; otherwise fall back
# to the default location.
219 if 'unscramble_indexfile' in config.config:
220 indexfile = config.config['unscramble_indexfile']
222 indexfile = "/usr/share/dict/sparse_index"
223 with open(indexfile, 'r') as rf:
224 lines = rf.readlines()
# Each index line has the form <signature>+<words>.
# NOTE(review): any conversion applied to fsig before it is stored is not
# visible in this excerpt -- confirm it yields values comparable with `sig`.
227 (fsig, word) = line.split('+')
# sigs and words are parallel lists: entry i of each belongs to the same
# index line.
229 Unscrambler.sigs.append(fsig)
230 Unscrambler.words.append(word)
# presumably `exact` says whether sig was found and `location` is its position
# in the list; TODO confirm against list_utils.binary_search.
233 (exact, location) = list_utils.binary_search(Unscrambler.sigs, sig)
# Clamp the upper bound of the scanned window to the length of the word list.
238 if end > len(Unscrambler.words):
239 end = len(Unscrambler.words)
241 for x in range(start, end):
242 word = Unscrambler.words[x]
243 fsig = Unscrambler.sigs[x]
# Entries with an equal signature are always reported; other entries in the
# window are reported only when fuzzy matches were requested.
244 if include_fuzzy_matches is True or (fsig == sig):
245 ret[word] = fsig == sig
250 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
251 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
255 if __name__ == "__main__":