4 from typing import Dict
# Command-line configuration for this module, set up through the project's
# config.add_commandline_args helper.  The only option visible here is
# --unscramble_indexfile, the path of the signature -> word index file.
10 cfg = config.add_commandline_args(
11 f'Unscramble! ({__file__})',
12 'A fast word unscrambler.'
15 "--unscramble_indexfile",
16 help="Path to a file of signature -> word index.",
# Same path that lookup_by_sig falls back to when the option is not present
# in config.config.
18 default="/usr/share/dict/sparse_index",
# Module-level logger named after this module.
21 logger = logging.getLogger(__name__)
# Mask covering the low letters_bits bits of a signature (presumably the
# letter-signature portion -- confirm against _compute_word_letter_sig).
24 letters_mask = 2 ** letters_bits - 1
# Mask covering the fprint_bits bits that sit directly above the low
# letters_bits bits; _compute_word_fingerprint shifts its result into this
# region.
27 fprint_mask = (2 ** fprint_bits - 1) << letters_bits
# Maps a letter to the bit shift _compute_word_fingerprint uses for it;
# letters missing from this table do not contribute to the fingerprint.
29 fprint_feature_bit = {
# Unscrambles words by reducing them to an integer signature
# (compute_word_sig) and looking that signature up in a prebuilt
# signature -> word index file (lookup / lookup_by_sig).
88 class Unscrambler(object):
# Builds the fingerprint portion of a word's signature.
#   word:       the word being fingerprinted.
#   population: mapping whose items are (letter, count) pairs; compute_word_sig
#               passes the result of list_utils.population_counts(word).
#   Returns the fingerprint already shifted left by letters_bits.
97 def _compute_word_fingerprint(word: str, population) -> int:
# Walk the (letter, count) pairs from highest count to lowest.
99 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
# Only letters that have an entry in fprint_feature_bit contribute.
101 if letter in fprint_feature_bit:
# Per-letter bit position taken from the feature table.
105 shift = fprint_feature_bit[letter]
# Shift above the low letters_bits bits; compute_word_sig asserts that the
# fingerprint and the letter signature have no set bits in common.
108 return fp << letters_bits
# Builds the letter-signature portion of a word's signature.
#   letter_sigs: table indexed by letter that supplies a per-letter value.
#   word:        the word being processed.
#   population:  mapping whose items are (letter, count) pairs.
#   Returns an int (per the annotation).
112 def _compute_word_letter_sig(letter_sigs, word: str, population) -> int:
# Walk the (letter, count) pairs from highest count to lowest.
114 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
# Letters without an entry in letter_sigs take a separate branch; the others
# start from their table value.
116 if letter not in letter_sigs:
118 s = letter_sigs[letter]
# Combines a word's fingerprint and letter signature into a single integer.
# The letter counts come from list_utils.population_counts(word); the two
# parts are asserted to share no set bits and are then OR-ed together.
# NOTE(review): letter_sigs is not a parameter of this function, so it
# resolves to a name from an enclosing scope (presumably a module-level
# table) -- confirm.
# NOTE(review): decorator_utils.memoized presumably caches results per
# argument; if so, changing the letter_sigs table later would not affect
# words that are already cached -- verify.
134 @decorator_utils.memoized
135 def compute_word_sig(word: str) -> int:
136 """Given a word, compute its signature for subsequent lookup
137 operations. Signatures are computed based on the letters in
138 the word and their frequencies. We try to cluster "similar"
139 words close to each other in the signature space.
141 >>> train = Unscrambler.compute_word_sig('train')
145 >>> retain = Unscrambler.compute_word_sig('retrain')
153 population = list_utils.population_counts(word)
154 fprint = Unscrambler._compute_word_fingerprint(word, population)
155 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
156 assert fprint & letter_sig == 0
157 sig = fprint | letter_sig
162 letter_sigs: Dict[str, int],
163 dictfile: str = '/usr/share/dict/words',
164 indexfile: str = '/usr/share/dict/sparse_index',
166 """Before calling this method, change letter_sigs from the default above
167 unless you want to populate the same exact files."""
# Read the source dictionary (presumably one word per line -- the trailing
# newline is stripped from each word below).
171 with open(dictfile, "r") as f:
173 word = word.replace('\n', '')
# NOTE(review): compute_word_sig is declared with a single parameter
# (word), but two positional arguments are passed here.  As written this
# presumably raises TypeError, and the letter_sigs argument never reaches
# _compute_word_letter_sig.  TODO confirm and fix.
175 sig = Unscrambler.compute_word_sig(letter_sigs, word)
176 logger.debug("%s => 0x%x" % (word, sig))
# Words that share a signature are stored together as one comma-separated
# string under that signature.
180 if sig in words_by_sigs:
181 words_by_sigs[sig] += ",%s" % word
183 words_by_sigs[sig] = word
# Write the index in ascending signature order, one "0x<hex sig>+<words>"
# record per line; lookup_by_sig splits each record on '+'.
184 with open(indexfile, 'w') as f:
185 for sig in sorted(words_by_sigs.keys()):
186 word = words_by_sigs[sig]
187 print(f'0x{sig:x}+{word}', file=f)
# Unscramble a word: compute its signature with compute_word_sig, then
# delegate to lookup_by_sig.
#   word:                  the (possibly scrambled) letters to look up.
#   include_fuzzy_matches: keyword-only; forwarded unchanged to lookup_by_sig.
#   Returns the dict produced by lookup_by_sig, which maps each reported word
#   to True when its stored signature equals the query signature.
190 def lookup(word: str, *, include_fuzzy_matches=False) -> Dict[str, bool]:
191 """Looks up a potentially scrambled word optionally including near
194 >>> Unscrambler.lookup('eanycleocipd', include_fuzzy_matches=False)
195 {'encyclopedia': True}
198 sig = Unscrambler.compute_word_sig(word)
199 return Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=include_fuzzy_matches)
# Look up the word(s) stored for an already-computed signature.
#   sig:                   signature from Unscrambler.compute_word_sig.
#   include_fuzzy_matches: keyword-only; entries whose signature differs from
#                          sig are reported only when this is exactly True
#                          (identity check), and then with the value False.
#   Returns a dict mapping each reported word entry to (stored sig == sig).
#
# The index is loaded once, when Unscrambler.sigs is empty, into the parallel
# lists Unscrambler.sigs and Unscrambler.words.  The path comes from
# config.config['unscramble_indexfile'] when that key exists, otherwise
# "/usr/share/dict/sparse_index"; each line is split on '+' into a signature
# and its word entry.  list_utils.binary_search then locates sig within
# Unscrambler.sigs, and the entries in range(start, end) are examined, with
# end clamped to len(Unscrambler.words).
# NOTE(review): the values stored in Unscrambler.sigs must be comparable with
# the integer sig; presumably the hex text read from the file is converted
# before being appended -- verify.
202 def lookup_by_sig(sig, *, include_fuzzy_matches=False) -> Dict[str, bool]:
203 """Looks up a word that has already been translated into a signature by
204 a previous call to Unscrambler.compute_word_sig. Optionally returns
205 near "fuzzy" matches.
207 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
211 >>> Unscrambler.lookup_by_sig(sig, include_fuzzy_matches=True)
212 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
215 # Cache the index; it doesn't change and this may be called
217 if len(Unscrambler.sigs) == 0:
218 if 'unscramble_indexfile' in config.config:
219 indexfile = config.config['unscramble_indexfile']
221 indexfile = "/usr/share/dict/sparse_index"
222 with open(indexfile, 'r') as rf:
223 lines = rf.readlines()
226 (fsig, word) = line.split('+')
228 Unscrambler.sigs.append(fsig)
229 Unscrambler.words.append(word)
232 (exact, location) = list_utils.binary_search(Unscrambler.sigs, sig)
237 if end > len(Unscrambler.words):
238 end = len(Unscrambler.words)
240 for x in range(start, end):
241 word = Unscrambler.words[x]
242 fsig = Unscrambler.sigs[x]
243 if include_fuzzy_matches is True or (fsig == sig):
244 ret[word] = (fsig == sig)
248 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
253 if __name__ == "__main__":