4 from typing import Dict, Mapping
10 cfg = config.add_commandline_args(f'Unscramble! ({__file__})', 'A fast word unscrambler.')
12 "--unscramble_indexfile",
13 help="Path to a file of signature -> word index.",
15 default="/usr/share/dict/sparse_index",
18 logger = logging.getLogger(__name__)
21 letters_mask = 2 ** letters_bits - 1
24 fprint_mask = (2 ** fprint_bits - 1) << letters_bits
26 fprint_feature_bit = {
85 class Unscrambler(object):
86 """A class that unscrambles words quickly by computing a signature
87 (sig) for the word based on its position independent letter
88 population and then using a pregenerated index to look up known
89 words the same set of letters.
91 Note that each instance of Unscrambler caches its index to speed
92 up lookups number 2..N; careless reinstantiation will by slower.
94 Sigs are designed to cluster similar words near each other so both
95 lookup methods support a "fuzzy match" argument that can be set to
96 request similar words that do not match exactly in addition to any
102 # Cached index per instance.
106 if 'unscramble_indexfile' in config.config:
107 indexfile = config.config['unscramble_indexfile']
109 indexfile = "/usr/share/dict/sparse_index"
111 with open(indexfile, 'r') as rf:
112 lines = rf.readlines()
115 (fsig, word) = line.split('+')
117 self.sigs.append(fsig)
118 self.words.append(word)
122 def _compute_word_fingerprint(word: str, population: Mapping[str, int]) -> int:
124 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
126 if letter in fprint_feature_bit:
130 shift = fprint_feature_bit[letter]
133 return fp << letters_bits
137 def _compute_word_letter_sig(
138 letter_sigs: Mapping[str, int],
140 population: Mapping[str, int],
143 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
145 if letter not in letter_sigs:
147 s = letter_sigs[letter]
163 @decorator_utils.memoized
164 def compute_word_sig(word: str) -> int:
165 """Given a word, compute its signature for subsequent lookup
166 operations. Signatures are computed based on the letters in
167 the word and their frequencies. We try to cluster "similar"
168 words close to each other in the signature space.
170 >>> train = Unscrambler.compute_word_sig('train')
174 >>> retain = Unscrambler.compute_word_sig('retrain')
182 population = list_utils.population_counts(word)
183 fprint = Unscrambler._compute_word_fingerprint(word, population)
184 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
185 assert fprint & letter_sig == 0
186 sig = fprint | letter_sig
191 letter_sigs: Dict[str, int],
192 dictfile: str = '/usr/share/dict/words',
193 indexfile: str = '/usr/share/dict/sparse_index',
195 """Before calling this method, change letter_sigs from the default above
196 unless you want to populate the same exact files.
199 words_by_sigs: Dict[int, str] = {}
201 with open(dictfile, "r") as f:
203 word = word.replace('\n', '')
205 sig = Unscrambler.compute_word_sig(letter_sigs, word)
206 logger.debug("%s => 0x%x" % (word, sig))
210 if sig in words_by_sigs:
211 words_by_sigs[sig] += ",%s" % word
213 words_by_sigs[sig] = word
214 with open(indexfile, 'w') as f:
215 for sig in sorted(words_by_sigs.keys()):
216 word = words_by_sigs[sig]
217 print(f'0x{sig:x}+{word}', file=f)
219 def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
220 """Looks up a potentially scrambled word optionally including near
223 >>> u = Unscrambler()
224 >>> u.lookup('eanycleocipd', window_size=0)
225 {'encyclopedia': True}
228 sig = Unscrambler.compute_word_sig(word)
229 return self.lookup_by_sig(sig, window_size=window_size)
231 def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
232 """Looks up a word that has already been translated into a signature by
233 a previous call to Unscrambler.compute_word_sig. Optionally returns
234 near "fuzzy" matches.
236 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
240 >>> u = Unscrambler()
241 >>> u.lookup_by_sig(sig)
242 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
246 (exact, location) = list_utils.binary_search(self.sigs, sig)
247 start = location - window_size
250 end = location + 1 + window_size
251 if end > len(self.words):
252 end = len(self.words)
254 for x in range(start, end):
257 if window_size > 0 or (fsig == sig):
258 ret[word] = fsig == sig
263 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
264 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
268 if __name__ == "__main__":