3 """A fast word unscrambler library."""
6 from typing import Dict, Mapping, Optional
13 cfg = config.add_commandline_args(
14 f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
17 "--unscrambler_default_indexfile",
18 help="Path to a file of signature -> word index.",
20 default="/usr/share/dict/sparse_index",
23 logger = logging.getLogger(__name__)
26 letters_mask = 2**letters_bits - 1
29 fprint_mask = (2**fprint_bits - 1) << letters_bits
31 fprint_feature_bit = {
90 class Unscrambler(object):
91 """A class that unscrambles words quickly by computing a signature
92 (sig) for the word based on its position independent letter
93 population and then using a pregenerated index to look up known
94 words the same set of letters.
96 Note that each instance of Unscrambler caches its index to speed
97 up lookups number 2..N; careless reinstantiation will by slower.
99 Sigs are designed to cluster similar words near each other so both
100 lookup methods support a "fuzzy match" argument that can be set to
101 request similar words that do not match exactly in addition to any
106 def __init__(self, indexfile: Optional[str] = None):
107 # Cached index per instance.
111 filename = Unscrambler.get_indexfile(indexfile)
112 with open(filename, 'r') as rf:
113 lines = rf.readlines()
116 (fsig, word) = line.split('+')
118 self.sigs.append(isig)
119 self.words.append(word)
122 def get_indexfile(indexfile: Optional[str]) -> str:
123 if indexfile is None:
124 if 'unscrambler_default_indexfile' in config.config:
125 indexfile = config.config['unscramble_indexfile']
127 indexfile = "/usr/share/dict/sparse_index"
129 assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
134 def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
136 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
138 if letter in fprint_feature_bit:
139 count = min(pair[1], 3)
140 shift = fprint_feature_bit[letter]
143 return fp << letters_bits
147 def _compute_word_letter_sig(
148 lsigs: Mapping[str, int],
150 population: Mapping[str, int],
153 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
155 if letter not in lsigs:
164 length = min(len(word), 31)
171 @decorator_utils.memoized
172 def compute_word_sig(word: str) -> int:
173 """Given a word, compute its signature for subsequent lookup
174 operations. Signatures are computed based on the letters in
175 the word and their frequencies. We try to cluster "similar"
176 words close to each other in the signature space.
178 >>> train = Unscrambler.compute_word_sig('train')
182 >>> retain = Unscrambler.compute_word_sig('retrain')
190 population = list_utils.population_counts(word)
191 fprint = Unscrambler._compute_word_fingerprint(population)
192 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
193 assert fprint & letter_sig == 0
194 sig = fprint | letter_sig
199 dictfile: str = '/usr/share/dict/words',
200 indexfile: str = '/usr/share/dict/sparse_index',
202 """Before calling this method, change letter_sigs from the default above
203 unless you want to populate the same exact files.
206 words_by_sigs: Dict[int, str] = {}
208 with open(dictfile, "r") as f:
210 word = word.replace('\n', '')
212 sig = Unscrambler.compute_word_sig(word)
213 logger.debug("%s => 0x%x", word, sig)
217 if sig in words_by_sigs:
218 words_by_sigs[sig] += f",{word}"
220 words_by_sigs[sig] = word
221 with open(indexfile, 'w') as f:
222 for sig in sorted(words_by_sigs.keys()):
223 word = words_by_sigs[sig]
224 print(f'0x{sig:x}+{word}', file=f)
226 def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
227 """Looks up a potentially scrambled word optionally including near
230 >>> u = Unscrambler()
231 >>> u.lookup('eanycleocipd', window_size=0)
232 {'encyclopedia': True}
235 sig = Unscrambler.compute_word_sig(word)
236 return self.lookup_by_sig(sig, window_size=window_size)
238 def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
239 """Looks up a word that has already been translated into a signature by
240 a previous call to Unscrambler.compute_word_sig. Optionally returns
241 near "fuzzy" matches.
243 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
247 >>> u = Unscrambler()
248 >>> u.lookup_by_sig(sig)
249 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
253 (_, location) = list_utils.binary_search(self.sigs, sig)
254 start = location - window_size
255 start = max(start, 0)
256 end = location + 1 + window_size
257 end = min(end, len(self.words))
259 for x in range(start, end):
262 if window_size > 0 or (fsig == sig):
263 ret[word] = fsig == sig
268 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
269 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
273 if __name__ == "__main__":