3 # © Copyright 2021-2022, Scott Gasch
5 """A fast word unscrambler library."""
8 from typing import Dict, Mapping, Optional
11 import decorator_utils
15 cfg = config.add_commandline_args(
16 f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
19 "--unscrambler_default_indexfile",
20 help="Path to a file of signature -> word index.",
22 default="/usr/share/dict/sparse_index",
25 logger = logging.getLogger(__name__)
28 letters_mask = 2**letters_bits - 1
31 fprint_mask = (2**fprint_bits - 1) << letters_bits
33 fprint_feature_bit = {
92 class Unscrambler(object):
93 """A class that unscrambles words quickly by computing a signature
94 (sig) for the word based on its position independent letter
95 population and then using a pregenerated index to look up known
96 words the same set of letters.
98 Note that each instance of Unscrambler caches its index to speed
99 up lookups number 2..N; careless reinstantiation will by slower.
101 Sigs are designed to cluster similar words near each other so both
102 lookup methods support a "fuzzy match" argument that can be set to
103 request similar words that do not match exactly in addition to any
108 def __init__(self, indexfile: Optional[str] = None):
109 # Cached index per instance.
113 filename = Unscrambler.get_indexfile(indexfile)
114 with open(filename, 'r') as rf:
115 lines = rf.readlines()
118 (fsig, word) = line.split('+')
120 self.sigs.append(isig)
121 self.words.append(word)
124 def get_indexfile(indexfile: Optional[str]) -> str:
125 if indexfile is None:
126 if 'unscrambler_default_indexfile' in config.config:
127 indexfile = config.config['unscramble_indexfile']
129 indexfile = "/usr/share/dict/sparse_index"
131 assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
136 def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
138 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
140 if letter in fprint_feature_bit:
141 count = min(pair[1], 3)
142 shift = fprint_feature_bit[letter]
145 return fp << letters_bits
149 def _compute_word_letter_sig(
150 lsigs: Mapping[str, int],
152 population: Mapping[str, int],
155 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
157 if letter not in lsigs:
166 length = min(len(word), 31)
173 @decorator_utils.memoized
174 def compute_word_sig(word: str) -> int:
175 """Given a word, compute its signature for subsequent lookup
176 operations. Signatures are computed based on the letters in
177 the word and their frequencies. We try to cluster "similar"
178 words close to each other in the signature space.
180 >>> train = Unscrambler.compute_word_sig('train')
184 >>> retain = Unscrambler.compute_word_sig('retrain')
192 population = list_utils.population_counts(word)
193 fprint = Unscrambler._compute_word_fingerprint(population)
194 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
195 assert fprint & letter_sig == 0
196 sig = fprint | letter_sig
201 dictfile: str = '/usr/share/dict/words',
202 indexfile: str = '/usr/share/dict/sparse_index',
204 """Before calling this method, change letter_sigs from the default above
205 unless you want to populate the same exact files.
208 words_by_sigs: Dict[int, str] = {}
210 with open(dictfile, "r") as f:
212 word = word.replace('\n', '')
214 sig = Unscrambler.compute_word_sig(word)
215 logger.debug("%s => 0x%x", word, sig)
219 if sig in words_by_sigs:
220 words_by_sigs[sig] += f",{word}"
222 words_by_sigs[sig] = word
223 with open(indexfile, 'w') as f:
224 for sig in sorted(words_by_sigs.keys()):
225 word = words_by_sigs[sig]
226 print(f'0x{sig:x}+{word}', file=f)
228 def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
229 """Looks up a potentially scrambled word optionally including near
232 >>> u = Unscrambler()
233 >>> u.lookup('eanycleocipd', window_size=0)
234 {'encyclopedia': True}
237 sig = Unscrambler.compute_word_sig(word)
238 return self.lookup_by_sig(sig, window_size=window_size)
240 def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
241 """Looks up a word that has already been translated into a signature by
242 a previous call to Unscrambler.compute_word_sig. Optionally returns
243 near "fuzzy" matches.
245 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
249 >>> u = Unscrambler()
250 >>> u.lookup_by_sig(sig)
251 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
255 (_, location) = list_utils.binary_search(self.sigs, sig)
256 start = location - window_size
257 start = max(start, 0)
258 end = location + 1 + window_size
259 end = min(end, len(self.words))
261 for x in range(start, end):
264 if window_size > 0 or (fsig == sig):
265 ret[word] = fsig == sig
270 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
271 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
275 if __name__ == "__main__":