3 # © Copyright 2021-2022, Scott Gasch
5 """A fast word unscrambler library."""
8 from typing import Dict, Mapping, Optional
10 from pyutils import config, decorator_utils, list_utils
11 from pyutils.files import file_utils
13 cfg = config.add_commandline_args(
14 f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
17 "--unscrambler_default_indexfile",
18 help="Path to a file of signature -> word index.",
20 default="/usr/share/dict/sparse_index",
23 logger = logging.getLogger(__name__)
26 letters_mask = 2**letters_bits - 1
29 fprint_mask = (2**fprint_bits - 1) << letters_bits
31 fprint_feature_bit = {
90 class Unscrambler(object):
91 """A class that unscrambles words quickly by computing a signature
92 (sig) for the word based on its position independent letter
93 population and then using a pregenerated index to look up known
94 words the same set of letters.
96 Note that each instance of Unscrambler caches its index to speed
97 up lookups number 2..N; careless reinstantiation will by slower.
99 Sigs are designed to cluster similar words near each other so both
100 lookup methods support a "fuzzy match" argument that can be set to
101 request similar words that do not match exactly in addition to any
105 def __init__(self, indexfile: Optional[str] = None):
107 Constructs an unscrambler.
110 indexfile: overrides the default indexfile location if provided
113 # Cached index per instance.
117 filename = Unscrambler.get_indexfile(indexfile)
118 with open(filename, 'r') as rf:
119 lines = rf.readlines()
122 (fsig, word) = line.split('+')
124 self.sigs.append(isig)
125 self.words.append(word)
128 def get_indexfile(indexfile: Optional[str]) -> str:
129 """Returns the current indexfile location."""
130 if indexfile is None:
131 if 'unscrambler_default_indexfile' in config.config:
132 indexfile = config.config['unscrambler_default_indexfile']
133 assert type(indexfile) == str
135 indexfile = "/usr/share/dict/sparse_index"
137 assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
142 def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
144 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
146 if letter in fprint_feature_bit:
147 count = min(pair[1], 3)
148 shift = fprint_feature_bit[letter]
151 return fp << letters_bits
155 def _compute_word_letter_sig(
156 lsigs: Mapping[str, int],
158 population: Mapping[str, int],
161 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
163 if letter not in lsigs:
172 length = min(len(word), 31)
179 @decorator_utils.memoized
180 def compute_word_sig(word: str) -> int:
181 """Given a word, compute its signature for subsequent lookup
182 operations. Signatures are computed based on the letters in
183 the word and their frequencies. We try to cluster "similar"
184 words close to each other in the signature space.
187 word: the word to compute a signature for
190 The word's signature.
192 >>> train = Unscrambler.compute_word_sig('train')
196 >>> retain = Unscrambler.compute_word_sig('retrain')
204 population = list_utils.population_counts(word)
205 fprint = Unscrambler._compute_word_fingerprint(population)
206 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
207 assert fprint & letter_sig == 0
208 sig = fprint | letter_sig
213 dictfile: str = '/usr/share/dict/words',
214 indexfile: str = '/usr/share/dict/sparse_index',
217 Repopulates the indexfile.
221 Before calling this method, change letter_sigs from the
222 default above unless you want to populate the same exact
225 words_by_sigs: Dict[int, str] = {}
227 with open(dictfile, "r") as f:
229 word = word.replace('\n', '')
231 sig = Unscrambler.compute_word_sig(word)
232 logger.debug("%s => 0x%x", word, sig)
236 if sig in words_by_sigs:
237 words_by_sigs[sig] += f",{word}"
239 words_by_sigs[sig] = word
240 with open(indexfile, 'w') as f:
241 for sig in sorted(words_by_sigs.keys()):
242 word = words_by_sigs[sig]
243 print(f'0x{sig:x}+{word}', file=f)
245 def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
246 """Looks up a potentially scrambled word optionally including near
250 word: the word to lookup
251 window_size: the number of nearby fuzzy matches to return
254 A dict of word -> bool containing unscrambled words with (close
255 to or precisely) the same letters as the input word. The bool
256 values in this dict indicate whether the key word is an exact
257 or near match. The count of entries in this dict is controlled
258 by the window_size param.
260 >>> u = Unscrambler()
261 >>> u.lookup('eanycleocipd', window_size=0)
262 {'encyclopedia': True}
264 sig = Unscrambler.compute_word_sig(word)
265 return self.lookup_by_sig(sig, window_size=window_size)
267 def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
268 """Looks up a word that has already been translated into a signature by
269 a previous call to Unscrambler.compute_word_sig. Optionally returns
270 near "fuzzy" matches.
273 sig: the signature of the word to lookup (see :meth:`compute_word_sig`
274 to generate these signatures).
275 window_size: the number of nearby fuzzy matches to return
278 A dict of word -> bool containing unscrambled words with (close
279 to or precisely) the same letters as the input word. The bool
280 values in this dict indicate whether the key word is an exact
281 or near match. The count of entries in this dict is controlled
282 by the window_size param.
284 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
288 >>> u = Unscrambler()
289 >>> u.lookup_by_sig(sig)
290 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
293 (_, location) = list_utils.binary_search(self.sigs, sig)
294 start = location - window_size
295 start = max(start, 0)
296 end = location + 1 + window_size
297 end = min(end, len(self.words))
299 for x in range(start, end):
302 if window_size > 0 or (fsig == sig):
303 ret[word] = fsig == sig
308 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
309 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
313 if __name__ == "__main__":