3 # © Copyright 2021-2022, Scott Gasch
5 """A fast word unscrambler library."""
8 from typing import Dict, Mapping, Optional
11 import decorator_utils
15 cfg = config.add_commandline_args(
16 f'Unscrambler base library ({__file__})', 'A fast word unscrambler.'
19 "--unscrambler_default_indexfile",
20 help="Path to a file of signature -> word index.",
22 default="/usr/share/dict/sparse_index",
25 logger = logging.getLogger(__name__)
28 letters_mask = 2**letters_bits - 1
31 fprint_mask = (2**fprint_bits - 1) << letters_bits
33 fprint_feature_bit = {
92 class Unscrambler(object):
93 """A class that unscrambles words quickly by computing a signature
94 (sig) for the word based on its position independent letter
95 population and then using a pregenerated index to look up known
96 words the same set of letters.
98 Note that each instance of Unscrambler caches its index to speed
99 up lookups number 2..N; careless reinstantiation will by slower.
101 Sigs are designed to cluster similar words near each other so both
102 lookup methods support a "fuzzy match" argument that can be set to
103 request similar words that do not match exactly in addition to any
107 def __init__(self, indexfile: Optional[str] = None):
109 Constructs an unscrambler.
112 indexfile: overrides the default indexfile location if provided
115 # Cached index per instance.
119 filename = Unscrambler.get_indexfile(indexfile)
120 with open(filename, 'r') as rf:
121 lines = rf.readlines()
124 (fsig, word) = line.split('+')
126 self.sigs.append(isig)
127 self.words.append(word)
130 def get_indexfile(indexfile: Optional[str]) -> str:
131 """Returns the current indexfile location."""
132 if indexfile is None:
133 if 'unscrambler_default_indexfile' in config.config:
134 indexfile = config.config['unscramble_indexfile']
136 indexfile = "/usr/share/dict/sparse_index"
138 assert file_utils.file_is_readable(indexfile), f"Can't read {indexfile}"
143 def _compute_word_fingerprint(population: Mapping[str, int]) -> int:
145 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
147 if letter in fprint_feature_bit:
148 count = min(pair[1], 3)
149 shift = fprint_feature_bit[letter]
152 return fp << letters_bits
156 def _compute_word_letter_sig(
157 lsigs: Mapping[str, int],
159 population: Mapping[str, int],
162 for pair in sorted(population.items(), key=lambda x: x[1], reverse=True):
164 if letter not in lsigs:
173 length = min(len(word), 31)
180 @decorator_utils.memoized
181 def compute_word_sig(word: str) -> int:
182 """Given a word, compute its signature for subsequent lookup
183 operations. Signatures are computed based on the letters in
184 the word and their frequencies. We try to cluster "similar"
185 words close to each other in the signature space.
188 word: the word to compute a signature for
191 The word's signature.
193 >>> train = Unscrambler.compute_word_sig('train')
197 >>> retain = Unscrambler.compute_word_sig('retrain')
205 population = list_utils.population_counts(word)
206 fprint = Unscrambler._compute_word_fingerprint(population)
207 letter_sig = Unscrambler._compute_word_letter_sig(letter_sigs, word, population)
208 assert fprint & letter_sig == 0
209 sig = fprint | letter_sig
214 dictfile: str = '/usr/share/dict/words',
215 indexfile: str = '/usr/share/dict/sparse_index',
218 Repopulates the indexfile.
222 Before calling this method, change letter_sigs from the
223 default above unless you want to populate the same exact
226 words_by_sigs: Dict[int, str] = {}
228 with open(dictfile, "r") as f:
230 word = word.replace('\n', '')
232 sig = Unscrambler.compute_word_sig(word)
233 logger.debug("%s => 0x%x", word, sig)
237 if sig in words_by_sigs:
238 words_by_sigs[sig] += f",{word}"
240 words_by_sigs[sig] = word
241 with open(indexfile, 'w') as f:
242 for sig in sorted(words_by_sigs.keys()):
243 word = words_by_sigs[sig]
244 print(f'0x{sig:x}+{word}', file=f)
246 def lookup(self, word: str, *, window_size: int = 5) -> Dict[str, bool]:
247 """Looks up a potentially scrambled word optionally including near
251 word: the word to lookup
252 window_size: the number of nearby fuzzy matches to return
255 A dict of word -> bool containing unscrambled words with (close
256 to or precisely) the same letters as the input word. The bool
257 values in this dict indicate whether the key word is an exact
258 or near match. The count of entries in this dict is controlled
259 by the window_size param.
261 >>> u = Unscrambler()
262 >>> u.lookup('eanycleocipd', window_size=0)
263 {'encyclopedia': True}
265 sig = Unscrambler.compute_word_sig(word)
266 return self.lookup_by_sig(sig, window_size=window_size)
268 def lookup_by_sig(self, sig: int, *, window_size: int = 5) -> Dict[str, bool]:
269 """Looks up a word that has already been translated into a signature by
270 a previous call to Unscrambler.compute_word_sig. Optionally returns
271 near "fuzzy" matches.
274 sig: the signature of the word to lookup (see :meth:`compute_word_sig`
275 to generate these signatures).
276 window_size: the number of nearby fuzzy matches to return
279 A dict of word -> bool containing unscrambled words with (close
280 to or precisely) the same letters as the input word. The bool
281 values in this dict indicate whether the key word is an exact
282 or near match. The count of entries in this dict is controlled
283 by the window_size param.
285 >>> sig = Unscrambler.compute_word_sig('sunepsapetuargiarin')
289 >>> u = Unscrambler()
290 >>> u.lookup_by_sig(sig)
291 {'pupigerous': False, 'pupigenous': False, 'unpurposing': False, 'superpurgation': False, 'unsupporting': False, 'superseptuaginarian': True, 'purpurogallin': False, 'scuppaug': False, 'purpurigenous': False, 'purpurogenous': False, 'proppage': False}
294 (_, location) = list_utils.binary_search(self.sigs, sig)
295 start = location - window_size
296 start = max(start, 0)
297 end = location + 1 + window_size
298 end = min(end, len(self.words))
300 for x in range(start, end):
303 if window_size > 0 or (fsig == sig):
304 ret[word] = fsig == sig
309 # To repopulate, change letter_sigs and then call Unscrambler.repopulate.
310 # See notes above. See also ~/bin/unscramble.py --populate_destructively.
314 if __name__ == "__main__":